31 files changed, 25572 insertions, 0 deletions
diff --git a/src/amd/compiler/README b/src/amd/compiler/README
new file mode 100644
index 00000000000..87d63c07024
--- /dev/null
+++ b/src/amd/compiler/README
@@ -0,0 +1,87 @@
+# Unofficial GCN/RDNA ISA reference errata
+
+## v_sad_u32
+
+The Vega ISA reference writes its behaviour as:
+```
+D.u = abs(S0.i - S1.i) + S2.u.
+```
+This is incorrect. The actual behaviour is what is written in the GCN3 reference
+guide:
+```
+ABS_DIFF (A,B) = (A>B) ? (A-B) : (B-A)
+D.u = ABS_DIFF (S0.u,S1.u) + S2.u
+```
+The instruction doesn't subtract S0 and S1 and take the absolute value (the
+_signed_ distance); it uses the _unsigned_ distance between the operands. So
+`v_sad_u32(-5, 0, 0)` would return `4294967291` (`-5` interpreted as unsigned),
+not `5`.
+
+## s_bfe_*
+
+Both the Vega and GCN3 ISA references state that these instructions don't write
+SCC. They do.
+
+## v_bcnt_u32_b32
+
+The Vega ISA reference writes its behaviour as:
+```
+D.u = 0;
+for i in 0 ... 31 do
+D.u += (S0.u[i] == 1 ? 1 : 0);
+endfor.
+```
+This is incorrect. The actual behaviour (and number of operands) is what
+is written in the GCN3 reference guide:
+```
+D.u = CountOneBits(S0.u) + S1.u.
+```
+
+## SMEM stores
+
+The Vega ISA reference doesn't say this (or doesn't make it clear), but
+the offset for SMEM stores must be in m0 if IMM == 0.
+
+The RDNA ISA doesn't mention SMEM stores at all, but they seem to be supported
+by the chip and are present in LLVM. AMD devs however highly recommend avoiding
+these instructions.
+
+## SMEM atomics
+
+RDNA ISA: as with SMEM stores, the ISA pretends they don't exist, but they
+are there in LLVM.
+
+## VMEM stores
+
+All reference guides say (under "Vector Memory Instruction Data Dependencies"):
+> When a VM instruction is issued, the address is immediately read out of VGPRs
+> and sent to the texture cache. Any texture or buffer resources and samplers
+> are also sent immediately. However, write-data is not immediately sent to the
+> texture cache.
+Reading that, one might think that waitcnts need to be added when writing to
+the registers used for a VMEM store's data. Experimentation has shown that this
+does not seem to be the case on GFX8 and GFX9 (GFX6 and GFX7 are untested). It
+also seems unlikely, since NOPs are apparently needed in a subset of these
+situations.
+
+## MIMG opcodes on GFX8/GCN3
+
+The `image_atomic_{swap,cmpswap,add,sub}` opcodes in the GCN3 ISA reference
+guide are incorrect. The Vega ISA reference guide has the correct ones.
+
+## Legacy instructions
+
+Some instructions have a `_LEGACY` variant which implements "DX9 rules", in which
+zero "wins" in multiplications, i.e. `0.0*x` is always `0.0`. The Vega ISA
+mentions `V_MAC_LEGACY_F32`, but this instruction is not actually present on Vega.
+
+# Hardware Bugs
+
+## SMEM corrupts VCCZ on SI/CI
+
+https://github.com/llvm/llvm-project/blob/acb089e12ae48b82c0b05c42326196a030df9b82/llvm/lib/Target/AMDGPU/SIInsertWaits.cpp#L580-L616
+After issuing an SMEM instruction, we need to wait for it to finish and then
+write to vcc (for example, `s_mov_b64 vcc, vcc`) to correct vccz.
+
+Currently, we don't do this.
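To make the `v_sad_u32` erratum above concrete, here is a minimal C++ sketch (not part of this commit; the function names are made up for illustration) contrasting the behaviour as documented in the Vega reference with the actual unsigned-distance behaviour described in the GCN3 guide:
```
#include <cstdint>
#include <cstdio>
#include <cstdlib>

/* Behaviour as (incorrectly) documented in the Vega ISA reference:
 * signed difference, then absolute value. */
static uint32_t v_sad_u32_as_documented(uint32_t s0, uint32_t s1, uint32_t s2)
{
   return (uint32_t)std::abs((int32_t)s0 - (int32_t)s1) + s2;
}

/* Actual behaviour (GCN3 reference): unsigned distance. */
static uint32_t v_sad_u32_actual(uint32_t s0, uint32_t s1, uint32_t s2)
{
   uint32_t abs_diff = s0 > s1 ? s0 - s1 : s1 - s0;
   return abs_diff + s2;
}

int main()
{
   /* v_sad_u32(-5, 0, 0): -5 is 0xFFFFFFFB when interpreted as unsigned. */
   printf("documented: %u\n", v_sad_u32_as_documented((uint32_t)-5, 0, 0)); /* prints 5 */
   printf("actual:     %u\n", v_sad_u32_actual((uint32_t)-5, 0, 0));        /* prints 4294967291 */
   return 0;
}
```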
+ diff --git a/src/amd/compiler/aco_assembler.cpp b/src/amd/compiler/aco_assembler.cpp new file mode 100644 index 00000000000..a6bf2a3e0db --- /dev/null +++ b/src/amd/compiler/aco_assembler.cpp @@ -0,0 +1,497 @@ +#include <map> + +#include "aco_ir.h" +#include "common/sid.h" + +namespace aco { + +struct asm_context { + Program *program; + enum chip_class chip_class; + std::map<int, SOPP_instruction*> branches; + std::vector<unsigned> constaddrs; + const int16_t* opcode; + // TODO: keep track of branch instructions referring blocks + // and, when emitting the block, correct the offset in instr + asm_context(Program* program) : program(program), chip_class(program->chip_class) { + if (chip_class <= GFX9) + opcode = &instr_info.opcode_gfx9[0]; + } +}; + +void emit_instruction(asm_context& ctx, std::vector<uint32_t>& out, Instruction* instr) +{ + uint32_t instr_offset = out.size() * 4u; + + /* lower remaining pseudo-instructions */ + if (instr->opcode == aco_opcode::p_constaddr) { + unsigned dest = instr->definitions[0].physReg(); + unsigned offset = instr->operands[0].constantValue(); + + /* s_getpc_b64 dest[0:1] */ + uint32_t encoding = (0b101111101 << 23); + uint32_t opcode = ctx.opcode[(int)aco_opcode::s_getpc_b64]; + if (opcode >= 55 && ctx.chip_class <= GFX9) { + assert(ctx.chip_class == GFX9 && opcode < 60); + opcode = opcode - 4; + } + encoding |= dest << 16; + encoding |= opcode << 8; + out.push_back(encoding); + + /* s_add_u32 dest[0], dest[0], ... */ + encoding = (0b10 << 30); + encoding |= ctx.opcode[(int)aco_opcode::s_add_u32] << 23; + encoding |= dest << 16; + encoding |= dest; + encoding |= 255 << 8; + out.push_back(encoding); + ctx.constaddrs.push_back(out.size()); + out.push_back(-(instr_offset + 4) + offset); + + /* s_addc_u32 dest[1], dest[1], 0 */ + encoding = (0b10 << 30); + encoding |= ctx.opcode[(int)aco_opcode::s_addc_u32] << 23; + encoding |= (dest + 1) << 16; + encoding |= dest + 1; + encoding |= 128 << 8; + out.push_back(encoding); + return; + } + + uint32_t opcode = ctx.opcode[(int)instr->opcode]; + if (opcode == (uint32_t)-1) { + fprintf(stderr, "Unsupported opcode: "); + aco_print_instr(instr, stderr); + abort(); + } + + switch (instr->format) { + case Format::SOP2: { + uint32_t encoding = (0b10 << 30); + encoding |= opcode << 23; + encoding |= !instr->definitions.empty() ? instr->definitions[0].physReg() << 16 : 0; + encoding |= instr->operands.size() >= 2 ? instr->operands[1].physReg() << 8 : 0; + encoding |= !instr->operands.empty() ? instr->operands[0].physReg() : 0; + out.push_back(encoding); + break; + } + case Format::SOPK: { + uint32_t encoding = (0b1011 << 28); + encoding |= opcode << 23; + encoding |= + !instr->definitions.empty() && !(instr->definitions[0].physReg() == scc) ? + instr->definitions[0].physReg() << 16 : + !instr->operands.empty() && !(instr->operands[0].physReg() == scc) ? + instr->operands[0].physReg() << 16 : 0; + encoding |= static_cast<SOPK_instruction*>(instr)->imm; + out.push_back(encoding); + break; + } + case Format::SOP1: { + uint32_t encoding = (0b101111101 << 23); + if (opcode >= 55 && ctx.chip_class <= GFX9) { + assert(ctx.chip_class == GFX9 && opcode < 60); + opcode = opcode - 4; + } + encoding |= !instr->definitions.empty() ? instr->definitions[0].physReg() << 16 : 0; + encoding |= opcode << 8; + encoding |= !instr->operands.empty() ? 
instr->operands[0].physReg() : 0; + out.push_back(encoding); + break; + } + case Format::SOPC: { + uint32_t encoding = (0b101111110 << 23); + encoding |= opcode << 16; + encoding |= instr->operands.size() == 2 ? instr->operands[1].physReg() << 8 : 0; + encoding |= !instr->operands.empty() ? instr->operands[0].physReg() : 0; + out.push_back(encoding); + break; + } + case Format::SOPP: { + SOPP_instruction* sopp = static_cast<SOPP_instruction*>(instr); + uint32_t encoding = (0b101111111 << 23); + encoding |= opcode << 16; + encoding |= (uint16_t) sopp->imm; + if (sopp->block != -1) + ctx.branches.insert({out.size(), sopp}); + out.push_back(encoding); + break; + } + case Format::SMEM: { + SMEM_instruction* smem = static_cast<SMEM_instruction*>(instr); + uint32_t encoding = (0b110000 << 26); + encoding |= opcode << 18; + if (instr->operands.size() >= 2) + encoding |= instr->operands[1].isConstant() ? 1 << 17 : 0; + bool soe = instr->operands.size() >= (!instr->definitions.empty() ? 3 : 4); + assert(!soe || ctx.chip_class >= GFX9); + encoding |= soe ? 1 << 14 : 0; + encoding |= smem->glc ? 1 << 16 : 0; + if (!instr->definitions.empty() || instr->operands.size() >= 3) + encoding |= (!instr->definitions.empty() ? instr->definitions[0].physReg() : instr->operands[2].physReg().reg) << 6; + if (instr->operands.size() >= 1) + encoding |= instr->operands[0].physReg() >> 1; + out.push_back(encoding); + encoding = 0; + if (instr->operands.size() >= 2) + encoding |= instr->operands[1].isConstant() ? instr->operands[1].constantValue() : instr->operands[1].physReg().reg; + encoding |= soe ? instr->operands.back().physReg() << 25 : 0; + out.push_back(encoding); + return; + } + case Format::VOP2: { + uint32_t encoding = 0; + encoding |= opcode << 25; + encoding |= (0xFF & instr->definitions[0].physReg().reg) << 17; + encoding |= (0xFF & instr->operands[1].physReg().reg) << 9; + encoding |= instr->operands[0].physReg().reg; + out.push_back(encoding); + break; + } + case Format::VOP1: { + uint32_t encoding = (0b0111111 << 25); + encoding |= (0xFF & instr->definitions[0].physReg().reg) << 17; + encoding |= opcode << 9; + encoding |= instr->operands[0].physReg().reg; + out.push_back(encoding); + break; + } + case Format::VOPC: { + uint32_t encoding = (0b0111110 << 25); + encoding |= opcode << 17; + encoding |= (0xFF & instr->operands[1].physReg().reg) << 9; + encoding |= instr->operands[0].physReg().reg; + out.push_back(encoding); + break; + } + case Format::VINTRP: { + Interp_instruction* interp = static_cast<Interp_instruction*>(instr); + uint32_t encoding = (0b110101 << 26); + encoding |= (0xFF & instr->definitions[0].physReg().reg) << 18; + encoding |= opcode << 16; + encoding |= interp->attribute << 10; + encoding |= interp->component << 8; + if (instr->opcode == aco_opcode::v_interp_mov_f32) + encoding |= (0x3 & instr->operands[0].constantValue()); + else + encoding |= (0xFF & instr->operands[0].physReg().reg); + out.push_back(encoding); + break; + } + case Format::DS: { + DS_instruction* ds = static_cast<DS_instruction*>(instr); + uint32_t encoding = (0b110110 << 26); + encoding |= opcode << 17; + encoding |= (ds->gds ? 1 : 0) << 16; + encoding |= ((0xFF & ds->offset1) << 8); + encoding |= (0xFFFF & ds->offset0); + out.push_back(encoding); + encoding = 0; + unsigned reg = !instr->definitions.empty() ? instr->definitions[0].physReg() : 0; + encoding |= (0xFF & reg) << 24; + reg = instr->operands.size() >= 3 && !(instr->operands[2].physReg() == m0) ? 
instr->operands[2].physReg() : 0; + encoding |= (0xFF & reg) << 16; + reg = instr->operands.size() >= 2 && !(instr->operands[1].physReg() == m0) ? instr->operands[1].physReg() : 0; + encoding |= (0xFF & reg) << 8; + encoding |= (0xFF & instr->operands[0].physReg().reg); + out.push_back(encoding); + break; + } + case Format::MUBUF: { + MUBUF_instruction* mubuf = static_cast<MUBUF_instruction*>(instr); + uint32_t encoding = (0b111000 << 26); + encoding |= opcode << 18; + encoding |= (mubuf->slc ? 1 : 0) << 17; + encoding |= (mubuf->lds ? 1 : 0) << 16; + encoding |= (mubuf->glc ? 1 : 0) << 14; + encoding |= (mubuf->idxen ? 1 : 0) << 13; + encoding |= (mubuf->offen ? 1 : 0) << 12; + encoding |= 0x0FFF & mubuf->offset; + out.push_back(encoding); + encoding = 0; + encoding |= instr->operands[2].physReg() << 24; + encoding |= (mubuf->tfe ? 1 : 0) << 23; + encoding |= (instr->operands[1].physReg() >> 2) << 16; + unsigned reg = instr->operands.size() > 3 ? instr->operands[3].physReg() : instr->definitions[0].physReg().reg; + encoding |= (0xFF & reg) << 8; + encoding |= (0xFF & instr->operands[0].physReg().reg); + out.push_back(encoding); + break; + } + case Format::MTBUF: { + MTBUF_instruction* mtbuf = static_cast<MTBUF_instruction*>(instr); + uint32_t encoding = (0b111010 << 26); + encoding |= opcode << 15; + encoding |= (mtbuf->glc ? 1 : 0) << 14; + encoding |= (mtbuf->idxen ? 1 : 0) << 13; + encoding |= (mtbuf->offen ? 1 : 0) << 12; + encoding |= 0x0FFF & mtbuf->offset; + encoding |= (0xF & mtbuf->dfmt) << 19; + encoding |= (0x7 & mtbuf->nfmt) << 23; + out.push_back(encoding); + encoding = 0; + encoding |= instr->operands[2].physReg().reg << 24; + encoding |= (mtbuf->tfe ? 1 : 0) << 23; + encoding |= (mtbuf->slc ? 1 : 0) << 22; + encoding |= (instr->operands[1].physReg().reg >> 2) << 16; + unsigned reg = instr->operands.size() > 3 ? instr->operands[3].physReg().reg : instr->definitions[0].physReg().reg; + encoding |= (0xFF & reg) << 8; + encoding |= (0xFF & instr->operands[0].physReg().reg); + out.push_back(encoding); + break; + } + case Format::MIMG: { + MIMG_instruction* mimg = static_cast<MIMG_instruction*>(instr); + uint32_t encoding = (0b111100 << 26); + encoding |= mimg->slc ? 1 << 25 : 0; + encoding |= opcode << 18; + encoding |= mimg->lwe ? 1 << 17 : 0; + encoding |= mimg->tfe ? 1 << 16 : 0; + encoding |= mimg->r128 ? 1 << 15 : 0; + encoding |= mimg->da ? 1 << 14 : 0; + encoding |= mimg->glc ? 1 << 13 : 0; + encoding |= mimg->unrm ? 1 << 12 : 0; + encoding |= (0xF & mimg->dmask) << 8; + out.push_back(encoding); + encoding = (0xFF & instr->operands[0].physReg().reg); /* VADDR */ + if (!instr->definitions.empty()) { + encoding |= (0xFF & instr->definitions[0].physReg().reg) << 8; /* VDATA */ + } else if (instr->operands.size() == 4) { + encoding |= (0xFF & instr->operands[3].physReg().reg) << 8; /* VDATA */ + } + encoding |= (0x1F & (instr->operands[1].physReg() >> 2)) << 16; /* T# (resource) */ + if (instr->operands.size() > 2) + encoding |= (0x1F & (instr->operands[2].physReg() >> 2)) << 21; /* sampler */ + // TODO VEGA: D16 + out.push_back(encoding); + break; + } + case Format::FLAT: + case Format::SCRATCH: + case Format::GLOBAL: { + FLAT_instruction *flat = static_cast<FLAT_instruction*>(instr); + uint32_t encoding = (0b110111 << 26); + encoding |= opcode << 18; + encoding |= flat->offset & 0x1fff; + if (instr->format == Format::SCRATCH) + encoding |= 1 << 14; + else if (instr->format == Format::GLOBAL) + encoding |= 2 << 14; + encoding |= flat->lds ? 
1 << 13 : 0; + encoding |= flat->glc ? 1 << 13 : 0; + encoding |= flat->slc ? 1 << 13 : 0; + out.push_back(encoding); + encoding = (0xFF & instr->operands[0].physReg().reg); + if (!instr->definitions.empty()) + encoding |= (0xFF & instr->definitions[0].physReg().reg) << 24; + else + encoding |= (0xFF & instr->operands[2].physReg().reg) << 8; + if (!instr->operands[1].isUndefined()) { + assert(instr->operands[1].physReg() != 0x7f); + assert(instr->format != Format::FLAT); + encoding |= instr->operands[1].physReg() << 16; + } else if (instr->format != Format::FLAT) { + encoding |= 0x7F << 16; + } + encoding |= flat->nv ? 1 << 23 : 0; + out.push_back(encoding); + break; + } + case Format::EXP: { + Export_instruction* exp = static_cast<Export_instruction*>(instr); + uint32_t encoding = (0b110001 << 26); + encoding |= exp->valid_mask ? 0b1 << 12 : 0; + encoding |= exp->done ? 0b1 << 11 : 0; + encoding |= exp->compressed ? 0b1 << 10 : 0; + encoding |= exp->dest << 4; + encoding |= exp->enabled_mask; + out.push_back(encoding); + encoding = 0xFF & exp->operands[0].physReg().reg; + encoding |= (0xFF & exp->operands[1].physReg().reg) << 8; + encoding |= (0xFF & exp->operands[2].physReg().reg) << 16; + encoding |= (0xFF & exp->operands[3].physReg().reg) << 24; + out.push_back(encoding); + break; + } + case Format::PSEUDO: + case Format::PSEUDO_BARRIER: + unreachable("Pseudo instructions should be lowered before assembly."); + default: + if ((uint16_t) instr->format & (uint16_t) Format::VOP3A) { + VOP3A_instruction* vop3 = static_cast<VOP3A_instruction*>(instr); + + if ((uint16_t) instr->format & (uint16_t) Format::VOP2) + opcode = opcode + 0x100; + else if ((uint16_t) instr->format & (uint16_t) Format::VOP1) + opcode = opcode + 0x140; + else if ((uint16_t) instr->format & (uint16_t) Format::VOPC) + opcode = opcode + 0x0; + else if ((uint16_t) instr->format & (uint16_t) Format::VINTRP) + opcode = opcode + 0x270; + + // TODO: op_sel + uint32_t encoding = (0b110100 << 26); + encoding |= opcode << 16; + encoding |= (vop3->clamp ? 
1 : 0) << 15; + for (unsigned i = 0; i < 3; i++) + encoding |= vop3->abs[i] << (8+i); + if (instr->definitions.size() == 2) + encoding |= instr->definitions[1].physReg() << 8; + encoding |= (0xFF & instr->definitions[0].physReg().reg); + out.push_back(encoding); + encoding = 0; + if (instr->opcode == aco_opcode::v_interp_mov_f32) { + encoding = 0x3 & instr->operands[0].constantValue(); + } else { + for (unsigned i = 0; i < instr->operands.size(); i++) + encoding |= instr->operands[i].physReg() << (i * 9); + } + encoding |= vop3->omod << 27; + for (unsigned i = 0; i < 3; i++) + encoding |= vop3->neg[i] << (29+i); + out.push_back(encoding); + return; + + } else if (instr->isDPP()){ + /* first emit the instruction without the DPP operand */ + Operand dpp_op = instr->operands[0]; + instr->operands[0] = Operand(PhysReg{250}, v1); + instr->format = (Format) ((uint32_t) instr->format & ~(1 << 14)); + emit_instruction(ctx, out, instr); + DPP_instruction* dpp = static_cast<DPP_instruction*>(instr); + uint32_t encoding = (0xF & dpp->row_mask) << 28; + encoding |= (0xF & dpp->bank_mask) << 24; + encoding |= dpp->abs[1] << 23; + encoding |= dpp->neg[1] << 22; + encoding |= dpp->abs[0] << 21; + encoding |= dpp->neg[0] << 20; + encoding |= dpp->bound_ctrl << 19; + encoding |= dpp->dpp_ctrl << 8; + encoding |= (0xFF) & dpp_op.physReg().reg; + out.push_back(encoding); + return; + } else { + unreachable("unimplemented instruction format"); + } + } + + /* append literal dword */ + for (const Operand& op : instr->operands) { + if (op.isLiteral()) { + out.push_back(op.constantValue()); + break; + } + } +} + +void emit_block(asm_context& ctx, std::vector<uint32_t>& out, Block& block) +{ + for (aco_ptr<Instruction>& instr : block.instructions) { +#if 0 + int start_idx = out.size(); + std::cerr << "Encoding:\t" << std::endl; + aco_print_instr(&*instr, stderr); + std::cerr << std::endl; +#endif + emit_instruction(ctx, out, instr.get()); +#if 0 + for (int i = start_idx; i < out.size(); i++) + std::cerr << "encoding: " << "0x" << std::setfill('0') << std::setw(8) << std::hex << out[i] << std::endl; +#endif + } +} + +void fix_exports(asm_context& ctx, std::vector<uint32_t>& out, Program* program) +{ + for (int idx = program->blocks.size() - 1; idx >= 0; idx--) { + Block& block = program->blocks[idx]; + std::vector<aco_ptr<Instruction>>::reverse_iterator it = block.instructions.rbegin(); + bool endBlock = false; + bool exported = false; + while ( it != block.instructions.rend()) + { + if ((*it)->format == Format::EXP && endBlock) { + Export_instruction* exp = static_cast<Export_instruction*>((*it).get()); + if (program->stage & hw_vs) { + if (exp->dest >= V_008DFC_SQ_EXP_POS && exp->dest <= (V_008DFC_SQ_EXP_POS + 3)) { + exp->done = true; + exported = true; + break; + } + } else { + exp->done = true; + exp->valid_mask = true; + exported = true; + break; + } + } else if ((*it)->definitions.size() && (*it)->definitions[0].physReg() == exec) + break; + else if ((*it)->opcode == aco_opcode::s_endpgm) { + if (endBlock) + break; + endBlock = true; + } + ++it; + } + if (!endBlock || exported) + continue; + /* we didn't find an Export instruction and have to insert a null export */ + aco_ptr<Export_instruction> exp{create_instruction<Export_instruction>(aco_opcode::exp, Format::EXP, 4, 0)}; + for (unsigned i = 0; i < 4; i++) + exp->operands[i] = Operand(v1); + exp->enabled_mask = 0; + exp->compressed = false; + exp->done = true; + exp->valid_mask = program->stage & hw_fs; + if (program->stage & hw_fs) + exp->dest = 9; /* 
NULL */ + else + exp->dest = V_008DFC_SQ_EXP_POS; + /* insert the null export 1 instruction before endpgm */ + block.instructions.insert(block.instructions.end() - 1, std::move(exp)); + } +} + +void fix_branches(asm_context& ctx, std::vector<uint32_t>& out) +{ + for (std::pair<int, SOPP_instruction*> branch : ctx.branches) + { + int offset = (int)ctx.program->blocks[branch.second->block].offset - branch.first - 1; + out[branch.first] |= (uint16_t) offset; + } +} + +void fix_constaddrs(asm_context& ctx, std::vector<uint32_t>& out) +{ + for (unsigned addr : ctx.constaddrs) + out[addr] += out.size() * 4u; +} + +unsigned emit_program(Program* program, + std::vector<uint32_t>& code) +{ + asm_context ctx(program); + + if (program->stage & (hw_vs | hw_fs)) + fix_exports(ctx, code, program); + + for (Block& block : program->blocks) { + block.offset = code.size(); + emit_block(ctx, code, block); + } + + fix_branches(ctx, code); + fix_constaddrs(ctx, code); + + unsigned constant_data_offset = code.size() * sizeof(uint32_t); + while (program->constant_data.size() % 4u) + program->constant_data.push_back(0); + /* Copy constant data */ + code.insert(code.end(), (uint32_t*)program->constant_data.data(), + (uint32_t*)(program->constant_data.data() + program->constant_data.size())); + + return constant_data_offset; +} + +} diff --git a/src/amd/compiler/aco_builder_h.py b/src/amd/compiler/aco_builder_h.py new file mode 100644 index 00000000000..7be3a664c4a --- /dev/null +++ b/src/amd/compiler/aco_builder_h.py @@ -0,0 +1,400 @@ + +template = """\ +/* + * Copyright (c) 2019 Valve Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ * + * This file was generated by aco_builder_h.py + */ + +#ifndef _ACO_BUILDER_ +#define _ACO_BUILDER_ + +#include "aco_ir.h" +#include "util/u_math.h" +#include "util/bitscan.h" + +namespace aco { +enum dpp_ctrl { + _dpp_quad_perm = 0x000, + _dpp_row_sl = 0x100, + _dpp_row_sr = 0x110, + _dpp_row_rr = 0x120, + dpp_wf_sl1 = 0x130, + dpp_wf_rl1 = 0x134, + dpp_wf_sr1 = 0x138, + dpp_wf_rr1 = 0x13C, + dpp_row_mirror = 0x140, + dpp_row_half_mirror = 0x141, + dpp_row_bcast15 = 0x142, + dpp_row_bcast31 = 0x143 +}; + +inline dpp_ctrl +dpp_quad_perm(unsigned lane0, unsigned lane1, unsigned lane2, unsigned lane3) +{ + assert(lane0 < 4 && lane1 < 4 && lane2 < 4 && lane3 < 4); + return (dpp_ctrl)(lane0 | (lane1 << 2) | (lane2 << 4) | (lane3 << 6)); +} + +inline dpp_ctrl +dpp_row_sl(unsigned amount) +{ + assert(amount > 0 && amount < 16); + return (dpp_ctrl)(((unsigned) _dpp_row_sl) | amount); +} + +inline dpp_ctrl +dpp_row_sr(unsigned amount) +{ + assert(amount > 0 && amount < 16); + return (dpp_ctrl)(((unsigned) _dpp_row_sr) | amount); +} + +inline unsigned +ds_pattern_bitmode(unsigned and_mask, unsigned or_mask, unsigned xor_mask) +{ + assert(and_mask < 32 && or_mask < 32 && xor_mask < 32); + return and_mask | (or_mask << 5) | (xor_mask << 10); +} + +aco_ptr<Instruction> create_s_mov(Definition dst, Operand src); + +class Builder { +public: + struct Result { + Instruction *instr; + + Result(Instruction *instr) : instr(instr) {} + + operator Instruction *() const { + return instr; + } + + operator Temp() const { + return instr->definitions[0].getTemp(); + } + + operator Operand() const { + return Operand((Temp)*this); + } + + Definition& def(unsigned index) const { + return instr->definitions[index]; + } + + aco_ptr<Instruction> get_ptr() const { + return aco_ptr<Instruction>(instr); + } + }; + + struct Op { + Operand op; + Op(Temp tmp) : op(tmp) {} + Op(Operand op_) : op(op_) {} + Op(Result res) : op((Temp)res) {} + }; + + Program *program; + bool use_iterator; + union { + bool forwards; //when use_iterator == true + bool start; //when use_iterator == false + }; + std::vector<aco_ptr<Instruction>> *instructions; + std::vector<aco_ptr<Instruction>>::iterator it; + + Builder(Program *pgm) : program(pgm), use_iterator(false), start(false), instructions(NULL) {} + Builder(Program *pgm, Block *block) : program(pgm), use_iterator(false), start(false), instructions(&block->instructions) {} + Builder(Program *pgm, std::vector<aco_ptr<Instruction>> *instrs) : program(pgm), use_iterator(false), start(false), instructions(instrs) {} + + void moveEnd(Block *block) { + instructions = &block->instructions; + } + + void reset() { + use_iterator = false; + start = false; + instructions = NULL; + } + + void reset(Block *block) { + use_iterator = false; + start = false; + instructions = &block->instructions; + } + + void reset(std::vector<aco_ptr<Instruction>> *instrs) { + use_iterator = false; + start = false; + instructions = instrs; + } + + Result insert(aco_ptr<Instruction> instr) { + Instruction *instr_ptr = instr.get(); + if (instructions) { + if (use_iterator) { + it = instructions->emplace(it, std::move(instr)); + if (forwards) + it = std::next(it); + } else if (!start) { + instructions->emplace_back(std::move(instr)); + } else { + instructions->emplace(instructions->begin(), std::move(instr)); + } + } + return Result(instr_ptr); + } + + Result insert(Instruction* instr) { + if (instructions) { + if (use_iterator) { + it = instructions->emplace(it, aco_ptr<Instruction>(instr)); + if (forwards) + it = 
std::next(it); + } else if (!start) { + instructions->emplace_back(aco_ptr<Instruction>(instr)); + } else { + instructions->emplace(instructions->begin(), aco_ptr<Instruction>(instr)); + } + } + return Result(instr); + } + + Temp tmp(RegClass rc) { + return (Temp){program->allocateId(), rc}; + } + + Temp tmp(RegType type, unsigned size) { + return (Temp){program->allocateId(), RegClass(type, size)}; + } + + Definition def(RegClass rc) { + return Definition((Temp){program->allocateId(), rc}); + } + + Definition def(RegType type, unsigned size) { + return Definition((Temp){program->allocateId(), RegClass(type, size)}); + } + + Definition def(RegClass rc, PhysReg reg) { + return Definition(program->allocateId(), reg, rc); + } + +% for fixed in ['m0', 'vcc', 'exec', 'scc']: + Operand ${fixed}(Temp tmp) { + Operand op(tmp); + op.setFixed(aco::${fixed}); + return op; + } + + Definition ${fixed}(Definition def) { + def.setFixed(aco::${fixed}); + return def; + } + + Definition hint_${fixed}(Definition def) { + def.setHint(aco::${fixed}); + return def; + } + +% endfor + /* hand-written helpers */ + Temp as_uniform(Op op) + { + assert(op.op.isTemp()); + if (op.op.getTemp().type() == RegType::vgpr) + return pseudo(aco_opcode::p_as_uniform, def(RegType::sgpr, op.op.size()), op); + else + return op.op.getTemp(); + } + + Result v_mul_imm(Definition dst, Temp tmp, uint32_t imm, bool bits24=false) + { + assert(tmp.type() == RegType::vgpr); + if (imm == 0) { + return vop1(aco_opcode::v_mov_b32, dst, Operand(0u)); + } else if (imm == 1) { + return copy(dst, Operand(tmp)); + } else if (util_is_power_of_two_or_zero(imm)) { + return vop2(aco_opcode::v_lshlrev_b32, dst, Operand((uint32_t)ffs(imm) - 1u), tmp); + } else if (bits24) { + return vop2(aco_opcode::v_mul_u32_u24, dst, Operand(imm), tmp); + } else { + Temp imm_tmp = copy(def(v1), Operand(imm)); + return vop3(aco_opcode::v_mul_lo_u32, dst, imm_tmp, tmp); + } + } + + Result v_mul24_imm(Definition dst, Temp tmp, uint32_t imm) + { + return v_mul_imm(dst, tmp, imm, true); + } + + Result copy(Definition dst, Op op_) { + Operand op = op_.op; + if (dst.regClass() == s1 && op.size() == 1 && op.isLiteral()) { + uint32_t imm = op.constantValue(); + if (imm >= 0xffff8000 || imm <= 0x7fff) { + return sopk(aco_opcode::s_movk_i32, dst, imm & 0xFFFFu); + } else if (util_bitreverse(imm) <= 64 || util_bitreverse(imm) >= 0xFFFFFFF0) { + uint32_t rev = util_bitreverse(imm); + return dst.regClass() == v1 ? 
+ vop1(aco_opcode::v_bfrev_b32, dst, Operand(rev)) : + sop1(aco_opcode::s_brev_b32, dst, Operand(rev)); + } else if (imm != 0) { + unsigned start = (ffs(imm) - 1) & 0x1f; + unsigned size = util_bitcount(imm) & 0x1f; + if ((((1u << size) - 1u) << start) == imm) + return sop2(aco_opcode::s_bfm_b32, dst, Operand(size), Operand(start)); + } + } + + if (dst.regClass() == s2) { + return sop1(aco_opcode::s_mov_b64, dst, op); + } else if (op.size() > 1) { + return pseudo(aco_opcode::p_create_vector, dst, op); + } else if (dst.regClass() == v1 || dst.regClass() == v1.as_linear()) { + return vop1(aco_opcode::v_mov_b32, dst, op); + } else { + assert(dst.regClass() == s1); + return sop1(aco_opcode::s_mov_b32, dst, op); + } + } + + Result vadd32(Definition dst, Op a, Op b, bool carry_out=false, Op carry_in=Op(Operand(s2))) { + if (!b.op.isTemp() || b.op.regClass().type() != RegType::vgpr) + std::swap(a, b); + assert(b.op.isTemp() && b.op.regClass().type() == RegType::vgpr); + + if (!carry_in.op.isUndefined()) + return vop2(aco_opcode::v_addc_co_u32, Definition(dst), hint_vcc(def(s2)), a, b, carry_in); + else if (program->chip_class < GFX9 || carry_out) + return vop2(aco_opcode::v_add_co_u32, Definition(dst), hint_vcc(def(s2)), a, b); + else + return vop2(aco_opcode::v_add_u32, Definition(dst), a, b); + } + + Result vsub32(Definition dst, Op a, Op b, bool carry_out=false, Op borrow=Op(Operand(s2))) + { + if (!borrow.op.isUndefined() || program->chip_class < GFX9) + carry_out = true; + + bool reverse = !b.op.isTemp() || b.op.regClass().type() != RegType::vgpr; + if (reverse) + std::swap(a, b); + assert(b.op.isTemp() && b.op.regClass().type() == RegType::vgpr); + + aco_opcode op; + Temp carry; + if (carry_out) { + carry = tmp(s2); + if (borrow.op.isUndefined()) + op = reverse ? aco_opcode::v_subrev_co_u32 : aco_opcode::v_sub_co_u32; + else + op = reverse ? aco_opcode::v_subbrev_co_u32 : aco_opcode::v_subb_co_u32; + } else { + op = reverse ? aco_opcode::v_subrev_u32 : aco_opcode::v_sub_u32; + } + + int num_ops = borrow.op.isUndefined() ? 2 : 3; + int num_defs = carry_out ? 2 : 1; + aco_ptr<Instruction> sub{create_instruction<VOP2_instruction>(op, Format::VOP2, num_ops, num_defs)}; + sub->operands[0] = a.op; + sub->operands[1] = b.op; + if (!borrow.op.isUndefined()) + sub->operands[2] = borrow.op; + sub->definitions[0] = dst; + if (carry_out) { + sub->definitions[1] = Definition(carry); + sub->definitions[1].setHint(aco::vcc); + } + return insert(std::move(sub)); + } +<% +import itertools +formats = [("pseudo", [Format.PSEUDO], 'Pseudo_instruction', list(itertools.product(range(5), range(5))) + [(8, 1), (1, 8)]), + ("sop1", [Format.SOP1], 'SOP1_instruction', [(1, 1), (2, 1), (3, 2)]), + ("sop2", [Format.SOP2], 'SOP2_instruction', itertools.product([1, 2], [2, 3])), + ("sopk", [Format.SOPK], 'SOPK_instruction', itertools.product([0, 1, 2], [0, 1])), + ("sopp", [Format.SOPP], 'SOPP_instruction', [(0, 0), (0, 1)]), + ("sopc", [Format.SOPC], 'SOPC_instruction', [(1, 2)]), + ("smem", [Format.SMEM], 'SMEM_instruction', [(0, 4), (0, 3), (1, 0), (1, 3), (1, 2), (0, 0)]), + ("ds", [Format.DS], 'DS_instruction', [(1, 1), (1, 2), (0, 3), (0, 4)]), + ("mubuf", [Format.MUBUF], 'MUBUF_instruction', [(0, 4), (1, 3)]), + ("mimg", [Format.MIMG], 'MIMG_instruction', [(0, 4), (1, 3), (0, 3), (1, 2)]), #TODO(pendingchaos): less shapes? 
+ ("exp", [Format.EXP], 'Export_instruction', [(0, 4)]), + ("branch", [Format.PSEUDO_BRANCH], 'Pseudo_branch_instruction', itertools.product([0], [0, 1])), + ("barrier", [Format.PSEUDO_BARRIER], 'Pseudo_barrier_instruction', [(0, 0)]), + ("reduction", [Format.PSEUDO_REDUCTION], 'Pseudo_reduction_instruction', [(3, 2)]), + ("vop1", [Format.VOP1], 'VOP1_instruction', [(1, 1), (2, 2)]), + ("vop2", [Format.VOP2], 'VOP2_instruction', itertools.product([1, 2], [2, 3])), + ("vopc", [Format.VOPC], 'VOPC_instruction', itertools.product([1, 2], [2])), + ("vop3", [Format.VOP3A], 'VOP3A_instruction', [(1, 3), (1, 2), (1, 1), (2, 2)]), + ("vintrp", [Format.VINTRP], 'Interp_instruction', [(1, 2), (1, 3)]), + ("vop1_dpp", [Format.VOP1, Format.DPP], 'DPP_instruction', [(1, 1)]), + ("vop2_dpp", [Format.VOP2, Format.DPP], 'DPP_instruction', itertools.product([1, 2], [2, 3])), + ("vopc_dpp", [Format.VOPC, Format.DPP], 'DPP_instruction', itertools.product([1, 2], [2])), + ("vop1_e64", [Format.VOP1, Format.VOP3A], 'VOP3A_instruction', itertools.product([1], [1])), + ("vop2_e64", [Format.VOP2, Format.VOP3A], 'VOP3A_instruction', itertools.product([1, 2], [2, 3])), + ("vopc_e64", [Format.VOPC, Format.VOP3A], 'VOP3A_instruction', itertools.product([1, 2], [2])), + ("flat", [Format.FLAT], 'FLAT_instruction', [(0, 3), (1, 2)]), + ("global", [Format.GLOBAL], 'FLAT_instruction', [(0, 3), (1, 2)])] +%>\\ +% for name, formats, struct, shapes in formats: + % for num_definitions, num_operands in shapes: + <% + args = ['aco_opcode opcode'] + for i in range(num_definitions): + args.append('Definition def%d' % i) + for i in range(num_operands): + args.append('Op op%d' % i) + for f in formats: + args += f.get_builder_field_decls() + %>\\ + + Result ${name}(${', '.join(args)}) + { + ${struct} *instr = create_instruction<${struct}>(opcode, (Format)(${'|'.join('(int)Format::%s' % f.name for f in formats)}), ${num_operands}, ${num_definitions}); + % for i in range(num_definitions): + instr->definitions[${i}] = def${i}; + % endfor + % for i in range(num_operands): + instr->operands[${i}] = op${i}.op; + % endfor + % for f in formats: + % for dest, field_name in zip(f.get_builder_field_dests(), f.get_builder_field_names()): + instr->${dest} = ${field_name}; + % endfor + % endfor + return insert(instr); + } + % endfor +% endfor +}; + +} +#endif /* _ACO_BUILDER_ */""" + +from aco_opcodes import opcodes, Format +from mako.template import Template + +print(Template(template).render(opcodes=opcodes, Format=Format)) diff --git a/src/amd/compiler/aco_dead_code_analysis.cpp b/src/amd/compiler/aco_dead_code_analysis.cpp new file mode 100644 index 00000000000..f56718f0479 --- /dev/null +++ b/src/amd/compiler/aco_dead_code_analysis.cpp @@ -0,0 +1,102 @@ +/* + * Copyright © 2019 Valve Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + */ + +#include "aco_ir.h" + +#include <algorithm> + +/* + * Implements an analysis pass to determine the number of uses + * for each SSA-definition. + */ + +namespace aco { +namespace { + +struct dce_ctx { + int current_block; + std::vector<uint16_t> uses; + std::vector<std::vector<bool>> live; + + dce_ctx(Program* program) : current_block(program->blocks.size() - 1), uses(program->peekAllocationId()) + { + live.reserve(program->blocks.size()); + for (Block& block : program->blocks) + live.emplace_back(block.instructions.size()); + } +}; + +void process_block(dce_ctx& ctx, Block& block) +{ + std::vector<bool>& live = ctx.live[block.index]; + assert(live.size() == block.instructions.size()); + bool process_predecessors = false; + for (int idx = block.instructions.size() - 1; idx >= 0; idx--) { + if (live[idx]) + continue; + + aco_ptr<Instruction>& instr = block.instructions[idx]; + const bool is_live = instr->definitions.empty() || + std::any_of(instr->definitions.begin(), instr->definitions.end(), + [&ctx] (const Definition& def) { return !def.isTemp() || ctx.uses[def.tempId()];}); + + if (is_live) { + for (const Operand& op : instr->operands) { + if (op.isTemp()) { + if (ctx.uses[op.tempId()] == 0) + process_predecessors = true; + ctx.uses[op.tempId()]++; + } + } + live[idx] = true; + } + } + + if (process_predecessors) { + for (unsigned pred_idx : block.linear_preds) + ctx.current_block = std::max(ctx.current_block, (int) pred_idx); + } +} + +} /* end namespace */ + +std::vector<uint16_t> dead_code_analysis(Program *program) { + + dce_ctx ctx(program); + + while (ctx.current_block >= 0) { + unsigned next_block = ctx.current_block--; + process_block(ctx, program->blocks[next_block]); + } + + /* add one use to exec to prevent startpgm from being removed */ + aco_ptr<Instruction>& startpgm = program->blocks[0].instructions[0]; + assert(startpgm->opcode == aco_opcode::p_startpgm); + ctx.uses[startpgm->definitions.back().tempId()]++; + + return ctx.uses; +} + +} + diff --git a/src/amd/compiler/aco_dominance.cpp b/src/amd/compiler/aco_dominance.cpp new file mode 100644 index 00000000000..de5549eec1a --- /dev/null +++ b/src/amd/compiler/aco_dominance.cpp @@ -0,0 +1,93 @@ +/* + * Copyright © 2018 Valve Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + * Authors: + * Daniel Schürmann ([email protected]) + * + */ + +#ifndef ACO_DOMINANCE_CPP +#define ACO_DOMINANCE_CPP + +#include "aco_ir.h" + +/* + * Implements the algorithms for computing the dominator tree from + * "A Simple, Fast Dominance Algorithm" by Cooper, Harvey, and Kennedy. + * + * Different from the paper, our CFG allows to compute the dominator tree + * in a single pass as it is guaranteed that the dominating predecessors + * are processed before the current block. + */ + +namespace aco { + +void dominator_tree(Program* program) +{ + program->blocks[0].logical_idom = 0; + program->blocks[0].linear_idom = 0; + + for (unsigned i = 1; i < program->blocks.size(); i++) { + Block& block = program->blocks[i]; + int new_logical_idom = -1; + int new_linear_idom = -1; + for (unsigned pred_idx : block.logical_preds) { + if ((int) program->blocks[pred_idx].logical_idom == -1) + continue; + + if (new_logical_idom == -1) { + new_logical_idom = pred_idx; + continue; + } + + while ((int) pred_idx != new_logical_idom) { + if ((int) pred_idx > new_logical_idom) + pred_idx = program->blocks[pred_idx].logical_idom; + if ((int) pred_idx < new_logical_idom) + new_logical_idom = program->blocks[new_logical_idom].logical_idom; + } + } + + for (unsigned pred_idx : block.linear_preds) { + if ((int) program->blocks[pred_idx].linear_idom == -1) + continue; + + if (new_linear_idom == -1) { + new_linear_idom = pred_idx; + continue; + } + + while ((int) pred_idx != new_linear_idom) { + if ((int) pred_idx > new_linear_idom) + pred_idx = program->blocks[pred_idx].linear_idom; + if ((int) pred_idx < new_linear_idom) + new_linear_idom = program->blocks[new_linear_idom].linear_idom; + } + } + + block.logical_idom = new_logical_idom; + block.linear_idom = new_linear_idom; + } +} + +} +#endif diff --git a/src/amd/compiler/aco_insert_NOPs.cpp b/src/amd/compiler/aco_insert_NOPs.cpp new file mode 100644 index 00000000000..fea1364072e --- /dev/null +++ b/src/amd/compiler/aco_insert_NOPs.cpp @@ -0,0 +1,282 @@ +/* + * Copyright © 2019 Valve Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + */ + +#include "aco_ir.h" + +namespace aco { +namespace { + +struct NOP_ctx { + /* just initialize these with something less than max NOPs */ + int VALU_wrexec = -10; + int VALU_wrvcc = -10; + int VALU_wrsgpr = -10; + enum chip_class chip_class; + unsigned vcc_physical; + NOP_ctx(Program* program) : chip_class(program->chip_class) { + vcc_physical = program->config->num_sgprs - 2; + } +}; + +bool VALU_writes_sgpr(aco_ptr<Instruction>& instr) +{ + if ((uint32_t) instr->format & (uint32_t) Format::VOPC) + return true; + if (instr->isVOP3() && instr->definitions.size() == 2) + return true; + if (instr->opcode == aco_opcode::v_readfirstlane_b32 || instr->opcode == aco_opcode::v_readlane_b32) + return true; + return false; +} + +bool regs_intersect(PhysReg a_reg, unsigned a_size, PhysReg b_reg, unsigned b_size) +{ + return a_reg > b_reg ? + (a_reg - b_reg < b_size) : + (b_reg - a_reg < a_size); +} + +int handle_instruction(NOP_ctx& ctx, aco_ptr<Instruction>& instr, + std::vector<aco_ptr<Instruction>>& old_instructions, + std::vector<aco_ptr<Instruction>>& new_instructions) +{ + int new_idx = new_instructions.size(); + + // TODO: setreg / getreg / m0 writes + // TODO: try to schedule the NOP-causing instruction up to reduce the number of stall cycles + + /* break off from prevous SMEM clause if needed */ + if (instr->format == Format::SMEM && ctx.chip_class >= GFX8) { + const bool is_store = instr->definitions.empty(); + for (int pred_idx = new_idx - 1; pred_idx >= 0; pred_idx--) { + aco_ptr<Instruction>& pred = new_instructions[pred_idx]; + if (pred->format != Format::SMEM) + break; + + /* Don't allow clauses with store instructions since the clause's + * instructions may use the same address. */ + if (is_store || pred->definitions.empty()) + return 1; + + Definition& instr_def = instr->definitions[0]; + Definition& pred_def = pred->definitions[0]; + + /* ISA reference doesn't say anything about this, but best to be safe */ + if (regs_intersect(instr_def.physReg(), instr_def.size(), pred_def.physReg(), pred_def.size())) + return 1; + + for (const Operand& op : pred->operands) { + if (op.isConstant() || !op.isFixed()) + continue; + if (regs_intersect(instr_def.physReg(), instr_def.size(), op.physReg(), op.size())) + return 1; + } + for (const Operand& op : instr->operands) { + if (op.isConstant() || !op.isFixed()) + continue; + if (regs_intersect(pred_def.physReg(), pred_def.size(), op.physReg(), op.size())) + return 1; + } + } + } else if (instr->isVALU() || instr->format == Format::VINTRP) { + int NOPs = 0; + + if (instr->isDPP()) { + /* VALU does not forward EXEC to DPP. 
*/ + if (ctx.VALU_wrexec + 5 >= new_idx) + NOPs = 5 + ctx.VALU_wrexec - new_idx + 1; + + /* VALU DPP reads VGPR written by VALU */ + for (int pred_idx = new_idx - 1; pred_idx >= 0 && pred_idx >= new_idx - 2; pred_idx--) { + aco_ptr<Instruction>& pred = new_instructions[pred_idx]; + if ((pred->isVALU() || pred->format == Format::VINTRP) && + !pred->definitions.empty() && + pred->definitions[0].physReg() == instr->operands[0].physReg()) { + NOPs = std::max(NOPs, 2 + pred_idx - new_idx + 1); + break; + } + } + } + + /* SALU writes M0 */ + if (instr->format == Format::VINTRP && new_idx > 0 && ctx.chip_class >= GFX9) { + aco_ptr<Instruction>& pred = new_instructions.back(); + if (pred->isSALU() && + !pred->definitions.empty() && + pred->definitions[0].physReg() == m0) + NOPs = std::max(NOPs, 1); + } + + for (const Operand& op : instr->operands) { + /* VALU which uses VCCZ */ + if (op.physReg() == PhysReg{251} && + ctx.VALU_wrvcc + 5 >= new_idx) + NOPs = std::max(NOPs, 5 + ctx.VALU_wrvcc - new_idx + 1); + + /* VALU which uses EXECZ */ + if (op.physReg() == PhysReg{252} && + ctx.VALU_wrexec + 5 >= new_idx) + NOPs = std::max(NOPs, 5 + ctx.VALU_wrexec - new_idx + 1); + + /* VALU which reads VCC as a constant */ + if (ctx.VALU_wrvcc + 1 >= new_idx) { + for (unsigned k = 0; k < op.size(); k++) { + unsigned reg = op.physReg() + k; + if (reg == ctx.vcc_physical || reg == ctx.vcc_physical + 1) + NOPs = std::max(NOPs, 1); + } + } + } + + switch (instr->opcode) { + case aco_opcode::v_readlane_b32: + case aco_opcode::v_writelane_b32: { + if (ctx.VALU_wrsgpr + 4 < new_idx) + break; + PhysReg reg = instr->operands[1].physReg(); + for (int pred_idx = new_idx - 1; pred_idx >= 0 && pred_idx >= new_idx - 4; pred_idx--) { + aco_ptr<Instruction>& pred = new_instructions[pred_idx]; + if (!pred->isVALU() || !VALU_writes_sgpr(pred)) + continue; + for (const Definition& def : pred->definitions) { + if (def.physReg() == reg) + NOPs = std::max(NOPs, 4 + pred_idx - new_idx + 1); + } + } + break; + } + case aco_opcode::v_div_fmas_f32: + case aco_opcode::v_div_fmas_f64: { + if (ctx.VALU_wrvcc + 4 >= new_idx) + NOPs = std::max(NOPs, 4 + ctx.VALU_wrvcc - new_idx + 1); + break; + } + default: + break; + } + + /* Write VGPRs holding writedata > 64 bit from MIMG/MUBUF instructions */ + // FIXME: handle case if the last instruction of a block without branch is such store + // TODO: confirm that DS instructions cannot cause WAR hazards here + if (new_idx > 0) { + aco_ptr<Instruction>& pred = new_instructions.back(); + if (pred->isVMEM() && + pred->operands.size() == 4 && + pred->operands[3].size() > 2 && + pred->operands[1].size() != 8 && + (pred->format != Format::MUBUF || pred->operands[2].physReg() >= 102)) { + /* Ops that use a 256-bit T# do not need a wait state. + * BUFFER_STORE_* operations that use an SGPR for "offset" + * do not require any wait states. */ + PhysReg wrdata = pred->operands[3].physReg(); + unsigned size = pred->operands[3].size(); + assert(wrdata >= 256); + for (const Definition& def : instr->definitions) { + if (regs_intersect(def.physReg(), def.size(), wrdata, size)) + NOPs = std::max(NOPs, 1); + } + } + } + + if (VALU_writes_sgpr(instr)) { + for (const Definition& def : instr->definitions) { + if (def.physReg() == vcc) + ctx.VALU_wrvcc = NOPs ? new_idx : new_idx + 1; + else if (def.physReg() == exec) + ctx.VALU_wrexec = NOPs ? new_idx : new_idx + 1; + else if (def.physReg() <= 102) + ctx.VALU_wrsgpr = NOPs ? 
new_idx : new_idx + 1; + } + } + return NOPs; + } else if (instr->isVMEM() && ctx.VALU_wrsgpr + 5 >= new_idx) { + /* If the VALU writes the SGPR that is used by a VMEM, the user must add five wait states. */ + for (int pred_idx = new_idx - 1; pred_idx >= 0 && pred_idx >= new_idx - 5; pred_idx--) { + aco_ptr<Instruction>& pred = new_instructions[pred_idx]; + if (!(pred->isVALU() && VALU_writes_sgpr(pred))) + continue; + + for (const Definition& def : pred->definitions) { + if (def.physReg() > 102) + continue; + + if (instr->operands.size() > 1 && + regs_intersect(instr->operands[1].physReg(), instr->operands[1].size(), + def.physReg(), def.size())) { + return 5 + pred_idx - new_idx + 1; + } + + if (instr->operands.size() > 2 && + regs_intersect(instr->operands[2].physReg(), instr->operands[2].size(), + def.physReg(), def.size())) { + return 5 + pred_idx - new_idx + 1; + } + } + } + } + + return 0; +} + + +void handle_block(NOP_ctx& ctx, Block& block) +{ + std::vector<aco_ptr<Instruction>> instructions; + instructions.reserve(block.instructions.size()); + for (unsigned i = 0; i < block.instructions.size(); i++) { + aco_ptr<Instruction>& instr = block.instructions[i]; + unsigned NOPs = handle_instruction(ctx, instr, block.instructions, instructions); + if (NOPs) { + // TODO: try to move the instruction down + /* create NOP */ + aco_ptr<SOPP_instruction> nop{create_instruction<SOPP_instruction>(aco_opcode::s_nop, Format::SOPP, 0, 0)}; + nop->imm = NOPs - 1; + nop->block = -1; + instructions.emplace_back(std::move(nop)); + } + + instructions.emplace_back(std::move(instr)); + } + + ctx.VALU_wrvcc -= instructions.size(); + ctx.VALU_wrexec -= instructions.size(); + ctx.VALU_wrsgpr -= instructions.size(); + block.instructions = std::move(instructions); +} + +} /* end namespace */ + + +void insert_NOPs(Program* program) +{ + NOP_ctx ctx(program); + for (Block& block : program->blocks) { + if (block.instructions.empty()) + continue; + + handle_block(ctx, block); + } +} + +} diff --git a/src/amd/compiler/aco_insert_exec_mask.cpp b/src/amd/compiler/aco_insert_exec_mask.cpp new file mode 100644 index 00000000000..7886a4c77e2 --- /dev/null +++ b/src/amd/compiler/aco_insert_exec_mask.cpp @@ -0,0 +1,1078 @@ +/* + * Copyright © 2019 Valve Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ * + */ + +#include "aco_ir.h" +#include "aco_builder.h" + +namespace aco { + +namespace { + +enum WQMState : uint8_t { + Unspecified = 0, + Exact = 1 << 0, + WQM = 1 << 1, /* with control flow applied */ + Preserve_WQM = 1 << 2, + Exact_Branch = 1 << 3, +}; + +enum mask_type : uint8_t { + mask_type_global = 1 << 0, + mask_type_exact = 1 << 1, + mask_type_wqm = 1 << 2, + mask_type_loop = 1 << 3, /* active lanes of a loop */ + mask_type_initial = 1 << 4, /* initially active lanes */ +}; + +struct wqm_ctx { + Program* program; + /* state for WQM propagation */ + std::set<unsigned> worklist; + std::vector<uint16_t> defined_in; + std::vector<bool> needs_wqm; + std::vector<bool> branch_wqm; /* true if the branch condition in this block should be in wqm */ + bool loop; + bool wqm; + wqm_ctx(Program* program) : program(program), + defined_in(program->peekAllocationId(), 0xFFFF), + needs_wqm(program->peekAllocationId()), + branch_wqm(program->blocks.size()), + loop(false), + wqm(false) + { + for (unsigned i = 0; i < program->blocks.size(); i++) + worklist.insert(i); + } +}; + +struct loop_info { + Block* loop_header; + uint16_t num_exec_masks; + uint8_t needs; + bool has_divergent_break; + bool has_divergent_continue; + bool has_discard; + loop_info(Block* b, uint16_t num, uint8_t needs, bool breaks, bool cont, bool discard) : + loop_header(b), num_exec_masks(num), needs(needs), has_divergent_break(breaks), + has_divergent_continue(cont), has_discard(discard) {} +}; + +struct block_info { + std::vector<std::pair<Temp, uint8_t>> exec; + std::vector<WQMState> instr_needs; + uint8_t block_needs; + uint8_t ever_again_needs; + /* more... */ +}; + +struct exec_ctx { + Program *program; + std::vector<block_info> info; + std::vector<loop_info> loop; + bool handle_wqm = false; + exec_ctx(Program *program) : program(program), info(program->blocks.size()) {} +}; + +bool pred_by_exec_mask(aco_ptr<Instruction>& instr) { + if (instr->format == Format::SMEM || instr->isSALU()) + return false; + if (instr->format == Format::PSEUDO_BARRIER) + return false; + + if (instr->format == Format::PSEUDO) { + switch (instr->opcode) { + case aco_opcode::p_create_vector: + return instr->definitions[0].getTemp().type() == RegType::vgpr; + case aco_opcode::p_extract_vector: + case aco_opcode::p_split_vector: + return instr->operands[0].getTemp().type() == RegType::vgpr; + case aco_opcode::p_spill: + case aco_opcode::p_reload: + return false; + default: + break; + } + } + + if (instr->opcode == aco_opcode::v_readlane_b32 || + instr->opcode == aco_opcode::v_writelane_b32) + return false; + + return true; +} + +bool needs_exact(aco_ptr<Instruction>& instr) { + if (instr->format == Format::MUBUF) { + MUBUF_instruction *mubuf = static_cast<MUBUF_instruction *>(instr.get()); + return mubuf->disable_wqm; + } else if (instr->format == Format::MTBUF) { + MTBUF_instruction *mtbuf = static_cast<MTBUF_instruction *>(instr.get()); + return mtbuf->disable_wqm; + } else if (instr->format == Format::MIMG) { + MIMG_instruction *mimg = static_cast<MIMG_instruction *>(instr.get()); + return mimg->disable_wqm; + } else { + return instr->format == Format::EXP || instr->opcode == aco_opcode::p_fs_buffer_store_smem; + } +} + +void set_needs_wqm(wqm_ctx &ctx, Temp tmp) +{ + if (!ctx.needs_wqm[tmp.id()]) { + ctx.needs_wqm[tmp.id()] = true; + if (ctx.defined_in[tmp.id()] != 0xFFFF) + ctx.worklist.insert(ctx.defined_in[tmp.id()]); + } +} + +void mark_block_wqm(wqm_ctx &ctx, unsigned block_idx) +{ + if (ctx.branch_wqm[block_idx]) + return; + + 
ctx.branch_wqm[block_idx] = true; + Block& block = ctx.program->blocks[block_idx]; + aco_ptr<Instruction>& branch = block.instructions.back(); + + if (branch->opcode != aco_opcode::p_branch) { + assert(!branch->operands.empty() && branch->operands[0].isTemp()); + set_needs_wqm(ctx, branch->operands[0].getTemp()); + } + + /* TODO: this sets more branch conditions to WQM than it needs to + * it should be enough to stop at the "exec mask top level" */ + if (block.kind & block_kind_top_level) + return; + + for (unsigned pred_idx : block.logical_preds) + mark_block_wqm(ctx, pred_idx); +} + +void get_block_needs(wqm_ctx &ctx, exec_ctx &exec_ctx, Block* block) +{ + block_info& info = exec_ctx.info[block->index]; + + std::vector<WQMState> instr_needs(block->instructions.size()); + + if (block->kind & block_kind_top_level) { + if (ctx.loop && ctx.wqm) { + /* mark all break conditions as WQM */ + unsigned block_idx = block->index + 1; + while (!(ctx.program->blocks[block_idx].kind & block_kind_top_level)) { + if (ctx.program->blocks[block_idx].kind & block_kind_break) + mark_block_wqm(ctx, block_idx); + block_idx++; + } + } else if (ctx.loop && !ctx.wqm) { + /* Ensure a branch never results in an exec mask with only helper + * invocations (which can cause a loop to repeat infinitively if it's + * break branches are done in exact). */ + unsigned block_idx = block->index; + do { + if ((ctx.program->blocks[block_idx].kind & block_kind_branch)) + exec_ctx.info[block_idx].block_needs |= Exact_Branch; + block_idx++; + } while (!(ctx.program->blocks[block_idx].kind & block_kind_top_level)); + } + + ctx.loop = false; + ctx.wqm = false; + } + + for (int i = block->instructions.size() - 1; i >= 0; --i) + { + aco_ptr<Instruction>& instr = block->instructions[i]; + + WQMState needs = needs_exact(instr) ? Exact : Unspecified; + bool propagate_wqm = instr->opcode == aco_opcode::p_wqm; + bool preserve_wqm = instr->opcode == aco_opcode::p_discard_if; + bool pred_by_exec = pred_by_exec_mask(instr); + for (const Definition& definition : instr->definitions) { + if (!definition.isTemp()) + continue; + const unsigned def = definition.tempId(); + ctx.defined_in[def] = block->index; + if (needs == Unspecified && ctx.needs_wqm[def]) { + needs = pred_by_exec ? 
WQM : Unspecified; + propagate_wqm = true; + } + } + + if (propagate_wqm) { + for (const Operand& op : instr->operands) { + if (op.isTemp()) { + set_needs_wqm(ctx, op.getTemp()); + } + } + } else if (preserve_wqm && info.block_needs & WQM) { + needs = Preserve_WQM; + } + + /* ensure the condition controlling the control flow for this phi is in WQM */ + if (needs == WQM && instr->opcode == aco_opcode::p_phi) { + for (unsigned pred_idx : block->logical_preds) + mark_block_wqm(ctx, pred_idx); + } + + instr_needs[i] = needs; + info.block_needs |= needs; + } + + info.instr_needs = instr_needs; + + /* for "if (<cond>) <wqm code>" or "while (<cond>) <wqm code>", + * <cond> should be computed in WQM */ + if (info.block_needs & WQM && !(block->kind & block_kind_top_level)) { + for (unsigned pred_idx : block->logical_preds) + mark_block_wqm(ctx, pred_idx); + ctx.wqm = true; + } + if (block->kind & block_kind_loop_header) + ctx.loop = true; +} + +void calculate_wqm_needs(exec_ctx& exec_ctx) +{ + wqm_ctx ctx(exec_ctx.program); + + while (!ctx.worklist.empty()) { + unsigned block_index = *std::prev(ctx.worklist.end()); + ctx.worklist.erase(std::prev(ctx.worklist.end())); + + get_block_needs(ctx, exec_ctx, &exec_ctx.program->blocks[block_index]); + } + + uint8_t ever_again_needs = 0; + for (int i = exec_ctx.program->blocks.size() - 1; i >= 0; i--) { + exec_ctx.info[i].ever_again_needs = ever_again_needs; + Block& block = exec_ctx.program->blocks[i]; + + if (block.kind & block_kind_needs_lowering) + exec_ctx.info[i].block_needs |= Exact; + + /* if discard is used somewhere in nested CF, we need to preserve the WQM mask */ + if ((block.kind & block_kind_discard || + block.kind & block_kind_uses_discard_if) && + ever_again_needs & WQM) + exec_ctx.info[i].block_needs |= Preserve_WQM; + + ever_again_needs |= exec_ctx.info[i].block_needs & ~Exact_Branch; + if (block.kind & block_kind_discard || + block.kind & block_kind_uses_discard_if) + ever_again_needs |= Exact; + + /* don't propagate WQM preservation further than the next top_level block */ + if (block.kind & block_kind_top_level) + ever_again_needs &= ~Preserve_WQM; + else + exec_ctx.info[i].block_needs &= ~Preserve_WQM; + } + exec_ctx.handle_wqm = true; +} + +void transition_to_WQM(exec_ctx& ctx, Builder bld, unsigned idx) +{ + if (ctx.info[idx].exec.back().second & mask_type_wqm) + return; + if (ctx.info[idx].exec.back().second & mask_type_global) { + Temp exec_mask = ctx.info[idx].exec.back().first; + exec_mask = bld.sop1(aco_opcode::s_wqm_b64, bld.def(s2, exec), bld.def(s1, scc), exec_mask); + ctx.info[idx].exec.emplace_back(exec_mask, mask_type_global | mask_type_wqm); + return; + } + /* otherwise, the WQM mask should be one below the current mask */ + ctx.info[idx].exec.pop_back(); + assert(ctx.info[idx].exec.back().second & mask_type_wqm); + ctx.info[idx].exec.back().first = bld.pseudo(aco_opcode::p_parallelcopy, bld.def(s2, exec), + ctx.info[idx].exec.back().first); +} + +void transition_to_Exact(exec_ctx& ctx, Builder bld, unsigned idx) +{ + if (ctx.info[idx].exec.back().second & mask_type_exact) + return; + if (ctx.info[idx].exec.back().second & mask_type_global) { + ctx.info[idx].exec.pop_back(); + assert(ctx.info[idx].exec.back().second & mask_type_exact); + ctx.info[idx].exec.back().first = bld.pseudo(aco_opcode::p_parallelcopy, bld.def(s2, exec), + ctx.info[idx].exec.back().first); + return; + } + /* otherwise, we create an exact mask and push to the stack */ + Temp wqm = ctx.info[idx].exec.back().first; + Temp exact = bld.tmp(s2); + wqm = 
bld.sop1(aco_opcode::s_and_saveexec_b64, bld.def(s2), bld.def(s1, scc), + bld.exec(Definition(exact)), ctx.info[idx].exec[0].first, bld.exec(wqm)); + ctx.info[idx].exec.back().first = wqm; + ctx.info[idx].exec.emplace_back(exact, mask_type_exact); +} + +unsigned add_coupling_code(exec_ctx& ctx, Block* block, + std::vector<aco_ptr<Instruction>>& instructions) +{ + unsigned idx = block->index; + Builder bld(ctx.program, &instructions); + std::vector<unsigned>& preds = block->linear_preds; + + /* start block */ + if (idx == 0) { + aco_ptr<Instruction>& startpgm = block->instructions[0]; + assert(startpgm->opcode == aco_opcode::p_startpgm); + Temp exec_mask = startpgm->definitions.back().getTemp(); + bld.insert(std::move(startpgm)); + + if (ctx.handle_wqm) { + ctx.info[0].exec.emplace_back(exec_mask, mask_type_global | mask_type_exact | mask_type_initial); + /* if this block only needs WQM, initialize already */ + if (ctx.info[0].block_needs == WQM) + transition_to_WQM(ctx, bld, 0); + } else { + uint8_t mask = mask_type_global; + if (ctx.program->needs_wqm) { + exec_mask = bld.sop1(aco_opcode::s_wqm_b64, bld.def(s2, exec), bld.def(s1, scc), bld.exec(exec_mask)); + mask |= mask_type_wqm; + } else { + mask |= mask_type_exact; + } + ctx.info[0].exec.emplace_back(exec_mask, mask); + } + + return 1; + } + + /* loop entry block */ + if (block->kind & block_kind_loop_header) { + assert(preds[0] == idx - 1); + ctx.info[idx].exec = ctx.info[idx - 1].exec; + loop_info& info = ctx.loop.back(); + while (ctx.info[idx].exec.size() > info.num_exec_masks) + ctx.info[idx].exec.pop_back(); + + /* create ssa names for outer exec masks */ + if (info.has_discard) { + aco_ptr<Pseudo_instruction> phi; + for (int i = 0; i < info.num_exec_masks - 1; i++) { + phi.reset(create_instruction<Pseudo_instruction>(aco_opcode::p_linear_phi, Format::PSEUDO, preds.size(), 1)); + phi->definitions[0] = bld.def(s2); + phi->operands[0] = Operand(ctx.info[preds[0]].exec[i].first); + ctx.info[idx].exec[i].first = bld.insert(std::move(phi)); + } + } + + /* create ssa name for restore mask */ + if (info.has_divergent_break) { + /* this phi might be trivial but ensures a parallelcopy on the loop header */ + aco_ptr<Pseudo_instruction> phi{create_instruction<Pseudo_instruction>(aco_opcode::p_linear_phi, Format::PSEUDO, preds.size(), 1)}; + phi->definitions[0] = bld.def(s2); + phi->operands[0] = Operand(ctx.info[preds[0]].exec[info.num_exec_masks - 1].first); + ctx.info[idx].exec.back().first = bld.insert(std::move(phi)); + } + + /* create ssa name for loop active mask */ + aco_ptr<Pseudo_instruction> phi{create_instruction<Pseudo_instruction>(aco_opcode::p_linear_phi, Format::PSEUDO, preds.size(), 1)}; + if (info.has_divergent_continue) + phi->definitions[0] = bld.def(s2); + else + phi->definitions[0] = bld.def(s2, exec); + phi->operands[0] = Operand(ctx.info[preds[0]].exec.back().first); + Temp loop_active = bld.insert(std::move(phi)); + + if (info.has_divergent_break) { + uint8_t mask_type = (ctx.info[idx].exec.back().second & (mask_type_wqm | mask_type_exact)) | mask_type_loop; + ctx.info[idx].exec.emplace_back(loop_active, mask_type); + } else { + ctx.info[idx].exec.back().first = loop_active; + ctx.info[idx].exec.back().second |= mask_type_loop; + } + + /* create a parallelcopy to move the active mask to exec */ + unsigned i = 0; + if (info.has_divergent_continue) { + while (block->instructions[i]->opcode != aco_opcode::p_logical_start) { + bld.insert(std::move(block->instructions[i])); + i++; + } + uint8_t mask_type = 
ctx.info[idx].exec.back().second & (mask_type_wqm | mask_type_exact); + ctx.info[idx].exec.emplace_back(bld.pseudo(aco_opcode::p_parallelcopy, bld.def(s2, exec), + ctx.info[idx].exec.back().first), mask_type); + } + + return i; + } + + /* loop exit block */ + if (block->kind & block_kind_loop_exit) { + Block* header = ctx.loop.back().loop_header; + loop_info& info = ctx.loop.back(); + + for (ASSERTED unsigned pred : preds) + assert(ctx.info[pred].exec.size() >= info.num_exec_masks); + + /* fill the loop header phis */ + std::vector<unsigned>& header_preds = header->linear_preds; + int k = 0; + if (info.has_discard) { + while (k < info.num_exec_masks - 1) { + aco_ptr<Instruction>& phi = header->instructions[k]; + assert(phi->opcode == aco_opcode::p_linear_phi); + for (unsigned i = 1; i < phi->operands.size(); i++) + phi->operands[i] = Operand(ctx.info[header_preds[i]].exec[k].first); + k++; + } + } + aco_ptr<Instruction>& phi = header->instructions[k++]; + assert(phi->opcode == aco_opcode::p_linear_phi); + for (unsigned i = 1; i < phi->operands.size(); i++) + phi->operands[i] = Operand(ctx.info[header_preds[i]].exec[info.num_exec_masks - 1].first); + + if (info.has_divergent_break) { + aco_ptr<Instruction>& phi = header->instructions[k]; + assert(phi->opcode == aco_opcode::p_linear_phi); + for (unsigned i = 1; i < phi->operands.size(); i++) + phi->operands[i] = Operand(ctx.info[header_preds[i]].exec[info.num_exec_masks].first); + } + + assert(!(block->kind & block_kind_top_level) || info.num_exec_masks <= 2); + + /* create the loop exit phis if not trivial */ + for (unsigned k = 0; k < info.num_exec_masks; k++) { + Temp same = ctx.info[preds[0]].exec[k].first; + uint8_t type = ctx.info[header_preds[0]].exec[k].second; + bool trivial = true; + + for (unsigned i = 1; i < preds.size() && trivial; i++) { + if (ctx.info[preds[i]].exec[k].first != same) + trivial = false; + } + + if (trivial) { + ctx.info[idx].exec.emplace_back(same, type); + } else { + /* create phi for loop footer */ + aco_ptr<Pseudo_instruction> phi{create_instruction<Pseudo_instruction>(aco_opcode::p_linear_phi, Format::PSEUDO, preds.size(), 1)}; + phi->definitions[0] = bld.def(s2); + for (unsigned i = 0; i < phi->operands.size(); i++) + phi->operands[i] = Operand(ctx.info[preds[i]].exec[k].first); + ctx.info[idx].exec.emplace_back(bld.insert(std::move(phi)), type); + } + } + assert(ctx.info[idx].exec.size() == info.num_exec_masks); + + /* create a parallelcopy to move the live mask to exec */ + unsigned i = 0; + while (block->instructions[i]->opcode != aco_opcode::p_logical_start) { + bld.insert(std::move(block->instructions[i])); + i++; + } + + if (ctx.handle_wqm) { + if (block->kind & block_kind_top_level && ctx.info[idx].exec.size() == 2) { + if ((ctx.info[idx].block_needs | ctx.info[idx].ever_again_needs) == 0 || + (ctx.info[idx].block_needs | ctx.info[idx].ever_again_needs) == Exact) { + ctx.info[idx].exec.back().second |= mask_type_global; + transition_to_Exact(ctx, bld, idx); + ctx.handle_wqm = false; + } + } + if (ctx.info[idx].block_needs == WQM) + transition_to_WQM(ctx, bld, idx); + else if (ctx.info[idx].block_needs == Exact) + transition_to_Exact(ctx, bld, idx); + } + + ctx.info[idx].exec.back().first = bld.pseudo(aco_opcode::p_parallelcopy, bld.def(s2, exec), + ctx.info[idx].exec.back().first); + + ctx.loop.pop_back(); + return i; + } + + if (preds.size() == 1) { + ctx.info[idx].exec = ctx.info[preds[0]].exec; + } else { + assert(preds.size() == 2); + /* if one of the predecessors ends in exact mask, we pop it 
from stack */ + unsigned num_exec_masks = std::min(ctx.info[preds[0]].exec.size(), + ctx.info[preds[1]].exec.size()); + if (block->kind & block_kind_top_level && !(block->kind & block_kind_merge)) + num_exec_masks = std::min(num_exec_masks, 2u); + + /* create phis for diverged exec masks */ + for (unsigned i = 0; i < num_exec_masks; i++) { + bool in_exec = i == num_exec_masks - 1 && !(block->kind & block_kind_merge); + if (!in_exec && ctx.info[preds[0]].exec[i].first == ctx.info[preds[1]].exec[i].first) { + assert(ctx.info[preds[0]].exec[i].second == ctx.info[preds[1]].exec[i].second); + ctx.info[idx].exec.emplace_back(ctx.info[preds[0]].exec[i]); + continue; + } + + Temp phi = bld.pseudo(aco_opcode::p_linear_phi, in_exec ? bld.def(s2, exec) : bld.def(s2), + ctx.info[preds[0]].exec[i].first, + ctx.info[preds[1]].exec[i].first); + uint8_t mask_type = ctx.info[preds[0]].exec[i].second & ctx.info[preds[1]].exec[i].second; + ctx.info[idx].exec.emplace_back(phi, mask_type); + } + } + + unsigned i = 0; + while (block->instructions[i]->opcode == aco_opcode::p_phi || + block->instructions[i]->opcode == aco_opcode::p_linear_phi) { + bld.insert(std::move(block->instructions[i])); + i++; + } + + if (block->kind & block_kind_merge) + ctx.info[idx].exec.pop_back(); + + if (block->kind & block_kind_top_level && ctx.info[idx].exec.size() == 3) { + assert(ctx.info[idx].exec.back().second == mask_type_exact); + assert(block->kind & block_kind_merge); + ctx.info[idx].exec.pop_back(); + } + + /* try to satisfy the block's needs */ + if (ctx.handle_wqm) { + if (block->kind & block_kind_top_level && ctx.info[idx].exec.size() == 2) { + if ((ctx.info[idx].block_needs | ctx.info[idx].ever_again_needs) == 0 || + (ctx.info[idx].block_needs | ctx.info[idx].ever_again_needs) == Exact) { + ctx.info[idx].exec.back().second |= mask_type_global; + transition_to_Exact(ctx, bld, idx); + ctx.handle_wqm = false; + } + } + if (ctx.info[idx].block_needs == WQM) + transition_to_WQM(ctx, bld, idx); + else if (ctx.info[idx].block_needs == Exact) + transition_to_Exact(ctx, bld, idx); + } + + if (block->kind & block_kind_merge) { + Temp restore = ctx.info[idx].exec.back().first; + ctx.info[idx].exec.back().first = bld.pseudo(aco_opcode::p_parallelcopy, bld.def(s2, exec), restore); + } + + return i; +} + +void lower_fs_buffer_store_smem(Builder& bld, bool need_check, aco_ptr<Instruction>& instr, Temp cur_exec) +{ + Operand offset = instr->operands[1]; + if (need_check) { + /* if exec is zero, then use UINT32_MAX as an offset and make this store a no-op */ + Temp nonempty = bld.sopc(aco_opcode::s_cmp_lg_u64, bld.def(s1, scc), cur_exec, Operand(0u)); + + if (offset.isLiteral()) + offset = bld.sop1(aco_opcode::s_mov_b32, bld.def(s1), offset); + + offset = bld.sop2(aco_opcode::s_cselect_b32, bld.hint_m0(bld.def(s1)), + offset, Operand(UINT32_MAX), bld.scc(nonempty)); + } else if (offset.isConstant() && offset.constantValue() > 0xFFFFF) { + offset = bld.sop1(aco_opcode::s_mov_b32, bld.hint_m0(bld.def(s1)), offset); + } + if (!offset.isConstant()) + offset.setFixed(m0); + + switch (instr->operands[2].size()) { + case 1: + instr->opcode = aco_opcode::s_buffer_store_dword; + break; + case 2: + instr->opcode = aco_opcode::s_buffer_store_dwordx2; + break; + case 4: + instr->opcode = aco_opcode::s_buffer_store_dwordx4; + break; + default: + unreachable("Invalid SMEM buffer store size"); + } + instr->operands[1] = offset; + /* as_uniform() needs to be done here so it's done in exact mode and helper + * lanes don't contribute. 
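+ * (The uniform copy takes its value from an active lane, so doing it under the
+ * exact mask should keep a helper invocation from being the lane that is read.)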
*/ + instr->operands[2] = Operand(bld.as_uniform(instr->operands[2])); +} + +void process_instructions(exec_ctx& ctx, Block* block, + std::vector<aco_ptr<Instruction>>& instructions, + unsigned idx) +{ + WQMState state; + if (ctx.info[block->index].exec.back().second & mask_type_wqm) + state = WQM; + else { + assert(!ctx.handle_wqm || ctx.info[block->index].exec.back().second & mask_type_exact); + state = Exact; + } + + /* if the block doesn't need both, WQM and Exact, we can skip processing the instructions */ + bool process = (ctx.handle_wqm && + (ctx.info[block->index].block_needs & state) != + (ctx.info[block->index].block_needs & (WQM | Exact))) || + block->kind & block_kind_uses_discard_if || + block->kind & block_kind_needs_lowering; + if (!process) { + std::vector<aco_ptr<Instruction>>::iterator it = std::next(block->instructions.begin(), idx); + instructions.insert(instructions.end(), + std::move_iterator<std::vector<aco_ptr<Instruction>>::iterator>(it), + std::move_iterator<std::vector<aco_ptr<Instruction>>::iterator>(block->instructions.end())); + return; + } + + Builder bld(ctx.program, &instructions); + + for (; idx < block->instructions.size(); idx++) { + aco_ptr<Instruction> instr = std::move(block->instructions[idx]); + + WQMState needs = ctx.handle_wqm ? ctx.info[block->index].instr_needs[idx] : Unspecified; + + if (instr->opcode == aco_opcode::p_discard_if) { + if (ctx.info[block->index].block_needs & Preserve_WQM) { + assert(block->kind & block_kind_top_level); + transition_to_WQM(ctx, bld, block->index); + ctx.info[block->index].exec.back().second &= ~mask_type_global; + } + unsigned num = ctx.info[block->index].exec.size(); + assert(num); + Operand cond = instr->operands[0]; + instr.reset(create_instruction<Pseudo_instruction>(aco_opcode::p_discard_if, Format::PSEUDO, num + 1, num + 1)); + for (unsigned i = 0; i < num; i++) { + instr->operands[i] = Operand(ctx.info[block->index].exec[i].first); + if (i == num - 1) + instr->operands[i].setFixed(exec); + Temp new_mask = bld.tmp(s2); + instr->definitions[i] = Definition(new_mask); + ctx.info[block->index].exec[i].first = new_mask; + } + assert((ctx.info[block->index].exec[0].second & mask_type_wqm) == 0); + instr->definitions[num - 1].setFixed(exec); + instr->operands[num] = cond; + instr->definitions[num] = bld.def(s1, scc); + + } else if (needs == WQM && state != WQM) { + transition_to_WQM(ctx, bld, block->index); + state = WQM; + } else if (needs == Exact && state != Exact) { + transition_to_Exact(ctx, bld, block->index); + state = Exact; + } + + if (instr->opcode == aco_opcode::p_is_helper || instr->opcode == aco_opcode::p_load_helper) { + Definition dst = instr->definitions[0]; + if (state == Exact) { + instr.reset(create_instruction<SOP1_instruction>(aco_opcode::s_mov_b64, Format::SOP1, 1, 1)); + instr->operands[0] = Operand(0u); + instr->definitions[0] = dst; + } else { + std::pair<Temp, uint8_t>& exact_mask = ctx.info[block->index].exec[0]; + if (instr->opcode == aco_opcode::p_load_helper && + !(ctx.info[block->index].exec[0].second & mask_type_initial)) { + /* find last initial exact mask */ + for (int i = block->index; i >= 0; i--) { + if (ctx.program->blocks[i].kind & block_kind_top_level && + ctx.info[i].exec[0].second & mask_type_initial) { + exact_mask = ctx.info[i].exec[0]; + break; + } + } + } + + assert(instr->opcode == aco_opcode::p_is_helper || exact_mask.second & mask_type_initial); + assert(exact_mask.second & mask_type_exact); + + 
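+ /* helper lanes are exactly the lanes enabled in the current exec mask but not in
+  * the chosen exact mask, so compute dst = exec & ~exact_mask via s_andn2_b64 */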
instr.reset(create_instruction<SOP2_instruction>(aco_opcode::s_andn2_b64, Format::SOP2, 2, 2)); + instr->operands[0] = Operand(ctx.info[block->index].exec.back().first); /* current exec */ + instr->operands[1] = Operand(exact_mask.first); + instr->definitions[0] = dst; + instr->definitions[1] = bld.def(s1, scc); + } + } else if (instr->opcode == aco_opcode::p_demote_to_helper) { + /* turn demote into discard_if with only exact masks */ + assert((ctx.info[block->index].exec[0].second & (mask_type_exact | mask_type_global)) == (mask_type_exact | mask_type_global)); + ctx.info[block->index].exec[0].second &= ~mask_type_initial; + + int num = 0; + Temp cond; + if (instr->operands.empty()) { + /* transition to exact and set exec to zero */ + Temp old_exec = ctx.info[block->index].exec.back().first; + Temp new_exec = bld.tmp(s2); + cond = bld.sop1(aco_opcode::s_and_saveexec_b64, bld.def(s2), bld.def(s1, scc), + bld.exec(Definition(new_exec)), Operand(0u), bld.exec(old_exec)); + if (ctx.info[block->index].exec.back().second & mask_type_exact) { + ctx.info[block->index].exec.back().first = new_exec; + } else { + ctx.info[block->index].exec.back().first = cond; + ctx.info[block->index].exec.emplace_back(new_exec, mask_type_exact); + } + } else { + /* demote_if: transition to exact */ + transition_to_Exact(ctx, bld, block->index); + assert(instr->operands[0].isTemp()); + cond = instr->operands[0].getTemp(); + num = 1; + } + + for (unsigned i = 0; i < ctx.info[block->index].exec.size() - 1; i++) + num += ctx.info[block->index].exec[i].second & mask_type_exact ? 1 : 0; + instr.reset(create_instruction<Instruction>(aco_opcode::p_discard_if, Format::PSEUDO, num + 1, num + 1)); + int k = 0; + for (unsigned i = 0; k < num; i++) { + if (ctx.info[block->index].exec[i].second & mask_type_exact) { + instr->operands[k] = Operand(ctx.info[block->index].exec[i].first); + Temp new_mask = bld.tmp(s2); + instr->definitions[k] = Definition(new_mask); + if (i == ctx.info[block->index].exec.size() - 1) + instr->definitions[k].setFixed(exec); + k++; + ctx.info[block->index].exec[i].first = new_mask; + } + } + assert(k == num); + instr->definitions[num] = bld.def(s1, scc); + instr->operands[num] = Operand(cond); + state = Exact; + + } else if (instr->opcode == aco_opcode::p_fs_buffer_store_smem) { + bool need_check = ctx.info[block->index].exec.size() != 1 && + !(ctx.info[block->index].exec[ctx.info[block->index].exec.size() - 2].second & Exact); + lower_fs_buffer_store_smem(bld, need_check, instr, ctx.info[block->index].exec.back().first); + } + + bld.insert(std::move(instr)); + } +} + +void add_branch_code(exec_ctx& ctx, Block* block) +{ + unsigned idx = block->index; + Builder bld(ctx.program, block); + + if (idx == ctx.program->blocks.size() - 1) + return; + + /* try to disable wqm handling */ + if (ctx.handle_wqm && block->kind & block_kind_top_level) { + if (ctx.info[idx].exec.size() == 3) { + assert(ctx.info[idx].exec[1].second == mask_type_wqm); + ctx.info[idx].exec.pop_back(); + } + assert(ctx.info[idx].exec.size() <= 2); + + if (ctx.info[idx].ever_again_needs == 0 || + ctx.info[idx].ever_again_needs == Exact) { + /* transition to Exact */ + aco_ptr<Instruction> branch = std::move(block->instructions.back()); + block->instructions.pop_back(); + ctx.info[idx].exec.back().second |= mask_type_global; + transition_to_Exact(ctx, bld, idx); + bld.insert(std::move(branch)); + ctx.handle_wqm = false; + + } else if (ctx.info[idx].block_needs & Preserve_WQM) { + /* transition to WQM and remove global flag */ + 
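+ /* the transition code has to land in front of the block's final branch, so the
+  * branch is taken out, the new exec mask is set up, and the branch is re-added */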
aco_ptr<Instruction> branch = std::move(block->instructions.back()); + block->instructions.pop_back(); + transition_to_WQM(ctx, bld, idx); + ctx.info[idx].exec.back().second &= ~mask_type_global; + bld.insert(std::move(branch)); + } + } + + if (block->kind & block_kind_loop_preheader) { + /* collect information about the succeeding loop */ + bool has_divergent_break = false; + bool has_divergent_continue = false; + bool has_discard = false; + uint8_t needs = 0; + unsigned loop_nest_depth = ctx.program->blocks[idx + 1].loop_nest_depth; + + for (unsigned i = idx + 1; ctx.program->blocks[i].loop_nest_depth >= loop_nest_depth; i++) { + Block& loop_block = ctx.program->blocks[i]; + needs |= ctx.info[i].block_needs; + + if (loop_block.kind & block_kind_uses_discard_if || + loop_block.kind & block_kind_discard) + has_discard = true; + if (loop_block.loop_nest_depth != loop_nest_depth) + continue; + + if (loop_block.kind & block_kind_uniform) + continue; + else if (loop_block.kind & block_kind_break) + has_divergent_break = true; + else if (loop_block.kind & block_kind_continue) + has_divergent_continue = true; + } + + if (ctx.handle_wqm) { + if (needs & WQM) { + aco_ptr<Instruction> branch = std::move(block->instructions.back()); + block->instructions.pop_back(); + transition_to_WQM(ctx, bld, idx); + bld.insert(std::move(branch)); + } else { + aco_ptr<Instruction> branch = std::move(block->instructions.back()); + block->instructions.pop_back(); + transition_to_Exact(ctx, bld, idx); + bld.insert(std::move(branch)); + } + } + + unsigned num_exec_masks = ctx.info[idx].exec.size(); + if (block->kind & block_kind_top_level) + num_exec_masks = std::min(num_exec_masks, 2u); + + ctx.loop.emplace_back(&ctx.program->blocks[block->linear_succs[0]], + num_exec_masks, + needs, + has_divergent_break, + has_divergent_continue, + has_discard); + } + + if (block->kind & block_kind_discard) { + + assert(block->instructions.back()->format == Format::PSEUDO_BRANCH); + aco_ptr<Instruction> branch = std::move(block->instructions.back()); + block->instructions.pop_back(); + + /* create a discard_if() instruction with the exec mask as condition */ + unsigned num = 0; + if (ctx.loop.size()) { + /* if we're in a loop, only discard from the outer exec masks */ + num = ctx.loop.back().num_exec_masks; + } else { + num = ctx.info[idx].exec.size() - 1; + } + + Temp old_exec = ctx.info[idx].exec.back().first; + Temp new_exec = bld.tmp(s2); + Temp cond = bld.sop1(aco_opcode::s_and_saveexec_b64, bld.def(s2), bld.def(s1, scc), + bld.exec(Definition(new_exec)), Operand(0u), bld.exec(old_exec)); + ctx.info[idx].exec.back().first = new_exec; + + aco_ptr<Pseudo_instruction> discard{create_instruction<Pseudo_instruction>(aco_opcode::p_discard_if, Format::PSEUDO, num + 1, num + 1)}; + for (unsigned i = 0; i < num; i++) { + discard->operands[i] = Operand(ctx.info[block->index].exec[i].first); + Temp new_mask = bld.tmp(s2); + discard->definitions[i] = Definition(new_mask); + ctx.info[block->index].exec[i].first = new_mask; + } + assert(!ctx.handle_wqm || (ctx.info[block->index].exec[0].second & mask_type_wqm) == 0); + discard->operands[num] = Operand(cond); + discard->definitions[num] = bld.def(s1, scc); + + bld.insert(std::move(discard)); + if ((block->kind & (block_kind_break | block_kind_uniform)) == block_kind_break) + ctx.info[idx].exec.back().first = cond; + bld.insert(std::move(branch)); + /* no return here as it can be followed by a divergent break */ + } + + if (block->kind & block_kind_continue_or_break) { + 
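+ /* a continue_or_break block jumps back to the loop header as long as any lanes
+  * remain active in the loop mask and branches towards the loop exit otherwise */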
assert(block->instructions.back()->opcode == aco_opcode::p_branch); + block->instructions.pop_back(); + + /* because of how linear_succs is created, this needs to be swapped */ + std::swap(block->linear_succs[0], block->linear_succs[1]); + + assert(ctx.program->blocks[block->linear_succs[1]].kind & block_kind_loop_header); + assert(ctx.program->blocks[ctx.program->blocks[block->linear_succs[0]].linear_succs[0]].kind & block_kind_loop_exit); + + if (ctx.info[idx].exec.back().second & mask_type_loop) { + bld.branch(aco_opcode::p_cbranch_nz, bld.exec(ctx.info[idx].exec.back().first), block->linear_succs[1], block->linear_succs[0]); + } else { + Temp cond = Temp(); + for (int exec_idx = ctx.info[idx].exec.size() - 1; exec_idx >= 0; exec_idx--) { + if (ctx.info[idx].exec[exec_idx].second & mask_type_loop) { + cond = bld.sopc(aco_opcode::s_cmp_lg_u64, bld.def(s1, scc), ctx.info[idx].exec[exec_idx].first, Operand(0u)); + break; + } + } + assert(cond != Temp()); + + bld.branch(aco_opcode::p_cbranch_nz, bld.scc(cond), block->linear_succs[1], block->linear_succs[0]); + } + return; + } + + if (block->kind & block_kind_uniform) { + Pseudo_branch_instruction* branch = static_cast<Pseudo_branch_instruction*>(block->instructions.back().get()); + if (branch->opcode == aco_opcode::p_branch) { + branch->target[0] = block->linear_succs[0]; + } else { + branch->target[0] = block->linear_succs[1]; + branch->target[1] = block->linear_succs[0]; + } + return; + } + + if (block->kind & block_kind_branch) { + + if (ctx.handle_wqm && + ctx.info[idx].exec.size() >= 2 && + ctx.info[idx].exec.back().second == mask_type_exact && + !(ctx.info[idx].block_needs & Exact_Branch) && + ctx.info[idx].exec[ctx.info[idx].exec.size() - 2].second & mask_type_wqm) { + /* return to wqm before branching */ + ctx.info[idx].exec.pop_back(); + } + + // orig = s_and_saveexec_b64 + assert(block->linear_succs.size() == 2); + assert(block->instructions.back()->opcode == aco_opcode::p_cbranch_z); + Temp cond = block->instructions.back()->operands[0].getTemp(); + block->instructions.pop_back(); + + if (ctx.info[idx].block_needs & Exact_Branch) + transition_to_Exact(ctx, bld, idx); + + Temp current_exec = ctx.info[idx].exec.back().first; + uint8_t mask_type = ctx.info[idx].exec.back().second & (mask_type_wqm | mask_type_exact); + + Temp then_mask = bld.tmp(s2); + Temp old_exec = bld.sop1(aco_opcode::s_and_saveexec_b64, bld.def(s2), bld.def(s1, scc), + bld.exec(Definition(then_mask)), cond, bld.exec(current_exec)); + + ctx.info[idx].exec.back().first = old_exec; + + /* add next current exec to the stack */ + ctx.info[idx].exec.emplace_back(then_mask, mask_type); + + bld.branch(aco_opcode::p_cbranch_z, bld.exec(then_mask), block->linear_succs[1], block->linear_succs[0]); + return; + } + + if (block->kind & block_kind_invert) { + // exec = s_andn2_b64 (original_exec, exec) + assert(block->instructions.back()->opcode == aco_opcode::p_cbranch_nz); + block->instructions.pop_back(); + Temp then_mask = ctx.info[idx].exec.back().first; + uint8_t mask_type = ctx.info[idx].exec.back().second; + ctx.info[idx].exec.pop_back(); + Temp orig_exec = ctx.info[idx].exec.back().first; + Temp else_mask = bld.sop2(aco_opcode::s_andn2_b64, bld.def(s2, exec), + bld.def(s1, scc), orig_exec, bld.exec(then_mask)); + + /* add next current exec to the stack */ + ctx.info[idx].exec.emplace_back(else_mask, mask_type); + + bld.branch(aco_opcode::p_cbranch_z, bld.exec(else_mask), block->linear_succs[1], block->linear_succs[0]); + return; + } + + if (block->kind & 
block_kind_break) { + // loop_mask = s_andn2_b64 (loop_mask, exec) + assert(block->instructions.back()->opcode == aco_opcode::p_branch); + block->instructions.pop_back(); + + Temp current_exec = ctx.info[idx].exec.back().first; + Temp cond = Temp(); + for (int exec_idx = ctx.info[idx].exec.size() - 2; exec_idx >= 0; exec_idx--) { + cond = bld.tmp(s1); + Temp exec_mask = ctx.info[idx].exec[exec_idx].first; + exec_mask = bld.sop2(aco_opcode::s_andn2_b64, bld.def(s2), bld.scc(Definition(cond)), + exec_mask, current_exec); + ctx.info[idx].exec[exec_idx].first = exec_mask; + if (ctx.info[idx].exec[exec_idx].second & mask_type_loop) + break; + } + + /* check if the successor is the merge block, otherwise set exec to 0 */ + // TODO: this could be done better by directly branching to the merge block + unsigned succ_idx = ctx.program->blocks[block->linear_succs[1]].linear_succs[0]; + Block& succ = ctx.program->blocks[succ_idx]; + if (!(succ.kind & block_kind_invert || succ.kind & block_kind_merge)) { + ctx.info[idx].exec.back().first = bld.sop1(aco_opcode::s_mov_b64, bld.def(s2, exec), Operand(0u)); + } + + bld.branch(aco_opcode::p_cbranch_nz, bld.scc(cond), block->linear_succs[1], block->linear_succs[0]); + return; + } + + if (block->kind & block_kind_continue) { + assert(block->instructions.back()->opcode == aco_opcode::p_branch); + block->instructions.pop_back(); + + Temp current_exec = ctx.info[idx].exec.back().first; + Temp cond = Temp(); + for (int exec_idx = ctx.info[idx].exec.size() - 2; exec_idx >= 0; exec_idx--) { + if (ctx.info[idx].exec[exec_idx].second & mask_type_loop) + break; + cond = bld.tmp(s1); + Temp exec_mask = ctx.info[idx].exec[exec_idx].first; + exec_mask = bld.sop2(aco_opcode::s_andn2_b64, bld.def(s2), bld.scc(Definition(cond)), + exec_mask, bld.exec(current_exec)); + ctx.info[idx].exec[exec_idx].first = exec_mask; + } + assert(cond != Temp()); + + /* check if the successor is the merge block, otherwise set exec to 0 */ + // TODO: this could be done better by directly branching to the merge block + unsigned succ_idx = ctx.program->blocks[block->linear_succs[1]].linear_succs[0]; + Block& succ = ctx.program->blocks[succ_idx]; + if (!(succ.kind & block_kind_invert || succ.kind & block_kind_merge)) { + ctx.info[idx].exec.back().first = bld.sop1(aco_opcode::s_mov_b64, bld.def(s2, exec), Operand(0u)); + } + + bld.branch(aco_opcode::p_cbranch_nz, bld.scc(cond), block->linear_succs[1], block->linear_succs[0]); + return; + } +} + +void process_block(exec_ctx& ctx, Block* block) +{ + std::vector<aco_ptr<Instruction>> instructions; + instructions.reserve(block->instructions.size()); + + unsigned idx = add_coupling_code(ctx, block, instructions); + + assert(block->index != ctx.program->blocks.size() - 1 || + ctx.info[block->index].exec.size() <= 2); + + process_instructions(ctx, block, instructions, idx); + + block->instructions = std::move(instructions); + + add_branch_code(ctx, block); + + block->live_out_exec = ctx.info[block->index].exec.back().first; +} + +} /* end namespace */ + + +void insert_exec_mask(Program *program) +{ + exec_ctx ctx(program); + + if (program->needs_wqm && program->needs_exact) + calculate_wqm_needs(ctx); + + for (Block& block : program->blocks) + process_block(ctx, &block); + +} + +} + diff --git a/src/amd/compiler/aco_insert_waitcnt.cpp b/src/amd/compiler/aco_insert_waitcnt.cpp new file mode 100644 index 00000000000..d19fdadadea --- /dev/null +++ b/src/amd/compiler/aco_insert_waitcnt.cpp @@ -0,0 +1,697 @@ +/* + * Copyright © 2018 Valve Corporation + * + * 
Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + */ + +#include <algorithm> +#include <map> + +#include "aco_ir.h" +#include "vulkan/radv_shader.h" + +namespace aco { + +namespace { + +/** + * The general idea of this pass is: + * The CFG is traversed in reverse postorder (forward). + * Per BB one wait_ctx is maintained. + * The in-context is the joined out-contexts of the predecessors. + * The context contains a map: gpr -> wait_entry + * consisting of the information about the cnt values to be waited for. + * Note: After merge-nodes, it might occur that for the same register + * multiple cnt values are to be waited for. + * + * The values are updated according to the encountered instructions: + * - additional events increment the counter of waits of the same type + * - or erase gprs with counters higher than to be waited for. + */ + +// TODO: do a more clever insertion of wait_cnt (lgkm_cnt) when there is a load followed by a use of a previous load + +/* Instructions of the same event will finish in-order except for smem + * and maybe flat. Instructions of different events may not finish in-order. 
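+ * In-order completion is what lets a single counter value stand in for a whole
+ * queue of outstanding operations: waiting until e.g. vmcnt reaches N also means
+ * that everything issued before the N most recent VMEM operations has completed.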
*/ +enum wait_event : uint16_t { + event_smem = 1 << 0, + event_lds = 1 << 1, + event_gds = 1 << 2, + event_vmem = 1 << 3, + event_vmem_store = 1 << 4, /* GFX10+ */ + event_flat = 1 << 5, + event_exp_pos = 1 << 6, + event_exp_param = 1 << 7, + event_exp_mrt_null = 1 << 8, + event_gds_gpr_lock = 1 << 9, + event_vmem_gpr_lock = 1 << 10, +}; + +enum counter_type : uint8_t { + counter_exp = 1 << 0, + counter_lgkm = 1 << 1, + counter_vm = 1 << 2, + counter_vs = 1 << 3, +}; + +static const uint16_t exp_events = event_exp_pos | event_exp_param | event_exp_mrt_null | event_gds_gpr_lock | event_vmem_gpr_lock; +static const uint16_t lgkm_events = event_smem | event_lds | event_gds | event_flat; +static const uint16_t vm_events = event_vmem | event_flat; +static const uint16_t vs_events = event_vmem_store; + +uint8_t get_counters_for_event(wait_event ev) +{ + switch (ev) { + case event_smem: + case event_lds: + case event_gds: + return counter_lgkm; + case event_vmem: + return counter_vm; + case event_vmem_store: + return counter_vs; + case event_flat: + return counter_vm | counter_lgkm; + case event_exp_pos: + case event_exp_param: + case event_exp_mrt_null: + case event_gds_gpr_lock: + case event_vmem_gpr_lock: + return counter_exp; + default: + return 0; + } +} + +struct wait_imm { + static const uint8_t unset_counter = 0xff; + + uint8_t vm; + uint8_t exp; + uint8_t lgkm; + uint8_t vs; + + wait_imm() : + vm(unset_counter), exp(unset_counter), lgkm(unset_counter), vs(unset_counter) {} + wait_imm(uint16_t vm_, uint16_t exp_, uint16_t lgkm_, uint16_t vs_) : + vm(vm_), exp(exp_), lgkm(lgkm_), vs(vs_) {} + + uint16_t pack(enum chip_class chip) const + { + uint16_t imm = 0; + assert(exp == unset_counter || exp <= 0x7); + switch (chip) { + case GFX10: + assert(lgkm == unset_counter || lgkm <= 0x3f); + assert(vm == unset_counter || vm <= 0x3f); + imm = ((vm & 0x30) << 10) | ((lgkm & 0x3f) << 8) | ((exp & 0x7) << 4) | (vm & 0xf); + break; + case GFX9: + assert(lgkm == unset_counter || lgkm <= 0xf); + assert(vm == unset_counter || vm <= 0x3f); + imm = ((vm & 0x30) << 10) | ((lgkm & 0xf) << 8) | ((exp & 0x7) << 4) | (vm & 0xf); + break; + default: + assert(lgkm == unset_counter || lgkm <= 0xf); + assert(vm == unset_counter || vm <= 0xf); + imm = ((lgkm & 0xf) << 8) | ((exp & 0x7) << 4) | (vm & 0xf); + break; + } + if (chip < GFX9 && vm == wait_imm::unset_counter) + imm |= 0xc000; /* should have no effect on pre-GFX9 and now we won't have to worry about the architecture when interpreting the immediate */ + if (chip < GFX10 && lgkm == wait_imm::unset_counter) + imm |= 0x3000; /* should have no effect on pre-GFX10 and now we won't have to worry about the architecture when interpreting the immediate */ + return imm; + } + + void combine(const wait_imm& other) + { + vm = std::min(vm, other.vm); + exp = std::min(exp, other.exp); + lgkm = std::min(lgkm, other.lgkm); + vs = std::min(vs, other.vs); + } + + bool empty() const + { + return vm == unset_counter && exp == unset_counter && + lgkm == unset_counter && vs == unset_counter; + } +}; + +struct wait_entry { + wait_imm imm; + uint16_t events; /* use wait_event notion */ + uint8_t counters; /* use counter_type notion */ + bool wait_on_read:1; + bool logical:1; + + wait_entry(wait_event event, wait_imm imm, bool logical, bool wait_on_read) + : imm(imm), events(event), counters(get_counters_for_event(event)), + wait_on_read(wait_on_read), logical(logical) {} + + void join(const wait_entry& other) + { + events |= other.events; + counters |= other.counters; + 
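+ /* a merged entry has to satisfy both predecessors: the events and counters are
+  * unioned above and the wait counts are reduced to their minimum below */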
imm.combine(other.imm); + wait_on_read = wait_on_read || other.wait_on_read; + assert(logical == other.logical); + } + + void remove_counter(counter_type counter) + { + counters &= ~counter; + + if (counter == counter_lgkm) { + imm.lgkm = wait_imm::unset_counter; + events &= ~(event_smem | event_lds | event_gds); + } + + if (counter == counter_vm) { + imm.vm = wait_imm::unset_counter; + events &= ~event_vmem; + } + + if (counter == counter_exp) { + imm.exp = wait_imm::unset_counter; + events &= ~(event_exp_pos | event_exp_param | event_exp_mrt_null | event_gds_gpr_lock | event_vmem_gpr_lock); + } + + if (counter == counter_vs) { + imm.vs = wait_imm::unset_counter; + events &= ~event_vmem_store; + } + + if (!(counters & counter_lgkm) && !(counters & counter_vm)) + events &= ~event_flat; + } +}; + +struct wait_ctx { + Program *program; + enum chip_class chip_class; + uint16_t max_vm_cnt; + uint16_t max_exp_cnt; + uint16_t max_lgkm_cnt; + uint16_t max_vs_cnt; + uint16_t unordered_events = event_smem | event_flat; + + uint8_t vm_cnt = 0; + uint8_t exp_cnt = 0; + uint8_t lgkm_cnt = 0; + uint8_t vs_cnt = 0; + bool pending_flat_lgkm = false; + bool pending_flat_vm = false; + + wait_imm barrier_imm[barrier_count]; + + std::map<PhysReg,wait_entry> gpr_map; + + wait_ctx() {} + wait_ctx(Program *program_) + : program(program_), + chip_class(program_->chip_class), + max_vm_cnt(program_->chip_class >= GFX9 ? 62 : 14), + max_exp_cnt(6), + max_lgkm_cnt(program_->chip_class >= GFX10 ? 62 : 14), + max_vs_cnt(program_->chip_class >= GFX10 ? 62 : 0), + unordered_events(event_smem | (program_->chip_class < GFX10 ? event_flat : 0)) {} + + void join(const wait_ctx* other, bool logical) + { + exp_cnt = std::max(exp_cnt, other->exp_cnt); + vm_cnt = std::max(vm_cnt, other->vm_cnt); + lgkm_cnt = std::max(lgkm_cnt, other->lgkm_cnt); + vs_cnt = std::max(vs_cnt, other->vs_cnt); + pending_flat_lgkm |= other->pending_flat_lgkm; + pending_flat_vm |= other->pending_flat_vm; + + for (std::pair<PhysReg,wait_entry> entry : other->gpr_map) + { + std::map<PhysReg,wait_entry>::iterator it = gpr_map.find(entry.first); + if (entry.second.logical != logical) + continue; + + if (it != gpr_map.end()) + it->second.join(entry.second); + else + gpr_map.insert(entry); + } + + for (unsigned i = 0; i < barrier_count; i++) + barrier_imm[i].combine(other->barrier_imm[i]); + } +}; + +wait_imm check_instr(Instruction* instr, wait_ctx& ctx) +{ + wait_imm wait; + + for (const Operand op : instr->operands) { + if (op.isConstant() || op.isUndefined()) + continue; + + /* check consecutively read gprs */ + for (unsigned j = 0; j < op.size(); j++) { + PhysReg reg{op.physReg() + j}; + std::map<PhysReg,wait_entry>::iterator it = ctx.gpr_map.find(reg); + if (it == ctx.gpr_map.end() || !it->second.wait_on_read) + continue; + + wait.combine(it->second.imm); + } + } + + for (const Definition& def : instr->definitions) { + /* check consecutively written gprs */ + for (unsigned j = 0; j < def.getTemp().size(); j++) + { + PhysReg reg{def.physReg() + j}; + + std::map<PhysReg,wait_entry>::iterator it = ctx.gpr_map.find(reg); + if (it == ctx.gpr_map.end()) + continue; + + /* Vector Memory reads and writes return in the order they were issued */ + if (instr->isVMEM() && ((it->second.events & vm_events) == event_vmem)) { + it->second.remove_counter(counter_vm); + if (!it->second.counters) + it = ctx.gpr_map.erase(it); + continue; + } + + /* LDS reads and writes return in the order they were issued. 
same for GDS */ + if (instr->format == Format::DS) { + bool gds = static_cast<DS_instruction*>(instr)->gds; + if ((it->second.events & lgkm_events) == (gds ? event_gds : event_lds)) { + it->second.remove_counter(counter_lgkm); + if (!it->second.counters) + it = ctx.gpr_map.erase(it); + continue; + } + } + + wait.combine(it->second.imm); + } + } + + return wait; +} + +wait_imm kill(Instruction* instr, wait_ctx& ctx) +{ + wait_imm imm; + if (ctx.exp_cnt || ctx.vm_cnt || ctx.lgkm_cnt) + imm.combine(check_instr(instr, ctx)); + + if (instr->format == Format::PSEUDO_BARRIER) { + unsigned* bsize = ctx.program->info->cs.block_size; + unsigned workgroup_size = bsize[0] * bsize[1] * bsize[2]; + switch (instr->opcode) { + case aco_opcode::p_memory_barrier_all: + for (unsigned i = 0; i < barrier_count; i++) { + if ((1 << i) == barrier_shared && workgroup_size <= 64) + continue; + imm.combine(ctx.barrier_imm[i]); + } + break; + case aco_opcode::p_memory_barrier_atomic: + imm.combine(ctx.barrier_imm[ffs(barrier_atomic) - 1]); + break; + /* see comment in aco_scheduler.cpp's can_move_instr() on why these barriers are merged */ + case aco_opcode::p_memory_barrier_buffer: + case aco_opcode::p_memory_barrier_image: + imm.combine(ctx.barrier_imm[ffs(barrier_buffer) - 1]); + imm.combine(ctx.barrier_imm[ffs(barrier_image) - 1]); + break; + case aco_opcode::p_memory_barrier_shared: + if (workgroup_size > 64) + imm.combine(ctx.barrier_imm[ffs(barrier_shared) - 1]); + break; + default: + assert(false); + break; + } + } + + if (!imm.empty()) { + if (ctx.pending_flat_vm && imm.vm != wait_imm::unset_counter) + imm.vm = 0; + if (ctx.pending_flat_lgkm && imm.lgkm != wait_imm::unset_counter) + imm.lgkm = 0; + + /* reset counters */ + ctx.exp_cnt = std::min(ctx.exp_cnt, imm.exp); + ctx.vm_cnt = std::min(ctx.vm_cnt, imm.vm); + ctx.lgkm_cnt = std::min(ctx.lgkm_cnt, imm.lgkm); + ctx.vs_cnt = std::min(ctx.vs_cnt, imm.vs); + + /* update barrier wait imms */ + for (unsigned i = 0; i < barrier_count; i++) { + wait_imm& bar = ctx.barrier_imm[i]; + if (bar.exp != wait_imm::unset_counter && imm.exp <= bar.exp) + bar.exp = wait_imm::unset_counter; + if (bar.vm != wait_imm::unset_counter && imm.vm <= bar.vm) + bar.vm = wait_imm::unset_counter; + if (bar.lgkm != wait_imm::unset_counter && imm.lgkm <= bar.lgkm) + bar.lgkm = wait_imm::unset_counter; + if (bar.vs != wait_imm::unset_counter && imm.vs <= bar.vs) + bar.vs = wait_imm::unset_counter; + } + + /* remove all vgprs with higher counter from map */ + std::map<PhysReg,wait_entry>::iterator it = ctx.gpr_map.begin(); + while (it != ctx.gpr_map.end()) + { + if (imm.exp != wait_imm::unset_counter && imm.exp <= it->second.imm.exp) + it->second.remove_counter(counter_exp); + if (imm.vm != wait_imm::unset_counter && imm.vm <= it->second.imm.vm) + it->second.remove_counter(counter_vm); + if (imm.lgkm != wait_imm::unset_counter && imm.lgkm <= it->second.imm.lgkm) + it->second.remove_counter(counter_lgkm); + if (imm.lgkm != wait_imm::unset_counter && imm.vs <= it->second.imm.vs) + it->second.remove_counter(counter_vs); + if (!it->second.counters) + it = ctx.gpr_map.erase(it); + else + it++; + } + } + + if (imm.vm == 0) + ctx.pending_flat_vm = false; + if (imm.lgkm == 0) + ctx.pending_flat_lgkm = false; + + return imm; +} + +void update_barrier_imm(wait_ctx& ctx, uint8_t counters, barrier_interaction barrier) +{ + unsigned barrier_index = ffs(barrier) - 1; + for (unsigned i = 0; i < barrier_count; i++) { + wait_imm& bar = ctx.barrier_imm[i]; + if (i == barrier_index) { + if (counters & 
counter_lgkm) + bar.lgkm = 0; + if (counters & counter_vm) + bar.vm = 0; + if (counters & counter_exp) + bar.exp = 0; + if (counters & counter_vs) + bar.vs = 0; + } else { + if (counters & counter_lgkm && bar.lgkm != wait_imm::unset_counter && bar.lgkm < ctx.max_lgkm_cnt) + bar.lgkm++; + if (counters & counter_vm && bar.vm != wait_imm::unset_counter && bar.vm < ctx.max_vm_cnt) + bar.vm++; + if (counters & counter_exp && bar.exp != wait_imm::unset_counter && bar.exp < ctx.max_exp_cnt) + bar.exp++; + if (counters & counter_vs && bar.vs != wait_imm::unset_counter && bar.vs < ctx.max_vs_cnt) + bar.vs++; + } + } +} + +void update_counters(wait_ctx& ctx, wait_event event, barrier_interaction barrier=barrier_none) +{ + uint8_t counters = get_counters_for_event(event); + + if (counters & counter_lgkm && ctx.lgkm_cnt <= ctx.max_lgkm_cnt) + ctx.lgkm_cnt++; + if (counters & counter_vm && ctx.vm_cnt <= ctx.max_vm_cnt) + ctx.vm_cnt++; + if (counters & counter_exp && ctx.exp_cnt <= ctx.max_exp_cnt) + ctx.exp_cnt++; + if (counters & counter_vs && ctx.vs_cnt <= ctx.max_vs_cnt) + ctx.vs_cnt++; + + update_barrier_imm(ctx, counters, barrier); + + if (ctx.unordered_events & event) + return; + + if (ctx.pending_flat_lgkm) + counters &= ~counter_lgkm; + if (ctx.pending_flat_vm) + counters &= ~counter_vm; + + for (std::pair<const PhysReg,wait_entry>& e : ctx.gpr_map) { + wait_entry& entry = e.second; + + if (entry.events & ctx.unordered_events) + continue; + + assert(entry.events); + + if ((counters & counter_exp) && (entry.events & exp_events) == event && entry.imm.exp < ctx.max_exp_cnt) + entry.imm.exp++; + if ((counters & counter_lgkm) && (entry.events & lgkm_events) == event && entry.imm.lgkm < ctx.max_lgkm_cnt) + entry.imm.lgkm++; + if ((counters & counter_vm) && (entry.events & vm_events) == event && entry.imm.vm < ctx.max_vm_cnt) + entry.imm.vm++; + if ((counters & counter_vs) && (entry.events & vs_events) == event && entry.imm.vs < ctx.max_vs_cnt) + entry.imm.vs++; + } +} + +void update_counters_for_flat_load(wait_ctx& ctx, barrier_interaction barrier=barrier_none) +{ + assert(ctx.chip_class < GFX10); + + if (ctx.lgkm_cnt <= ctx.max_lgkm_cnt) + ctx.lgkm_cnt++; + if (ctx.lgkm_cnt <= ctx.max_vm_cnt) + ctx.vm_cnt++; + + update_barrier_imm(ctx, counter_vm | counter_lgkm, barrier); + + for (std::pair<PhysReg,wait_entry> e : ctx.gpr_map) + { + if (e.second.counters & counter_vm) + e.second.imm.vm = 0; + if (e.second.counters & counter_lgkm) + e.second.imm.lgkm = 0; + } + ctx.pending_flat_lgkm = true; + ctx.pending_flat_vm = true; +} + +void insert_wait_entry(wait_ctx& ctx, PhysReg reg, RegClass rc, wait_event event, bool wait_on_read) +{ + uint16_t counters = get_counters_for_event(event); + wait_imm imm; + if (counters & counter_lgkm) + imm.lgkm = 0; + if (counters & counter_vm) + imm.vm = 0; + if (counters & counter_exp) + imm.exp = 0; + if (counters & counter_vs) + imm.vs = 0; + + wait_entry new_entry(event, imm, !rc.is_linear(), wait_on_read); + + for (unsigned i = 0; i < rc.size(); i++) { + auto it = ctx.gpr_map.emplace(PhysReg{reg.reg+i}, new_entry); + if (!it.second) + it.first->second.join(new_entry); + } +} + +void insert_wait_entry(wait_ctx& ctx, Operand op, wait_event event) +{ + if (!op.isConstant() && !op.isUndefined()) + insert_wait_entry(ctx, op.physReg(), op.regClass(), event, false); +} + +void insert_wait_entry(wait_ctx& ctx, Definition def, wait_event event) +{ + insert_wait_entry(ctx, def.physReg(), def.regClass(), event, true); +} + +void gen(Instruction* instr, wait_ctx& ctx) +{ + switch 
(instr->format) { + case Format::EXP: { + Export_instruction* exp_instr = static_cast<Export_instruction*>(instr); + + wait_event ev; + if (exp_instr->dest <= 9) + ev = event_exp_mrt_null; + else if (exp_instr->dest <= 15) + ev = event_exp_pos; + else + ev = event_exp_param; + update_counters(ctx, ev); + + /* insert new entries for exported vgprs */ + for (unsigned i = 0; i < 4; i++) + { + if (exp_instr->enabled_mask & (1 << i)) { + unsigned idx = exp_instr->compressed ? i >> 1 : i; + assert(idx < exp_instr->operands.size()); + insert_wait_entry(ctx, exp_instr->operands[idx], ev); + + } + } + insert_wait_entry(ctx, exec, s2, ev, false); + break; + } + case Format::FLAT: { + if (ctx.chip_class < GFX10 && !instr->definitions.empty()) + update_counters_for_flat_load(ctx, barrier_buffer); + else + update_counters(ctx, event_flat, barrier_buffer); + + if (!instr->definitions.empty()) + insert_wait_entry(ctx, instr->definitions[0], event_flat); + break; + } + case Format::SMEM: { + update_counters(ctx, event_smem, static_cast<SMEM_instruction*>(instr)->barrier); + + if (!instr->definitions.empty()) + insert_wait_entry(ctx, instr->definitions[0], event_smem); + break; + } + case Format::DS: { + bool gds = static_cast<DS_instruction*>(instr)->gds; + update_counters(ctx, gds ? event_gds : event_lds, gds ? barrier_none : barrier_shared); + if (gds) + update_counters(ctx, event_gds_gpr_lock); + + if (!instr->definitions.empty()) + insert_wait_entry(ctx, instr->definitions[0], gds ? event_gds : event_lds); + + if (gds) { + for (const Operand& op : instr->operands) + insert_wait_entry(ctx, op, event_gds_gpr_lock); + insert_wait_entry(ctx, exec, s2, event_gds_gpr_lock, false); + } + break; + } + case Format::MUBUF: + case Format::MTBUF: + case Format::MIMG: + case Format::GLOBAL: { + wait_event ev = !instr->definitions.empty() || ctx.chip_class < GFX10 ? 
event_vmem : event_vmem_store; + update_counters(ctx, ev, get_barrier_interaction(instr)); + + if (!instr->definitions.empty()) + insert_wait_entry(ctx, instr->definitions[0], ev); + + if (instr->operands.size() == 4 && ctx.chip_class == GFX6) { + ctx.exp_cnt++; + update_counters(ctx, event_vmem_gpr_lock); + insert_wait_entry(ctx, instr->operands[3], event_vmem_gpr_lock); + } + break; + } + default: + break; + } +} + +void emit_waitcnt(wait_ctx& ctx, std::vector<aco_ptr<Instruction>>& instructions, wait_imm imm) +{ + if (imm.vs != wait_imm::unset_counter) { + assert(ctx.chip_class >= GFX10); + SOPK_instruction* waitcnt_vs = create_instruction<SOPK_instruction>(aco_opcode::s_waitcnt_vscnt, Format::SOPK, 0, 0); + waitcnt_vs->imm = imm.vs; + instructions.emplace_back(waitcnt_vs); + imm.vs = wait_imm::unset_counter; + } + if (!imm.empty()) { + SOPP_instruction* waitcnt = create_instruction<SOPP_instruction>(aco_opcode::s_waitcnt, Format::SOPP, 0, 0); + waitcnt->imm = imm.pack(ctx.chip_class); + waitcnt->block = -1; + instructions.emplace_back(waitcnt); + } +} + +void handle_block(Program *program, Block& block, wait_ctx& ctx) +{ + std::vector<aco_ptr<Instruction>> new_instructions; + + for (aco_ptr<Instruction>& instr : block.instructions) { + wait_imm imm = kill(instr.get(), ctx); + + if (!imm.empty()) + emit_waitcnt(ctx, new_instructions, imm); + + gen(instr.get(), ctx); + + if (instr->format != Format::PSEUDO_BARRIER) + new_instructions.emplace_back(std::move(instr)); + } + + /* check if this block is at the end of a loop */ + for (unsigned succ_idx : block.linear_succs) { + /* eliminate any remaining counters */ + if (succ_idx <= block.index && (ctx.vm_cnt || ctx.exp_cnt || ctx.lgkm_cnt || ctx.vs_cnt)) { + // TODO: we could do better if we only wait if the regs between the block and other predecessors differ + + aco_ptr<Instruction> branch = std::move(new_instructions.back()); + new_instructions.pop_back(); + + wait_imm imm(ctx.vm_cnt ? 0 : wait_imm::unset_counter, + ctx.exp_cnt ? 0 : wait_imm::unset_counter, + ctx.lgkm_cnt ? 0 : wait_imm::unset_counter, + ctx.vs_cnt ? 
0 : wait_imm::unset_counter); + emit_waitcnt(ctx, new_instructions, imm); + + new_instructions.push_back(std::move(branch)); + + ctx = wait_ctx(program); + break; + } + } + block.instructions.swap(new_instructions); +} + +} /* end namespace */ + +void insert_wait_states(Program* program) +{ + wait_ctx out_ctx[program->blocks.size()]; /* per BB ctx */ + for (unsigned i = 0; i < program->blocks.size(); i++) + out_ctx[i] = wait_ctx(program); + + for (unsigned i = 0; i < program->blocks.size(); i++) { + Block& current = program->blocks[i]; + wait_ctx& in = out_ctx[current.index]; + + for (unsigned b : current.linear_preds) + in.join(&out_ctx[b], false); + for (unsigned b : current.logical_preds) + in.join(&out_ctx[b], true); + + if (current.instructions.empty()) + continue; + + handle_block(program, current, in); + } +} + +} + diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp new file mode 100644 index 00000000000..d52043f3c0d --- /dev/null +++ b/src/amd/compiler/aco_instruction_selection.cpp @@ -0,0 +1,7621 @@ +/* + * Copyright © 2018 Valve Corporation + * Copyright © 2018 Google + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ * + */ + +#include <algorithm> +#include <map> + +#include "aco_ir.h" +#include "aco_builder.h" +#include "aco_interface.h" +#include "aco_instruction_selection_setup.cpp" +#include "util/fast_idiv_by_const.h" + +namespace aco { +namespace { + +class loop_info_RAII { + isel_context* ctx; + unsigned header_idx_old; + Block* exit_old; + bool divergent_cont_old; + bool divergent_branch_old; + bool divergent_if_old; + +public: + loop_info_RAII(isel_context* ctx, unsigned loop_header_idx, Block* loop_exit) + : ctx(ctx), + header_idx_old(ctx->cf_info.parent_loop.header_idx), exit_old(ctx->cf_info.parent_loop.exit), + divergent_cont_old(ctx->cf_info.parent_loop.has_divergent_continue), + divergent_branch_old(ctx->cf_info.parent_loop.has_divergent_branch), + divergent_if_old(ctx->cf_info.parent_if.is_divergent) + { + ctx->cf_info.parent_loop.header_idx = loop_header_idx; + ctx->cf_info.parent_loop.exit = loop_exit; + ctx->cf_info.parent_loop.has_divergent_continue = false; + ctx->cf_info.parent_loop.has_divergent_branch = false; + ctx->cf_info.parent_if.is_divergent = false; + ctx->cf_info.loop_nest_depth = ctx->cf_info.loop_nest_depth + 1; + } + + ~loop_info_RAII() + { + ctx->cf_info.parent_loop.header_idx = header_idx_old; + ctx->cf_info.parent_loop.exit = exit_old; + ctx->cf_info.parent_loop.has_divergent_continue = divergent_cont_old; + ctx->cf_info.parent_loop.has_divergent_branch = divergent_branch_old; + ctx->cf_info.parent_if.is_divergent = divergent_if_old; + ctx->cf_info.loop_nest_depth = ctx->cf_info.loop_nest_depth - 1; + if (!ctx->cf_info.loop_nest_depth && !ctx->cf_info.parent_if.is_divergent) + ctx->cf_info.exec_potentially_empty = false; + } +}; + +struct if_context { + Temp cond; + + bool divergent_old; + bool exec_potentially_empty_old; + + unsigned BB_if_idx; + unsigned invert_idx; + bool then_branch_divergent; + Block BB_invert; + Block BB_endif; +}; + +static void visit_cf_list(struct isel_context *ctx, + struct exec_list *list); + +static void add_logical_edge(unsigned pred_idx, Block *succ) +{ + succ->logical_preds.emplace_back(pred_idx); +} + + +static void add_linear_edge(unsigned pred_idx, Block *succ) +{ + succ->linear_preds.emplace_back(pred_idx); +} + +static void add_edge(unsigned pred_idx, Block *succ) +{ + add_logical_edge(pred_idx, succ); + add_linear_edge(pred_idx, succ); +} + +static void append_logical_start(Block *b) +{ + Builder(NULL, b).pseudo(aco_opcode::p_logical_start); +} + +static void append_logical_end(Block *b) +{ + Builder(NULL, b).pseudo(aco_opcode::p_logical_end); +} + +Temp get_ssa_temp(struct isel_context *ctx, nir_ssa_def *def) +{ + assert(ctx->allocated[def->index].id()); + return ctx->allocated[def->index]; +} + +Temp emit_wqm(isel_context *ctx, Temp src, Temp dst=Temp(0, s1), bool program_needs_wqm = false) +{ + Builder bld(ctx->program, ctx->block); + + if (!dst.id()) + dst = bld.tmp(src.regClass()); + + if (ctx->stage != fragment_fs) { + if (!dst.id()) + return src; + + if (src.type() == RegType::vgpr || src.size() > 1) + bld.copy(Definition(dst), src); + else + bld.sop1(aco_opcode::s_mov_b32, Definition(dst), src); + return dst; + } + + bld.pseudo(aco_opcode::p_wqm, Definition(dst), src); + ctx->program->needs_wqm |= program_needs_wqm; + return dst; +} + +Temp as_vgpr(isel_context *ctx, Temp val) +{ + if (val.type() == RegType::sgpr) { + Builder bld(ctx->program, ctx->block); + return bld.copy(bld.def(RegType::vgpr, val.size()), val); + } + assert(val.type() == RegType::vgpr); + return val; +} + +//assumes a != 0xffffffff +void 
emit_v_div_u32(isel_context *ctx, Temp dst, Temp a, uint32_t b) +{ + assert(b != 0); + Builder bld(ctx->program, ctx->block); + + if (util_is_power_of_two_or_zero(b)) { + bld.vop2(aco_opcode::v_lshrrev_b32, Definition(dst), Operand((uint32_t)util_logbase2(b)), a); + return; + } + + util_fast_udiv_info info = util_compute_fast_udiv_info(b, 32, 32); + + assert(info.multiplier <= 0xffffffff); + + bool pre_shift = info.pre_shift != 0; + bool increment = info.increment != 0; + bool multiply = true; + bool post_shift = info.post_shift != 0; + + if (!pre_shift && !increment && !multiply && !post_shift) { + bld.vop1(aco_opcode::v_mov_b32, Definition(dst), a); + return; + } + + Temp pre_shift_dst = a; + if (pre_shift) { + pre_shift_dst = (increment || multiply || post_shift) ? bld.tmp(v1) : dst; + bld.vop2(aco_opcode::v_lshrrev_b32, Definition(pre_shift_dst), Operand((uint32_t)info.pre_shift), a); + } + + Temp increment_dst = pre_shift_dst; + if (increment) { + increment_dst = (post_shift || multiply) ? bld.tmp(v1) : dst; + bld.vadd32(Definition(increment_dst), Operand((uint32_t) info.increment), pre_shift_dst); + } + + Temp multiply_dst = increment_dst; + if (multiply) { + multiply_dst = post_shift ? bld.tmp(v1) : dst; + bld.vop3(aco_opcode::v_mul_hi_u32, Definition(multiply_dst), increment_dst, + bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), Operand((uint32_t)info.multiplier))); + } + + if (post_shift) { + bld.vop2(aco_opcode::v_lshrrev_b32, Definition(dst), Operand((uint32_t)info.post_shift), multiply_dst); + } +} + +void emit_extract_vector(isel_context* ctx, Temp src, uint32_t idx, Temp dst) +{ + Builder bld(ctx->program, ctx->block); + bld.pseudo(aco_opcode::p_extract_vector, Definition(dst), src, Operand(idx)); +} + + +Temp emit_extract_vector(isel_context* ctx, Temp src, uint32_t idx, RegClass dst_rc) +{ + /* no need to extract the whole vector */ + if (src.regClass() == dst_rc) { + assert(idx == 0); + return src; + } + assert(src.size() > idx); + Builder bld(ctx->program, ctx->block); + auto it = ctx->allocated_vec.find(src.id()); + /* the size check needs to be early because elements other than 0 may be garbage */ + if (it != ctx->allocated_vec.end() && it->second[0].size() == dst_rc.size()) { + if (it->second[idx].regClass() == dst_rc) { + return it->second[idx]; + } else { + assert(dst_rc.size() == it->second[idx].regClass().size()); + assert(dst_rc.type() == RegType::vgpr && it->second[idx].type() == RegType::sgpr); + return bld.copy(bld.def(dst_rc), it->second[idx]); + } + } + + if (src.size() == dst_rc.size()) { + assert(idx == 0); + return bld.copy(bld.def(dst_rc), src); + } else { + Temp dst = bld.tmp(dst_rc); + emit_extract_vector(ctx, src, idx, dst); + return dst; + } +} + +void emit_split_vector(isel_context* ctx, Temp vec_src, unsigned num_components) +{ + if (num_components == 1) + return; + if (ctx->allocated_vec.find(vec_src.id()) != ctx->allocated_vec.end()) + return; + aco_ptr<Pseudo_instruction> split{create_instruction<Pseudo_instruction>(aco_opcode::p_split_vector, Format::PSEUDO, 1, num_components)}; + split->operands[0] = Operand(vec_src); + std::array<Temp,4> elems; + for (unsigned i = 0; i < num_components; i++) { + elems[i] = {ctx->program->allocateId(), RegClass(vec_src.type(), vec_src.size() / num_components)}; + split->definitions[i] = Definition(elems[i]); + } + ctx->block->instructions.emplace_back(std::move(split)); + ctx->allocated_vec.emplace(vec_src.id(), elems); +} + +/* This vector expansion uses a mask to determine which elements in the new vector + * 
come from the original vector. The other elements are undefined. */ +void expand_vector(isel_context* ctx, Temp vec_src, Temp dst, unsigned num_components, unsigned mask) +{ + emit_split_vector(ctx, vec_src, util_bitcount(mask)); + + if (vec_src == dst) + return; + + Builder bld(ctx->program, ctx->block); + if (num_components == 1) { + if (dst.type() == RegType::sgpr) + bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), vec_src); + else + bld.copy(Definition(dst), vec_src); + return; + } + + unsigned component_size = dst.size() / num_components; + std::array<Temp,4> elems; + + aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, num_components, 1)}; + vec->definitions[0] = Definition(dst); + unsigned k = 0; + for (unsigned i = 0; i < num_components; i++) { + if (mask & (1 << i)) { + Temp src = emit_extract_vector(ctx, vec_src, k++, RegClass(vec_src.type(), component_size)); + if (dst.type() == RegType::sgpr) + src = bld.as_uniform(src); + vec->operands[i] = Operand(src); + } else { + vec->operands[i] = Operand(0u); + } + elems[i] = vec->operands[i].getTemp(); + } + ctx->block->instructions.emplace_back(std::move(vec)); + ctx->allocated_vec.emplace(dst.id(), elems); +} + +Temp as_divergent_bool(isel_context *ctx, Temp val, bool vcc_hint) +{ + if (val.regClass() == s2) { + return val; + } else { + assert(val.regClass() == s1); + Builder bld(ctx->program, ctx->block); + Definition& def = bld.sop2(aco_opcode::s_cselect_b64, bld.def(s2), + Operand((uint32_t) -1), Operand(0u), bld.scc(val)).def(0); + if (vcc_hint) + def.setHint(vcc); + return def.getTemp(); + } +} + +Temp as_uniform_bool(isel_context *ctx, Temp val) +{ + if (val.regClass() == s1) { + return val; + } else { + assert(val.regClass() == s2); + Builder bld(ctx->program, ctx->block); + return bld.sopc(aco_opcode::s_cmp_lg_u64, bld.def(s1, scc), Operand(0u), Operand(val)); + } +} + +Temp get_alu_src(struct isel_context *ctx, nir_alu_src src, unsigned size=1) +{ + if (src.src.ssa->num_components == 1 && src.swizzle[0] == 0 && size == 1) + return get_ssa_temp(ctx, src.src.ssa); + + if (src.src.ssa->num_components == size) { + bool identity_swizzle = true; + for (unsigned i = 0; identity_swizzle && i < size; i++) { + if (src.swizzle[i] != i) + identity_swizzle = false; + } + if (identity_swizzle) + return get_ssa_temp(ctx, src.src.ssa); + } + + Temp vec = get_ssa_temp(ctx, src.src.ssa); + unsigned elem_size = vec.size() / src.src.ssa->num_components; + assert(elem_size > 0); /* TODO: 8 and 16-bit vectors not supported */ + assert(vec.size() % elem_size == 0); + + RegClass elem_rc = RegClass(vec.type(), elem_size); + if (size == 1) { + return emit_extract_vector(ctx, vec, src.swizzle[0], elem_rc); + } else { + assert(size <= 4); + std::array<Temp,4> elems; + aco_ptr<Pseudo_instruction> vec_instr{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, size, 1)}; + for (unsigned i = 0; i < size; ++i) { + elems[i] = emit_extract_vector(ctx, vec, src.swizzle[i], elem_rc); + vec_instr->operands[i] = Operand{elems[i]}; + } + Temp dst{ctx->program->allocateId(), RegClass(vec.type(), elem_size * size)}; + vec_instr->definitions[0] = Definition(dst); + ctx->block->instructions.emplace_back(std::move(vec_instr)); + ctx->allocated_vec.emplace(dst.id(), elems); + return dst; + } +} + +Temp convert_pointer_to_64_bit(isel_context *ctx, Temp ptr) +{ + if (ptr.size() == 2) + return ptr; + Builder bld(ctx->program, ctx->block); + return 
bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), + ptr, Operand((unsigned)ctx->options->address32_hi)); +} + +void emit_sop2_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode op, Temp dst, bool writes_scc) +{ + aco_ptr<SOP2_instruction> sop2{create_instruction<SOP2_instruction>(op, Format::SOP2, 2, writes_scc ? 2 : 1)}; + sop2->operands[0] = Operand(get_alu_src(ctx, instr->src[0])); + sop2->operands[1] = Operand(get_alu_src(ctx, instr->src[1])); + sop2->definitions[0] = Definition(dst); + if (writes_scc) + sop2->definitions[1] = Definition(ctx->program->allocateId(), scc, s1); + ctx->block->instructions.emplace_back(std::move(sop2)); +} + +void emit_vop2_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode op, Temp dst, bool commutative, bool swap_srcs=false) +{ + Builder bld(ctx->program, ctx->block); + Temp src0 = get_alu_src(ctx, instr->src[swap_srcs ? 1 : 0]); + Temp src1 = get_alu_src(ctx, instr->src[swap_srcs ? 0 : 1]); + if (src1.type() == RegType::sgpr) { + if (commutative && src0.type() == RegType::vgpr) { + Temp t = src0; + src0 = src1; + src1 = t; + } else if (src0.type() == RegType::vgpr && + op != aco_opcode::v_madmk_f32 && + op != aco_opcode::v_madak_f32 && + op != aco_opcode::v_madmk_f16 && + op != aco_opcode::v_madak_f16) { + /* If the instruction is not commutative, we emit a VOP3A instruction */ + bld.vop2_e64(op, Definition(dst), src0, src1); + return; + } else { + src1 = bld.copy(bld.def(RegType::vgpr, src1.size()), src1); //TODO: as_vgpr + } + } + bld.vop2(op, Definition(dst), src0, src1); +} + +void emit_vop3a_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode op, Temp dst) +{ + Temp src0 = get_alu_src(ctx, instr->src[0]); + Temp src1 = get_alu_src(ctx, instr->src[1]); + Temp src2 = get_alu_src(ctx, instr->src[2]); + + /* ensure that the instruction has at most 1 sgpr operand + * The optimizer will inline constants for us */ + if (src0.type() == RegType::sgpr && src1.type() == RegType::sgpr) + src0 = as_vgpr(ctx, src0); + if (src1.type() == RegType::sgpr && src2.type() == RegType::sgpr) + src1 = as_vgpr(ctx, src1); + if (src2.type() == RegType::sgpr && src0.type() == RegType::sgpr) + src2 = as_vgpr(ctx, src2); + + Builder bld(ctx->program, ctx->block); + bld.vop3(op, Definition(dst), src0, src1, src2); +} + +void emit_vop1_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode op, Temp dst) +{ + Builder bld(ctx->program, ctx->block); + bld.vop1(op, Definition(dst), get_alu_src(ctx, instr->src[0])); +} + +void emit_vopc_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode op, Temp dst) +{ + Temp src0 = get_alu_src(ctx, instr->src[0]); + Temp src1 = get_alu_src(ctx, instr->src[1]); + aco_ptr<Instruction> vopc; + if (src1.type() == RegType::sgpr) { + if (src0.type() == RegType::vgpr) { + /* to swap the operands, we might also have to change the opcode */ + switch (op) { + case aco_opcode::v_cmp_lt_f32: + op = aco_opcode::v_cmp_gt_f32; + break; + case aco_opcode::v_cmp_ge_f32: + op = aco_opcode::v_cmp_le_f32; + break; + case aco_opcode::v_cmp_lt_i32: + op = aco_opcode::v_cmp_gt_i32; + break; + case aco_opcode::v_cmp_ge_i32: + op = aco_opcode::v_cmp_le_i32; + break; + case aco_opcode::v_cmp_lt_u32: + op = aco_opcode::v_cmp_gt_u32; + break; + case aco_opcode::v_cmp_ge_u32: + op = aco_opcode::v_cmp_le_u32; + break; + case aco_opcode::v_cmp_lt_f64: + op = aco_opcode::v_cmp_gt_f64; + break; + case aco_opcode::v_cmp_ge_f64: + op = aco_opcode::v_cmp_le_f64; + break; + case aco_opcode::v_cmp_lt_i64: + op = 
aco_opcode::v_cmp_gt_i64; + break; + case aco_opcode::v_cmp_ge_i64: + op = aco_opcode::v_cmp_le_i64; + break; + case aco_opcode::v_cmp_lt_u64: + op = aco_opcode::v_cmp_gt_u64; + break; + case aco_opcode::v_cmp_ge_u64: + op = aco_opcode::v_cmp_le_u64; + break; + default: /* eq and ne are commutative */ + break; + } + Temp t = src0; + src0 = src1; + src1 = t; + } else { + src1 = as_vgpr(ctx, src1); + } + } + Builder bld(ctx->program, ctx->block); + bld.vopc(op, Definition(dst), src0, src1).def(0).setHint(vcc); +} + +void emit_comparison(isel_context *ctx, nir_alu_instr *instr, aco_opcode op, Temp dst) +{ + if (dst.regClass() == s2) { + emit_vopc_instruction(ctx, instr, op, dst); + if (!ctx->divergent_vals[instr->dest.dest.ssa.index]) + emit_split_vector(ctx, dst, 2); + } else if (dst.regClass() == s1) { + Temp src0 = get_alu_src(ctx, instr->src[0]); + Temp src1 = get_alu_src(ctx, instr->src[1]); + assert(src0.type() == RegType::sgpr && src1.type() == RegType::sgpr); + + Builder bld(ctx->program, ctx->block); + bld.sopc(op, bld.scc(Definition(dst)), src0, src1); + + } else { + assert(false); + } +} + +void emit_boolean_logic(isel_context *ctx, nir_alu_instr *instr, aco_opcode op32, aco_opcode op64, Temp dst) +{ + Builder bld(ctx->program, ctx->block); + Temp src0 = get_alu_src(ctx, instr->src[0]); + Temp src1 = get_alu_src(ctx, instr->src[1]); + if (dst.regClass() == s2) { + bld.sop2(op64, Definition(dst), bld.def(s1, scc), + as_divergent_bool(ctx, src0, false), as_divergent_bool(ctx, src1, false)); + } else { + assert(dst.regClass() == s1); + bld.sop2(op32, bld.def(s1), bld.scc(Definition(dst)), + as_uniform_bool(ctx, src0), as_uniform_bool(ctx, src1)); + } +} + + +void emit_bcsel(isel_context *ctx, nir_alu_instr *instr, Temp dst) +{ + Builder bld(ctx->program, ctx->block); + Temp cond = get_alu_src(ctx, instr->src[0]); + Temp then = get_alu_src(ctx, instr->src[1]); + Temp els = get_alu_src(ctx, instr->src[2]); + + if (dst.type() == RegType::vgpr) { + cond = as_divergent_bool(ctx, cond, true); + + aco_ptr<Instruction> bcsel; + if (dst.size() == 1) { + then = as_vgpr(ctx, then); + els = as_vgpr(ctx, els); + + bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), els, then, cond); + } else if (dst.size() == 2) { + Temp then_lo = bld.tmp(v1), then_hi = bld.tmp(v1); + bld.pseudo(aco_opcode::p_split_vector, Definition(then_lo), Definition(then_hi), then); + Temp else_lo = bld.tmp(v1), else_hi = bld.tmp(v1); + bld.pseudo(aco_opcode::p_split_vector, Definition(else_lo), Definition(else_hi), els); + + Temp dst0 = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), else_lo, then_lo, cond); + Temp dst1 = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), else_hi, then_hi, cond); + + bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1); + } else { + fprintf(stderr, "Unimplemented NIR instr bit size: "); + nir_print_instr(&instr->instr, stderr); + fprintf(stderr, "\n"); + } + return; + } + + if (instr->dest.dest.ssa.bit_size != 1) { /* uniform condition and values in sgpr */ + if (dst.regClass() == s1 || dst.regClass() == s2) { + assert((then.regClass() == s1 || then.regClass() == s2) && els.regClass() == then.regClass()); + aco_opcode op = dst.regClass() == s1 ? 
aco_opcode::s_cselect_b32 : aco_opcode::s_cselect_b64; + bld.sop2(op, Definition(dst), then, els, bld.scc(as_uniform_bool(ctx, cond))); + } else { + fprintf(stderr, "Unimplemented uniform bcsel bit size: "); + nir_print_instr(&instr->instr, stderr); + fprintf(stderr, "\n"); + } + return; + } + + /* boolean bcsel */ + assert(instr->dest.dest.ssa.bit_size == 1); + + if (dst.regClass() == s1) + cond = as_uniform_bool(ctx, cond); + + if (cond.regClass() == s1) { /* uniform selection */ + aco_opcode op; + if (dst.regClass() == s2) { + op = aco_opcode::s_cselect_b64; + then = as_divergent_bool(ctx, then, false); + els = as_divergent_bool(ctx, els, false); + } else { + assert(dst.regClass() == s1); + op = aco_opcode::s_cselect_b32; + then = as_uniform_bool(ctx, then); + els = as_uniform_bool(ctx, els); + } + bld.sop2(op, Definition(dst), then, els, bld.scc(cond)); + return; + } + + /* divergent boolean bcsel + * this implements bcsel on bools: dst = s0 ? s1 : s2 + * are going to be: dst = (s0 & s1) | (~s0 & s2) */ + assert (dst.regClass() == s2); + then = as_divergent_bool(ctx, then, false); + els = as_divergent_bool(ctx, els, false); + + if (cond.id() != then.id()) + then = bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.def(s1, scc), cond, then); + + if (cond.id() == els.id()) + bld.sop1(aco_opcode::s_mov_b64, Definition(dst), then); + else + bld.sop2(aco_opcode::s_or_b64, Definition(dst), bld.def(s1, scc), then, + bld.sop2(aco_opcode::s_andn2_b64, bld.def(s2), bld.def(s1, scc), els, cond)); +} + +void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) +{ + if (!instr->dest.dest.is_ssa) { + fprintf(stderr, "nir alu dst not in ssa: "); + nir_print_instr(&instr->instr, stderr); + fprintf(stderr, "\n"); + abort(); + } + Builder bld(ctx->program, ctx->block); + Temp dst = get_ssa_temp(ctx, &instr->dest.dest.ssa); + switch(instr->op) { + case nir_op_vec2: + case nir_op_vec3: + case nir_op_vec4: { + std::array<Temp,4> elems; + aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, instr->dest.dest.ssa.num_components, 1)}; + for (unsigned i = 0; i < instr->dest.dest.ssa.num_components; ++i) { + elems[i] = get_alu_src(ctx, instr->src[i]); + vec->operands[i] = Operand{elems[i]}; + } + vec->definitions[0] = Definition(dst); + ctx->block->instructions.emplace_back(std::move(vec)); + ctx->allocated_vec.emplace(dst.id(), elems); + break; + } + case nir_op_mov: { + Temp src = get_alu_src(ctx, instr->src[0]); + aco_ptr<Instruction> mov; + if (dst.type() == RegType::sgpr) { + if (src.type() == RegType::vgpr) + bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), src); + else if (src.regClass() == s1) + bld.sop1(aco_opcode::s_mov_b32, Definition(dst), src); + else if (src.regClass() == s2) + bld.sop1(aco_opcode::s_mov_b64, Definition(dst), src); + else + unreachable("wrong src register class for nir_op_imov"); + } else if (dst.regClass() == v1) { + bld.vop1(aco_opcode::v_mov_b32, Definition(dst), src); + } else if (dst.regClass() == v2) { + bld.pseudo(aco_opcode::p_create_vector, Definition(dst), src); + } else { + nir_print_instr(&instr->instr, stderr); + unreachable("Should have been lowered to scalar."); + } + break; + } + case nir_op_inot: { + Temp src = get_alu_src(ctx, instr->src[0]); + /* uniform booleans */ + if (instr->dest.dest.ssa.bit_size == 1 && dst.regClass() == s1) { + if (src.regClass() == s1) { + /* in this case, src is either 1 or 0 */ + bld.sop2(aco_opcode::s_xor_b32, bld.def(s1), bld.scc(Definition(dst)), Operand(1u), src); 
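+ /* A small worked example of the inversion above: s_xor_b32 also sets SCC to
+ * (result != 0), and dst is pinned to SCC here. So src == 1 gives 1 ^ 1 == 0
+ * and dst = 0, while src == 0 gives 0 ^ 1 == 1 and dst = 1, i.e. dst = !src. */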
+ } else { + /* src is either exec_mask or 0 */ + assert(src.regClass() == s2); + bld.sopc(aco_opcode::s_cmp_eq_u64, bld.scc(Definition(dst)), Operand(0u), src); + } + } else if (dst.regClass() == v1) { + emit_vop1_instruction(ctx, instr, aco_opcode::v_not_b32, dst); + } else if (dst.type() == RegType::sgpr) { + aco_opcode opcode = dst.size() == 1 ? aco_opcode::s_not_b32 : aco_opcode::s_not_b64; + bld.sop1(opcode, Definition(dst), bld.def(s1, scc), src); + } else { + fprintf(stderr, "Unimplemented NIR instr bit size: "); + nir_print_instr(&instr->instr, stderr); + fprintf(stderr, "\n"); + } + break; + } + case nir_op_ineg: { + Temp src = get_alu_src(ctx, instr->src[0]); + if (dst.regClass() == v1) { + bld.vsub32(Definition(dst), Operand(0u), Operand(src)); + } else if (dst.regClass() == s1) { + bld.sop2(aco_opcode::s_mul_i32, Definition(dst), Operand((uint32_t) -1), src); + } else { + fprintf(stderr, "Unimplemented NIR instr bit size: "); + nir_print_instr(&instr->instr, stderr); + fprintf(stderr, "\n"); + } + break; + } + case nir_op_iabs: { + if (dst.regClass() == s1) { + bld.sop1(aco_opcode::s_abs_i32, Definition(dst), bld.def(s1, scc), get_alu_src(ctx, instr->src[0])); + } else if (dst.regClass() == v1) { + Temp src = get_alu_src(ctx, instr->src[0]); + bld.vop2(aco_opcode::v_max_i32, Definition(dst), src, bld.vsub32(bld.def(v1), Operand(0u), src)); + } else { + fprintf(stderr, "Unimplemented NIR instr bit size: "); + nir_print_instr(&instr->instr, stderr); + fprintf(stderr, "\n"); + } + break; + } + case nir_op_isign: { + Temp src = get_alu_src(ctx, instr->src[0]); + if (dst.regClass() == s1) { + Temp tmp = bld.sop2(aco_opcode::s_ashr_i32, bld.def(s1), bld.def(s1, scc), src, Operand(31u)); + Temp gtz = bld.sopc(aco_opcode::s_cmp_gt_i32, bld.def(s1, scc), src, Operand(0u)); + bld.sop2(aco_opcode::s_add_i32, Definition(dst), bld.def(s1, scc), gtz, tmp); + } else if (dst.regClass() == s2) { + Temp neg = bld.sop2(aco_opcode::s_ashr_i64, bld.def(s2), bld.def(s1, scc), src, Operand(63u)); + Temp neqz = bld.sopc(aco_opcode::s_cmp_lg_u64, bld.def(s1, scc), src, Operand(0u)); + bld.sop2(aco_opcode::s_or_b64, Definition(dst), bld.def(s1, scc), neg, neqz); + } else if (dst.regClass() == v1) { + Temp tmp = bld.vop2(aco_opcode::v_ashrrev_i32, bld.def(v1), Operand(31u), src); + Temp gtz = bld.vopc(aco_opcode::v_cmp_ge_i32, bld.hint_vcc(bld.def(s2)), Operand(0u), src); + bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), Operand(1u), tmp, gtz); + } else if (dst.regClass() == v2) { + Temp upper = emit_extract_vector(ctx, src, 1, v1); + Temp neg = bld.vop2(aco_opcode::v_ashrrev_i32, bld.def(v1), Operand(31u), upper); + Temp gtz = bld.vopc(aco_opcode::v_cmp_ge_i64, bld.hint_vcc(bld.def(s2)), Operand(0u), src); + Temp lower = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(1u), neg, gtz); + upper = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0u), neg, gtz); + bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper); + } else { + fprintf(stderr, "Unimplemented NIR instr bit size: "); + nir_print_instr(&instr->instr, stderr); + fprintf(stderr, "\n"); + } + break; + } + case nir_op_imax: { + if (dst.regClass() == v1) { + emit_vop2_instruction(ctx, instr, aco_opcode::v_max_i32, dst, true); + } else if (dst.regClass() == s1) { + emit_sop2_instruction(ctx, instr, aco_opcode::s_max_i32, dst, true); + } else { + fprintf(stderr, "Unimplemented NIR instr bit size: "); + nir_print_instr(&instr->instr, stderr); + fprintf(stderr, "\n"); + } + break; + } + case nir_op_umax: { + if 
(dst.regClass() == v1) { + emit_vop2_instruction(ctx, instr, aco_opcode::v_max_u32, dst, true); + } else if (dst.regClass() == s1) { + emit_sop2_instruction(ctx, instr, aco_opcode::s_max_u32, dst, true); + } else { + fprintf(stderr, "Unimplemented NIR instr bit size: "); + nir_print_instr(&instr->instr, stderr); + fprintf(stderr, "\n"); + } + break; + } + case nir_op_imin: { + if (dst.regClass() == v1) { + emit_vop2_instruction(ctx, instr, aco_opcode::v_min_i32, dst, true); + } else if (dst.regClass() == s1) { + emit_sop2_instruction(ctx, instr, aco_opcode::s_min_i32, dst, true); + } else { + fprintf(stderr, "Unimplemented NIR instr bit size: "); + nir_print_instr(&instr->instr, stderr); + fprintf(stderr, "\n"); + } + break; + } + case nir_op_umin: { + if (dst.regClass() == v1) { + emit_vop2_instruction(ctx, instr, aco_opcode::v_min_u32, dst, true); + } else if (dst.regClass() == s1) { + emit_sop2_instruction(ctx, instr, aco_opcode::s_min_u32, dst, true); + } else { + fprintf(stderr, "Unimplemented NIR instr bit size: "); + nir_print_instr(&instr->instr, stderr); + fprintf(stderr, "\n"); + } + break; + } + case nir_op_ior: { + if (instr->dest.dest.ssa.bit_size == 1) { + emit_boolean_logic(ctx, instr, aco_opcode::s_or_b32, aco_opcode::s_or_b64, dst); + } else if (dst.regClass() == v1) { + emit_vop2_instruction(ctx, instr, aco_opcode::v_or_b32, dst, true); + } else if (dst.regClass() == s1) { + emit_sop2_instruction(ctx, instr, aco_opcode::s_or_b32, dst, true); + } else if (dst.regClass() == s2) { + emit_sop2_instruction(ctx, instr, aco_opcode::s_or_b64, dst, true); + } else { + fprintf(stderr, "Unimplemented NIR instr bit size: "); + nir_print_instr(&instr->instr, stderr); + fprintf(stderr, "\n"); + } + break; + } + case nir_op_iand: { + if (instr->dest.dest.ssa.bit_size == 1) { + emit_boolean_logic(ctx, instr, aco_opcode::s_and_b32, aco_opcode::s_and_b64, dst); + } else if (dst.regClass() == v1) { + emit_vop2_instruction(ctx, instr, aco_opcode::v_and_b32, dst, true); + } else if (dst.regClass() == s1) { + emit_sop2_instruction(ctx, instr, aco_opcode::s_and_b32, dst, true); + } else if (dst.regClass() == s2) { + emit_sop2_instruction(ctx, instr, aco_opcode::s_and_b64, dst, true); + } else { + fprintf(stderr, "Unimplemented NIR instr bit size: "); + nir_print_instr(&instr->instr, stderr); + fprintf(stderr, "\n"); + } + break; + } + case nir_op_ixor: { + if (instr->dest.dest.ssa.bit_size == 1) { + emit_boolean_logic(ctx, instr, aco_opcode::s_xor_b32, aco_opcode::s_xor_b64, dst); + } else if (dst.regClass() == v1) { + emit_vop2_instruction(ctx, instr, aco_opcode::v_xor_b32, dst, true); + } else if (dst.regClass() == s1) { + emit_sop2_instruction(ctx, instr, aco_opcode::s_xor_b32, dst, true); + } else if (dst.regClass() == s2) { + emit_sop2_instruction(ctx, instr, aco_opcode::s_xor_b64, dst, true); + } else { + fprintf(stderr, "Unimplemented NIR instr bit size: "); + nir_print_instr(&instr->instr, stderr); + fprintf(stderr, "\n"); + } + break; + } + case nir_op_ushr: { + if (dst.regClass() == v1) { + emit_vop2_instruction(ctx, instr, aco_opcode::v_lshrrev_b32, dst, false, true); + } else if (dst.regClass() == v2) { + bld.vop3(aco_opcode::v_lshrrev_b64, Definition(dst), + get_alu_src(ctx, instr->src[1]), get_alu_src(ctx, instr->src[0])); + } else if (dst.regClass() == s2) { + emit_sop2_instruction(ctx, instr, aco_opcode::s_lshr_b64, dst, true); + } else if (dst.regClass() == s1) { + emit_sop2_instruction(ctx, instr, aco_opcode::s_lshr_b32, dst, true); + } else { + fprintf(stderr, "Unimplemented 
NIR instr bit size: "); + nir_print_instr(&instr->instr, stderr); + fprintf(stderr, "\n"); + } + break; + } + case nir_op_ishl: { + if (dst.regClass() == v1) { + emit_vop2_instruction(ctx, instr, aco_opcode::v_lshlrev_b32, dst, false, true); + } else if (dst.regClass() == v2) { + bld.vop3(aco_opcode::v_lshlrev_b64, Definition(dst), + get_alu_src(ctx, instr->src[1]), get_alu_src(ctx, instr->src[0])); + } else if (dst.regClass() == s1) { + emit_sop2_instruction(ctx, instr, aco_opcode::s_lshl_b32, dst, true); + } else if (dst.regClass() == s2) { + emit_sop2_instruction(ctx, instr, aco_opcode::s_lshl_b64, dst, true); + } else { + fprintf(stderr, "Unimplemented NIR instr bit size: "); + nir_print_instr(&instr->instr, stderr); + fprintf(stderr, "\n"); + } + break; + } + case nir_op_ishr: { + if (dst.regClass() == v1) { + emit_vop2_instruction(ctx, instr, aco_opcode::v_ashrrev_i32, dst, false, true); + } else if (dst.regClass() == v2) { + bld.vop3(aco_opcode::v_ashrrev_i64, Definition(dst), + get_alu_src(ctx, instr->src[1]), get_alu_src(ctx, instr->src[0])); + } else if (dst.regClass() == s1) { + emit_sop2_instruction(ctx, instr, aco_opcode::s_ashr_i32, dst, true); + } else if (dst.regClass() == s2) { + emit_sop2_instruction(ctx, instr, aco_opcode::s_ashr_i64, dst, true); + } else { + fprintf(stderr, "Unimplemented NIR instr bit size: "); + nir_print_instr(&instr->instr, stderr); + fprintf(stderr, "\n"); + } + break; + } + case nir_op_find_lsb: { + Temp src = get_alu_src(ctx, instr->src[0]); + if (src.regClass() == s1) { + bld.sop1(aco_opcode::s_ff1_i32_b32, Definition(dst), src); + } else if (src.regClass() == v1) { + emit_vop1_instruction(ctx, instr, aco_opcode::v_ffbl_b32, dst); + } else if (src.regClass() == s2) { + bld.sop1(aco_opcode::s_ff1_i32_b64, Definition(dst), src); + } else { + fprintf(stderr, "Unimplemented NIR instr bit size: "); + nir_print_instr(&instr->instr, stderr); + fprintf(stderr, "\n"); + } + break; + } + case nir_op_ufind_msb: + case nir_op_ifind_msb: { + Temp src = get_alu_src(ctx, instr->src[0]); + if (src.regClass() == s1 || src.regClass() == s2) { + aco_opcode op = src.regClass() == s2 ? + (instr->op == nir_op_ufind_msb ? aco_opcode::s_flbit_i32_b64 : aco_opcode::s_flbit_i32_i64) : + (instr->op == nir_op_ufind_msb ? aco_opcode::s_flbit_i32_b32 : aco_opcode::s_flbit_i32); + Temp msb_rev = bld.sop1(op, bld.def(s1), src); + + Builder::Result sub = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.def(s1, scc), + Operand(src.size() * 32u - 1u), msb_rev); + Temp msb = sub.def(0).getTemp(); + Temp carry = sub.def(1).getTemp(); + + bld.sop2(aco_opcode::s_cselect_b32, Definition(dst), Operand((uint32_t)-1), msb, carry); + } else if (src.regClass() == v1) { + aco_opcode op = instr->op == nir_op_ufind_msb ? 
aco_opcode::v_ffbh_u32 : aco_opcode::v_ffbh_i32; + Temp msb_rev = bld.tmp(v1); + emit_vop1_instruction(ctx, instr, op, msb_rev); + Temp msb = bld.tmp(v1); + Temp carry = bld.vsub32(Definition(msb), Operand(31u), Operand(msb_rev), true).def(1).getTemp(); + bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), msb, Operand((uint32_t)-1), carry); + } else { + fprintf(stderr, "Unimplemented NIR instr bit size: "); + nir_print_instr(&instr->instr, stderr); + fprintf(stderr, "\n"); + } + break; + } + case nir_op_bitfield_reverse: { + if (dst.regClass() == s1) { + bld.sop1(aco_opcode::s_brev_b32, Definition(dst), get_alu_src(ctx, instr->src[0])); + } else if (dst.regClass() == v1) { + bld.vop1(aco_opcode::v_bfrev_b32, Definition(dst), get_alu_src(ctx, instr->src[0])); + } else { + fprintf(stderr, "Unimplemented NIR instr bit size: "); + nir_print_instr(&instr->instr, stderr); + fprintf(stderr, "\n"); + } + break; + } + case nir_op_iadd: { + if (dst.regClass() == s1) { + emit_sop2_instruction(ctx, instr, aco_opcode::s_add_u32, dst, true); + break; + } + + Temp src0 = get_alu_src(ctx, instr->src[0]); + Temp src1 = get_alu_src(ctx, instr->src[1]); + if (dst.regClass() == v1) { + bld.vadd32(Definition(dst), Operand(src0), Operand(src1)); + break; + } + + assert(src0.size() == 2 && src1.size() == 2); + Temp src00 = bld.tmp(src0.type(), 1); + Temp src01 = bld.tmp(dst.type(), 1); + bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0); + Temp src10 = bld.tmp(src1.type(), 1); + Temp src11 = bld.tmp(dst.type(), 1); + bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1); + + if (dst.regClass() == s2) { + Temp carry = bld.tmp(s1); + Temp dst0 = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(carry)), src00, src10); + Temp dst1 = bld.sop2(aco_opcode::s_addc_u32, bld.def(s1), bld.def(s1, scc), src01, src11, bld.scc(carry)); + bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1); + } else if (dst.regClass() == v2) { + Temp dst0 = bld.tmp(v1); + Temp carry = bld.vadd32(Definition(dst0), src00, src10, true).def(1).getTemp(); + Temp dst1 = bld.vadd32(bld.def(v1), src01, src11, false, carry); + bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1); + } else { + fprintf(stderr, "Unimplemented NIR instr bit size: "); + nir_print_instr(&instr->instr, stderr); + fprintf(stderr, "\n"); + } + break; + } + case nir_op_uadd_sat: { + Temp src0 = get_alu_src(ctx, instr->src[0]); + Temp src1 = get_alu_src(ctx, instr->src[1]); + if (dst.regClass() == s1) { + Temp tmp = bld.tmp(s1), carry = bld.tmp(s1); + bld.sop2(aco_opcode::s_add_u32, Definition(tmp), bld.scc(Definition(carry)), + src0, src1); + bld.sop2(aco_opcode::s_cselect_b32, Definition(dst), Operand((uint32_t) -1), tmp, bld.scc(carry)); + } else if (dst.regClass() == v1) { + if (ctx->options->chip_class >= GFX9) { + aco_ptr<VOP3A_instruction> add{create_instruction<VOP3A_instruction>(aco_opcode::v_add_u32, asVOP3(Format::VOP2), 2, 1)}; + add->operands[0] = Operand(src0); + add->operands[1] = Operand(src1); + add->definitions[0] = Definition(dst); + add->clamp = 1; + ctx->block->instructions.emplace_back(std::move(add)); + } else { + if (src1.regClass() != v1) + std::swap(src0, src1); + assert(src1.regClass() == v1); + Temp tmp = bld.tmp(v1); + Temp carry = bld.vadd32(Definition(tmp), src0, src1, true).def(1).getTemp(); + bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), tmp, Operand((uint32_t) -1), carry); + } + } else { + fprintf(stderr, "Unimplemented 
NIR instr bit size: "); + nir_print_instr(&instr->instr, stderr); + fprintf(stderr, "\n"); + } + break; + } + case nir_op_uadd_carry: { + Temp src0 = get_alu_src(ctx, instr->src[0]); + Temp src1 = get_alu_src(ctx, instr->src[1]); + if (dst.regClass() == s1) { + bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(dst)), src0, src1); + break; + } + if (dst.regClass() == v1) { + Temp carry = bld.vadd32(bld.def(v1), src0, src1, true).def(1).getTemp(); + bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), Operand(0u), Operand(1u), carry); + break; + } + + Temp src00 = bld.tmp(src0.type(), 1); + Temp src01 = bld.tmp(dst.type(), 1); + bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0); + Temp src10 = bld.tmp(src1.type(), 1); + Temp src11 = bld.tmp(dst.type(), 1); + bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1); + if (dst.regClass() == s2) { + Temp carry = bld.tmp(s1); + bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(carry)), src00, src10); + carry = bld.sop2(aco_opcode::s_addc_u32, bld.def(s1), bld.scc(bld.def(s1)), src01, src11, bld.scc(carry)).def(1).getTemp(); + bld.pseudo(aco_opcode::p_create_vector, Definition(dst), carry, Operand(0u)); + } else if (dst.regClass() == v2) { + Temp carry = bld.vadd32(bld.def(v1), src00, src10, true).def(1).getTemp(); + carry = bld.vadd32(bld.def(v1), src01, src11, true, carry).def(1).getTemp(); + carry = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0u), Operand(1u), carry); + bld.pseudo(aco_opcode::p_create_vector, Definition(dst), carry, Operand(0u)); + } else { + fprintf(stderr, "Unimplemented NIR instr bit size: "); + nir_print_instr(&instr->instr, stderr); + fprintf(stderr, "\n"); + } + break; + } + case nir_op_isub: { + if (dst.regClass() == s1) { + emit_sop2_instruction(ctx, instr, aco_opcode::s_sub_i32, dst, true); + break; + } + + Temp src0 = get_alu_src(ctx, instr->src[0]); + Temp src1 = get_alu_src(ctx, instr->src[1]); + if (dst.regClass() == v1) { + bld.vsub32(Definition(dst), src0, src1); + break; + } + + Temp src00 = bld.tmp(src0.type(), 1); + Temp src01 = bld.tmp(dst.type(), 1); + bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0); + Temp src10 = bld.tmp(src1.type(), 1); + Temp src11 = bld.tmp(dst.type(), 1); + bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1); + if (dst.regClass() == s2) { + Temp carry = bld.tmp(s1); + Temp dst0 = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.scc(Definition(carry)), src00, src10); + Temp dst1 = bld.sop2(aco_opcode::s_subb_u32, bld.def(s1), bld.def(s1, scc), src01, src11, carry); + bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1); + } else if (dst.regClass() == v2) { + Temp lower = bld.tmp(v1); + Temp borrow = bld.vsub32(Definition(lower), src00, src10, true).def(1).getTemp(); + Temp upper = bld.vsub32(bld.def(v1), src01, src11, false, borrow); + bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper); + } else { + fprintf(stderr, "Unimplemented NIR instr bit size: "); + nir_print_instr(&instr->instr, stderr); + fprintf(stderr, "\n"); + } + break; + } + case nir_op_usub_borrow: { + Temp src0 = get_alu_src(ctx, instr->src[0]); + Temp src1 = get_alu_src(ctx, instr->src[1]); + if (dst.regClass() == s1) { + bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.scc(Definition(dst)), src0, src1); + break; + } else if (dst.regClass() == v1) { + Temp borrow = bld.vsub32(bld.def(v1), src0, src1, 
true).def(1).getTemp(); + bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), Operand(0u), Operand(1u), borrow); + break; + } + + Temp src00 = bld.tmp(src0.type(), 1); + Temp src01 = bld.tmp(dst.type(), 1); + bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0); + Temp src10 = bld.tmp(src1.type(), 1); + Temp src11 = bld.tmp(dst.type(), 1); + bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1); + if (dst.regClass() == s2) { + Temp borrow = bld.tmp(s1); + bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.scc(Definition(borrow)), src00, src10); + borrow = bld.sop2(aco_opcode::s_subb_u32, bld.def(s1), bld.scc(bld.def(s1)), src01, src11, bld.scc(borrow)).def(1).getTemp(); + bld.pseudo(aco_opcode::p_create_vector, Definition(dst), borrow, Operand(0u)); + } else if (dst.regClass() == v2) { + Temp borrow = bld.vsub32(bld.def(v1), src00, src10, true).def(1).getTemp(); + borrow = bld.vsub32(bld.def(v1), src01, src11, true, Operand(borrow)).def(1).getTemp(); + borrow = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0u), Operand(1u), borrow); + bld.pseudo(aco_opcode::p_create_vector, Definition(dst), borrow, Operand(0u)); + } else { + fprintf(stderr, "Unimplemented NIR instr bit size: "); + nir_print_instr(&instr->instr, stderr); + fprintf(stderr, "\n"); + } + break; + } + case nir_op_imul: { + if (dst.regClass() == v1) { + bld.vop3(aco_opcode::v_mul_lo_u32, Definition(dst), + get_alu_src(ctx, instr->src[0]), get_alu_src(ctx, instr->src[1])); + } else if (dst.regClass() == s1) { + emit_sop2_instruction(ctx, instr, aco_opcode::s_mul_i32, dst, false); + } else { + fprintf(stderr, "Unimplemented NIR instr bit size: "); + nir_print_instr(&instr->instr, stderr); + fprintf(stderr, "\n"); + } + break; + } + case nir_op_umul_high: { + if (dst.regClass() == v1) { + bld.vop3(aco_opcode::v_mul_hi_u32, Definition(dst), get_alu_src(ctx, instr->src[0]), get_alu_src(ctx, instr->src[1])); + } else if (dst.regClass() == s1 && ctx->options->chip_class >= GFX9) { + bld.sop2(aco_opcode::s_mul_hi_u32, Definition(dst), get_alu_src(ctx, instr->src[0]), get_alu_src(ctx, instr->src[1])); + } else if (dst.regClass() == s1) { + Temp tmp = bld.vop3(aco_opcode::v_mul_hi_u32, bld.def(v1), get_alu_src(ctx, instr->src[0]), + as_vgpr(ctx, get_alu_src(ctx, instr->src[1]))); + bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), tmp); + } else { + fprintf(stderr, "Unimplemented NIR instr bit size: "); + nir_print_instr(&instr->instr, stderr); + fprintf(stderr, "\n"); + } + break; + } + case nir_op_imul_high: { + if (dst.regClass() == v1) { + bld.vop3(aco_opcode::v_mul_hi_i32, Definition(dst), get_alu_src(ctx, instr->src[0]), get_alu_src(ctx, instr->src[1])); + } else if (dst.regClass() == s1 && ctx->options->chip_class >= GFX9) { + bld.sop2(aco_opcode::s_mul_hi_i32, Definition(dst), get_alu_src(ctx, instr->src[0]), get_alu_src(ctx, instr->src[1])); + } else if (dst.regClass() == s1) { + Temp tmp = bld.vop3(aco_opcode::v_mul_hi_i32, bld.def(v1), get_alu_src(ctx, instr->src[0]), + as_vgpr(ctx, get_alu_src(ctx, instr->src[1]))); + bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), tmp); + } else { + fprintf(stderr, "Unimplemented NIR instr bit size: "); + nir_print_instr(&instr->instr, stderr); + fprintf(stderr, "\n"); + } + break; + } + case nir_op_fmul: { + if (dst.size() == 1) { + emit_vop2_instruction(ctx, instr, aco_opcode::v_mul_f32, dst, true); + } else if (dst.size() == 2) { + bld.vop3(aco_opcode::v_mul_f64, Definition(dst), get_alu_src(ctx, 
instr->src[0]), + as_vgpr(ctx, get_alu_src(ctx, instr->src[1]))); + } else { + fprintf(stderr, "Unimplemented NIR instr bit size: "); + nir_print_instr(&instr->instr, stderr); + fprintf(stderr, "\n"); + } + break; + } + case nir_op_fadd: { + if (dst.size() == 1) { + emit_vop2_instruction(ctx, instr, aco_opcode::v_add_f32, dst, true); + } else if (dst.size() == 2) { + bld.vop3(aco_opcode::v_add_f64, Definition(dst), get_alu_src(ctx, instr->src[0]), + as_vgpr(ctx, get_alu_src(ctx, instr->src[1]))); + } else { + fprintf(stderr, "Unimplemented NIR instr bit size: "); + nir_print_instr(&instr->instr, stderr); + fprintf(stderr, "\n"); + } + break; + } + case nir_op_fsub: { + Temp src0 = get_alu_src(ctx, instr->src[0]); + Temp src1 = get_alu_src(ctx, instr->src[1]); + if (dst.size() == 1) { + if (src1.type() == RegType::vgpr || src0.type() != RegType::vgpr) + emit_vop2_instruction(ctx, instr, aco_opcode::v_sub_f32, dst, false); + else + emit_vop2_instruction(ctx, instr, aco_opcode::v_subrev_f32, dst, true); + } else if (dst.size() == 2) { + Instruction* add = bld.vop3(aco_opcode::v_add_f64, Definition(dst), + get_alu_src(ctx, instr->src[0]), + as_vgpr(ctx, get_alu_src(ctx, instr->src[1]))); + VOP3A_instruction* sub = static_cast<VOP3A_instruction*>(add); + sub->neg[1] = true; + } else { + fprintf(stderr, "Unimplemented NIR instr bit size: "); + nir_print_instr(&instr->instr, stderr); + fprintf(stderr, "\n"); + } + break; + } + case nir_op_fmod: + case nir_op_frem: { + if (dst.size() == 1) { + Temp rcp = bld.vop1(aco_opcode::v_rcp_f32, bld.def(v1), get_alu_src(ctx, instr->src[1])); + Temp mul = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), get_alu_src(ctx, instr->src[0]), rcp); + + aco_opcode op = instr->op == nir_op_fmod ? aco_opcode::v_floor_f32 : aco_opcode::v_trunc_f32; + Temp floor = bld.vop1(op, bld.def(v1), mul); + + mul = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), get_alu_src(ctx, instr->src[1]), floor); + bld.vop2(aco_opcode::v_sub_f32, Definition(dst), get_alu_src(ctx, instr->src[0]), mul); + } else if (dst.size() == 2) { + Temp rcp = bld.vop1(aco_opcode::v_rcp_f64, bld.def(v2), get_alu_src(ctx, instr->src[1])); + Temp mul = bld.vop3(aco_opcode::v_mul_f64, bld.def(v2), get_alu_src(ctx, instr->src[0]), rcp); + + aco_opcode op = instr->op == nir_op_fmod ? 
aco_opcode::v_floor_f64 : aco_opcode::v_trunc_f64; + Temp floor = bld.vop1(op, bld.def(v1), mul); + + mul = bld.vop3(aco_opcode::v_mul_f64, bld.def(v2), get_alu_src(ctx, instr->src[1]), floor); + Instruction* add = bld.vop3(aco_opcode::v_add_f64, Definition(dst), get_alu_src(ctx, instr->src[0]), mul); + VOP3A_instruction* sub = static_cast<VOP3A_instruction*>(add); + sub->neg[1] = true; + } else { + fprintf(stderr, "Unimplemented NIR instr bit size: "); + nir_print_instr(&instr->instr, stderr); + fprintf(stderr, "\n"); + } + break; + } + case nir_op_fmax: { + if (dst.size() == 1) { + emit_vop2_instruction(ctx, instr, aco_opcode::v_max_f32, dst, true); + } else if (dst.size() == 2) { + bld.vop3(aco_opcode::v_max_f64, Definition(dst), + get_alu_src(ctx, instr->src[0]), + as_vgpr(ctx, get_alu_src(ctx, instr->src[1]))); + } else { + fprintf(stderr, "Unimplemented NIR instr bit size: "); + nir_print_instr(&instr->instr, stderr); + fprintf(stderr, "\n"); + } + break; + } + case nir_op_fmin: { + if (dst.size() == 1) { + emit_vop2_instruction(ctx, instr, aco_opcode::v_min_f32, dst, true); + } else if (dst.size() == 2) { + bld.vop3(aco_opcode::v_min_f64, Definition(dst), + get_alu_src(ctx, instr->src[0]), + as_vgpr(ctx, get_alu_src(ctx, instr->src[1]))); + } else { + fprintf(stderr, "Unimplemented NIR instr bit size: "); + nir_print_instr(&instr->instr, stderr); + fprintf(stderr, "\n"); + } + break; + } + case nir_op_fmax3: { + if (dst.size() == 1) { + emit_vop3a_instruction(ctx, instr, aco_opcode::v_max3_f32, dst); + } else { + fprintf(stderr, "Unimplemented NIR instr bit size: "); + nir_print_instr(&instr->instr, stderr); + fprintf(stderr, "\n"); + } + break; + } + case nir_op_fmin3: { + if (dst.size() == 1) { + emit_vop3a_instruction(ctx, instr, aco_opcode::v_min3_f32, dst); + } else { + fprintf(stderr, "Unimplemented NIR instr bit size: "); + nir_print_instr(&instr->instr, stderr); + fprintf(stderr, "\n"); + } + break; + } + case nir_op_fmed3: { + if (dst.size() == 1) { + emit_vop3a_instruction(ctx, instr, aco_opcode::v_med3_f32, dst); + } else { + fprintf(stderr, "Unimplemented NIR instr bit size: "); + nir_print_instr(&instr->instr, stderr); + fprintf(stderr, "\n"); + } + break; + } + case nir_op_umax3: { + if (dst.size() == 1) { + emit_vop3a_instruction(ctx, instr, aco_opcode::v_max3_u32, dst); + } else { + fprintf(stderr, "Unimplemented NIR instr bit size: "); + nir_print_instr(&instr->instr, stderr); + fprintf(stderr, "\n"); + } + break; + } + case nir_op_umin3: { + if (dst.size() == 1) { + emit_vop3a_instruction(ctx, instr, aco_opcode::v_min3_u32, dst); + } else { + fprintf(stderr, "Unimplemented NIR instr bit size: "); + nir_print_instr(&instr->instr, stderr); + fprintf(stderr, "\n"); + } + break; + } + case nir_op_umed3: { + if (dst.size() == 1) { + emit_vop3a_instruction(ctx, instr, aco_opcode::v_med3_u32, dst); + } else { + fprintf(stderr, "Unimplemented NIR instr bit size: "); + nir_print_instr(&instr->instr, stderr); + fprintf(stderr, "\n"); + } + break; + } + case nir_op_imax3: { + if (dst.size() == 1) { + emit_vop3a_instruction(ctx, instr, aco_opcode::v_max3_i32, dst); + } else { + fprintf(stderr, "Unimplemented NIR instr bit size: "); + nir_print_instr(&instr->instr, stderr); + fprintf(stderr, "\n"); + } + break; + } + case nir_op_imin3: { + if (dst.size() == 1) { + emit_vop3a_instruction(ctx, instr, aco_opcode::v_min3_i32, dst); + } else { + fprintf(stderr, "Unimplemented NIR instr bit size: "); + nir_print_instr(&instr->instr, stderr); + fprintf(stderr, "\n"); + } + break; + } 
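+ /* v_med3_* picks the middle of its three operands, e.g. med3(5, -1, 3) = 3,
+ * so with lo <= hi, med3(x, lo, hi) behaves like clamp(x, lo, hi) (ignoring
+ * NaN corner cases for the float variant); the nir_op_fsat case below relies
+ * on the same identity with v_med3_f32(0.0, 1.0, src). */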
+ case nir_op_imed3: { + if (dst.size() == 1) { + emit_vop3a_instruction(ctx, instr, aco_opcode::v_med3_i32, dst); + } else { + fprintf(stderr, "Unimplemented NIR instr bit size: "); + nir_print_instr(&instr->instr, stderr); + fprintf(stderr, "\n"); + } + break; + } + case nir_op_cube_face_coord: { + Temp in = get_alu_src(ctx, instr->src[0], 3); + Temp src[3] = { emit_extract_vector(ctx, in, 0, v1), + emit_extract_vector(ctx, in, 1, v1), + emit_extract_vector(ctx, in, 2, v1) }; + Temp ma = bld.vop3(aco_opcode::v_cubema_f32, bld.def(v1), src[0], src[1], src[2]); + ma = bld.vop1(aco_opcode::v_rcp_f32, bld.def(v1), ma); + Temp sc = bld.vop3(aco_opcode::v_cubesc_f32, bld.def(v1), src[0], src[1], src[2]); + Temp tc = bld.vop3(aco_opcode::v_cubetc_f32, bld.def(v1), src[0], src[1], src[2]); + sc = bld.vop2(aco_opcode::v_madak_f32, bld.def(v1), sc, ma, Operand(0x3f000000u/*0.5*/)); + tc = bld.vop2(aco_opcode::v_madak_f32, bld.def(v1), tc, ma, Operand(0x3f000000u/*0.5*/)); + bld.pseudo(aco_opcode::p_create_vector, Definition(dst), sc, tc); + break; + } + case nir_op_cube_face_index: { + Temp in = get_alu_src(ctx, instr->src[0], 3); + Temp src[3] = { emit_extract_vector(ctx, in, 0, v1), + emit_extract_vector(ctx, in, 1, v1), + emit_extract_vector(ctx, in, 2, v1) }; + bld.vop3(aco_opcode::v_cubeid_f32, Definition(dst), src[0], src[1], src[2]); + break; + } + case nir_op_bcsel: { + emit_bcsel(ctx, instr, dst); + break; + } + case nir_op_frsq: { + if (dst.size() == 1) { + emit_vop1_instruction(ctx, instr, aco_opcode::v_rsq_f32, dst); + } else if (dst.size() == 2) { + emit_vop1_instruction(ctx, instr, aco_opcode::v_rsq_f64, dst); + } else { + fprintf(stderr, "Unimplemented NIR instr bit size: "); + nir_print_instr(&instr->instr, stderr); + fprintf(stderr, "\n"); + } + break; + } + case nir_op_fneg: { + Temp src = get_alu_src(ctx, instr->src[0]); + if (dst.size() == 1) { + bld.vop2(aco_opcode::v_xor_b32, Definition(dst), Operand(0x80000000u), as_vgpr(ctx, src)); + } else if (dst.size() == 2) { + Temp upper = bld.tmp(v1), lower = bld.tmp(v1); + bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), src); + upper = bld.vop2(aco_opcode::v_xor_b32, bld.def(v1), Operand(0x80000000u), upper); + bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper); + } else { + fprintf(stderr, "Unimplemented NIR instr bit size: "); + nir_print_instr(&instr->instr, stderr); + fprintf(stderr, "\n"); + } + break; + } + case nir_op_fabs: { + Temp src = get_alu_src(ctx, instr->src[0]); + if (dst.size() == 1) { + bld.vop2(aco_opcode::v_and_b32, Definition(dst), Operand(0x7FFFFFFFu), as_vgpr(ctx, src)); + } else if (dst.size() == 2) { + Temp upper = bld.tmp(v1), lower = bld.tmp(v1); + bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), src); + upper = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0x7FFFFFFFu), upper); + bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper); + } else { + fprintf(stderr, "Unimplemented NIR instr bit size: "); + nir_print_instr(&instr->instr, stderr); + fprintf(stderr, "\n"); + } + break; + } + case nir_op_fsat: { + Temp src = get_alu_src(ctx, instr->src[0]); + if (dst.size() == 1) { + bld.vop3(aco_opcode::v_med3_f32, Definition(dst), Operand(0u), Operand(0x3f800000u), src); + } else if (dst.size() == 2) { + Instruction* add = bld.vop3(aco_opcode::v_add_f64, Definition(dst), src, Operand(0u)); + VOP3A_instruction* vop3 = static_cast<VOP3A_instruction*>(add); + vop3->clamp = true; + } else { + fprintf(stderr, 
"Unimplemented NIR instr bit size: "); + nir_print_instr(&instr->instr, stderr); + fprintf(stderr, "\n"); + } + break; + } + case nir_op_flog2: { + if (dst.size() == 1) { + emit_vop1_instruction(ctx, instr, aco_opcode::v_log_f32, dst); + } else { + fprintf(stderr, "Unimplemented NIR instr bit size: "); + nir_print_instr(&instr->instr, stderr); + fprintf(stderr, "\n"); + } + break; + } + case nir_op_frcp: { + if (dst.size() == 1) { + emit_vop1_instruction(ctx, instr, aco_opcode::v_rcp_f32, dst); + } else if (dst.size() == 2) { + emit_vop1_instruction(ctx, instr, aco_opcode::v_rcp_f64, dst); + } else { + fprintf(stderr, "Unimplemented NIR instr bit size: "); + nir_print_instr(&instr->instr, stderr); + fprintf(stderr, "\n"); + } + break; + } + case nir_op_fexp2: { + if (dst.size() == 1) { + emit_vop1_instruction(ctx, instr, aco_opcode::v_exp_f32, dst); + } else { + fprintf(stderr, "Unimplemented NIR instr bit size: "); + nir_print_instr(&instr->instr, stderr); + fprintf(stderr, "\n"); + } + break; + } + case nir_op_fsqrt: { + if (dst.size() == 1) { + emit_vop1_instruction(ctx, instr, aco_opcode::v_sqrt_f32, dst); + } else if (dst.size() == 2) { + emit_vop1_instruction(ctx, instr, aco_opcode::v_sqrt_f64, dst); + } else { + fprintf(stderr, "Unimplemented NIR instr bit size: "); + nir_print_instr(&instr->instr, stderr); + fprintf(stderr, "\n"); + } + break; + } + case nir_op_ffract: { + if (dst.size() == 1) { + emit_vop1_instruction(ctx, instr, aco_opcode::v_fract_f32, dst); + } else if (dst.size() == 2) { + emit_vop1_instruction(ctx, instr, aco_opcode::v_fract_f64, dst); + } else { + fprintf(stderr, "Unimplemented NIR instr bit size: "); + nir_print_instr(&instr->instr, stderr); + fprintf(stderr, "\n"); + } + break; + } + case nir_op_ffloor: { + if (dst.size() == 1) { + emit_vop1_instruction(ctx, instr, aco_opcode::v_floor_f32, dst); + } else if (dst.size() == 2) { + emit_vop1_instruction(ctx, instr, aco_opcode::v_floor_f64, dst); + } else { + fprintf(stderr, "Unimplemented NIR instr bit size: "); + nir_print_instr(&instr->instr, stderr); + fprintf(stderr, "\n"); + } + break; + } + case nir_op_fceil: { + if (dst.size() == 1) { + emit_vop1_instruction(ctx, instr, aco_opcode::v_ceil_f32, dst); + } else if (dst.size() == 2) { + emit_vop1_instruction(ctx, instr, aco_opcode::v_ceil_f64, dst); + } else { + fprintf(stderr, "Unimplemented NIR instr bit size: "); + nir_print_instr(&instr->instr, stderr); + fprintf(stderr, "\n"); + } + break; + } + case nir_op_ftrunc: { + if (dst.size() == 1) { + emit_vop1_instruction(ctx, instr, aco_opcode::v_trunc_f32, dst); + } else if (dst.size() == 2) { + emit_vop1_instruction(ctx, instr, aco_opcode::v_trunc_f64, dst); + } else { + fprintf(stderr, "Unimplemented NIR instr bit size: "); + nir_print_instr(&instr->instr, stderr); + fprintf(stderr, "\n"); + } + break; + } + case nir_op_fround_even: { + if (dst.size() == 1) { + emit_vop1_instruction(ctx, instr, aco_opcode::v_rndne_f32, dst); + } else if (dst.size() == 2) { + emit_vop1_instruction(ctx, instr, aco_opcode::v_rndne_f64, dst); + } else { + fprintf(stderr, "Unimplemented NIR instr bit size: "); + nir_print_instr(&instr->instr, stderr); + fprintf(stderr, "\n"); + } + break; + } + case nir_op_fsin: + case nir_op_fcos: { + Temp src = get_alu_src(ctx, instr->src[0]); + aco_ptr<Instruction> norm; + if (dst.size() == 1) { + Temp tmp; + Operand half_pi(0x3e22f983u); + if (src.type() == RegType::sgpr) + tmp = bld.vop2_e64(aco_opcode::v_mul_f32, bld.def(v1), half_pi, src); + else + tmp = bld.vop2(aco_opcode::v_mul_f32, 
bld.def(v1), half_pi, src); + + /* before GFX9, v_sin_f32 and v_cos_f32 had a valid input domain of [-256, +256] */ + if (ctx->options->chip_class < GFX9) + tmp = bld.vop1(aco_opcode::v_fract_f32, bld.def(v1), tmp); + + aco_opcode opcode = instr->op == nir_op_fsin ? aco_opcode::v_sin_f32 : aco_opcode::v_cos_f32; + bld.vop1(opcode, Definition(dst), tmp); + } else { + fprintf(stderr, "Unimplemented NIR instr bit size: "); + nir_print_instr(&instr->instr, stderr); + fprintf(stderr, "\n"); + } + break; + } + case nir_op_ldexp: { + if (dst.size() == 1) { + bld.vop3(aco_opcode::v_ldexp_f32, Definition(dst), + as_vgpr(ctx, get_alu_src(ctx, instr->src[0])), + get_alu_src(ctx, instr->src[1])); + } else if (dst.size() == 2) { + bld.vop3(aco_opcode::v_ldexp_f64, Definition(dst), + as_vgpr(ctx, get_alu_src(ctx, instr->src[0])), + get_alu_src(ctx, instr->src[1])); + } else { + fprintf(stderr, "Unimplemented NIR instr bit size: "); + nir_print_instr(&instr->instr, stderr); + fprintf(stderr, "\n"); + } + break; + } + case nir_op_frexp_sig: { + if (dst.size() == 1) { + bld.vop1(aco_opcode::v_frexp_mant_f32, Definition(dst), + get_alu_src(ctx, instr->src[0])); + } else if (dst.size() == 2) { + bld.vop1(aco_opcode::v_frexp_mant_f64, Definition(dst), + get_alu_src(ctx, instr->src[0])); + } else { + fprintf(stderr, "Unimplemented NIR instr bit size: "); + nir_print_instr(&instr->instr, stderr); + fprintf(stderr, "\n"); + } + break; + } + case nir_op_frexp_exp: { + if (instr->src[0].src.ssa->bit_size == 32) { + bld.vop1(aco_opcode::v_frexp_exp_i32_f32, Definition(dst), + get_alu_src(ctx, instr->src[0])); + } else if (instr->src[0].src.ssa->bit_size == 64) { + bld.vop1(aco_opcode::v_frexp_exp_i32_f64, Definition(dst), + get_alu_src(ctx, instr->src[0])); + } else { + fprintf(stderr, "Unimplemented NIR instr bit size: "); + nir_print_instr(&instr->instr, stderr); + fprintf(stderr, "\n"); + } + break; + } + case nir_op_fsign: { + Temp src = as_vgpr(ctx, get_alu_src(ctx, instr->src[0])); + if (dst.size() == 1) { + Temp cond = bld.vopc(aco_opcode::v_cmp_nlt_f32, bld.hint_vcc(bld.def(s2)), Operand(0u), src); + src = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0x3f800000u), src, cond); + cond = bld.vopc(aco_opcode::v_cmp_le_f32, bld.hint_vcc(bld.def(s2)), Operand(0u), src); + bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), Operand(0xbf800000u), src, cond); + } else if (dst.size() == 2) { + Temp cond = bld.vopc(aco_opcode::v_cmp_nlt_f64, bld.hint_vcc(bld.def(s2)), Operand(0u), src); + Temp tmp = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), Operand(0x3FF00000u)); + Temp upper = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), tmp, src, cond); + + cond = bld.vopc(aco_opcode::v_cmp_le_f64, bld.hint_vcc(bld.def(s2)), Operand(0u), src); + tmp = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), Operand(0xBFF00000u)); + upper = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), tmp, upper, cond); + + bld.pseudo(aco_opcode::p_create_vector, Definition(dst), Operand(0u), upper); + } else { + fprintf(stderr, "Unimplemented NIR instr bit size: "); + nir_print_instr(&instr->instr, stderr); + fprintf(stderr, "\n"); + } + break; + } + case nir_op_f2f32: { + if (instr->src[0].src.ssa->bit_size == 64) { + emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_f32_f64, dst); + } else { + fprintf(stderr, "Unimplemented NIR instr bit size: "); + nir_print_instr(&instr->instr, stderr); + fprintf(stderr, "\n"); + } + break; + } + case nir_op_f2f64: { + if (instr->src[0].src.ssa->bit_size == 32) { + emit_vop1_instruction(ctx, 
instr, aco_opcode::v_cvt_f64_f32, dst); + } else { + fprintf(stderr, "Unimplemented NIR instr bit size: "); + nir_print_instr(&instr->instr, stderr); + fprintf(stderr, "\n"); + } + break; + } + case nir_op_i2f32: { + assert(dst.size() == 1); + emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_f32_i32, dst); + break; + } + case nir_op_i2f64: { + if (instr->src[0].src.ssa->bit_size == 32) { + emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_f64_i32, dst); + } else if (instr->src[0].src.ssa->bit_size == 64) { + Temp src = get_alu_src(ctx, instr->src[0]); + RegClass rc = RegClass(src.type(), 1); + Temp lower = bld.tmp(rc), upper = bld.tmp(rc); + bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), src); + lower = bld.vop1(aco_opcode::v_cvt_f64_u32, bld.def(v2), lower); + upper = bld.vop1(aco_opcode::v_cvt_f64_i32, bld.def(v2), upper); + upper = bld.vop3(aco_opcode::v_ldexp_f64, bld.def(v2), upper, Operand(32u)); + bld.vop3(aco_opcode::v_add_f64, Definition(dst), lower, upper); + + } else { + fprintf(stderr, "Unimplemented NIR instr bit size: "); + nir_print_instr(&instr->instr, stderr); + fprintf(stderr, "\n"); + } + break; + } + case nir_op_u2f32: { + assert(dst.size() == 1); + emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_f32_u32, dst); + break; + } + case nir_op_u2f64: { + if (instr->src[0].src.ssa->bit_size == 32) { + emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_f64_u32, dst); + } else if (instr->src[0].src.ssa->bit_size == 64) { + Temp src = get_alu_src(ctx, instr->src[0]); + RegClass rc = RegClass(src.type(), 1); + Temp lower = bld.tmp(rc), upper = bld.tmp(rc); + bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), src); + lower = bld.vop1(aco_opcode::v_cvt_f64_u32, bld.def(v2), lower); + upper = bld.vop1(aco_opcode::v_cvt_f64_u32, bld.def(v2), upper); + upper = bld.vop3(aco_opcode::v_ldexp_f64, bld.def(v2), upper, Operand(32u)); + bld.vop3(aco_opcode::v_add_f64, Definition(dst), lower, upper); + } else { + fprintf(stderr, "Unimplemented NIR instr bit size: "); + nir_print_instr(&instr->instr, stderr); + fprintf(stderr, "\n"); + } + break; + } + case nir_op_f2i32: { + Temp src = get_alu_src(ctx, instr->src[0]); + if (instr->src[0].src.ssa->bit_size == 32) { + if (dst.type() == RegType::vgpr) + bld.vop1(aco_opcode::v_cvt_i32_f32, Definition(dst), src); + else + bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), + bld.vop1(aco_opcode::v_cvt_i32_f32, bld.def(v1), src)); + + } else if (instr->src[0].src.ssa->bit_size == 64) { + if (dst.type() == RegType::vgpr) + bld.vop1(aco_opcode::v_cvt_i32_f64, Definition(dst), src); + else + bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), + bld.vop1(aco_opcode::v_cvt_i32_f64, bld.def(v1), src)); + + } else { + fprintf(stderr, "Unimplemented NIR instr bit size: "); + nir_print_instr(&instr->instr, stderr); + fprintf(stderr, "\n"); + } + break; + } + case nir_op_f2u32: { + Temp src = get_alu_src(ctx, instr->src[0]); + if (instr->src[0].src.ssa->bit_size == 32) { + if (dst.type() == RegType::vgpr) + bld.vop1(aco_opcode::v_cvt_u32_f32, Definition(dst), src); + else + bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), + bld.vop1(aco_opcode::v_cvt_u32_f32, bld.def(v1), src)); + + } else if (instr->src[0].src.ssa->bit_size == 64) { + if (dst.type() == RegType::vgpr) + bld.vop1(aco_opcode::v_cvt_u32_f64, Definition(dst), src); + else + bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), + bld.vop1(aco_opcode::v_cvt_u32_f64, bld.def(v1), src)); + + } else { + fprintf(stderr, 
"Unimplemented NIR instr bit size: "); + nir_print_instr(&instr->instr, stderr); + fprintf(stderr, "\n"); + } + break; + } + case nir_op_f2i64: { + Temp src = get_alu_src(ctx, instr->src[0]); + if (instr->src[0].src.ssa->bit_size == 32 && dst.type() == RegType::vgpr) { + Temp exponent = bld.vop1(aco_opcode::v_frexp_exp_i32_f32, bld.def(v1), src); + exponent = bld.vop3(aco_opcode::v_med3_i32, bld.def(v1), Operand(0x0u), exponent, Operand(64u)); + Temp mantissa = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0x7fffffu), src); + Temp sign = bld.vop2(aco_opcode::v_ashrrev_i32, bld.def(v1), Operand(31u), src); + mantissa = bld.vop2(aco_opcode::v_or_b32, bld.def(v1), Operand(0x800000u), mantissa); + mantissa = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(7u), mantissa); + mantissa = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), Operand(0u), mantissa); + Temp new_exponent = bld.tmp(v1); + Temp borrow = bld.vsub32(Definition(new_exponent), Operand(63u), exponent, true).def(1).getTemp(); + mantissa = bld.vop3(aco_opcode::v_lshrrev_b64, bld.def(v2), new_exponent, mantissa); + Temp saturate = bld.vop1(aco_opcode::v_bfrev_b32, bld.def(v1), Operand(0xfffffffeu)); + Temp lower = bld.tmp(v1), upper = bld.tmp(v1); + bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), mantissa); + lower = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), lower, Operand(0xffffffffu), borrow); + upper = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), upper, saturate, borrow); + lower = bld.vop2(aco_opcode::v_xor_b32, bld.def(v1), sign, lower); + upper = bld.vop2(aco_opcode::v_xor_b32, bld.def(v1), sign, upper); + Temp new_lower = bld.tmp(v1); + borrow = bld.vsub32(Definition(new_lower), lower, sign, true).def(1).getTemp(); + Temp new_upper = bld.vsub32(bld.def(v1), upper, sign, false, borrow); + bld.pseudo(aco_opcode::p_create_vector, Definition(dst), new_lower, new_upper); + + } else if (instr->src[0].src.ssa->bit_size == 32 && dst.type() == RegType::sgpr) { + if (src.type() == RegType::vgpr) + src = bld.as_uniform(src); + Temp exponent = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), src, Operand(0x80017u)); + exponent = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.def(s1, scc), exponent, Operand(126u)); + exponent = bld.sop2(aco_opcode::s_max_u32, bld.def(s1), bld.def(s1, scc), Operand(0u), exponent); + exponent = bld.sop2(aco_opcode::s_min_u32, bld.def(s1), bld.def(s1, scc), Operand(64u), exponent); + Temp mantissa = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), Operand(0x7fffffu), src); + Temp sign = bld.sop2(aco_opcode::s_ashr_i32, bld.def(s1), bld.def(s1, scc), src, Operand(31u)); + mantissa = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), Operand(0x800000u), mantissa); + mantissa = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), mantissa, Operand(7u)); + mantissa = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand(0u), mantissa); + exponent = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.def(s1, scc), Operand(63u), exponent); + mantissa = bld.sop2(aco_opcode::s_lshr_b64, bld.def(s2), bld.def(s1, scc), mantissa, exponent); + Temp cond = bld.sopc(aco_opcode::s_cmp_eq_u32, bld.def(s1, scc), exponent, Operand(0xffffffffu)); // exp >= 64 + Temp saturate = bld.sop1(aco_opcode::s_brev_b64, bld.def(s2), Operand(0xfffffffeu)); + mantissa = bld.sop2(aco_opcode::s_cselect_b64, bld.def(s2), saturate, mantissa, cond); + Temp lower = bld.tmp(s1), upper = bld.tmp(s1); + 
bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), mantissa); + lower = bld.sop2(aco_opcode::s_xor_b32, bld.def(s1), bld.def(s1, scc), sign, lower); + upper = bld.sop2(aco_opcode::s_xor_b32, bld.def(s1), bld.def(s1, scc), sign, upper); + Temp borrow = bld.tmp(s1); + lower = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.scc(Definition(borrow)), lower, sign); + upper = bld.sop2(aco_opcode::s_subb_u32, bld.def(s1), bld.def(s1, scc), upper, sign, borrow); + bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper); + + } else if (instr->src[0].src.ssa->bit_size == 64) { + Temp vec = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand(0u), Operand(0x3df00000u)); + Temp trunc = bld.vop1(aco_opcode::v_trunc_f64, bld.def(v2), src); + Temp mul = bld.vop3(aco_opcode::v_mul_f64, bld.def(v2), trunc, vec); + vec = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand(0u), Operand(0xc1f00000u)); + Temp floor = bld.vop1(aco_opcode::v_floor_f64, bld.def(v2), mul); + Temp fma = bld.vop3(aco_opcode::v_fma_f64, bld.def(v2), floor, vec, trunc); + Temp lower = bld.vop1(aco_opcode::v_cvt_u32_f64, bld.def(v1), fma); + Temp upper = bld.vop1(aco_opcode::v_cvt_i32_f64, bld.def(v1), floor); + if (dst.type() == RegType::sgpr) { + lower = bld.as_uniform(lower); + upper = bld.as_uniform(upper); + } + bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper); + + } else { + fprintf(stderr, "Unimplemented NIR instr bit size: "); + nir_print_instr(&instr->instr, stderr); + fprintf(stderr, "\n"); + } + break; + } + case nir_op_f2u64: { + Temp src = get_alu_src(ctx, instr->src[0]); + if (instr->src[0].src.ssa->bit_size == 32 && dst.type() == RegType::vgpr) { + Temp exponent = bld.vop1(aco_opcode::v_frexp_exp_i32_f32, bld.def(v1), src); + Temp exponent_in_range = bld.vopc(aco_opcode::v_cmp_ge_i32, bld.hint_vcc(bld.def(s2)), Operand(64u), exponent); + exponent = bld.vop2(aco_opcode::v_max_i32, bld.def(v1), Operand(0x0u), exponent); + Temp mantissa = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0x7fffffu), src); + mantissa = bld.vop2(aco_opcode::v_or_b32, bld.def(v1), Operand(0x800000u), mantissa); + Temp exponent_small = bld.vsub32(bld.def(v1), Operand(24u), exponent); + Temp small = bld.vop2(aco_opcode::v_lshrrev_b32, bld.def(v1), exponent_small, mantissa); + mantissa = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), Operand(0u), mantissa); + Temp new_exponent = bld.tmp(v1); + Temp cond_small = bld.vsub32(Definition(new_exponent), exponent, Operand(24u), true).def(1).getTemp(); + mantissa = bld.vop3(aco_opcode::v_lshlrev_b64, bld.def(v2), new_exponent, mantissa); + Temp lower = bld.tmp(v1), upper = bld.tmp(v1); + bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), mantissa); + lower = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), lower, small, cond_small); + upper = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), upper, Operand(0u), cond_small); + lower = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0xffffffffu), lower, exponent_in_range); + upper = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0xffffffffu), upper, exponent_in_range); + bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper); + + } else if (instr->src[0].src.ssa->bit_size == 32 && dst.type() == RegType::sgpr) { + if (src.type() == RegType::vgpr) + src = bld.as_uniform(src); + Temp exponent = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), src, Operand(0x80017u)); + exponent = 
bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.def(s1, scc), exponent, Operand(126u)); + exponent = bld.sop2(aco_opcode::s_max_u32, bld.def(s1), bld.def(s1, scc), Operand(0u), exponent); + Temp mantissa = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), Operand(0x7fffffu), src); + mantissa = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), Operand(0x800000u), mantissa); + Temp exponent_small = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.def(s1, scc), Operand(24u), exponent); + Temp small = bld.sop2(aco_opcode::s_lshr_b32, bld.def(s1), bld.def(s1, scc), mantissa, exponent_small); + mantissa = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand(0u), mantissa); + Temp exponent_large = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.def(s1, scc), exponent, Operand(24u)); + mantissa = bld.sop2(aco_opcode::s_lshl_b64, bld.def(s2), bld.def(s1, scc), mantissa, exponent_large); + Temp cond = bld.sopc(aco_opcode::s_cmp_ge_i32, bld.def(s1, scc), Operand(64u), exponent); + mantissa = bld.sop2(aco_opcode::s_cselect_b64, bld.def(s2), mantissa, Operand(0xffffffffu), cond); + Temp lower = bld.tmp(s1), upper = bld.tmp(s1); + bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), mantissa); + Temp cond_small = bld.sopc(aco_opcode::s_cmp_le_i32, bld.def(s1, scc), exponent, Operand(24u)); + lower = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1), small, lower, cond_small); + upper = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1), Operand(0u), upper, cond_small); + bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper); + + } else if (instr->src[0].src.ssa->bit_size == 64) { + Temp vec = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand(0u), Operand(0x3df00000u)); + Temp trunc = bld.vop1(aco_opcode::v_trunc_f64, bld.def(v2), src); + Temp mul = bld.vop3(aco_opcode::v_mul_f64, bld.def(v2), trunc, vec); + vec = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand(0u), Operand(0xc1f00000u)); + Temp floor = bld.vop1(aco_opcode::v_floor_f64, bld.def(v2), mul); + Temp fma = bld.vop3(aco_opcode::v_fma_f64, bld.def(v2), floor, vec, trunc); + Temp lower = bld.vop1(aco_opcode::v_cvt_u32_f64, bld.def(v1), fma); + Temp upper = bld.vop1(aco_opcode::v_cvt_u32_f64, bld.def(v1), floor); + if (dst.type() == RegType::sgpr) { + lower = bld.as_uniform(lower); + upper = bld.as_uniform(upper); + } + bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper); + + } else { + fprintf(stderr, "Unimplemented NIR instr bit size: "); + nir_print_instr(&instr->instr, stderr); + fprintf(stderr, "\n"); + } + break; + } + case nir_op_b2f32: { + Temp src = get_alu_src(ctx, instr->src[0]); + if (dst.regClass() == s1) { + src = as_uniform_bool(ctx, src); + bld.sop2(aco_opcode::s_mul_i32, Definition(dst), Operand(0x3f800000u), src); + } else if (dst.regClass() == v1) { + bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), Operand(0u), Operand(0x3f800000u), + as_divergent_bool(ctx, src, true)); + } else { + unreachable("Wrong destination register class for nir_op_b2f32."); + } + break; + } + case nir_op_b2f64: { + Temp src = get_alu_src(ctx, instr->src[0]); + if (dst.regClass() == s2) { + src = as_uniform_bool(ctx, src); + bld.sop2(aco_opcode::s_cselect_b64, Definition(dst), Operand(0x3f800000u), Operand(0u), bld.scc(src)); + } else if (dst.regClass() == v2) { + Temp one = bld.vop1(aco_opcode::v_mov_b32, bld.def(v2), Operand(0x3FF00000u)); + Temp upper = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0u), one, + 
as_divergent_bool(ctx, src, true)); + bld.pseudo(aco_opcode::p_create_vector, Definition(dst), Operand(0u), upper); + } else { + unreachable("Wrong destination register class for nir_op_b2f64."); + } + break; + } + case nir_op_i2i32: { + Temp src = get_alu_src(ctx, instr->src[0]); + if (instr->src[0].src.ssa->bit_size == 64) { + /* we can actually just say dst = src, as it would map the lower register */ + emit_extract_vector(ctx, src, 0, dst); + } else { + fprintf(stderr, "Unimplemented NIR instr bit size: "); + nir_print_instr(&instr->instr, stderr); + fprintf(stderr, "\n"); + } + break; + } + case nir_op_u2u32: { + Temp src = get_alu_src(ctx, instr->src[0]); + if (instr->src[0].src.ssa->bit_size == 16) { + if (dst.regClass() == s1) { + bld.sop2(aco_opcode::s_and_b32, Definition(dst), bld.def(s1, scc), Operand(0xFFFFu), src); + } else { + // TODO: do better with SDWA + bld.vop2(aco_opcode::v_and_b32, Definition(dst), Operand(0xFFFFu), src); + } + } else if (instr->src[0].src.ssa->bit_size == 64) { + /* we can actually just say dst = src, as it would map the lower register */ + emit_extract_vector(ctx, src, 0, dst); + } else { + fprintf(stderr, "Unimplemented NIR instr bit size: "); + nir_print_instr(&instr->instr, stderr); + fprintf(stderr, "\n"); + } + break; + } + case nir_op_i2i64: { + Temp src = get_alu_src(ctx, instr->src[0]); + if (instr->src[0].src.ssa->bit_size == 32) { + bld.pseudo(aco_opcode::p_create_vector, Definition(dst), src, Operand(0u)); + } else { + fprintf(stderr, "Unimplemented NIR instr bit size: "); + nir_print_instr(&instr->instr, stderr); + fprintf(stderr, "\n"); + } + break; + } + case nir_op_u2u64: { + Temp src = get_alu_src(ctx, instr->src[0]); + if (instr->src[0].src.ssa->bit_size == 32) { + bld.pseudo(aco_opcode::p_create_vector, Definition(dst), src, Operand(0u)); + } else { + fprintf(stderr, "Unimplemented NIR instr bit size: "); + nir_print_instr(&instr->instr, stderr); + fprintf(stderr, "\n"); + } + break; + } + case nir_op_b2i32: { + Temp src = get_alu_src(ctx, instr->src[0]); + if (dst.regClass() == s1) { + if (src.regClass() == s1) { + bld.copy(Definition(dst), src); + } else { + // TODO: in a post-RA optimization, we can check if src is in VCC, and directly use VCCNZ + assert(src.regClass() == s2); + bld.sopc(aco_opcode::s_cmp_lg_u64, bld.scc(Definition(dst)), Operand(0u), src); + } + } else { + assert(dst.regClass() == v1 && src.regClass() == s2); + bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), Operand(0u), Operand(1u), src); + } + break; + } + case nir_op_i2b1: { + Temp src = get_alu_src(ctx, instr->src[0]); + if (dst.regClass() == s2) { + assert(src.regClass() == v1 || src.regClass() == v2); + bld.vopc(src.size() == 2 ? 
aco_opcode::v_cmp_lg_u64 : aco_opcode::v_cmp_lg_u32, + Definition(dst), Operand(0u), src).def(0).setHint(vcc); + } else { + assert(src.regClass() == s1 && dst.regClass() == s1); + bld.sopc(aco_opcode::s_cmp_lg_u32, bld.scc(Definition(dst)), Operand(0u), src); + } + break; + } + case nir_op_pack_64_2x32_split: { + Temp src0 = get_alu_src(ctx, instr->src[0]); + Temp src1 = get_alu_src(ctx, instr->src[1]); + + bld.pseudo(aco_opcode::p_create_vector, Definition(dst), src0, src1); + break; + } + case nir_op_unpack_64_2x32_split_x: + bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(dst.regClass()), get_alu_src(ctx, instr->src[0])); + break; + case nir_op_unpack_64_2x32_split_y: + bld.pseudo(aco_opcode::p_split_vector, bld.def(dst.regClass()), Definition(dst), get_alu_src(ctx, instr->src[0])); + break; + case nir_op_pack_half_2x16: { + Temp src = get_alu_src(ctx, instr->src[0], 2); + + if (dst.regClass() == v1) { + Temp src0 = bld.tmp(v1); + Temp src1 = bld.tmp(v1); + bld.pseudo(aco_opcode::p_split_vector, Definition(src0), Definition(src1), src); + bld.vop3(aco_opcode::v_cvt_pkrtz_f16_f32, Definition(dst), src0, src1); + + } else { + fprintf(stderr, "Unimplemented NIR instr bit size: "); + nir_print_instr(&instr->instr, stderr); + fprintf(stderr, "\n"); + } + break; + } + case nir_op_unpack_half_2x16_split_x: { + if (dst.regClass() == v1) { + Builder bld(ctx->program, ctx->block); + bld.vop1(aco_opcode::v_cvt_f32_f16, Definition(dst), get_alu_src(ctx, instr->src[0])); + } else { + fprintf(stderr, "Unimplemented NIR instr bit size: "); + nir_print_instr(&instr->instr, stderr); + fprintf(stderr, "\n"); + } + break; + } + case nir_op_unpack_half_2x16_split_y: { + if (dst.regClass() == v1) { + Builder bld(ctx->program, ctx->block); + /* TODO: use SDWA here */ + bld.vop1(aco_opcode::v_cvt_f32_f16, Definition(dst), + bld.vop2(aco_opcode::v_lshrrev_b32, bld.def(v1), Operand(16u), as_vgpr(ctx, get_alu_src(ctx, instr->src[0])))); + } else { + fprintf(stderr, "Unimplemented NIR instr bit size: "); + nir_print_instr(&instr->instr, stderr); + fprintf(stderr, "\n"); + } + break; + } + case nir_op_fquantize2f16: { + Temp f16 = bld.vop1(aco_opcode::v_cvt_f16_f32, bld.def(v1), get_alu_src(ctx, instr->src[0])); + + Temp mask = bld.copy(bld.def(s1), Operand(0x36Fu)); /* value is NOT negative/positive denormal value */ + + Temp cmp_res = bld.tmp(s2); + bld.vopc_e64(aco_opcode::v_cmp_class_f16, Definition(cmp_res), f16, mask).def(0).setHint(vcc); + + Temp f32 = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), f16); + + bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), Operand(0u), f32, cmp_res); + break; + } + case nir_op_bfm: { + Temp bits = get_alu_src(ctx, instr->src[0]); + Temp offset = get_alu_src(ctx, instr->src[1]); + + if (dst.regClass() == s1) { + bld.sop2(aco_opcode::s_bfm_b32, Definition(dst), bits, offset); + } else if (dst.regClass() == v1) { + bld.vop3(aco_opcode::v_bfm_b32, Definition(dst), bits, offset); + } else { + fprintf(stderr, "Unimplemented NIR instr bit size: "); + nir_print_instr(&instr->instr, stderr); + fprintf(stderr, "\n"); + } + break; + } + case nir_op_bitfield_select: { + /* (mask & insert) | (~mask & base) */ + Temp bitmask = get_alu_src(ctx, instr->src[0]); + Temp insert = get_alu_src(ctx, instr->src[1]); + Temp base = get_alu_src(ctx, instr->src[2]); + + /* dst = (insert & bitmask) | (base & ~bitmask) */ + if (dst.regClass() == s1) { + aco_ptr<Instruction> sop2; + nir_const_value* const_bitmask = nir_src_as_const_value(instr->src[0].src); + nir_const_value* 
const_insert = nir_src_as_const_value(instr->src[1].src); + Operand lhs; + if (const_insert && const_bitmask) { + lhs = Operand(const_insert->u32 & const_bitmask->u32); + } else { + insert = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), insert, bitmask); + lhs = Operand(insert); + } + + Operand rhs; + nir_const_value* const_base = nir_src_as_const_value(instr->src[2].src); + if (const_base && const_bitmask) { + rhs = Operand(const_base->u32 & ~const_bitmask->u32); + } else { + base = bld.sop2(aco_opcode::s_andn2_b32, bld.def(s1), bld.def(s1, scc), base, bitmask); + rhs = Operand(base); + } + + bld.sop2(aco_opcode::s_or_b32, Definition(dst), bld.def(s1, scc), rhs, lhs); + + } else if (dst.regClass() == v1) { + if (base.type() == RegType::sgpr && (bitmask.type() == RegType::sgpr || (insert.type() == RegType::sgpr))) + base = as_vgpr(ctx, base); + if (insert.type() == RegType::sgpr && bitmask.type() == RegType::sgpr) + insert = as_vgpr(ctx, insert); + + bld.vop3(aco_opcode::v_bfi_b32, Definition(dst), bitmask, insert, base); + + } else { + fprintf(stderr, "Unimplemented NIR instr bit size: "); + nir_print_instr(&instr->instr, stderr); + fprintf(stderr, "\n"); + } + break; + } + case nir_op_ubfe: + case nir_op_ibfe: { + Temp base = get_alu_src(ctx, instr->src[0]); + Temp offset = get_alu_src(ctx, instr->src[1]); + Temp bits = get_alu_src(ctx, instr->src[2]); + + if (dst.type() == RegType::sgpr) { + Operand extract; + nir_const_value* const_offset = nir_src_as_const_value(instr->src[1].src); + nir_const_value* const_bits = nir_src_as_const_value(instr->src[2].src); + if (const_offset && const_bits) { + uint32_t const_extract = (const_bits->u32 << 16) | const_offset->u32; + extract = Operand(const_extract); + } else { + Operand width; + if (const_bits) { + width = Operand(const_bits->u32 << 16); + } else { + width = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), bits, Operand(16u)); + } + extract = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), offset, width); + } + + aco_opcode opcode; + if (dst.regClass() == s1) { + if (instr->op == nir_op_ubfe) + opcode = aco_opcode::s_bfe_u32; + else + opcode = aco_opcode::s_bfe_i32; + } else if (dst.regClass() == s2) { + if (instr->op == nir_op_ubfe) + opcode = aco_opcode::s_bfe_u64; + else + opcode = aco_opcode::s_bfe_i64; + } else { + unreachable("Unsupported BFE bit size"); + } + + bld.sop2(opcode, Definition(dst), bld.def(s1, scc), base, extract); + + } else { + aco_opcode opcode; + if (dst.regClass() == v1) { + if (instr->op == nir_op_ubfe) + opcode = aco_opcode::v_bfe_u32; + else + opcode = aco_opcode::v_bfe_i32; + } else { + unreachable("Unsupported BFE bit size"); + } + + emit_vop3a_instruction(ctx, instr, opcode, dst); + } + break; + } + case nir_op_bit_count: { + Temp src = get_alu_src(ctx, instr->src[0]); + if (src.regClass() == s1) { + bld.sop1(aco_opcode::s_bcnt1_i32_b32, Definition(dst), bld.def(s1, scc), src); + } else if (src.regClass() == v1) { + bld.vop3(aco_opcode::v_bcnt_u32_b32, Definition(dst), src, Operand(0u)); + } else if (src.regClass() == v2) { + bld.vop3(aco_opcode::v_bcnt_u32_b32, Definition(dst), + emit_extract_vector(ctx, src, 1, v1), + bld.vop3(aco_opcode::v_bcnt_u32_b32, bld.def(v1), + emit_extract_vector(ctx, src, 0, v1), Operand(0u))); + } else if (src.regClass() == s2) { + bld.sop1(aco_opcode::s_bcnt1_i32_b64, Definition(dst), bld.def(s1, scc), src); + } else { + fprintf(stderr, "Unimplemented NIR instr bit size: "); + nir_print_instr(&instr->instr, stderr); + 
fprintf(stderr, "\n"); + } + break; + } + case nir_op_flt: { + if (instr->src[0].src.ssa->bit_size == 32) + emit_comparison(ctx, instr, aco_opcode::v_cmp_lt_f32, dst); + else if (instr->src[0].src.ssa->bit_size == 64) + emit_comparison(ctx, instr, aco_opcode::v_cmp_lt_f64, dst); + break; + } + case nir_op_fge: { + if (instr->src[0].src.ssa->bit_size == 32) + emit_comparison(ctx, instr, aco_opcode::v_cmp_ge_f32, dst); + else if (instr->src[0].src.ssa->bit_size == 64) + emit_comparison(ctx, instr, aco_opcode::v_cmp_ge_f64, dst); + break; + } + case nir_op_feq: { + if (instr->src[0].src.ssa->bit_size == 32) + emit_comparison(ctx, instr, aco_opcode::v_cmp_eq_f32, dst); + else if (instr->src[0].src.ssa->bit_size == 64) + emit_comparison(ctx, instr, aco_opcode::v_cmp_eq_f64, dst); + break; + } + case nir_op_fne: { + if (instr->src[0].src.ssa->bit_size == 32) + emit_comparison(ctx, instr, aco_opcode::v_cmp_neq_f32, dst); + else if (instr->src[0].src.ssa->bit_size == 64) + emit_comparison(ctx, instr, aco_opcode::v_cmp_neq_f64, dst); + break; + } + case nir_op_ilt: { + if (dst.regClass() == s2 && instr->src[0].src.ssa->bit_size == 32) + emit_comparison(ctx, instr, aco_opcode::v_cmp_lt_i32, dst); + else if (dst.regClass() == s1 && instr->src[0].src.ssa->bit_size == 32) + emit_comparison(ctx, instr, aco_opcode::s_cmp_lt_i32, dst); + else if (dst.regClass() == s2 && instr->src[0].src.ssa->bit_size == 64) + emit_comparison(ctx, instr, aco_opcode::v_cmp_lt_i64, dst); + break; + } + case nir_op_ige: { + if (dst.regClass() == s2 && instr->src[0].src.ssa->bit_size == 32) + emit_comparison(ctx, instr, aco_opcode::v_cmp_ge_i32, dst); + else if (dst.regClass() == s1 && instr->src[0].src.ssa->bit_size == 32) + emit_comparison(ctx, instr, aco_opcode::s_cmp_ge_i32, dst); + else if (dst.regClass() == s2 && instr->src[0].src.ssa->bit_size == 64) + emit_comparison(ctx, instr, aco_opcode::v_cmp_ge_i64, dst); + break; + } + case nir_op_ieq: { + if (dst.regClass() == s2 && instr->src[0].src.ssa->bit_size == 32) { + emit_comparison(ctx, instr, aco_opcode::v_cmp_eq_i32, dst); + } else if (dst.regClass() == s1 && instr->src[0].src.ssa->bit_size == 32) { + emit_comparison(ctx, instr, aco_opcode::s_cmp_eq_i32, dst); + } else if (dst.regClass() == s2 && instr->src[0].src.ssa->bit_size == 64) { + emit_comparison(ctx, instr, aco_opcode::v_cmp_eq_i64, dst); + } else if (dst.regClass() == s1 && instr->src[0].src.ssa->bit_size == 64) { + emit_comparison(ctx, instr, aco_opcode::s_cmp_eq_u64, dst); + } else if (dst.regClass() == s1 && instr->src[0].src.ssa->bit_size == 1) { + Temp src0 = get_alu_src(ctx, instr->src[0]); + Temp src1 = get_alu_src(ctx, instr->src[1]); + bld.sopc(aco_opcode::s_cmp_eq_i32, bld.scc(Definition(dst)), + as_uniform_bool(ctx, src0), as_uniform_bool(ctx, src1)); + } else if (dst.regClass() == s2 && instr->src[0].src.ssa->bit_size == 1) { + Temp src0 = get_alu_src(ctx, instr->src[0]); + Temp src1 = get_alu_src(ctx, instr->src[1]); + bld.sop2(aco_opcode::s_xnor_b64, Definition(dst), bld.def(s1, scc), + as_divergent_bool(ctx, src0, false), as_divergent_bool(ctx, src1, false)); + } else { + fprintf(stderr, "Unimplemented NIR instr bit size: "); + nir_print_instr(&instr->instr, stderr); + fprintf(stderr, "\n"); + } + break; + } + case nir_op_ine: { + if (dst.regClass() == s2 && instr->src[0].src.ssa->bit_size == 32) { + emit_comparison(ctx, instr, aco_opcode::v_cmp_lg_i32, dst); + } else if (dst.regClass() == s2 && instr->src[0].src.ssa->bit_size == 64) { + emit_comparison(ctx, instr, aco_opcode::v_cmp_lg_i64, 
dst); + } else if (dst.regClass() == s1 && instr->src[0].src.ssa->bit_size == 32) { + emit_comparison(ctx, instr, aco_opcode::s_cmp_lg_i32, dst); + } else if (dst.regClass() == s1 && instr->src[0].src.ssa->bit_size == 64) { + emit_comparison(ctx, instr, aco_opcode::s_cmp_lg_u64, dst); + } else if (dst.regClass() == s1 && instr->src[0].src.ssa->bit_size == 1) { + Temp src0 = get_alu_src(ctx, instr->src[0]); + Temp src1 = get_alu_src(ctx, instr->src[1]); + bld.sopc(aco_opcode::s_cmp_lg_i32, bld.scc(Definition(dst)), + as_uniform_bool(ctx, src0), as_uniform_bool(ctx, src1)); + } else if (dst.regClass() == s2 && instr->src[0].src.ssa->bit_size == 1) { + Temp src0 = get_alu_src(ctx, instr->src[0]); + Temp src1 = get_alu_src(ctx, instr->src[1]); + bld.sop2(aco_opcode::s_xor_b64, Definition(dst), bld.def(s1, scc), + as_divergent_bool(ctx, src0, false), as_divergent_bool(ctx, src1, false)); + } else { + fprintf(stderr, "Unimplemented NIR instr bit size: "); + nir_print_instr(&instr->instr, stderr); + fprintf(stderr, "\n"); + } + break; + } + case nir_op_ult: { + if (dst.regClass() == s2 && instr->src[0].src.ssa->bit_size == 32) + emit_comparison(ctx, instr, aco_opcode::v_cmp_lt_u32, dst); + else if (dst.regClass() == s1 && instr->src[0].src.ssa->bit_size == 32) + emit_comparison(ctx, instr, aco_opcode::s_cmp_lt_u32, dst); + else if (dst.regClass() == s2 && instr->src[0].src.ssa->bit_size == 64) + emit_comparison(ctx, instr, aco_opcode::v_cmp_lt_u64, dst); + break; + } + case nir_op_uge: { + if (dst.regClass() == s2 && instr->src[0].src.ssa->bit_size == 32) + emit_comparison(ctx, instr, aco_opcode::v_cmp_ge_u32, dst); + else if (dst.regClass() == s1 && instr->src[0].src.ssa->bit_size == 32) + emit_comparison(ctx, instr, aco_opcode::s_cmp_ge_u32, dst); + else if (dst.regClass() == s2 && instr->src[0].src.ssa->bit_size == 64) + emit_comparison(ctx, instr, aco_opcode::v_cmp_ge_u64, dst); + break; + } + case nir_op_fddx: + case nir_op_fddy: + case nir_op_fddx_fine: + case nir_op_fddy_fine: + case nir_op_fddx_coarse: + case nir_op_fddy_coarse: { + Definition tl = bld.def(v1); + uint16_t dpp_ctrl; + if (instr->op == nir_op_fddx_fine) { + bld.vop1_dpp(aco_opcode::v_mov_b32, tl, get_alu_src(ctx, instr->src[0]), dpp_quad_perm(0, 0, 2, 2)); + dpp_ctrl = dpp_quad_perm(1, 1, 3, 3); + } else if (instr->op == nir_op_fddy_fine) { + bld.vop1_dpp(aco_opcode::v_mov_b32, tl, get_alu_src(ctx, instr->src[0]), dpp_quad_perm(0, 1, 0, 1)); + dpp_ctrl = dpp_quad_perm(2, 3, 2, 3); + } else { + bld.vop1_dpp(aco_opcode::v_mov_b32, tl, get_alu_src(ctx, instr->src[0]), dpp_quad_perm(0, 0, 0, 0)); + if (instr->op == nir_op_fddx || instr->op == nir_op_fddx_coarse) + dpp_ctrl = dpp_quad_perm(1, 1, 1, 1); + else + dpp_ctrl = dpp_quad_perm(2, 2, 2, 2); + } + + Definition tmp = bld.def(v1); + bld.vop2_dpp(aco_opcode::v_sub_f32, tmp, get_alu_src(ctx, instr->src[0]), tl.getTemp(), dpp_ctrl); + emit_wqm(ctx, tmp.getTemp(), dst, true); + break; + } + default: + fprintf(stderr, "Unknown NIR ALU instr: "); + nir_print_instr(&instr->instr, stderr); + fprintf(stderr, "\n"); + } +} + +void visit_load_const(isel_context *ctx, nir_load_const_instr *instr) +{ + Temp dst = get_ssa_temp(ctx, &instr->def); + + // TODO: we really want to have the resulting type as this would allow for 64bit literals + // which get truncated the lsb if double and msb if int + // for now, we only use s_mov_b64 with 64bit inline constants + assert(instr->def.num_components == 1 && "Vector load_const should be lowered to scalar."); + assert(dst.type() == RegType::sgpr); 
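+ /* A single-dword constant becomes a plain SGPR copy; anything wider is assembled with +
+ * p_create_vector from 32-bit chunks (dwords of the 64-bit literal, or the individual +
+ * 32-bit component values otherwise). */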
+ + if (dst.size() == 1) + { + Builder(ctx->program, ctx->block).copy(Definition(dst), Operand(instr->value[0].u32)); + } else { + assert(dst.size() != 1); + aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, dst.size(), 1)}; + if (instr->def.bit_size == 64) + for (unsigned i = 0; i < dst.size(); i++) + vec->operands[i] = Operand{(uint32_t)(instr->value[0].u64 >> i * 32)}; + else { + for (unsigned i = 0; i < dst.size(); i++) + vec->operands[i] = Operand{instr->value[i].u32}; + } + vec->definitions[0] = Definition(dst); + ctx->block->instructions.emplace_back(std::move(vec)); + } +} + +uint32_t widen_mask(uint32_t mask, unsigned multiplier) +{ + uint32_t new_mask = 0; + for(unsigned i = 0; i < 32 && (1u << i) <= mask; ++i) + if (mask & (1u << i)) + new_mask |= ((1u << multiplier) - 1u) << (i * multiplier); + return new_mask; +} + +void visit_store_vs_output(isel_context *ctx, nir_intrinsic_instr *instr) +{ + /* This wouldn't work inside control flow or with indirect offsets but + * that doesn't happen because of nir_lower_io_to_temporaries(). */ + + unsigned write_mask = nir_intrinsic_write_mask(instr); + unsigned component = nir_intrinsic_component(instr); + Temp src = get_ssa_temp(ctx, instr->src[0].ssa); + unsigned idx = nir_intrinsic_base(instr) + component; + + nir_instr *off_instr = instr->src[1].ssa->parent_instr; + if (off_instr->type != nir_instr_type_load_const) { + fprintf(stderr, "Unimplemented nir_intrinsic_load_input offset\n"); + nir_print_instr(off_instr, stderr); + fprintf(stderr, "\n"); + } + idx += nir_instr_as_load_const(off_instr)->value[0].u32 * 4u; + + if (instr->src[0].ssa->bit_size == 64) + write_mask = widen_mask(write_mask, 2); + + for (unsigned i = 0; i < 8; ++i) { + if (write_mask & (1 << i)) { + ctx->vs_output.mask[idx / 4u] |= 1 << (idx % 4u); + ctx->vs_output.outputs[idx / 4u][idx % 4u] = emit_extract_vector(ctx, src, i, v1); + } + idx++; + } +} + +void visit_store_fs_output(isel_context *ctx, nir_intrinsic_instr *instr) +{ + unsigned write_mask = nir_intrinsic_write_mask(instr); + Operand values[4]; + Temp src = get_ssa_temp(ctx, instr->src[0].ssa); + for (unsigned i = 0; i < 4; ++i) { + if (write_mask & (1 << i)) { + Temp tmp = emit_extract_vector(ctx, src, i, v1); + values[i] = Operand(tmp); + } else { + values[i] = Operand(v1); + } + } + + unsigned index = nir_intrinsic_base(instr) / 4; + unsigned target, col_format; + unsigned enabled_channels = 0xF; + aco_opcode compr_op = (aco_opcode)0; + + nir_const_value* offset = nir_src_as_const_value(instr->src[1]); + assert(offset && "Non-const offsets on exports not yet supported"); + index += offset->u32; + + assert(index != FRAG_RESULT_COLOR); + + /* Unlike vertex shader exports, it's fine to use multiple exports to + * export separate channels of one target. So shaders which export both + * FRAG_RESULT_SAMPLE_MASK and FRAG_RESULT_DEPTH should work fine. 
+ * TODO: combine the exports in those cases and create better code + */ + + if (index == FRAG_RESULT_SAMPLE_MASK) { + + if (ctx->program->info->ps.writes_z) { + target = V_008DFC_SQ_EXP_MRTZ; + enabled_channels = 0x4; + col_format = (unsigned) -1; + + values[2] = values[0]; + values[0] = Operand(v1); + } else { + aco_ptr<Export_instruction> exp{create_instruction<Export_instruction>(aco_opcode::exp, Format::EXP, 4, 0)}; + exp->valid_mask = false; + exp->done = false; + exp->compressed = true; + exp->dest = V_008DFC_SQ_EXP_MRTZ; + exp->enabled_mask = 0xc; + for (int i = 0; i < 4; i++) + exp->operands[i] = Operand(v1); + exp->operands[1] = Operand(values[0]); + ctx->block->instructions.emplace_back(std::move(exp)); + return; + } + + } else if (index == FRAG_RESULT_DEPTH) { + + target = V_008DFC_SQ_EXP_MRTZ; + enabled_channels = 0x1; + col_format = (unsigned) -1; + + } else if (index == FRAG_RESULT_STENCIL) { + + if (ctx->program->info->ps.writes_z) { + target = V_008DFC_SQ_EXP_MRTZ; + enabled_channels = 0x2; + col_format = (unsigned) -1; + + values[1] = values[0]; + values[0] = Operand(v1); + } else { + aco_ptr<Instruction> shift{create_instruction<VOP2_instruction>(aco_opcode::v_lshlrev_b32, Format::VOP2, 2, 1)}; + shift->operands[0] = Operand((uint32_t) 16); + shift->operands[1] = values[0]; + Temp tmp = {ctx->program->allocateId(), v1}; + shift->definitions[0] = Definition(tmp); + ctx->block->instructions.emplace_back(std::move(shift)); + + aco_ptr<Export_instruction> exp{create_instruction<Export_instruction>(aco_opcode::exp, Format::EXP, 4, 0)}; + exp->valid_mask = false; + exp->done = false; + exp->compressed = true; + exp->dest = V_008DFC_SQ_EXP_MRTZ; + exp->enabled_mask = 0x3; + exp->operands[0] = Operand(tmp); + for (int i = 1; i < 4; i++) + exp->operands[i] = Operand(v1); + ctx->block->instructions.emplace_back(std::move(exp)); + return; + } + + } else { + index -= FRAG_RESULT_DATA0; + target = V_008DFC_SQ_EXP_MRT + index; + col_format = (ctx->options->key.fs.col_format >> (4 * index)) & 0xf; + } + ASSERTED bool is_int8 = (ctx->options->key.fs.is_int8 >> index) & 1; + ASSERTED bool is_int10 = (ctx->options->key.fs.is_int10 >> index) & 1; + assert(!is_int8 && !is_int10); + + switch (col_format) + { + case V_028714_SPI_SHADER_ZERO: + enabled_channels = 0; /* writemask */ + target = V_008DFC_SQ_EXP_NULL; + break; + + case V_028714_SPI_SHADER_32_R: + enabled_channels = 1; + break; + + case V_028714_SPI_SHADER_32_GR: + enabled_channels = 0x3; + break; + + case V_028714_SPI_SHADER_32_AR: + enabled_channels = 0x9; + break; + + case V_028714_SPI_SHADER_FP16_ABGR: + enabled_channels = 0x5; + compr_op = aco_opcode::v_cvt_pkrtz_f16_f32; + break; + + case V_028714_SPI_SHADER_UNORM16_ABGR: + enabled_channels = 0x5; + compr_op = aco_opcode::v_cvt_pknorm_u16_f32; + break; + + case V_028714_SPI_SHADER_SNORM16_ABGR: + enabled_channels = 0x5; + compr_op = aco_opcode::v_cvt_pknorm_i16_f32; + break; + + case V_028714_SPI_SHADER_UINT16_ABGR: + enabled_channels = 0x5; + compr_op = aco_opcode::v_cvt_pk_u16_u32; + break; + + case V_028714_SPI_SHADER_SINT16_ABGR: + enabled_channels = 0x5; + compr_op = aco_opcode::v_cvt_pk_i16_i32; + break; + + case V_028714_SPI_SHADER_32_ABGR: + enabled_channels = 0xF; + break; + + default: + break; + } + + if (target == V_008DFC_SQ_EXP_NULL) + return; + + if ((bool)compr_op) + { + for (int i = 0; i < 2; i++) + { + /* check if at least one of the values to be compressed is enabled */ + unsigned enabled = (write_mask >> (i*2) | write_mask >> (i*2+1)) & 0x1; + if (enabled) 
{ + enabled_channels |= enabled << (i*2); + aco_ptr<VOP3A_instruction> compr{create_instruction<VOP3A_instruction>(compr_op, Format::VOP3A, 2, 1)}; + Temp tmp{ctx->program->allocateId(), v1}; + compr->operands[0] = values[i*2].isUndefined() ? Operand(0u) : values[i*2]; + compr->operands[1] = values[i*2+1].isUndefined() ? Operand(0u): values[i*2+1]; + compr->definitions[0] = Definition(tmp); + values[i] = Operand(tmp); + ctx->block->instructions.emplace_back(std::move(compr)); + } else { + values[i] = Operand(v1); + } + } + } + + aco_ptr<Export_instruction> exp{create_instruction<Export_instruction>(aco_opcode::exp, Format::EXP, 4, 0)}; + exp->valid_mask = false; + exp->done = false; + exp->compressed = (bool) compr_op; + exp->dest = target; + exp->enabled_mask = enabled_channels; + if ((bool) compr_op) { + for (int i = 0; i < 2; i++) + exp->operands[i] = enabled_channels & (3 << (i * 2)) ? values[i] : Operand(v1); + exp->operands[2] = Operand(v1); + exp->operands[3] = Operand(v1); + } else { + for (int i = 0; i < 4; i++) + exp->operands[i] = enabled_channels & (1 << i) ? values[i] : Operand(v1); + } + + ctx->block->instructions.emplace_back(std::move(exp)); +} + +void visit_store_output(isel_context *ctx, nir_intrinsic_instr *instr) +{ + if (ctx->stage == vertex_vs) { + visit_store_vs_output(ctx, instr); + } else if (ctx->stage == fragment_fs) { + visit_store_fs_output(ctx, instr); + } else { + unreachable("Shader stage not implemented"); + } +} + +void emit_interp_instr(isel_context *ctx, unsigned idx, unsigned component, Temp src, Temp dst, Temp prim_mask) +{ + Temp coord1 = emit_extract_vector(ctx, src, 0, v1); + Temp coord2 = emit_extract_vector(ctx, src, 1, v1); + + Builder bld(ctx->program, ctx->block); + Temp tmp = bld.vintrp(aco_opcode::v_interp_p1_f32, bld.def(v1), coord1, bld.m0(prim_mask), idx, component); + bld.vintrp(aco_opcode::v_interp_p2_f32, Definition(dst), coord2, bld.m0(prim_mask), tmp, idx, component); +} + +void emit_load_frag_coord(isel_context *ctx, Temp dst, unsigned num_components) +{ + aco_ptr<Pseudo_instruction> vec(create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, num_components, 1)); + for (unsigned i = 0; i < num_components; i++) + vec->operands[i] = Operand(ctx->fs_inputs[fs_input::frag_pos_0 + i]); + + if (ctx->fs_vgpr_args[fs_input::frag_pos_3]) { + assert(num_components == 4); + Builder bld(ctx->program, ctx->block); + vec->operands[3] = bld.vop1(aco_opcode::v_rcp_f32, bld.def(v1), ctx->fs_inputs[fs_input::frag_pos_3]); + } + + for (Operand& op : vec->operands) + op = op.isUndefined() ? 
Operand(0u) : op; + + vec->definitions[0] = Definition(dst); + ctx->block->instructions.emplace_back(std::move(vec)); + emit_split_vector(ctx, dst, num_components); + return; +} + +void visit_load_interpolated_input(isel_context *ctx, nir_intrinsic_instr *instr) +{ + Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); + Temp coords = get_ssa_temp(ctx, instr->src[0].ssa); + unsigned idx = nir_intrinsic_base(instr); + unsigned component = nir_intrinsic_component(instr); + Temp prim_mask = ctx->prim_mask; + + nir_const_value* offset = nir_src_as_const_value(instr->src[1]); + if (offset) { + assert(offset->u32 == 0); + } else { + /* the lower 15bit of the prim_mask contain the offset into LDS + * while the upper bits contain the number of prims */ + Temp offset_src = get_ssa_temp(ctx, instr->src[1].ssa); + assert(offset_src.regClass() == s1 && "TODO: divergent offsets..."); + Builder bld(ctx->program, ctx->block); + Temp stride = bld.sop2(aco_opcode::s_lshr_b32, bld.def(s1), bld.def(s1, scc), prim_mask, Operand(16u)); + stride = bld.sop1(aco_opcode::s_bcnt1_i32_b32, bld.def(s1), bld.def(s1, scc), stride); + stride = bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), stride, Operand(48u)); + offset_src = bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), stride, offset_src); + prim_mask = bld.sop2(aco_opcode::s_add_i32, bld.def(s1, m0), bld.def(s1, scc), offset_src, prim_mask); + } + + if (instr->dest.ssa.num_components == 1) { + emit_interp_instr(ctx, idx, component, coords, dst, prim_mask); + } else { + aco_ptr<Pseudo_instruction> vec(create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, instr->dest.ssa.num_components, 1)); + for (unsigned i = 0; i < instr->dest.ssa.num_components; i++) + { + Temp tmp = {ctx->program->allocateId(), v1}; + emit_interp_instr(ctx, idx, component+i, coords, tmp, prim_mask); + vec->operands[i] = Operand(tmp); + } + vec->definitions[0] = Definition(dst); + ctx->block->instructions.emplace_back(std::move(vec)); + } +} + +unsigned get_num_channels_from_data_format(unsigned data_format) +{ + switch (data_format) { + case V_008F0C_BUF_DATA_FORMAT_8: + case V_008F0C_BUF_DATA_FORMAT_16: + case V_008F0C_BUF_DATA_FORMAT_32: + return 1; + case V_008F0C_BUF_DATA_FORMAT_8_8: + case V_008F0C_BUF_DATA_FORMAT_16_16: + case V_008F0C_BUF_DATA_FORMAT_32_32: + return 2; + case V_008F0C_BUF_DATA_FORMAT_10_11_11: + case V_008F0C_BUF_DATA_FORMAT_11_11_10: + case V_008F0C_BUF_DATA_FORMAT_32_32_32: + return 3; + case V_008F0C_BUF_DATA_FORMAT_8_8_8_8: + case V_008F0C_BUF_DATA_FORMAT_10_10_10_2: + case V_008F0C_BUF_DATA_FORMAT_2_10_10_10: + case V_008F0C_BUF_DATA_FORMAT_16_16_16_16: + case V_008F0C_BUF_DATA_FORMAT_32_32_32_32: + return 4; + default: + break; + } + + return 4; +} + +/* For 2_10_10_10 formats the alpha is handled as unsigned by pre-vega HW. + * so we may need to fix it up. */ +Temp adjust_vertex_fetch_alpha(isel_context *ctx, unsigned adjustment, Temp alpha) +{ + Builder bld(ctx->program, ctx->block); + + if (adjustment == RADV_ALPHA_ADJUST_SSCALED) + alpha = bld.vop1(aco_opcode::v_cvt_u32_f32, bld.def(v1), alpha); + + /* For the integer-like cases, do a natural sign extension. + * + * For the SNORM case, the values are 0.0, 0.333, 0.666, 1.0 + * and happen to contain 0, 1, 2, 3 as the two LSBs of the + * exponent. + */ + alpha = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(adjustment == RADV_ALPHA_ADJUST_SNORM ? 7u : 30u), alpha); + alpha = bld.vop2(aco_opcode::v_ashrrev_i32, bld.def(v1), Operand(30u), alpha); + + /* Convert back to the right type. 
*/ + if (adjustment == RADV_ALPHA_ADJUST_SNORM) { + alpha = bld.vop1(aco_opcode::v_cvt_f32_i32, bld.def(v1), alpha); + Temp clamp = bld.vopc(aco_opcode::v_cmp_le_f32, bld.hint_vcc(bld.def(s2)), Operand(0xbf800000u), alpha); + alpha = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0xbf800000u), alpha, clamp); + } else if (adjustment == RADV_ALPHA_ADJUST_SSCALED) { + alpha = bld.vop1(aco_opcode::v_cvt_f32_i32, bld.def(v1), alpha); + } + + return alpha; +} + +void visit_load_input(isel_context *ctx, nir_intrinsic_instr *instr) +{ + Builder bld(ctx->program, ctx->block); + Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); + if (ctx->stage & sw_vs) { + + nir_instr *off_instr = instr->src[0].ssa->parent_instr; + if (off_instr->type != nir_instr_type_load_const) { + fprintf(stderr, "Unimplemented nir_intrinsic_load_input offset\n"); + nir_print_instr(off_instr, stderr); + fprintf(stderr, "\n"); + } + uint32_t offset = nir_instr_as_load_const(off_instr)->value[0].u32; + + Temp vertex_buffers = convert_pointer_to_64_bit(ctx, ctx->vertex_buffers); + + unsigned location = nir_intrinsic_base(instr) / 4 - VERT_ATTRIB_GENERIC0 + offset; + unsigned component = nir_intrinsic_component(instr); + unsigned attrib_binding = ctx->options->key.vs.vertex_attribute_bindings[location]; + uint32_t attrib_offset = ctx->options->key.vs.vertex_attribute_offsets[location]; + uint32_t attrib_stride = ctx->options->key.vs.vertex_attribute_strides[location]; + unsigned attrib_format = ctx->options->key.vs.vertex_attribute_formats[location]; + + unsigned dfmt = attrib_format & 0xf; + + unsigned nfmt = (attrib_format >> 4) & 0x7; + unsigned num_dfmt_channels = get_num_channels_from_data_format(dfmt); + unsigned mask = nir_ssa_def_components_read(&instr->dest.ssa) << component; + unsigned num_channels = MIN2(util_last_bit(mask), num_dfmt_channels); + unsigned alpha_adjust = (ctx->options->key.vs.alpha_adjust >> (location * 2)) & 3; + bool post_shuffle = ctx->options->key.vs.post_shuffle & (1 << location); + if (post_shuffle) + num_channels = MAX2(num_channels, 3); + + Temp list = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), vertex_buffers, Operand(attrib_binding * 16u)); + + Temp index; + if (ctx->options->key.vs.instance_rate_inputs & (1u << location)) { + uint32_t divisor = ctx->options->key.vs.instance_rate_divisors[location]; + if (divisor) { + ctx->needs_instance_id = true; + + if (divisor != 1) { + Temp divided = bld.tmp(v1); + emit_v_div_u32(ctx, divided, as_vgpr(ctx, ctx->instance_id), divisor); + index = bld.vadd32(bld.def(v1), ctx->start_instance, divided); + } else { + index = bld.vadd32(bld.def(v1), ctx->start_instance, ctx->instance_id); + } + } else { + index = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), ctx->start_instance); + } + } else { + index = bld.vadd32(bld.def(v1), ctx->base_vertex, ctx->vertex_id); + } + + if (attrib_stride != 0 && attrib_offset > attrib_stride) { + index = bld.vadd32(bld.def(v1), Operand(attrib_offset / attrib_stride), index); + attrib_offset = attrib_offset % attrib_stride; + } + + Operand soffset(0u); + if (attrib_offset >= 4096) { + soffset = bld.copy(bld.def(s1), Operand(attrib_offset)); + attrib_offset = 0; + } + + aco_opcode opcode; + switch (num_channels) { + case 1: + opcode = aco_opcode::tbuffer_load_format_x; + break; + case 2: + opcode = aco_opcode::tbuffer_load_format_xy; + break; + case 3: + opcode = aco_opcode::tbuffer_load_format_xyz; + break; + case 4: + opcode = aco_opcode::tbuffer_load_format_xyzw; + break; + default: + unreachable("Unimplemented 
load_input vector size"); + } + + Temp tmp = post_shuffle || num_channels != dst.size() || alpha_adjust != RADV_ALPHA_ADJUST_NONE || component ? bld.tmp(RegType::vgpr, num_channels) : dst; + + aco_ptr<MTBUF_instruction> mubuf{create_instruction<MTBUF_instruction>(opcode, Format::MTBUF, 3, 1)}; + mubuf->operands[0] = Operand(index); + mubuf->operands[1] = Operand(list); + mubuf->operands[2] = soffset; + mubuf->definitions[0] = Definition(tmp); + mubuf->idxen = true; + mubuf->can_reorder = true; + mubuf->dfmt = dfmt; + mubuf->nfmt = nfmt; + assert(attrib_offset < 4096); + mubuf->offset = attrib_offset; + ctx->block->instructions.emplace_back(std::move(mubuf)); + + emit_split_vector(ctx, tmp, tmp.size()); + + if (tmp.id() != dst.id()) { + bool is_float = nfmt != V_008F0C_BUF_NUM_FORMAT_UINT && + nfmt != V_008F0C_BUF_NUM_FORMAT_SINT; + + static const unsigned swizzle_normal[4] = {0, 1, 2, 3}; + static const unsigned swizzle_post_shuffle[4] = {2, 1, 0, 3}; + const unsigned *swizzle = post_shuffle ? swizzle_post_shuffle : swizzle_normal; + + aco_ptr<Instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, dst.size(), 1)}; + for (unsigned i = 0; i < dst.size(); i++) { + unsigned idx = i + component; + if (idx == 3 && alpha_adjust != RADV_ALPHA_ADJUST_NONE && num_channels >= 4) { + Temp alpha = emit_extract_vector(ctx, tmp, swizzle[3], v1); + vec->operands[3] = Operand(adjust_vertex_fetch_alpha(ctx, alpha_adjust, alpha)); + } else if (idx < num_channels) { + vec->operands[i] = Operand(emit_extract_vector(ctx, tmp, swizzle[idx], v1)); + } else if (is_float && idx == 3) { + vec->operands[i] = Operand(0x3f800000u); + } else if (!is_float && idx == 3) { + vec->operands[i] = Operand(1u); + } else { + vec->operands[i] = Operand(0u); + } + } + vec->definitions[0] = Definition(dst); + ctx->block->instructions.emplace_back(std::move(vec)); + emit_split_vector(ctx, dst, dst.size()); + } + + } else if (ctx->stage == fragment_fs) { + nir_instr *off_instr = instr->src[0].ssa->parent_instr; + if (off_instr->type != nir_instr_type_load_const || + nir_instr_as_load_const(off_instr)->value[0].u32 != 0) { + fprintf(stderr, "Unimplemented nir_intrinsic_load_input offset\n"); + nir_print_instr(off_instr, stderr); + fprintf(stderr, "\n"); + } + + Temp prim_mask = ctx->prim_mask; + nir_const_value* offset = nir_src_as_const_value(instr->src[0]); + if (offset) { + assert(offset->u32 == 0); + } else { + /* the lower 15bit of the prim_mask contain the offset into LDS + * while the upper bits contain the number of prims */ + Temp offset_src = get_ssa_temp(ctx, instr->src[0].ssa); + assert(offset_src.regClass() == s1 && "TODO: divergent offsets..."); + Builder bld(ctx->program, ctx->block); + Temp stride = bld.sop2(aco_opcode::s_lshr_b32, bld.def(s1), bld.def(s1, scc), prim_mask, Operand(16u)); + stride = bld.sop1(aco_opcode::s_bcnt1_i32_b32, bld.def(s1), bld.def(s1, scc), stride); + stride = bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), stride, Operand(48u)); + offset_src = bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), stride, offset_src); + prim_mask = bld.sop2(aco_opcode::s_add_i32, bld.def(s1, m0), bld.def(s1, scc), offset_src, prim_mask); + } + + unsigned idx = nir_intrinsic_base(instr); + unsigned component = nir_intrinsic_component(instr); + + if (dst.size() == 1) { + bld.vintrp(aco_opcode::v_interp_mov_f32, Definition(dst), Operand(2u), bld.m0(prim_mask), idx, component); + } else { + aco_ptr<Pseudo_instruction> 
vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, dst.size(), 1)}; + for (unsigned i = 0; i < dst.size(); i++) + vec->operands[i] = bld.vintrp(aco_opcode::v_interp_mov_f32, bld.def(v1), Operand(2u), bld.m0(prim_mask), idx, component + i); + vec->definitions[0] = Definition(dst); + bld.insert(std::move(vec)); + } + + } else { + unreachable("Shader stage not implemented"); + } +} + +Temp load_desc_ptr(isel_context *ctx, unsigned desc_set) +{ + if (ctx->program->info->need_indirect_descriptor_sets) { + Builder bld(ctx->program, ctx->block); + Temp ptr64 = convert_pointer_to_64_bit(ctx, ctx->descriptor_sets[0]); + return bld.smem(aco_opcode::s_load_dword, bld.def(s1), ptr64, Operand(desc_set << 2));//, false, false, false); + } + + return ctx->descriptor_sets[desc_set]; +} + + +void visit_load_resource(isel_context *ctx, nir_intrinsic_instr *instr) +{ + Builder bld(ctx->program, ctx->block); + Temp index = bld.as_uniform(get_ssa_temp(ctx, instr->src[0].ssa)); + unsigned desc_set = nir_intrinsic_desc_set(instr); + unsigned binding = nir_intrinsic_binding(instr); + + Temp desc_ptr; + radv_pipeline_layout *pipeline_layout = ctx->options->layout; + radv_descriptor_set_layout *layout = pipeline_layout->set[desc_set].layout; + unsigned offset = layout->binding[binding].offset; + unsigned stride; + if (layout->binding[binding].type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC || + layout->binding[binding].type == VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC) { + unsigned idx = pipeline_layout->set[desc_set].dynamic_offset_start + layout->binding[binding].dynamic_offset_offset; + desc_ptr = ctx->push_constants; + offset = pipeline_layout->push_constant_size + 16 * idx; + stride = 16; + } else { + desc_ptr = load_desc_ptr(ctx, desc_set); + stride = layout->binding[binding].size; + } + + nir_const_value* nir_const_index = nir_src_as_const_value(instr->src[0]); + unsigned const_index = nir_const_index ? nir_const_index->u32 : 0; + if (stride != 1) { + if (nir_const_index) { + const_index = const_index * stride; + } else { + index = bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), Operand(stride), Operand(index)); + } + } + if (offset) { + if (nir_const_index) { + const_index = const_index + offset; + } else { + index = bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc), Operand(offset), Operand(index)); + } + } + + if (nir_const_index && const_index == 0) { + index = desc_ptr; + } else { + index = bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc), + nir_const_index ? Operand(const_index) : Operand(index), + Operand(desc_ptr)); + } + + Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); + bld.sop1(aco_opcode::s_mov_b32, Definition(dst), index); +} + +void load_buffer(isel_context *ctx, unsigned num_components, Temp dst, Temp rsrc, Temp offset, bool glc=false) +{ + Builder bld(ctx->program, ctx->block); + + unsigned num_bytes = dst.size() * 4; + + aco_opcode op; + if (dst.type() == RegType::vgpr || (glc && ctx->options->chip_class < GFX8)) { + if (ctx->options->chip_class < GFX8) + offset = as_vgpr(ctx, offset); + + Operand vaddr = offset.type() == RegType::vgpr ? Operand(offset) : Operand(v1); + Operand soffset = offset.type() == RegType::sgpr ? 
Operand(offset) : Operand((uint32_t) 0); + unsigned const_offset = 0; + + Temp lower = Temp(); + if (num_bytes > 16) { + assert(num_components == 3 || num_components == 4); + op = aco_opcode::buffer_load_dwordx4; + lower = bld.tmp(v4); + aco_ptr<MUBUF_instruction> mubuf{create_instruction<MUBUF_instruction>(op, Format::MUBUF, 3, 1)}; + mubuf->definitions[0] = Definition(lower); + mubuf->operands[0] = vaddr; + mubuf->operands[1] = Operand(rsrc); + mubuf->operands[2] = soffset; + mubuf->offen = (offset.type() == RegType::vgpr); + mubuf->glc = glc; + mubuf->barrier = barrier_buffer; + bld.insert(std::move(mubuf)); + emit_split_vector(ctx, lower, 2); + num_bytes -= 16; + const_offset = 16; + } + + switch (num_bytes) { + case 4: + op = aco_opcode::buffer_load_dword; + break; + case 8: + op = aco_opcode::buffer_load_dwordx2; + break; + case 12: + op = aco_opcode::buffer_load_dwordx3; + break; + case 16: + op = aco_opcode::buffer_load_dwordx4; + break; + default: + unreachable("Load SSBO not implemented for this size."); + } + aco_ptr<MUBUF_instruction> mubuf{create_instruction<MUBUF_instruction>(op, Format::MUBUF, 3, 1)}; + mubuf->operands[0] = vaddr; + mubuf->operands[1] = Operand(rsrc); + mubuf->operands[2] = soffset; + mubuf->offen = (offset.type() == RegType::vgpr); + mubuf->glc = glc; + mubuf->barrier = barrier_buffer; + mubuf->offset = const_offset; + aco_ptr<Instruction> instr = std::move(mubuf); + + if (dst.size() > 4) { + assert(lower != Temp()); + Temp upper = bld.tmp(RegType::vgpr, dst.size() - lower.size()); + instr->definitions[0] = Definition(upper); + bld.insert(std::move(instr)); + if (dst.size() == 8) + emit_split_vector(ctx, upper, 2); + instr.reset(create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, dst.size() / 2, 1)); + instr->operands[0] = Operand(emit_extract_vector(ctx, lower, 0, v2)); + instr->operands[1] = Operand(emit_extract_vector(ctx, lower, 1, v2)); + instr->operands[2] = Operand(emit_extract_vector(ctx, upper, 0, v2)); + if (dst.size() == 8) + instr->operands[3] = Operand(emit_extract_vector(ctx, upper, 1, v2)); + } + + if (dst.type() == RegType::sgpr) { + Temp vec = bld.tmp(RegType::vgpr, dst.size()); + instr->definitions[0] = Definition(vec); + bld.insert(std::move(instr)); + bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), vec); + } else { + instr->definitions[0] = Definition(dst); + bld.insert(std::move(instr)); + } + } else { + switch (num_bytes) { + case 4: + op = aco_opcode::s_buffer_load_dword; + break; + case 8: + op = aco_opcode::s_buffer_load_dwordx2; + break; + case 12: + case 16: + op = aco_opcode::s_buffer_load_dwordx4; + break; + case 24: + case 32: + op = aco_opcode::s_buffer_load_dwordx8; + break; + default: + unreachable("Load SSBO not implemented for this size."); + } + aco_ptr<SMEM_instruction> load{create_instruction<SMEM_instruction>(op, Format::SMEM, 2, 1)}; + load->operands[0] = Operand(rsrc); + load->operands[1] = Operand(bld.as_uniform(offset)); + assert(load->operands[1].getTemp().type() == RegType::sgpr); + load->definitions[0] = Definition(dst); + load->glc = glc; + load->barrier = barrier_buffer; + assert(ctx->options->chip_class >= GFX8 || !glc); + + /* trim vector */ + if (dst.size() == 3) { + Temp vec = bld.tmp(s4); + load->definitions[0] = Definition(vec); + bld.insert(std::move(load)); + emit_split_vector(ctx, vec, 4); + + bld.pseudo(aco_opcode::p_create_vector, Definition(dst), + emit_extract_vector(ctx, vec, 0, s1), + emit_extract_vector(ctx, vec, 1, s1), + emit_extract_vector(ctx, vec, 2, 
s1)); + } else if (dst.size() == 6) { + Temp vec = bld.tmp(s8); + load->definitions[0] = Definition(vec); + bld.insert(std::move(load)); + emit_split_vector(ctx, vec, 4); + + bld.pseudo(aco_opcode::p_create_vector, Definition(dst), + emit_extract_vector(ctx, vec, 0, s2), + emit_extract_vector(ctx, vec, 1, s2), + emit_extract_vector(ctx, vec, 2, s2)); + } else { + bld.insert(std::move(load)); + } + + } + emit_split_vector(ctx, dst, num_components); +} + +void visit_load_ubo(isel_context *ctx, nir_intrinsic_instr *instr) +{ + Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); + Temp rsrc = get_ssa_temp(ctx, instr->src[0].ssa); + + Builder bld(ctx->program, ctx->block); + + nir_intrinsic_instr* idx_instr = nir_instr_as_intrinsic(instr->src[0].ssa->parent_instr); + unsigned desc_set = nir_intrinsic_desc_set(idx_instr); + unsigned binding = nir_intrinsic_binding(idx_instr); + radv_descriptor_set_layout *layout = ctx->options->layout->set[desc_set].layout; + + if (layout->binding[binding].type == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK_EXT) { + uint32_t desc_type = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | + S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) | + S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | + S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) | + S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) | + S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32); + Temp upper_dwords = bld.pseudo(aco_opcode::p_create_vector, bld.def(s3), + Operand(S_008F04_BASE_ADDRESS_HI(ctx->options->address32_hi)), + Operand(0xFFFFFFFFu), + Operand(desc_type)); + rsrc = bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), + rsrc, upper_dwords); + } else { + rsrc = convert_pointer_to_64_bit(ctx, rsrc); + rsrc = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), rsrc, Operand(0u)); + } + + load_buffer(ctx, instr->num_components, dst, rsrc, get_ssa_temp(ctx, instr->src[1].ssa)); +} + +void visit_load_push_constant(isel_context *ctx, nir_intrinsic_instr *instr) +{ + Builder bld(ctx->program, ctx->block); + Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); + + unsigned offset = nir_intrinsic_base(instr); + nir_const_value *index_cv = nir_src_as_const_value(instr->src[0]); + if (index_cv && instr->dest.ssa.bit_size == 32) { + + unsigned count = instr->dest.ssa.num_components; + unsigned start = (offset + index_cv->u32) / 4u; + start -= ctx->base_inline_push_consts; + if (start + count <= ctx->num_inline_push_consts) { + std::array<Temp,NIR_MAX_VEC_COMPONENTS> elems; + aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, count, 1)}; + for (unsigned i = 0; i < count; ++i) { + elems[i] = ctx->inline_push_consts[start + i]; + vec->operands[i] = Operand{elems[i]}; + } + vec->definitions[0] = Definition(dst); + ctx->block->instructions.emplace_back(std::move(vec)); + ctx->allocated_vec.emplace(dst.id(), elems); + return; + } + } + + Temp index = bld.as_uniform(get_ssa_temp(ctx, instr->src[0].ssa)); + if (offset != 0) // TODO check if index != 0 as well + index = bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc), Operand(offset), index); + Temp ptr = convert_pointer_to_64_bit(ctx, ctx->push_constants); + Temp vec = dst; + bool trim = false; + aco_opcode op; + + switch (dst.size()) { + case 1: + op = aco_opcode::s_load_dword; + break; + case 2: + op = aco_opcode::s_load_dwordx2; + break; + case 3: + vec = bld.tmp(s4); + trim = true; + case 4: + op = aco_opcode::s_load_dwordx4; + break; + case 6: + vec = bld.tmp(s8); + trim = true; + case 8: + op = aco_opcode::s_load_dwordx8; + break; + 
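/* The 3- and 6-dword cases above deliberately fall through to the next larger load +
 * (s_load_dwordx4/s_load_dwordx8); the oversized result is trimmed back into dst +
 * below when trim is set. */ +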
default: + unreachable("unimplemented or forbidden load_push_constant."); + } + + bld.smem(op, Definition(vec), ptr, index); + + if (trim) { + emit_split_vector(ctx, vec, 4); + RegClass rc = dst.size() == 3 ? s1 : s2; + bld.pseudo(aco_opcode::p_create_vector, Definition(dst), + emit_extract_vector(ctx, vec, 0, rc), + emit_extract_vector(ctx, vec, 1, rc), + emit_extract_vector(ctx, vec, 2, rc)); + + } + emit_split_vector(ctx, dst, instr->dest.ssa.num_components); +} + +void visit_load_constant(isel_context *ctx, nir_intrinsic_instr *instr) +{ + Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); + + Builder bld(ctx->program, ctx->block); + + uint32_t desc_type = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | + S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) | + S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | + S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W); + if (ctx->options->chip_class >= GFX10) { + desc_type |= S_008F0C_FORMAT(V_008F0C_IMG_FORMAT_32_FLOAT) | + S_008F0C_OOB_SELECT(3) | + S_008F0C_RESOURCE_LEVEL(1); + } else { + desc_type |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) | + S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32); + } + + unsigned base = nir_intrinsic_base(instr) + ctx->constant_data_offset; + unsigned range = nir_intrinsic_range(instr); + + Temp offset = get_ssa_temp(ctx, instr->src[0].ssa); + if (base && offset.type() == RegType::sgpr) + offset = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), offset, Operand(base)); + else if (base && offset.type() == RegType::vgpr) + offset = bld.vadd32(bld.def(v1), Operand(base), offset); + + Temp rsrc = bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), + bld.sop1(aco_opcode::p_constaddr, bld.def(s2), bld.def(s1, scc), Operand(0u)), + Operand(MIN2(range, ctx->shader->constant_data_size - nir_intrinsic_base(instr))), + Operand(desc_type)); + + load_buffer(ctx, instr->num_components, dst, rsrc, offset); +} + +void visit_discard_if(isel_context *ctx, nir_intrinsic_instr *instr) +{ + if (ctx->cf_info.loop_nest_depth || ctx->cf_info.parent_if.is_divergent) + ctx->cf_info.exec_potentially_empty = true; + + ctx->program->needs_exact = true; + + Builder bld(ctx->program, ctx->block); + Temp src = as_divergent_bool(ctx, get_ssa_temp(ctx, instr->src[0].ssa), false); + src = bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.def(s1, scc), src, Operand(exec, s2)); + bld.pseudo(aco_opcode::p_discard_if, src); + ctx->block->kind |= block_kind_uses_discard_if; + return; +} + +void visit_discard(isel_context* ctx, nir_intrinsic_instr *instr) +{ + Builder bld(ctx->program, ctx->block); + + if (ctx->cf_info.loop_nest_depth || ctx->cf_info.parent_if.is_divergent) + ctx->cf_info.exec_potentially_empty = true; + + bool divergent = ctx->cf_info.parent_if.is_divergent || + ctx->cf_info.parent_loop.has_divergent_continue; + + if (ctx->block->loop_nest_depth && + ((nir_instr_is_last(&instr->instr) && !divergent) || divergent)) { + /* we handle discards the same way as jump instructions */ + append_logical_end(ctx->block); + + /* in loops, discard behaves like break */ + Block *linear_target = ctx->cf_info.parent_loop.exit; + ctx->block->kind |= block_kind_discard; + + if (!divergent) { + /* uniform discard - loop ends here */ + assert(nir_instr_is_last(&instr->instr)); + ctx->block->kind |= block_kind_uniform; + ctx->cf_info.has_branch = true; + bld.branch(aco_opcode::p_branch); + add_linear_edge(ctx->block->index, linear_target); + return; + } + + /* we add a break right behind the discard() instructions */ + ctx->block->kind |= block_kind_break; + unsigned idx = 
ctx->block->index; + + /* remove critical edges from linear CFG */ + bld.branch(aco_opcode::p_branch); + Block* break_block = ctx->program->create_and_insert_block(); + break_block->loop_nest_depth = ctx->cf_info.loop_nest_depth; + break_block->kind |= block_kind_uniform; + add_linear_edge(idx, break_block); + add_linear_edge(break_block->index, linear_target); + bld.reset(break_block); + bld.branch(aco_opcode::p_branch); + + Block* continue_block = ctx->program->create_and_insert_block(); + continue_block->loop_nest_depth = ctx->cf_info.loop_nest_depth; + add_linear_edge(idx, continue_block); + append_logical_start(continue_block); + ctx->block = continue_block; + + return; + } + + /* it can currently happen that NIR doesn't remove the unreachable code */ + if (!nir_instr_is_last(&instr->instr)) { + ctx->program->needs_exact = true; + /* save exec somewhere temporarily so that it doesn't get + * overwritten before the discard from outer exec masks */ + Temp cond = bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.def(s1, scc), Operand(0xFFFFFFFF), Operand(exec, s2)); + bld.pseudo(aco_opcode::p_discard_if, cond); + ctx->block->kind |= block_kind_uses_discard_if; + return; + } + + /* This condition is incorrect for uniformly branched discards in a loop + * predicated by a divergent condition, but the above code catches that case + * and the discard would end up turning into a discard_if. + * For example: + * if (divergent) { + * while (...) { + * if (uniform) { + * discard; + * } + * } + * } + */ + if (!ctx->cf_info.parent_if.is_divergent) { + /* program just ends here */ + ctx->block->kind |= block_kind_uniform; + bld.exp(aco_opcode::exp, Operand(v1), Operand(v1), Operand(v1), Operand(v1), + 0 /* enabled mask */, 9 /* dest */, + false /* compressed */, true/* done */, true /* valid mask */); + bld.sopp(aco_opcode::s_endpgm); + // TODO: it will potentially be followed by a branch which is dead code to sanitize NIR phis + } else { + ctx->block->kind |= block_kind_discard; + /* branch and linear edge is added by visit_if() */ + } +} + +enum aco_descriptor_type { + ACO_DESC_IMAGE, + ACO_DESC_FMASK, + ACO_DESC_SAMPLER, + ACO_DESC_BUFFER, + ACO_DESC_PLANE_0, + ACO_DESC_PLANE_1, + ACO_DESC_PLANE_2, +}; + +enum aco_image_dim { + aco_image_1d, + aco_image_2d, + aco_image_3d, + aco_image_cube, // includes cube arrays + aco_image_1darray, + aco_image_2darray, + aco_image_2dmsaa, + aco_image_2darraymsaa, +}; + +static enum aco_image_dim +get_sampler_dim(isel_context *ctx, enum glsl_sampler_dim dim, bool is_array) +{ + switch (dim) { + case GLSL_SAMPLER_DIM_1D: + if (ctx->options->chip_class >= GFX9) + return is_array ? aco_image_2darray : aco_image_2d; + return is_array ? aco_image_1darray : aco_image_1d; + case GLSL_SAMPLER_DIM_2D: + case GLSL_SAMPLER_DIM_RECT: + case GLSL_SAMPLER_DIM_EXTERNAL: + return is_array ? aco_image_2darray : aco_image_2d; + case GLSL_SAMPLER_DIM_3D: + return aco_image_3d; + case GLSL_SAMPLER_DIM_CUBE: + return aco_image_cube; + case GLSL_SAMPLER_DIM_MS: + return is_array ? 
aco_image_2darraymsaa : aco_image_2dmsaa; + case GLSL_SAMPLER_DIM_SUBPASS: + return aco_image_2darray; + case GLSL_SAMPLER_DIM_SUBPASS_MS: + return aco_image_2darraymsaa; + default: + unreachable("bad sampler dim"); + } +} + +static bool +should_declare_array(isel_context *ctx, enum glsl_sampler_dim sampler_dim, bool is_array) { + if (sampler_dim == GLSL_SAMPLER_DIM_BUF) + return false; + aco_image_dim dim = get_sampler_dim(ctx, sampler_dim, is_array); + return dim == aco_image_cube || + dim == aco_image_1darray || + dim == aco_image_2darray || + dim == aco_image_2darraymsaa; +} + +Temp get_sampler_desc(isel_context *ctx, nir_deref_instr *deref_instr, + enum aco_descriptor_type desc_type, + const nir_tex_instr *tex_instr, bool image, bool write) +{ +/* FIXME: we should lower the deref with some new nir_intrinsic_load_desc + std::unordered_map<uint64_t, Temp>::iterator it = ctx->tex_desc.find((uint64_t) desc_type << 32 | deref_instr->dest.ssa.index); + if (it != ctx->tex_desc.end()) + return it->second; +*/ + Temp index = Temp(); + bool index_set = false; + unsigned constant_index = 0; + unsigned descriptor_set; + unsigned base_index; + Builder bld(ctx->program, ctx->block); + + if (!deref_instr) { + assert(tex_instr && !image); + descriptor_set = 0; + base_index = tex_instr->sampler_index; + } else { + while(deref_instr->deref_type != nir_deref_type_var) { + unsigned array_size = glsl_get_aoa_size(deref_instr->type); + if (!array_size) + array_size = 1; + + assert(deref_instr->deref_type == nir_deref_type_array); + nir_const_value *const_value = nir_src_as_const_value(deref_instr->arr.index); + if (const_value) { + constant_index += array_size * const_value->u32; + } else { + Temp indirect = bld.as_uniform(get_ssa_temp(ctx, deref_instr->arr.index.ssa)); + + if (array_size != 1) + indirect = bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), Operand(array_size), indirect); + + if (!index_set) { + index = indirect; + index_set = true; + } else { + index = bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc), index, indirect); + } + } + + deref_instr = nir_src_as_deref(deref_instr->parent); + } + descriptor_set = deref_instr->var->data.descriptor_set; + base_index = deref_instr->var->data.binding; + } + + Temp list = load_desc_ptr(ctx, descriptor_set); + list = convert_pointer_to_64_bit(ctx, list); + + struct radv_descriptor_set_layout *layout = ctx->options->layout->set[descriptor_set].layout; + struct radv_descriptor_set_binding_layout *binding = layout->binding + base_index; + unsigned offset = binding->offset; + unsigned stride = binding->size; + aco_opcode opcode; + RegClass type; + + assert(base_index < layout->binding_count); + + switch (desc_type) { + case ACO_DESC_IMAGE: + type = s8; + opcode = aco_opcode::s_load_dwordx8; + break; + case ACO_DESC_FMASK: + type = s8; + opcode = aco_opcode::s_load_dwordx8; + offset += 32; + break; + case ACO_DESC_SAMPLER: + type = s4; + opcode = aco_opcode::s_load_dwordx4; + if (binding->type == VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER) + offset += radv_combined_image_descriptor_sampler_offset(binding); + break; + case ACO_DESC_BUFFER: + type = s4; + opcode = aco_opcode::s_load_dwordx4; + break; + case ACO_DESC_PLANE_0: + case ACO_DESC_PLANE_1: + type = s8; + opcode = aco_opcode::s_load_dwordx8; + offset += 32 * (desc_type - ACO_DESC_PLANE_0); + break; + case ACO_DESC_PLANE_2: + type = s4; + opcode = aco_opcode::s_load_dwordx4; + offset += 64; + break; + default: + unreachable("invalid desc_type\n"); + } + + offset += constant_index * stride; + + 
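+   /* At this point, offset holds the constant part of the descriptor's byte
+    * offset within its set: the binding's base offset, plus 32/64 bytes for
+    * FMASK/plane descriptors, plus constant_index * stride for constant array
+    * indexing. A dynamic array index is folded into the SMEM offset below. */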
if (desc_type == ACO_DESC_SAMPLER && binding->immutable_samplers_offset && + (!index_set || binding->immutable_samplers_equal)) { + if (binding->immutable_samplers_equal) + constant_index = 0; + + const uint32_t *samplers = radv_immutable_samplers(layout, binding); + return bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), + Operand(samplers[constant_index * 4 + 0]), + Operand(samplers[constant_index * 4 + 1]), + Operand(samplers[constant_index * 4 + 2]), + Operand(samplers[constant_index * 4 + 3])); + } + + Operand off; + if (!index_set) { + off = Operand(offset); + } else { + off = Operand((Temp)bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc), Operand(offset), + bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), Operand(stride), index))); + } + + Temp res = bld.smem(opcode, bld.def(type), list, off); + + if (desc_type == ACO_DESC_PLANE_2) { + Temp components[8]; + for (unsigned i = 0; i < 8; i++) + components[i] = bld.tmp(s1); + bld.pseudo(aco_opcode::p_split_vector, + Definition(components[0]), + Definition(components[1]), + Definition(components[2]), + Definition(components[3]), + res); + + Temp desc2 = get_sampler_desc(ctx, deref_instr, ACO_DESC_PLANE_1, tex_instr, image, write); + bld.pseudo(aco_opcode::p_split_vector, + bld.def(s1), bld.def(s1), bld.def(s1), bld.def(s1), + Definition(components[4]), + Definition(components[5]), + Definition(components[6]), + Definition(components[7]), + desc2); + + res = bld.pseudo(aco_opcode::p_create_vector, bld.def(s8), + components[0], components[1], components[2], components[3], + components[4], components[5], components[6], components[7]); + } + + return res; +} + +static int image_type_to_components_count(enum glsl_sampler_dim dim, bool array) +{ + switch (dim) { + case GLSL_SAMPLER_DIM_BUF: + return 1; + case GLSL_SAMPLER_DIM_1D: + return array ? 2 : 1; + case GLSL_SAMPLER_DIM_2D: + return array ? 3 : 2; + case GLSL_SAMPLER_DIM_MS: + return array ? 4 : 3; + case GLSL_SAMPLER_DIM_3D: + case GLSL_SAMPLER_DIM_CUBE: + return 3; + case GLSL_SAMPLER_DIM_RECT: + case GLSL_SAMPLER_DIM_SUBPASS: + return 2; + case GLSL_SAMPLER_DIM_SUBPASS_MS: + return 3; + default: + break; + } + return 0; +} + + +/* Adjust the sample index according to FMASK. + * + * For uncompressed MSAA surfaces, FMASK should return 0x76543210, + * which is the identity mapping. Each nibble says which physical sample + * should be fetched to get that sample. + * + * For example, 0x11111100 means there are only 2 samples stored and + * the second sample covers 3/4 of the pixel. When reading samples 0 + * and 1, return physical sample 0 (determined by the first two 0s + * in FMASK), otherwise return physical sample 1. 
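+ *
+ * For example, with the FMASK value 0x11111100 above, reading sample 3
+ * selects nibble 3: (0x11111100 >> 12) & 0xF = 1, so physical sample 1 is
+ * fetched, while sample 0 yields (0x11111100 >> 0) & 0xF = 0.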
+ * + * The sample index should be adjusted as follows: + * sample_index = (fmask >> (sample_index * 4)) & 0xF; + */ +static Temp adjust_sample_index_using_fmask(isel_context *ctx, bool da, Temp coords, Operand sample_index, Temp fmask_desc_ptr) +{ + Builder bld(ctx->program, ctx->block); + Temp fmask = bld.tmp(v1); + + aco_ptr<MIMG_instruction> load{create_instruction<MIMG_instruction>(aco_opcode::image_load, Format::MIMG, 2, 1)}; + load->operands[0] = Operand(coords); + load->operands[1] = Operand(fmask_desc_ptr); + load->definitions[0] = Definition(fmask); + load->glc = false; + load->dmask = 0x1; + load->unrm = true; + load->da = da; + load->can_reorder = true; /* fmask images shouldn't be modified */ + ctx->block->instructions.emplace_back(std::move(load)); + + Operand sample_index4; + if (sample_index.isConstant() && sample_index.constantValue() < 16) { + sample_index4 = Operand(sample_index.constantValue() << 2); + } else if (sample_index.regClass() == s1) { + sample_index4 = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), sample_index, Operand(2u)); + } else { + assert(sample_index.regClass() == v1); + sample_index4 = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(2u), sample_index); + } + + Temp final_sample; + if (sample_index4.isConstant() && sample_index4.constantValue() == 0) + final_sample = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(15u), fmask); + else if (sample_index4.isConstant() && sample_index4.constantValue() == 28) + final_sample = bld.vop2(aco_opcode::v_lshrrev_b32, bld.def(v1), Operand(28u), fmask); + else + final_sample = bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1), fmask, sample_index4, Operand(4u)); + + /* Don't rewrite the sample index if WORD1.DATA_FORMAT of the FMASK + * resource descriptor is 0 (invalid), + */ + Temp compare = bld.tmp(s2); + bld.vopc_e64(aco_opcode::v_cmp_lg_u32, Definition(compare), + Operand(0u), emit_extract_vector(ctx, fmask_desc_ptr, 1, s1)).def(0).setHint(vcc); + + Temp sample_index_v = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), sample_index); + + /* Replace the MSAA sample index. */ + return bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), sample_index_v, final_sample, compare); +} + +static Temp get_image_coords(isel_context *ctx, const nir_intrinsic_instr *instr, const struct glsl_type *type) +{ + + Temp src0 = get_ssa_temp(ctx, instr->src[1].ssa); + enum glsl_sampler_dim dim = glsl_get_sampler_dim(type); + bool is_array = glsl_sampler_type_is_array(type); + ASSERTED bool add_frag_pos = (dim == GLSL_SAMPLER_DIM_SUBPASS || dim == GLSL_SAMPLER_DIM_SUBPASS_MS); + assert(!add_frag_pos && "Input attachments should be lowered."); + bool is_ms = (dim == GLSL_SAMPLER_DIM_MS || dim == GLSL_SAMPLER_DIM_SUBPASS_MS); + bool gfx9_1d = ctx->options->chip_class >= GFX9 && dim == GLSL_SAMPLER_DIM_1D; + int count = image_type_to_components_count(dim, is_array); + std::vector<Operand> coords(count); + + if (is_ms) { + Operand sample_index; + nir_const_value *sample_cv = nir_src_as_const_value(instr->src[2]); + if (sample_cv) + sample_index = Operand(sample_cv->u32); + else + sample_index = Operand(emit_extract_vector(ctx, get_ssa_temp(ctx, instr->src[2].ssa), 0, v1)); + + if (instr->intrinsic == nir_intrinsic_image_deref_load) { + aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, is_array ? 
3 : 2, 1)}; + for (unsigned i = 0; i < vec->operands.size(); i++) + vec->operands[i] = Operand(emit_extract_vector(ctx, src0, i, v1)); + Temp fmask_load_address = {ctx->program->allocateId(), is_array ? v3 : v2}; + vec->definitions[0] = Definition(fmask_load_address); + ctx->block->instructions.emplace_back(std::move(vec)); + + Temp fmask_desc_ptr = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), ACO_DESC_FMASK, nullptr, false, false); + sample_index = Operand(adjust_sample_index_using_fmask(ctx, is_array, fmask_load_address, sample_index, fmask_desc_ptr)); + } + count--; + coords[count] = sample_index; + } + + if (count == 1 && !gfx9_1d) + return emit_extract_vector(ctx, src0, 0, v1); + + if (gfx9_1d) { + coords[0] = Operand(emit_extract_vector(ctx, src0, 0, v1)); + coords.resize(coords.size() + 1); + coords[1] = Operand((uint32_t) 0); + if (is_array) + coords[2] = Operand(emit_extract_vector(ctx, src0, 1, v1)); + } else { + for (int i = 0; i < count; i++) + coords[i] = Operand(emit_extract_vector(ctx, src0, i, v1)); + } + + aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, coords.size(), 1)}; + for (unsigned i = 0; i < coords.size(); i++) + vec->operands[i] = coords[i]; + Temp res = {ctx->program->allocateId(), RegClass(RegType::vgpr, coords.size())}; + vec->definitions[0] = Definition(res); + ctx->block->instructions.emplace_back(std::move(vec)); + return res; +} + + +void visit_image_load(isel_context *ctx, nir_intrinsic_instr *instr) +{ + Builder bld(ctx->program, ctx->block); + const nir_variable *var = nir_deref_instr_get_variable(nir_instr_as_deref(instr->src[0].ssa->parent_instr)); + const struct glsl_type *type = glsl_without_array(var->type); + const enum glsl_sampler_dim dim = glsl_get_sampler_dim(type); + Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); + + if (dim == GLSL_SAMPLER_DIM_BUF) { + unsigned mask = nir_ssa_def_components_read(&instr->dest.ssa); + unsigned num_channels = util_last_bit(mask); + Temp rsrc = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), ACO_DESC_BUFFER, nullptr, true, true); + Temp vindex = emit_extract_vector(ctx, get_ssa_temp(ctx, instr->src[1].ssa), 0, v1); + + aco_opcode opcode; + switch (num_channels) { + case 1: + opcode = aco_opcode::buffer_load_format_x; + break; + case 2: + opcode = aco_opcode::buffer_load_format_xy; + break; + case 3: + opcode = aco_opcode::buffer_load_format_xyz; + break; + case 4: + opcode = aco_opcode::buffer_load_format_xyzw; + break; + default: + unreachable(">4 channel buffer image load"); + } + aco_ptr<MUBUF_instruction> load{create_instruction<MUBUF_instruction>(opcode, Format::MUBUF, 3, 1)}; + load->operands[0] = Operand(vindex); + load->operands[1] = Operand(rsrc); + load->operands[2] = Operand((uint32_t) 0); + Temp tmp; + if (num_channels == instr->dest.ssa.num_components && dst.type() == RegType::vgpr) + tmp = dst; + else + tmp = {ctx->program->allocateId(), RegClass(RegType::vgpr, num_channels)}; + load->definitions[0] = Definition(tmp); + load->idxen = true; + load->barrier = barrier_image; + ctx->block->instructions.emplace_back(std::move(load)); + + expand_vector(ctx, tmp, dst, instr->dest.ssa.num_components, (1 << num_channels) - 1); + return; + } + + Temp coords = get_image_coords(ctx, instr, type); + Temp resource = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), ACO_DESC_IMAGE, nullptr, true, true); + //aco_image_dim img_dim = get_image_dim(ctx, 
glsl_get_sampler_dim(type), glsl_sampler_type_is_array(type)); + + unsigned dmask = nir_ssa_def_components_read(&instr->dest.ssa); + unsigned num_components = util_bitcount(dmask); + Temp tmp; + if (num_components == instr->dest.ssa.num_components && dst.type() == RegType::vgpr) + tmp = dst; + else + tmp = {ctx->program->allocateId(), RegClass(RegType::vgpr, num_components)}; + + aco_ptr<MIMG_instruction> load{create_instruction<MIMG_instruction>(aco_opcode::image_load, Format::MIMG, 2, 1)}; + load->operands[0] = Operand(coords); + load->operands[1] = Operand(resource); + load->definitions[0] = Definition(tmp); + load->glc = var->data.image.access & (ACCESS_VOLATILE | ACCESS_COHERENT) ? 1 : 0; + load->dmask = dmask; + load->unrm = true; + load->da = should_declare_array(ctx, dim, glsl_sampler_type_is_array(type)); + load->barrier = barrier_image; + ctx->block->instructions.emplace_back(std::move(load)); + + expand_vector(ctx, tmp, dst, instr->dest.ssa.num_components, dmask); + return; +} + +void visit_image_store(isel_context *ctx, nir_intrinsic_instr *instr) +{ + const nir_variable *var = nir_deref_instr_get_variable(nir_instr_as_deref(instr->src[0].ssa->parent_instr)); + const struct glsl_type *type = glsl_without_array(var->type); + const enum glsl_sampler_dim dim = glsl_get_sampler_dim(type); + Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[3].ssa)); + + bool glc = ctx->options->chip_class == GFX6 || var->data.image.access & (ACCESS_VOLATILE | ACCESS_COHERENT | ACCESS_NON_READABLE) ? 1 : 0; + + if (dim == GLSL_SAMPLER_DIM_BUF) { + Temp rsrc = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), ACO_DESC_BUFFER, nullptr, true, true); + Temp vindex = emit_extract_vector(ctx, get_ssa_temp(ctx, instr->src[1].ssa), 0, v1); + aco_opcode opcode; + switch (data.size()) { + case 1: + opcode = aco_opcode::buffer_store_format_x; + break; + case 2: + opcode = aco_opcode::buffer_store_format_xy; + break; + case 3: + opcode = aco_opcode::buffer_store_format_xyz; + break; + case 4: + opcode = aco_opcode::buffer_store_format_xyzw; + break; + default: + unreachable(">4 channel buffer image store"); + } + aco_ptr<MUBUF_instruction> store{create_instruction<MUBUF_instruction>(opcode, Format::MUBUF, 4, 0)}; + store->operands[0] = Operand(vindex); + store->operands[1] = Operand(rsrc); + store->operands[2] = Operand((uint32_t) 0); + store->operands[3] = Operand(data); + store->idxen = true; + store->glc = glc; + store->disable_wqm = true; + store->barrier = barrier_image; + ctx->program->needs_exact = true; + ctx->block->instructions.emplace_back(std::move(store)); + return; + } + + assert(data.type() == RegType::vgpr); + Temp coords = get_image_coords(ctx, instr, type); + Temp resource = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), ACO_DESC_IMAGE, nullptr, true, true); + + aco_ptr<MIMG_instruction> store{create_instruction<MIMG_instruction>(aco_opcode::image_store, Format::MIMG, 4, 0)}; + store->operands[0] = Operand(coords); + store->operands[1] = Operand(resource); + store->operands[2] = Operand(s4); + store->operands[3] = Operand(data); + store->glc = glc; + store->dmask = (1 << data.size()) - 1; + store->unrm = true; + store->da = should_declare_array(ctx, dim, glsl_sampler_type_is_array(type)); + store->disable_wqm = true; + store->barrier = barrier_image; + ctx->program->needs_exact = true; + ctx->block->instructions.emplace_back(std::move(store)); + return; +} + +void visit_image_atomic(isel_context *ctx, nir_intrinsic_instr *instr) +{ + /* 
return the previous value if dest is ever used */ + bool return_previous = false; + nir_foreach_use_safe(use_src, &instr->dest.ssa) { + return_previous = true; + break; + } + nir_foreach_if_use_safe(use_src, &instr->dest.ssa) { + return_previous = true; + break; + } + + const nir_variable *var = nir_deref_instr_get_variable(nir_instr_as_deref(instr->src[0].ssa->parent_instr)); + const struct glsl_type *type = glsl_without_array(var->type); + const enum glsl_sampler_dim dim = glsl_get_sampler_dim(type); + Builder bld(ctx->program, ctx->block); + + Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[3].ssa)); + assert(data.size() == 1 && "64bit ssbo atomics not yet implemented."); + + if (instr->intrinsic == nir_intrinsic_image_deref_atomic_comp_swap) + data = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), get_ssa_temp(ctx, instr->src[4].ssa), data); + + aco_opcode buf_op, image_op; + switch (instr->intrinsic) { + case nir_intrinsic_image_deref_atomic_add: + buf_op = aco_opcode::buffer_atomic_add; + image_op = aco_opcode::image_atomic_add; + break; + case nir_intrinsic_image_deref_atomic_umin: + buf_op = aco_opcode::buffer_atomic_umin; + image_op = aco_opcode::image_atomic_umin; + break; + case nir_intrinsic_image_deref_atomic_imin: + buf_op = aco_opcode::buffer_atomic_smin; + image_op = aco_opcode::image_atomic_smin; + break; + case nir_intrinsic_image_deref_atomic_umax: + buf_op = aco_opcode::buffer_atomic_umax; + image_op = aco_opcode::image_atomic_umax; + break; + case nir_intrinsic_image_deref_atomic_imax: + buf_op = aco_opcode::buffer_atomic_smax; + image_op = aco_opcode::image_atomic_smax; + break; + case nir_intrinsic_image_deref_atomic_and: + buf_op = aco_opcode::buffer_atomic_and; + image_op = aco_opcode::image_atomic_and; + break; + case nir_intrinsic_image_deref_atomic_or: + buf_op = aco_opcode::buffer_atomic_or; + image_op = aco_opcode::image_atomic_or; + break; + case nir_intrinsic_image_deref_atomic_xor: + buf_op = aco_opcode::buffer_atomic_xor; + image_op = aco_opcode::image_atomic_xor; + break; + case nir_intrinsic_image_deref_atomic_exchange: + buf_op = aco_opcode::buffer_atomic_swap; + image_op = aco_opcode::image_atomic_swap; + break; + case nir_intrinsic_image_deref_atomic_comp_swap: + buf_op = aco_opcode::buffer_atomic_cmpswap; + image_op = aco_opcode::image_atomic_cmpswap; + break; + default: + unreachable("visit_image_atomic should only be called with nir_intrinsic_image_deref_atomic_* instructions."); + } + + Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); + + if (dim == GLSL_SAMPLER_DIM_BUF) { + Temp vindex = emit_extract_vector(ctx, get_ssa_temp(ctx, instr->src[1].ssa), 0, v1); + Temp resource = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), ACO_DESC_BUFFER, nullptr, true, true); + //assert(ctx->options->chip_class < GFX9 && "GFX9 stride size workaround not yet implemented."); + aco_ptr<MUBUF_instruction> mubuf{create_instruction<MUBUF_instruction>(buf_op, Format::MUBUF, 4, return_previous ? 
1 : 0)}; + mubuf->operands[0] = Operand(vindex); + mubuf->operands[1] = Operand(resource); + mubuf->operands[2] = Operand((uint32_t)0); + mubuf->operands[3] = Operand(data); + if (return_previous) + mubuf->definitions[0] = Definition(dst); + mubuf->offset = 0; + mubuf->idxen = true; + mubuf->glc = return_previous; + mubuf->disable_wqm = true; + mubuf->barrier = barrier_image; + ctx->program->needs_exact = true; + ctx->block->instructions.emplace_back(std::move(mubuf)); + return; + } + + Temp coords = get_image_coords(ctx, instr, type); + Temp resource = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), ACO_DESC_IMAGE, nullptr, true, true); + aco_ptr<MIMG_instruction> mimg{create_instruction<MIMG_instruction>(image_op, Format::MIMG, 4, return_previous ? 1 : 0)}; + mimg->operands[0] = Operand(coords); + mimg->operands[1] = Operand(resource); + mimg->operands[2] = Operand(s4); /* no sampler */ + mimg->operands[3] = Operand(data); + if (return_previous) + mimg->definitions[0] = Definition(dst); + mimg->glc = return_previous; + mimg->dmask = (1 << data.size()) - 1; + mimg->unrm = true; + mimg->da = should_declare_array(ctx, dim, glsl_sampler_type_is_array(type)); + mimg->disable_wqm = true; + mimg->barrier = barrier_image; + ctx->program->needs_exact = true; + ctx->block->instructions.emplace_back(std::move(mimg)); + return; +} + +void get_buffer_size(isel_context *ctx, Temp desc, Temp dst, bool in_elements) +{ + if (in_elements && ctx->options->chip_class == GFX8) { + Builder bld(ctx->program, ctx->block); + + Temp stride = emit_extract_vector(ctx, desc, 1, s1); + stride = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), stride, Operand((5u << 16) | 16u)); + stride = bld.vop1(aco_opcode::v_cvt_f32_ubyte0, bld.def(v1), stride); + stride = bld.vop1(aco_opcode::v_rcp_iflag_f32, bld.def(v1), stride); + + Temp size = emit_extract_vector(ctx, desc, 2, s1); + size = bld.vop1(aco_opcode::v_cvt_f32_u32, bld.def(v1), size); + + Temp res = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), size, stride); + res = bld.vop1(aco_opcode::v_cvt_u32_f32, bld.def(v1), res); + bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), res); + + // TODO: we can probably calculate this faster on the scalar unit to do: size / stride{1,2,4,8,12,16} + /* idea + * for 1,2,4,8,16, the result is just (stride >> S_FF1_I32_B32) + * in case 12 (or 3?), we have to divide by 3: + * set v_skip in case it's 12 (if we also have to take care of 3, shift first) + * use v_mul_hi_u32 with magic number to divide + * we need some pseudo merge opcode to overwrite the original SALU result with readfirstlane + * disable v_skip + * total: 6 SALU + 2 VALU instructions vs 1 SALU + 6 VALU instructions + */ + + } else { + emit_extract_vector(ctx, desc, 2, dst); + } +} + +void visit_image_size(isel_context *ctx, nir_intrinsic_instr *instr) +{ + const nir_variable *var = nir_deref_instr_get_variable(nir_instr_as_deref(instr->src[0].ssa->parent_instr)); + const struct glsl_type *type = glsl_without_array(var->type); + Builder bld(ctx->program, ctx->block); + + if (glsl_get_sampler_dim(type) == GLSL_SAMPLER_DIM_BUF) { + Temp desc = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), ACO_DESC_BUFFER, NULL, true, false); + return get_buffer_size(ctx, desc, get_ssa_temp(ctx, &instr->dest.ssa), true); + } + + /* LOD */ + Temp lod = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), Operand(0u)); + + /* Resource */ + Temp resource = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), 
ACO_DESC_IMAGE, NULL, true, false); + + Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); + + aco_ptr<MIMG_instruction> mimg{create_instruction<MIMG_instruction>(aco_opcode::image_get_resinfo, Format::MIMG, 2, 1)}; + mimg->operands[0] = Operand(lod); + mimg->operands[1] = Operand(resource); + unsigned& dmask = mimg->dmask; + mimg->dmask = (1 << instr->dest.ssa.num_components) - 1; + mimg->da = glsl_sampler_type_is_array(type); + mimg->can_reorder = true; + Definition& def = mimg->definitions[0]; + ctx->block->instructions.emplace_back(std::move(mimg)); + + if (glsl_get_sampler_dim(type) == GLSL_SAMPLER_DIM_CUBE && + glsl_sampler_type_is_array(type)) { + + assert(instr->dest.ssa.num_components == 3); + Temp tmp = {ctx->program->allocateId(), v3}; + def = Definition(tmp); + emit_split_vector(ctx, tmp, 3); + + /* divide 3rd value by 6 by multiplying with magic number */ + Temp c = bld.copy(bld.def(s1), Operand((uint32_t) 0x2AAAAAAB)); + Temp by_6 = bld.vop3(aco_opcode::v_mul_hi_i32, bld.def(v1), emit_extract_vector(ctx, tmp, 2, v1), c); + + bld.pseudo(aco_opcode::p_create_vector, Definition(dst), + emit_extract_vector(ctx, tmp, 0, v1), + emit_extract_vector(ctx, tmp, 1, v1), + by_6); + + } else if (ctx->options->chip_class >= GFX9 && + glsl_get_sampler_dim(type) == GLSL_SAMPLER_DIM_1D && + glsl_sampler_type_is_array(type)) { + assert(instr->dest.ssa.num_components == 2); + def = Definition(dst); + dmask = 0x5; + } else { + def = Definition(dst); + } + + emit_split_vector(ctx, dst, instr->dest.ssa.num_components); +} + +void visit_load_ssbo(isel_context *ctx, nir_intrinsic_instr *instr) +{ + Builder bld(ctx->program, ctx->block); + unsigned num_components = instr->num_components; + + Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); + Temp rsrc = convert_pointer_to_64_bit(ctx, get_ssa_temp(ctx, instr->src[0].ssa)); + rsrc = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), rsrc, Operand(0u)); + + bool glc = nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT); + load_buffer(ctx, num_components, dst, rsrc, get_ssa_temp(ctx, instr->src[1].ssa), glc); +} + +void visit_store_ssbo(isel_context *ctx, nir_intrinsic_instr *instr) +{ + Builder bld(ctx->program, ctx->block); + Temp data = get_ssa_temp(ctx, instr->src[0].ssa); + unsigned elem_size_bytes = instr->src[0].ssa->bit_size / 8; + unsigned writemask = nir_intrinsic_write_mask(instr); + + Temp offset; + if (ctx->options->chip_class < GFX8) + offset = as_vgpr(ctx,get_ssa_temp(ctx, instr->src[2].ssa)); + else + offset = get_ssa_temp(ctx, instr->src[2].ssa); + + Temp rsrc = convert_pointer_to_64_bit(ctx, get_ssa_temp(ctx, instr->src[1].ssa)); + rsrc = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), rsrc, Operand(0u)); + + bool smem = !ctx->divergent_vals[instr->src[2].ssa->index] && + ctx->options->chip_class >= GFX8; + if (smem) + offset = bld.as_uniform(offset); + bool smem_nonfs = smem && ctx->stage != fragment_fs; + + while (writemask) { + int start, count; + u_bit_scan_consecutive_range(&writemask, &start, &count); + if (count == 3 && smem) { + writemask |= 1u << (start + 2); + count = 2; + } + int num_bytes = count * elem_size_bytes; + + if (num_bytes > 16) { + assert(elem_size_bytes == 8); + writemask |= (((count - 2) << 1) - 1) << (start + 2); + count = 2; + num_bytes = 16; + } + + // TODO: check alignment of sub-dword stores + // TODO: split 3 bytes. 
there is no store instruction for that + + Temp write_data; + if (count != instr->num_components) { + emit_split_vector(ctx, data, instr->num_components); + aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, count, 1)}; + for (int i = 0; i < count; i++) { + Temp elem = emit_extract_vector(ctx, data, start + i, RegClass(data.type(), elem_size_bytes / 4)); + vec->operands[i] = Operand(smem_nonfs ? bld.as_uniform(elem) : elem); + } + write_data = bld.tmp(smem_nonfs ? RegType::sgpr : data.type(), count * elem_size_bytes / 4); + vec->definitions[0] = Definition(write_data); + ctx->block->instructions.emplace_back(std::move(vec)); + } else if (!smem && data.type() != RegType::vgpr) { + assert(num_bytes % 4 == 0); + write_data = bld.copy(bld.def(RegType::vgpr, num_bytes / 4), data); + } else if (smem_nonfs && data.type() == RegType::vgpr) { + assert(num_bytes % 4 == 0); + write_data = bld.as_uniform(data); + } else { + write_data = data; + } + + aco_opcode vmem_op, smem_op; + switch (num_bytes) { + case 4: + vmem_op = aco_opcode::buffer_store_dword; + smem_op = aco_opcode::s_buffer_store_dword; + break; + case 8: + vmem_op = aco_opcode::buffer_store_dwordx2; + smem_op = aco_opcode::s_buffer_store_dwordx2; + break; + case 12: + vmem_op = aco_opcode::buffer_store_dwordx3; + smem_op = aco_opcode::last_opcode; + assert(!smem); + break; + case 16: + vmem_op = aco_opcode::buffer_store_dwordx4; + smem_op = aco_opcode::s_buffer_store_dwordx4; + break; + default: + unreachable("Store SSBO not implemented for this size."); + } + if (ctx->stage == fragment_fs) + smem_op = aco_opcode::p_fs_buffer_store_smem; + + if (smem) { + aco_ptr<SMEM_instruction> store{create_instruction<SMEM_instruction>(smem_op, Format::SMEM, 3, 0)}; + store->operands[0] = Operand(rsrc); + if (start) { + Temp off = bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc), + offset, Operand(start * elem_size_bytes)); + store->operands[1] = Operand(off); + } else { + store->operands[1] = Operand(offset); + } + if (smem_op != aco_opcode::p_fs_buffer_store_smem) + store->operands[1].setFixed(m0); + store->operands[2] = Operand(write_data); + store->glc = nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT | ACCESS_NON_READABLE); + store->disable_wqm = true; + store->barrier = barrier_buffer; + ctx->block->instructions.emplace_back(std::move(store)); + ctx->program->wb_smem_l1_on_end = true; + if (smem_op == aco_opcode::p_fs_buffer_store_smem) { + ctx->block->kind |= block_kind_needs_lowering; + ctx->program->needs_exact = true; + } + } else { + aco_ptr<MUBUF_instruction> store{create_instruction<MUBUF_instruction>(vmem_op, Format::MUBUF, 4, 0)}; + store->operands[0] = offset.type() == RegType::vgpr ? Operand(offset) : Operand(v1); + store->operands[1] = Operand(rsrc); + store->operands[2] = offset.type() == RegType::sgpr ? 
Operand(offset) : Operand((uint32_t) 0); + store->operands[3] = Operand(write_data); + store->offset = start * elem_size_bytes; + store->offen = (offset.type() == RegType::vgpr); + store->glc = nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT | ACCESS_NON_READABLE); + store->disable_wqm = true; + store->barrier = barrier_buffer; + ctx->program->needs_exact = true; + ctx->block->instructions.emplace_back(std::move(store)); + } + } +} + +void visit_atomic_ssbo(isel_context *ctx, nir_intrinsic_instr *instr) +{ + /* return the previous value if dest is ever used */ + bool return_previous = false; + nir_foreach_use_safe(use_src, &instr->dest.ssa) { + return_previous = true; + break; + } + nir_foreach_if_use_safe(use_src, &instr->dest.ssa) { + return_previous = true; + break; + } + + Builder bld(ctx->program, ctx->block); + Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[2].ssa)); + + if (instr->intrinsic == nir_intrinsic_ssbo_atomic_comp_swap) + data = bld.pseudo(aco_opcode::p_create_vector, bld.def(RegType::vgpr, data.size() * 2), + get_ssa_temp(ctx, instr->src[3].ssa), data); + + Temp offset; + if (ctx->options->chip_class < GFX8) + offset = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[1].ssa)); + else + offset = get_ssa_temp(ctx, instr->src[1].ssa); + + Temp rsrc = convert_pointer_to_64_bit(ctx, get_ssa_temp(ctx, instr->src[0].ssa)); + rsrc = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), rsrc, Operand(0u)); + + Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); + + aco_opcode op32, op64; + switch (instr->intrinsic) { + case nir_intrinsic_ssbo_atomic_add: + op32 = aco_opcode::buffer_atomic_add; + op64 = aco_opcode::buffer_atomic_add_x2; + break; + case nir_intrinsic_ssbo_atomic_imin: + op32 = aco_opcode::buffer_atomic_smin; + op64 = aco_opcode::buffer_atomic_smin_x2; + break; + case nir_intrinsic_ssbo_atomic_umin: + op32 = aco_opcode::buffer_atomic_umin; + op64 = aco_opcode::buffer_atomic_umin_x2; + break; + case nir_intrinsic_ssbo_atomic_imax: + op32 = aco_opcode::buffer_atomic_smax; + op64 = aco_opcode::buffer_atomic_smax_x2; + break; + case nir_intrinsic_ssbo_atomic_umax: + op32 = aco_opcode::buffer_atomic_umax; + op64 = aco_opcode::buffer_atomic_umax_x2; + break; + case nir_intrinsic_ssbo_atomic_and: + op32 = aco_opcode::buffer_atomic_and; + op64 = aco_opcode::buffer_atomic_and_x2; + break; + case nir_intrinsic_ssbo_atomic_or: + op32 = aco_opcode::buffer_atomic_or; + op64 = aco_opcode::buffer_atomic_or_x2; + break; + case nir_intrinsic_ssbo_atomic_xor: + op32 = aco_opcode::buffer_atomic_xor; + op64 = aco_opcode::buffer_atomic_xor_x2; + break; + case nir_intrinsic_ssbo_atomic_exchange: + op32 = aco_opcode::buffer_atomic_swap; + op64 = aco_opcode::buffer_atomic_swap_x2; + break; + case nir_intrinsic_ssbo_atomic_comp_swap: + op32 = aco_opcode::buffer_atomic_cmpswap; + op64 = aco_opcode::buffer_atomic_cmpswap_x2; + break; + default: + unreachable("visit_atomic_ssbo should only be called with nir_intrinsic_ssbo_atomic_* instructions."); + } + aco_opcode op = instr->dest.ssa.bit_size == 32 ? op32 : op64; + aco_ptr<MUBUF_instruction> mubuf{create_instruction<MUBUF_instruction>(op, Format::MUBUF, 4, return_previous ? 1 : 0)}; + mubuf->operands[0] = offset.type() == RegType::vgpr ? Operand(offset) : Operand(v1); + mubuf->operands[1] = Operand(rsrc); + mubuf->operands[2] = offset.type() == RegType::sgpr ? 
Operand(offset) : Operand((uint32_t) 0); + mubuf->operands[3] = Operand(data); + if (return_previous) + mubuf->definitions[0] = Definition(dst); + mubuf->offset = 0; + mubuf->offen = (offset.type() == RegType::vgpr); + mubuf->glc = return_previous; + mubuf->disable_wqm = true; + mubuf->barrier = barrier_buffer; + ctx->program->needs_exact = true; + ctx->block->instructions.emplace_back(std::move(mubuf)); +} + +void visit_get_buffer_size(isel_context *ctx, nir_intrinsic_instr *instr) { + + Temp index = convert_pointer_to_64_bit(ctx, get_ssa_temp(ctx, instr->src[0].ssa)); + Builder bld(ctx->program, ctx->block); + Temp desc = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), index, Operand(0u)); + get_buffer_size(ctx, desc, get_ssa_temp(ctx, &instr->dest.ssa), false); +} + +void visit_load_global(isel_context *ctx, nir_intrinsic_instr *instr) +{ + Builder bld(ctx->program, ctx->block); + unsigned num_components = instr->num_components; + unsigned num_bytes = num_components * instr->dest.ssa.bit_size / 8; + + Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); + Temp addr = get_ssa_temp(ctx, instr->src[0].ssa); + + bool glc = nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT); + aco_opcode op; + if (dst.type() == RegType::vgpr || (glc && ctx->options->chip_class < GFX8)) { + bool global = ctx->options->chip_class >= GFX9; + aco_opcode op; + switch (num_bytes) { + case 4: + op = global ? aco_opcode::global_load_dword : aco_opcode::flat_load_dword; + break; + case 8: + op = global ? aco_opcode::global_load_dwordx2 : aco_opcode::flat_load_dwordx2; + break; + case 12: + op = global ? aco_opcode::global_load_dwordx3 : aco_opcode::flat_load_dwordx3; + break; + case 16: + op = global ? aco_opcode::global_load_dwordx4 : aco_opcode::flat_load_dwordx4; + break; + default: + unreachable("load_global not implemented for this size."); + } + aco_ptr<FLAT_instruction> flat{create_instruction<FLAT_instruction>(op, global ? 
Format::GLOBAL : Format::FLAT, 2, 1)}; + flat->operands[0] = Operand(addr); + flat->operands[1] = Operand(s1); + flat->glc = glc; + + if (dst.type() == RegType::sgpr) { + Temp vec = bld.tmp(RegType::vgpr, dst.size()); + flat->definitions[0] = Definition(vec); + ctx->block->instructions.emplace_back(std::move(flat)); + bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), vec); + } else { + flat->definitions[0] = Definition(dst); + ctx->block->instructions.emplace_back(std::move(flat)); + } + emit_split_vector(ctx, dst, num_components); + } else { + switch (num_bytes) { + case 4: + op = aco_opcode::s_load_dword; + break; + case 8: + op = aco_opcode::s_load_dwordx2; + break; + case 12: + case 16: + op = aco_opcode::s_load_dwordx4; + break; + default: + unreachable("load_global not implemented for this size."); + } + aco_ptr<SMEM_instruction> load{create_instruction<SMEM_instruction>(op, Format::SMEM, 2, 1)}; + load->operands[0] = Operand(addr); + load->operands[1] = Operand(0u); + load->definitions[0] = Definition(dst); + load->glc = glc; + load->barrier = barrier_buffer; + assert(ctx->options->chip_class >= GFX8 || !glc); + + if (dst.size() == 3) { + /* trim vector */ + Temp vec = bld.tmp(s4); + load->definitions[0] = Definition(vec); + ctx->block->instructions.emplace_back(std::move(load)); + emit_split_vector(ctx, vec, 4); + + bld.pseudo(aco_opcode::p_create_vector, Definition(dst), + emit_extract_vector(ctx, vec, 0, s1), + emit_extract_vector(ctx, vec, 1, s1), + emit_extract_vector(ctx, vec, 2, s1)); + } else { + ctx->block->instructions.emplace_back(std::move(load)); + } + } +} + +void visit_store_global(isel_context *ctx, nir_intrinsic_instr *instr) +{ + Builder bld(ctx->program, ctx->block); + unsigned elem_size_bytes = instr->src[0].ssa->bit_size / 8; + + Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa)); + Temp addr = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[1].ssa)); + + unsigned writemask = nir_intrinsic_write_mask(instr); + while (writemask) { + int start, count; + u_bit_scan_consecutive_range(&writemask, &start, &count); + unsigned num_bytes = count * elem_size_bytes; + + Temp write_data = data; + if (count != instr->num_components) { + aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, count, 1)}; + for (int i = 0; i < count; i++) + vec->operands[i] = Operand(emit_extract_vector(ctx, data, start + i, v1)); + write_data = bld.tmp(RegType::vgpr, count); + vec->definitions[0] = Definition(write_data); + ctx->block->instructions.emplace_back(std::move(vec)); + } + + unsigned offset = start * elem_size_bytes; + if (offset > 0 && ctx->options->chip_class < GFX9) { + Temp addr0 = bld.tmp(v1), addr1 = bld.tmp(v1); + Temp new_addr0 = bld.tmp(v1), new_addr1 = bld.tmp(v1); + Temp carry = bld.tmp(s2); + bld.pseudo(aco_opcode::p_split_vector, Definition(addr0), Definition(addr1), addr); + + bld.vop2(aco_opcode::v_add_co_u32, Definition(new_addr0), bld.hint_vcc(Definition(carry)), + Operand(offset), addr0); + bld.vop2(aco_opcode::v_addc_co_u32, Definition(new_addr1), bld.def(s2), + Operand(0u), addr1, + carry).def(1).setHint(vcc); + + addr = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), new_addr0, new_addr1); + + offset = 0; + } + + bool glc = nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT | ACCESS_NON_READABLE); + bool global = ctx->options->chip_class >= GFX9; + aco_opcode op; + switch (num_bytes) { + case 4: + op = global ? 
aco_opcode::global_store_dword : aco_opcode::flat_store_dword; + break; + case 8: + op = global ? aco_opcode::global_store_dwordx2 : aco_opcode::flat_store_dwordx2; + break; + case 12: + op = global ? aco_opcode::global_store_dwordx3 : aco_opcode::flat_store_dwordx3; + break; + case 16: + op = global ? aco_opcode::global_store_dwordx4 : aco_opcode::flat_store_dwordx4; + break; + default: + unreachable("store_global not implemented for this size."); + } + aco_ptr<FLAT_instruction> flat{create_instruction<FLAT_instruction>(op, global ? Format::GLOBAL : Format::FLAT, 3, 0)}; + flat->operands[0] = Operand(addr); + flat->operands[1] = Operand(s1); + flat->operands[2] = Operand(data); + flat->glc = glc; + flat->offset = offset; + ctx->block->instructions.emplace_back(std::move(flat)); + } +} + +void emit_memory_barrier(isel_context *ctx, nir_intrinsic_instr *instr) { + Builder bld(ctx->program, ctx->block); + switch(instr->intrinsic) { + case nir_intrinsic_group_memory_barrier: + case nir_intrinsic_memory_barrier: + bld.barrier(aco_opcode::p_memory_barrier_all); + break; + case nir_intrinsic_memory_barrier_atomic_counter: + bld.barrier(aco_opcode::p_memory_barrier_atomic); + break; + case nir_intrinsic_memory_barrier_buffer: + bld.barrier(aco_opcode::p_memory_barrier_buffer); + break; + case nir_intrinsic_memory_barrier_image: + bld.barrier(aco_opcode::p_memory_barrier_image); + break; + case nir_intrinsic_memory_barrier_shared: + bld.barrier(aco_opcode::p_memory_barrier_shared); + break; + default: + unreachable("Unimplemented memory barrier intrinsic"); + break; + } +} + +Operand load_lds_size_m0(isel_context *ctx) +{ + /* TODO: m0 does not need to be initialized on GFX9+ */ + Builder bld(ctx->program, ctx->block); + return bld.m0((Temp)bld.sopk(aco_opcode::s_movk_i32, bld.def(s1, m0), 0xffff)); +} + + +void visit_load_shared(isel_context *ctx, nir_intrinsic_instr *instr) +{ + // TODO: implement sparse reads using ds_read2_b32 and nir_ssa_def_components_read() + Operand m = load_lds_size_m0(ctx); + Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); + assert(instr->dest.ssa.bit_size >= 32 && "Bitsize not supported in load_shared."); + Temp address = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa)); + Builder bld(ctx->program, ctx->block); + + unsigned elem_size_bytes = instr->dest.ssa.bit_size / 8; + unsigned bytes_read = 0; + unsigned result_size = 0; + unsigned total_bytes = instr->num_components * elem_size_bytes; + unsigned align = nir_intrinsic_align_mul(instr) ? nir_intrinsic_align(instr) : instr->dest.ssa.bit_size / 8; + std::array<Temp, 4> result; + + while (bytes_read < total_bytes) { + unsigned todo = total_bytes - bytes_read; + bool aligned8 = bytes_read % 8 == 0 && align % 8 == 0; + bool aligned16 = bytes_read % 16 == 0 && align % 16 == 0; + + aco_opcode op = aco_opcode::last_opcode; + if (todo >= 16 && aligned16) { + op = aco_opcode::ds_read_b128; + todo = 16; + } else if (todo >= 12 && aligned16) { + op = aco_opcode::ds_read_b96; + todo = 12; + } else if (todo >= 8) { + op = aligned8 ? aco_opcode::ds_read_b64 : aco_opcode::ds_read2_b32; + todo = 8; + } else if (todo >= 4) { + op = aco_opcode::ds_read_b32; + todo = 4; + } else { + assert(false); + } + assert(todo % elem_size_bytes == 0); + unsigned num_elements = todo / elem_size_bytes; + unsigned offset = nir_intrinsic_base(instr) + bytes_read; + unsigned max_offset = op == aco_opcode::ds_read2_b32 ? 
1019 : 65535; + + Temp address_offset = address; + if (offset > max_offset) { + address_offset = bld.vadd32(bld.def(v1), Operand((uint32_t)nir_intrinsic_base(instr)), address_offset); + offset = bytes_read; + } + assert(offset <= max_offset); /* bytes_read shouldn't be large enough for this to happen */ + + Temp res; + if (instr->num_components == 1 && dst.type() == RegType::vgpr) + res = dst; + else + res = bld.tmp(RegClass(RegType::vgpr, todo / 4)); + + if (op == aco_opcode::ds_read2_b32) + res = bld.ds(op, Definition(res), address_offset, m, offset >> 2, (offset >> 2) + 1); + else + res = bld.ds(op, Definition(res), address_offset, m, offset); + + if (instr->num_components == 1) { + assert(todo == total_bytes); + if (dst.type() == RegType::sgpr) + bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), res); + return; + } + + if (dst.type() == RegType::sgpr) + res = bld.as_uniform(res); + + if (num_elements == 1) { + result[result_size++] = res; + } else { + assert(res != dst && res.size() % num_elements == 0); + aco_ptr<Pseudo_instruction> split{create_instruction<Pseudo_instruction>(aco_opcode::p_split_vector, Format::PSEUDO, 1, num_elements)}; + split->operands[0] = Operand(res); + for (unsigned i = 0; i < num_elements; i++) + split->definitions[i] = Definition(result[result_size++] = bld.tmp(res.type(), elem_size_bytes / 4)); + ctx->block->instructions.emplace_back(std::move(split)); + } + + bytes_read += todo; + } + + assert(result_size == instr->num_components && result_size > 1); + aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, result_size, 1)}; + for (unsigned i = 0; i < result_size; i++) + vec->operands[i] = Operand(result[i]); + vec->definitions[0] = Definition(dst); + ctx->block->instructions.emplace_back(std::move(vec)); + ctx->allocated_vec.emplace(dst.id(), result); +} + +void ds_write_helper(isel_context *ctx, Operand m, Temp address, Temp data, unsigned offset0, unsigned offset1, unsigned align) +{ + Builder bld(ctx->program, ctx->block); + unsigned bytes_written = 0; + while (bytes_written < data.size() * 4) { + unsigned todo = data.size() * 4 - bytes_written; + bool aligned8 = bytes_written % 8 == 0 && align % 8 == 0; + bool aligned16 = bytes_written % 16 == 0 && align % 16 == 0; + + aco_opcode op = aco_opcode::last_opcode; + unsigned size = 0; + if (todo >= 16 && aligned16) { + op = aco_opcode::ds_write_b128; + size = 4; + } else if (todo >= 12 && aligned16) { + op = aco_opcode::ds_write_b96; + size = 3; + } else if (todo >= 8) { + op = aligned8 ? aco_opcode::ds_write_b64 : aco_opcode::ds_write2_b32; + size = 2; + } else if (todo >= 4) { + op = aco_opcode::ds_write_b32; + size = 1; + } else { + assert(false); + } + + bool write2 = op == aco_opcode::ds_write2_b32; + unsigned offset = offset0 + offset1 + bytes_written; + unsigned max_offset = write2 ? 
1020 : 65535; + Temp address_offset = address; + if (offset > max_offset) { + address_offset = bld.vadd32(bld.def(v1), Operand(offset0), address_offset); + offset = offset1 + bytes_written; + } + assert(offset <= max_offset); /* offset1 shouldn't be large enough for this to happen */ + + if (write2) { + Temp val0 = emit_extract_vector(ctx, data, bytes_written >> 2, v1); + Temp val1 = emit_extract_vector(ctx, data, (bytes_written >> 2) + 1, v1); + bld.ds(op, address_offset, val0, val1, m, offset >> 2, (offset >> 2) + 1); + } else { + Temp val = emit_extract_vector(ctx, data, bytes_written >> 2, RegClass(RegType::vgpr, size)); + bld.ds(op, address_offset, val, m, offset); + } + + bytes_written += size * 4; + } +} + +void visit_store_shared(isel_context *ctx, nir_intrinsic_instr *instr) +{ + unsigned offset = nir_intrinsic_base(instr); + unsigned writemask = nir_intrinsic_write_mask(instr); + Operand m = load_lds_size_m0(ctx); + Temp data = get_ssa_temp(ctx, instr->src[0].ssa); + Temp address = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[1].ssa)); + unsigned elem_size_bytes = instr->src[0].ssa->bit_size / 8; + assert(elem_size_bytes >= 4 && "Only 32bit & 64bit store_shared currently supported."); + + /* we need at most two stores for 32bit variables */ + int start[2], count[2]; + u_bit_scan_consecutive_range(&writemask, &start[0], &count[0]); + u_bit_scan_consecutive_range(&writemask, &start[1], &count[1]); + assert(writemask == 0); + + /* one combined store is sufficient */ + if (count[0] == count[1]) { + Builder bld(ctx->program, ctx->block); + + Temp address_offset = address; + if ((offset >> 2) + start[1] > 255) { + address_offset = bld.vadd32(bld.def(v1), Operand(offset), address_offset); + offset = 0; + } + + assert(count[0] == 1); + Temp val0 = emit_extract_vector(ctx, data, start[0], v1); + Temp val1 = emit_extract_vector(ctx, data, start[1], v1); + aco_opcode op = elem_size_bytes == 4 ? aco_opcode::ds_write2_b32 : aco_opcode::ds_write2_b64; + offset = offset / elem_size_bytes; + bld.ds(op, address_offset, val0, val1, m, + offset + start[0], offset + start[1]); + return; + } + + unsigned align = nir_intrinsic_align_mul(instr) ? 
nir_intrinsic_align(instr) : elem_size_bytes; + for (unsigned i = 0; i < 2; i++) { + if (count[i] == 0) + continue; + + Temp write_data = emit_extract_vector(ctx, data, start[i], RegClass(RegType::vgpr, count[i] * elem_size_bytes / 4)); + ds_write_helper(ctx, m, address, write_data, offset, start[i] * elem_size_bytes, align); + } + return; +} + +void visit_shared_atomic(isel_context *ctx, nir_intrinsic_instr *instr) +{ + unsigned offset = nir_intrinsic_base(instr); + Operand m = load_lds_size_m0(ctx); + Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[1].ssa)); + Temp address = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa)); + + unsigned num_operands = 3; + aco_opcode op32, op64, op32_rtn, op64_rtn; + switch(instr->intrinsic) { + case nir_intrinsic_shared_atomic_add: + op32 = aco_opcode::ds_add_u32; + op64 = aco_opcode::ds_add_u64; + op32_rtn = aco_opcode::ds_add_rtn_u32; + op64_rtn = aco_opcode::ds_add_rtn_u64; + break; + case nir_intrinsic_shared_atomic_imin: + op32 = aco_opcode::ds_min_i32; + op64 = aco_opcode::ds_min_i64; + op32_rtn = aco_opcode::ds_min_rtn_i32; + op64_rtn = aco_opcode::ds_min_rtn_i64; + break; + case nir_intrinsic_shared_atomic_umin: + op32 = aco_opcode::ds_min_u32; + op64 = aco_opcode::ds_min_u64; + op32_rtn = aco_opcode::ds_min_rtn_u32; + op64_rtn = aco_opcode::ds_min_rtn_u64; + break; + case nir_intrinsic_shared_atomic_imax: + op32 = aco_opcode::ds_max_i32; + op64 = aco_opcode::ds_max_i64; + op32_rtn = aco_opcode::ds_max_rtn_i32; + op64_rtn = aco_opcode::ds_max_rtn_i64; + break; + case nir_intrinsic_shared_atomic_umax: + op32 = aco_opcode::ds_max_u32; + op64 = aco_opcode::ds_max_u64; + op32_rtn = aco_opcode::ds_max_rtn_u32; + op64_rtn = aco_opcode::ds_max_rtn_u64; + break; + case nir_intrinsic_shared_atomic_and: + op32 = aco_opcode::ds_and_b32; + op64 = aco_opcode::ds_and_b64; + op32_rtn = aco_opcode::ds_and_rtn_b32; + op64_rtn = aco_opcode::ds_and_rtn_b64; + break; + case nir_intrinsic_shared_atomic_or: + op32 = aco_opcode::ds_or_b32; + op64 = aco_opcode::ds_or_b64; + op32_rtn = aco_opcode::ds_or_rtn_b32; + op64_rtn = aco_opcode::ds_or_rtn_b64; + break; + case nir_intrinsic_shared_atomic_xor: + op32 = aco_opcode::ds_xor_b32; + op64 = aco_opcode::ds_xor_b64; + op32_rtn = aco_opcode::ds_xor_rtn_b32; + op64_rtn = aco_opcode::ds_xor_rtn_b64; + break; + case nir_intrinsic_shared_atomic_exchange: + op32 = aco_opcode::ds_write_b32; + op64 = aco_opcode::ds_write_b64; + op32_rtn = aco_opcode::ds_wrxchg_rtn_b32; + op64_rtn = aco_opcode::ds_wrxchg2_rtn_b64; + break; + case nir_intrinsic_shared_atomic_comp_swap: + op32 = aco_opcode::ds_cmpst_b32; + op64 = aco_opcode::ds_cmpst_b64; + op32_rtn = aco_opcode::ds_cmpst_rtn_b32; + op64_rtn = aco_opcode::ds_cmpst_rtn_b64; + num_operands = 4; + break; + default: + unreachable("Unhandled shared atomic intrinsic"); + } + + /* return the previous value if dest is ever used */ + bool return_previous = false; + nir_foreach_use_safe(use_src, &instr->dest.ssa) { + return_previous = true; + break; + } + nir_foreach_if_use_safe(use_src, &instr->dest.ssa) { + return_previous = true; + break; + } + + aco_opcode op; + if (data.size() == 1) { + assert(instr->dest.ssa.bit_size == 32); + op = return_previous ? op32_rtn : op32; + } else { + assert(instr->dest.ssa.bit_size == 64); + op = return_previous ? 
op64_rtn : op64; + } + + if (offset > 65535) { + Builder bld(ctx->program, ctx->block); + address = bld.vadd32(bld.def(v1), Operand(offset), address); + offset = 0; + } + + aco_ptr<DS_instruction> ds; + ds.reset(create_instruction<DS_instruction>(op, Format::DS, num_operands, return_previous ? 1 : 0)); + ds->operands[0] = Operand(address); + ds->operands[1] = Operand(data); + if (num_operands == 4) + ds->operands[2] = Operand(get_ssa_temp(ctx, instr->src[2].ssa)); + ds->operands[num_operands - 1] = m; + ds->offset0 = offset; + if (return_previous) + ds->definitions[0] = Definition(get_ssa_temp(ctx, &instr->dest.ssa)); + ctx->block->instructions.emplace_back(std::move(ds)); +} + +void visit_load_scratch(isel_context *ctx, nir_intrinsic_instr *instr) { + assert(instr->dest.ssa.bit_size == 32 || instr->dest.ssa.bit_size == 64); + Builder bld(ctx->program, ctx->block); + Temp scratch_addr = ctx->private_segment_buffer; + if (ctx->stage != MESA_SHADER_COMPUTE) + scratch_addr = bld.smem(aco_opcode::s_load_dwordx2, bld.def(s2), ctx->private_segment_buffer, Operand(0u)); + uint32_t rsrc_conf; + /* older generations need element size = 16 bytes */ + if (ctx->program->chip_class >= GFX9) + rsrc_conf = 0x00E00000u; + else + rsrc_conf = 0x00F80000u; + /* buffer res = addr + num_records = -1, index_stride = 64, add_tid_enable = true */ + Temp rsrc = bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), scratch_addr, Operand(-1u), Operand(rsrc_conf)); + Temp offset = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa)); + Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); + + aco_opcode op; + switch (dst.size()) { + case 1: + op = aco_opcode::buffer_load_dword; + break; + case 2: + op = aco_opcode::buffer_load_dwordx2; + break; + case 3: + op = aco_opcode::buffer_load_dwordx3; + break; + case 4: + op = aco_opcode::buffer_load_dwordx4; + break; + case 6: + case 8: { + std::array<Temp,NIR_MAX_VEC_COMPONENTS> elems; + Temp lower = bld.mubuf(aco_opcode::buffer_load_dwordx4, + bld.def(v4), offset, rsrc, + ctx->scratch_offset, 0, true); + Temp upper = bld.mubuf(dst.size() == 6 ? aco_opcode::buffer_load_dwordx2 : + aco_opcode::buffer_load_dwordx4, + dst.size() == 6 ? 
bld.def(v2) : bld.def(v4), + offset, rsrc, ctx->scratch_offset, 16, true); + emit_split_vector(ctx, lower, 2); + elems[0] = emit_extract_vector(ctx, lower, 0, v2); + elems[1] = emit_extract_vector(ctx, lower, 1, v2); + if (dst.size() == 8) { + emit_split_vector(ctx, upper, 2); + elems[2] = emit_extract_vector(ctx, upper, 0, v2); + elems[3] = emit_extract_vector(ctx, upper, 1, v2); + } else { + elems[2] = upper; + } + + aco_ptr<Instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, + Format::PSEUDO, dst.size() / 2, 1)}; + for (unsigned i = 0; i < dst.size() / 2; i++) + vec->operands[i] = Operand(elems[i]); + vec->definitions[0] = Definition(dst); + bld.insert(std::move(vec)); + ctx->allocated_vec.emplace(dst.id(), elems); + return; + } + default: + unreachable("Wrong dst size for nir_intrinsic_load_scratch"); + } + + bld.mubuf(op, Definition(dst), offset, rsrc, ctx->scratch_offset, 0, true); + emit_split_vector(ctx, dst, instr->num_components); +} + +void visit_store_scratch(isel_context *ctx, nir_intrinsic_instr *instr) { + assert(instr->src[0].ssa->bit_size == 32 || instr->src[0].ssa->bit_size == 64); + Builder bld(ctx->program, ctx->block); + Temp scratch_addr = ctx->private_segment_buffer; + if (ctx->stage != MESA_SHADER_COMPUTE) + scratch_addr = bld.smem(aco_opcode::s_load_dwordx2, bld.def(s2), ctx->private_segment_buffer, Operand(0u)); + uint32_t rsrc_conf; + /* older generations need element size = 16 bytes */ + if (ctx->program->chip_class >= GFX9) + rsrc_conf = 0x00E00000u; + else + rsrc_conf = 0x00F80000u; + /* buffer res = addr + num_records = -1, index_stride = 64, add_tid_enable = true */ + Temp rsrc = bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), scratch_addr, Operand(-1u), Operand(rsrc_conf)); + Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa)); + Temp offset = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[1].ssa)); + + unsigned elem_size_bytes = instr->src[0].ssa->bit_size / 8; + unsigned writemask = nir_intrinsic_write_mask(instr); + + while (writemask) { + int start, count; + u_bit_scan_consecutive_range(&writemask, &start, &count); + int num_bytes = count * elem_size_bytes; + + if (num_bytes > 16) { + assert(elem_size_bytes == 8); + writemask |= (((count - 2) << 1) - 1) << (start + 2); + count = 2; + num_bytes = 16; + } + + // TODO: check alignment of sub-dword stores + // TODO: split 3 bytes. 
there is no store instruction for that + + Temp write_data; + if (count != instr->num_components) { + aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, count, 1)}; + for (int i = 0; i < count; i++) { + Temp elem = emit_extract_vector(ctx, data, start + i, RegClass(RegType::vgpr, elem_size_bytes / 4)); + vec->operands[i] = Operand(elem); + } + write_data = bld.tmp(RegClass(RegType::vgpr, count * elem_size_bytes / 4)); + vec->definitions[0] = Definition(write_data); + ctx->block->instructions.emplace_back(std::move(vec)); + } else { + write_data = data; + } + + aco_opcode op; + switch (num_bytes) { + case 4: + op = aco_opcode::buffer_store_dword; + break; + case 8: + op = aco_opcode::buffer_store_dwordx2; + break; + case 12: + op = aco_opcode::buffer_store_dwordx3; + break; + case 16: + op = aco_opcode::buffer_store_dwordx4; + break; + default: + unreachable("Invalid data size for nir_intrinsic_store_scratch."); + } + + bld.mubuf(op, offset, rsrc, ctx->scratch_offset, write_data, start * elem_size_bytes, true); + } +} + +void visit_load_sample_mask_in(isel_context *ctx, nir_intrinsic_instr *instr) { + uint8_t log2_ps_iter_samples; + if (ctx->program->info->ps.force_persample) { + log2_ps_iter_samples = + util_logbase2(ctx->options->key.fs.num_samples); + } else { + log2_ps_iter_samples = ctx->options->key.fs.log2_ps_iter_samples; + } + + /* The bit pattern matches that used by fixed function fragment + * processing. */ + static const unsigned ps_iter_masks[] = { + 0xffff, /* not used */ + 0x5555, + 0x1111, + 0x0101, + 0x0001, + }; + assert(log2_ps_iter_samples < ARRAY_SIZE(ps_iter_masks)); + + Builder bld(ctx->program, ctx->block); + + Temp sample_id = bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1), ctx->fs_inputs[fs_input::ancillary], Operand(8u), Operand(4u)); + Temp ps_iter_mask = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), Operand(ps_iter_masks[log2_ps_iter_samples])); + Temp mask = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), sample_id, ps_iter_mask); + Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); + bld.vop2(aco_opcode::v_and_b32, Definition(dst), mask, ctx->fs_inputs[fs_input::sample_coverage]); +} + +Temp emit_boolean_reduce(isel_context *ctx, nir_op op, unsigned cluster_size, Temp src) +{ + Builder bld(ctx->program, ctx->block); + + if (cluster_size == 1) { + return src; + } if (op == nir_op_iand && cluster_size == 4) { + //subgroupClusteredAnd(val, 4) -> ~wqm(exec & ~val) + Temp tmp = bld.sop2(aco_opcode::s_andn2_b64, bld.def(s2), bld.def(s1, scc), Operand(exec, s2), src); + return bld.sop1(aco_opcode::s_not_b64, bld.def(s2), bld.def(s1, scc), + bld.sop1(aco_opcode::s_wqm_b64, bld.def(s2), bld.def(s1, scc), tmp)); + } else if (op == nir_op_ior && cluster_size == 4) { + //subgroupClusteredOr(val, 4) -> wqm(val & exec) + return bld.sop1(aco_opcode::s_wqm_b64, bld.def(s2), bld.def(s1, scc), + bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.def(s1, scc), src, Operand(exec, s2))); + } else if (op == nir_op_iand && cluster_size == 64) { + //subgroupAnd(val) -> (exec & ~val) == 0 + Temp tmp = bld.sop2(aco_opcode::s_andn2_b64, bld.def(s2), bld.def(s1, scc), Operand(exec, s2), src).def(1).getTemp(); + return bld.sopc(aco_opcode::s_cmp_eq_u32, bld.def(s1, scc), tmp, Operand(0u)); + } else if (op == nir_op_ior && cluster_size == 64) { + //subgroupOr(val) -> (val & exec) != 0 + return bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.def(s1, scc), src, Operand(exec, s2)).def(1).getTemp(); + } else if (op == nir_op_ixor && 
cluster_size == 64) {
+ //subgroupXor(val) -> s_bcnt1_i32_b64(val & exec) & 1
+ Temp tmp = bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.def(s1, scc), src, Operand(exec, s2));
+ tmp = bld.sop1(aco_opcode::s_bcnt1_i32_b64, bld.def(s2), bld.def(s1, scc), tmp);
+ return bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), tmp, Operand(1u)).def(1).getTemp();
+ } else {
+ //subgroupClustered{And,Or,Xor}(val, n) ->
+ //lane_id = v_mbcnt_hi_u32_b32(-1, v_mbcnt_lo_u32_b32(-1, 0))
+ //cluster_offset = ~(n - 1) & lane_id
+ //cluster_mask = ((1 << n) - 1)
+ //subgroupClusteredAnd():
+ //   return ((val | ~exec) >> cluster_offset) & cluster_mask == cluster_mask
+ //subgroupClusteredOr():
+ //   return ((val & exec) >> cluster_offset) & cluster_mask != 0
+ //subgroupClusteredXor():
+ //   return v_bcnt_u32_b32(((val & exec) >> cluster_offset) & cluster_mask, 0) & 1 != 0
+ Temp lane_id = bld.vop3(aco_opcode::v_mbcnt_hi_u32_b32, bld.def(v1), Operand((uint32_t) -1),
+ bld.vop3(aco_opcode::v_mbcnt_lo_u32_b32, bld.def(v1), Operand((uint32_t) -1), Operand(0u)));
+ Temp cluster_offset = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(~uint32_t(cluster_size - 1)), lane_id);
+
+ Temp tmp;
+ if (op == nir_op_iand)
+ tmp = bld.sop2(aco_opcode::s_orn2_b64, bld.def(s2), bld.def(s1, scc), src, Operand(exec, s2));
+ else
+ tmp = bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.def(s1, scc), src, Operand(exec, s2));
+
+ uint32_t cluster_mask = cluster_size == 32 ? -1 : (1u << cluster_size) - 1u;
+ tmp = bld.vop3(aco_opcode::v_lshrrev_b64, bld.def(v2), cluster_offset, tmp);
+ tmp = emit_extract_vector(ctx, tmp, 0, v1);
+ if (cluster_mask != 0xffffffff)
+ tmp = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(cluster_mask), tmp);
+
+ Definition cmp_def = Definition();
+ if (op == nir_op_iand) {
+ cmp_def = bld.vopc(aco_opcode::v_cmp_eq_u32, bld.def(s2), Operand(cluster_mask), tmp).def(0);
+ } else if (op == nir_op_ior) {
+ cmp_def = bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(s2), Operand(0u), tmp).def(0);
+ } else if (op == nir_op_ixor) {
+ tmp = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(1u),
+ bld.vop3(aco_opcode::v_bcnt_u32_b32, bld.def(v1), tmp, Operand(0u)));
+ cmp_def = bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(s2), Operand(0u), tmp).def(0);
+ }
+ cmp_def.setHint(vcc);
+ return cmp_def.getTemp();
+ }
+}
+
+Temp emit_boolean_exclusive_scan(isel_context *ctx, nir_op op, Temp src)
+{
+ Builder bld(ctx->program, ctx->block);
+
+ //subgroupExclusiveAnd(val) -> mbcnt(exec & ~val) == 0
+ //subgroupExclusiveOr(val) -> mbcnt(val & exec) != 0
+ //subgroupExclusiveXor(val) -> mbcnt(val & exec) & 1 != 0
+ Temp tmp;
+ if (op == nir_op_iand)
+ tmp = bld.sop2(aco_opcode::s_andn2_b64, bld.def(s2), bld.def(s1, scc), Operand(exec, s2), src);
+ else
+ tmp = bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.def(s1, scc), src, Operand(exec, s2));
+
+ Builder::Result lohi = bld.pseudo(aco_opcode::p_split_vector, bld.def(s1), bld.def(s1), tmp);
+ Temp lo = lohi.def(0).getTemp();
+ Temp hi = lohi.def(1).getTemp();
+ Temp mbcnt = bld.vop3(aco_opcode::v_mbcnt_hi_u32_b32, bld.def(v1), hi,
+ bld.vop3(aco_opcode::v_mbcnt_lo_u32_b32, bld.def(v1), lo, Operand(0u)));
+
+ Definition cmp_def = Definition();
+ if (op == nir_op_iand)
+ cmp_def = bld.vopc(aco_opcode::v_cmp_eq_u32, bld.def(s2), Operand(0u), mbcnt).def(0);
+ else if (op == nir_op_ior)
+ cmp_def = bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(s2), Operand(0u), mbcnt).def(0);
+ else if (op == nir_op_ixor)
+ cmp_def = bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(s2),
Operand(0u), + bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(1u), mbcnt)).def(0); + cmp_def.setHint(vcc); + return cmp_def.getTemp(); +} + +Temp emit_boolean_inclusive_scan(isel_context *ctx, nir_op op, Temp src) +{ + Builder bld(ctx->program, ctx->block); + + //subgroupInclusiveAnd(val) -> subgroupExclusiveAnd(val) && val + //subgroupInclusiveOr(val) -> subgroupExclusiveOr(val) || val + //subgroupInclusiveXor(val) -> subgroupExclusiveXor(val) ^^ val + Temp tmp = emit_boolean_exclusive_scan(ctx, op, src); + if (op == nir_op_iand) + return bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.def(s1, scc), tmp, src); + else if (op == nir_op_ior) + return bld.sop2(aco_opcode::s_or_b64, bld.def(s2), bld.def(s1, scc), tmp, src); + else if (op == nir_op_ixor) + return bld.sop2(aco_opcode::s_xor_b64, bld.def(s2), bld.def(s1, scc), tmp, src); + + assert(false); + return Temp(); +} + +void emit_uniform_subgroup(isel_context *ctx, nir_intrinsic_instr *instr, Temp src) +{ + Builder bld(ctx->program, ctx->block); + Definition dst(get_ssa_temp(ctx, &instr->dest.ssa)); + if (src.regClass().type() == RegType::vgpr) { + bld.pseudo(aco_opcode::p_as_uniform, dst, src); + } else if (instr->dest.ssa.bit_size == 1 && src.regClass() == s2) { + bld.sopc(aco_opcode::s_cmp_lg_u64, bld.scc(dst), Operand(0u), Operand(src)); + } else if (src.regClass() == s1) { + bld.sop1(aco_opcode::s_mov_b32, dst, src); + } else if (src.regClass() == s2) { + bld.sop1(aco_opcode::s_mov_b64, dst, src); + } else { + fprintf(stderr, "Unimplemented NIR instr bit size: "); + nir_print_instr(&instr->instr, stderr); + fprintf(stderr, "\n"); + } +} + +void emit_interp_center(isel_context *ctx, Temp dst, Temp pos1, Temp pos2) +{ + Builder bld(ctx->program, ctx->block); + Temp p1 = ctx->fs_inputs[fs_input::persp_center_p1]; + Temp p2 = ctx->fs_inputs[fs_input::persp_center_p2]; + + /* Build DD X/Y */ + Temp tl_1 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), p1, dpp_quad_perm(0, 0, 0, 0)); + Temp ddx_1 = bld.vop2_dpp(aco_opcode::v_sub_f32, bld.def(v1), p1, tl_1, dpp_quad_perm(1, 1, 1, 1)); + Temp ddy_1 = bld.vop2_dpp(aco_opcode::v_sub_f32, bld.def(v1), p1, tl_1, dpp_quad_perm(2, 2, 2, 2)); + Temp tl_2 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), p2, dpp_quad_perm(0, 0, 0, 0)); + Temp ddx_2 = bld.vop2_dpp(aco_opcode::v_sub_f32, bld.def(v1), p2, tl_2, dpp_quad_perm(1, 1, 1, 1)); + Temp ddy_2 = bld.vop2_dpp(aco_opcode::v_sub_f32, bld.def(v1), p2, tl_2, dpp_quad_perm(2, 2, 2, 2)); + + /* res_k = p_k + ddx_k * pos1 + ddy_k * pos2 */ + Temp tmp1 = bld.vop3(aco_opcode::v_mad_f32, bld.def(v1), ddx_1, pos1, p1); + Temp tmp2 = bld.vop3(aco_opcode::v_mad_f32, bld.def(v1), ddx_2, pos1, p2); + tmp1 = bld.vop3(aco_opcode::v_mad_f32, bld.def(v1), ddy_1, pos2, tmp1); + tmp2 = bld.vop3(aco_opcode::v_mad_f32, bld.def(v1), ddy_2, pos2, tmp2); + Temp wqm1 = bld.tmp(v1); + emit_wqm(ctx, tmp1, wqm1, true); + Temp wqm2 = bld.tmp(v1); + emit_wqm(ctx, tmp2, wqm2, true); + bld.pseudo(aco_opcode::p_create_vector, Definition(dst), wqm1, wqm2); + return; +} + +void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr) +{ + Builder bld(ctx->program, ctx->block); + switch(instr->intrinsic) { + case nir_intrinsic_load_barycentric_sample: + case nir_intrinsic_load_barycentric_pixel: + case nir_intrinsic_load_barycentric_centroid: { + glsl_interp_mode mode = (glsl_interp_mode)nir_intrinsic_interp_mode(instr); + fs_input input = get_interp_input(instr->intrinsic, mode); + + Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); + if (input == 
fs_input::max_inputs) {
+ bld.pseudo(aco_opcode::p_create_vector, Definition(dst),
+ Operand(0u), Operand(0u));
+ } else {
+ bld.pseudo(aco_opcode::p_create_vector, Definition(dst),
+ ctx->fs_inputs[input],
+ ctx->fs_inputs[input + 1]);
+ }
+ emit_split_vector(ctx, dst, 2);
+ break;
+ }
+ case nir_intrinsic_load_barycentric_at_sample: {
+ uint32_t sample_pos_offset = RING_PS_SAMPLE_POSITIONS * 16;
+ switch (ctx->options->key.fs.num_samples) {
+ case 2: sample_pos_offset += 1 << 3; break;
+ case 4: sample_pos_offset += 3 << 3; break;
+ case 8: sample_pos_offset += 7 << 3; break;
+ default: break;
+ }
+ Temp sample_pos;
+ Temp addr = get_ssa_temp(ctx, instr->src[0].ssa);
+ nir_const_value* const_addr = nir_src_as_const_value(instr->src[0]);
+ if (addr.type() == RegType::sgpr) {
+ Operand offset;
+ if (const_addr) {
+ sample_pos_offset += const_addr->u32 << 3;
+ offset = Operand(sample_pos_offset);
+ } else if (ctx->options->chip_class >= GFX9) {
+ offset = bld.sop2(aco_opcode::s_lshl3_add_u32, bld.def(s1), bld.def(s1, scc), addr, Operand(sample_pos_offset));
+ } else {
+ offset = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), addr, Operand(3u));
+ offset = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), offset, Operand(sample_pos_offset));
+ }
+ addr = ctx->private_segment_buffer;
+ sample_pos = bld.smem(aco_opcode::s_load_dwordx2, bld.def(s2), addr, Operand(offset));
+
+ } else if (ctx->options->chip_class >= GFX9) {
+ addr = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(3u), addr);
+ sample_pos = bld.global(aco_opcode::global_load_dwordx2, bld.def(v2), addr, ctx->private_segment_buffer, sample_pos_offset);
+ } else {
+ /* addr += ctx->private_segment_buffer + sample_pos_offset */
+ Temp tmp0 = bld.tmp(s1);
+ Temp tmp1 = bld.tmp(s1);
+ bld.pseudo(aco_opcode::p_split_vector, Definition(tmp0), Definition(tmp1), ctx->private_segment_buffer);
+ Definition scc_tmp = bld.def(s1, scc);
+ tmp0 = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), scc_tmp, tmp0, Operand(sample_pos_offset));
+ tmp1 = bld.sop2(aco_opcode::s_addc_u32, bld.def(s1), bld.def(s1, scc), tmp1, Operand(0u), scc_tmp.getTemp());
+ addr = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(3u), addr);
+ Temp pck0 = bld.tmp(v1);
+ Temp carry = bld.vadd32(Definition(pck0), tmp0, addr, true).def(1).getTemp();
+ tmp1 = as_vgpr(ctx, tmp1);
+ Temp pck1 = bld.vop2_e64(aco_opcode::v_addc_co_u32, bld.def(v1), bld.hint_vcc(bld.def(s2)), tmp1, Operand(0u), carry);
+ addr = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), pck0, pck1);
+
+ /* sample_pos = flat_load_dwordx2 addr */
+ sample_pos = bld.flat(aco_opcode::flat_load_dwordx2, bld.def(v2), addr, Operand(s1));
+ }
+
+ /* sample_pos -= 0.5 */
+ Temp pos1 = bld.tmp(RegClass(sample_pos.type(), 1));
+ Temp pos2 = bld.tmp(RegClass(sample_pos.type(), 1));
+ bld.pseudo(aco_opcode::p_split_vector, Definition(pos1), Definition(pos2), sample_pos);
+ pos1 = bld.vop2_e64(aco_opcode::v_sub_f32, bld.def(v1), pos1, Operand(0x3f000000u));
+ pos2 = bld.vop2_e64(aco_opcode::v_sub_f32, bld.def(v1), pos2, Operand(0x3f000000u));
+
+ emit_interp_center(ctx, get_ssa_temp(ctx, &instr->dest.ssa), pos1, pos2);
+ break;
+ }
+ case nir_intrinsic_load_barycentric_at_offset: {
+ Temp offset = get_ssa_temp(ctx, instr->src[0].ssa);
+ RegClass rc = RegClass(offset.type(), 1);
+ Temp pos1 = bld.tmp(rc), pos2 = bld.tmp(rc);
+ bld.pseudo(aco_opcode::p_split_vector, Definition(pos1), Definition(pos2), offset);
+ emit_interp_center(ctx, get_ssa_temp(ctx, &instr->dest.ssa), pos1,
pos2); + break; + } + case nir_intrinsic_load_front_face: { + bld.vopc(aco_opcode::v_cmp_lg_u32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), + Operand(0u), ctx->fs_inputs[fs_input::front_face]).def(0).setHint(vcc); + break; + } + case nir_intrinsic_load_view_index: + case nir_intrinsic_load_layer_id: { + if (instr->intrinsic == nir_intrinsic_load_view_index && (ctx->stage & sw_vs)) { + Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); + bld.copy(Definition(dst), Operand(ctx->view_index)); + break; + } + + unsigned idx = nir_intrinsic_base(instr); + bld.vintrp(aco_opcode::v_interp_mov_f32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), + Operand(2u), bld.m0(ctx->prim_mask), idx, 0); + break; + } + case nir_intrinsic_load_frag_coord: { + emit_load_frag_coord(ctx, get_ssa_temp(ctx, &instr->dest.ssa), 4); + break; + } + case nir_intrinsic_load_sample_pos: { + Temp posx = ctx->fs_inputs[fs_input::frag_pos_0]; + Temp posy = ctx->fs_inputs[fs_input::frag_pos_1]; + bld.pseudo(aco_opcode::p_create_vector, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), + posx.id() ? bld.vop1(aco_opcode::v_fract_f32, bld.def(v1), posx) : Operand(0u), + posy.id() ? bld.vop1(aco_opcode::v_fract_f32, bld.def(v1), posy) : Operand(0u)); + break; + } + case nir_intrinsic_load_interpolated_input: + visit_load_interpolated_input(ctx, instr); + break; + case nir_intrinsic_store_output: + visit_store_output(ctx, instr); + break; + case nir_intrinsic_load_input: + visit_load_input(ctx, instr); + break; + case nir_intrinsic_load_ubo: + visit_load_ubo(ctx, instr); + break; + case nir_intrinsic_load_push_constant: + visit_load_push_constant(ctx, instr); + break; + case nir_intrinsic_load_constant: + visit_load_constant(ctx, instr); + break; + case nir_intrinsic_vulkan_resource_index: + visit_load_resource(ctx, instr); + break; + case nir_intrinsic_discard: + visit_discard(ctx, instr); + break; + case nir_intrinsic_discard_if: + visit_discard_if(ctx, instr); + break; + case nir_intrinsic_load_shared: + visit_load_shared(ctx, instr); + break; + case nir_intrinsic_store_shared: + visit_store_shared(ctx, instr); + break; + case nir_intrinsic_shared_atomic_add: + case nir_intrinsic_shared_atomic_imin: + case nir_intrinsic_shared_atomic_umin: + case nir_intrinsic_shared_atomic_imax: + case nir_intrinsic_shared_atomic_umax: + case nir_intrinsic_shared_atomic_and: + case nir_intrinsic_shared_atomic_or: + case nir_intrinsic_shared_atomic_xor: + case nir_intrinsic_shared_atomic_exchange: + case nir_intrinsic_shared_atomic_comp_swap: + visit_shared_atomic(ctx, instr); + break; + case nir_intrinsic_image_deref_load: + visit_image_load(ctx, instr); + break; + case nir_intrinsic_image_deref_store: + visit_image_store(ctx, instr); + break; + case nir_intrinsic_image_deref_atomic_add: + case nir_intrinsic_image_deref_atomic_umin: + case nir_intrinsic_image_deref_atomic_imin: + case nir_intrinsic_image_deref_atomic_umax: + case nir_intrinsic_image_deref_atomic_imax: + case nir_intrinsic_image_deref_atomic_and: + case nir_intrinsic_image_deref_atomic_or: + case nir_intrinsic_image_deref_atomic_xor: + case nir_intrinsic_image_deref_atomic_exchange: + case nir_intrinsic_image_deref_atomic_comp_swap: + visit_image_atomic(ctx, instr); + break; + case nir_intrinsic_image_deref_size: + visit_image_size(ctx, instr); + break; + case nir_intrinsic_load_ssbo: + visit_load_ssbo(ctx, instr); + break; + case nir_intrinsic_store_ssbo: + visit_store_ssbo(ctx, instr); + break; + case nir_intrinsic_load_global: + visit_load_global(ctx, instr); + break; + 
case nir_intrinsic_store_global: + visit_store_global(ctx, instr); + break; + case nir_intrinsic_ssbo_atomic_add: + case nir_intrinsic_ssbo_atomic_imin: + case nir_intrinsic_ssbo_atomic_umin: + case nir_intrinsic_ssbo_atomic_imax: + case nir_intrinsic_ssbo_atomic_umax: + case nir_intrinsic_ssbo_atomic_and: + case nir_intrinsic_ssbo_atomic_or: + case nir_intrinsic_ssbo_atomic_xor: + case nir_intrinsic_ssbo_atomic_exchange: + case nir_intrinsic_ssbo_atomic_comp_swap: + visit_atomic_ssbo(ctx, instr); + break; + case nir_intrinsic_load_scratch: + visit_load_scratch(ctx, instr); + break; + case nir_intrinsic_store_scratch: + visit_store_scratch(ctx, instr); + break; + case nir_intrinsic_get_buffer_size: + visit_get_buffer_size(ctx, instr); + break; + case nir_intrinsic_barrier: { + unsigned* bsize = ctx->program->info->cs.block_size; + unsigned workgroup_size = bsize[0] * bsize[1] * bsize[2]; + if (workgroup_size > 64) + bld.sopp(aco_opcode::s_barrier); + break; + } + case nir_intrinsic_group_memory_barrier: + case nir_intrinsic_memory_barrier: + case nir_intrinsic_memory_barrier_atomic_counter: + case nir_intrinsic_memory_barrier_buffer: + case nir_intrinsic_memory_barrier_image: + case nir_intrinsic_memory_barrier_shared: + emit_memory_barrier(ctx, instr); + break; + case nir_intrinsic_load_num_work_groups: + case nir_intrinsic_load_work_group_id: + case nir_intrinsic_load_local_invocation_id: { + Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); + Temp* ids; + if (instr->intrinsic == nir_intrinsic_load_num_work_groups) + ids = ctx->num_workgroups; + else if (instr->intrinsic == nir_intrinsic_load_work_group_id) + ids = ctx->workgroup_ids; + else + ids = ctx->local_invocation_ids; + bld.pseudo(aco_opcode::p_create_vector, Definition(dst), + ids[0].id() ? Operand(ids[0]) : Operand(1u), + ids[1].id() ? Operand(ids[1]) : Operand(1u), + ids[2].id() ? 
Operand(ids[2]) : Operand(1u)); + emit_split_vector(ctx, dst, 3); + break; + } + case nir_intrinsic_load_local_invocation_index: { + Temp id = bld.vop3(aco_opcode::v_mbcnt_hi_u32_b32, bld.def(v1), Operand((uint32_t) -1), + bld.vop3(aco_opcode::v_mbcnt_lo_u32_b32, bld.def(v1), Operand((uint32_t) -1), Operand(0u))); + Temp tg_num = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), Operand(0xfc0u), ctx->tg_size); + bld.vop2(aco_opcode::v_or_b32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), tg_num, id); + break; + } + case nir_intrinsic_load_subgroup_id: { + if (ctx->stage == compute_cs) { + Temp tg_num = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), Operand(0xfc0u), ctx->tg_size); + bld.sop2(aco_opcode::s_lshr_b32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), bld.def(s1, scc), tg_num, Operand(0x6u)); + } else { + bld.sop1(aco_opcode::s_mov_b32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), Operand(0x0u)); + } + break; + } + case nir_intrinsic_load_subgroup_invocation: { + bld.vop3(aco_opcode::v_mbcnt_hi_u32_b32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), Operand((uint32_t) -1), + bld.vop3(aco_opcode::v_mbcnt_lo_u32_b32, bld.def(v1), Operand((uint32_t) -1), Operand(0u))); + break; + } + case nir_intrinsic_load_num_subgroups: { + if (ctx->stage == compute_cs) + bld.sop2(aco_opcode::s_and_b32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), bld.def(s1, scc), Operand(0x3fu), ctx->tg_size); + else + bld.sop1(aco_opcode::s_mov_b32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), Operand(0x1u)); + break; + } + case nir_intrinsic_ballot: { + Definition tmp = bld.def(s2); + Temp src = get_ssa_temp(ctx, instr->src[0].ssa); + if (instr->src[0].ssa->bit_size == 1 && src.regClass() == s2) { + bld.sop2(aco_opcode::s_and_b64, tmp, bld.def(s1, scc), Operand(exec, s2), src); + } else if (instr->src[0].ssa->bit_size == 1 && src.regClass() == s1) { + bld.sop2(aco_opcode::s_cselect_b64, tmp, Operand(exec, s2), Operand(0u), bld.scc(src)); + } else if (instr->src[0].ssa->bit_size == 32 && src.regClass() == v1) { + bld.vopc(aco_opcode::v_cmp_lg_u32, tmp, Operand(0u), src); + } else if (instr->src[0].ssa->bit_size == 64 && src.regClass() == v2) { + bld.vopc(aco_opcode::v_cmp_lg_u64, tmp, Operand(0u), src); + } else { + fprintf(stderr, "Unimplemented NIR instr bit size: "); + nir_print_instr(&instr->instr, stderr); + fprintf(stderr, "\n"); + } + emit_wqm(ctx, tmp.getTemp(), get_ssa_temp(ctx, &instr->dest.ssa)); + break; + } + case nir_intrinsic_shuffle: { + Temp src = get_ssa_temp(ctx, instr->src[0].ssa); + if (!ctx->divergent_vals[instr->dest.ssa.index]) { + emit_uniform_subgroup(ctx, instr, src); + } else { + Temp tid = get_ssa_temp(ctx, instr->src[1].ssa); + assert(tid.regClass() == v1); + Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); + if (src.regClass() == v1) { + tid = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(2u), tid); + emit_wqm(ctx, bld.ds(aco_opcode::ds_bpermute_b32, bld.def(v1), tid, src), dst); + } else if (src.regClass() == v2) { + tid = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(2u), tid); + + Temp lo = bld.tmp(v1), hi = bld.tmp(v1); + bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src); + lo = emit_wqm(ctx, bld.ds(aco_opcode::ds_bpermute_b32, bld.def(v1), tid, lo)); + hi = emit_wqm(ctx, bld.ds(aco_opcode::ds_bpermute_b32, bld.def(v1), tid, hi)); + bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi); + emit_split_vector(ctx, dst, 2); + } else if (instr->dest.ssa.bit_size == 1 && 
src.regClass() == s2) { + Temp tmp = bld.vop3(aco_opcode::v_lshrrev_b64, bld.def(v2), tid, src); + tmp = emit_extract_vector(ctx, tmp, 0, v1); + tmp = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(1u), tmp); + emit_wqm(ctx, bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(s2), Operand(0u), tmp), dst); + } else { + fprintf(stderr, "Unimplemented NIR instr bit size: "); + nir_print_instr(&instr->instr, stderr); + fprintf(stderr, "\n"); + } + } + break; + } + case nir_intrinsic_load_sample_id: { + bld.vop3(aco_opcode::v_bfe_u32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), + ctx->fs_inputs[ancillary], Operand(8u), Operand(4u)); + break; + } + case nir_intrinsic_load_sample_mask_in: { + visit_load_sample_mask_in(ctx, instr); + break; + } + case nir_intrinsic_read_first_invocation: { + Temp src = get_ssa_temp(ctx, instr->src[0].ssa); + Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); + if (src.regClass() == v1) { + emit_wqm(ctx, + bld.vop1(aco_opcode::v_readfirstlane_b32, bld.def(s1), src), + dst); + } else if (src.regClass() == v2) { + Temp lo = bld.tmp(v1), hi = bld.tmp(v1); + bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src); + lo = emit_wqm(ctx, bld.vop1(aco_opcode::v_readfirstlane_b32, bld.def(s1), lo)); + hi = emit_wqm(ctx, bld.vop1(aco_opcode::v_readfirstlane_b32, bld.def(s1), hi)); + bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi); + emit_split_vector(ctx, dst, 2); + } else if (instr->dest.ssa.bit_size == 1 && src.regClass() == s2) { + emit_wqm(ctx, + bld.sopc(aco_opcode::s_bitcmp1_b64, bld.def(s1, scc), src, + bld.sop1(aco_opcode::s_ff1_i32_b64, bld.def(s1), Operand(exec, s2))), + dst); + } else if (src.regClass() == s1) { + bld.sop1(aco_opcode::s_mov_b32, Definition(dst), src); + } else if (src.regClass() == s2) { + bld.pseudo(aco_opcode::p_create_vector, Definition(dst), src); + } else { + fprintf(stderr, "Unimplemented NIR instr bit size: "); + nir_print_instr(&instr->instr, stderr); + fprintf(stderr, "\n"); + } + break; + } + case nir_intrinsic_read_invocation: { + Temp src = get_ssa_temp(ctx, instr->src[0].ssa); + Temp lane = get_ssa_temp(ctx, instr->src[1].ssa); + Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); + assert(lane.regClass() == s1); + if (src.regClass() == v1) { + emit_wqm(ctx, bld.vop3(aco_opcode::v_readlane_b32, bld.def(s1), src, lane), dst); + } else if (src.regClass() == v2) { + Temp lo = bld.tmp(v1), hi = bld.tmp(v1); + bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src); + lo = emit_wqm(ctx, bld.vop3(aco_opcode::v_readlane_b32, bld.def(s1), lo, lane)); + hi = emit_wqm(ctx, bld.vop3(aco_opcode::v_readlane_b32, bld.def(s1), hi, lane)); + bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi); + emit_split_vector(ctx, dst, 2); + } else if (instr->dest.ssa.bit_size == 1 && src.regClass() == s2) { + emit_wqm(ctx, bld.sopc(aco_opcode::s_bitcmp1_b64, bld.def(s1, scc), src, lane), dst); + } else if (src.regClass() == s1) { + bld.sop1(aco_opcode::s_mov_b32, Definition(dst), src); + } else if (src.regClass() == s2) { + bld.pseudo(aco_opcode::p_create_vector, Definition(dst), src); + } else { + fprintf(stderr, "Unimplemented NIR instr bit size: "); + nir_print_instr(&instr->instr, stderr); + fprintf(stderr, "\n"); + } + break; + } + case nir_intrinsic_vote_all: { + Temp src = as_divergent_bool(ctx, get_ssa_temp(ctx, instr->src[0].ssa), false); + Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); + assert(src.regClass() == s2); + assert(dst.regClass() == s1); + + Definition tmp = bld.def(s1); + 
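//subgroupAll(val) -> (val & exec) == exec
+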
bld.sopc(aco_opcode::s_cmp_eq_u64, bld.scc(tmp), + bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.def(s1, scc), src, Operand(exec, s2)), + Operand(exec, s2)); + emit_wqm(ctx, tmp.getTemp(), dst); + break; + } + case nir_intrinsic_vote_any: { + Temp src = as_divergent_bool(ctx, get_ssa_temp(ctx, instr->src[0].ssa), false); + Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); + assert(src.regClass() == s2); + assert(dst.regClass() == s1); + + Definition tmp = bld.def(s1); + bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.scc(tmp), src, Operand(exec, s2)); + emit_wqm(ctx, tmp.getTemp(), dst); + break; + } + case nir_intrinsic_reduce: + case nir_intrinsic_inclusive_scan: + case nir_intrinsic_exclusive_scan: { + Temp src = get_ssa_temp(ctx, instr->src[0].ssa); + Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); + nir_op op = (nir_op) nir_intrinsic_reduction_op(instr); + unsigned cluster_size = instr->intrinsic == nir_intrinsic_reduce ? + nir_intrinsic_cluster_size(instr) : 0; + cluster_size = util_next_power_of_two(MIN2(cluster_size ? cluster_size : 64, 64)); + + if (!ctx->divergent_vals[instr->src[0].ssa->index] && (op == nir_op_ior || op == nir_op_iand)) { + emit_uniform_subgroup(ctx, instr, src); + } else if (instr->dest.ssa.bit_size == 1) { + if (op == nir_op_imul || op == nir_op_umin || op == nir_op_imin) + op = nir_op_iand; + else if (op == nir_op_iadd) + op = nir_op_ixor; + else if (op == nir_op_umax || op == nir_op_imax) + op = nir_op_ior; + assert(op == nir_op_iand || op == nir_op_ior || op == nir_op_ixor); + + switch (instr->intrinsic) { + case nir_intrinsic_reduce: + emit_wqm(ctx, emit_boolean_reduce(ctx, op, cluster_size, src), dst); + break; + case nir_intrinsic_exclusive_scan: + emit_wqm(ctx, emit_boolean_exclusive_scan(ctx, op, src), dst); + break; + case nir_intrinsic_inclusive_scan: + emit_wqm(ctx, emit_boolean_inclusive_scan(ctx, op, src), dst); + break; + default: + assert(false); + } + } else if (cluster_size == 1) { + bld.copy(Definition(dst), src); + } else { + src = as_vgpr(ctx, src); + + ReduceOp reduce_op; + switch (op) { + #define CASE(name) case nir_op_##name: reduce_op = (src.regClass() == v1) ? 
name##32 : name##64; break; + CASE(iadd) + CASE(imul) + CASE(fadd) + CASE(fmul) + CASE(imin) + CASE(umin) + CASE(fmin) + CASE(imax) + CASE(umax) + CASE(fmax) + CASE(iand) + CASE(ior) + CASE(ixor) + default: + unreachable("unknown reduction op"); + #undef CASE + } + + aco_opcode aco_op; + switch (instr->intrinsic) { + case nir_intrinsic_reduce: aco_op = aco_opcode::p_reduce; break; + case nir_intrinsic_inclusive_scan: aco_op = aco_opcode::p_inclusive_scan; break; + case nir_intrinsic_exclusive_scan: aco_op = aco_opcode::p_exclusive_scan; break; + default: + unreachable("unknown reduce intrinsic"); + } + + aco_ptr<Pseudo_reduction_instruction> reduce{create_instruction<Pseudo_reduction_instruction>(aco_op, Format::PSEUDO_REDUCTION, 3, 5)}; + reduce->operands[0] = Operand(src); + // filled in by aco_reduce_assign.cpp, used internally as part of the + // reduce sequence + assert(dst.size() == 1 || dst.size() == 2); + reduce->operands[1] = Operand(RegClass(RegType::vgpr, dst.size()).as_linear()); + reduce->operands[2] = Operand(v1.as_linear()); + + Temp tmp_dst = bld.tmp(dst.regClass()); + reduce->definitions[0] = Definition(tmp_dst); + reduce->definitions[1] = bld.def(s2); // used internally + reduce->definitions[2] = Definition(); + reduce->definitions[3] = Definition(scc, s1); + reduce->definitions[4] = Definition(); + reduce->reduce_op = reduce_op; + reduce->cluster_size = cluster_size; + ctx->block->instructions.emplace_back(std::move(reduce)); + + emit_wqm(ctx, tmp_dst, dst); + } + break; + } + case nir_intrinsic_quad_broadcast: { + Temp src = get_ssa_temp(ctx, instr->src[0].ssa); + if (!ctx->divergent_vals[instr->dest.ssa.index]) { + emit_uniform_subgroup(ctx, instr, src); + } else { + Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); + unsigned lane = nir_src_as_const_value(instr->src[1])->u32; + if (instr->dest.ssa.bit_size == 1 && src.regClass() == s2) { + uint32_t half_mask = 0x11111111u << lane; + Temp mask_tmp = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand(half_mask), Operand(half_mask)); + Temp tmp = bld.tmp(s2); + bld.sop1(aco_opcode::s_wqm_b64, Definition(tmp), + bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.def(s1, scc), mask_tmp, + bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.def(s1, scc), src, Operand(exec, s2)))); + emit_wqm(ctx, tmp, dst); + } else if (instr->dest.ssa.bit_size == 32) { + emit_wqm(ctx, + bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), src, + dpp_quad_perm(lane, lane, lane, lane)), + dst); + } else if (instr->dest.ssa.bit_size == 64) { + Temp lo = bld.tmp(v1), hi = bld.tmp(v1); + bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src); + lo = emit_wqm(ctx, bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), lo, dpp_quad_perm(lane, lane, lane, lane))); + hi = emit_wqm(ctx, bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), hi, dpp_quad_perm(lane, lane, lane, lane))); + bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi); + emit_split_vector(ctx, dst, 2); + } else { + fprintf(stderr, "Unimplemented NIR instr bit size: "); + nir_print_instr(&instr->instr, stderr); + fprintf(stderr, "\n"); + } + } + break; + } + case nir_intrinsic_quad_swap_horizontal: + case nir_intrinsic_quad_swap_vertical: + case nir_intrinsic_quad_swap_diagonal: + case nir_intrinsic_quad_swizzle_amd: { + Temp src = get_ssa_temp(ctx, instr->src[0].ssa); + if (!ctx->divergent_vals[instr->dest.ssa.index]) { + emit_uniform_subgroup(ctx, instr, src); + break; + } + uint16_t dpp_ctrl = 0; + switch (instr->intrinsic) { + case 
nir_intrinsic_quad_swap_horizontal: + dpp_ctrl = dpp_quad_perm(1, 0, 3, 2); + break; + case nir_intrinsic_quad_swap_vertical: + dpp_ctrl = dpp_quad_perm(2, 3, 0, 1); + break; + case nir_intrinsic_quad_swap_diagonal: + dpp_ctrl = dpp_quad_perm(3, 2, 1, 0); + break; + case nir_intrinsic_quad_swizzle_amd: { + dpp_ctrl = nir_intrinsic_swizzle_mask(instr); + break; + } + default: + break; + } + + Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); + if (instr->dest.ssa.bit_size == 1 && src.regClass() == s2) { + src = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0u), Operand((uint32_t)-1), src); + src = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), src, dpp_ctrl); + Temp tmp = bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(s2), Operand(0u), src); + emit_wqm(ctx, tmp, dst); + } else if (instr->dest.ssa.bit_size == 32) { + Temp tmp = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), src, dpp_ctrl); + emit_wqm(ctx, tmp, dst); + } else if (instr->dest.ssa.bit_size == 64) { + Temp lo = bld.tmp(v1), hi = bld.tmp(v1); + bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src); + lo = emit_wqm(ctx, bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), lo, dpp_ctrl)); + hi = emit_wqm(ctx, bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), hi, dpp_ctrl)); + bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi); + emit_split_vector(ctx, dst, 2); + } else { + fprintf(stderr, "Unimplemented NIR instr bit size: "); + nir_print_instr(&instr->instr, stderr); + fprintf(stderr, "\n"); + } + break; + } + case nir_intrinsic_masked_swizzle_amd: { + Temp src = get_ssa_temp(ctx, instr->src[0].ssa); + if (!ctx->divergent_vals[instr->dest.ssa.index]) { + emit_uniform_subgroup(ctx, instr, src); + break; + } + Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); + uint32_t mask = nir_intrinsic_swizzle_mask(instr); + if (dst.regClass() == v1) { + emit_wqm(ctx, + bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), src, mask, 0, false), + dst); + } else if (dst.regClass() == v2) { + Temp lo = bld.tmp(v1), hi = bld.tmp(v1); + bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src); + lo = emit_wqm(ctx, bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), lo, mask, 0, false)); + hi = emit_wqm(ctx, bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), hi, mask, 0, false)); + bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi); + emit_split_vector(ctx, dst, 2); + } else { + fprintf(stderr, "Unimplemented NIR instr bit size: "); + nir_print_instr(&instr->instr, stderr); + fprintf(stderr, "\n"); + } + break; + } + case nir_intrinsic_write_invocation_amd: { + Temp src = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa)); + Temp val = bld.as_uniform(get_ssa_temp(ctx, instr->src[1].ssa)); + Temp lane = bld.as_uniform(get_ssa_temp(ctx, instr->src[2].ssa)); + Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); + if (dst.regClass() == v1) { + /* src2 is ignored for writelane. 
RA assigns the same reg for dst */
+ emit_wqm(ctx, bld.vop3(aco_opcode::v_writelane_b32, bld.def(v1), val, lane, src), dst);
+ } else if (dst.regClass() == v2) {
+ Temp src_lo = bld.tmp(v1), src_hi = bld.tmp(v1);
+ Temp val_lo = bld.tmp(s1), val_hi = bld.tmp(s1);
+ bld.pseudo(aco_opcode::p_split_vector, Definition(src_lo), Definition(src_hi), src);
+ bld.pseudo(aco_opcode::p_split_vector, Definition(val_lo), Definition(val_hi), val);
+ Temp lo = emit_wqm(ctx, bld.vop3(aco_opcode::v_writelane_b32, bld.def(v1), val_lo, lane, src_lo));
+ Temp hi = emit_wqm(ctx, bld.vop3(aco_opcode::v_writelane_b32, bld.def(v1), val_hi, lane, src_hi));
+ bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
+ emit_split_vector(ctx, dst, 2);
+ } else {
+ fprintf(stderr, "Unimplemented NIR instr bit size: ");
+ nir_print_instr(&instr->instr, stderr);
+ fprintf(stderr, "\n");
+ }
+ break;
+ }
+ case nir_intrinsic_mbcnt_amd: {
+ Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
+ RegClass rc = RegClass(src.type(), 1);
+ Temp mask_lo = bld.tmp(rc), mask_hi = bld.tmp(rc);
+ bld.pseudo(aco_opcode::p_split_vector, Definition(mask_lo), Definition(mask_hi), src);
+ Temp tmp = bld.vop3(aco_opcode::v_mbcnt_lo_u32_b32, bld.def(v1), mask_lo, Operand(0u));
+ Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
+ Temp wqm_tmp = bld.vop3(aco_opcode::v_mbcnt_hi_u32_b32, bld.def(v1), mask_hi, tmp);
+ emit_wqm(ctx, wqm_tmp, dst);
+ break;
+ }
+ case nir_intrinsic_load_helper_invocation: {
+ Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
+ bld.pseudo(aco_opcode::p_load_helper, Definition(dst));
+ ctx->block->kind |= block_kind_needs_lowering;
+ ctx->program->needs_exact = true;
+ break;
+ }
+ case nir_intrinsic_is_helper_invocation: {
+ Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
+ bld.pseudo(aco_opcode::p_is_helper, Definition(dst));
+ ctx->block->kind |= block_kind_needs_lowering;
+ ctx->program->needs_exact = true;
+ break;
+ }
+ case nir_intrinsic_demote:
+ bld.pseudo(aco_opcode::p_demote_to_helper);
+ ctx->block->kind |= block_kind_needs_lowering;
+ ctx->program->needs_exact = true;
+ break;
+ case nir_intrinsic_demote_if: {
+ Temp cond = bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.def(s1, scc),
+ as_divergent_bool(ctx, get_ssa_temp(ctx, instr->src[0].ssa), false),
+ Operand(exec, s2));
+ bld.pseudo(aco_opcode::p_demote_to_helper, cond);
+ ctx->block->kind |= block_kind_needs_lowering;
+ ctx->program->needs_exact = true;
+ break;
+ }
+ case nir_intrinsic_first_invocation: {
+ emit_wqm(ctx, bld.sop1(aco_opcode::s_ff1_i32_b64, bld.def(s1), Operand(exec, s2)),
+ get_ssa_temp(ctx, &instr->dest.ssa));
+ break;
+ }
+ case nir_intrinsic_shader_clock:
+ bld.smem(aco_opcode::s_memtime, Definition(get_ssa_temp(ctx, &instr->dest.ssa)));
+ break;
+ case nir_intrinsic_load_vertex_id_zero_base: {
+ Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
+ bld.copy(Definition(dst), ctx->vertex_id);
+ break;
+ }
+ case nir_intrinsic_load_first_vertex: {
+ Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
+ bld.copy(Definition(dst), ctx->base_vertex);
+ break;
+ }
+ case nir_intrinsic_load_base_instance: {
+ Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
+ bld.copy(Definition(dst), ctx->start_instance);
+ break;
+ }
+ case nir_intrinsic_load_instance_id: {
+ Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
+ bld.copy(Definition(dst), ctx->instance_id);
+ break;
+ }
+ case nir_intrinsic_load_draw_id: {
+ Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
+ bld.copy(Definition(dst), ctx->draw_id);
+ break;
+ }
+ default:
+ fprintf(stderr,
"Unimplemented intrinsic instr: "); + nir_print_instr(&instr->instr, stderr); + fprintf(stderr, "\n"); + abort(); + + break; + } +} + + +void tex_fetch_ptrs(isel_context *ctx, nir_tex_instr *instr, + Temp *res_ptr, Temp *samp_ptr, Temp *fmask_ptr, + enum glsl_base_type *stype) +{ + nir_deref_instr *texture_deref_instr = NULL; + nir_deref_instr *sampler_deref_instr = NULL; + int plane = -1; + + for (unsigned i = 0; i < instr->num_srcs; i++) { + switch (instr->src[i].src_type) { + case nir_tex_src_texture_deref: + texture_deref_instr = nir_src_as_deref(instr->src[i].src); + break; + case nir_tex_src_sampler_deref: + sampler_deref_instr = nir_src_as_deref(instr->src[i].src); + break; + case nir_tex_src_plane: + plane = nir_src_as_int(instr->src[i].src); + break; + default: + break; + } + } + + *stype = glsl_get_sampler_result_type(texture_deref_instr->type); + + if (!sampler_deref_instr) + sampler_deref_instr = texture_deref_instr; + + if (plane >= 0) { + assert(instr->op != nir_texop_txf_ms && + instr->op != nir_texop_samples_identical); + assert(instr->sampler_dim != GLSL_SAMPLER_DIM_BUF); + *res_ptr = get_sampler_desc(ctx, texture_deref_instr, (aco_descriptor_type)(ACO_DESC_PLANE_0 + plane), instr, false, false); + } else if (instr->sampler_dim == GLSL_SAMPLER_DIM_BUF) { + *res_ptr = get_sampler_desc(ctx, texture_deref_instr, ACO_DESC_BUFFER, instr, false, false); + } else { + *res_ptr = get_sampler_desc(ctx, texture_deref_instr, ACO_DESC_IMAGE, instr, false, false); + } + if (samp_ptr) { + *samp_ptr = get_sampler_desc(ctx, sampler_deref_instr, ACO_DESC_SAMPLER, instr, false, false); + if (instr->sampler_dim < GLSL_SAMPLER_DIM_RECT && ctx->options->chip_class < GFX8) { + fprintf(stderr, "Unimplemented sampler descriptor: "); + nir_print_instr(&instr->instr, stderr); + fprintf(stderr, "\n"); + abort(); + // TODO: build samp_ptr = and(samp_ptr, res_ptr) + } + } + if (fmask_ptr && (instr->op == nir_texop_txf_ms || + instr->op == nir_texop_samples_identical)) + *fmask_ptr = get_sampler_desc(ctx, texture_deref_instr, ACO_DESC_FMASK, instr, false, false); +} + +void build_cube_select(isel_context *ctx, Temp ma, Temp id, Temp deriv, + Temp *out_ma, Temp *out_sc, Temp *out_tc) +{ + Builder bld(ctx->program, ctx->block); + + Temp deriv_x = emit_extract_vector(ctx, deriv, 0, v1); + Temp deriv_y = emit_extract_vector(ctx, deriv, 1, v1); + Temp deriv_z = emit_extract_vector(ctx, deriv, 2, v1); + + Operand neg_one(0xbf800000u); + Operand one(0x3f800000u); + Operand two(0x40000000u); + Operand four(0x40800000u); + + Temp is_ma_positive = bld.vopc(aco_opcode::v_cmp_le_f32, bld.hint_vcc(bld.def(s2)), Operand(0u), ma); + Temp sgn_ma = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), neg_one, one, is_ma_positive); + Temp neg_sgn_ma = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1), Operand(0u), sgn_ma); + + Temp is_ma_z = bld.vopc(aco_opcode::v_cmp_le_f32, bld.hint_vcc(bld.def(s2)), four, id); + Temp is_ma_y = bld.vopc(aco_opcode::v_cmp_le_f32, bld.def(s2), two, id); + is_ma_y = bld.sop2(aco_opcode::s_andn2_b64, bld.hint_vcc(bld.def(s2)), is_ma_y, is_ma_z); + Temp is_not_ma_x = bld.sop2(aco_opcode::s_or_b64, bld.hint_vcc(bld.def(s2)), bld.def(s1, scc), is_ma_z, is_ma_y); + + // select sc + Temp tmp = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), deriv_z, deriv_x, is_not_ma_x); + Temp sgn = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), + bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), neg_sgn_ma, sgn_ma, is_ma_z), + one, is_ma_y); + *out_sc = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), tmp, sgn); 
+ + // select tc + tmp = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), deriv_y, deriv_z, is_ma_y); + sgn = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), neg_one, sgn_ma, is_ma_y); + *out_tc = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), tmp, sgn); + + // select ma + tmp = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), + bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), deriv_x, deriv_y, is_ma_y), + deriv_z, is_ma_z); + tmp = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0x7fffffffu), tmp); + *out_ma = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), two, tmp); +} + +void prepare_cube_coords(isel_context *ctx, Temp* coords, Temp* ddx, Temp* ddy, bool is_deriv, bool is_array) +{ + Builder bld(ctx->program, ctx->block); + Temp coord_args[4], ma, tc, sc, id; + for (unsigned i = 0; i < (is_array ? 4 : 3); i++) + coord_args[i] = emit_extract_vector(ctx, *coords, i, v1); + + if (is_array) { + coord_args[3] = bld.vop1(aco_opcode::v_rndne_f32, bld.def(v1), coord_args[3]); + + // see comment in ac_prepare_cube_coords() + if (ctx->options->chip_class <= GFX8) + coord_args[3] = bld.vop2(aco_opcode::v_max_f32, bld.def(v1), Operand(0u), coord_args[3]); + } + + ma = bld.vop3(aco_opcode::v_cubema_f32, bld.def(v1), coord_args[0], coord_args[1], coord_args[2]); + + aco_ptr<VOP3A_instruction> vop3a{create_instruction<VOP3A_instruction>(aco_opcode::v_rcp_f32, asVOP3(Format::VOP1), 1, 1)}; + vop3a->operands[0] = Operand(ma); + vop3a->abs[0] = true; + Temp invma = bld.tmp(v1); + vop3a->definitions[0] = Definition(invma); + ctx->block->instructions.emplace_back(std::move(vop3a)); + + sc = bld.vop3(aco_opcode::v_cubesc_f32, bld.def(v1), coord_args[0], coord_args[1], coord_args[2]); + if (!is_deriv) + sc = bld.vop2(aco_opcode::v_madak_f32, bld.def(v1), sc, invma, Operand(0x3fc00000u/*1.5*/)); + + tc = bld.vop3(aco_opcode::v_cubetc_f32, bld.def(v1), coord_args[0], coord_args[1], coord_args[2]); + if (!is_deriv) + tc = bld.vop2(aco_opcode::v_madak_f32, bld.def(v1), tc, invma, Operand(0x3fc00000u/*1.5*/)); + + id = bld.vop3(aco_opcode::v_cubeid_f32, bld.def(v1), coord_args[0], coord_args[1], coord_args[2]); + + if (is_deriv) { + sc = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), sc, invma); + tc = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), tc, invma); + + for (unsigned i = 0; i < 2; i++) { + // see comment in ac_prepare_cube_coords() + Temp deriv_ma; + Temp deriv_sc, deriv_tc; + build_cube_select(ctx, ma, id, i ? *ddy : *ddx, + &deriv_ma, &deriv_sc, &deriv_tc); + + deriv_ma = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), deriv_ma, invma); + + Temp x = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1), + bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), deriv_sc, invma), + bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), deriv_ma, sc)); + Temp y = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1), + bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), deriv_tc, invma), + bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), deriv_ma, tc)); + *(i ? 
ddy : ddx) = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), x, y); + } + + sc = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), Operand(0x3fc00000u/*1.5*/), sc); + tc = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), Operand(0x3fc00000u/*1.5*/), tc); + } + + if (is_array) + id = bld.vop2(aco_opcode::v_madmk_f32, bld.def(v1), coord_args[3], id, Operand(0x41000000u/*8.0*/)); + *coords = bld.pseudo(aco_opcode::p_create_vector, bld.def(v3), sc, tc, id); + +} + +Temp apply_round_slice(isel_context *ctx, Temp coords, unsigned idx) +{ + Temp coord_vec[3]; + for (unsigned i = 0; i < coords.size(); i++) + coord_vec[i] = emit_extract_vector(ctx, coords, i, v1); + + Builder bld(ctx->program, ctx->block); + coord_vec[idx] = bld.vop1(aco_opcode::v_rndne_f32, bld.def(v1), coord_vec[idx]); + + aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, coords.size(), 1)}; + for (unsigned i = 0; i < coords.size(); i++) + vec->operands[i] = Operand(coord_vec[i]); + Temp res = bld.tmp(RegType::vgpr, coords.size()); + vec->definitions[0] = Definition(res); + ctx->block->instructions.emplace_back(std::move(vec)); + return res; +} + +void get_const_vec(nir_ssa_def *vec, nir_const_value *cv[4]) +{ + if (vec->parent_instr->type != nir_instr_type_alu) + return; + nir_alu_instr *vec_instr = nir_instr_as_alu(vec->parent_instr); + if (vec_instr->op != nir_op_vec(vec->num_components)) + return; + + for (unsigned i = 0; i < vec->num_components; i++) { + cv[i] = vec_instr->src[i].swizzle[0] == 0 ? + nir_src_as_const_value(vec_instr->src[i].src) : NULL; + } +} + +void visit_tex(isel_context *ctx, nir_tex_instr *instr) +{ + Builder bld(ctx->program, ctx->block); + bool has_bias = false, has_lod = false, level_zero = false, has_compare = false, + has_offset = false, has_ddx = false, has_ddy = false, has_derivs = false, has_sample_index = false; + Temp resource, sampler, fmask_ptr, bias = Temp(), coords, compare = Temp(), sample_index = Temp(), + lod = Temp(), offset = Temp(), ddx = Temp(), ddy = Temp(), derivs = Temp(); + nir_const_value *sample_index_cv = NULL; + nir_const_value *const_offset[4] = {NULL, NULL, NULL, NULL}; + enum glsl_base_type stype; + tex_fetch_ptrs(ctx, instr, &resource, &sampler, &fmask_ptr, &stype); + + bool tg4_integer_workarounds = ctx->options->chip_class <= GFX8 && instr->op == nir_texop_tg4 && + (stype == GLSL_TYPE_UINT || stype == GLSL_TYPE_INT); + bool tg4_integer_cube_workaround = tg4_integer_workarounds && + instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE; + + for (unsigned i = 0; i < instr->num_srcs; i++) { + switch (instr->src[i].src_type) { + case nir_tex_src_coord: + coords = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[i].src.ssa)); + break; + case nir_tex_src_bias: + if (instr->op == nir_texop_txb) { + bias = get_ssa_temp(ctx, instr->src[i].src.ssa); + has_bias = true; + } + break; + case nir_tex_src_lod: { + nir_const_value *val = nir_src_as_const_value(instr->src[i].src); + + if (val && val->f32 <= 0.0) { + level_zero = true; + } else { + lod = get_ssa_temp(ctx, instr->src[i].src.ssa); + has_lod = true; + } + break; + } + case nir_tex_src_comparator: + if (instr->is_shadow) { + compare = get_ssa_temp(ctx, instr->src[i].src.ssa); + has_compare = true; + } + break; + case nir_tex_src_offset: + offset = get_ssa_temp(ctx, instr->src[i].src.ssa); + get_const_vec(instr->src[i].src.ssa, const_offset); + has_offset = true; + break; + case nir_tex_src_ddx: + ddx = get_ssa_temp(ctx, instr->src[i].src.ssa); + has_ddx = true; + break; + case 
nir_tex_src_ddy:
+ ddy = get_ssa_temp(ctx, instr->src[i].src.ssa);
+ has_ddy = true;
+ break;
+ case nir_tex_src_ms_index:
+ sample_index = get_ssa_temp(ctx, instr->src[i].src.ssa);
+ sample_index_cv = nir_src_as_const_value(instr->src[i].src);
+ has_sample_index = true;
+ break;
+ case nir_tex_src_texture_offset:
+ case nir_tex_src_sampler_offset:
+ default:
+ break;
+ }
+ }
+// TODO: all other cases: structure taken from ac_nir_to_llvm.c
+ if (instr->op == nir_texop_txs && instr->sampler_dim == GLSL_SAMPLER_DIM_BUF)
+ return get_buffer_size(ctx, resource, get_ssa_temp(ctx, &instr->dest.ssa), true);
+
+ if (instr->op == nir_texop_texture_samples) {
+ Temp dword3 = emit_extract_vector(ctx, resource, 3, s1);
+
+ Temp samples_log2 = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), dword3, Operand(16u | 4u<<16));
+ Temp samples = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), Operand(1u), samples_log2);
+ Temp type = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), dword3, Operand(28u | 4u<<16 /* offset=28, width=4 */));
+ Temp is_msaa = bld.sopc(aco_opcode::s_cmp_ge_u32, bld.def(s1, scc), type, Operand(14u));
+
+ bld.sop2(aco_opcode::s_cselect_b32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
+ samples, Operand(1u), bld.scc(is_msaa));
+ return;
+ }
+
+ if (has_offset && instr->op != nir_texop_txf && instr->op != nir_texop_txf_ms) {
+ aco_ptr<Instruction> tmp_instr;
+ Temp acc, pack = Temp();
+
+ uint32_t pack_const = 0;
+ for (unsigned i = 0; i < offset.size(); i++) {
+ if (!const_offset[i])
+ continue;
+ pack_const |= (const_offset[i]->u32 & 0x3Fu) << (8u * i);
+ }
+
+ if (offset.type() == RegType::sgpr) {
+ for (unsigned i = 0; i < offset.size(); i++) {
+ if (const_offset[i])
+ continue;
+
+ acc = emit_extract_vector(ctx, offset, i, s1);
+ acc = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), acc, Operand(0x3Fu));
+
+ if (i) {
+ acc = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), acc, Operand(8u * i));
+ }
+
+ if (pack == Temp()) {
+ pack = acc;
+ } else {
+ pack = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), pack, acc);
+ }
+ }
+
+ if (pack_const && pack != Temp())
+ pack = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), Operand(pack_const), pack);
+ } else {
+ for (unsigned i = 0; i < offset.size(); i++) {
+ if (const_offset[i])
+ continue;
+
+ acc = emit_extract_vector(ctx, offset, i, v1);
+ acc = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0x3Fu), acc);
+
+ if (i) {
+ acc = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(8u * i), acc);
+ }
+
+ if (pack == Temp()) {
+ pack = acc;
+ } else {
+ pack = bld.vop2(aco_opcode::v_or_b32, bld.def(v1), pack, acc);
+ }
+ }
+
+ if (pack_const && pack != Temp())
+ pack = bld.vop2(aco_opcode::v_or_b32, bld.def(v1), Operand(pack_const), pack);
+ }
+ if (pack_const && pack == Temp())
+ offset = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), Operand(pack_const));
+ else if (pack == Temp())
+ has_offset = false;
+ else
+ offset = pack;
+ }
+
+ if (instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE && instr->coord_components)
+ prepare_cube_coords(ctx, &coords, &ddx, &ddy, instr->op == nir_texop_txd, instr->is_array && instr->op != nir_texop_lod);
+
+ /* pack derivatives */
+ if (has_ddx || has_ddy) {
+ if (instr->sampler_dim == GLSL_SAMPLER_DIM_1D && ctx->options->chip_class >= GFX9) {
+ derivs = bld.pseudo(aco_opcode::p_create_vector, bld.def(v4),
+ ddx, Operand(0u), ddy, Operand(0u));
+ } else {
+ derivs =
bld.pseudo(aco_opcode::p_create_vector, bld.def(RegType::vgpr, ddx.size() + ddy.size()), ddx, ddy); + } + has_derivs = true; + } + + if (instr->coord_components > 1 && + instr->sampler_dim == GLSL_SAMPLER_DIM_1D && + instr->is_array && + instr->op != nir_texop_txf) + coords = apply_round_slice(ctx, coords, 1); + + if (instr->coord_components > 2 && + (instr->sampler_dim == GLSL_SAMPLER_DIM_2D || + instr->sampler_dim == GLSL_SAMPLER_DIM_MS || + instr->sampler_dim == GLSL_SAMPLER_DIM_SUBPASS || + instr->sampler_dim == GLSL_SAMPLER_DIM_SUBPASS_MS) && + instr->is_array && + instr->op != nir_texop_txf && instr->op != nir_texop_txf_ms) + coords = apply_round_slice(ctx, coords, 2); + + if (ctx->options->chip_class >= GFX9 && + instr->sampler_dim == GLSL_SAMPLER_DIM_1D && + instr->op != nir_texop_lod && instr->coord_components) { + assert(coords.size() > 0 && coords.size() < 3); + + aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, coords.size() + 1, 1)}; + vec->operands[0] = Operand(emit_extract_vector(ctx, coords, 0, v1)); + vec->operands[1] = instr->op == nir_texop_txf ? Operand((uint32_t) 0) : Operand((uint32_t) 0x3f000000); + if (coords.size() > 1) + vec->operands[2] = Operand(emit_extract_vector(ctx, coords, 1, v1)); + coords = bld.tmp(RegType::vgpr, coords.size() + 1); + vec->definitions[0] = Definition(coords); + ctx->block->instructions.emplace_back(std::move(vec)); + } + + bool da = should_declare_array(ctx, instr->sampler_dim, instr->is_array); + + if (instr->op == nir_texop_samples_identical) + resource = fmask_ptr; + + else if ((instr->sampler_dim == GLSL_SAMPLER_DIM_MS || + instr->sampler_dim == GLSL_SAMPLER_DIM_SUBPASS_MS) && + instr->op != nir_texop_txs) { + assert(has_sample_index); + Operand op(sample_index); + if (sample_index_cv) + op = Operand(sample_index_cv->u32); + sample_index = adjust_sample_index_using_fmask(ctx, da, coords, op, fmask_ptr); + } + + if (has_offset && (instr->op == nir_texop_txf || instr->op == nir_texop_txf_ms)) { + Temp split_coords[coords.size()]; + emit_split_vector(ctx, coords, coords.size()); + for (unsigned i = 0; i < coords.size(); i++) + split_coords[i] = emit_extract_vector(ctx, coords, i, v1); + + unsigned i = 0; + for (; i < std::min(offset.size(), instr->coord_components); i++) { + Temp off = emit_extract_vector(ctx, offset, i, v1); + split_coords[i] = bld.vadd32(bld.def(v1), split_coords[i], off); + } + + aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, coords.size(), 1)}; + for (unsigned i = 0; i < coords.size(); i++) + vec->operands[i] = Operand(split_coords[i]); + coords = bld.tmp(coords.regClass()); + vec->definitions[0] = Definition(coords); + ctx->block->instructions.emplace_back(std::move(vec)); + + has_offset = false; + } + + /* Build tex instruction */ + unsigned dmask = nir_ssa_def_components_read(&instr->dest.ssa); + Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); + Temp tmp_dst = dst; + + /* gather4 selects the component by dmask and always returns vec4 */ + if (instr->op == nir_texop_tg4) { + assert(instr->dest.ssa.num_components == 4); + if (instr->is_shadow) + dmask = 1; + else + dmask = 1 << instr->component; + if (tg4_integer_cube_workaround || dst.type() == RegType::sgpr) + tmp_dst = bld.tmp(v4); + } else if (instr->op == nir_texop_samples_identical) { + tmp_dst = bld.tmp(v1); + } else if (util_bitcount(dmask) != instr->dest.ssa.num_components || dst.type() == RegType::sgpr) { + tmp_dst = 
bld.tmp(RegClass(RegType::vgpr, util_bitcount(dmask))); + } + + aco_ptr<MIMG_instruction> tex; + if (instr->op == nir_texop_txs || instr->op == nir_texop_query_levels) { + if (!has_lod) + lod = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), Operand(0u)); + + bool div_by_6 = instr->op == nir_texop_txs && + instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE && + instr->is_array && + (dmask & (1 << 2)); + if (tmp_dst.id() == dst.id() && div_by_6) + tmp_dst = bld.tmp(tmp_dst.regClass()); + + tex.reset(create_instruction<MIMG_instruction>(aco_opcode::image_get_resinfo, Format::MIMG, 2, 1)); + tex->operands[0] = Operand(as_vgpr(ctx,lod)); + tex->operands[1] = Operand(resource); + if (ctx->options->chip_class >= GFX9 && + instr->op == nir_texop_txs && + instr->sampler_dim == GLSL_SAMPLER_DIM_1D && + instr->is_array) { + tex->dmask = (dmask & 0x1) | ((dmask & 0x2) << 1); + } else if (instr->op == nir_texop_query_levels) { + tex->dmask = 1 << 3; + } else { + tex->dmask = dmask; + } + tex->da = da; + tex->definitions[0] = Definition(tmp_dst); + tex->can_reorder = true; + ctx->block->instructions.emplace_back(std::move(tex)); + + if (div_by_6) { + /* divide 3rd value by 6 by multiplying with magic number */ + emit_split_vector(ctx, tmp_dst, tmp_dst.size()); + Temp c = bld.copy(bld.def(s1), Operand((uint32_t) 0x2AAAAAAB)); + Temp by_6 = bld.vop3(aco_opcode::v_mul_hi_i32, bld.def(v1), emit_extract_vector(ctx, tmp_dst, 2, v1), c); + assert(instr->dest.ssa.num_components == 3); + Temp tmp = dst.type() == RegType::vgpr ? dst : bld.tmp(v3); + tmp_dst = bld.pseudo(aco_opcode::p_create_vector, Definition(tmp), + emit_extract_vector(ctx, tmp_dst, 0, v1), + emit_extract_vector(ctx, tmp_dst, 1, v1), + by_6); + + } + + expand_vector(ctx, tmp_dst, dst, instr->dest.ssa.num_components, dmask); + return; + } + + Temp tg4_compare_cube_wa64 = Temp(); + + if (tg4_integer_workarounds) { + tex.reset(create_instruction<MIMG_instruction>(aco_opcode::image_get_resinfo, Format::MIMG, 2, 1)); + tex->operands[0] = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), Operand(0u)); + tex->operands[1] = Operand(resource); + tex->dmask = 0x3; + tex->da = da; + Temp size = bld.tmp(v2); + tex->definitions[0] = Definition(size); + tex->can_reorder = true; + ctx->block->instructions.emplace_back(std::move(tex)); + emit_split_vector(ctx, size, size.size()); + + Temp half_texel[2]; + for (unsigned i = 0; i < 2; i++) { + half_texel[i] = emit_extract_vector(ctx, size, i, v1); + half_texel[i] = bld.vop1(aco_opcode::v_cvt_f32_i32, bld.def(v1), half_texel[i]); + half_texel[i] = bld.vop1(aco_opcode::v_rcp_iflag_f32, bld.def(v1), half_texel[i]); + half_texel[i] = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand(0xbf000000/*-0.5*/), half_texel[i]); + } + + Temp orig_coords[2] = { + emit_extract_vector(ctx, coords, 0, v1), + emit_extract_vector(ctx, coords, 1, v1)}; + Temp new_coords[2] = { + bld.vop2(aco_opcode::v_add_f32, bld.def(v1), orig_coords[0], half_texel[0]), + bld.vop2(aco_opcode::v_add_f32, bld.def(v1), orig_coords[1], half_texel[1]) + }; + + if (tg4_integer_cube_workaround) { + // see comment in ac_nir_to_llvm.c's lower_gather4_integer() + Temp desc[resource.size()]; + aco_ptr<Instruction> split{create_instruction<Pseudo_instruction>(aco_opcode::p_split_vector, + Format::PSEUDO, 1, resource.size())}; + split->operands[0] = Operand(resource); + for (unsigned i = 0; i < resource.size(); i++) { + desc[i] = bld.tmp(s1); + split->definitions[i] = Definition(desc[i]); + } + ctx->block->instructions.emplace_back(std::move(split)); + + Temp dfmt = 
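+ /* s_bfe_u32 src1 encodes offset | (width << 16): extract the 6-bit DATA_FORMAT field at bit 20 of the descriptor's second dword */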
bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), desc[1], Operand(20u | (6u << 16))); + Temp compare_cube_wa = bld.sopc(aco_opcode::s_cmp_eq_u32, bld.def(s1, scc), dfmt, + Operand((uint32_t)V_008F14_IMG_DATA_FORMAT_8_8_8_8)); + + Temp nfmt; + if (stype == GLSL_TYPE_UINT) { + nfmt = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1), + Operand((uint32_t)V_008F14_IMG_NUM_FORMAT_USCALED), + Operand((uint32_t)V_008F14_IMG_NUM_FORMAT_UINT), + bld.scc(compare_cube_wa)); + } else { + nfmt = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1), + Operand((uint32_t)V_008F14_IMG_NUM_FORMAT_SSCALED), + Operand((uint32_t)V_008F14_IMG_NUM_FORMAT_SINT), + bld.scc(compare_cube_wa)); + } + tg4_compare_cube_wa64 = as_divergent_bool(ctx, compare_cube_wa, true); + nfmt = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), nfmt, Operand(26u)); + + desc[1] = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), desc[1], + Operand((uint32_t)C_008F14_NUM_FORMAT)); + desc[1] = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), desc[1], nfmt); + + aco_ptr<Instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, + Format::PSEUDO, resource.size(), 1)}; + for (unsigned i = 0; i < resource.size(); i++) + vec->operands[i] = Operand(desc[i]); + resource = bld.tmp(resource.regClass()); + vec->definitions[0] = Definition(resource); + ctx->block->instructions.emplace_back(std::move(vec)); + + new_coords[0] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), + new_coords[0], orig_coords[0], tg4_compare_cube_wa64); + new_coords[1] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), + new_coords[1], orig_coords[1], tg4_compare_cube_wa64); + } + + if (coords.size() == 3) { + coords = bld.pseudo(aco_opcode::p_create_vector, bld.def(v3), + new_coords[0], new_coords[1], + emit_extract_vector(ctx, coords, 2, v1)); + } else { + assert(coords.size() == 2); + coords = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), + new_coords[0], new_coords[1]); + } + } + + if (!(has_ddx && has_ddy) && !has_lod && !level_zero && + instr->sampler_dim != GLSL_SAMPLER_DIM_MS && + instr->sampler_dim != GLSL_SAMPLER_DIM_SUBPASS_MS) + coords = emit_wqm(ctx, coords, bld.tmp(coords.regClass()), true); + + std::vector<Operand> args; + if (has_offset) + args.emplace_back(Operand(offset)); + if (has_bias) + args.emplace_back(Operand(bias)); + if (has_compare) + args.emplace_back(Operand(compare)); + if (has_derivs) + args.emplace_back(Operand(derivs)); + args.emplace_back(Operand(coords)); + if (has_sample_index) + args.emplace_back(Operand(sample_index)); + if (has_lod) + args.emplace_back(lod); + + Operand arg; + if (args.size() > 1) { + aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, args.size(), 1)}; + unsigned size = 0; + for (unsigned i = 0; i < args.size(); i++) { + size += args[i].size(); + vec->operands[i] = args[i]; + } + RegClass rc = RegClass(RegType::vgpr, size); + Temp tmp = bld.tmp(rc); + vec->definitions[0] = Definition(tmp); + ctx->block->instructions.emplace_back(std::move(vec)); + arg = Operand(tmp); + } else { + assert(args[0].isTemp()); + arg = Operand(as_vgpr(ctx, args[0].getTemp())); + } + + if (instr->sampler_dim == GLSL_SAMPLER_DIM_BUF) { + //FIXME: if (ctx->abi->gfx9_stride_size_workaround) return ac_build_buffer_load_format_gfx9_safe() + + assert(coords.size() == 1); + unsigned last_bit = util_last_bit(nir_ssa_def_components_read(&instr->dest.ssa)); + aco_opcode op; + switch (last_bit) { + case 1: 
+ op = aco_opcode::buffer_load_format_x; break; + case 2: + op = aco_opcode::buffer_load_format_xy; break; + case 3: + op = aco_opcode::buffer_load_format_xyz; break; + case 4: + op = aco_opcode::buffer_load_format_xyzw; break; + default: + unreachable("Tex instruction loads more than 4 components."); + } + + /* if the instruction return value matches exactly the nir dest ssa, we can use it directly */ + if (last_bit == instr->dest.ssa.num_components && dst.type() == RegType::vgpr) + tmp_dst = dst; + else + tmp_dst = bld.tmp(RegType::vgpr, last_bit); + + aco_ptr<MUBUF_instruction> mubuf{create_instruction<MUBUF_instruction>(op, Format::MUBUF, 3, 1)}; + mubuf->operands[0] = Operand(coords); + mubuf->operands[1] = Operand(resource); + mubuf->operands[2] = Operand((uint32_t) 0); + mubuf->definitions[0] = Definition(tmp_dst); + mubuf->idxen = true; + mubuf->can_reorder = true; + ctx->block->instructions.emplace_back(std::move(mubuf)); + + expand_vector(ctx, tmp_dst, dst, instr->dest.ssa.num_components, (1 << last_bit) - 1); + return; + } + + + if (instr->op == nir_texop_txf || + instr->op == nir_texop_txf_ms || + instr->op == nir_texop_samples_identical) { + aco_opcode op = level_zero || instr->sampler_dim == GLSL_SAMPLER_DIM_MS ? aco_opcode::image_load : aco_opcode::image_load_mip; + tex.reset(create_instruction<MIMG_instruction>(op, Format::MIMG, 2, 1)); + tex->operands[0] = Operand(arg); + tex->operands[1] = Operand(resource); + tex->dmask = dmask; + tex->unrm = true; + tex->da = da; + tex->definitions[0] = Definition(tmp_dst); + tex->can_reorder = true; + ctx->block->instructions.emplace_back(std::move(tex)); + + if (instr->op == nir_texop_samples_identical) { + assert(dmask == 1 && dst.regClass() == v1); + assert(dst.id() != tmp_dst.id()); + + Temp tmp = bld.tmp(s2); + bld.vopc(aco_opcode::v_cmp_eq_u32, Definition(tmp), Operand(0u), tmp_dst).def(0).setHint(vcc); + bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), Operand(0u), Operand((uint32_t)-1), tmp); + + } else { + expand_vector(ctx, tmp_dst, dst, instr->dest.ssa.num_components, dmask); + } + return; + } + + // TODO: would be better to do this by adding offsets, but needs the opcodes ordered. 
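+ /* select the image_sample variant from the flags gathered above: _c = compare, _d = derivatives, _b = bias, _lz = force level 0, _l = explicit lod, _o = packed texel offsets; later checks in each branch take precedence */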
+ aco_opcode opcode = aco_opcode::image_sample; + if (has_offset) { /* image_sample_*_o */ + if (has_compare) { + opcode = aco_opcode::image_sample_c_o; + if (has_derivs) + opcode = aco_opcode::image_sample_c_d_o; + if (has_bias) + opcode = aco_opcode::image_sample_c_b_o; + if (level_zero) + opcode = aco_opcode::image_sample_c_lz_o; + if (has_lod) + opcode = aco_opcode::image_sample_c_l_o; + } else { + opcode = aco_opcode::image_sample_o; + if (has_derivs) + opcode = aco_opcode::image_sample_d_o; + if (has_bias) + opcode = aco_opcode::image_sample_b_o; + if (level_zero) + opcode = aco_opcode::image_sample_lz_o; + if (has_lod) + opcode = aco_opcode::image_sample_l_o; + } + } else { /* no offset */ + if (has_compare) { + opcode = aco_opcode::image_sample_c; + if (has_derivs) + opcode = aco_opcode::image_sample_c_d; + if (has_bias) + opcode = aco_opcode::image_sample_c_b; + if (level_zero) + opcode = aco_opcode::image_sample_c_lz; + if (has_lod) + opcode = aco_opcode::image_sample_c_l; + } else { + opcode = aco_opcode::image_sample; + if (has_derivs) + opcode = aco_opcode::image_sample_d; + if (has_bias) + opcode = aco_opcode::image_sample_b; + if (level_zero) + opcode = aco_opcode::image_sample_lz; + if (has_lod) + opcode = aco_opcode::image_sample_l; + } + } + + if (instr->op == nir_texop_tg4) { + if (has_offset) { + opcode = aco_opcode::image_gather4_lz_o; + if (has_compare) + opcode = aco_opcode::image_gather4_c_lz_o; + } else { + opcode = aco_opcode::image_gather4_lz; + if (has_compare) + opcode = aco_opcode::image_gather4_c_lz; + } + } else if (instr->op == nir_texop_lod) { + opcode = aco_opcode::image_get_lod; + } + + tex.reset(create_instruction<MIMG_instruction>(opcode, Format::MIMG, 3, 1)); + tex->operands[0] = arg; + tex->operands[1] = Operand(resource); + tex->operands[2] = Operand(sampler); + tex->dmask = dmask; + tex->da = da; + tex->definitions[0] = Definition(tmp_dst); + tex->can_reorder = true; + ctx->block->instructions.emplace_back(std::move(tex)); + + if (tg4_integer_cube_workaround) { + assert(tmp_dst.id() != dst.id()); + assert(tmp_dst.size() == dst.size() && dst.size() == 4); + + emit_split_vector(ctx, tmp_dst, tmp_dst.size()); + Temp val[4]; + for (unsigned i = 0; i < dst.size(); i++) { + val[i] = emit_extract_vector(ctx, tmp_dst, i, v1); + Temp cvt_val; + if (stype == GLSL_TYPE_UINT) + cvt_val = bld.vop1(aco_opcode::v_cvt_u32_f32, bld.def(v1), val[i]); + else + cvt_val = bld.vop1(aco_opcode::v_cvt_i32_f32, bld.def(v1), val[i]); + val[i] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), val[i], cvt_val, tg4_compare_cube_wa64); + } + Temp tmp = dst.regClass() == v4 ? dst : bld.tmp(v4); + tmp_dst = bld.pseudo(aco_opcode::p_create_vector, Definition(tmp), + val[0], val[1], val[2], val[3]); + } + unsigned mask = instr->op == nir_texop_tg4 ? 0xF : dmask; + expand_vector(ctx, tmp_dst, dst, instr->dest.ssa.num_components, mask); + +} + + +Operand get_phi_operand(isel_context *ctx, nir_ssa_def *ssa) +{ + Temp tmp = get_ssa_temp(ctx, ssa); + if (ssa->parent_instr->type == nir_instr_type_ssa_undef) + return Operand(tmp.regClass()); + else + return Operand(tmp); +} + +void visit_phi(isel_context *ctx, nir_phi_instr *instr) +{ + aco_ptr<Pseudo_instruction> phi; + unsigned num_src = exec_list_length(&instr->srcs); + Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); + + aco_opcode opcode = !dst.is_linear() || ctx->divergent_vals[instr->dest.ssa.index] ? 
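+ /* divergent values and VGPR temporaries need a logical phi (p_phi, per-lane over the logical predecessors); uniform linear temporaries use p_linear_phi over the linear predecessors */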
aco_opcode::p_phi : aco_opcode::p_linear_phi; + + std::map<unsigned, nir_ssa_def*> phi_src; + bool all_undef = true; + nir_foreach_phi_src(src, instr) { + phi_src[src->pred->index] = src->src.ssa; + if (src->src.ssa->parent_instr->type != nir_instr_type_ssa_undef) + all_undef = false; + } + if (all_undef) { + Builder bld(ctx->program, ctx->block); + if (dst.regClass() == s1) { + bld.sop1(aco_opcode::s_mov_b32, Definition(dst), Operand(0u)); + } else if (dst.regClass() == v1) { + bld.vop1(aco_opcode::v_mov_b32, Definition(dst), Operand(0u)); + } else { + aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, dst.size(), 1)}; + for (unsigned i = 0; i < dst.size(); i++) + vec->operands[i] = Operand(0u); + vec->definitions[0] = Definition(dst); + ctx->block->instructions.emplace_back(std::move(vec)); + } + return; + } + + /* try to scalarize vector phis */ + if (dst.size() > 1) { + // TODO: scalarize linear phis on divergent ifs + bool can_scalarize = (opcode == aco_opcode::p_phi || !(ctx->block->kind & block_kind_merge)); + std::array<Temp, 4> new_vec; + for (std::pair<const unsigned, nir_ssa_def*>& pair : phi_src) { + Operand src = get_phi_operand(ctx, pair.second); + if (src.isTemp() && ctx->allocated_vec.find(src.tempId()) == ctx->allocated_vec.end()) { + can_scalarize = false; + break; + } + } + if (can_scalarize) { + unsigned num_components = instr->dest.ssa.num_components; + assert(dst.size() % num_components == 0); + RegClass rc = RegClass(dst.type(), dst.size() / num_components); + + aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, num_components, 1)}; + for (unsigned k = 0; k < num_components; k++) { + phi.reset(create_instruction<Pseudo_instruction>(opcode, Format::PSEUDO, num_src, 1)); + std::map<unsigned, nir_ssa_def*>::iterator it = phi_src.begin(); + for (unsigned i = 0; i < num_src; i++) { + Operand src = get_phi_operand(ctx, it->second); + phi->operands[i] = src.isTemp() ? 
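+ /* use the k-th component of the already scalarized source vector, or an undef operand of the element register class */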
Operand(ctx->allocated_vec[src.tempId()][k]) : Operand(rc); + ++it; + } + Temp phi_dst = {ctx->program->allocateId(), rc}; + phi->definitions[0] = Definition(phi_dst); + ctx->block->instructions.emplace(ctx->block->instructions.begin(), std::move(phi)); + new_vec[k] = phi_dst; + vec->operands[k] = Operand(phi_dst); + } + vec->definitions[0] = Definition(dst); + ctx->block->instructions.emplace_back(std::move(vec)); + ctx->allocated_vec.emplace(dst.id(), new_vec); + return; + } + } + + unsigned extra_src = 0; + if (opcode == aco_opcode::p_linear_phi && (ctx->block->kind & block_kind_loop_exit) && + ctx->program->blocks[ctx->block->index-2].kind & block_kind_continue_or_break) { + extra_src++; + } + + phi.reset(create_instruction<Pseudo_instruction>(opcode, Format::PSEUDO, num_src + extra_src, 1)); + + /* if we have a linear phi on a divergent if, we know that one src is undef */ + if (opcode == aco_opcode::p_linear_phi && ctx->block->kind & block_kind_merge) { + assert(extra_src == 0); + Block* block; + /* we place the phi either in the invert-block or in the current block */ + if (phi_src.begin()->second->parent_instr->type != nir_instr_type_ssa_undef) { + assert((++phi_src.begin())->second->parent_instr->type == nir_instr_type_ssa_undef); + Block& linear_else = ctx->program->blocks[ctx->block->linear_preds[1]]; + block = &ctx->program->blocks[linear_else.linear_preds[0]]; + assert(block->kind & block_kind_invert); + phi->operands[0] = get_phi_operand(ctx, phi_src.begin()->second); + } else { + assert((++phi_src.begin())->second->parent_instr->type != nir_instr_type_ssa_undef); + block = ctx->block; + phi->operands[0] = get_phi_operand(ctx, (++phi_src.begin())->second); + } + phi->operands[1] = Operand(dst.regClass()); + phi->definitions[0] = Definition(dst); + block->instructions.emplace(block->instructions.begin(), std::move(phi)); + return; + } + + std::map<unsigned, nir_ssa_def*>::iterator it = phi_src.begin(); + for (unsigned i = 0; i < num_src; i++) { + phi->operands[i] = get_phi_operand(ctx, it->second); + ++it; + } + for (unsigned i = 0; i < extra_src; i++) + phi->operands[num_src + i] = Operand(dst.regClass()); + phi->definitions[0] = Definition(dst); + ctx->block->instructions.emplace(ctx->block->instructions.begin(), std::move(phi)); +} + + +void visit_undef(isel_context *ctx, nir_ssa_undef_instr *instr) +{ + Temp dst = get_ssa_temp(ctx, &instr->def); + + assert(dst.type() == RegType::sgpr); + + if (dst.size() == 1) { + Builder(ctx->program, ctx->block).copy(Definition(dst), Operand(0u)); + } else { + aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, dst.size(), 1)}; + for (unsigned i = 0; i < dst.size(); i++) + vec->operands[i] = Operand(0u); + vec->definitions[0] = Definition(dst); + ctx->block->instructions.emplace_back(std::move(vec)); + } +} + +void visit_jump(isel_context *ctx, nir_jump_instr *instr) +{ + Builder bld(ctx->program, ctx->block); + Block *logical_target; + append_logical_end(ctx->block); + unsigned idx = ctx->block->index; + + switch (instr->type) { + case nir_jump_break: + logical_target = ctx->cf_info.parent_loop.exit; + add_logical_edge(idx, logical_target); + ctx->block->kind |= block_kind_break; + + if (!ctx->cf_info.parent_if.is_divergent && + !ctx->cf_info.parent_loop.has_divergent_continue) { + /* uniform break - directly jump out of the loop */ + ctx->block->kind |= block_kind_uniform; + ctx->cf_info.has_branch = true; + bld.branch(aco_opcode::p_branch); + add_linear_edge(idx, 
logical_target); + return; + } + ctx->cf_info.parent_loop.has_divergent_branch = true; + break; + case nir_jump_continue: + logical_target = &ctx->program->blocks[ctx->cf_info.parent_loop.header_idx]; + add_logical_edge(idx, logical_target); + ctx->block->kind |= block_kind_continue; + + if (ctx->cf_info.parent_if.is_divergent) { + /* for potential uniform breaks after this continue, + we must ensure that they are handled correctly */ + ctx->cf_info.parent_loop.has_divergent_continue = true; + ctx->cf_info.parent_loop.has_divergent_branch = true; + } else { + /* uniform continue - directly jump to the loop header */ + ctx->block->kind |= block_kind_uniform; + ctx->cf_info.has_branch = true; + bld.branch(aco_opcode::p_branch); + add_linear_edge(idx, logical_target); + return; + } + break; + default: + fprintf(stderr, "Unknown NIR jump instr: "); + nir_print_instr(&instr->instr, stderr); + fprintf(stderr, "\n"); + abort(); + } + + /* remove critical edges from linear CFG */ + bld.branch(aco_opcode::p_branch); + Block* break_block = ctx->program->create_and_insert_block(); + break_block->loop_nest_depth = ctx->cf_info.loop_nest_depth; + break_block->kind |= block_kind_uniform; + add_linear_edge(idx, break_block); + /* the loop_header pointer might be invalidated by this point */ + if (instr->type == nir_jump_continue) + logical_target = &ctx->program->blocks[ctx->cf_info.parent_loop.header_idx]; + add_linear_edge(break_block->index, logical_target); + bld.reset(break_block); + bld.branch(aco_opcode::p_branch); + + Block* continue_block = ctx->program->create_and_insert_block(); + continue_block->loop_nest_depth = ctx->cf_info.loop_nest_depth; + add_linear_edge(idx, continue_block); + append_logical_start(continue_block); + ctx->block = continue_block; + return; +} + +void visit_block(isel_context *ctx, nir_block *block) +{ + nir_foreach_instr(instr, block) { + switch (instr->type) { + case nir_instr_type_alu: + visit_alu_instr(ctx, nir_instr_as_alu(instr)); + break; + case nir_instr_type_load_const: + visit_load_const(ctx, nir_instr_as_load_const(instr)); + break; + case nir_instr_type_intrinsic: + visit_intrinsic(ctx, nir_instr_as_intrinsic(instr)); + break; + case nir_instr_type_tex: + visit_tex(ctx, nir_instr_as_tex(instr)); + break; + case nir_instr_type_phi: + visit_phi(ctx, nir_instr_as_phi(instr)); + break; + case nir_instr_type_ssa_undef: + visit_undef(ctx, nir_instr_as_ssa_undef(instr)); + break; + case nir_instr_type_deref: + break; + case nir_instr_type_jump: + visit_jump(ctx, nir_instr_as_jump(instr)); + break; + default: + fprintf(stderr, "Unknown NIR instr type: "); + nir_print_instr(instr, stderr); + fprintf(stderr, "\n"); + //abort(); + } + } +} + + + +static void visit_loop(isel_context *ctx, nir_loop *loop) +{ + append_logical_end(ctx->block); + ctx->block->kind |= block_kind_loop_preheader | block_kind_uniform; + Builder bld(ctx->program, ctx->block); + bld.branch(aco_opcode::p_branch); + unsigned loop_preheader_idx = ctx->block->index; + + Block loop_exit = Block(); + loop_exit.loop_nest_depth = ctx->cf_info.loop_nest_depth; + loop_exit.kind |= (block_kind_loop_exit | (ctx->block->kind & block_kind_top_level)); + + Block* loop_header = ctx->program->create_and_insert_block(); + loop_header->loop_nest_depth = ctx->cf_info.loop_nest_depth + 1; + loop_header->kind |= block_kind_loop_header; + add_edge(loop_preheader_idx, loop_header); + ctx->block = loop_header; + + /* emit loop body */ + unsigned loop_header_idx = loop_header->index; + loop_info_RAII loop_raii(ctx, 
loop_header_idx, &loop_exit); + append_logical_start(ctx->block); + visit_cf_list(ctx, &loop->body); + + //TODO: what if a loop ends with a unconditional or uniformly branched continue and this branch is never taken? + if (!ctx->cf_info.has_branch) { + append_logical_end(ctx->block); + if (ctx->cf_info.exec_potentially_empty) { + /* Discards can result in code running with an empty exec mask. + * This would result in divergent breaks not ever being taken. As a + * workaround, break the loop when the loop mask is empty instead of + * always continuing. */ + ctx->block->kind |= (block_kind_continue_or_break | block_kind_uniform); + + /* create "loop_almost_exit" to avoid critical edges */ + unsigned block_idx = ctx->block->index; + Block *loop_almost_exit = ctx->program->create_and_insert_block(); + loop_almost_exit->loop_nest_depth = ctx->cf_info.loop_nest_depth; + loop_almost_exit->kind = block_kind_uniform; + bld.reset(loop_almost_exit); + bld.branch(aco_opcode::p_branch); + + add_linear_edge(block_idx, loop_almost_exit); + add_linear_edge(loop_almost_exit->index, &loop_exit); + + ctx->block = &ctx->program->blocks[block_idx]; + } else { + ctx->block->kind |= (block_kind_continue | block_kind_uniform); + } + if (!ctx->cf_info.parent_loop.has_divergent_branch) + add_edge(ctx->block->index, &ctx->program->blocks[loop_header_idx]); + else + add_linear_edge(ctx->block->index, &ctx->program->blocks[loop_header_idx]); + bld.reset(ctx->block); + bld.branch(aco_opcode::p_branch); + } + + /* fixup phis in loop header from unreachable blocks */ + if (ctx->cf_info.has_branch || ctx->cf_info.parent_loop.has_divergent_branch) { + bool linear = ctx->cf_info.has_branch; + bool logical = ctx->cf_info.has_branch || ctx->cf_info.parent_loop.has_divergent_branch; + for (aco_ptr<Instruction>& instr : ctx->program->blocks[loop_header_idx].instructions) { + if ((logical && instr->opcode == aco_opcode::p_phi) || + (linear && instr->opcode == aco_opcode::p_linear_phi)) { + /* the last operand should be the one that needs to be removed */ + instr->operands.pop_back(); + } else if (!is_phi(instr)) { + break; + } + } + } + + ctx->cf_info.has_branch = false; + + // TODO: if the loop has not a single exit, we must add one °° + /* emit loop successor block */ + ctx->block = ctx->program->insert_block(std::move(loop_exit)); + append_logical_start(ctx->block); + + #if 0 + // TODO: check if it is beneficial to not branch on continues + /* trim linear phis in loop header */ + for (auto&& instr : loop_entry->instructions) { + if (instr->opcode == aco_opcode::p_linear_phi) { + aco_ptr<Pseudo_instruction> new_phi{create_instruction<Pseudo_instruction>(aco_opcode::p_linear_phi, Format::PSEUDO, loop_entry->linear_predecessors.size(), 1)}; + new_phi->definitions[0] = instr->definitions[0]; + for (unsigned i = 0; i < new_phi->operands.size(); i++) + new_phi->operands[i] = instr->operands[i]; + /* check that the remaining operands are all the same */ + for (unsigned i = new_phi->operands.size(); i < instr->operands.size(); i++) + assert(instr->operands[i].tempId() == instr->operands.back().tempId()); + instr.swap(new_phi); + } else if (instr->opcode == aco_opcode::p_phi) { + continue; + } else { + break; + } + } + #endif +} + +static void begin_divergent_if_then(isel_context *ctx, if_context *ic, Temp cond) +{ + ic->cond = cond; + + append_logical_end(ctx->block); + ctx->block->kind |= block_kind_branch; + + /* branch to linear then block */ + assert(cond.regClass() == s2); + aco_ptr<Pseudo_branch_instruction> branch; + 
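+ /* attach the divergent condition to a pseudo conditional branch; the actual exec-mask save/restore for the then/else sides is inserted by later passes */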
branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_cbranch_z, Format::PSEUDO_BRANCH, 1, 0)); + branch->operands[0] = Operand(cond); + ctx->block->instructions.push_back(std::move(branch)); + + ic->BB_if_idx = ctx->block->index; + ic->BB_invert = Block(); + ic->BB_invert.loop_nest_depth = ctx->cf_info.loop_nest_depth; + /* Invert blocks are intentionally not marked as top level because they + * are not part of the logical cfg. */ + ic->BB_invert.kind |= block_kind_invert; + ic->BB_endif = Block(); + ic->BB_endif.loop_nest_depth = ctx->cf_info.loop_nest_depth; + ic->BB_endif.kind |= (block_kind_merge | (ctx->block->kind & block_kind_top_level)); + + ic->exec_potentially_empty_old = ctx->cf_info.exec_potentially_empty; + ic->divergent_old = ctx->cf_info.parent_if.is_divergent; + ctx->cf_info.parent_if.is_divergent = true; + ctx->cf_info.exec_potentially_empty = false; /* divergent branches use cbranch_execz */ + + /** emit logical then block */ + Block* BB_then_logical = ctx->program->create_and_insert_block(); + BB_then_logical->loop_nest_depth = ctx->cf_info.loop_nest_depth; + add_edge(ic->BB_if_idx, BB_then_logical); + ctx->block = BB_then_logical; + append_logical_start(BB_then_logical); +} + +static void begin_divergent_if_else(isel_context *ctx, if_context *ic) +{ + Block *BB_then_logical = ctx->block; + append_logical_end(BB_then_logical); + /* branch from logical then block to invert block */ + aco_ptr<Pseudo_branch_instruction> branch; + branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_branch, Format::PSEUDO_BRANCH, 0, 0)); + BB_then_logical->instructions.emplace_back(std::move(branch)); + add_linear_edge(BB_then_logical->index, &ic->BB_invert); + if (!ctx->cf_info.parent_loop.has_divergent_branch) + add_logical_edge(BB_then_logical->index, &ic->BB_endif); + BB_then_logical->kind |= block_kind_uniform; + assert(!ctx->cf_info.has_branch); + ic->then_branch_divergent = ctx->cf_info.parent_loop.has_divergent_branch; + ctx->cf_info.parent_loop.has_divergent_branch = false; + + /** emit linear then block */ + Block* BB_then_linear = ctx->program->create_and_insert_block(); + BB_then_linear->loop_nest_depth = ctx->cf_info.loop_nest_depth; + BB_then_linear->kind |= block_kind_uniform; + add_linear_edge(ic->BB_if_idx, BB_then_linear); + /* branch from linear then block to invert block */ + branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_branch, Format::PSEUDO_BRANCH, 0, 0)); + BB_then_linear->instructions.emplace_back(std::move(branch)); + add_linear_edge(BB_then_linear->index, &ic->BB_invert); + + /** emit invert merge block */ + ctx->block = ctx->program->insert_block(std::move(ic->BB_invert)); + ic->invert_idx = ctx->block->index; + + /* branch to linear else block (skip else) */ + branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_cbranch_nz, Format::PSEUDO_BRANCH, 1, 0)); + branch->operands[0] = Operand(ic->cond); + ctx->block->instructions.push_back(std::move(branch)); + + ic->exec_potentially_empty_old |= ctx->cf_info.exec_potentially_empty; + ctx->cf_info.exec_potentially_empty = false; /* divergent branches use cbranch_execz */ + + /** emit logical else block */ + Block* BB_else_logical = ctx->program->create_and_insert_block(); + BB_else_logical->loop_nest_depth = ctx->cf_info.loop_nest_depth; + add_logical_edge(ic->BB_if_idx, BB_else_logical); + add_linear_edge(ic->invert_idx, BB_else_logical); + ctx->block = BB_else_logical; + append_logical_start(BB_else_logical); +} + +static void 
end_divergent_if(isel_context *ctx, if_context *ic) +{ + Block *BB_else_logical = ctx->block; + append_logical_end(BB_else_logical); + + /* branch from logical else block to endif block */ + aco_ptr<Pseudo_branch_instruction> branch; + branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_branch, Format::PSEUDO_BRANCH, 0, 0)); + BB_else_logical->instructions.emplace_back(std::move(branch)); + add_linear_edge(BB_else_logical->index, &ic->BB_endif); + if (!ctx->cf_info.parent_loop.has_divergent_branch) + add_logical_edge(BB_else_logical->index, &ic->BB_endif); + BB_else_logical->kind |= block_kind_uniform; + + assert(!ctx->cf_info.has_branch); + ctx->cf_info.parent_loop.has_divergent_branch &= ic->then_branch_divergent; + + + /** emit linear else block */ + Block* BB_else_linear = ctx->program->create_and_insert_block(); + BB_else_linear->loop_nest_depth = ctx->cf_info.loop_nest_depth; + BB_else_linear->kind |= block_kind_uniform; + add_linear_edge(ic->invert_idx, BB_else_linear); + + /* branch from linear else block to endif block */ + branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_branch, Format::PSEUDO_BRANCH, 0, 0)); + BB_else_linear->instructions.emplace_back(std::move(branch)); + add_linear_edge(BB_else_linear->index, &ic->BB_endif); + + + /** emit endif merge block */ + ctx->block = ctx->program->insert_block(std::move(ic->BB_endif)); + append_logical_start(ctx->block); + + + ctx->cf_info.parent_if.is_divergent = ic->divergent_old; + ctx->cf_info.exec_potentially_empty |= ic->exec_potentially_empty_old; + /* uniform control flow never has an empty exec-mask */ + if (!ctx->cf_info.loop_nest_depth && !ctx->cf_info.parent_if.is_divergent) + ctx->cf_info.exec_potentially_empty = false; +} + +static void visit_if(isel_context *ctx, nir_if *if_stmt) +{ + Temp cond = get_ssa_temp(ctx, if_stmt->condition.ssa); + Builder bld(ctx->program, ctx->block); + aco_ptr<Pseudo_branch_instruction> branch; + + if (!ctx->divergent_vals[if_stmt->condition.ssa->index]) { /* uniform condition */ + /** + * Uniform conditionals are represented in the following way*) : + * + * The linear and logical CFG: + * BB_IF + * / \ + * BB_THEN (logical) BB_ELSE (logical) + * \ / + * BB_ENDIF + * + * *) Exceptions may be due to break and continue statements within loops + * If a break/continue happens within uniform control flow, it branches + * to the loop exit/entry block. Otherwise, it branches to the next + * merge block. 
+ **/ + append_logical_end(ctx->block); + ctx->block->kind |= block_kind_uniform; + + /* emit branch */ + if (cond.regClass() == s2) { + // TODO: in a post-RA optimizer, we could check if the condition is in VCC and omit this instruction + cond = as_uniform_bool(ctx, cond); + } + branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_cbranch_z, Format::PSEUDO_BRANCH, 1, 0)); + branch->operands[0] = Operand(cond); + branch->operands[0].setFixed(scc); + ctx->block->instructions.emplace_back(std::move(branch)); + + unsigned BB_if_idx = ctx->block->index; + Block BB_endif = Block(); + BB_endif.loop_nest_depth = ctx->cf_info.loop_nest_depth; + BB_endif.kind |= ctx->block->kind & block_kind_top_level; + + /** emit then block */ + Block* BB_then = ctx->program->create_and_insert_block(); + BB_then->loop_nest_depth = ctx->cf_info.loop_nest_depth; + add_edge(BB_if_idx, BB_then); + append_logical_start(BB_then); + ctx->block = BB_then; + visit_cf_list(ctx, &if_stmt->then_list); + BB_then = ctx->block; + bool then_branch = ctx->cf_info.has_branch; + bool then_branch_divergent = ctx->cf_info.parent_loop.has_divergent_branch; + + if (!then_branch) { + append_logical_end(BB_then); + /* branch from then block to endif block */ + branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_branch, Format::PSEUDO_BRANCH, 0, 0)); + BB_then->instructions.emplace_back(std::move(branch)); + add_linear_edge(BB_then->index, &BB_endif); + if (!then_branch_divergent) + add_logical_edge(BB_then->index, &BB_endif); + BB_then->kind |= block_kind_uniform; + } + + ctx->cf_info.has_branch = false; + ctx->cf_info.parent_loop.has_divergent_branch = false; + + /** emit else block */ + Block* BB_else = ctx->program->create_and_insert_block(); + BB_else->loop_nest_depth = ctx->cf_info.loop_nest_depth; + add_edge(BB_if_idx, BB_else); + append_logical_start(BB_else); + ctx->block = BB_else; + visit_cf_list(ctx, &if_stmt->else_list); + BB_else = ctx->block; + + if (!ctx->cf_info.has_branch) { + append_logical_end(BB_else); + /* branch from then block to endif block */ + branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_branch, Format::PSEUDO_BRANCH, 0, 0)); + BB_else->instructions.emplace_back(std::move(branch)); + add_linear_edge(BB_else->index, &BB_endif); + if (!ctx->cf_info.parent_loop.has_divergent_branch) + add_logical_edge(BB_else->index, &BB_endif); + BB_else->kind |= block_kind_uniform; + } + + ctx->cf_info.has_branch &= then_branch; + ctx->cf_info.parent_loop.has_divergent_branch &= then_branch_divergent; + + /** emit endif merge block */ + if (!ctx->cf_info.has_branch) { + ctx->block = ctx->program->insert_block(std::move(BB_endif)); + append_logical_start(ctx->block); + } + } else { /* non-uniform condition */ + /** + * To maintain a logical and linear CFG without critical edges, + * non-uniform conditionals are represented in the following way*) : + * + * The linear CFG: + * BB_IF + * / \ + * BB_THEN (logical) BB_THEN (linear) + * \ / + * BB_INVERT (linear) + * / \ + * BB_ELSE (logical) BB_ELSE (linear) + * \ / + * BB_ENDIF + * + * The logical CFG: + * BB_IF + * / \ + * BB_THEN (logical) BB_ELSE (logical) + * \ / + * BB_ENDIF + * + * *) Exceptions may be due to break and continue statements within loops + **/ + + if_context ic; + + begin_divergent_if_then(ctx, &ic, cond); + visit_cf_list(ctx, &if_stmt->then_list); + + begin_divergent_if_else(ctx, &ic); + visit_cf_list(ctx, &if_stmt->else_list); + + end_divergent_if(ctx, &ic); + } +} + +static void 
visit_cf_list(isel_context *ctx, + struct exec_list *list) +{ + foreach_list_typed(nir_cf_node, node, node, list) { + switch (node->type) { + case nir_cf_node_block: + visit_block(ctx, nir_cf_node_as_block(node)); + break; + case nir_cf_node_if: + visit_if(ctx, nir_cf_node_as_if(node)); + break; + case nir_cf_node_loop: + visit_loop(ctx, nir_cf_node_as_loop(node)); + break; + default: + unreachable("unimplemented cf list type"); + } + } +} + +static void export_vs_varying(isel_context *ctx, int slot, bool is_pos, int *next_pos) +{ + int offset = ctx->program->info->vs.outinfo.vs_output_param_offset[slot]; + uint64_t mask = ctx->vs_output.mask[slot]; + if (!is_pos && !mask) + return; + if (!is_pos && offset == AC_EXP_PARAM_UNDEFINED) + return; + aco_ptr<Export_instruction> exp{create_instruction<Export_instruction>(aco_opcode::exp, Format::EXP, 4, 0)}; + exp->enabled_mask = mask; + for (unsigned i = 0; i < 4; ++i) { + if (mask & (1 << i)) + exp->operands[i] = Operand(ctx->vs_output.outputs[slot][i]); + else + exp->operands[i] = Operand(v1); + } + exp->valid_mask = false; + exp->done = false; + exp->compressed = false; + if (is_pos) + exp->dest = V_008DFC_SQ_EXP_POS + (*next_pos)++; + else + exp->dest = V_008DFC_SQ_EXP_PARAM + offset; + ctx->block->instructions.emplace_back(std::move(exp)); +} + +static void export_vs_psiz_layer_viewport(isel_context *ctx, int *next_pos) +{ + aco_ptr<Export_instruction> exp{create_instruction<Export_instruction>(aco_opcode::exp, Format::EXP, 4, 0)}; + exp->enabled_mask = 0; + for (unsigned i = 0; i < 4; ++i) + exp->operands[i] = Operand(v1); + if (ctx->vs_output.mask[VARYING_SLOT_PSIZ]) { + exp->operands[0] = Operand(ctx->vs_output.outputs[VARYING_SLOT_PSIZ][0]); + exp->enabled_mask |= 0x1; + } + if (ctx->vs_output.mask[VARYING_SLOT_LAYER]) { + exp->operands[2] = Operand(ctx->vs_output.outputs[VARYING_SLOT_LAYER][0]); + exp->enabled_mask |= 0x4; + } + if (ctx->vs_output.mask[VARYING_SLOT_VIEWPORT]) { + if (ctx->options->chip_class < GFX9) { + exp->operands[3] = Operand(ctx->vs_output.outputs[VARYING_SLOT_VIEWPORT][0]); + exp->enabled_mask |= 0x8; + } else { + Builder bld(ctx->program, ctx->block); + + Temp out = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(16u), + Operand(ctx->vs_output.outputs[VARYING_SLOT_VIEWPORT][0])); + if (exp->operands[2].isTemp()) + out = bld.vop2(aco_opcode::v_or_b32, bld.def(v1), Operand(out), exp->operands[2]); + + exp->operands[2] = Operand(out); + exp->enabled_mask |= 0x4; + } + } + exp->valid_mask = false; + exp->done = false; + exp->compressed = false; + exp->dest = V_008DFC_SQ_EXP_POS + (*next_pos)++; + ctx->block->instructions.emplace_back(std::move(exp)); +} + +static void create_vs_exports(isel_context *ctx) +{ + radv_vs_output_info *outinfo = &ctx->program->info->vs.outinfo; + + if (outinfo->export_prim_id) { + ctx->vs_output.mask[VARYING_SLOT_PRIMITIVE_ID] |= 0x1; + ctx->vs_output.outputs[VARYING_SLOT_PRIMITIVE_ID][0] = ctx->vs_prim_id; + } + + if (ctx->options->key.has_multiview_view_index) { + ctx->vs_output.mask[VARYING_SLOT_LAYER] |= 0x1; + ctx->vs_output.outputs[VARYING_SLOT_LAYER][0] = as_vgpr(ctx, ctx->view_index); + } + + /* the order these position exports are created is important */ + int next_pos = 0; + export_vs_varying(ctx, VARYING_SLOT_POS, true, &next_pos); + if (outinfo->writes_pointsize || outinfo->writes_layer || outinfo->writes_viewport_index) { + export_vs_psiz_layer_viewport(ctx, &next_pos); + } + if (ctx->num_clip_distances + ctx->num_cull_distances > 0) + export_vs_varying(ctx, 
VARYING_SLOT_CLIP_DIST0, true, &next_pos); + if (ctx->num_clip_distances + ctx->num_cull_distances > 4) + export_vs_varying(ctx, VARYING_SLOT_CLIP_DIST1, true, &next_pos); + + if (ctx->options->key.vs_common_out.export_clip_dists) { + if (ctx->num_clip_distances + ctx->num_cull_distances > 0) + export_vs_varying(ctx, VARYING_SLOT_CLIP_DIST0, false, &next_pos); + if (ctx->num_clip_distances + ctx->num_cull_distances > 4) + export_vs_varying(ctx, VARYING_SLOT_CLIP_DIST1, false, &next_pos); + } + + for (unsigned i = 0; i <= VARYING_SLOT_VAR31; ++i) { + if (i < VARYING_SLOT_VAR0 && i != VARYING_SLOT_LAYER && + i != VARYING_SLOT_PRIMITIVE_ID) + continue; + + export_vs_varying(ctx, i, false, NULL); + } +} + +static void emit_stream_output(isel_context *ctx, + Temp const *so_buffers, + Temp const *so_write_offset, + const struct radv_stream_output *output) +{ + unsigned num_comps = util_bitcount(output->component_mask); + unsigned loc = output->location; + unsigned buf = output->buffer; + unsigned offset = output->offset; + + assert(num_comps && num_comps <= 4); + if (!num_comps || num_comps > 4) + return; + + unsigned start = ffs(output->component_mask) - 1; + + Temp out[4]; + bool all_undef = true; + assert(ctx->stage == vertex_vs); + for (unsigned i = 0; i < num_comps; i++) { + out[i] = ctx->vs_output.outputs[loc][start + i]; + all_undef = all_undef && !out[i].id(); + } + if (all_undef) + return; + + Temp write_data = {ctx->program->allocateId(), RegClass(RegType::vgpr, num_comps)}; + aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, num_comps, 1)}; + for (unsigned i = 0; i < num_comps; ++i) + vec->operands[i] = (ctx->vs_output.mask[loc] & 1 << i) ? Operand(out[i]) : Operand(0u); + vec->definitions[0] = Definition(write_data); + ctx->block->instructions.emplace_back(std::move(vec)); + + aco_opcode opcode; + switch (num_comps) { + case 1: + opcode = aco_opcode::buffer_store_dword; + break; + case 2: + opcode = aco_opcode::buffer_store_dwordx2; + break; + case 3: + opcode = aco_opcode::buffer_store_dwordx3; + break; + case 4: + opcode = aco_opcode::buffer_store_dwordx4; + break; + } + + aco_ptr<MUBUF_instruction> store{create_instruction<MUBUF_instruction>(opcode, Format::MUBUF, 4, 0)}; + store->operands[0] = Operand(so_write_offset[buf]); + store->operands[1] = Operand(so_buffers[buf]); + store->operands[2] = Operand((uint32_t) 0); + store->operands[3] = Operand(write_data); + if (offset > 4095) { + /* Don't think this can happen in RADV, but maybe GL? It's easy to do this anyway. 
*/ + Builder bld(ctx->program, ctx->block); + store->operands[0] = bld.vadd32(bld.def(v1), Operand(offset), Operand(so_write_offset[buf])); + } else { + store->offset = offset; + } + store->offen = true; + store->glc = true; + store->slc = true; + store->can_reorder = true; + ctx->block->instructions.emplace_back(std::move(store)); +} + +static void emit_streamout(isel_context *ctx, unsigned stream) +{ + Builder bld(ctx->program, ctx->block); + + Temp so_buffers[4]; + Temp buf_ptr = convert_pointer_to_64_bit(ctx, ctx->streamout_buffers); + for (unsigned i = 0; i < 4; i++) { + unsigned stride = ctx->program->info->so.strides[i]; + if (!stride) + continue; + + so_buffers[i] = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), buf_ptr, Operand(i * 16u)); + } + + Temp so_vtx_count = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), + ctx->streamout_config, Operand(0x70010u)); + + Temp tid = bld.vop3(aco_opcode::v_mbcnt_hi_u32_b32, bld.def(v1), Operand((uint32_t) -1), + bld.vop3(aco_opcode::v_mbcnt_lo_u32_b32, bld.def(v1), Operand((uint32_t) -1), Operand(0u))); + + Temp can_emit = bld.vopc(aco_opcode::v_cmp_gt_i32, bld.def(s2), so_vtx_count, tid); + + if_context ic; + begin_divergent_if_then(ctx, &ic, can_emit); + + bld.reset(ctx->block); + + Temp so_write_index = bld.vadd32(bld.def(v1), ctx->streamout_write_idx, tid); + + Temp so_write_offset[4]; + + for (unsigned i = 0; i < 4; i++) { + unsigned stride = ctx->program->info->so.strides[i]; + if (!stride) + continue; + + if (stride == 1) { + Temp offset = bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc), + ctx->streamout_write_idx, ctx->streamout_offset[i]); + Temp new_offset = bld.vadd32(bld.def(v1), offset, tid); + + so_write_offset[i] = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(2u), new_offset); + } else { + Temp offset = bld.v_mul_imm(bld.def(v1), so_write_index, stride * 4u); + Temp offset2 = bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), Operand(4u), ctx->streamout_offset[i]); + so_write_offset[i] = bld.vadd32(bld.def(v1), offset, offset2); + } + } + + for (unsigned i = 0; i < ctx->program->info->so.num_outputs; i++) { + struct radv_stream_output *output = + &ctx->program->info->so.outputs[i]; + if (stream != output->stream) + continue; + + emit_stream_output(ctx, so_buffers, so_write_offset, output); + } + + begin_divergent_if_else(ctx, &ic); + end_divergent_if(ctx, &ic); +} + +} /* end namespace */ + +void handle_bc_optimize(isel_context *ctx) +{ + /* needed when SPI_PS_IN_CONTROL.BC_OPTIMIZE_DISABLE is set to 0 */ + Builder bld(ctx->program, ctx->block); + uint32_t spi_ps_input_ena = ctx->program->config->spi_ps_input_ena; + bool uses_center = G_0286CC_PERSP_CENTER_ENA(spi_ps_input_ena) || G_0286CC_LINEAR_CENTER_ENA(spi_ps_input_ena); + bool uses_centroid = G_0286CC_PERSP_CENTROID_ENA(spi_ps_input_ena) || G_0286CC_LINEAR_CENTROID_ENA(spi_ps_input_ena); + if (uses_center && uses_centroid) { + Temp sel = bld.vopc_e64(aco_opcode::v_cmp_lt_i32, bld.hint_vcc(bld.def(s2)), ctx->prim_mask, Operand(0u)); + + if (G_0286CC_PERSP_CENTROID_ENA(spi_ps_input_ena)) { + for (unsigned i = 0; i < 2; i++) { + Temp new_coord = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), + ctx->fs_inputs[fs_input::persp_centroid_p1 + i], + ctx->fs_inputs[fs_input::persp_center_p1 + i], + sel); + ctx->fs_inputs[fs_input::persp_centroid_p1 + i] = new_coord; + } + } + + if (G_0286CC_LINEAR_CENTROID_ENA(spi_ps_input_ena)) { + for (unsigned i = 0; i < 2; i++) { + Temp new_coord = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), + 
ctx->fs_inputs[fs_input::linear_centroid_p1 + i], + ctx->fs_inputs[fs_input::linear_center_p1 + i], + sel); + ctx->fs_inputs[fs_input::linear_centroid_p1 + i] = new_coord; + } + } + } +} + +void select_program(Program *program, + unsigned shader_count, + struct nir_shader *const *shaders, + ac_shader_config* config, + struct radv_shader_info *info, + struct radv_nir_compiler_options *options) +{ + isel_context ctx = setup_isel_context(program, shader_count, shaders, config, info, options); + + for (unsigned i = 0; i < shader_count; i++) { + nir_shader *nir = shaders[i]; + init_context(&ctx, nir); + + if (!i) { + add_startpgm(&ctx); /* needs to be after init_context() for FS */ + append_logical_start(ctx.block); + } + + if_context ic; + if (shader_count >= 2) { + Builder bld(ctx.program, ctx.block); + Temp count = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), ctx.merged_wave_info, Operand((8u << 16) | (i * 8u))); + Temp thread_id = bld.vop3(aco_opcode::v_mbcnt_hi_u32_b32, bld.def(v1), Operand((uint32_t) -1), + bld.vop3(aco_opcode::v_mbcnt_lo_u32_b32, bld.def(v1), Operand((uint32_t) -1), Operand(0u))); + Temp cond = bld.vopc(aco_opcode::v_cmp_gt_u32, bld.hint_vcc(bld.def(s2)), count, thread_id); + + begin_divergent_if_then(&ctx, &ic, cond); + } + + if (i) { + Builder bld(ctx.program, ctx.block); + bld.barrier(aco_opcode::p_memory_barrier_shared); //TODO: different barriers are needed for different stages + bld.sopp(aco_opcode::s_barrier); + } + + if (ctx.stage == fragment_fs) + handle_bc_optimize(&ctx); + + nir_function_impl *func = nir_shader_get_entrypoint(nir); + visit_cf_list(&ctx, &func->body); + + if (ctx.program->info->so.num_outputs/*&& !ctx->is_gs_copy_shader */) + emit_streamout(&ctx, 0); + + if (ctx.stage == vertex_vs) + create_vs_exports(&ctx); + + if (shader_count >= 2) { + begin_divergent_if_else(&ctx, &ic); + end_divergent_if(&ctx, &ic); + } + + ralloc_free(ctx.divergent_vals); + } + + append_logical_end(ctx.block); + ctx.block->kind |= block_kind_uniform; + Builder bld(ctx.program, ctx.block); + if (ctx.program->wb_smem_l1_on_end) + bld.smem(aco_opcode::s_dcache_wb, false); + bld.sopp(aco_opcode::s_endpgm); + + /* cleanup CFG */ + for (Block& BB : program->blocks) { + for (unsigned idx : BB.linear_preds) + program->blocks[idx].linear_succs.emplace_back(BB.index); + for (unsigned idx : BB.logical_preds) + program->blocks[idx].logical_succs.emplace_back(BB.index); + } +} +} diff --git a/src/amd/compiler/aco_instruction_selection_setup.cpp b/src/amd/compiler/aco_instruction_selection_setup.cpp new file mode 100644 index 00000000000..6c4c408e659 --- /dev/null +++ b/src/amd/compiler/aco_instruction_selection_setup.cpp @@ -0,0 +1,1366 @@ +/* + * Copyright © 2018 Valve Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + */ + +#include <unordered_map> +#include "aco_ir.h" +#include "nir.h" +#include "vulkan/radv_shader.h" +#include "vulkan/radv_descriptor_set.h" +#include "sid.h" +#include "ac_exp_param.h" + +#include "util/u_math.h" + +#define MAX_INLINE_PUSH_CONSTS 8 + +namespace aco { + +enum fs_input { + persp_sample_p1, + persp_sample_p2, + persp_center_p1, + persp_center_p2, + persp_centroid_p1, + persp_centroid_p2, + persp_pull_model, + linear_sample_p1, + linear_sample_p2, + linear_center_p1, + linear_center_p2, + linear_centroid_p1, + linear_centroid_p2, + line_stipple, + frag_pos_0, + frag_pos_1, + frag_pos_2, + frag_pos_3, + front_face, + ancillary, + sample_coverage, + fixed_pt, + max_inputs, +}; + +struct vs_output_state { + uint8_t mask[VARYING_SLOT_VAR31 + 1]; + Temp outputs[VARYING_SLOT_VAR31 + 1][4]; +}; + +struct isel_context { + struct radv_nir_compiler_options *options; + Program *program; + nir_shader *shader; + uint32_t constant_data_offset; + Block *block; + bool *divergent_vals; + std::unique_ptr<Temp[]> allocated; + std::unordered_map<unsigned, std::array<Temp,4>> allocated_vec; + Stage stage; /* Stage */ + struct { + bool has_branch; + uint16_t loop_nest_depth = 0; + struct { + unsigned header_idx; + Block* exit; + bool has_divergent_continue = false; + bool has_divergent_branch = false; + } parent_loop; + struct { + bool is_divergent = false; + } parent_if; + bool exec_potentially_empty = false; + } cf_info; + + /* scratch */ + bool scratch_enabled = false; + Temp private_segment_buffer = Temp(0, s2); /* also the part of the scratch descriptor on compute */ + Temp scratch_offset = Temp(0, s1); + + /* inputs common for merged stages */ + Temp merged_wave_info = Temp(0, s1); + + /* FS inputs */ + bool fs_vgpr_args[fs_input::max_inputs]; + Temp fs_inputs[fs_input::max_inputs]; + Temp prim_mask = Temp(0, s1); + Temp descriptor_sets[MAX_SETS]; + Temp push_constants = Temp(0, s1); + Temp inline_push_consts[MAX_INLINE_PUSH_CONSTS]; + unsigned num_inline_push_consts = 0; + unsigned base_inline_push_consts = 0; + + /* VS inputs */ + Temp vertex_buffers = Temp(0, s1); + Temp base_vertex = Temp(0, s1); + Temp start_instance = Temp(0, s1); + Temp draw_id = Temp(0, s1); + Temp view_index = Temp(0, s1); + Temp es2gs_offset = Temp(0, s1); + Temp vertex_id = Temp(0, v1); + Temp rel_auto_id = Temp(0, v1); + Temp instance_id = Temp(0, v1); + Temp vs_prim_id = Temp(0, v1); + bool needs_instance_id; + + /* CS inputs */ + Temp num_workgroups[3] = {Temp(0, s1), Temp(0, s1), Temp(0, s1)}; + Temp workgroup_ids[3] = {Temp(0, s1), Temp(0, s1), Temp(0, s1)}; + Temp tg_size = Temp(0, s1); + Temp local_invocation_ids[3] = {Temp(0, v1), Temp(0, v1), Temp(0, v1)}; + + /* VS output information */ + unsigned num_clip_distances; + unsigned num_cull_distances; + vs_output_state vs_output; + + /* Streamout */ + Temp streamout_buffers = Temp(0, s1); + Temp streamout_write_idx = Temp(0, s1); + Temp streamout_config = Temp(0, s1); + Temp streamout_offset[4] = {Temp(0, s1), Temp(0, s1), Temp(0, s1), Temp(0, s1)}; +}; + +fs_input 
get_interp_input(nir_intrinsic_op intrin, enum glsl_interp_mode interp) +{ + switch (interp) { + case INTERP_MODE_SMOOTH: + case INTERP_MODE_NONE: + if (intrin == nir_intrinsic_load_barycentric_pixel || + intrin == nir_intrinsic_load_barycentric_at_sample || + intrin == nir_intrinsic_load_barycentric_at_offset) + return fs_input::persp_center_p1; + else if (intrin == nir_intrinsic_load_barycentric_centroid) + return fs_input::persp_centroid_p1; + else if (intrin == nir_intrinsic_load_barycentric_sample) + return fs_input::persp_sample_p1; + break; + case INTERP_MODE_NOPERSPECTIVE: + if (intrin == nir_intrinsic_load_barycentric_pixel) + return fs_input::linear_center_p1; + else if (intrin == nir_intrinsic_load_barycentric_centroid) + return fs_input::linear_centroid_p1; + else if (intrin == nir_intrinsic_load_barycentric_sample) + return fs_input::linear_sample_p1; + break; + default: + break; + } + return fs_input::max_inputs; +} + +void init_context(isel_context *ctx, nir_shader *shader) +{ + nir_function_impl *impl = nir_shader_get_entrypoint(shader); + + ctx->shader = shader; + ctx->divergent_vals = nir_divergence_analysis(shader, nir_divergence_view_index_uniform); + + std::unique_ptr<Temp[]> allocated{new Temp[impl->ssa_alloc]()}; + memset(&ctx->fs_vgpr_args, false, sizeof(ctx->fs_vgpr_args)); + + bool done = false; + while (!done) { + done = true; + nir_foreach_block(block, impl) { + nir_foreach_instr(instr, block) { + switch(instr->type) { + case nir_instr_type_alu: { + nir_alu_instr *alu_instr = nir_instr_as_alu(instr); + unsigned size = alu_instr->dest.dest.ssa.num_components; + if (alu_instr->dest.dest.ssa.bit_size == 64) + size *= 2; + RegType type = RegType::sgpr; + switch(alu_instr->op) { + case nir_op_fmul: + case nir_op_fadd: + case nir_op_fsub: + case nir_op_fmax: + case nir_op_fmin: + case nir_op_fmax3: + case nir_op_fmin3: + case nir_op_fmed3: + case nir_op_fmod: + case nir_op_frem: + case nir_op_fneg: + case nir_op_fabs: + case nir_op_fsat: + case nir_op_fsign: + case nir_op_frcp: + case nir_op_frsq: + case nir_op_fsqrt: + case nir_op_fexp2: + case nir_op_flog2: + case nir_op_ffract: + case nir_op_ffloor: + case nir_op_fceil: + case nir_op_ftrunc: + case nir_op_fround_even: + case nir_op_fsin: + case nir_op_fcos: + case nir_op_f2f32: + case nir_op_f2f64: + case nir_op_u2f32: + case nir_op_u2f64: + case nir_op_i2f32: + case nir_op_i2f64: + case nir_op_pack_half_2x16: + case nir_op_unpack_half_2x16_split_x: + case nir_op_unpack_half_2x16_split_y: + case nir_op_fddx: + case nir_op_fddy: + case nir_op_fddx_fine: + case nir_op_fddy_fine: + case nir_op_fddx_coarse: + case nir_op_fddy_coarse: + case nir_op_fquantize2f16: + case nir_op_ldexp: + case nir_op_frexp_sig: + case nir_op_frexp_exp: + case nir_op_cube_face_index: + case nir_op_cube_face_coord: + type = RegType::vgpr; + break; + case nir_op_flt: + case nir_op_fge: + case nir_op_feq: + case nir_op_fne: + size = 2; + break; + case nir_op_ilt: + case nir_op_ige: + case nir_op_ult: + case nir_op_uge: + size = alu_instr->src[0].src.ssa->bit_size == 64 ? 
2 : 1; + /* fallthrough */ + case nir_op_ieq: + case nir_op_ine: + case nir_op_i2b1: + if (ctx->divergent_vals[alu_instr->dest.dest.ssa.index]) { + size = 2; + } else { + for (unsigned i = 0; i < nir_op_infos[alu_instr->op].num_inputs; i++) { + if (allocated[alu_instr->src[i].src.ssa->index].type() == RegType::vgpr) + size = 2; + } + } + break; + case nir_op_f2i64: + case nir_op_f2u64: + case nir_op_b2i32: + case nir_op_b2f32: + case nir_op_f2i32: + case nir_op_f2u32: + type = ctx->divergent_vals[alu_instr->dest.dest.ssa.index] ? RegType::vgpr : RegType::sgpr; + break; + case nir_op_bcsel: + if (alu_instr->dest.dest.ssa.bit_size == 1) { + if (ctx->divergent_vals[alu_instr->dest.dest.ssa.index]) + size = 2; + else if (allocated[alu_instr->src[1].src.ssa->index].regClass() == s2 && + allocated[alu_instr->src[2].src.ssa->index].regClass() == s2) + size = 2; + else + size = 1; + } else { + if (ctx->divergent_vals[alu_instr->dest.dest.ssa.index]) { + type = RegType::vgpr; + } else { + if (allocated[alu_instr->src[1].src.ssa->index].type() == RegType::vgpr || + allocated[alu_instr->src[2].src.ssa->index].type() == RegType::vgpr) { + type = RegType::vgpr; + } + } + if (alu_instr->src[1].src.ssa->num_components == 1 && alu_instr->src[2].src.ssa->num_components == 1) { + assert(allocated[alu_instr->src[1].src.ssa->index].size() == allocated[alu_instr->src[2].src.ssa->index].size()); + size = allocated[alu_instr->src[1].src.ssa->index].size(); + } + } + break; + case nir_op_mov: + if (alu_instr->dest.dest.ssa.bit_size == 1) { + size = allocated[alu_instr->src[0].src.ssa->index].size(); + } else { + type = ctx->divergent_vals[alu_instr->dest.dest.ssa.index] ? RegType::vgpr : RegType::sgpr; + } + break; + case nir_op_inot: + case nir_op_ixor: + if (alu_instr->dest.dest.ssa.bit_size == 1) { + size = ctx->divergent_vals[alu_instr->dest.dest.ssa.index] ? 
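+ /* divergent booleans are kept as a 64-bit lane mask (s2), uniform booleans in a single SGPR */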
2 : 1; + break; + } else { + /* fallthrough */ + } + default: + if (alu_instr->dest.dest.ssa.bit_size == 1) { + if (ctx->divergent_vals[alu_instr->dest.dest.ssa.index]) { + size = 2; + } else { + size = 2; + for (unsigned i = 0; i < nir_op_infos[alu_instr->op].num_inputs; i++) { + if (allocated[alu_instr->src[i].src.ssa->index].regClass() == s1) { + size = 1; + break; + } + } + } + } else { + for (unsigned i = 0; i < nir_op_infos[alu_instr->op].num_inputs; i++) { + if (allocated[alu_instr->src[i].src.ssa->index].type() == RegType::vgpr) + type = RegType::vgpr; + } + } + break; + } + allocated[alu_instr->dest.dest.ssa.index] = Temp(0, RegClass(type, size)); + break; + } + case nir_instr_type_load_const: { + unsigned size = nir_instr_as_load_const(instr)->def.num_components; + if (nir_instr_as_load_const(instr)->def.bit_size == 64) + size *= 2; + allocated[nir_instr_as_load_const(instr)->def.index] = Temp(0, RegClass(RegType::sgpr, size)); + break; + } + case nir_instr_type_intrinsic: { + nir_intrinsic_instr *intrinsic = nir_instr_as_intrinsic(instr); + if (!nir_intrinsic_infos[intrinsic->intrinsic].has_dest) + break; + unsigned size = intrinsic->dest.ssa.num_components; + if (intrinsic->dest.ssa.bit_size == 64) + size *= 2; + RegType type = RegType::sgpr; + switch(intrinsic->intrinsic) { + case nir_intrinsic_load_push_constant: + case nir_intrinsic_load_work_group_id: + case nir_intrinsic_load_num_work_groups: + case nir_intrinsic_load_subgroup_id: + case nir_intrinsic_load_num_subgroups: + case nir_intrinsic_load_first_vertex: + case nir_intrinsic_load_base_instance: + case nir_intrinsic_get_buffer_size: + case nir_intrinsic_vote_all: + case nir_intrinsic_vote_any: + case nir_intrinsic_read_first_invocation: + case nir_intrinsic_read_invocation: + case nir_intrinsic_first_invocation: + case nir_intrinsic_vulkan_resource_index: + type = RegType::sgpr; + break; + case nir_intrinsic_ballot: + type = RegType::sgpr; + size = 2; + break; + case nir_intrinsic_load_sample_id: + case nir_intrinsic_load_sample_mask_in: + case nir_intrinsic_load_input: + case nir_intrinsic_load_vertex_id: + case nir_intrinsic_load_vertex_id_zero_base: + case nir_intrinsic_load_barycentric_sample: + case nir_intrinsic_load_barycentric_pixel: + case nir_intrinsic_load_barycentric_centroid: + case nir_intrinsic_load_barycentric_at_sample: + case nir_intrinsic_load_barycentric_at_offset: + case nir_intrinsic_load_interpolated_input: + case nir_intrinsic_load_frag_coord: + case nir_intrinsic_load_sample_pos: + case nir_intrinsic_load_layer_id: + case nir_intrinsic_load_local_invocation_id: + case nir_intrinsic_load_local_invocation_index: + case nir_intrinsic_load_subgroup_invocation: + case nir_intrinsic_write_invocation_amd: + case nir_intrinsic_mbcnt_amd: + case nir_intrinsic_load_instance_id: + case nir_intrinsic_ssbo_atomic_add: + case nir_intrinsic_ssbo_atomic_imin: + case nir_intrinsic_ssbo_atomic_umin: + case nir_intrinsic_ssbo_atomic_imax: + case nir_intrinsic_ssbo_atomic_umax: + case nir_intrinsic_ssbo_atomic_and: + case nir_intrinsic_ssbo_atomic_or: + case nir_intrinsic_ssbo_atomic_xor: + case nir_intrinsic_ssbo_atomic_exchange: + case nir_intrinsic_ssbo_atomic_comp_swap: + case nir_intrinsic_image_deref_atomic_add: + case nir_intrinsic_image_deref_atomic_umin: + case nir_intrinsic_image_deref_atomic_imin: + case nir_intrinsic_image_deref_atomic_umax: + case nir_intrinsic_image_deref_atomic_imax: + case nir_intrinsic_image_deref_atomic_and: + case nir_intrinsic_image_deref_atomic_or: + case 
nir_intrinsic_image_deref_atomic_xor: + case nir_intrinsic_image_deref_atomic_exchange: + case nir_intrinsic_image_deref_atomic_comp_swap: + case nir_intrinsic_image_deref_size: + case nir_intrinsic_shared_atomic_add: + case nir_intrinsic_shared_atomic_imin: + case nir_intrinsic_shared_atomic_umin: + case nir_intrinsic_shared_atomic_imax: + case nir_intrinsic_shared_atomic_umax: + case nir_intrinsic_shared_atomic_and: + case nir_intrinsic_shared_atomic_or: + case nir_intrinsic_shared_atomic_xor: + case nir_intrinsic_shared_atomic_exchange: + case nir_intrinsic_shared_atomic_comp_swap: + case nir_intrinsic_load_scratch: + type = RegType::vgpr; + break; + case nir_intrinsic_shuffle: + case nir_intrinsic_quad_broadcast: + case nir_intrinsic_quad_swap_horizontal: + case nir_intrinsic_quad_swap_vertical: + case nir_intrinsic_quad_swap_diagonal: + case nir_intrinsic_quad_swizzle_amd: + case nir_intrinsic_masked_swizzle_amd: + case nir_intrinsic_inclusive_scan: + case nir_intrinsic_exclusive_scan: + if (!ctx->divergent_vals[intrinsic->dest.ssa.index]) { + type = RegType::sgpr; + } else if (intrinsic->src[0].ssa->bit_size == 1) { + type = RegType::sgpr; + size = 2; + } else { + type = RegType::vgpr; + } + break; + case nir_intrinsic_load_view_index: + type = ctx->stage == fragment_fs ? RegType::vgpr : RegType::sgpr; + break; + case nir_intrinsic_load_front_face: + case nir_intrinsic_load_helper_invocation: + case nir_intrinsic_is_helper_invocation: + type = RegType::sgpr; + size = 2; + break; + case nir_intrinsic_reduce: + if (nir_intrinsic_cluster_size(intrinsic) == 0 || + !ctx->divergent_vals[intrinsic->dest.ssa.index]) { + type = RegType::sgpr; + } else if (intrinsic->src[0].ssa->bit_size == 1) { + type = RegType::sgpr; + size = 2; + } else { + type = RegType::vgpr; + } + break; + case nir_intrinsic_load_ubo: + case nir_intrinsic_load_ssbo: + case nir_intrinsic_load_global: + type = ctx->divergent_vals[intrinsic->dest.ssa.index] ? 
RegType::vgpr : RegType::sgpr; + break; + /* due to copy propagation, the swizzled imov is removed if num dest components == 1 */ + case nir_intrinsic_load_shared: + if (ctx->divergent_vals[intrinsic->dest.ssa.index]) + type = RegType::vgpr; + else + type = RegType::sgpr; + break; + default: + for (unsigned i = 0; i < nir_intrinsic_infos[intrinsic->intrinsic].num_srcs; i++) { + if (allocated[intrinsic->src[i].ssa->index].type() == RegType::vgpr) + type = RegType::vgpr; + } + break; + } + allocated[intrinsic->dest.ssa.index] = Temp(0, RegClass(type, size)); + + switch(intrinsic->intrinsic) { + case nir_intrinsic_load_barycentric_sample: + case nir_intrinsic_load_barycentric_pixel: + case nir_intrinsic_load_barycentric_centroid: + case nir_intrinsic_load_barycentric_at_sample: + case nir_intrinsic_load_barycentric_at_offset: { + glsl_interp_mode mode = (glsl_interp_mode)nir_intrinsic_interp_mode(intrinsic); + ctx->fs_vgpr_args[get_interp_input(intrinsic->intrinsic, mode)] = true; + break; + } + case nir_intrinsic_load_front_face: + ctx->fs_vgpr_args[fs_input::front_face] = true; + break; + case nir_intrinsic_load_frag_coord: + case nir_intrinsic_load_sample_pos: { + uint8_t mask = nir_ssa_def_components_read(&intrinsic->dest.ssa); + for (unsigned i = 0; i < 4; i++) { + if (mask & (1 << i)) + ctx->fs_vgpr_args[fs_input::frag_pos_0 + i] = true; + + } + break; + } + case nir_intrinsic_load_sample_id: + ctx->fs_vgpr_args[fs_input::ancillary] = true; + break; + case nir_intrinsic_load_sample_mask_in: + ctx->fs_vgpr_args[fs_input::ancillary] = true; + ctx->fs_vgpr_args[fs_input::sample_coverage] = true; + break; + default: + break; + } + break; + } + case nir_instr_type_tex: { + nir_tex_instr* tex = nir_instr_as_tex(instr); + unsigned size = tex->dest.ssa.num_components; + + if (tex->dest.ssa.bit_size == 64) + size *= 2; + if (tex->op == nir_texop_texture_samples) + assert(!ctx->divergent_vals[tex->dest.ssa.index]); + if (ctx->divergent_vals[tex->dest.ssa.index]) + allocated[tex->dest.ssa.index] = Temp(0, RegClass(RegType::vgpr, size)); + else + allocated[tex->dest.ssa.index] = Temp(0, RegClass(RegType::sgpr, size)); + break; + } + case nir_instr_type_parallel_copy: { + nir_foreach_parallel_copy_entry(entry, nir_instr_as_parallel_copy(instr)) { + allocated[entry->dest.ssa.index] = allocated[entry->src.ssa->index]; + } + break; + } + case nir_instr_type_ssa_undef: { + unsigned size = nir_instr_as_ssa_undef(instr)->def.num_components; + if (nir_instr_as_ssa_undef(instr)->def.bit_size == 64) + size *= 2; + allocated[nir_instr_as_ssa_undef(instr)->def.index] = Temp(0, RegClass(RegType::sgpr, size)); + break; + } + case nir_instr_type_phi: { + nir_phi_instr* phi = nir_instr_as_phi(instr); + RegType type; + unsigned size = phi->dest.ssa.num_components; + + if (phi->dest.ssa.bit_size == 1) { + assert(size == 1 && "multiple components not yet supported on boolean phis."); + type = RegType::sgpr; + size *= ctx->divergent_vals[phi->dest.ssa.index] ? 2 : 1; + allocated[phi->dest.ssa.index] = Temp(0, RegClass(type, size)); + break; + } + + if (ctx->divergent_vals[phi->dest.ssa.index]) { + type = RegType::vgpr; + } else { + type = RegType::sgpr; + nir_foreach_phi_src (src, phi) { + if (allocated[src->src.ssa->index].type() == RegType::vgpr) + type = RegType::vgpr; + if (allocated[src->src.ssa->index].type() == RegType::none) + done = false; + } + } + + size *= phi->dest.ssa.bit_size == 64 ? 
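The second switch above only records which hardware fragment-shader input VGPRs the arguments will later have to provide: barycentric intrinsics flag their interpolation mode, load_frag_coord/load_sample_pos flag only the components the shader actually reads, and the sample id/mask loads flag the ancillary and sample-coverage VGPRs. A reduced sketch of the component masking, with illustrative stand-in names for ctx->fs_vgpr_args:
```
#include <array>
#include <cstdint>
#include <cstdio>

/* Illustrative stand-in for the ctx->fs_vgpr_args bookkeeping above. */
enum FsInput { frag_pos_0, frag_pos_1, frag_pos_2, frag_pos_3,
               ancillary, sample_coverage, max_inputs };

static void mark_frag_pos(std::array<bool, max_inputs>& needed, uint8_t read_mask)
{
   /* request only the frag_pos components the shader actually reads */
   for (unsigned i = 0; i < 4; i++) {
      if (read_mask & (1u << i))
         needed[frag_pos_0 + i] = true;
   }
}

int main()
{
   std::array<bool, max_inputs> needed = {};
   mark_frag_pos(needed, 0x9); /* the shader reads .x and .w only */
   printf("x:%d y:%d z:%d w:%d\n", needed[frag_pos_0], needed[frag_pos_1],
          needed[frag_pos_2], needed[frag_pos_3]); /* x:1 y:0 z:0 w:1 */
}
```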
2 : 1; + RegClass rc = RegClass(type, size); + if (rc != allocated[phi->dest.ssa.index].regClass()) { + done = false; + } else { + nir_foreach_phi_src(src, phi) + assert(allocated[src->src.ssa->index].size() == rc.size()); + } + allocated[phi->dest.ssa.index] = Temp(0, rc); + break; + } + default: + break; + } + } + } + } + + for (unsigned i = 0; i < impl->ssa_alloc; i++) + allocated[i] = Temp(ctx->program->allocateId(), allocated[i].regClass()); + + ctx->allocated.reset(allocated.release()); +} + +struct user_sgpr_info { + uint8_t num_sgpr; + uint8_t remaining_sgprs; + uint8_t user_sgpr_idx; + bool need_ring_offsets; + bool indirect_all_descriptor_sets; +}; + +static void allocate_inline_push_consts(isel_context *ctx, + user_sgpr_info& user_sgpr_info) +{ + uint8_t remaining_sgprs = user_sgpr_info.remaining_sgprs; + + /* Only supported if shaders use push constants. */ + if (ctx->program->info->min_push_constant_used == UINT8_MAX) + return; + + /* Only supported if shaders don't have indirect push constants. */ + if (ctx->program->info->has_indirect_push_constants) + return; + + /* Only supported for 32-bit push constants. */ + //TODO: it's possible that some day, the load/store vectorization could make this inaccurate + if (!ctx->program->info->has_only_32bit_push_constants) + return; + + uint8_t num_push_consts = + (ctx->program->info->max_push_constant_used - + ctx->program->info->min_push_constant_used) / 4; + + /* Check if the number of user SGPRs is large enough. */ + if (num_push_consts < remaining_sgprs) { + ctx->program->info->num_inline_push_consts = num_push_consts; + } else { + ctx->program->info->num_inline_push_consts = remaining_sgprs; + } + + /* Clamp to the maximum number of allowed inlined push constants. */ + if (ctx->program->info->num_inline_push_consts > MAX_INLINE_PUSH_CONSTS) + ctx->program->info->num_inline_push_consts = MAX_INLINE_PUSH_CONSTS; + + if (ctx->program->info->num_inline_push_consts == num_push_consts && + !ctx->program->info->loads_dynamic_offsets) { + /* Disable the default push constants path if all constants are + * inlined and if shaders don't use dynamic descriptors. + */ + ctx->program->info->loads_push_constants = false; + user_sgpr_info.num_sgpr--; + user_sgpr_info.remaining_sgprs++; + } + + ctx->program->info->base_inline_push_consts = + ctx->program->info->min_push_constant_used / 4; + + user_sgpr_info.num_sgpr += ctx->program->info->num_inline_push_consts; + user_sgpr_info.remaining_sgprs -= ctx->program->info->num_inline_push_consts; +} + +static void allocate_user_sgprs(isel_context *ctx, + bool needs_view_index, user_sgpr_info& user_sgpr_info) +{ + memset(&user_sgpr_info, 0, sizeof(struct user_sgpr_info)); + uint32_t user_sgpr_count = 0; + + /* until we sort out scratch/global buffers always assign ring offsets for gs/vs/es */ + if (ctx->stage != fragment_fs && + ctx->stage != compute_cs + /*|| ctx->is_gs_copy_shader */) + user_sgpr_info.need_ring_offsets = true; + + if (ctx->stage == fragment_fs && + ctx->program->info->ps.needs_sample_positions) + user_sgpr_info.need_ring_offsets = true; + + /* 2 user sgprs will nearly always be allocated for scratch/rings */ + if (ctx->options->supports_spill || user_sgpr_info.need_ring_offsets || ctx->scratch_enabled) + user_sgpr_count += 2; + + switch (ctx->stage) { + case vertex_vs: + /* if (!ctx->is_gs_copy_shader) */ { + if (ctx->program->info->vs.has_vertex_buffers) + user_sgpr_count++; + user_sgpr_count += ctx->program->info->vs.needs_draw_id ? 
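allocate_inline_push_consts() above promotes a contiguous range of 32-bit push constants into user SGPRs when they fit into what is left of the SGPR budget, clamps the count to MAX_INLINE_PUSH_CONSTS, and drops the push-constant pointer altogether if everything was inlined and no dynamic offsets are loaded. A stand-alone sketch of the sizing arithmetic; MAX_INLINE_PUSH_CONSTS_SKETCH and the other names are placeholders, not the real constants:
```
#include <algorithm>
#include <cstdint>
#include <cstdio>

/* Stand-alone sketch of the inline-push-constant sizing above.
 * MAX_INLINE_PUSH_CONSTS_SKETCH is an assumed placeholder value. */
constexpr uint8_t MAX_INLINE_PUSH_CONSTS_SKETCH = 8;

struct InlinePushConsts {
   uint8_t count; /* how many dwords get their own user SGPR */
   uint8_t base;  /* first inlined push constant, in dwords */
};

static InlinePushConsts
plan_inline_push_consts(uint8_t min_used, uint8_t max_used, uint8_t remaining_sgprs)
{
   uint8_t num = (max_used - min_used) / 4;            /* dwords in the used byte range */
   num = std::min(num, remaining_sgprs);               /* must fit the SGPR budget */
   num = std::min(num, MAX_INLINE_PUSH_CONSTS_SKETCH); /* and the overall cap */
   return { num, (uint8_t)(min_used / 4) };
}

int main()
{
   /* push-constant bytes [16, 48) are used, 20 user SGPRs are still free */
   InlinePushConsts p = plan_inline_push_consts(16, 48, 20);
   printf("count=%u base=%u\n", (unsigned)p.count, (unsigned)p.base); /* count=8 base=4 */
}
```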
3 : 2; + } + break; + case fragment_fs: + //user_sgpr_count += ctx->program->info->ps.needs_sample_positions; + break; + case compute_cs: + if (ctx->program->info->cs.uses_grid_size) + user_sgpr_count += 3; + break; + default: + unreachable("Shader stage not implemented"); + } + + if (needs_view_index) + user_sgpr_count++; + + if (ctx->program->info->loads_push_constants) + user_sgpr_count += 1; /* we use 32bit pointers */ + + if (ctx->program->info->so.num_outputs) + user_sgpr_count += 1; /* we use 32bit pointers */ + + uint32_t available_sgprs = ctx->options->chip_class >= GFX9 && !(ctx->stage & hw_cs) ? 32 : 16; + uint32_t remaining_sgprs = available_sgprs - user_sgpr_count; + uint32_t num_desc_set = util_bitcount(ctx->program->info->desc_set_used_mask); + + if (available_sgprs < user_sgpr_count + num_desc_set) { + user_sgpr_info.indirect_all_descriptor_sets = true; + user_sgpr_info.num_sgpr = user_sgpr_count + 1; + user_sgpr_info.remaining_sgprs = remaining_sgprs - 1; + } else { + user_sgpr_info.num_sgpr = user_sgpr_count + num_desc_set; + user_sgpr_info.remaining_sgprs = remaining_sgprs - num_desc_set; + } + + allocate_inline_push_consts(ctx, user_sgpr_info); +} + +#define MAX_ARGS 64 +struct arg_info { + RegClass types[MAX_ARGS]; + Temp *assign[MAX_ARGS]; + PhysReg reg[MAX_ARGS]; + unsigned array_params_mask; + uint8_t count; + uint8_t sgpr_count; + uint8_t num_sgprs_used; + uint8_t num_vgprs_used; +}; + +static void +add_arg(arg_info *info, RegClass rc, Temp *param_ptr, unsigned reg) +{ + assert(info->count < MAX_ARGS); + + info->assign[info->count] = param_ptr; + info->types[info->count] = rc; + + if (rc.type() == RegType::sgpr) { + info->num_sgprs_used += rc.size(); + info->sgpr_count++; + info->reg[info->count] = PhysReg{reg}; + } else { + assert(rc.type() == RegType::vgpr); + info->num_vgprs_used += rc.size(); + info->reg[info->count] = PhysReg{reg + 256}; + } + info->count++; +} + +static void +set_loc(struct radv_userdata_info *ud_info, uint8_t *sgpr_idx, uint8_t num_sgprs) +{ + ud_info->sgpr_idx = *sgpr_idx; + ud_info->num_sgprs = num_sgprs; + *sgpr_idx += num_sgprs; +} + +static void +set_loc_shader(isel_context *ctx, int idx, uint8_t *sgpr_idx, + uint8_t num_sgprs) +{ + struct radv_userdata_info *ud_info = &ctx->program->info->user_sgprs_locs.shader_data[idx]; + assert(ud_info); + + set_loc(ud_info, sgpr_idx, num_sgprs); +} + +static void +set_loc_shader_ptr(isel_context *ctx, int idx, uint8_t *sgpr_idx) +{ + bool use_32bit_pointers = idx != AC_UD_SCRATCH_RING_OFFSETS; + + set_loc_shader(ctx, idx, sgpr_idx, use_32bit_pointers ? 
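allocate_user_sgprs() works against a fixed budget (16 user SGPRs, or 32 on GFX9+ outside compute) and falls back to a single indirect descriptor-set pointer when the enabled sets no longer fit next to the stage's own user data. A condensed stand-alone sketch of that decision; all names are illustrative:
```
#include <cstdint>
#include <cstdio>

/* Stand-alone sketch of the descriptor-set budgeting in allocate_user_sgprs(). */
struct UserSgprPlan {
   uint8_t num_sgpr;        /* user SGPRs actually consumed */
   uint8_t remaining_sgprs; /* left over, e.g. for inline push constants */
   bool indirect_all_descriptor_sets;
};

static UserSgprPlan plan_user_sgprs(uint32_t available, uint32_t used_by_stage,
                                    uint32_t num_desc_sets)
{
   uint32_t remaining = available - used_by_stage;
   if (available < used_by_stage + num_desc_sets) {
      /* all sets go behind one indirect pointer (1 SGPR) */
      return { (uint8_t)(used_by_stage + 1), (uint8_t)(remaining - 1), true };
   }
   /* one SGPR per enabled descriptor set */
   return { (uint8_t)(used_by_stage + num_desc_sets),
            (uint8_t)(remaining - num_desc_sets), false };
}

int main()
{
   UserSgprPlan p = plan_user_sgprs(16, 10, 8); /* 8 sets don't fit into 6 free SGPRs */
   printf("indirect=%d num=%u remaining=%u\n", p.indirect_all_descriptor_sets,
          (unsigned)p.num_sgpr, (unsigned)p.remaining_sgprs);
   /* indirect=1 num=11 remaining=5 */
}
```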
1 : 2); +} + +static void +set_loc_desc(isel_context *ctx, int idx, uint8_t *sgpr_idx) +{ + struct radv_userdata_locations *locs = &ctx->program->info->user_sgprs_locs; + struct radv_userdata_info *ud_info = &locs->descriptor_sets[idx]; + assert(ud_info); + + set_loc(ud_info, sgpr_idx, 1); + locs->descriptor_sets_enabled |= 1 << idx; +} + +static void +declare_global_input_sgprs(isel_context *ctx, + /* bool has_previous_stage, gl_shader_stage previous_stage, */ + user_sgpr_info *user_sgpr_info, + struct arg_info *args, + Temp *desc_sets) +{ + /* 1 for each descriptor set */ + if (!user_sgpr_info->indirect_all_descriptor_sets) { + uint32_t mask = ctx->program->info->desc_set_used_mask; + while (mask) { + int i = u_bit_scan(&mask); + add_arg(args, s1, &desc_sets[i], user_sgpr_info->user_sgpr_idx); + set_loc_desc(ctx, i, &user_sgpr_info->user_sgpr_idx); + } + /* NIR->LLVM might have set this to true if RADV_DEBUG=compiletime */ + ctx->program->info->need_indirect_descriptor_sets = false; + } else { + add_arg(args, s1, desc_sets, user_sgpr_info->user_sgpr_idx); + set_loc_shader_ptr(ctx, AC_UD_INDIRECT_DESCRIPTOR_SETS, &user_sgpr_info->user_sgpr_idx); + ctx->program->info->need_indirect_descriptor_sets = true; + } + + if (ctx->program->info->loads_push_constants) { + /* 1 for push constants and dynamic descriptors */ + add_arg(args, s1, &ctx->push_constants, user_sgpr_info->user_sgpr_idx); + set_loc_shader_ptr(ctx, AC_UD_PUSH_CONSTANTS, &user_sgpr_info->user_sgpr_idx); + } + + if (ctx->program->info->num_inline_push_consts) { + unsigned count = ctx->program->info->num_inline_push_consts; + for (unsigned i = 0; i < count; i++) + add_arg(args, s1, &ctx->inline_push_consts[i], user_sgpr_info->user_sgpr_idx + i); + set_loc_shader(ctx, AC_UD_INLINE_PUSH_CONSTANTS, &user_sgpr_info->user_sgpr_idx, count); + + ctx->num_inline_push_consts = ctx->program->info->num_inline_push_consts; + ctx->base_inline_push_consts = ctx->program->info->base_inline_push_consts; + } + + if (ctx->program->info->so.num_outputs) { + add_arg(args, s1, &ctx->streamout_buffers, user_sgpr_info->user_sgpr_idx); + set_loc_shader_ptr(ctx, AC_UD_STREAMOUT_BUFFERS, &user_sgpr_info->user_sgpr_idx); + } +} + +static void +declare_vs_input_vgprs(isel_context *ctx, struct arg_info *args) +{ + unsigned vgpr_idx = 0; + add_arg(args, v1, &ctx->vertex_id, vgpr_idx++); +/* if (!ctx->is_gs_copy_shader) */ { + if (ctx->options->key.vs.out.as_ls) { + add_arg(args, v1, &ctx->rel_auto_id, vgpr_idx++); + add_arg(args, v1, &ctx->instance_id, vgpr_idx++); + } else { + add_arg(args, v1, &ctx->instance_id, vgpr_idx++); + add_arg(args, v1, &ctx->vs_prim_id, vgpr_idx++); + } + add_arg(args, v1, NULL, vgpr_idx); /* unused */ + } +} + +static void +declare_streamout_sgprs(isel_context *ctx, struct arg_info *args, unsigned *idx) +{ + /* Streamout SGPRs. */ + if (ctx->program->info->so.num_outputs) { + assert(ctx->stage & hw_vs); + + if (ctx->stage != tess_eval_vs) { + add_arg(args, s1, &ctx->streamout_config, (*idx)++); + } else { + args->assign[args->count - 1] = &ctx->streamout_config; + args->types[args->count - 1] = s1; + } + + add_arg(args, s1, &ctx->streamout_write_idx, (*idx)++); + } + + /* A streamout buffer offset is loaded if the stride is non-zero. 
*/ + for (unsigned i = 0; i < 4; i++) { + if (!ctx->program->info->so.strides[i]) + continue; + + add_arg(args, s1, &ctx->streamout_offset[i], (*idx)++); + } +} + +static bool needs_view_index_sgpr(isel_context *ctx) +{ + switch (ctx->stage) { + case vertex_vs: + return ctx->program->info->needs_multiview_view_index || ctx->options->key.has_multiview_view_index; + case tess_eval_vs: + return ctx->program->info->needs_multiview_view_index && ctx->options->key.has_multiview_view_index; + case vertex_ls: + case vertex_tess_control_ls: + case vertex_geometry_es: + case tess_control_hs: + case tess_eval_es: + case tess_eval_geometry_es: + case geometry_gs: + return ctx->program->info->needs_multiview_view_index; + default: + return false; + } +} + +static inline bool +add_fs_arg(isel_context *ctx, arg_info *args, unsigned &vgpr_idx, fs_input input, unsigned value, bool enable_next = false, RegClass rc = v1) +{ + if (!ctx->fs_vgpr_args[input]) + return false; + + add_arg(args, rc, &ctx->fs_inputs[input], vgpr_idx); + vgpr_idx += rc.size(); + + if (enable_next) { + add_arg(args, rc, &ctx->fs_inputs[input + 1], vgpr_idx); + vgpr_idx += rc.size(); + } + + ctx->program->config->spi_ps_input_addr |= value; + ctx->program->config->spi_ps_input_ena |= value; + return true; +} + +void add_startpgm(struct isel_context *ctx) +{ + user_sgpr_info user_sgpr_info; + bool needs_view_index = needs_view_index_sgpr(ctx); + allocate_user_sgprs(ctx, needs_view_index, user_sgpr_info); + arg_info args = {}; + + /* this needs to be in sgprs 0 and 1 */ + if (ctx->options->supports_spill || user_sgpr_info.need_ring_offsets || ctx->scratch_enabled) { + add_arg(&args, s2, &ctx->private_segment_buffer, 0); + set_loc_shader_ptr(ctx, AC_UD_SCRATCH_RING_OFFSETS, &user_sgpr_info.user_sgpr_idx); + } + + unsigned vgpr_idx = 0; + switch (ctx->stage) { + case vertex_vs: { + declare_global_input_sgprs(ctx, &user_sgpr_info, &args, ctx->descriptor_sets); + if (ctx->program->info->vs.has_vertex_buffers) { + add_arg(&args, s1, &ctx->vertex_buffers, user_sgpr_info.user_sgpr_idx); + set_loc_shader_ptr(ctx, AC_UD_VS_VERTEX_BUFFERS, &user_sgpr_info.user_sgpr_idx); + } + add_arg(&args, s1, &ctx->base_vertex, user_sgpr_info.user_sgpr_idx); + add_arg(&args, s1, &ctx->start_instance, user_sgpr_info.user_sgpr_idx + 1); + if (ctx->program->info->vs.needs_draw_id) { + add_arg(&args, s1, &ctx->draw_id, user_sgpr_info.user_sgpr_idx + 2); + set_loc_shader(ctx, AC_UD_VS_BASE_VERTEX_START_INSTANCE, &user_sgpr_info.user_sgpr_idx, 3); + } else + set_loc_shader(ctx, AC_UD_VS_BASE_VERTEX_START_INSTANCE, &user_sgpr_info.user_sgpr_idx, 2); + + if (needs_view_index) { + add_arg(&args, s1, &ctx->view_index, user_sgpr_info.user_sgpr_idx); + set_loc_shader(ctx, AC_UD_VIEW_INDEX, &user_sgpr_info.user_sgpr_idx, 1); + } + + assert(user_sgpr_info.user_sgpr_idx == user_sgpr_info.num_sgpr); + unsigned idx = user_sgpr_info.user_sgpr_idx; + if (ctx->options->key.vs.out.as_es) + add_arg(&args, s1, &ctx->es2gs_offset, idx++); + else + declare_streamout_sgprs(ctx, &args, &idx); + + if (ctx->scratch_enabled) + add_arg(&args, s1, &ctx->scratch_offset, idx++); + + declare_vs_input_vgprs(ctx, &args); + break; + } + case fragment_fs: { + declare_global_input_sgprs(ctx, &user_sgpr_info, &args, ctx->descriptor_sets); + + assert(user_sgpr_info.user_sgpr_idx == user_sgpr_info.num_sgpr); + add_arg(&args, s1, &ctx->prim_mask, user_sgpr_info.user_sgpr_idx); + + if (ctx->scratch_enabled) + add_arg(&args, s1, &ctx->scratch_offset, user_sgpr_info.user_sgpr_idx + 1); + + 
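add_fs_arg() above does two things per flagged fragment input: it claims the VGPR(s) for it (optionally also the following input, i.e. the second half of an I/J barycentric pair, and v3 for the pull-model barycentrics) and ORs the matching enable bit into SPI_PS_INPUT_ADDR/ENA. A reduced stand-alone sketch with placeholder names and an assumed bit position:
```
#include <cstdint>
#include <cstdio>
#include <vector>

/* Illustrative stand-in for add_fs_arg(): a flagged fragment input claims one
 * or more VGPRs, optionally the following input too (the J component of an
 * I/J pair), and ORs its enable bit into SPI_PS_INPUT_ADDR/ENA. */
struct FsArgState {
   unsigned vgpr_idx = 0;
   uint32_t spi_ps_input = 0;
   std::vector<unsigned> declared; /* which inputs got a VGPR */
};

static bool add_fs_arg_sketch(FsArgState& s, bool needed, unsigned input,
                              uint32_t enable_bit, bool enable_next = false,
                              unsigned size = 1)
{
   if (!needed)
      return false;
   s.declared.push_back(input);
   s.vgpr_idx += size;
   if (enable_next) {            /* e.g. persp_center_p1 also claims ..._p2 */
      s.declared.push_back(input + 1);
      s.vgpr_idx += size;
   }
   s.spi_ps_input |= enable_bit;
   return true;
}

int main()
{
   FsArgState s;
   /* persp_center I/J pair, assuming PERSP_CENTER_ENA is bit 1 */
   add_fs_arg_sketch(s, true, /*persp_center_p1=*/2, 1u << 1, true);
   printf("inputs declared: %zu, vgprs used: %u, spi bits: 0x%x\n",
          s.declared.size(), s.vgpr_idx, s.spi_ps_input); /* 2, 2, 0x2 */
}
```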
ctx->program->config->spi_ps_input_addr = 0; + ctx->program->config->spi_ps_input_ena = 0; + + bool has_interp_mode = false; + + has_interp_mode |= add_fs_arg(ctx, &args, vgpr_idx, fs_input::persp_sample_p1, S_0286CC_PERSP_SAMPLE_ENA(1), true); + has_interp_mode |= add_fs_arg(ctx, &args, vgpr_idx, fs_input::persp_center_p1, S_0286CC_PERSP_CENTER_ENA(1), true); + has_interp_mode |= add_fs_arg(ctx, &args, vgpr_idx, fs_input::persp_centroid_p1, S_0286CC_PERSP_CENTROID_ENA(1), true); + has_interp_mode |= add_fs_arg(ctx, &args, vgpr_idx, fs_input::persp_pull_model, S_0286CC_PERSP_PULL_MODEL_ENA(1), false, v3); + + if (!has_interp_mode && ctx->fs_vgpr_args[fs_input::frag_pos_3]) { + /* If POS_W_FLOAT (11) is enabled, at least one of PERSP_* must be enabled too */ + ctx->fs_vgpr_args[fs_input::persp_center_p1] = true; + has_interp_mode = add_fs_arg(ctx, &args, vgpr_idx, fs_input::persp_center_p1, S_0286CC_PERSP_CENTER_ENA(1), true); + } + + has_interp_mode |= add_fs_arg(ctx, &args, vgpr_idx, fs_input::linear_sample_p1, S_0286CC_LINEAR_SAMPLE_ENA(1), true); + has_interp_mode |= add_fs_arg(ctx, &args, vgpr_idx, fs_input::linear_center_p1, S_0286CC_LINEAR_CENTER_ENA(1), true); + has_interp_mode |= add_fs_arg(ctx, &args, vgpr_idx, fs_input::linear_centroid_p1, S_0286CC_LINEAR_CENTROID_ENA(1), true); + has_interp_mode |= add_fs_arg(ctx, &args, vgpr_idx, fs_input::line_stipple, S_0286CC_LINE_STIPPLE_TEX_ENA(1)); + + if (!has_interp_mode) { + /* At least one of PERSP_* (0xF) or LINEAR_* (0x70) must be enabled */ + ctx->fs_vgpr_args[fs_input::persp_center_p1] = true; + has_interp_mode = add_fs_arg(ctx, &args, vgpr_idx, fs_input::persp_center_p1, S_0286CC_PERSP_CENTER_ENA(1), true); + } + + add_fs_arg(ctx, &args, vgpr_idx, fs_input::frag_pos_0, S_0286CC_POS_X_FLOAT_ENA(1)); + add_fs_arg(ctx, &args, vgpr_idx, fs_input::frag_pos_1, S_0286CC_POS_Y_FLOAT_ENA(1)); + add_fs_arg(ctx, &args, vgpr_idx, fs_input::frag_pos_2, S_0286CC_POS_Z_FLOAT_ENA(1)); + add_fs_arg(ctx, &args, vgpr_idx, fs_input::frag_pos_3, S_0286CC_POS_W_FLOAT_ENA(1)); + + add_fs_arg(ctx, &args, vgpr_idx, fs_input::front_face, S_0286CC_FRONT_FACE_ENA(1)); + add_fs_arg(ctx, &args, vgpr_idx, fs_input::ancillary, S_0286CC_ANCILLARY_ENA(1)); + add_fs_arg(ctx, &args, vgpr_idx, fs_input::sample_coverage, S_0286CC_SAMPLE_COVERAGE_ENA(1)); + add_fs_arg(ctx, &args, vgpr_idx, fs_input::fixed_pt, S_0286CC_POS_FIXED_PT_ENA(1)); + + ASSERTED bool unset_interp_mode = !(ctx->program->config->spi_ps_input_addr & 0x7F) || + (G_0286CC_POS_W_FLOAT_ENA(ctx->program->config->spi_ps_input_addr) + && !(ctx->program->config->spi_ps_input_addr & 0xF)); + + assert(has_interp_mode); + assert(!unset_interp_mode); + break; + } + case compute_cs: { + declare_global_input_sgprs(ctx, &user_sgpr_info, &args, ctx->descriptor_sets); + + if (ctx->program->info->cs.uses_grid_size) { + add_arg(&args, s1, &ctx->num_workgroups[0], user_sgpr_info.user_sgpr_idx); + add_arg(&args, s1, &ctx->num_workgroups[1], user_sgpr_info.user_sgpr_idx + 1); + add_arg(&args, s1, &ctx->num_workgroups[2], user_sgpr_info.user_sgpr_idx + 2); + set_loc_shader(ctx, AC_UD_CS_GRID_SIZE, &user_sgpr_info.user_sgpr_idx, 3); + } + assert(user_sgpr_info.user_sgpr_idx == user_sgpr_info.num_sgpr); + unsigned idx = user_sgpr_info.user_sgpr_idx; + for (unsigned i = 0; i < 3; i++) { + if (ctx->program->info->cs.uses_block_id[i]) + add_arg(&args, s1, &ctx->workgroup_ids[i], idx++); + } + + if (ctx->program->info->cs.uses_local_invocation_idx) + add_arg(&args, s1, &ctx->tg_size, idx++); + if (ctx->scratch_enabled) + 
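The fragment-shader argument setup above also encodes two hardware rules that the trailing assertions check: at least one PERSP_*/LINEAR_* interpolation mode must be enabled, and enabling POS_W_FLOAT additionally requires some PERSP_* mode. A stand-alone checker restating those rules, using the bit positions given in the comments above:
```
#include <cstdint>
#include <cstdio>

/* Stand-alone restatement of the two rules asserted above for
 * SPI_PS_INPUT_ADDR/ENA, with the bit layout taken from the comments:
 * PERSP_* = bits 0-3 (0xF), LINEAR_* = bits 4-6 (0x70), POS_W_FLOAT = bit 11. */
static bool spi_ps_input_valid(uint32_t addr)
{
   const uint32_t persp_mask  = 0x0F;   /* PERSP_SAMPLE/CENTER/CENTROID/PULL_MODEL */
   const uint32_t interp_mask = 0x7F;   /* any PERSP_* (0xF) or LINEAR_* (0x70) mode */
   const uint32_t pos_w_float = 1u << 11;

   if (!(addr & interp_mask))
      return false;                     /* at least one interp mode must be enabled */
   if ((addr & pos_w_float) && !(addr & persp_mask))
      return false;                     /* POS_W_FLOAT additionally needs a PERSP_* mode */
   return true;
}

int main()
{
   printf("%d\n", spi_ps_input_valid(1u << 11));         /* 0: POS_W without any PERSP_* */
   printf("%d\n", spi_ps_input_valid((1u << 11) | 0x2)); /* 1: POS_W + PERSP_CENTER */
   printf("%d\n", spi_ps_input_valid(0x10));             /* 1: LINEAR_SAMPLE alone */
}
```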
add_arg(&args, s1, &ctx->scratch_offset, idx++); + + add_arg(&args, v1, &ctx->local_invocation_ids[0], vgpr_idx++); + add_arg(&args, v1, &ctx->local_invocation_ids[1], vgpr_idx++); + add_arg(&args, v1, &ctx->local_invocation_ids[2], vgpr_idx++); + break; + } + default: + unreachable("Shader stage not implemented"); + } + + ctx->program->info->num_input_vgprs = 0; + ctx->program->info->num_input_sgprs = args.num_sgprs_used; + ctx->program->info->num_user_sgprs = user_sgpr_info.num_sgpr; + ctx->program->info->num_input_vgprs = args.num_vgprs_used; + + aco_ptr<Pseudo_instruction> startpgm{create_instruction<Pseudo_instruction>(aco_opcode::p_startpgm, Format::PSEUDO, 0, args.count + 1)}; + for (unsigned i = 0; i < args.count; i++) { + if (args.assign[i]) { + *args.assign[i] = Temp{ctx->program->allocateId(), args.types[i]}; + startpgm->definitions[i] = Definition(*args.assign[i]); + startpgm->definitions[i].setFixed(args.reg[i]); + } + } + startpgm->definitions[args.count] = Definition{ctx->program->allocateId(), exec, s2}; + ctx->block->instructions.push_back(std::move(startpgm)); +} + +int +type_size(const struct glsl_type *type, bool bindless) +{ + // TODO: don't we need type->std430_base_alignment() here? + return glsl_count_attribute_slots(type, false); +} + +void +shared_var_info(const struct glsl_type *type, unsigned *size, unsigned *align) +{ + assert(glsl_type_is_vector_or_scalar(type)); + + uint32_t comp_size = glsl_type_is_boolean(type) + ? 4 : glsl_get_bit_size(type) / 8; + unsigned length = glsl_get_vector_elements(type); + *size = comp_size * length, + *align = comp_size; +} + +int +get_align(nir_variable_mode mode, bool is_store, unsigned bit_size, unsigned num_components) +{ + /* TODO: ACO doesn't have good support for non-32-bit reads/writes yet */ + if (bit_size != 32) + return -1; + + switch (mode) { + case nir_var_mem_ubo: + case nir_var_mem_ssbo: + //case nir_var_mem_push_const: enable with 1240! + case nir_var_mem_shared: + /* TODO: what are the alignment requirements for LDS? */ + return num_components <= 4 ? 
4 : -1; + default: + return -1; + } +} + +void +setup_vs_variables(isel_context *ctx, nir_shader *nir) +{ + nir_foreach_variable(variable, &nir->inputs) + { + variable->data.driver_location = variable->data.location * 4; + } + nir_foreach_variable(variable, &nir->outputs) + { + variable->data.driver_location = variable->data.location * 4; + } + + radv_vs_output_info *outinfo = &ctx->program->info->vs.outinfo; + + memset(outinfo->vs_output_param_offset, AC_EXP_PARAM_UNDEFINED, + sizeof(outinfo->vs_output_param_offset)); + + ctx->needs_instance_id = ctx->program->info->vs.needs_instance_id; + + bool export_clip_dists = ctx->options->key.vs_common_out.export_clip_dists; + + outinfo->param_exports = 0; + int pos_written = 0x1; + if (outinfo->writes_pointsize || outinfo->writes_viewport_index || outinfo->writes_layer) + pos_written |= 1 << 1; + + nir_foreach_variable(variable, &nir->outputs) + { + int idx = variable->data.location; + unsigned slots = variable->type->count_attribute_slots(false); + if (variable->data.compact) { + unsigned component_count = variable->data.location_frac + variable->type->length; + slots = (component_count + 3) / 4; + } + + if (idx >= VARYING_SLOT_VAR0 || idx == VARYING_SLOT_LAYER || idx == VARYING_SLOT_PRIMITIVE_ID || + ((idx == VARYING_SLOT_CLIP_DIST0 || idx == VARYING_SLOT_CLIP_DIST1) && export_clip_dists)) { + for (unsigned i = 0; i < slots; i++) { + if (outinfo->vs_output_param_offset[idx + i] == AC_EXP_PARAM_UNDEFINED) + outinfo->vs_output_param_offset[idx + i] = outinfo->param_exports++; + } + } + } + if (outinfo->writes_layer && + outinfo->vs_output_param_offset[VARYING_SLOT_LAYER] == AC_EXP_PARAM_UNDEFINED) { + /* when ctx->options->key.has_multiview_view_index = true, the layer + * variable isn't declared in NIR and it's isel's job to get the layer */ + outinfo->vs_output_param_offset[VARYING_SLOT_LAYER] = outinfo->param_exports++; + } + + if (outinfo->export_prim_id) { + assert(outinfo->vs_output_param_offset[VARYING_SLOT_PRIMITIVE_ID] == AC_EXP_PARAM_UNDEFINED); + outinfo->vs_output_param_offset[VARYING_SLOT_PRIMITIVE_ID] = outinfo->param_exports++; + } + + ctx->num_clip_distances = util_bitcount(outinfo->clip_dist_mask); + ctx->num_cull_distances = util_bitcount(outinfo->cull_dist_mask); + + assert(ctx->num_clip_distances + ctx->num_cull_distances <= 8); + + if (ctx->num_clip_distances + ctx->num_cull_distances > 0) + pos_written |= 1 << 2; + if (ctx->num_clip_distances + ctx->num_cull_distances > 4) + pos_written |= 1 << 3; + + outinfo->pos_exports = util_bitcount(pos_written); +} + +void +setup_variables(isel_context *ctx, nir_shader *nir) +{ + switch (nir->info.stage) { + case MESA_SHADER_FRAGMENT: { + nir_foreach_variable(variable, &nir->outputs) + { + int idx = variable->data.location + variable->data.index; + variable->data.driver_location = idx * 4; + } + break; + } + case MESA_SHADER_COMPUTE: { + unsigned lds_allocation_size_unit = 4 * 64; + if (ctx->program->chip_class >= GFX7) + lds_allocation_size_unit = 4 * 128; + ctx->program->config->lds_size = (nir->info.cs.shared_size + lds_allocation_size_unit - 1) / lds_allocation_size_unit; + break; + } + case MESA_SHADER_VERTEX: { + setup_vs_variables(ctx, nir); + break; + } + default: + unreachable("Unhandled shader stage."); + } +} + +isel_context +setup_isel_context(Program* program, + unsigned shader_count, + struct nir_shader *const *shaders, + ac_shader_config* config, + radv_shader_info *info, + radv_nir_compiler_options *options) +{ + program->stage = 0; + for (unsigned i = 0; i < 
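setup_variables() above programs config->lds_size in allocation granules rather than bytes: 64 dwords (256 bytes) per granule by default and 128 dwords (512 bytes) from GFX7 on, rounded up. A stand-alone sketch of that conversion:
```
#include <cstdio>

/* Stand-alone sketch of the LDS size computation above. */
static unsigned lds_size_granules(unsigned shared_size_bytes, bool gfx7_plus)
{
   unsigned granule = gfx7_plus ? 4 * 128 : 4 * 64;
   return (shared_size_bytes + granule - 1) / granule; /* round up */
}

int main()
{
   printf("%u\n", lds_size_granules(1, true));     /* 1 granule for any non-zero use */
   printf("%u\n", lds_size_granules(4096, true));  /* 8 granules of 512 bytes */
   printf("%u\n", lds_size_granules(4096, false)); /* 16 granules of 256 bytes */
}
```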
shader_count; i++) { + switch (shaders[i]->info.stage) { + case MESA_SHADER_VERTEX: + program->stage |= sw_vs; + break; + case MESA_SHADER_TESS_CTRL: + program->stage |= sw_tcs; + break; + case MESA_SHADER_TESS_EVAL: + program->stage |= sw_tes; + break; + case MESA_SHADER_GEOMETRY: + program->stage |= sw_gs; + break; + case MESA_SHADER_FRAGMENT: + program->stage |= sw_fs; + break; + case MESA_SHADER_COMPUTE: + program->stage |= sw_cs; + break; + default: + unreachable("Shader stage not implemented"); + } + } + if (program->stage == sw_vs) + program->stage |= hw_vs; + else if (program->stage == sw_fs) + program->stage |= hw_fs; + else if (program->stage == sw_cs) + program->stage |= hw_cs; + else + unreachable("Shader stage not implemented"); + + program->config = config; + program->info = info; + program->chip_class = options->chip_class; + program->family = options->family; + program->sgpr_limit = options->chip_class >= GFX8 ? 102 : 104; + if (options->family == CHIP_TONGA || options->family == CHIP_ICELAND) + program->sgpr_limit = 94; /* workaround hardware bug */ + + for (unsigned i = 0; i < MAX_SETS; ++i) + program->info->user_sgprs_locs.descriptor_sets[i].sgpr_idx = -1; + for (unsigned i = 0; i < AC_UD_MAX_UD; ++i) + program->info->user_sgprs_locs.shader_data[i].sgpr_idx = -1; + + isel_context ctx = {}; + ctx.program = program; + ctx.options = options; + ctx.stage = program->stage; + + for (unsigned i = 0; i < fs_input::max_inputs; ++i) + ctx.fs_inputs[i] = Temp(0, v1); + ctx.fs_inputs[fs_input::persp_pull_model] = Temp(0, v3); + for (unsigned i = 0; i < MAX_SETS; ++i) + ctx.descriptor_sets[i] = Temp(0, s1); + for (unsigned i = 0; i < MAX_INLINE_PUSH_CONSTS; ++i) + ctx.inline_push_consts[i] = Temp(0, s1); + for (unsigned i = 0; i <= VARYING_SLOT_VAR31; ++i) { + for (unsigned j = 0; j < 4; ++j) + ctx.vs_output.outputs[i][j] = Temp(0, v1); + } + + for (unsigned i = 0; i < shader_count; i++) { + nir_shader *nir = shaders[i]; + + /* align and copy constant data */ + while (program->constant_data.size() % 4u) + program->constant_data.push_back(0); + ctx.constant_data_offset = program->constant_data.size(); + program->constant_data.insert(program->constant_data.end(), + (uint8_t*)nir->constant_data, + (uint8_t*)nir->constant_data + nir->constant_data_size); + + /* the variable setup has to be done before lower_io / CSE */ + if (nir->info.stage == MESA_SHADER_COMPUTE) + nir_lower_vars_to_explicit_types(nir, nir_var_mem_shared, shared_var_info); + setup_variables(&ctx, nir); + + /* optimize and lower memory operations */ + bool lower_to_scalar = false; + bool lower_pack = false; + // TODO: uncomment this once !1240 is merged + /*if (nir_opt_load_store_vectorize(nir, + (nir_variable_mode)(nir_var_mem_ssbo | nir_var_mem_ubo | + nir_var_mem_push_const | nir_var_mem_shared), + get_align)) { + lower_to_scalar = true; + lower_pack = true; + }*/ + if (nir->info.stage == MESA_SHADER_COMPUTE) + lower_to_scalar |= nir_lower_explicit_io(nir, nir_var_mem_shared, nir_address_format_32bit_offset); + else + nir_lower_io(nir, (nir_variable_mode)(nir_var_shader_in | nir_var_shader_out), type_size, (nir_lower_io_options)0); + nir_lower_explicit_io(nir, nir_var_mem_global, nir_address_format_64bit_global); + + if (lower_to_scalar) + nir_lower_alu_to_scalar(nir, NULL, NULL); + if (lower_pack) + nir_lower_pack(nir); + + /* lower ALU operations */ + nir_opt_idiv_const(nir, 32); + nir_lower_idiv(nir); // TODO: use the LLVM path once !1239 is merged + + // TODO: implement logic64 in aco, it's more effective for 
sgprs + nir_lower_int64(nir, (nir_lower_int64_options) (nir_lower_imul64 | + nir_lower_imul_high64 | + nir_lower_imul_2x32_64 | + nir_lower_divmod64 | + nir_lower_logic64 | + nir_lower_minmax64 | + nir_lower_iabs64 | + nir_lower_ineg64)); + + /* optimize the lowered ALU operations */ + nir_copy_prop(nir); + nir_opt_constant_folding(nir); + nir_opt_algebraic(nir); + nir_opt_algebraic_late(nir); + nir_opt_constant_folding(nir); + + /* cleanup passes */ + nir_lower_load_const_to_scalar(nir); + nir_opt_cse(nir); + nir_opt_dce(nir); + nir_opt_shrink_load(nir); + nir_move_options move_opts = (nir_move_options)( + nir_move_const_undef | nir_move_load_ubo | nir_move_load_input | nir_move_comparisons); + //nir_opt_sink(nir, move_opts); // TODO: enable this once !1664 is merged + nir_opt_move(nir, move_opts); + nir_convert_to_lcssa(nir, true, false); + nir_lower_phis_to_scalar(nir); + + nir_function_impl *func = nir_shader_get_entrypoint(nir); + nir_index_ssa_defs(func); + + if (options->dump_preoptir) { + fprintf(stderr, "NIR shader before instruction selection:\n"); + nir_print_shader(nir, stderr); + } + } + + unsigned scratch_size = 0; + for (unsigned i = 0; i < shader_count; i++) + scratch_size = std::max(scratch_size, shaders[i]->scratch_size); + ctx.scratch_enabled = scratch_size > 0; + ctx.program->config->scratch_bytes_per_wave = align(scratch_size * ctx.options->wave_size, 1024); + ctx.program->config->float_mode = V_00B028_FP_64_DENORMS; + ctx.program->info->wave_size = ctx.options->wave_size; + + ctx.block = ctx.program->create_and_insert_block(); + ctx.block->loop_nest_depth = 0; + ctx.block->kind = block_kind_top_level; + + return ctx; +} + +} diff --git a/src/amd/compiler/aco_interface.cpp b/src/amd/compiler/aco_interface.cpp new file mode 100644 index 00000000000..6adb911e4b3 --- /dev/null +++ b/src/amd/compiler/aco_interface.cpp @@ -0,0 +1,166 @@ +/* + * Copyright © 2018 Google + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
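The scratch setup near the end of setup_isel_context() converts NIR's per-lane scratch bytes into the per-wave, 1 KiB-aligned value the config wants. A stand-alone sketch of that computation:
```
#include <cstdio>

/* Stand-alone sketch of the scratch sizing above: NIR reports scratch bytes
 * per lane, the config field wants bytes per wave, aligned to 1 KiB. */
static unsigned scratch_bytes_per_wave(unsigned per_lane_bytes, unsigned wave_size)
{
   unsigned bytes = per_lane_bytes * wave_size;
   return (bytes + 1023) & ~1023u; /* align(x, 1024) */
}

int main()
{
   printf("%u\n", scratch_bytes_per_wave(0, 64));  /* 0    - scratch disabled */
   printf("%u\n", scratch_bytes_per_wave(4, 64));  /* 1024 - 256 bytes rounded up */
   printf("%u\n", scratch_bytes_per_wave(20, 64)); /* 2048 - 1280 bytes rounded up */
}
```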
+ */ + +#include "aco_interface.h" +#include "aco_ir.h" +#include "vulkan/radv_shader.h" +#include "c11/threads.h" +#include "util/debug.h" + +#include <iostream> +#include <sstream> + +namespace aco { +uint64_t debug_flags = 0; + +static const struct debug_control aco_debug_options[] = { + {"validateir", DEBUG_VALIDATE}, + {"validatera", DEBUG_VALIDATE_RA}, + {"perfwarn", DEBUG_PERFWARN}, + {NULL, 0} +}; + +static once_flag init_once_flag = ONCE_FLAG_INIT; + +static void init() +{ + debug_flags = parse_debug_string(getenv("ACO_DEBUG"), aco_debug_options); + + #ifndef NDEBUG + /* enable some flags by default on debug builds */ + debug_flags |= aco::DEBUG_VALIDATE; + #endif +} +} + +void aco_compile_shader(unsigned shader_count, + struct nir_shader *const *shaders, + struct radv_shader_binary **binary, + struct radv_shader_info *info, + struct radv_nir_compiler_options *options) +{ + call_once(&aco::init_once_flag, aco::init); + + ac_shader_config config = {0}; + std::unique_ptr<aco::Program> program{new aco::Program}; + + /* Instruction Selection */ + aco::select_program(program.get(), shader_count, shaders, &config, info, options); + if (options->dump_preoptir) { + std::cerr << "After Instruction Selection:\n"; + aco_print_program(program.get(), stderr); + } + aco::validate(program.get(), stderr); + + /* Boolean phi lowering */ + aco::lower_bool_phis(program.get()); + //std::cerr << "After Boolean Phi Lowering:\n"; + //aco_print_program(program.get(), stderr); + + aco::dominator_tree(program.get()); + + /* Optimization */ + aco::value_numbering(program.get()); + aco::optimize(program.get()); + aco::validate(program.get(), stderr); + + aco::setup_reduce_temp(program.get()); + aco::insert_exec_mask(program.get()); + aco::validate(program.get(), stderr); + + aco::live live_vars = aco::live_var_analysis(program.get(), options); + aco::spill(program.get(), live_vars, options); + + //std::cerr << "Before Schedule:\n"; + //aco_print_program(program.get(), stderr); + aco::schedule_program(program.get(), live_vars); + + /* Register Allocation */ + aco::register_allocation(program.get(), live_vars.live_out); + if (options->dump_shader) { + std::cerr << "After RA:\n"; + aco_print_program(program.get(), stderr); + } + + if (aco::validate_ra(program.get(), options, stderr)) { + std::cerr << "Program after RA validation failure:\n"; + aco_print_program(program.get(), stderr); + abort(); + } + + aco::ssa_elimination(program.get()); + /* Lower to HW Instructions */ + aco::lower_to_hw_instr(program.get()); + //std::cerr << "After Eliminate Pseudo Instr:\n"; + //aco_print_program(program.get(), stderr); + + /* Insert Waitcnt */ + aco::insert_wait_states(program.get()); + aco::insert_NOPs(program.get()); + + //std::cerr << "After Insert-Waitcnt:\n"; + //aco_print_program(program.get(), stderr); + + /* Assembly */ + std::vector<uint32_t> code; + unsigned exec_size = aco::emit_program(program.get(), code); + + bool get_disasm = options->dump_shader; +#ifndef NDEBUG + get_disasm |= options->record_llvm_ir; +#endif + + size_t size = 0; + + std::string disasm; + if (get_disasm) { + std::ostringstream stream; + aco::print_asm(program.get(), code, exec_size / 4u, options->family, stream); + stream << '\0'; + disasm = stream.str(); + size += disasm.size(); + } + + size += code.size() * sizeof(uint32_t) + sizeof(radv_shader_binary_legacy); + radv_shader_binary_legacy* legacy_binary = (radv_shader_binary_legacy*) malloc(size); + + legacy_binary->base.type = RADV_BINARY_TYPE_LEGACY; + legacy_binary->base.stage = 
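For reference, a stand-alone sketch of how the legacy binary produced above is laid out: the machine-code words come first in data[], immediately followed by the NUL-terminated disassembly, and disasm_size excludes that trailing NUL. HEADER_BYTES and the encoded word are placeholders, not the real sizeof(radv_shader_binary_legacy) or a meaningful shader:
```
#include <cstdio>
#include <cstring>
#include <string>
#include <vector>

int main()
{
   const size_t HEADER_BYTES = 64;             /* placeholder, not the real header size */
   std::vector<uint32_t> code = {0xBF810000u}; /* filler word standing in for real code */
   std::string disasm = std::string("s_endpgm") + '\0';

   size_t code_bytes = code.size() * sizeof(uint32_t);
   size_t total = HEADER_BYTES + code_bytes + disasm.size();

   std::vector<char> blob(total - HEADER_BYTES); /* the data[] payload only */
   memcpy(blob.data(), code.data(), code_bytes);
   disasm.copy(blob.data() + code_bytes, disasm.size());

   printf("code bytes: %zu, disasm_size: %zu, total: %zu\n",
          code_bytes, disasm.size() - 1, total); /* 4, 8, 77 */
}
```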
shaders[shader_count-1]->info.stage; + legacy_binary->base.is_gs_copy_shader = false; + legacy_binary->base.total_size = size; + + memcpy(legacy_binary->data, code.data(), code.size() * sizeof(uint32_t)); + legacy_binary->exec_size = exec_size; + legacy_binary->code_size = code.size() * sizeof(uint32_t); + + legacy_binary->config = config; + legacy_binary->disasm_size = 0; + legacy_binary->llvm_ir_size = 0; + + if (get_disasm) { + disasm.copy((char*) legacy_binary->data + legacy_binary->code_size, disasm.size()); + legacy_binary->disasm_size = disasm.size() - 1; + } + + *binary = (radv_shader_binary*) legacy_binary; +} diff --git a/src/amd/compiler/aco_interface.h b/src/amd/compiler/aco_interface.h new file mode 100644 index 00000000000..1425a0997a0 --- /dev/null +++ b/src/amd/compiler/aco_interface.h @@ -0,0 +1,45 @@ +/* + * Copyright © 2018 Google + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#ifndef ACO_INTERFACE_H +#define ACO_INTERFACE_H + +#include "nir.h" + +#ifdef __cplusplus +extern "C" { +#endif + +struct ac_shader_config; + +void aco_compile_shader(unsigned shader_count, + struct nir_shader *const *shaders, + struct radv_shader_binary** binary, + struct radv_shader_info *info, + struct radv_nir_compiler_options *options); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/src/amd/compiler/aco_ir.h b/src/amd/compiler/aco_ir.h new file mode 100644 index 00000000000..663635e5b93 --- /dev/null +++ b/src/amd/compiler/aco_ir.h @@ -0,0 +1,1169 @@ +/* + * Copyright © 2018 Valve Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + */ + +#ifndef ACO_IR_H +#define ACO_IR_H + +#include <vector> +#include <set> +#include <bitset> +#include <memory> + +#include "nir.h" +#include "ac_binary.h" +#include "amd_family.h" +#include "aco_opcodes.h" +#include "aco_util.h" + +struct radv_nir_compiler_options; +struct radv_shader_info; + +namespace aco { + +extern uint64_t debug_flags; + +enum { + DEBUG_VALIDATE = 0x1, + DEBUG_VALIDATE_RA = 0x2, + DEBUG_PERFWARN = 0x4, +}; + +/** + * Representation of the instruction's microcode encoding format + * Note: Some Vector ALU Formats can be combined, such that: + * - VOP2* | VOP3A represents a VOP2 instruction in VOP3A encoding + * - VOP2* | DPP represents a VOP2 instruction with data parallel primitive. + * - VOP2* | SDWA represents a VOP2 instruction with sub-dword addressing. + * + * (*) The same is applicable for VOP1 and VOPC instructions. + */ +enum class Format : std::uint16_t { + /* Pseudo Instruction Format */ + PSEUDO = 0, + /* Scalar ALU & Control Formats */ + SOP1 = 1, + SOP2 = 2, + SOPK = 3, + SOPP = 4, + SOPC = 5, + /* Scalar Memory Format */ + SMEM = 6, + /* LDS/GDS Format */ + DS = 8, + /* Vector Memory Buffer Formats */ + MTBUF = 9, + MUBUF = 10, + /* Vector Memory Image Format */ + MIMG = 11, + /* Export Format */ + EXP = 12, + /* Flat Formats */ + FLAT = 13, + GLOBAL = 14, + SCRATCH = 15, + + PSEUDO_BRANCH = 16, + PSEUDO_BARRIER = 17, + PSEUDO_REDUCTION = 18, + + /* Vector ALU Formats */ + VOP1 = 1 << 8, + VOP2 = 1 << 9, + VOPC = 1 << 10, + VOP3 = 1 << 11, + VOP3A = 1 << 11, + VOP3B = 1 << 11, + VOP3P = 1 << 12, + /* Vector Parameter Interpolation Format */ + VINTRP = 1 << 13, + DPP = 1 << 14, + SDWA = 1 << 15, +}; + +enum barrier_interaction { + barrier_none = 0, + barrier_buffer = 0x1, + barrier_image = 0x2, + barrier_atomic = 0x4, + barrier_shared = 0x8, + barrier_count = 4, +}; + +constexpr Format asVOP3(Format format) { + return (Format) ((uint32_t) Format::VOP3 | (uint32_t) format); +}; + +enum class RegType { + none = 0, + sgpr, + vgpr, + linear_vgpr, +}; + +struct RegClass { + + enum RC : uint8_t { + s1 = 1, + s2 = 2, + s3 = 3, + s4 = 4, + s6 = 6, + s8 = 8, + s16 = 16, + v1 = s1 | (1 << 5), + v2 = s2 | (1 << 5), + v3 = s3 | (1 << 5), + v4 = s4 | (1 << 5), + v5 = 5 | (1 << 5), + v6 = 6 | (1 << 5), + v7 = 7 | (1 << 5), + v8 = 8 | (1 << 5), + /* these are used for WWM and spills to vgpr */ + v1_linear = v1 | (1 << 6), + v2_linear = v2 | (1 << 6), + }; + + RegClass() = default; + constexpr RegClass(RC rc) + : rc(rc) {} + constexpr RegClass(RegType type, unsigned size) + : rc((RC) ((type == RegType::vgpr ? 1 << 5 : 0) | size)) {} + + constexpr operator RC() const { return rc; } + explicit operator bool() = delete; + + constexpr RegType type() const { return rc <= RC::s16 ? 
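The Format comment above describes how the VALU encodings combine: each encoding is a single bit, so a VOP2 instruction promoted to the VOP3A encoding is just the OR of both flags and can still be recognized as either. A stand-alone illustration that reproduces only the two enumerator values needed here:
```
#include <cstdint>
#include <cstdio>

/* Stand-alone illustration of the format-combination scheme documented above;
 * only the two enumerators used in the example are reproduced. */
enum class Fmt : uint16_t {
   VOP2  = 1 << 9,
   VOP3A = 1 << 11,
};

constexpr Fmt asVOP3(Fmt f) { return (Fmt)((uint16_t)f | (uint16_t)Fmt::VOP3A); }

int main()
{
   Fmt f = asVOP3(Fmt::VOP2); /* a VOP2 instruction in the VOP3A encoding */
   printf("is VOP2: %d, is VOP3: %d\n",
          ((uint16_t)f & (uint16_t)Fmt::VOP2) != 0,
          ((uint16_t)f & (uint16_t)Fmt::VOP3A) != 0); /* both 1 */
}
```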
RegType::sgpr : RegType::vgpr; } + constexpr unsigned size() const { return (unsigned) rc & 0x1F; } + constexpr bool is_linear() const { return rc <= RC::s16 || rc & (1 << 6); } + constexpr RegClass as_linear() const { return RegClass((RC) (rc | (1 << 6))); } + +private: + RC rc; +}; + +/* transitional helper expressions */ +static constexpr RegClass s1{RegClass::s1}; +static constexpr RegClass s2{RegClass::s2}; +static constexpr RegClass s3{RegClass::s3}; +static constexpr RegClass s4{RegClass::s4}; +static constexpr RegClass s8{RegClass::s8}; +static constexpr RegClass s16{RegClass::s16}; +static constexpr RegClass v1{RegClass::v1}; +static constexpr RegClass v2{RegClass::v2}; +static constexpr RegClass v3{RegClass::v3}; +static constexpr RegClass v4{RegClass::v4}; +static constexpr RegClass v5{RegClass::v5}; +static constexpr RegClass v6{RegClass::v6}; +static constexpr RegClass v7{RegClass::v7}; +static constexpr RegClass v8{RegClass::v8}; + +/** + * Temp Class + * Each temporary virtual register has a + * register class (i.e. size and type) + * and SSA id. + */ +struct Temp { + Temp() = default; + constexpr Temp(uint32_t id, RegClass cls) noexcept + : id_(id), reg_class(cls) {} + + constexpr uint32_t id() const noexcept { return id_; } + constexpr RegClass regClass() const noexcept { return reg_class; } + + constexpr unsigned size() const noexcept { return reg_class.size(); } + constexpr RegType type() const noexcept { return reg_class.type(); } + constexpr bool is_linear() const noexcept { return reg_class.is_linear(); } + + constexpr bool operator <(Temp other) const noexcept { return id() < other.id(); } + constexpr bool operator==(Temp other) const noexcept { return id() == other.id(); } + constexpr bool operator!=(Temp other) const noexcept { return id() != other.id(); } + +private: + uint32_t id_:24; + RegClass reg_class; +}; + +/** + * PhysReg + * Represents the physical register for each + * Operand and Definition. + */ +struct PhysReg { + constexpr PhysReg() = default; + explicit constexpr PhysReg(unsigned r) : reg(r) {} + constexpr operator unsigned() const { return reg; } + + uint16_t reg = 0; +}; + +/* helper expressions for special registers */ +static constexpr PhysReg m0{124}; +static constexpr PhysReg vcc{106}; +static constexpr PhysReg exec{126}; +static constexpr PhysReg exec_lo{126}; +static constexpr PhysReg exec_hi{127}; +static constexpr PhysReg scc{253}; + +/** + * Operand Class + * Initially, each Operand refers to either + * a temporary virtual register + * or to a constant value + * Temporary registers get mapped to physical register during RA + * Constant values are inlined into the instruction sequence. + */ +class Operand final +{ +public: + constexpr Operand() + : reg_(PhysReg{128}), isTemp_(false), isFixed_(true), isConstant_(false), + isKill_(false), isUndef_(true), isFirstKill_(false), is64BitConst_(false) {} + + explicit Operand(Temp r) noexcept + { + data_.temp = r; + if (r.id()) { + isTemp_ = true; + } else { + isUndef_ = true; + setFixed(PhysReg{128}); + } + }; + explicit Operand(uint32_t v) noexcept + { + data_.i = v; + isConstant_ = true; + if (v <= 64) + setFixed(PhysReg{128 + v}); + else if (v >= 0xFFFFFFF0) /* [-16 .. 
-1] */ + setFixed(PhysReg{192 - v}); + else if (v == 0x3f000000) /* 0.5 */ + setFixed(PhysReg{240}); + else if (v == 0xbf000000) /* -0.5 */ + setFixed(PhysReg{241}); + else if (v == 0x3f800000) /* 1.0 */ + setFixed(PhysReg{242}); + else if (v == 0xbf800000) /* -1.0 */ + setFixed(PhysReg{243}); + else if (v == 0x40000000) /* 2.0 */ + setFixed(PhysReg{244}); + else if (v == 0xc0000000) /* -2.0 */ + setFixed(PhysReg{245}); + else if (v == 0x40800000) /* 4.0 */ + setFixed(PhysReg{246}); + else if (v == 0xc0800000) /* -4.0 */ + setFixed(PhysReg{247}); + else if (v == 0x3e22f983) /* 1/(2*PI) */ + setFixed(PhysReg{248}); + else /* Literal Constant */ + setFixed(PhysReg{255}); + }; + explicit Operand(uint64_t v) noexcept + { + isConstant_ = true; + is64BitConst_ = true; + if (v <= 64) + setFixed(PhysReg{128 + (uint32_t) v}); + else if (v >= 0xFFFFFFFFFFFFFFF0) /* [-16 .. -1] */ + setFixed(PhysReg{192 - (uint32_t) v}); + else if (v == 0x3FE0000000000000) /* 0.5 */ + setFixed(PhysReg{240}); + else if (v == 0xBFE0000000000000) /* -0.5 */ + setFixed(PhysReg{241}); + else if (v == 0x3FF0000000000000) /* 1.0 */ + setFixed(PhysReg{242}); + else if (v == 0xBFF0000000000000) /* -1.0 */ + setFixed(PhysReg{243}); + else if (v == 0x4000000000000000) /* 2.0 */ + setFixed(PhysReg{244}); + else if (v == 0xC000000000000000) /* -2.0 */ + setFixed(PhysReg{245}); + else if (v == 0x4010000000000000) /* 4.0 */ + setFixed(PhysReg{246}); + else if (v == 0xC010000000000000) /* -4.0 */ + setFixed(PhysReg{247}); + else if (v == 0x3fc45f306dc9c882) /* 1/(2*PI) */ + setFixed(PhysReg{248}); + else { /* Literal Constant: we don't know if it is a long or double.*/ + isConstant_ = 0; + assert(false && "attempt to create a 64-bit literal constant"); + } + }; + explicit Operand(RegClass type) noexcept + { + isUndef_ = true; + data_.temp = Temp(0, type); + setFixed(PhysReg{128}); + }; + explicit Operand(PhysReg reg, RegClass type) noexcept + { + data_.temp = Temp(0, type); + setFixed(reg); + } + + constexpr bool isTemp() const noexcept + { + return isTemp_; + } + + constexpr void setTemp(Temp t) noexcept { + assert(!isConstant_); + isTemp_ = true; + data_.temp = t; + } + + constexpr Temp getTemp() const noexcept + { + return data_.temp; + } + + constexpr uint32_t tempId() const noexcept + { + return data_.temp.id(); + } + + constexpr bool hasRegClass() const noexcept + { + return isTemp() || isUndefined(); + } + + constexpr RegClass regClass() const noexcept + { + return data_.temp.regClass(); + } + + constexpr unsigned size() const noexcept + { + if (isConstant()) + return is64BitConst_ ? 
2 : 1; + else + return data_.temp.size(); + } + + constexpr bool isFixed() const noexcept + { + return isFixed_; + } + + constexpr PhysReg physReg() const noexcept + { + return reg_; + } + + constexpr void setFixed(PhysReg reg) noexcept + { + isFixed_ = reg != unsigned(-1); + reg_ = reg; + } + + constexpr bool isConstant() const noexcept + { + return isConstant_; + } + + constexpr bool isLiteral() const noexcept + { + return isConstant() && reg_ == 255; + } + + constexpr bool isUndefined() const noexcept + { + return isUndef_; + } + + constexpr uint32_t constantValue() const noexcept + { + return data_.i; + } + + constexpr bool constantEquals(uint32_t cmp) const noexcept + { + return isConstant() && constantValue() == cmp; + } + + constexpr void setKill(bool flag) noexcept + { + isKill_ = flag; + if (!flag) + setFirstKill(false); + } + + constexpr bool isKill() const noexcept + { + return isKill_ || isFirstKill(); + } + + constexpr void setFirstKill(bool flag) noexcept + { + isFirstKill_ = flag; + if (flag) + setKill(flag); + } + + /* When there are multiple operands killing the same temporary, + * isFirstKill() is only returns true for the first one. */ + constexpr bool isFirstKill() const noexcept + { + return isFirstKill_; + } + +private: + union { + uint32_t i; + float f; + Temp temp = Temp(0, s1); + } data_; + PhysReg reg_; + union { + struct { + uint8_t isTemp_:1; + uint8_t isFixed_:1; + uint8_t isConstant_:1; + uint8_t isKill_:1; + uint8_t isUndef_:1; + uint8_t isFirstKill_:1; + uint8_t is64BitConst_:1; + }; + /* can't initialize bit-fields in c++11, so work around using a union */ + uint8_t control_ = 0; + }; +}; + +/** + * Definition Class + * Definitions are the results of Instructions + * and refer to temporary virtual registers + * which are later mapped to physical registers + */ +class Definition final +{ +public: + constexpr Definition() : temp(Temp(0, s1)), reg_(0), isFixed_(0), hasHint_(0), isKill_(0) {} + Definition(uint32_t index, RegClass type) noexcept + : temp(index, type) {} + explicit Definition(Temp tmp) noexcept + : temp(tmp) {} + Definition(PhysReg reg, RegClass type) noexcept + : temp(Temp(0, type)) + { + setFixed(reg); + } + Definition(uint32_t tmpId, PhysReg reg, RegClass type) noexcept + : temp(Temp(tmpId, type)) + { + setFixed(reg); + } + + constexpr bool isTemp() const noexcept + { + return tempId() > 0; + } + + constexpr Temp getTemp() const noexcept + { + return temp; + } + + constexpr uint32_t tempId() const noexcept + { + return temp.id(); + } + + constexpr void setTemp(Temp t) noexcept { + temp = t; + } + + constexpr RegClass regClass() const noexcept + { + return temp.regClass(); + } + + constexpr unsigned size() const noexcept + { + return temp.size(); + } + + constexpr bool isFixed() const noexcept + { + return isFixed_; + } + + constexpr PhysReg physReg() const noexcept + { + return reg_; + } + + constexpr void setFixed(PhysReg reg) noexcept + { + isFixed_ = 1; + reg_ = reg; + } + + constexpr void setHint(PhysReg reg) noexcept + { + hasHint_ = 1; + reg_ = reg; + } + + constexpr bool hasHint() const noexcept + { + return hasHint_; + } + + constexpr void setKill(bool flag) noexcept + { + isKill_ = flag; + } + + constexpr bool isKill() const noexcept + { + return isKill_; + } + +private: + Temp temp = Temp(0, s1); + PhysReg reg_; + union { + struct { + uint8_t isFixed_:1; + uint8_t hasHint_:1; + uint8_t isKill_:1; + }; + /* can't initialize bit-fields in c++11, so work around using a union */ + uint8_t control_ = 0; + }; +}; + +class Block; + +struct 
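A usage sketch for the Operand constant constructors above, showing which values end up as hardware inline constants (encoded as source registers 128-248) and which fall back to a literal dword (register 255). It assumes it is built inside the Mesa tree so that aco_ir.h and its includes resolve:
```
#include <cstdio>
#include "aco_ir.h" /* the header this hunk adds */

int main()
{
   aco::Operand zero(0u);           /* inline constant, register 128 */
   aco::Operand sixteen(16u);       /* inline constant, register 128 + 16 */
   aco::Operand one_f(0x3f800000u); /* 1.0f, inline constant register 242 */
   aco::Operand big(0x12345678u);   /* no inline encoding, so it becomes a literal */

   printf("%u %u %u\n", (unsigned)zero.physReg(), (unsigned)sixteen.physReg(),
          (unsigned)one_f.physReg()); /* 128 144 242 */
   printf("literal? %d\n", big.isLiteral()); /* 1 */
}
```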
Instruction { + aco_opcode opcode; + Format format; + + aco::span<Operand> operands; + aco::span<Definition> definitions; + + constexpr bool isVALU() const noexcept + { + return ((uint16_t) format & (uint16_t) Format::VOP1) == (uint16_t) Format::VOP1 + || ((uint16_t) format & (uint16_t) Format::VOP2) == (uint16_t) Format::VOP2 + || ((uint16_t) format & (uint16_t) Format::VOPC) == (uint16_t) Format::VOPC + || ((uint16_t) format & (uint16_t) Format::VOP3A) == (uint16_t) Format::VOP3A + || ((uint16_t) format & (uint16_t) Format::VOP3B) == (uint16_t) Format::VOP3B + || ((uint16_t) format & (uint16_t) Format::VOP3P) == (uint16_t) Format::VOP3P; + } + + constexpr bool isSALU() const noexcept + { + return format == Format::SOP1 || + format == Format::SOP2 || + format == Format::SOPC || + format == Format::SOPK || + format == Format::SOPP; + } + + constexpr bool isVMEM() const noexcept + { + return format == Format::MTBUF || + format == Format::MUBUF || + format == Format::MIMG; + } + + constexpr bool isDPP() const noexcept + { + return (uint16_t) format & (uint16_t) Format::DPP; + } + + constexpr bool isVOP3() const noexcept + { + return ((uint16_t) format & (uint16_t) Format::VOP3A) || + ((uint16_t) format & (uint16_t) Format::VOP3B) || + format == Format::VOP3P; + } + + constexpr bool isSDWA() const noexcept + { + return (uint16_t) format & (uint16_t) Format::SDWA; + } + + constexpr bool isFlatOrGlobal() const noexcept + { + return format == Format::FLAT || format == Format::GLOBAL; + } +}; + +struct SOPK_instruction : public Instruction { + uint16_t imm; +}; + +struct SOPP_instruction : public Instruction { + uint32_t imm; + int block; +}; + +struct SOPC_instruction : public Instruction { +}; + +struct SOP1_instruction : public Instruction { +}; + +struct SOP2_instruction : public Instruction { +}; + +/** + * Scalar Memory Format: + * For s_(buffer_)load_dword*: + * Operand(0): SBASE - SGPR-pair which provides base address + * Operand(1): Offset - immediate (un)signed offset or SGPR + * Operand(2) / Definition(0): SDATA - SGPR for read / write result + * Operand(n-1): SOffset - SGPR offset (Vega only) + * + * Having no operands is also valid for instructions such as s_dcache_inv. + * + */ +struct SMEM_instruction : public Instruction { + bool glc; /* VI+: globally coherent */ + bool dlc; /* NAVI: device level coherent */ + bool nv; /* VEGA only: Non-volatile */ + bool can_reorder; + bool disable_wqm; + barrier_interaction barrier; +}; + +struct VOP1_instruction : public Instruction { +}; + +struct VOP2_instruction : public Instruction { +}; + +struct VOPC_instruction : public Instruction { +}; + +struct VOP3A_instruction : public Instruction { + bool abs[3]; + bool opsel[3]; + bool clamp; + unsigned omod; + bool neg[3]; +}; + +/** + * Data Parallel Primitives Format: + * This format can be used for VOP1, VOP2 or VOPC instructions. + * The swizzle applies to the src0 operand. + * + */ +struct DPP_instruction : public Instruction { + uint16_t dpp_ctrl; + uint8_t row_mask; + uint8_t bank_mask; + bool abs[2]; + bool neg[2]; + bool bound_ctrl; +}; + +struct Interp_instruction : public Instruction { + unsigned attribute; + unsigned component; +}; + +/** + * Local and Global Data Sharing instructions + * Operand(0): ADDR - VGPR which supplies the address. + * Operand(1): DATA0 - First data VGPR. + * Operand(2): DATA1 - Second data VGPR. + * Operand(n-1): M0 - LDS size. + * Definition(0): VDST - Destination VGPR when results returned to VGPRs. 
+ * + */ +struct DS_instruction : public Instruction { + int16_t offset0; + int8_t offset1; + bool gds; +}; + +/** + * Vector Memory Untyped-buffer Instructions + * Operand(0): VADDR - Address source. Can carry an index and/or offset + * Operand(1): SRSRC - Specifies which SGPR supplies T# (resource constant) + * Operand(2): SOFFSET - SGPR to supply unsigned byte offset. (SGPR, M0, or inline constant) + * Operand(3) / Definition(0): VDATA - Vector GPR for write result / read data + * + */ +struct MUBUF_instruction : public Instruction { + unsigned offset; /* Unsigned byte offset - 12 bit */ + bool offen; /* Supply an offset from VGPR (VADDR) */ + bool idxen; /* Supply an index from VGPR (VADDR) */ + bool glc; /* globally coherent */ + bool dlc; /* NAVI: device level coherent */ + bool slc; /* system level coherent */ + bool tfe; /* texture fail enable */ + bool lds; /* Return read-data to LDS instead of VGPRs */ + bool disable_wqm; /* Require an exec mask without helper invocations */ + bool can_reorder; + barrier_interaction barrier; +}; + +/** + * Vector Memory Typed-buffer Instructions + * Operand(0): VADDR - Address source. Can carry an index and/or offset + * Operand(1): SRSRC - Specifies which SGPR supplies T# (resource constant) + * Operand(2): SOFFSET - SGPR to supply unsigned byte offset. (SGPR, M0, or inline constant) + * Operand(3) / Definition(0): VDATA - Vector GPR for write result / read data + * + */ +struct MTBUF_instruction : public Instruction { + union { + struct { + uint8_t dfmt : 4; /* Data Format of data in memory buffer */ + uint8_t nfmt : 3; /* Numeric format of data in memory */ + }; + uint8_t img_format; /* Buffer or image format as used by GFX10 */ + }; + unsigned offset; /* Unsigned byte offset - 12 bit */ + bool offen; /* Supply an offset from VGPR (VADDR) */ + bool idxen; /* Supply an index from VGPR (VADDR) */ + bool glc; /* globally coherent */ + bool dlc; /* NAVI: device level coherent */ + bool slc; /* system level coherent */ + bool tfe; /* texture fail enable */ + bool disable_wqm; /* Require an exec mask without helper invocations */ + bool can_reorder; + barrier_interaction barrier; +}; + +/** + * Vector Memory Image Instructions + * Operand(0): VADDR - Address source. Can carry an offset or an index. + * Operand(1): SRSRC - Scalar GPR that specifies the resource constant. + * Operand(2): SSAMP - Scalar GPR that specifies sampler constant. + * Operand(3) / Definition(0): VDATA - Vector GPR for read / write result. 
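+ *
+ * For example (illustrative mapping only, following the layout above): an
+ * image_sample takes its coordinates in Operand(0), the image descriptor in
+ * Operand(1) and the sampler descriptor in Operand(2), and returns the
+ * components selected by dmask in Definition(0).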
+ *
+ */
+struct MIMG_instruction : public Instruction {
+   unsigned dmask; /* Data VGPR enable mask */
+   bool unrm; /* Force address to be un-normalized */
+   bool dlc; /* NAVI: device level coherent */
+   bool glc; /* globally coherent */
+   bool slc; /* system level coherent */
+   bool tfe; /* texture fail enable */
+   bool da; /* declare an array */
+   bool lwe; /* LOD warning enable */
+   bool r128; /* NAVI: Texture resource size */
+   bool a16; /* VEGA, NAVI: Address components are 16-bits */
+   bool d16; /* Convert 32-bit data to 16-bit data */
+   bool disable_wqm; /* Require an exec mask without helper invocations */
+   bool can_reorder;
+   barrier_interaction barrier;
+};
+
+/**
+ * Flat/Scratch/Global Instructions
+ * Operand(0): ADDR
+ * Operand(1): SADDR
+ * Operand(2) / Definition(0): DATA/VDST
+ *
+ */
+struct FLAT_instruction : public Instruction {
+   uint16_t offset; /* Vega only */
+   bool slc;
+   bool glc;
+   bool lds;
+   bool nv;
+};
+
+struct Export_instruction : public Instruction {
+   unsigned enabled_mask;
+   unsigned dest;
+   bool compressed;
+   bool done;
+   bool valid_mask;
+};
+
+struct Pseudo_instruction : public Instruction {
+   bool tmp_in_scc;
+   PhysReg scratch_sgpr; /* might not be valid if it's not needed */
+};
+
+struct Pseudo_branch_instruction : public Instruction {
+   /* target[0] is the block index of the branch target.
+    * For conditional branches, target[1] contains the fall-through alternative.
+    * A value of 0 means the target has not been initialized (BB0 cannot be a branch target).
+    */
+   uint32_t target[2];
+};
+
+struct Pseudo_barrier_instruction : public Instruction {
+};
+
+enum ReduceOp {
+   iadd32, iadd64,
+   imul32, imul64,
+   fadd32, fadd64,
+   fmul32, fmul64,
+   imin32, imin64,
+   imax32, imax64,
+   umin32, umin64,
+   umax32, umax64,
+   fmin32, fmin64,
+   fmax32, fmax64,
+   iand32, iand64,
+   ior32, ior64,
+   ixor32, ixor64,
+};
+
+/**
+ * Subgroup Reduction Instructions: every operand and definition other than the
+ * data to be reduced and the result is inserted by setup_reduce_temp().
+ * Operand(0): data to be reduced + * Operand(1): reduce temporary + * Operand(2): vector temporary + * Definition(0): result + * Definition(1): scalar temporary + * Definition(2): scalar identity temporary + * Definition(3): scc clobber + * Definition(4): vcc clobber + * + */ +struct Pseudo_reduction_instruction : public Instruction { + ReduceOp reduce_op; + unsigned cluster_size; // must be 0 for scans +}; + +struct instr_deleter_functor { + void operator()(void* p) { + free(p); + } +}; + +template<typename T> +using aco_ptr = std::unique_ptr<T, instr_deleter_functor>; + +template<typename T> +T* create_instruction(aco_opcode opcode, Format format, uint32_t num_operands, uint32_t num_definitions) +{ + std::size_t size = sizeof(T) + num_operands * sizeof(Operand) + num_definitions * sizeof(Definition); + char *data = (char*) calloc(1, size); + T* inst = (T*) data; + + inst->opcode = opcode; + inst->format = format; + + inst->operands = aco::span<Operand>((Operand*)(data + sizeof(T)), num_operands); + inst->definitions = aco::span<Definition>((Definition*)inst->operands.end(), num_definitions); + + return inst; +} + +constexpr bool is_phi(Instruction* instr) +{ + return instr->opcode == aco_opcode::p_phi || instr->opcode == aco_opcode::p_linear_phi; +} + +static inline bool is_phi(aco_ptr<Instruction>& instr) +{ + return is_phi(instr.get()); +} + +constexpr barrier_interaction get_barrier_interaction(Instruction* instr) +{ + switch (instr->format) { + case Format::SMEM: + return static_cast<SMEM_instruction*>(instr)->barrier; + case Format::MUBUF: + return static_cast<MUBUF_instruction*>(instr)->barrier; + case Format::MIMG: + return static_cast<MIMG_instruction*>(instr)->barrier; + case Format::FLAT: + case Format::GLOBAL: + return barrier_buffer; + case Format::DS: + return barrier_shared; + default: + return barrier_none; + } +} + +enum block_kind { + /* uniform indicates that leaving this block, + * all actives lanes stay active */ + block_kind_uniform = 1 << 0, + block_kind_top_level = 1 << 1, + block_kind_loop_preheader = 1 << 2, + block_kind_loop_header = 1 << 3, + block_kind_loop_exit = 1 << 4, + block_kind_continue = 1 << 5, + block_kind_break = 1 << 6, + block_kind_continue_or_break = 1 << 7, + block_kind_discard = 1 << 8, + block_kind_branch = 1 << 9, + block_kind_merge = 1 << 10, + block_kind_invert = 1 << 11, + block_kind_uses_discard_if = 1 << 12, + block_kind_needs_lowering = 1 << 13, +}; + + +struct RegisterDemand { + constexpr RegisterDemand() = default; + constexpr RegisterDemand(const int16_t v, const int16_t s) noexcept + : vgpr{v}, sgpr{s} {} + int16_t vgpr = 0; + int16_t sgpr = 0; + + constexpr friend bool operator==(const RegisterDemand a, const RegisterDemand b) noexcept { + return a.vgpr == b.vgpr && a.sgpr == b.sgpr; + } + + constexpr bool exceeds(const RegisterDemand other) const noexcept { + return vgpr > other.vgpr || sgpr > other.sgpr; + } + + constexpr RegisterDemand operator+(const Temp t) const noexcept { + if (t.type() == RegType::sgpr) + return RegisterDemand( vgpr, sgpr + t.size() ); + else + return RegisterDemand( vgpr + t.size(), sgpr ); + } + + constexpr RegisterDemand operator+(const RegisterDemand other) const noexcept { + return RegisterDemand(vgpr + other.vgpr, sgpr + other.sgpr); + } + + constexpr RegisterDemand operator-(const RegisterDemand other) const noexcept { + return RegisterDemand(vgpr - other.vgpr, sgpr - other.sgpr); + } + + constexpr RegisterDemand& operator+=(const RegisterDemand other) noexcept { + vgpr += other.vgpr; + sgpr += 
other.sgpr; + return *this; + } + + constexpr RegisterDemand& operator-=(const RegisterDemand other) noexcept { + vgpr -= other.vgpr; + sgpr -= other.sgpr; + return *this; + } + + constexpr RegisterDemand& operator+=(const Temp t) noexcept { + if (t.type() == RegType::sgpr) + sgpr += t.size(); + else + vgpr += t.size(); + return *this; + } + + constexpr RegisterDemand& operator-=(const Temp t) noexcept { + if (t.type() == RegType::sgpr) + sgpr -= t.size(); + else + vgpr -= t.size(); + return *this; + } + + constexpr void update(const RegisterDemand other) noexcept { + vgpr = std::max(vgpr, other.vgpr); + sgpr = std::max(sgpr, other.sgpr); + } + +}; + +/* CFG */ +struct Block { + unsigned index; + unsigned offset = 0; + std::vector<aco_ptr<Instruction>> instructions; + std::vector<unsigned> logical_preds; + std::vector<unsigned> linear_preds; + std::vector<unsigned> logical_succs; + std::vector<unsigned> linear_succs; + RegisterDemand register_demand = RegisterDemand(); + uint16_t loop_nest_depth = 0; + uint16_t kind = 0; + int logical_idom = -1; + int linear_idom = -1; + Temp live_out_exec = Temp(); + + /* this information is needed for predecessors to blocks with phis when + * moving out of ssa */ + bool scc_live_out = false; + PhysReg scratch_sgpr = PhysReg(); /* only needs to be valid if scc_live_out != false */ + + Block(unsigned idx) : index(idx) {} + Block() : index(0) {} +}; + +using Stage = uint16_t; + +/* software stages */ +static constexpr Stage sw_vs = 1 << 0; +static constexpr Stage sw_gs = 1 << 1; +static constexpr Stage sw_tcs = 1 << 2; +static constexpr Stage sw_tes = 1 << 3; +static constexpr Stage sw_fs = 1 << 4; +static constexpr Stage sw_cs = 1 << 5; +static constexpr Stage sw_mask = 0x3f; + +/* hardware stages (can't be OR'd, just a mask for convenience when testing multiple) */ +static constexpr Stage hw_vs = 1 << 6; +static constexpr Stage hw_es = 1 << 7; +static constexpr Stage hw_gs = 1 << 8; /* not on GFX9. combined into ES on GFX9 (and GFX10/legacy). */ +static constexpr Stage hw_ls = 1 << 9; +static constexpr Stage hw_hs = 1 << 10; /* not on GFX9. combined into LS on GFX9 (and GFX10/legacy). 
*/
+static constexpr Stage hw_fs = 1 << 11;
+static constexpr Stage hw_cs = 1 << 12;
+static constexpr Stage hw_mask = 0x7f << 6;
+
+/* possible settings of Program::stage */
+static constexpr Stage vertex_vs = sw_vs | hw_vs;
+static constexpr Stage fragment_fs = sw_fs | hw_fs;
+static constexpr Stage compute_cs = sw_cs | hw_cs;
+static constexpr Stage tess_eval_vs = sw_tes | hw_vs;
+/* GFX10/NGG */
+static constexpr Stage ngg_vertex_gs = sw_vs | hw_gs;
+static constexpr Stage ngg_vertex_geometry_gs = sw_vs | sw_gs | hw_gs;
+static constexpr Stage ngg_tess_eval_geometry_gs = sw_tes | sw_gs | hw_gs;
+static constexpr Stage ngg_vertex_tess_control_hs = sw_vs | sw_tcs | hw_hs;
+/* GFX9 (and GFX10 if NGG isn't used) */
+static constexpr Stage vertex_geometry_es = sw_vs | sw_gs | hw_es;
+static constexpr Stage vertex_tess_control_ls = sw_vs | sw_tcs | hw_ls;
+static constexpr Stage tess_eval_geometry_es = sw_tes | sw_gs | hw_es;
+/* pre-GFX9 */
+static constexpr Stage vertex_ls = sw_vs | hw_ls; /* vertex before tessellation control */
+static constexpr Stage tess_control_hs = sw_tcs | hw_hs;
+static constexpr Stage tess_eval_es = sw_tes | hw_es; /* tessellation evaluation before GS */
+static constexpr Stage geometry_gs = sw_gs | hw_gs;
+
+class Program final {
+public:
+   std::vector<Block> blocks;
+   RegisterDemand max_reg_demand = RegisterDemand();
+   uint16_t sgpr_limit = 0;
+   uint16_t num_waves = 0;
+   ac_shader_config* config;
+   struct radv_shader_info *info;
+   enum chip_class chip_class;
+   enum radeon_family family;
+   Stage stage; /* Stage */
+   bool needs_exact = false; /* there exists an instruction with disable_wqm = true */
+   bool needs_wqm = false; /* there exists a p_wqm instruction */
+   bool wb_smem_l1_on_end = false;
+
+   std::vector<uint8_t> constant_data;
+
+   uint32_t allocateId()
+   {
+      assert(allocationID <= 16777215);
+      return allocationID++;
+   }
+
+   uint32_t peekAllocationId()
+   {
+      return allocationID;
+   }
+
+   void setAllocationId(uint32_t id)
+   {
+      allocationID = id;
+   }
+
+   Block* create_and_insert_block() {
+      blocks.emplace_back(blocks.size());
+      return &blocks.back();
+   }
+
+   Block* insert_block(Block&& block) {
+      block.index = blocks.size();
+      blocks.emplace_back(std::move(block));
+      return &blocks.back();
+   }
+
+private:
+   uint32_t allocationID = 1;
+};
+
+struct live {
+   /* live temps out per block */
+   std::vector<std::set<Temp>> live_out;
+   /* register demand (sgpr/vgpr) per instruction per block */
+   std::vector<std::vector<RegisterDemand>> register_demand;
+};
+
+void select_program(Program *program,
+                    unsigned shader_count,
+                    struct nir_shader *const *shaders,
+                    ac_shader_config* config,
+                    struct radv_shader_info *info,
+                    struct radv_nir_compiler_options *options);
+
+void lower_wqm(Program* program, live& live_vars,
+               const struct radv_nir_compiler_options *options);
+void lower_bool_phis(Program* program);
+void update_vgpr_sgpr_demand(Program* program, const RegisterDemand new_demand);
+live live_var_analysis(Program* program, const struct radv_nir_compiler_options *options);
+std::vector<uint16_t> dead_code_analysis(Program *program);
+void dominator_tree(Program* program);
+void insert_exec_mask(Program *program);
+void value_numbering(Program* program);
+void optimize(Program* program);
+void setup_reduce_temp(Program* program);
+void lower_to_cssa(Program* program, live& live_vars, const struct radv_nir_compiler_options *options);
+void register_allocation(Program *program, std::vector<std::set<Temp>> live_out_per_block);
+void ssa_elimination(Program* program);
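+
+/* Rough sketch of how these entry points are expected to be combined by the
+ * backend (assumed ordering, for orientation only):
+ *
+ *    select_program()      - build ACO IR from NIR
+ *    live_var_analysis()   - live-outs and per-instruction RegisterDemand
+ *    spill() / register_allocation() / ssa_elimination()
+ *    lower_to_hw_instr()   - replace pseudo instructions with real ones
+ *    insert_wait_states() / insert_NOPs()
+ *    emit_program()        - encode the final binary
+ */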
+void lower_to_hw_instr(Program* program); +void schedule_program(Program* program, live& live_vars); +void spill(Program* program, live& live_vars, const struct radv_nir_compiler_options *options); +void insert_wait_states(Program* program); +void insert_NOPs(Program* program); +unsigned emit_program(Program* program, std::vector<uint32_t>& code); +void print_asm(Program *program, std::vector<uint32_t>& binary, unsigned exec_size, + enum radeon_family family, std::ostream& out); +void validate(Program* program, FILE *output); +bool validate_ra(Program* program, const struct radv_nir_compiler_options *options, FILE *output); +#ifndef NDEBUG +void perfwarn(bool cond, const char *msg, Instruction *instr=NULL); +#else +#define perfwarn(program, cond, msg, ...) +#endif + +void aco_print_instr(Instruction *instr, FILE *output); +void aco_print_program(Program *program, FILE *output); + +typedef struct { + const int16_t opcode_gfx9[static_cast<int>(aco_opcode::num_opcodes)]; + const int16_t opcode_gfx10[static_cast<int>(aco_opcode::num_opcodes)]; + const std::bitset<static_cast<int>(aco_opcode::num_opcodes)> can_use_input_modifiers; + const std::bitset<static_cast<int>(aco_opcode::num_opcodes)> can_use_output_modifiers; + const char *name[static_cast<int>(aco_opcode::num_opcodes)]; + const aco::Format format[static_cast<int>(aco_opcode::num_opcodes)]; +} Info; + +extern const Info instr_info; + +} + +#endif /* ACO_IR_H */ + diff --git a/src/amd/compiler/aco_live_var_analysis.cpp b/src/amd/compiler/aco_live_var_analysis.cpp new file mode 100644 index 00000000000..f99e57c8b3a --- /dev/null +++ b/src/amd/compiler/aco_live_var_analysis.cpp @@ -0,0 +1,243 @@ +/* + * Copyright © 2018 Valve Corporation + * Copyright © 2018 Google + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ *
+ * Authors:
+ *    Daniel Schürmann ([email protected])
+ *    Bas Nieuwenhuizen ([email protected])
+ *
+ */
+
+#include "aco_ir.h"
+
+#include <set>
+#include <vector>
+
+#include "vulkan/radv_shader.h"
+
+namespace aco {
+namespace {
+
+void process_live_temps_per_block(Program *program, live& lives, Block* block,
+                                  std::set<unsigned>& worklist, std::vector<uint16_t>& phi_sgpr_ops)
+{
+   std::vector<RegisterDemand>& register_demand = lives.register_demand[block->index];
+   RegisterDemand new_demand;
+
+   register_demand.resize(block->instructions.size());
+   block->register_demand = RegisterDemand();
+
+   std::set<Temp> live_sgprs;
+   std::set<Temp> live_vgprs;
+
+   /* add the live_out_exec to live */
+   bool exec_live = false;
+   if (block->live_out_exec != Temp()) {
+      live_sgprs.insert(block->live_out_exec);
+      new_demand.sgpr += 2;
+      exec_live = true;
+   }
+
+   /* split the live-outs from this block into the temporary sets */
+   std::vector<std::set<Temp>>& live_temps = lives.live_out;
+   for (const Temp temp : live_temps[block->index]) {
+      const bool inserted = temp.is_linear()
+                          ? live_sgprs.insert(temp).second
+                          : live_vgprs.insert(temp).second;
+      if (inserted) {
+         new_demand += temp;
+      }
+   }
+   new_demand.sgpr -= phi_sgpr_ops[block->index];
+
+   /* traverse the instructions backwards */
+   for (int idx = block->instructions.size() -1; idx >= 0; idx--)
+   {
+      /* subtract the 2 sgprs from exec */
+      if (exec_live)
+         assert(new_demand.sgpr >= 2);
+      register_demand[idx] = RegisterDemand(new_demand.vgpr, new_demand.sgpr - (exec_live ? 2 : 0));
+
+      Instruction *insn = block->instructions[idx].get();
+      /* KILL */
+      for (Definition& definition : insn->definitions) {
+         if (!definition.isTemp()) {
+            continue;
+         }
+
+         const Temp temp = definition.getTemp();
+         size_t n = 0;
+         if (temp.is_linear())
+            n = live_sgprs.erase(temp);
+         else
+            n = live_vgprs.erase(temp);
+
+         if (n) {
+            new_demand -= temp;
+            definition.setKill(false);
+         } else {
+            register_demand[idx] += temp;
+            definition.setKill(true);
+         }
+
+         if (definition.isFixed() && definition.physReg() == exec)
+            exec_live = false;
+      }
+
+      /* GEN */
+      if (insn->opcode == aco_opcode::p_phi ||
+          insn->opcode == aco_opcode::p_linear_phi) {
+         /* directly insert into the predecessors live-out set */
+         std::vector<unsigned>& preds = insn->opcode == aco_opcode::p_phi
+                                      ? block->logical_preds
+                                      : block->linear_preds;
+         for (unsigned i = 0; i < preds.size(); ++i)
+         {
+            Operand &operand = insn->operands[i];
+            if (!operand.isTemp()) {
+               continue;
+            }
+            /* check if we changed an already processed block */
+            const bool inserted = live_temps[preds[i]].insert(operand.getTemp()).second;
+            if (inserted) {
+               operand.setFirstKill(true);
+               worklist.insert(preds[i]);
+               if (insn->opcode == aco_opcode::p_phi && operand.getTemp().type() == RegType::sgpr)
+                  phi_sgpr_ops[preds[i]] += operand.size();
+            }
+         }
+      } else if (insn->opcode == aco_opcode::p_logical_end) {
+         new_demand.sgpr += phi_sgpr_ops[block->index];
+      } else {
+         for (unsigned i = 0; i < insn->operands.size(); ++i)
+         {
+            Operand& operand = insn->operands[i];
+            if (!operand.isTemp()) {
+               continue;
+            }
+            const Temp temp = operand.getTemp();
+            const bool inserted = temp.is_linear()
+                                ?
live_sgprs.insert(temp).second + : live_vgprs.insert(temp).second; + if (inserted) { + operand.setFirstKill(true); + for (unsigned j = i + 1; j < insn->operands.size(); ++j) { + if (insn->operands[j].isTemp() && insn->operands[j].tempId() == operand.tempId()) { + insn->operands[j].setFirstKill(false); + insn->operands[j].setKill(true); + } + } + new_demand += temp; + } else { + operand.setKill(false); + } + + if (operand.isFixed() && operand.physReg() == exec) + exec_live = true; + } + } + + block->register_demand.update(register_demand[idx]); + } + + /* now, we have the live-in sets and need to merge them into the live-out sets */ + for (unsigned pred_idx : block->logical_preds) { + for (Temp vgpr : live_vgprs) { + auto it = live_temps[pred_idx].insert(vgpr); + if (it.second) + worklist.insert(pred_idx); + } + } + + for (unsigned pred_idx : block->linear_preds) { + for (Temp sgpr : live_sgprs) { + auto it = live_temps[pred_idx].insert(sgpr); + if (it.second) + worklist.insert(pred_idx); + } + } + + if (!(block->index != 0 || (live_vgprs.empty() && live_sgprs.empty()))) { + aco_print_program(program, stderr); + fprintf(stderr, "These temporaries are never defined or are defined after use:\n"); + for (Temp vgpr : live_vgprs) + fprintf(stderr, "%%%d\n", vgpr.id()); + for (Temp sgpr : live_sgprs) + fprintf(stderr, "%%%d\n", sgpr.id()); + abort(); + } + + assert(block->index != 0 || new_demand == RegisterDemand()); +} +} /* end namespace */ + +void update_vgpr_sgpr_demand(Program* program, const RegisterDemand new_demand) +{ + // TODO: also take shared mem into account + const int16_t total_sgpr_regs = program->chip_class >= GFX8 ? 800 : 512; + const int16_t max_addressible_sgpr = program->sgpr_limit; + /* VGPRs are allocated in chunks of 4 */ + const int16_t rounded_vgpr_demand = std::max<int16_t>(4, (new_demand.vgpr + 3) & ~3); + /* SGPRs are allocated in chunks of 16 between 8 and 104. 
VCC occupies the last 2 registers */ + const int16_t rounded_sgpr_demand = std::min(std::max<int16_t>(8, (new_demand.sgpr + 2 + 7) & ~7), max_addressible_sgpr); + /* this won't compile, register pressure reduction necessary */ + if (new_demand.vgpr > 256 || new_demand.sgpr > max_addressible_sgpr) { + program->num_waves = 0; + program->max_reg_demand = new_demand; + } else { + program->num_waves = std::min<uint16_t>(10, + std::min<uint16_t>(256 / rounded_vgpr_demand, + total_sgpr_regs / rounded_sgpr_demand)); + + program->max_reg_demand = { int16_t((256 / program->num_waves) & ~3), std::min<int16_t>(((total_sgpr_regs / program->num_waves) & ~7) - 2, max_addressible_sgpr)}; + } +} + +live live_var_analysis(Program* program, + const struct radv_nir_compiler_options *options) +{ + live result; + result.live_out.resize(program->blocks.size()); + result.register_demand.resize(program->blocks.size()); + std::set<unsigned> worklist; + std::vector<uint16_t> phi_sgpr_ops(program->blocks.size()); + RegisterDemand new_demand; + + /* this implementation assumes that the block idx corresponds to the block's position in program->blocks vector */ + for (Block& block : program->blocks) + worklist.insert(block.index); + while (!worklist.empty()) { + std::set<unsigned>::reverse_iterator b_it = worklist.rbegin(); + unsigned block_idx = *b_it; + worklist.erase(block_idx); + process_live_temps_per_block(program, result, &program->blocks[block_idx], worklist, phi_sgpr_ops); + new_demand.update(program->blocks[block_idx].register_demand); + } + + /* calculate the program's register demand and number of waves */ + update_vgpr_sgpr_demand(program, new_demand); + + return result; +} + +} + diff --git a/src/amd/compiler/aco_lower_bool_phis.cpp b/src/amd/compiler/aco_lower_bool_phis.cpp new file mode 100644 index 00000000000..0c56ca07214 --- /dev/null +++ b/src/amd/compiler/aco_lower_bool_phis.cpp @@ -0,0 +1,241 @@ +/* + * Copyright © 2019 Valve Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ * + * Authors: + * Rhys Perry ([email protected]) + * + */ + +#include <map> + +#include "aco_ir.h" +#include "aco_builder.h" + + +namespace aco { + +struct phi_use { + Block *block; + unsigned phi_def; + + bool operator<(const phi_use& other) const { + return std::make_tuple(block, phi_def) < + std::make_tuple(other.block, other.phi_def); + } +}; + +struct ssa_state { + std::map<unsigned, unsigned> latest; + std::map<unsigned, std::map<phi_use, uint64_t>> phis; +}; + +Operand get_ssa(Program *program, unsigned block_idx, ssa_state *state) +{ + while (true) { + auto pos = state->latest.find(block_idx); + if (pos != state->latest.end()) + return Operand({pos->second, s2}); + + Block& block = program->blocks[block_idx]; + size_t pred = block.linear_preds.size(); + if (pred == 0) { + return Operand(s2); + } else if (pred == 1) { + block_idx = block.linear_preds[0]; + continue; + } else { + unsigned res = program->allocateId(); + state->latest[block_idx] = res; + + aco_ptr<Pseudo_instruction> phi{create_instruction<Pseudo_instruction>(aco_opcode::p_linear_phi, Format::PSEUDO, pred, 1)}; + for (unsigned i = 0; i < pred; i++) { + phi->operands[i] = get_ssa(program, block.linear_preds[i], state); + if (phi->operands[i].isTemp()) { + assert(i < 64); + state->phis[phi->operands[i].tempId()][(phi_use){&block, res}] |= (uint64_t)1 << i; + } + } + phi->definitions[0] = Definition(Temp{res, s2}); + block.instructions.emplace(block.instructions.begin(), std::move(phi)); + + return Operand({res, s2}); + } + } +} + +void update_phi(Program *program, ssa_state *state, Block *block, unsigned phi_def, uint64_t operand_mask) { + for (auto& phi : block->instructions) { + if (phi->opcode != aco_opcode::p_phi && phi->opcode != aco_opcode::p_linear_phi) + break; + if (phi->opcode != aco_opcode::p_linear_phi) + continue; + if (phi->definitions[0].tempId() != phi_def) + continue; + assert(ffsll(operand_mask) <= phi->operands.size()); + + uint64_t operands = operand_mask; + while (operands) { + unsigned operand = u_bit_scan64(&operands); + Operand new_operand = get_ssa(program, block->linear_preds[operand], state); + phi->operands[operand] = new_operand; + if (!new_operand.isUndefined()) + state->phis[new_operand.tempId()][(phi_use){block, phi_def}] |= (uint64_t)1 << operand; + } + return; + } + assert(false); +} + +Temp write_ssa(Program *program, Block *block, ssa_state *state, unsigned previous) { + unsigned id = program->allocateId(); + state->latest[block->index] = id; + + /* update phis */ + if (previous) { + std::map<phi_use, uint64_t> phis; + phis.swap(state->phis[previous]); + for (auto& phi : phis) + update_phi(program, state, phi.first.block, phi.first.phi_def, phi.second); + } + + return {id, s2}; +} + +void insert_before_branch(Block *block, aco_ptr<Instruction> instr) +{ + int end = block->instructions.size() - 1; + if (block->instructions[end]->format == Format::PSEUDO_BRANCH) + block->instructions.emplace(std::prev(block->instructions.end()), std::move(instr)); + else + block->instructions.emplace_back(std::move(instr)); +} + +void insert_before_logical_end(Block *block, aco_ptr<Instruction> instr) +{ + for (int i = block->instructions.size() - 1; i >= 0; --i) { + if (block->instructions[i]->opcode == aco_opcode::p_logical_end) { + block->instructions.emplace(std::next(block->instructions.begin(), i), std::move(instr)); + return; + } + } + insert_before_branch(block, std::move(instr)); +} + +aco_ptr<Instruction> lower_divergent_bool_phi(Program *program, Block *block, aco_ptr<Instruction>& phi) +{ 
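+   /* A divergent boolean is carried as a 64-bit lane mask (s2). For each
+    * logical predecessor, the incoming value is merged into the running mask:
+    *
+    *    new_mask = (prev_mask & ~exec) | (phi_src & exec)
+    *
+    * so lanes that are active in that predecessor take the incoming value and
+    * all other lanes keep their previous value; an s1 (scc) source is first
+    * expanded to a full mask with s_cselect_b64. get_ssa()/write_ssa() keep
+    * this running mask in SSA form over the linear CFG, inserting linear phis
+    * where predecessors disagree. */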
+ Builder bld(program); + + ssa_state state; + for (unsigned i = 0; i < phi->operands.size(); i++) { + Block *pred = &program->blocks[block->logical_preds[i]]; + + if (phi->operands[i].isUndefined()) + continue; + + assert(phi->operands[i].isTemp()); + Temp phi_src = phi->operands[i].getTemp(); + if (phi_src.regClass() == s1) { + Temp new_phi_src = bld.tmp(s2); + insert_before_logical_end(pred, + bld.sop2(aco_opcode::s_cselect_b64, Definition(new_phi_src), + Operand((uint32_t)-1), Operand(0u), bld.scc(phi_src)).get_ptr()); + phi_src = new_phi_src; + } + assert(phi_src.regClass() == s2); + + Operand cur = get_ssa(program, pred->index, &state); + Temp new_cur = write_ssa(program, pred, &state, cur.isTemp() ? cur.tempId() : 0); + + if (cur.isUndefined()) { + insert_before_logical_end(pred, bld.sop1(aco_opcode::s_mov_b64, Definition(new_cur), phi_src).get_ptr()); + } else { + Temp tmp1 = bld.tmp(s2), tmp2 = bld.tmp(s2); + insert_before_logical_end(pred, + bld.sop2(aco_opcode::s_andn2_b64, Definition(tmp1), bld.def(s1, scc), + cur, Operand(exec, s2)).get_ptr()); + insert_before_logical_end(pred, + bld.sop2(aco_opcode::s_and_b64, Definition(tmp2), bld.def(s1, scc), + phi_src, Operand(exec, s2)).get_ptr()); + insert_before_logical_end(pred, + bld.sop2(aco_opcode::s_or_b64, Definition(new_cur), bld.def(s1, scc), + tmp1, tmp2).get_ptr()); + } + } + + return bld.sop1(aco_opcode::s_mov_b64, phi->definitions[0], get_ssa(program, block->index, &state)).get_ptr(); +} + +void lower_linear_bool_phi(Program *program, Block *block, aco_ptr<Instruction>& phi) +{ + Builder bld(program); + + for (unsigned i = 0; i < phi->operands.size(); i++) { + if (!phi->operands[i].isTemp()) + continue; + + Temp phi_src = phi->operands[i].getTemp(); + if (phi_src.regClass() == s2) { + Temp new_phi_src = bld.tmp(s1); + insert_before_logical_end(&program->blocks[block->linear_preds[i]], + bld.sopc(aco_opcode::s_cmp_lg_u64, bld.scc(Definition(new_phi_src)), + Operand(0u), phi_src).get_ptr()); + phi->operands[i].setTemp(new_phi_src); + } + } +} + +void lower_bool_phis(Program* program) +{ + for (Block& block : program->blocks) { + std::vector<aco_ptr<Instruction>> instructions; + std::vector<aco_ptr<Instruction>> non_phi; + instructions.swap(block.instructions); + block.instructions.reserve(instructions.size()); + unsigned i = 0; + for (; i < instructions.size(); i++) + { + aco_ptr<Instruction>& phi = instructions[i]; + if (phi->opcode != aco_opcode::p_phi && phi->opcode != aco_opcode::p_linear_phi) + break; + if (phi->opcode == aco_opcode::p_phi && phi->definitions[0].regClass() == s2) { + non_phi.emplace_back(std::move(lower_divergent_bool_phi(program, &block, phi))); + } else if (phi->opcode == aco_opcode::p_linear_phi && phi->definitions[0].regClass() == s1) { + /* if it's a valid non-boolean phi, this should be a no-op */ + lower_linear_bool_phi(program, &block, phi); + block.instructions.emplace_back(std::move(phi)); + } else { + block.instructions.emplace_back(std::move(phi)); + } + } + for (auto&& instr : non_phi) { + assert(instr->opcode != aco_opcode::p_phi && instr->opcode != aco_opcode::p_linear_phi); + block.instructions.emplace_back(std::move(instr)); + } + for (; i < instructions.size(); i++) { + aco_ptr<Instruction> instr = std::move(instructions[i]); + assert(instr->opcode != aco_opcode::p_phi && instr->opcode != aco_opcode::p_linear_phi); + block.instructions.emplace_back(std::move(instr)); + } + } +} + +} diff --git a/src/amd/compiler/aco_lower_to_hw_instr.cpp b/src/amd/compiler/aco_lower_to_hw_instr.cpp new 
file mode 100644 index 00000000000..8fd33e47d92 --- /dev/null +++ b/src/amd/compiler/aco_lower_to_hw_instr.cpp @@ -0,0 +1,765 @@ +/* + * Copyright © 2018 Valve Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + * Authors: + * Daniel Schürmann ([email protected]) + * + */ + +#include <map> + +#include "aco_ir.h" +#include "aco_builder.h" +#include "util/u_math.h" +#include "sid.h" + + +namespace aco { + +struct lower_context { + Program *program; + std::vector<aco_ptr<Instruction>> instructions; +}; + +void emit_dpp_op(lower_context *ctx, PhysReg dst, PhysReg src0, PhysReg src1, PhysReg vtmp, PhysReg wrtmp, + aco_opcode op, Format format, bool clobber_vcc, unsigned dpp_ctrl, + unsigned row_mask, unsigned bank_mask, bool bound_ctrl_zero, unsigned size, + Operand *identity=NULL) /* for VOP3 with sparse writes */ +{ + RegClass rc = RegClass(RegType::vgpr, size); + if (format == Format::VOP3) { + Builder bld(ctx->program, &ctx->instructions); + + if (identity) + bld.vop1(aco_opcode::v_mov_b32, Definition(vtmp, v1), identity[0]); + if (identity && size >= 2) + bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg{vtmp+1}, v1), identity[1]); + + for (unsigned i = 0; i < size; i++) + bld.vop1_dpp(aco_opcode::v_mov_b32, Definition(PhysReg{vtmp+i}, v1), Operand(PhysReg{src0+i}, v1), + dpp_ctrl, row_mask, bank_mask, bound_ctrl_zero); + + if (clobber_vcc) + bld.vop3(op, Definition(dst, rc), Definition(vcc, s2), Operand(vtmp, rc), Operand(src1, rc)); + else + bld.vop3(op, Definition(dst, rc), Operand(vtmp, rc), Operand(src1, rc)); + } else { + assert(format == Format::VOP2 || format == Format::VOP1); + assert(size == 1 || (op == aco_opcode::v_mov_b32)); + + for (unsigned i = 0; i < size; i++) { + aco_ptr<DPP_instruction> dpp{create_instruction<DPP_instruction>( + op, (Format) ((uint32_t) format | (uint32_t) Format::DPP), + format == Format::VOP2 ? 2 : 1, clobber_vcc ? 
2 : 1)}; + dpp->operands[0] = Operand(PhysReg{src0+i}, rc); + if (format == Format::VOP2) + dpp->operands[1] = Operand(PhysReg{src1+i}, rc); + dpp->definitions[0] = Definition(PhysReg{dst+i}, rc); + if (clobber_vcc) + dpp->definitions[1] = Definition(vcc, s2); + dpp->dpp_ctrl = dpp_ctrl; + dpp->row_mask = row_mask; + dpp->bank_mask = bank_mask; + dpp->bound_ctrl = bound_ctrl_zero; + ctx->instructions.emplace_back(std::move(dpp)); + } + } +} + +uint32_t get_reduction_identity(ReduceOp op, unsigned idx) +{ + switch (op) { + case iadd32: + case iadd64: + case fadd32: + case fadd64: + case ior32: + case ior64: + case ixor32: + case ixor64: + case umax32: + case umax64: + return 0; + case imul32: + case imul64: + return idx ? 0 : 1; + case fmul32: + return 0x3f800000u; /* 1.0 */ + case fmul64: + return idx ? 0x3ff00000u : 0u; /* 1.0 */ + case imin32: + return INT32_MAX; + case imin64: + return idx ? 0x7fffffffu : 0xffffffffu; + case imax32: + return INT32_MIN; + case imax64: + return idx ? 0x80000000u : 0; + case umin32: + case umin64: + case iand32: + case iand64: + return 0xffffffffu; + case fmin32: + return 0x7f800000u; /* infinity */ + case fmin64: + return idx ? 0x7ff00000u : 0u; /* infinity */ + case fmax32: + return 0xff800000u; /* negative infinity */ + case fmax64: + return idx ? 0xfff00000u : 0u; /* negative infinity */ + } + unreachable("Invalid reduction operation"); +} + +aco_opcode get_reduction_opcode(lower_context *ctx, ReduceOp op, bool *clobber_vcc, Format *format) +{ + *clobber_vcc = false; + *format = Format::VOP2; + switch (op) { + case iadd32: + *clobber_vcc = ctx->program->chip_class < GFX9; + return ctx->program->chip_class < GFX9 ? aco_opcode::v_add_co_u32 : aco_opcode::v_add_u32; + case imul32: + *format = Format::VOP3; + return aco_opcode::v_mul_lo_u32; + case fadd32: + return aco_opcode::v_add_f32; + case fmul32: + return aco_opcode::v_mul_f32; + case imax32: + return aco_opcode::v_max_i32; + case imin32: + return aco_opcode::v_min_i32; + case umin32: + return aco_opcode::v_min_u32; + case umax32: + return aco_opcode::v_max_u32; + case fmin32: + return aco_opcode::v_min_f32; + case fmax32: + return aco_opcode::v_max_f32; + case iand32: + return aco_opcode::v_and_b32; + case ixor32: + return aco_opcode::v_xor_b32; + case ior32: + return aco_opcode::v_or_b32; + case iadd64: + case imul64: + assert(false); + break; + case fadd64: + *format = Format::VOP3; + return aco_opcode::v_add_f64; + case fmul64: + *format = Format::VOP3; + return aco_opcode::v_mul_f64; + case imin64: + case imax64: + case umin64: + case umax64: + assert(false); + break; + case fmin64: + *format = Format::VOP3; + return aco_opcode::v_min_f64; + case fmax64: + *format = Format::VOP3; + return aco_opcode::v_max_f64; + case iand64: + case ior64: + case ixor64: + assert(false); + break; + } + unreachable("Invalid reduction operation"); + return aco_opcode::v_min_u32; +} + +void emit_vopn(lower_context *ctx, PhysReg dst, PhysReg src0, PhysReg src1, + RegClass rc, aco_opcode op, Format format, bool clobber_vcc) +{ + aco_ptr<Instruction> instr; + switch (format) { + case Format::VOP2: + instr.reset(create_instruction<VOP2_instruction>(op, format, 2, clobber_vcc ? 2 : 1)); + break; + case Format::VOP3: + instr.reset(create_instruction<VOP3A_instruction>(op, format, 2, clobber_vcc ? 
2 : 1)); + break; + default: + assert(false); + } + instr->operands[0] = Operand(src0, rc); + instr->operands[1] = Operand(src1, rc); + instr->definitions[0] = Definition(dst, rc); + if (clobber_vcc) + instr->definitions[1] = Definition(vcc, s2); + ctx->instructions.emplace_back(std::move(instr)); +} + +void emit_reduction(lower_context *ctx, aco_opcode op, ReduceOp reduce_op, unsigned cluster_size, PhysReg tmp, + PhysReg stmp, PhysReg vtmp, PhysReg sitmp, Operand src, Definition dst) +{ + assert(cluster_size == 64 || op == aco_opcode::p_reduce); + + Builder bld(ctx->program, &ctx->instructions); + + PhysReg wrtmp{0}; /* should never be needed */ + + Format format; + bool should_clobber_vcc; + aco_opcode reduce_opcode = get_reduction_opcode(ctx, reduce_op, &should_clobber_vcc, &format); + Operand identity[2]; + identity[0] = Operand(get_reduction_identity(reduce_op, 0)); + identity[1] = Operand(get_reduction_identity(reduce_op, 1)); + Operand vcndmask_identity[2] = {identity[0], identity[1]}; + + /* First, copy the source to tmp and set inactive lanes to the identity */ + // note: this clobbers SCC! + bld.sop1(aco_opcode::s_or_saveexec_b64, Definition(stmp, s2), Definition(scc, s1), Definition(exec, s2), Operand(UINT64_MAX), Operand(exec, s2)); + + for (unsigned i = 0; i < src.size(); i++) { + /* p_exclusive_scan needs it to be a sgpr or inline constant for the v_writelane_b32 */ + if (identity[i].isLiteral() && op == aco_opcode::p_exclusive_scan) { + bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg{sitmp+i}, s1), identity[i]); + identity[i] = Operand(PhysReg{sitmp+i}, s1); + + bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg{tmp+i}, v1), identity[i]); + vcndmask_identity[i] = Operand(PhysReg{tmp+i}, v1); + } else if (identity[i].isLiteral()) { + bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg{tmp+i}, v1), identity[i]); + vcndmask_identity[i] = Operand(PhysReg{tmp+i}, v1); + } + } + + for (unsigned i = 0; i < src.size(); i++) { + bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(PhysReg{tmp + i}, v1), + vcndmask_identity[i], Operand(PhysReg{src.physReg() + i}, v1), + Operand(stmp, s2)); + } + + bool exec_restored = false; + bool dst_written = false; + switch (op) { + case aco_opcode::p_reduce: + if (cluster_size == 1) break; + emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, wrtmp, reduce_opcode, format, should_clobber_vcc, + dpp_quad_perm(1, 0, 3, 2), 0xf, 0xf, false, src.size()); + if (cluster_size == 2) break; + emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, wrtmp, reduce_opcode, format, should_clobber_vcc, + dpp_quad_perm(2, 3, 0, 1), 0xf, 0xf, false, src.size()); + if (cluster_size == 4) break; + emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, wrtmp, reduce_opcode, format, should_clobber_vcc, + dpp_row_half_mirror, 0xf, 0xf, false, src.size()); + if (cluster_size == 8) break; + emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, wrtmp, reduce_opcode, format, should_clobber_vcc, + dpp_row_mirror, 0xf, 0xf, false, src.size()); + if (cluster_size == 16) break; + if (cluster_size == 32) { + for (unsigned i = 0; i < src.size(); i++) + bld.ds(aco_opcode::ds_swizzle_b32, Definition(PhysReg{vtmp+i}, v1), Operand(PhysReg{tmp+i}, s1), ds_pattern_bitmode(0x1f, 0, 0x10)); + bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand(stmp, s2)); + exec_restored = true; + emit_vopn(ctx, dst.physReg(), vtmp, tmp, src.regClass(), reduce_opcode, format, should_clobber_vcc); + dst_written = true; + } else { + assert(cluster_size == 64); + emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, wrtmp, reduce_opcode, format, should_clobber_vcc, + 
dpp_row_bcast15, 0xa, 0xf, false, src.size());
+         emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, wrtmp, reduce_opcode, format, should_clobber_vcc,
+                     dpp_row_bcast31, 0xc, 0xf, false, src.size());
+      }
+      break;
+   case aco_opcode::p_exclusive_scan:
+      emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, wrtmp, aco_opcode::v_mov_b32, Format::VOP1, false,
+                  dpp_wf_sr1, 0xf, 0xf, true, src.size());
+      for (unsigned i = 0; i < src.size(); i++) {
+         if (!identity[i].isConstant() || identity[i].constantValue()) { /* bound_ctrl should take care of this otherwise */
+            assert((identity[i].isConstant() && !identity[i].isLiteral()) || identity[i].physReg() == PhysReg{sitmp+i});
+            bld.vop3(aco_opcode::v_writelane_b32, Definition(PhysReg{tmp+i}, v1),
+                     identity[i], Operand(0u));
+         }
+      }
+      /* fall through */
+   case aco_opcode::p_inclusive_scan:
+      assert(cluster_size == 64);
+      emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, wrtmp, reduce_opcode, format, should_clobber_vcc,
+                  dpp_row_sr(1), 0xf, 0xf, false, src.size(), identity);
+      emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, wrtmp, reduce_opcode, format, should_clobber_vcc,
+                  dpp_row_sr(2), 0xf, 0xf, false, src.size(), identity);
+      emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, wrtmp, reduce_opcode, format, should_clobber_vcc,
+                  dpp_row_sr(4), 0xf, 0xf, false, src.size(), identity);
+      emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, wrtmp, reduce_opcode, format, should_clobber_vcc,
+                  dpp_row_sr(8), 0xf, 0xf, false, src.size(), identity);
+      emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, wrtmp, reduce_opcode, format, should_clobber_vcc,
+                  dpp_row_bcast15, 0xa, 0xf, false, src.size(), identity);
+      emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, wrtmp, reduce_opcode, format, should_clobber_vcc,
+                  dpp_row_bcast31, 0xc, 0xf, false, src.size(), identity);
+      break;
+   default:
+      unreachable("Invalid reduction mode");
+   }
+
+   if (!exec_restored)
+      bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand(stmp, s2));
+
+   if (op == aco_opcode::p_reduce && cluster_size == 64) {
+      for (unsigned k = 0; k < src.size(); k++) {
+         bld.vop3(aco_opcode::v_readlane_b32, Definition(PhysReg{dst.physReg() + k}, s1),
+                  Operand(PhysReg{tmp + k}, v1), Operand(63u));
+      }
+   } else if (!(dst.physReg() == tmp) && !dst_written) {
+      for (unsigned k = 0; k < src.size(); k++) {
+         bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg{dst.physReg() + k}, s1),
+                  Operand(PhysReg{tmp + k}, v1));
+      }
+   }
+}
+
+struct copy_operation {
+   Operand op;
+   Definition def;
+   unsigned uses;
+   unsigned size;
+};
+
+void handle_operands(std::map<PhysReg, copy_operation>& copy_map, lower_context* ctx, chip_class chip_class, Pseudo_instruction *pi)
+{
+   Builder bld(ctx->program, &ctx->instructions);
+   aco_ptr<Instruction> mov;
+   std::map<PhysReg, copy_operation>::iterator it = copy_map.begin();
+   std::map<PhysReg, copy_operation>::iterator target;
+   bool writes_scc = false;
+
+   /* count the number of uses for each dst reg */
+   while (it != copy_map.end()) {
+      if (it->second.op.isConstant()) {
+         ++it;
+         continue;
+      }
+
+      if (it->second.def.physReg() == scc)
+         writes_scc = true;
+
+      assert(!pi->tmp_in_scc || !(it->second.def.physReg() == pi->scratch_sgpr));
+
+      /* if src and dst reg are the same, remove operation */
+      if (it->first == it->second.op.physReg()) {
+         it = copy_map.erase(it);
+         continue;
+      }
+      /* check if the operand reg may be overwritten by another copy operation */
+      target = copy_map.find(it->second.op.physReg());
+      if (target != copy_map.end()) {
+         target->second.uses++;
+      }
+
+      ++it;
+   }
+
+   /* first, handle paths in the location transfer graph */
+   bool preserve_scc =
pi->tmp_in_scc && !writes_scc; + it = copy_map.begin(); + while (it != copy_map.end()) { + + /* the target reg is not used as operand for any other copy */ + if (it->second.uses == 0) { + + /* try to coalesce 32-bit sgpr copies to 64-bit copies */ + if (it->second.def.getTemp().type() == RegType::sgpr && it->second.size == 1 && + !it->second.op.isConstant() && it->first % 2 == it->second.op.physReg() % 2) { + + PhysReg other_def_reg = PhysReg{it->first % 2 ? it->first - 1 : it->first + 1}; + PhysReg other_op_reg = PhysReg{it->first % 2 ? it->second.op.physReg() - 1 : it->second.op.physReg() + 1}; + std::map<PhysReg, copy_operation>::iterator other = copy_map.find(other_def_reg); + + if (other != copy_map.end() && !other->second.uses && other->second.size == 1 && + other->second.op.physReg() == other_op_reg && !other->second.op.isConstant()) { + std::map<PhysReg, copy_operation>::iterator to_erase = it->first % 2 ? it : other; + it = it->first % 2 ? other : it; + copy_map.erase(to_erase); + it->second.size = 2; + } + } + + if (it->second.def.physReg() == scc) { + bld.sopc(aco_opcode::s_cmp_lg_i32, it->second.def, it->second.op, Operand(0u)); + preserve_scc = true; + } else if (it->second.size == 2 && it->second.def.getTemp().type() == RegType::sgpr) { + bld.sop1(aco_opcode::s_mov_b64, it->second.def, Operand(it->second.op.physReg(), s2)); + } else { + bld.copy(it->second.def, it->second.op); + } + + /* reduce the number of uses of the operand reg by one */ + if (!it->second.op.isConstant()) { + for (unsigned i = 0; i < it->second.size; i++) { + target = copy_map.find(PhysReg{it->second.op.physReg() + i}); + if (target != copy_map.end()) + target->second.uses--; + } + } + + copy_map.erase(it); + it = copy_map.begin(); + continue; + } else { + /* the target reg is used as operand, check the next entry */ + ++it; + } + } + + if (copy_map.empty()) + return; + + /* all target regs are needed as operand somewhere which means, all entries are part of a cycle */ + bool constants = false; + for (it = copy_map.begin(); it != copy_map.end(); ++it) { + assert(it->second.op.isFixed()); + if (it->first == it->second.op.physReg()) + continue; + /* do constants later */ + if (it->second.op.isConstant()) { + constants = true; + continue; + } + + if (preserve_scc && it->second.def.getTemp().type() == RegType::sgpr) + assert(!(it->second.def.physReg() == pi->scratch_sgpr)); + + /* to resolve the cycle, we have to swap the src reg with the dst reg */ + copy_operation swap = it->second; + assert(swap.op.regClass() == swap.def.regClass()); + Operand def_as_op = Operand(swap.def.physReg(), swap.def.regClass()); + Definition op_as_def = Definition(swap.op.physReg(), swap.op.regClass()); + if (chip_class >= GFX9 && swap.def.getTemp().type() == RegType::vgpr) { + bld.vop1(aco_opcode::v_swap_b32, swap.def, op_as_def, swap.op, def_as_op); + } else if (swap.op.physReg() == scc || swap.def.physReg() == scc) { + /* we need to swap scc and another sgpr */ + assert(!preserve_scc); + + PhysReg other = swap.op.physReg() == scc ? 
swap.def.physReg() : swap.op.physReg(); + + bld.sop1(aco_opcode::s_mov_b32, Definition(pi->scratch_sgpr, s1), Operand(scc, s1)); + bld.sopc(aco_opcode::s_cmp_lg_i32, Definition(scc, s1), Operand(other, s1), Operand(0u)); + bld.sop1(aco_opcode::s_mov_b32, Definition(other, s1), Operand(pi->scratch_sgpr, s1)); + } else if (swap.def.getTemp().type() == RegType::sgpr) { + if (preserve_scc) { + bld.sop1(aco_opcode::s_mov_b32, Definition(pi->scratch_sgpr, s1), swap.op); + bld.sop1(aco_opcode::s_mov_b32, op_as_def, def_as_op); + bld.sop1(aco_opcode::s_mov_b32, swap.def, Operand(pi->scratch_sgpr, s1)); + } else { + bld.sop2(aco_opcode::s_xor_b32, op_as_def, Definition(scc, s1), swap.op, def_as_op); + bld.sop2(aco_opcode::s_xor_b32, swap.def, Definition(scc, s1), swap.op, def_as_op); + bld.sop2(aco_opcode::s_xor_b32, op_as_def, Definition(scc, s1), swap.op, def_as_op); + } + } else { + bld.vop2(aco_opcode::v_xor_b32, op_as_def, swap.op, def_as_op); + bld.vop2(aco_opcode::v_xor_b32, swap.def, swap.op, def_as_op); + bld.vop2(aco_opcode::v_xor_b32, op_as_def, swap.op, def_as_op); + } + + /* change the operand reg of the target's use */ + assert(swap.uses == 1); + target = it; + for (++target; target != copy_map.end(); ++target) { + if (target->second.op.physReg() == it->first) { + target->second.op.setFixed(swap.op.physReg()); + break; + } + } + } + + /* copy constants into a registers which were operands */ + if (constants) { + for (it = copy_map.begin(); it != copy_map.end(); ++it) { + if (!it->second.op.isConstant()) + continue; + if (it->second.def.physReg() == scc) { + bld.sopc(aco_opcode::s_cmp_lg_i32, Definition(scc, s1), Operand(0u), Operand(it->second.op.constantValue() ? 1u : 0u)); + } else { + bld.copy(it->second.def, it->second.op); + } + } + } +} + +void lower_to_hw_instr(Program* program) +{ + Block *discard_block = NULL; + + for (size_t i = 0; i < program->blocks.size(); i++) + { + Block *block = &program->blocks[i]; + lower_context ctx; + ctx.program = program; + Builder bld(program, &ctx.instructions); + + for (size_t j = 0; j < block->instructions.size(); j++) { + aco_ptr<Instruction>& instr = block->instructions[j]; + aco_ptr<Instruction> mov; + if (instr->format == Format::PSEUDO) { + Pseudo_instruction *pi = (Pseudo_instruction*)instr.get(); + + switch (instr->opcode) + { + case aco_opcode::p_extract_vector: + { + unsigned reg = instr->operands[0].physReg() + instr->operands[1].constantValue() * instr->definitions[0].size(); + RegClass rc = RegClass(instr->operands[0].getTemp().type(), 1); + RegClass rc_def = RegClass(instr->definitions[0].getTemp().type(), 1); + if (reg == instr->definitions[0].physReg()) + break; + + std::map<PhysReg, copy_operation> copy_operations; + for (unsigned i = 0; i < instr->definitions[0].size(); i++) { + Definition def = Definition(PhysReg{instr->definitions[0].physReg() + i}, rc_def); + copy_operations[def.physReg()] = {Operand(PhysReg{reg + i}, rc), def, 0, 1}; + } + handle_operands(copy_operations, &ctx, program->chip_class, pi); + break; + } + case aco_opcode::p_create_vector: + { + std::map<PhysReg, copy_operation> copy_operations; + RegClass rc_def = RegClass(instr->definitions[0].getTemp().type(), 1); + unsigned reg_idx = 0; + for (const Operand& op : instr->operands) { + if (op.isConstant()) { + const PhysReg reg = PhysReg{instr->definitions[0].physReg() + reg_idx}; + const Definition def = Definition(reg, rc_def); + copy_operations[reg] = {op, def, 0, 1}; + reg_idx++; + continue; + } + + RegClass rc_op = RegClass(op.getTemp().type(), 1); + 
for (unsigned j = 0; j < op.size(); j++) + { + const Operand copy_op = Operand(PhysReg{op.physReg() + j}, rc_op); + const Definition def = Definition(PhysReg{instr->definitions[0].physReg() + reg_idx}, rc_def); + copy_operations[def.physReg()] = {copy_op, def, 0, 1}; + reg_idx++; + } + } + handle_operands(copy_operations, &ctx, program->chip_class, pi); + break; + } + case aco_opcode::p_split_vector: + { + std::map<PhysReg, copy_operation> copy_operations; + RegClass rc_op = instr->operands[0].isConstant() ? s1 : RegClass(instr->operands[0].regClass().type(), 1); + for (unsigned i = 0; i < instr->definitions.size(); i++) { + unsigned k = instr->definitions[i].size(); + RegClass rc_def = RegClass(instr->definitions[i].getTemp().type(), 1); + for (unsigned j = 0; j < k; j++) { + Operand op = Operand(PhysReg{instr->operands[0].physReg() + (i*k+j)}, rc_op); + Definition def = Definition(PhysReg{instr->definitions[i].physReg() + j}, rc_def); + copy_operations[def.physReg()] = {op, def, 0, 1}; + } + } + handle_operands(copy_operations, &ctx, program->chip_class, pi); + break; + } + case aco_opcode::p_parallelcopy: + case aco_opcode::p_wqm: + { + std::map<PhysReg, copy_operation> copy_operations; + for (unsigned i = 0; i < instr->operands.size(); i++) + { + Operand operand = instr->operands[i]; + if (operand.isConstant() || operand.size() == 1) { + assert(instr->definitions[i].size() == 1); + copy_operations[instr->definitions[i].physReg()] = {operand, instr->definitions[i], 0, 1}; + } else { + RegClass def_rc = RegClass(instr->definitions[i].regClass().type(), 1); + RegClass op_rc = RegClass(operand.getTemp().type(), 1); + for (unsigned j = 0; j < operand.size(); j++) + { + Operand op = Operand(PhysReg{instr->operands[i].physReg() + j}, op_rc); + Definition def = Definition(PhysReg{instr->definitions[i].physReg() + j}, def_rc); + copy_operations[def.physReg()] = {op, def, 0, 1}; + } + } + } + handle_operands(copy_operations, &ctx, program->chip_class, pi); + break; + } + case aco_opcode::p_discard_if: + { + bool early_exit = false; + if (block->instructions[j + 1]->opcode != aco_opcode::p_logical_end || + block->instructions[j + 2]->opcode != aco_opcode::s_endpgm) { + early_exit = true; + } + + if (early_exit && !discard_block) { + discard_block = program->create_and_insert_block(); + block = &program->blocks[i]; + + bld.reset(discard_block); + bld.exp(aco_opcode::exp, Operand(v1), Operand(v1), Operand(v1), Operand(v1), + 0, V_008DFC_SQ_EXP_NULL, false, true, true); + if (program->wb_smem_l1_on_end) + bld.smem(aco_opcode::s_dcache_wb); + bld.sopp(aco_opcode::s_endpgm); + + bld.reset(&ctx.instructions); + } + + // TODO: optimize uniform conditions + Definition branch_cond = instr->definitions.back(); + Operand discard_cond = instr->operands.back(); + aco_ptr<Instruction> sop2; + /* backwards, to finally branch on the global exec mask */ + for (int i = instr->operands.size() - 2; i >= 0; i--) { + bld.sop2(aco_opcode::s_andn2_b64, + instr->definitions[i], /* new mask */ + branch_cond, /* scc */ + instr->operands[i], /* old mask */ + discard_cond); + } + + if (early_exit) { + bld.sopp(aco_opcode::s_cbranch_scc0, bld.scc(branch_cond.getTemp()), discard_block->index); + + discard_block->linear_preds.push_back(block->index); + block->linear_succs.push_back(discard_block->index); + } + + break; + } + case aco_opcode::p_spill: + { + assert(instr->operands[0].regClass() == v1.as_linear()); + for (unsigned i = 0; i < instr->operands[2].size(); i++) { + bld.vop3(aco_opcode::v_writelane_b32, bld.def(v1, 
instr->operands[0].physReg()), + Operand(PhysReg{instr->operands[2].physReg() + i}, s1), + Operand(instr->operands[1].constantValue() + i)); + } + break; + } + case aco_opcode::p_reload: + { + assert(instr->operands[0].regClass() == v1.as_linear()); + for (unsigned i = 0; i < instr->definitions[0].size(); i++) { + bld.vop3(aco_opcode::v_readlane_b32, + bld.def(s1, PhysReg{instr->definitions[0].physReg() + i}), + instr->operands[0], Operand(instr->operands[1].constantValue() + i)); + } + break; + } + case aco_opcode::p_as_uniform: + { + if (instr->operands[0].isConstant() || instr->operands[0].regClass().type() == RegType::sgpr) { + std::map<PhysReg, copy_operation> copy_operations; + Operand operand = instr->operands[0]; + if (operand.isConstant() || operand.size() == 1) { + assert(instr->definitions[0].size() == 1); + copy_operations[instr->definitions[0].physReg()] = {operand, instr->definitions[0], 0, 1}; + } else { + for (unsigned i = 0; i < operand.size(); i++) + { + Operand op = Operand(PhysReg{operand.physReg() + i}, s1); + Definition def = Definition(PhysReg{instr->definitions[0].physReg() + i}, s1); + copy_operations[def.physReg()] = {op, def, 0, 1}; + } + } + + handle_operands(copy_operations, &ctx, program->chip_class, pi); + } else { + assert(instr->operands[0].regClass().type() == RegType::vgpr); + assert(instr->definitions[0].regClass().type() == RegType::sgpr); + assert(instr->operands[0].size() == instr->definitions[0].size()); + for (unsigned i = 0; i < instr->definitions[0].size(); i++) { + bld.vop1(aco_opcode::v_readfirstlane_b32, + bld.def(s1, PhysReg{instr->definitions[0].physReg() + i}), + Operand(PhysReg{instr->operands[0].physReg() + i}, v1)); + } + } + break; + } + default: + break; + } + } else if (instr->format == Format::PSEUDO_BRANCH) { + Pseudo_branch_instruction* branch = static_cast<Pseudo_branch_instruction*>(instr.get()); + /* check if all blocks from current to target are empty */ + bool can_remove = block->index < branch->target[0]; + for (unsigned i = block->index + 1; can_remove && i < branch->target[0]; i++) { + if (program->blocks[i].instructions.size()) + can_remove = false; + } + if (can_remove) + continue; + + switch (instr->opcode) { + case aco_opcode::p_branch: + assert(block->linear_succs[0] == branch->target[0]); + bld.sopp(aco_opcode::s_branch, branch->target[0]); + break; + case aco_opcode::p_cbranch_nz: + assert(block->linear_succs[1] == branch->target[0]); + if (branch->operands[0].physReg() == exec) + bld.sopp(aco_opcode::s_cbranch_execnz, branch->target[0]); + else if (branch->operands[0].physReg() == vcc) + bld.sopp(aco_opcode::s_cbranch_vccnz, branch->target[0]); + else { + assert(branch->operands[0].physReg() == scc); + bld.sopp(aco_opcode::s_cbranch_scc1, branch->target[0]); + } + break; + case aco_opcode::p_cbranch_z: + assert(block->linear_succs[1] == branch->target[0]); + if (branch->operands[0].physReg() == exec) + bld.sopp(aco_opcode::s_cbranch_execz, branch->target[0]); + else if (branch->operands[0].physReg() == vcc) + bld.sopp(aco_opcode::s_cbranch_vccz, branch->target[0]); + else { + assert(branch->operands[0].physReg() == scc); + bld.sopp(aco_opcode::s_cbranch_scc0, branch->target[0]); + } + break; + default: + unreachable("Unknown Pseudo branch instruction!"); + } + + } else if (instr->format == Format::PSEUDO_REDUCTION) { + Pseudo_reduction_instruction* reduce = static_cast<Pseudo_reduction_instruction*>(instr.get()); + emit_reduction(&ctx, reduce->opcode, reduce->reduce_op, reduce->cluster_size, + 
reduce->operands[1].physReg(), // tmp + reduce->definitions[1].physReg(), // stmp + reduce->operands[2].physReg(), // vtmp + reduce->definitions[2].physReg(), // sitmp + reduce->operands[0], reduce->definitions[0]); + } else { + ctx.instructions.emplace_back(std::move(instr)); + } + + } + block->instructions.swap(ctx.instructions); + } +} + +} diff --git a/src/amd/compiler/aco_opcodes.py b/src/amd/compiler/aco_opcodes.py new file mode 100644 index 00000000000..2221e2817af --- /dev/null +++ b/src/amd/compiler/aco_opcodes.py @@ -0,0 +1,1552 @@ +# +# Copyright (c) 2018 Valve Corporation +# +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice (including the next +# paragraph) shall be included in all copies or substantial portions of the +# Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +# IN THE SOFTWARE. +# +# Authors: +# Daniel Schuermann ([email protected]) + + +# Class that represents all the information we have about the opcode +# NOTE: this must be kept in sync with aco_op_info + +from enum import Enum + +class Format(Enum): + PSEUDO = 0 + SOP1 = 1 + SOP2 = 2 + SOPK = 3 + SOPP = 4 + SOPC = 5 + SMEM = 6 + DS = 8 + MTBUF = 9 + MUBUF = 10 + MIMG = 11 + EXP = 12 + FLAT = 13 + GLOBAL = 14 + SCRATCH = 15 + PSEUDO_BRANCH = 16 + PSEUDO_BARRIER = 17 + PSEUDO_REDUCTION = 18 + VOP1 = 1 << 8 + VOP2 = 1 << 9 + VOPC = 1 << 10 + VOP3A = 1 << 11 + VOP3B = 1 << 11 + VOP3P = 1 << 12 + VINTRP = 1 << 13 + DPP = 1 << 14 + SDWA = 1 << 15 + + def get_builder_fields(self): + if self == Format.SOPK: + return [('uint16_t', 'imm', None)] + elif self == Format.SOPP: + return [('uint32_t', 'block', '-1'), + ('uint32_t', 'imm', '0')] + elif self == Format.SMEM: + return [('bool', 'can_reorder', 'true'), + ('bool', 'glc', 'false'), + ('bool', 'dlc', 'false'), + ('bool', 'nv', 'false')] + elif self == Format.DS: + return [('int16_t', 'offset0', '0'), + ('int8_t', 'offset1', '0'), + ('bool', 'gds', 'false')] + elif self == Format.MTBUF: + return [('unsigned', 'dfmt', None), + ('unsigned', 'nfmt', None), + ('unsigned', 'img_format', None), + ('unsigned', 'offset', None), + ('bool', 'offen', None), + ('bool', 'idxen', 'false'), + ('bool', 'disable_wqm', 'false'), + ('bool', 'glc', 'false'), + ('bool', 'dlc', 'false'), + ('bool', 'slc', 'false'), + ('bool', 'tfe', 'false'), + ('bool', 'lds', 'false')] + elif self == Format.MUBUF: + return [('unsigned', 'offset', None), + ('bool', 'offen', None), + ('bool', 'idxen', 'false'), + ('bool', 'disable_wqm', 'false'), + ('bool', 'glc', 'false'), + ('bool', 'dlc', 'false'), + ('bool', 'slc', 'false'), + ('bool', 'tfe', 'false'), + ('bool', 'lds', 'false')] + elif self == Format.MIMG: + return [('unsigned', 
'dmask', '0xF'), + ('bool', 'da', 'false'), + ('bool', 'unrm', 'true'), + ('bool', 'disable_wqm', 'false'), + ('bool', 'glc', 'false'), + ('bool', 'dlc', 'false'), + ('bool', 'slc', 'false'), + ('bool', 'tfe', 'false'), + ('bool', 'lwe', 'false'), + ('bool', 'r128_a16', 'false', 'r128'), + ('bool', 'd16', 'false')] + return [('unsigned', 'attribute', None), + ('unsigned', 'component', None)] + elif self == Format.EXP: + return [('unsigned', 'enabled_mask', None), + ('unsigned', 'dest', None), + ('bool', 'compr', 'false', 'compressed'), + ('bool', 'done', 'false'), + ('bool', 'vm', 'false', 'valid_mask')] + elif self == Format.PSEUDO_BRANCH: + return [('uint32_t', 'target0', '0', 'target[0]'), + ('uint32_t', 'target1', '0', 'target[1]')] + elif self == Format.PSEUDO_REDUCTION: + return [('ReduceOp', 'op', None, 'reduce_op'), + ('unsigned', 'cluster_size', '0')] + elif self == Format.VINTRP: + return [('unsigned', 'attribute', None), + ('unsigned', 'component', None)] + elif self == Format.DPP: + return [('uint16_t', 'dpp_ctrl', None), + ('uint8_t', 'row_mask', '0xF'), + ('uint8_t', 'bank_mask', '0xF'), + ('bool', 'bound_ctrl', 'false')] + elif self in [Format.FLAT, Format.GLOBAL, Format.SCRATCH]: + return [('uint16_t', 'offset', 0), + ('bool', 'glc', 'false'), + ('bool', 'slc', 'false'), + ('bool', 'lds', 'false'), + ('bool', 'nv', 'false')] + else: + return [] + + def get_builder_field_names(self): + return [f[1] for f in self.get_builder_fields()] + + def get_builder_field_dests(self): + return [(f[3] if len(f) >= 4 else f[1]) for f in self.get_builder_fields()] + + def get_builder_field_decls(self): + return [('%s %s=%s' % (f[0], f[1], f[2]) if f[2] != None else '%s %s' % (f[0], f[1])) for f in self.get_builder_fields()] + + +class Opcode(object): + """Class that represents all the information we have about the opcode + NOTE: this must be kept in sync with aco_op_info + """ + def __init__(self, name, opcode_gfx9, opcode_gfx10, format, input_mod, output_mod): + """Parameters: + + - name is the name of the opcode (prepend nir_op_ for the enum name) + - all types are strings that get nir_type_ prepended to them + - input_types is a list of types + - algebraic_properties is a space-seperated string, where nir_op_is_ is + prepended before each entry + - const_expr is an expression or series of statements that computes the + constant value of the opcode given the constant values of its inputs. 
+ """ + assert isinstance(name, str) + assert isinstance(opcode_gfx9, int) + assert isinstance(opcode_gfx10, int) + assert isinstance(format, Format) + assert isinstance(input_mod, bool) + assert isinstance(output_mod, bool) + + self.name = name + self.opcode_gfx9 = opcode_gfx9 + self.opcode_gfx10 = opcode_gfx10 + self.input_mod = "1" if input_mod else "0" + self.output_mod = "1" if output_mod else "0" + self.format = format + + +# global dictionary of opcodes +opcodes = {} + +# VOPC to GFX6 opcode translation map +VOPC_GFX6 = [0] * 256 + +def opcode(name, opcode_gfx9 = -1, opcode_gfx10 = -1, format = Format.PSEUDO, input_mod = False, output_mod = False): + assert name not in opcodes + opcodes[name] = Opcode(name, opcode_gfx9, opcode_gfx10, format, input_mod, output_mod) + +opcode("exp", 0, 0, format = Format.EXP) +opcode("p_parallelcopy") +opcode("p_startpgm") +opcode("p_phi") +opcode("p_linear_phi") +opcode("p_as_uniform") + +opcode("p_create_vector") +opcode("p_extract_vector") +opcode("p_split_vector") + +# start/end the parts where we can use exec based instructions +# implicitly +opcode("p_logical_start") +opcode("p_logical_end") + +# e.g. subgroupMin() in SPIR-V +opcode("p_reduce", format=Format.PSEUDO_REDUCTION) +# e.g. subgroupInclusiveMin() +opcode("p_inclusive_scan", format=Format.PSEUDO_REDUCTION) +# e.g. subgroupExclusiveMin() +opcode("p_exclusive_scan", format=Format.PSEUDO_REDUCTION) + +opcode("p_branch", format=Format.PSEUDO_BRANCH) +opcode("p_cbranch", format=Format.PSEUDO_BRANCH) +opcode("p_cbranch_z", format=Format.PSEUDO_BRANCH) +opcode("p_cbranch_nz", format=Format.PSEUDO_BRANCH) + +opcode("p_memory_barrier_all", format=Format.PSEUDO_BARRIER) +opcode("p_memory_barrier_atomic", format=Format.PSEUDO_BARRIER) +opcode("p_memory_barrier_buffer", format=Format.PSEUDO_BARRIER) +opcode("p_memory_barrier_image", format=Format.PSEUDO_BARRIER) +opcode("p_memory_barrier_shared", format=Format.PSEUDO_BARRIER) + +opcode("p_spill") +opcode("p_reload") + +# start/end linear vgprs +opcode("p_start_linear_vgpr") +opcode("p_end_linear_vgpr") + +opcode("p_wqm") +opcode("p_discard_if") +opcode("p_load_helper") +opcode("p_demote_to_helper") +opcode("p_is_helper") + +opcode("p_fs_buffer_store_smem", format=Format.SMEM) + + +# SOP2 instructions: 2 scalar inputs, 1 scalar output (+optional scc) +SOP2 = { + # GFX6, GFX7, GFX8, GFX9, GFX10, name + (0x00, 0x00, 0x00, 0x00, 0x00, "s_add_u32"), + (0x01, 0x01, 0x01, 0x01, 0x01, "s_sub_u32"), + (0x02, 0x02, 0x02, 0x02, 0x02, "s_add_i32"), + (0x03, 0x03, 0x03, 0x03, 0x03, "s_sub_i32"), + (0x04, 0x04, 0x04, 0x04, 0x04, "s_addc_u32"), + (0x05, 0x05, 0x05, 0x05, 0x05, "s_subb_u32"), + (0x06, 0x06, 0x06, 0x06, 0x06, "s_min_i32"), + (0x07, 0x07, 0x07, 0x07, 0x07, "s_min_u32"), + (0x08, 0x08, 0x08, 0x08, 0x08, "s_max_i32"), + (0x09, 0x09, 0x09, 0x09, 0x09, "s_max_u32"), + (0x0a, 0x0a, 0x0a, 0x0a, 0x0a, "s_cselect_b32"), + (0x0b, 0x0b, 0x0b, 0x0b, 0x0b, "s_cselect_b64"), + (0x0e, 0x0e, 0x0c, 0x0c, 0x0e, "s_and_b32"), + (0x0f, 0x0f, 0x0d, 0x0d, 0x0f, "s_and_b64"), + (0x10, 0x10, 0x0e, 0x0e, 0x10, "s_or_b32"), + (0x11, 0x11, 0x0f, 0x0f, 0x11, "s_or_b64"), + (0x12, 0x12, 0x10, 0x10, 0x12, "s_xor_b32"), + (0x13, 0x13, 0x11, 0x11, 0x13, "s_xor_b64"), + (0x14, 0x14, 0x12, 0x12, 0x14, "s_andn2_b32"), + (0x15, 0x15, 0x13, 0x13, 0x15, "s_andn2_b64"), + (0x16, 0x16, 0x14, 0x14, 0x16, "s_orn2_b32"), + (0x17, 0x17, 0x15, 0x15, 0x17, "s_orn2_b64"), + (0x18, 0x18, 0x16, 0x16, 0x18, "s_nand_b32"), + (0x19, 0x19, 0x17, 0x17, 0x19, "s_nand_b64"), + (0x1a, 0x1a, 0x18, 0x18, 
0x1a, "s_nor_b32"), + (0x1b, 0x1b, 0x19, 0x19, 0x1b, "s_nor_b64"), + (0x1c, 0x1c, 0x1a, 0x1a, 0x1c, "s_xnor_b32"), + (0x1d, 0x1d, 0x1b, 0x1b, 0x1d, "s_xnor_b64"), + (0x1e, 0x1e, 0x1c, 0x1c, 0x1e, "s_lshl_b32"), + (0x1f, 0x1f, 0x1d, 0x1d, 0x1f, "s_lshl_b64"), + (0x20, 0x20, 0x1e, 0x1e, 0x20, "s_lshr_b32"), + (0x21, 0x21, 0x1f, 0x1f, 0x21, "s_lshr_b64"), + (0x22, 0x22, 0x20, 0x20, 0x22, "s_ashr_i32"), + (0x23, 0x23, 0x21, 0x21, 0x23, "s_ashr_i64"), + (0x24, 0x24, 0x22, 0x22, 0x24, "s_bfm_b32"), + (0x25, 0x25, 0x23, 0x23, 0x25, "s_bfm_b64"), + (0x26, 0x26, 0x24, 0x24, 0x26, "s_mul_i32"), + (0x27, 0x27, 0x25, 0x25, 0x27, "s_bfe_u32"), + (0x28, 0x28, 0x26, 0x26, 0x28, "s_bfe_i32"), + (0x29, 0x29, 0x27, 0x27, 0x29, "s_bfe_u64"), + (0x2a, 0x2a, 0x28, 0x28, 0x2a, "s_bfe_i64"), + (0x2b, 0x2b, 0x29, 0x29, -1, "s_cbranch_g_fork"), + (0x2c, 0x2c, 0x2a, 0x2a, 0x2c, "s_absdiff_i32"), + ( -1, -1, 0x2b, 0x2b, -1, "s_rfe_restore_b64"), + ( -1, -1, -1, 0x2e, 0x2e, "s_lshl1_add_u32"), + ( -1, -1, -1, 0x2f, 0x2f, "s_lshl2_add_u32"), + ( -1, -1, -1, 0x30, 0x30, "s_lshl3_add_u32"), + ( -1, -1, -1, 0x31, 0x31, "s_lshl4_add_u32"), + ( -1, -1, -1, 0x32, 0x32, "s_pack_ll_b32_b16"), + ( -1, -1, -1, 0x33, 0x33, "s_pack_lh_b32_b16"), + ( -1, -1, -1, 0x34, 0x34, "s_pack_hh_b32_b16"), + ( -1, -1, -1, 0x2c, 0x35, "s_mul_hi_u32"), + ( -1, -1, -1, 0x2c, 0x35, "s_mul_hi_i32"), +} +for (gfx6, gfx7, gfx8, gfx9, gfx10, name) in SOP2: + opcode(name, gfx9, gfx10, Format.SOP2) + + +# SOPK instructions: 0 input (+ imm), 1 output + optional scc +SOPK = { + # GFX6, GFX7, GFX8, GFX9, GFX10, name + (0x00, 0x00, 0x00, 0x00, 0x00, "s_movk_i32"), + ( -1, -1, -1, -1, 0x01, "s_version"), # GFX10+ + (0x02, 0x02, 0x01, 0x01, 0x02, "s_cmovk_i32"), # GFX8_GFX9 + (0x03, 0x03, 0x02, 0x02, 0x03, "s_cmpk_eq_i32"), + (0x04, 0x04, 0x03, 0x03, 0x04, "s_cmpk_lg_i32"), + (0x05, 0x05, 0x04, 0x04, 0x05, "s_cmpk_gt_i32"), + (0x06, 0x06, 0x05, 0x05, 0x06, "s_cmpk_ge_i32"), + (0x07, 0x07, 0x06, 0x06, 0x07, "s_cmpk_lt_i32"), + (0x08, 0x08, 0x07, 0x07, 0x08, "s_cmpk_le_i32"), + (0x09, 0x09, 0x08, 0x08, 0x09, "s_cmpk_eq_u32"), + (0x0a, 0x0a, 0x09, 0x09, 0x0a, "s_cmpk_lg_u32"), + (0x0b, 0x0b, 0x0a, 0x0a, 0x0b, "s_cmpk_gt_u32"), + (0x0c, 0x0c, 0x0b, 0x0b, 0x0c, "s_cmpk_ge_u32"), + (0x0d, 0x0d, 0x0c, 0x0c, 0x0d, "s_cmpk_lt_u32"), + (0x0e, 0x0e, 0x0d, 0x0d, 0x0e, "s_cmpk_le_u32"), + (0x0f, 0x0f, 0x0e, 0x0e, 0x0f, "s_addk_i32"), + (0x10, 0x10, 0x0f, 0x0f, 0x10, "s_mulk_i32"), + (0x11, 0x11, 0x10, 0x10, -1, "s_cbranch_i_fork"), + (0x12, 0x12, 0x11, 0x11, 0x12, "s_getreg_b32"), + (0x13, 0x13, 0x12, 0x12, 0x13, "s_setreg_b32"), + (0x15, 0x15, 0x14, 0x14, 0x15, "s_setreg_imm32_b32"), # requires 32bit literal + ( -1, -1, 0x15, 0x15, 0x16, "s_call_b64"), + ( -1, -1, -1, -1, 0x17, "s_waitcnt_vscnt"), + ( -1, -1, -1, -1, 0x18, "s_waitcnt_vmcnt"), + ( -1, -1, -1, -1, 0x19, "s_waitcnt_expcnt"), + ( -1, -1, -1, -1, 0x1a, "s_waitcnt_lgkmcnt"), + ( -1, -1, -1, -1, 0x1b, "s_subvector_loop_begin"), + ( -1, -1, -1, -1, 0x1c, "s_subvector_loop_end"), +} +for (gfx6, gfx7, gfx8, gfx9, gfx10, name) in SOPK: + opcode(name, gfx9, gfx10, Format.SOPK) + + +# SOP1 instructions: 1 input, 1 output (+optional SCC) +SOP1 = { + # GFX6, GFX7, GFX8, GFX9, GFX10, name + (0x03, 0x03, 0x00, 0x00, 0x03, "s_mov_b32"), + (0x04, 0x04, 0x01, 0x01, 0x04, "s_mov_b64"), + (0x05, 0x05, 0x02, 0x02, 0x05, "s_cmov_b32"), + (0x06, 0x06, 0x03, 0x03, 0x06, "s_cmov_b64"), + (0x07, 0x07, 0x04, 0x04, 0x07, "s_not_b32"), + (0x08, 0x08, 0x05, 0x05, 0x08, "s_not_b64"), + (0x09, 0x09, 0x06, 0x06, 0x09, "s_wqm_b32"), + 
(0x0a, 0x0a, 0x07, 0x07, 0x0a, "s_wqm_b64"), + (0x0b, 0x0b, 0x08, 0x08, 0x0b, "s_brev_b32"), + (0x0c, 0x0c, 0x09, 0x09, 0x0c, "s_brev_b64"), + (0x0d, 0x0d, 0x0a, 0x0a, 0x0d, "s_bcnt0_i32_b32"), + (0x0e, 0x0e, 0x0b, 0x0b, 0x0e, "s_bcnt0_i32_b64"), + (0x0f, 0x0f, 0x0c, 0x0c, 0x0f, "s_bcnt1_i32_b32"), + (0x10, 0x10, 0x0d, 0x0d, 0x10, "s_bcnt1_i32_b64"), + (0x11, 0x11, 0x0e, 0x0e, 0x11, "s_ff0_i32_b32"), + (0x12, 0x12, 0x0f, 0x0f, 0x12, "s_ff0_i32_b64"), + (0x13, 0x13, 0x10, 0x10, 0x13, "s_ff1_i32_b32"), + (0x14, 0x14, 0x11, 0x11, 0x14, "s_ff1_i32_b64"), + (0x15, 0x15, 0x12, 0x12, 0x15, "s_flbit_i32_b32"), + (0x16, 0x16, 0x13, 0x13, 0x16, "s_flbit_i32_b64"), + (0x17, 0x17, 0x14, 0x14, 0x17, "s_flbit_i32"), + (0x18, 0x18, 0x15, 0x15, 0x18, "s_flbit_i32_i64"), + (0x19, 0x19, 0x16, 0x16, 0x19, "s_sext_i32_i8"), + (0x1a, 0x1a, 0x17, 0x17, 0x1a, "s_sext_i32_i16"), + (0x1b, 0x1b, 0x18, 0x18, 0x1b, "s_bitset0_b32"), + (0x1c, 0x1c, 0x19, 0x19, 0x1c, "s_bitset0_b64"), + (0x1d, 0x1d, 0x1a, 0x1a, 0x1d, "s_bitset1_b32"), + (0x1e, 0x1e, 0x1b, 0x1b, 0x1e, "s_bitset1_b64"), + (0x1f, 0x1f, 0x1c, 0x1c, 0x1f, "s_getpc_b64"), + (0x20, 0x20, 0x1d, 0x1d, 0x20, "s_setpc_b64"), + (0x21, 0x21, 0x1e, 0x1e, 0x21, "s_swappc_b64"), + (0x22, 0x22, 0x1f, 0x1f, 0x22, "s_rfe_b64"), + (0x24, 0x24, 0x20, 0x20, 0x24, "s_and_saveexec_b64"), + (0x25, 0x25, 0x21, 0x21, 0x25, "s_or_saveexec_b64"), + (0x26, 0x26, 0x22, 0x22, 0x26, "s_xor_saveexec_b64"), + (0x27, 0x27, 0x23, 0x23, 0x27, "s_andn2_saveexec_b64"), + (0x28, 0x28, 0x24, 0x24, 0x28, "s_orn2_saveexec_b64"), + (0x29, 0x29, 0x25, 0x25, 0x29, "s_nand_saveexec_b64"), + (0x2a, 0x2a, 0x26, 0x26, 0x2a, "s_nor_saveexec_b64"), + (0x2b, 0x2b, 0x27, 0x27, 0x2b, "s_xnor_saveexec_b64"), + (0x2c, 0x2c, 0x28, 0x28, 0x2c, "s_quadmask_b32"), + (0x2d, 0x2d, 0x29, 0x29, 0x2d, "s_quadmask_b64"), + (0x2e, 0x2e, 0x2a, 0x2a, 0x2e, "s_movrels_b32"), + (0x2f, 0x2f, 0x2b, 0x2b, 0x2f, "s_movrels_b64"), + (0x30, 0x30, 0x2c, 0x2c, 0x30, "s_movreld_b32"), + (0x31, 0x31, 0x2d, 0x2d, 0x31, "s_movreld_b64"), + (0x32, 0x32, 0x2e, 0x2e, -1, "s_cbranch_join"), + (0x34, 0x34, 0x30, 0x30, 0x34, "s_abs_i32"), + (0x35, 0x35, -1, -1, 0x35, "s_mov_fed_b32"), + ( -1, -1, 0x32, 0x32, -1, "s_set_gpr_idx_idx"), + ( -1, -1, -1, 0x33, 0x37, "s_andn1_saveexec_b64"), + ( -1, -1, -1, 0x34, 0x38, "s_orn1_saveexec_b64"), + ( -1, -1, -1, 0x35, 0x39, "s_andn1_wrexec_b64"), + ( -1, -1, -1, 0x36, 0x3a, "s_andn2_wrexec_b64"), + ( -1, -1, -1, 0x37, 0x3b, "s_bitreplicate_b64_b32"), + ( -1, -1, -1, -1, 0x3c, "s_and_saveexec_b32"), + ( -1, -1, -1, -1, 0x3d, "s_or_saveexec_b32"), + ( -1, -1, -1, -1, 0x3e, "s_xor_saveexec_b32"), + ( -1, -1, -1, -1, 0x3f, "s_andn2_saveexec_b32"), + ( -1, -1, -1, -1, 0x40, "s_orn2_saveexec_b32"), + ( -1, -1, -1, -1, 0x41, "s_nand_saveexec_b32"), + ( -1, -1, -1, -1, 0x42, "s_nor_saveexec_b32"), + ( -1, -1, -1, -1, 0x43, "s_xnor_saveexec_b32"), + ( -1, -1, -1, -1, 0x44, "s_andn1_saveexec_b32"), + ( -1, -1, -1, -1, 0x45, "s_orn1_saveexec_b32"), + ( -1, -1, -1, -1, 0x46, "s_andn1_wrexec_b32"), + ( -1, -1, -1, -1, 0x47, "s_andn2_wrexec_b32"), + ( -1, -1, -1, -1, 0x49, "s_movrelsd_2_b32"), + # actually a pseudo-instruction. it's lowered to SALU during assembly though, so it's useful to identify it as a SOP1. 
+ ( -1, -1, -1, -1, -1, "p_constaddr"), +} +for (gfx6, gfx7, gfx8, gfx9, gfx10, name) in SOP1: + opcode(name, gfx9, gfx10, Format.SOP1) + + +# SOPC instructions: 2 inputs and 0 outputs (+SCC) +SOPC = { + # GFX6, GFX7, GFX8, GFX9, GFX10, name + (0x00, 0x00, 0x00, 0x00, 0x00, "s_cmp_eq_i32"), + (0x01, 0x01, 0x01, 0x01, 0x01, "s_cmp_lg_i32"), + (0x02, 0x02, 0x02, 0x02, 0x02, "s_cmp_gt_i32"), + (0x03, 0x03, 0x03, 0x03, 0x03, "s_cmp_ge_i32"), + (0x04, 0x04, 0x04, 0x04, 0x04, "s_cmp_lt_i32"), + (0x05, 0x05, 0x05, 0x05, 0x05, "s_cmp_le_i32"), + (0x06, 0x06, 0x06, 0x06, 0x06, "s_cmp_eq_u32"), + (0x07, 0x07, 0x07, 0x07, 0x07, "s_cmp_lg_u32"), + (0x08, 0x08, 0x08, 0x08, 0x08, "s_cmp_gt_u32"), + (0x09, 0x09, 0x09, 0x09, 0x09, "s_cmp_ge_u32"), + (0x0a, 0x0a, 0x0a, 0x0a, 0x0a, "s_cmp_lt_u32"), + (0x0b, 0x0b, 0x0b, 0x0b, 0x0b, "s_cmp_le_u32"), + (0x0c, 0x0c, 0x0c, 0x0c, 0x0c, "s_bitcmp0_b32"), + (0x0d, 0x0d, 0x0d, 0x0d, 0x0d, "s_bitcmp1_b32"), + (0x0e, 0x0e, 0x0e, 0x0e, 0x0e, "s_bitcmp0_b64"), + (0x0f, 0x0f, 0x0f, 0x0f, 0x0f, "s_bitcmp1_b64"), + (0x10, 0x10, 0x10, 0x10, -1, "s_setvskip"), + ( -1, -1, 0x11, 0x11, -1, "s_set_gpr_idx_on"), + ( -1, -1, 0x12, 0x12, 0x12, "s_cmp_eq_u64"), + ( -1, -1, 0x13, 0x13, 0x13, "s_cmp_lg_u64"), +} +for (gfx6, gfx7, gfx8, gfx9, gfx10, name) in SOPC: + opcode(name, gfx9, gfx10, Format.SOPC) + + +# SOPP instructions: 0 inputs (+optional scc/vcc), 0 outputs +SOPP = { + # GFX6, GFX7, GFX8, GFX9, GFX10, name + (0x00, 0x00, 0x00, 0x00, 0x00, "s_nop"), + (0x01, 0x01, 0x01, 0x01, 0x01, "s_endpgm"), + (0x02, 0x02, 0x02, 0x02, 0x02, "s_branch"), + ( -1, -1, 0x03, 0x03, 0x03, "s_wakeup"), + (0x04, 0x04, 0x04, 0x04, 0x04, "s_cbranch_scc0"), + (0x05, 0x05, 0x05, 0x05, 0x05, "s_cbranch_scc1"), + (0x06, 0x06, 0x06, 0x06, 0x06, "s_cbranch_vccz"), + (0x07, 0x07, 0x07, 0x07, 0x07, "s_cbranch_vccnz"), + (0x08, 0x08, 0x08, 0x08, 0x08, "s_cbranch_execz"), + (0x09, 0x09, 0x09, 0x09, 0x09, "s_cbranch_execnz"), + (0x0a, 0x0a, 0x0a, 0x0a, 0x0a, "s_barrier"), + ( -1, 0x0b, 0x0b, 0x0b, 0x0b, "s_setkill"), + (0x0c, 0x0c, 0x0c, 0x0c, 0x0c, "s_waitcnt"), + (0x0d, 0x0d, 0x0d, 0x0d, 0x0d, "s_sethalt"), + (0x0e, 0x0e, 0x0e, 0x0e, 0x0e, "s_sleep"), + (0x0f, 0x0f, 0x0f, 0x0f, 0x0f, "s_setprio"), + (0x10, 0x10, 0x10, 0x10, 0x10, "s_sendmsg"), + (0x11, 0x11, 0x11, 0x11, 0x11, "s_sendmsghalt"), + (0x12, 0x12, 0x12, 0x12, 0x12, "s_trap"), + (0x13, 0x13, 0x13, 0x13, 0x13, "s_icache_inv"), + (0x14, 0x14, 0x14, 0x14, 0x14, "s_incperflevel"), + (0x15, 0x15, 0x15, 0x15, 0x15, "s_decperflevel"), + (0x16, 0x16, 0x16, 0x16, 0x16, "s_ttracedata"), + ( -1, 0x17, 0x17, 0x17, 0x17, "s_cbranch_cdbgsys"), + ( -1, 0x18, 0x18, 0x18, 0x18, "s_cbranch_cdbguser"), + ( -1, 0x19, 0x19, 0x19, 0x19, "s_cbranch_cdbgsys_or_user"), + ( -1, 0x1a, 0x1a, 0x1a, 0x1a, "s_cbranch_cdbgsys_and_user"), + ( -1, -1, 0x1b, 0x1b, 0x1b, "s_endpgm_saved"), + ( -1, -1, 0x1c, 0x1c, -1, "s_set_gpr_idx_off"), + ( -1, -1, 0x1d, 0x1d, -1, "s_set_gpr_idx_mode"), + ( -1, -1, -1, 0x1e, 0x1e, "s_endpgm_ordered_ps_done"), + ( -1, -1, -1, -1, 0x1f, "s_code_end"), + ( -1, -1, -1, -1, 0x20, "s_inst_prefetch"), + ( -1, -1, -1, -1, 0x21, "s_clause"), + ( -1, -1, -1, -1, 0x22, "s_wait_idle"), + ( -1, -1, -1, -1, 0x23, "s_waitcnt_depctr"), + ( -1, -1, -1, -1, 0x24, "s_round_mode"), + ( -1, -1, -1, -1, 0x25, "s_denorm_mode"), + ( -1, -1, -1, -1, 0x26, "s_ttracedata_imm"), +} +for (gfx6, gfx7, gfx8, gfx9, gfx10, name) in SOPP: + opcode(name, gfx9, gfx10, Format.SOPP) + + +# SMEM instructions: sbase input (2 sgpr), potentially 2 offset inputs, 1 sdata input/output 
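The SMEM table below follows the same five-column layout as the scalar tables above: one encoding per hardware generation, of which only the GFX9 and GFX10 columns are kept by `opcode()`. As a minimal sketch of what that registration produces — assuming the file can be imported as a module named `aco_opcodes` (the module name is illustrative and not part of the build):
```
# Illustrative only -- not part of aco_opcodes.py or the Mesa build.
import aco_opcodes as ops   # assumes the file is importable under this name

op = ops.opcodes["s_and_b32"]                        # row from the SOP2 table
assert op.format == ops.Format.SOP2
assert (op.opcode_gfx9, op.opcode_gfx10) == (0x0c, 0x0e)
# the GFX6/GFX7/GFX8 columns are unpacked by the for-loops but not stored
```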
+SMEM = { + # GFX6, GFX7, GFX8, GFX9, GFX10, name + (0x00, 0x00, 0x00, 0x00, 0x00, "s_load_dword"), + (0x01, 0x01, 0x01, 0x01, 0x01, "s_load_dwordx2"), + (0x02, 0x02, 0x02, 0x02, 0x02, "s_load_dwordx4"), + (0x03, 0x03, 0x03, 0x03, 0x03, "s_load_dwordx8"), + (0x04, 0x04, 0x04, 0x04, 0x04, "s_load_dwordx16"), + ( -1, -1, -1, 0x05, 0x05, "s_scratch_load_dword"), + ( -1, -1, -1, 0x06, 0x06, "s_scratch_load_dwordx2"), + ( -1, -1, -1, 0x07, 0x07, "s_scratch_load_dwordx4"), + (0x08, 0x08, 0x08, 0x08, 0x08, "s_buffer_load_dword"), + (0x09, 0x09, 0x09, 0x09, 0x09, "s_buffer_load_dwordx2"), + (0x0a, 0x0a, 0x0a, 0x0a, 0x0a, "s_buffer_load_dwordx4"), + (0x0b, 0x0b, 0x0b, 0x0b, 0x0b, "s_buffer_load_dwordx8"), + (0x0c, 0x0c, 0x0c, 0x0c, 0x0c, "s_buffer_load_dwordx16"), + ( -1, -1, 0x10, 0x10, 0x10, "s_store_dword"), + ( -1, -1, 0x11, 0x11, 0x11, "s_store_dwordx2"), + ( -1, -1, 0x12, 0x12, 0x12, "s_store_dwordx4"), + ( -1, -1, -1, 0x15, 0x15, "s_scratch_store_dword"), + ( -1, -1, -1, 0x16, 0x16, "s_scratch_store_dwordx2"), + ( -1, -1, -1, 0x17, 0x17, "s_scratch_store_dwordx4"), + ( -1, -1, 0x18, 0x18, 0x18, "s_buffer_store_dword"), + ( -1, -1, 0x19, 0x19, 0x19, "s_buffer_store_dwordx2"), + ( -1, -1, 0x1a, 0x1a, 0x1a, "s_buffer_store_dwordx4"), + ( -1, -1, 0x1f, 0x1f, 0x1f, "s_gl1_inv"), + (0x1f, 0x1f, 0x20, 0x20, 0x20, "s_dcache_inv"), + ( -1, -1, 0x21, 0x21, 0x21, "s_dcache_wb"), + ( -1, 0x1d, 0x22, 0x22, -1, "s_dcache_inv_vol"), + ( -1, -1, 0x23, 0x23, -1, "s_dcache_wb_vol"), + (0x1e, 0x1e, 0x24, 0x24, 0x24, "s_memtime"), + ( -1, -1, 0x25, 0x25, 0x25, "s_memrealtime"), + ( -1, -1, 0x26, 0x26, 0x26, "s_atc_probe"), + ( -1, -1, 0x27, 0x27, 0x27, "s_atc_probe_buffer"), + ( -1, -1, -1, 0x28, 0x28, "s_dcache_discard"), + ( -1, -1, -1, 0x29, 0x29, "s_dcache_discard_x2"), + ( -1, -1, -1, -1, 0x2a, "s_get_waveid_in_workgroup"), + ( -1, -1, -1, 0x40, 0x40, "s_buffer_atomic_swap"), + ( -1, -1, -1, 0x41, 0x41, "s_buffer_atomic_cmpswap"), + ( -1, -1, -1, 0x42, 0x42, "s_buffer_atomic_add"), + ( -1, -1, -1, 0x43, 0x43, "s_buffer_atomic_sub"), + ( -1, -1, -1, 0x44, 0x44, "s_buffer_atomic_smin"), + ( -1, -1, -1, 0x45, 0x45, "s_buffer_atomic_umin"), + ( -1, -1, -1, 0x46, 0x46, "s_buffer_atomic_smax"), + ( -1, -1, -1, 0x47, 0x47, "s_buffer_atomic_umax"), + ( -1, -1, -1, 0x48, 0x48, "s_buffer_atomic_and"), + ( -1, -1, -1, 0x49, 0x49, "s_buffer_atomic_or"), + ( -1, -1, -1, 0x4a, 0x4a, "s_buffer_atomic_xor"), + ( -1, -1, -1, 0x4b, 0x4b, "s_buffer_atomic_inc"), + ( -1, -1, -1, 0x4c, 0x4c, "s_buffer_atomic_dec"), + ( -1, -1, -1, 0x60, 0x60, "s_buffer_atomic_swap_x2"), + ( -1, -1, -1, 0x61, 0x61, "s_buffer_atomic_cmpswap_x2"), + ( -1, -1, -1, 0x62, 0x62, "s_buffer_atomic_add_x2"), + ( -1, -1, -1, 0x63, 0x63, "s_buffer_atomic_sub_x2"), + ( -1, -1, -1, 0x64, 0x64, "s_buffer_atomic_smin_x2"), + ( -1, -1, -1, 0x65, 0x65, "s_buffer_atomic_umin_x2"), + ( -1, -1, -1, 0x66, 0x66, "s_buffer_atomic_smax_x2"), + ( -1, -1, -1, 0x67, 0x67, "s_buffer_atomic_umax_x2"), + ( -1, -1, -1, 0x68, 0x68, "s_buffer_atomic_and_x2"), + ( -1, -1, -1, 0x69, 0x69, "s_buffer_atomic_or_x2"), + ( -1, -1, -1, 0x6a, 0x6a, "s_buffer_atomic_xor_x2"), + ( -1, -1, -1, 0x6b, 0x6b, "s_buffer_atomic_inc_x2"), + ( -1, -1, -1, 0x6c, 0x6c, "s_buffer_atomic_dec_x2"), + ( -1, -1, -1, 0x80, 0x80, "s_atomic_swap"), + ( -1, -1, -1, 0x81, 0x81, "s_atomic_cmpswap"), + ( -1, -1, -1, 0x82, 0x82, "s_atomic_add"), + ( -1, -1, -1, 0x83, 0x83, "s_atomic_sub"), + ( -1, -1, -1, 0x84, 0x84, "s_atomic_smin"), + ( -1, -1, -1, 0x85, 0x85, "s_atomic_umin"), + ( -1, -1, -1, 0x86, 0x86, 
"s_atomic_smax"), + ( -1, -1, -1, 0x87, 0x87, "s_atomic_umax"), + ( -1, -1, -1, 0x88, 0x88, "s_atomic_and"), + ( -1, -1, -1, 0x89, 0x89, "s_atomic_or"), + ( -1, -1, -1, 0x8a, 0x8a, "s_atomic_xor"), + ( -1, -1, -1, 0x8b, 0x8b, "s_atomic_inc"), + ( -1, -1, -1, 0x8c, 0x8c, "s_atomic_dec"), + ( -1, -1, -1, 0xa0, 0xa0, "s_atomic_swap_x2"), + ( -1, -1, -1, 0xa1, 0xa1, "s_atomic_cmpswap_x2"), + ( -1, -1, -1, 0xa2, 0xa2, "s_atomic_add_x2"), + ( -1, -1, -1, 0xa3, 0xa3, "s_atomic_sub_x2"), + ( -1, -1, -1, 0xa4, 0xa4, "s_atomic_smin_x2"), + ( -1, -1, -1, 0xa5, 0xa5, "s_atomic_umin_x2"), + ( -1, -1, -1, 0xa6, 0xa6, "s_atomic_smax_x2"), + ( -1, -1, -1, 0xa7, 0xa7, "s_atomic_umax_x2"), + ( -1, -1, -1, 0xa8, 0xa8, "s_atomic_and_x2"), + ( -1, -1, -1, 0xa9, 0xa9, "s_atomic_or_x2"), + ( -1, -1, -1, 0xaa, 0xaa, "s_atomic_xor_x2"), + ( -1, -1, -1, 0xab, 0xab, "s_atomic_inc_x2"), + ( -1, -1, -1, 0xac, 0xac, "s_atomic_dec_x2"), +} +for (gfx6, gfx7, gfx8, gfx9, gfx10, name) in SMEM: + opcode(name, gfx9, gfx10, Format.SMEM) + + +# VOP2 instructions: 2 inputs, 1 output (+ optional vcc) +# TODO: misses some GFX6_7 opcodes which were shifted to VOP3 in GFX8 +VOP2 = { + # GFX6, GFX7, GFX8, GFX9, GFX10, name, input/output modifiers + (0x00, 0x00, 0x00, 0x00, 0x01, "v_cndmask_b32", False), + (0x03, 0x03, 0x01, 0x01, 0x03, "v_add_f32", True), + (0x04, 0x04, 0x02, 0x02, 0x04, "v_sub_f32", True), + (0x05, 0x05, 0x03, 0x03, 0x05, "v_subrev_f32", True), + (0x06, 0x06, -1, -1, 0x06, "v_mac_legacy_f32", True), + (0x07, 0x07, 0x04, 0x04, 0x07, "v_mul_legacy_f32", True), + (0x08, 0x08, 0x05, 0x05, 0x08, "v_mul_f32", True), + (0x09, 0x09, 0x06, 0x06, 0x09, "v_mul_i32_i24", False), + (0x0a, 0x0a, 0x07, 0x07, 0x0a, "v_mul_hi_i32_i24", False), + (0x0b, 0x0b, 0x08, 0x08, 0x0b, "v_mul_u32_u24", False), + (0x0c, 0x0c, 0x09, 0x09, 0x0c, "v_mul_hi_u32_u24", False), + (0x0d, 0x0d, -1, -1, -1, "v_min_legacy_f32", True), + (0x0e, 0x0e, -1, -1, -1, "v_max_legacy_f32", True), + (0x0f, 0x0f, 0x0a, 0x0a, 0x0f, "v_min_f32", True), + (0x10, 0x10, 0x0b, 0x0b, 0x10, "v_max_f32", True), + (0x11, 0x11, 0x0c, 0x0c, 0x11, "v_min_i32", False), + (0x12, 0x12, 0x0d, 0x0d, 0x12, "v_max_i32", False), + (0x13, 0x13, 0x0e, 0x0e, 0x13, "v_min_u32", False), + (0x14, 0x14, 0x0f, 0x0f, 0x14, "v_max_u32", False), + (0x15, 0x15, -1, -1, -1, "v_lshr_b32", False), + (0x16, 0x16, 0x10, 0x10, 0x16, "v_lshrrev_b32", False), + (0x17, 0x17, -1, -1, -1, "v_ashr_i32", False), + (0x18, 0x18, 0x11, 0x11, 0x18, "v_ashrrev_i32", False), + (0x19, 0x19, -1, -1, -1, "v_lshl_b32", False), + (0x1a, 0x1a, 0x12, 0x12, 0x1a, "v_lshlrev_b32", False), + (0x1b, 0x1b, 0x13, 0x13, 0x1b, "v_and_b32", False), + (0x1c, 0x1c, 0x14, 0x14, 0x1c, "v_or_b32", False), + (0x1d, 0x1d, 0x15, 0x15, 0x1d, "v_xor_b32", False), + ( -1, -1, -1, -1, 0x1e, "v_xnor_b32", False), + (0x1f, 0x1f, 0x16, 0x16, 0x1f, "v_mac_f32", True), + (0x20, 0x20, 0x17, 0x17, 0x20, "v_madmk_f32", False), + (0x21, 0x21, 0x18, 0x18, 0x21, "v_madak_f32", False), + (0x25, 0x25, 0x19, 0x19, -1, "v_add_co_u32", False), # VOP3B only in RDNA + (0x26, 0x26, 0x1a, 0x1a, -1, "v_sub_co_u32", False), # VOP3B only in RDNA + (0x27, 0x27, 0x1b, 0x1b, -1, "v_subrev_co_u32", False), # VOP3B only in RDNA + (0x28, 0x28, 0x1c, 0x1c, 0x28, "v_addc_co_u32", False), # v_add_co_ci_u32 in RDNA + (0x29, 0x29, 0x1d, 0x1d, 0x29, "v_subb_co_u32", False), # v_sub_co_ci_u32 in RDNA + (0x2a, 0x2a, 0x1e, 0x1e, 0x2a, "v_subbrev_co_u32", False), # v_subrev_co_ci_u32 in RDNA + ( -1, -1, -1, -1, 0x2b, "v_fmac_f32", True), + ( -1, -1, -1, -1, 0x2c, "v_fmamk_f32", 
True), + ( -1, -1, -1, -1, 0x2d, "v_fmaak_f32", True), + ( -1, -1, 0x1f, 0x1f, 0x32, "v_add_f16", True), + ( -1, -1, 0x20, 0x20, 0x33, "v_sub_f16", True), + ( -1, -1, 0x21, 0x21, 0x34, "v_subrev_f16", True), + ( -1, -1, 0x22, 0x22, 0x35, "v_mul_f16", True), + ( -1, -1, 0x23, 0x23, -1, "v_mac_f16", True), + ( -1, -1, 0x24, 0x24, -1, "v_madmk_f16", False), + ( -1, -1, 0x25, 0x25, -1, "v_madak_f16", False), + ( -1, -1, 0x26, 0x26, -1, "v_add_u16", False), + ( -1, -1, 0x27, 0x27, -1, "v_sub_u16", False), + ( -1, -1, 0x28, 0x28, -1, "v_subrev_u16", False), + ( -1, -1, 0x29, 0x29, -1, "v_mul_lo_u16", False), + ( -1, -1, 0x2a, 0x2a, -1, "v_lshlrev_b16", False), + ( -1, -1, 0x2b, 0x2b, -1, "v_lshrrev_b16", False), + ( -1, -1, 0x2c, 0x2c, -1, "v_ashrrev_b16", False), + ( -1, -1, 0x2d, 0x2d, 0x39, "v_max_f16", True), + ( -1, -1, 0x2e, 0x2e, 0x3a, "v_min_f16", True), + ( -1, -1, 0x2f, 0x2f, -1, "v_max_u16", False), + ( -1, -1, 0x30, 0x30, -1, "v_max_i16", False), + ( -1, -1, 0x31, 0x31, -1, "v_min_u16", False), + ( -1, -1, 0x32, 0x32, -1, "v_min_i16", False), + ( -1, -1, 0x33, 0x33, 0x3b, "v_ldexp_f16", False), + ( -1, -1, 0x34, 0x34, 0x25, "v_add_u32", False), # v_add_nc_u32 in RDNA + ( -1, -1, 0x35, 0x35, 0x26, "v_sub_u32", False), # v_sub_nc_u32 in RDNA + ( -1, -1, 0x36, 0x36, 0x27, "v_subrev_u32", False), # v_subrev_nc_u32 in RDNA + ( -1, -1, -1, -1, 0x36, "v_fmac_f16", False), + ( -1, -1, -1, -1, 0x37, "v_fmamk_f16", False), + ( -1, -1, -1, -1, 0x38, "v_fmaak_f16", False), + ( -1, -1, -1, -1, 0x3c, "v_pk_fmac_f16", False), +} +for (gfx6, gfx7, gfx8, gfx9, gfx10, name, modifiers) in VOP2: + opcode(name, gfx9, gfx10, Format.VOP2, modifiers, modifiers) + + +# VOP1 instructions: instructions with 1 input and 1 output +VOP1 = { + # GFX6, GFX7, GFX8, GFX9, GFX10, name, input_modifiers, output_modifiers + (0x00, 0x00, 0x00, 0x00, 0x00, "v_nop", False, False), + (0x01, 0x01, 0x01, 0x01, 0x01, "v_mov_b32", False, False), + (0x02, 0x02, 0x02, 0x02, 0x02, "v_readfirstlane_b32", False, False), + (0x03, 0x03, 0x03, 0x03, 0x03, "v_cvt_i32_f64", True, False), + (0x04, 0x04, 0x04, 0x04, 0x04, "v_cvt_f64_i32", False, True), + (0x05, 0x05, 0x05, 0x05, 0x05, "v_cvt_f32_i32", False, True), + (0x06, 0x06, 0x06, 0x06, 0x06, "v_cvt_f32_u32", False, True), + (0x07, 0x07, 0x07, 0x07, 0x07, "v_cvt_u32_f32", True, False), + (0x08, 0x08, 0x08, 0x08, 0x08, "v_cvt_i32_f32", True, False), + (0x09, 0x09, -1, -1, 0x09, "v_mov_fed_b32", True, False), # LLVM mentions it for GFX8_9 + (0x0a, 0x0a, 0x0a, 0x0a, 0x0a, "v_cvt_f16_f32", True, True), + (0x0b, 0x0b, 0x0b, 0x0b, 0x0b, "v_cvt_f32_f16", True, True), + (0x0c, 0x0c, 0x0c, 0x0c, 0x0c, "v_cvt_rpi_i32_f32", True, False), + (0x0d, 0x0d, 0x0d, 0x0d, 0x0d, "v_cvt_flr_i32_f32", True, False), + (0x0e, 0x0e, 0x0e, 0x0e, 0x0e, "v_cvt_off_f32_i4", False, True), + (0x0f, 0x0f, 0x0f, 0x0f, 0x0f, "v_cvt_f32_f64", True, True), + (0x10, 0x10, 0x10, 0x10, 0x10, "v_cvt_f64_f32", True, True), + (0x11, 0x11, 0x11, 0x11, 0x11, "v_cvt_f32_ubyte0", False, True), + (0x12, 0x12, 0x12, 0x12, 0x12, "v_cvt_f32_ubyte1", False, True), + (0x13, 0x13, 0x13, 0x13, 0x13, "v_cvt_f32_ubyte2", False, True), + (0x14, 0x14, 0x14, 0x14, 0x14, "v_cvt_f32_ubyte3", False, True), + (0x15, 0x15, 0x15, 0x15, 0x15, "v_cvt_u32_f64", True, False), + (0x16, 0x16, 0x16, 0x16, 0x16, "v_cvt_f64_u32", False, True), + ( -1, 0x17, 0x17, 0x17, 0x17, "v_trunc_f64", True, True), + ( -1, 0x18, 0x18, 0x18, 0x18, "v_ceil_f64", True, True), + ( -1, 0x19, 0x19, 0x19, 0x19, "v_rndne_f64", True, True), + ( -1, 0x1a, 0x1a, 0x1a, 0x1a, 
"v_floor_f64", True, True), + ( -1, -1, -1, -1, 0x1b, "v_pipeflush", False, False), + (0x20, 0x20, 0x1b, 0x1b, 0x20, "v_fract_f32", True, True), + (0x21, 0x21, 0x1c, 0x1c, 0x21, "v_trunc_f32", True, True), + (0x22, 0x22, 0x1d, 0x1d, 0x22, "v_ceil_f32", True, True), + (0x23, 0x23, 0x1e, 0x1e, 0x23, "v_rndne_f32", True, True), + (0x24, 0x24, 0x1f, 0x1f, 0x24, "v_floor_f32", True, True), + (0x25, 0x25, 0x20, 0x20, 0x25, "v_exp_f32", True, True), + (0x26, 0x26, -1, -1, -1, "v_log_clamp_f32", True, True), + (0x27, 0x27, 0x21, 0x21, 0x27, "v_log_f32", True, True), + (0x28, 0x28, -1, -1, -1, "v_rcp_clamp_f32", True, True), + (0x29, 0x29, -1, -1, -1, "v_rcp_legacy_f32", True, True), + (0x2a, 0x2a, 0x22, 0x22, 0x2a, "v_rcp_f32", True, True), + (0x2b, 0x2b, 0x23, 0x23, 0x2b, "v_rcp_iflag_f32", True, True), + (0x2c, 0x2c, -1, -1, -1, "v_rsq_clamp_f32", True, True), + (0x2d, 0x2d, -1, -1, -1, "v_rsq_legacy_f32", True, True), + (0x2e, 0x2e, 0x24, 0x24, 0x2e, "v_rsq_f32", True, True), + (0x2f, 0x2f, 0x25, 0x25, 0x2f, "v_rcp_f64", True, True), + (0x30, 0x30, -1, -1, -1, "v_rcp_clamp_f64", True, True), + (0x31, 0x31, 0x26, 0x26, 0x31, "v_rsq_f64", True, True), + (0x32, 0x32, -1, -1, -1, "v_rsq_clamp_f64", True, True), + (0x33, 0x33, 0x27, 0x27, 0x33, "v_sqrt_f32", True, True), + (0x34, 0x34, 0x28, 0x28, 0x34, "v_sqrt_f64", True, True), + (0x35, 0x35, 0x29, 0x29, 0x35, "v_sin_f32", True, True), + (0x36, 0x36, 0x2a, 0x2a, 0x36, "v_cos_f32", True, True), + (0x37, 0x37, 0x2b, 0x2b, 0x37, "v_not_b32", False, False), + (0x38, 0x38, 0x2c, 0x2c, 0x38, "v_bfrev_b32", False, False), + (0x39, 0x39, 0x2d, 0x2d, 0x39, "v_ffbh_u32", False, False), + (0x3a, 0x3a, 0x2e, 0x2e, 0x3a, "v_ffbl_b32", False, False), + (0x3b, 0x3b, 0x2f, 0x2f, 0x3b, "v_ffbh_i32", False, False), + (0x3c, 0x3c, 0x30, 0x30, 0x3c, "v_frexp_exp_i32_f64", True, False), + (0x3d, 0x3d, 0x31, 0x31, 0x3d, "v_frexp_mant_f64", True, False), + (0x3e, 0x3e, 0x32, 0x32, 0x3e, "v_fract_f64", True, True), + (0x3f, 0x3f, 0x33, 0x33, 0x3f, "v_frexp_exp_i32_f32", True, False), + (0x40, 0x40, 0x34, 0x34, 0x40, "v_frexp_mant_f32", True, False), + (0x41, 0x41, 0x35, 0x35, 0x41, "v_clrexcp", False, False), + (0x42, 0x42, 0x36, -1, 0x42, "v_movreld_b32", False, False), + (0x43, 0x43, 0x37, -1, 0x43, "v_movrels_b32", False, False), + (0x44, 0x44, 0x38, -1, 0x44, "v_movrelsd_b32", False, False), + ( -1, -1, -1, -1, 0x48, "v_movrelsd_2_b32", False, False), + ( -1, -1, -1, 0x37, -1, "v_screen_partition_4se_b32", False, False), + ( -1, -1, 0x39, 0x39, 0x50, "v_cvt_f16_u16", False, True), + ( -1, -1, 0x3a, 0x3a, 0x51, "v_cvt_f16_i16", False, True), + ( -1, -1, 0x3b, 0x3b, 0x52, "v_cvt_u16_f16", True, False), + ( -1, -1, 0x3c, 0x3c, 0x53, "v_cvt_i16_f16", True, False), + ( -1, -1, 0x3d, 0x3d, 0x54, "v_rcp_f16", True, True), + ( -1, -1, 0x3e, 0x3e, 0x55, "v_sqrt_f16", True, True), + ( -1, -1, 0x3f, 0x3f, 0x56, "v_rsq_f16", True, True), + ( -1, -1, 0x40, 0x40, 0x57, "v_log_f16", True, True), + ( -1, -1, 0x41, 0x41, 0x58, "v_exp_f16", True, True), + ( -1, -1, 0x42, 0x42, 0x59, "v_frexp_mant_f16", True, False), + ( -1, -1, 0x43, 0x43, 0x5a, "v_frexp_exp_i16_f16", True, False), + ( -1, -1, 0x44, 0x44, 0x5b, "v_floor_f16", True, True), + ( -1, -1, 0x45, 0x45, 0x5c, "v_ceil_f16", True, True), + ( -1, -1, 0x46, 0x46, 0x5d, "v_trunc_f16", True, True), + ( -1, -1, 0x47, 0x47, 0x5e, "v_rndne_f16", True, True), + ( -1, -1, 0x48, 0x48, 0x5f, "v_fract_f16", True, True), + ( -1, -1, 0x49, 0x49, 0x60, "v_sin_f16", True, True), + ( -1, -1, 0x4a, 0x4a, 0x61, "v_cos_f16", True, True), + ( -1, 
0x46, 0x4b, 0x4b, -1, "v_exp_legacy_f32", True, True), + ( -1, 0x45, 0x4c, 0x4c, -1, "v_log_legacy_f32", True, True), + ( -1, -1, -1, 0x4f, 0x62, "v_sat_pk_u8_i16", False, False), + ( -1, -1, -1, 0x4d, 0x63, "v_cvt_norm_i16_f16", True, False), + ( -1, -1, -1, 0x4e, 0x64, "v_cvt_norm_u16_f16", True, False), + ( -1, -1, -1, 0x51, 0x65, "v_swap_b32", False, False), + ( -1, -1, -1, -1, 0x68, "v_swaprel_b32", False, False), +} +for (gfx6, gfx7, gfx8, gfx9, gfx10, name, in_mod, out_mod) in VOP1: + opcode(name, gfx9, gfx10, Format.VOP1, in_mod, out_mod) + + +# VOPC instructions: + +VOPC_CLASS = { + (0x88, 0x88, 0x10, 0x10, 0x88, "v_cmp_class_f32"), + ( -1, -1, 0x14, 0x14, 0x8f, "v_cmp_class_f16"), + (0x98, 0x98, 0x11, 0x11, 0x98, "v_cmpx_class_f32"), + ( -1, -1, 0x15, 0x15, 0x9f, "v_cmpx_class_f16"), + (0xa8, 0xa8, 0x12, 0x12, 0xa8, "v_cmp_class_f64"), + (0xb8, 0xb8, 0x13, 0x13, 0xb8, "v_cmpx_class_f64"), +} +for (gfx6, gfx7, gfx8, gfx9, gfx10, name) in VOPC_CLASS: + opcode(name, gfx9, gfx10, Format.VOPC, True, False) + +COMPF = ["f", "lt", "eq", "le", "gt", "lg", "ge", "o", "u", "nge", "nlg", "ngt", "nle", "neq", "nlt", "tru"] + +for i in range(8): + (gfx6, gfx7, gfx8, gfx9, gfx10, name) = (-1, -1, 0x20+i, 0x20+i, 0xc8+i, "v_cmp_"+COMPF[i]+"_f16") + opcode(name, gfx9, gfx10, Format.VOPC, True, False) + (gfx6, gfx7, gfx8, gfx9, gfx10, name) = (-1, -1, 0x30+i, 0x30+i, 0xd8+i, "v_cmpx_"+COMPF[i]+"_f16") + opcode(name, gfx9, gfx10, Format.VOPC, True, False) + (gfx6, gfx7, gfx8, gfx9, gfx10, name) = (-1, -1, 0x28+i, 0x28+i, 0xe8+i, "v_cmp_"+COMPF[i+8]+"_f16") + opcode(name, gfx9, gfx10, Format.VOPC, True, False) + (gfx6, gfx7, gfx8, gfx9, gfx10, name) = (-1, -1, 0x38+i, 0x38+i, 0xf8+i, "v_cmpx_"+COMPF[i+8]+"_f16") + opcode(name, gfx9, gfx10, Format.VOPC, True, False) + +for i in range(16): + (gfx6, gfx7, gfx8, gfx9, gfx10, name) = (0x00+i, 0x00+i, 0x40+i, 0x40+i, 0x00+i, "v_cmp_"+COMPF[i]+"_f32") + opcode(name, gfx9, gfx10, Format.VOPC, True, False) + (gfx6, gfx7, gfx8, gfx9, gfx10, name) = (0x10+i, 0x10+i, 0x50+i, 0x50+i, 0x10+i, "v_cmpx_"+COMPF[i]+"_f32") + opcode(name, gfx9, gfx10, Format.VOPC, True, False) + (gfx6, gfx7, gfx8, gfx9, gfx10, name) = (0x20+i, 0x20+i, 0x60+i, 0x60+i, 0x20+i, "v_cmp_"+COMPF[i]+"_f64") + opcode(name, gfx9, gfx10, Format.VOPC, True, False) + (gfx6, gfx7, gfx8, gfx9, gfx10, name) = (0x30+i, 0x30+i, 0x70+i, 0x70+i, 0x30+i, "v_cmpx_"+COMPF[i]+"_f64") + opcode(name, gfx9, gfx10, Format.VOPC, True, False) + # GFX_6_7 + (gfx6, gfx7, gfx8, gfx9, gfx10, name) = (0x40+i, 0x40+i, -1, -1, -1, "v_cmps_"+COMPF[i]+"_f32") + (gfx6, gfx7, gfx8, gfx9, gfx10, name) = (0x50+i, 0x50+i, -1, -1, -1, "v_cmpsx_"+COMPF[i]+"_f32") + (gfx6, gfx7, gfx8, gfx9, gfx10, name) = (0x60+i, 0x60+i, -1, -1, -1, "v_cmps_"+COMPF[i]+"_f64") + (gfx6, gfx7, gfx8, gfx9, gfx10, name) = (0x70+i, 0x70+i, -1, -1, -1, "v_cmpsx_"+COMPF[i]+"_f64") + +COMPI = ["f", "lt", "eq", "le", "gt", "lg", "ge", "tru"] + +# GFX_8_9 +for i in [0,7]: # only 0 and 7 + (gfx6, gfx7, gfx8, gfx9, gfx10, name) = (-1, -1, 0xa0+i, 0xa0+i, -1, "v_cmp_"+COMPI[i]+"_i16") + opcode(name, gfx9, gfx10, Format.VOPC) + (gfx6, gfx7, gfx8, gfx9, gfx10, name) = (-1, -1, 0xb0+i, 0xb0+i, -1, "v_cmpx_"+COMPI[i]+"_i16") + opcode(name, gfx9, gfx10, Format.VOPC) + (gfx6, gfx7, gfx8, gfx9, gfx10, name) = (-1, -1, 0xa8+i, 0xa8+i, -1, "v_cmp_"+COMPI[i]+"_u16") + opcode(name, gfx9, gfx10, Format.VOPC) + (gfx6, gfx7, gfx8, gfx9, gfx10, name) = (-1, -1, 0xb8+i, 0xb8+i, -1, "v_cmpx_"+COMPI[i]+"_u16") + opcode(name, gfx9, gfx10, Format.VOPC) + +for i in range(1, 7): 
# [1..6] + (gfx6, gfx7, gfx8, gfx9, gfx10, name) = (-1, -1, 0xa0+i, 0xa0+i, 0x88+i, "v_cmp_"+COMPI[i]+"_i16") + opcode(name, gfx9, gfx10, Format.VOPC) + (gfx6, gfx7, gfx8, gfx9, gfx10, name) = (-1, -1, 0xb0+i, 0xb0+i, 0x98+i, "v_cmpx_"+COMPI[i]+"_i16") + opcode(name, gfx9, gfx10, Format.VOPC) + (gfx6, gfx7, gfx8, gfx9, gfx10, name) = (-1, -1, 0xa8+i, 0xa8+i, 0xa8+i, "v_cmp_"+COMPI[i]+"_u16") + opcode(name, gfx9, gfx10, Format.VOPC) + (gfx6, gfx7, gfx8, gfx9, gfx10, name) = (-1, -1, 0xb8+i, 0xb8+i, 0xb8+i, "v_cmpx_"+COMPI[i]+"_u16") + opcode(name, gfx9, gfx10, Format.VOPC) + +for i in range(8): + (gfx6, gfx7, gfx8, gfx9, gfx10, name) = (0x80+i, 0x80+i, 0xc0+i, 0xc0+i, 0x80+i, "v_cmp_"+COMPI[i]+"_i32") + opcode(name, gfx9, gfx10, Format.VOPC) + (gfx6, gfx7, gfx8, gfx9, gfx10, name) = (0x90+i, 0x90+i, 0xd0+i, 0xd0+i, 0x90+i, "v_cmpx_"+COMPI[i]+"_i32") + opcode(name, gfx9, gfx10, Format.VOPC) + (gfx6, gfx7, gfx8, gfx9, gfx10, name) = (0xa0+i, 0xa0+i, 0xe0+i, 0xe0+i, 0xa0+i, "v_cmp_"+COMPI[i]+"_i64") + opcode(name, gfx9, gfx10, Format.VOPC) + (gfx6, gfx7, gfx8, gfx9, gfx10, name) = (0xb0+i, 0xb0+i, 0xf0+i, 0xf0+i, 0xb0+i, "v_cmpx_"+COMPI[i]+"_i64") + opcode(name, gfx9, gfx10, Format.VOPC) + (gfx6, gfx7, gfx8, gfx9, gfx10, name) = (0xc0+i, 0xc0+i, 0xc8+i, 0xc8+i, 0xc0+i, "v_cmp_"+COMPI[i]+"_u32") + opcode(name, gfx9, gfx10, Format.VOPC) + (gfx6, gfx7, gfx8, gfx9, gfx10, name) = (0xd0+i, 0xd0+i, 0xd8+i, 0xd8+i, 0xd0+i, "v_cmpx_"+COMPI[i]+"_u32") + opcode(name, gfx9, gfx10, Format.VOPC) + (gfx6, gfx7, gfx8, gfx9, gfx10, name) = (0xe0+i, 0xe0+i, 0xe8+i, 0xe8+i, 0xe0+i, "v_cmp_"+COMPI[i]+"_u64") + opcode(name, gfx9, gfx10, Format.VOPC) + (gfx6, gfx7, gfx8, gfx9, gfx10, name) = (0xf0+i, 0xf0+i, 0xf8+i, 0xf8+i, 0xf0+i, "v_cmpx_"+COMPI[i]+"_u64") + opcode(name, gfx9, gfx10, Format.VOPC) + + +# VOPP instructions: packed 16bit instructions - 1 or 2 inputs and 1 output +VOPP = { + (0x00, "v_pk_mad_i16"), + (0x01, "v_pk_mul_lo_u16"), + (0x02, "v_pk_add_i16"), + (0x03, "v_pk_sub_i16"), + (0x04, "v_pk_lshlrev_b16"), + (0x05, "v_pk_lshrrev_b16"), + (0x06, "v_pk_ashrrev_i16"), + (0x07, "v_pk_max_i16"), + (0x08, "v_pk_min_i16"), + (0x09, "v_pk_mad_u16"), + (0x0a, "v_pk_add_u16"), + (0x0b, "v_pk_sub_u16"), + (0x0c, "v_pk_max_u16"), + (0x0d, "v_pk_min_u16"), + (0x0e, "v_pk_fma_f16"), + (0x0f, "v_pk_add_f16"), + (0x10, "v_pk_mul_f16"), + (0x11, "v_pk_min_f16"), + (0x12, "v_pk_max_f16"), + (0x20, "v_pk_fma_mix_f32"), # v_mad_mix_f32 in VEGA ISA, v_fma_mix_f32 in RDNA ISA + (0x21, "v_pk_fma_mixlo_f16"), # v_mad_mixlo_f16 in VEGA ISA, v_fma_mixlo_f16 in RDNA ISA + (0x22, "v_pk_fma_mixhi_f16"), # v_mad_mixhi_f16 in VEGA ISA, v_fma_mixhi_f16 in RDNA ISA +} +# note that these are only supported on gfx9+ so we'll need to distinguish between gfx8 and gfx9 here +# (gfx6, gfx7, gfx8, gfx9, gfx10, name) = (-1, -1, -1, code, code, name) +for (code, name) in VOPP: + opcode(name, code, code, Format.VOP3P) + + +# VINTERP instructions: +VINTRP = { + (0x00, "v_interp_p1_f32"), + (0x01, "v_interp_p2_f32"), + (0x02, "v_interp_mov_f32"), +} +# (gfx6, gfx7, gfx8, gfx9, gfx10, name) = (code, code, code, code, code, name) +for (code, name) in VINTRP: + opcode(name, code, code, Format.VINTRP) + +# VOP3 instructions: 3 inputs, 1 output +# VOP3b instructions: have a unique scalar output, e.g. 
VOP2 with vcc out +VOP3 = { + (0x140, 0x140, 0x1c0, 0x1c0, 0x140, "v_mad_legacy_f32", True, True), + (0x141, 0x141, 0x1c1, 0x1c1, 0x141, "v_mad_f32", True, True), + (0x142, 0x142, 0x1c2, 0x1c2, 0x142, "v_mad_i32_i24", False, False), + (0x143, 0x143, 0x1c3, 0x1c3, 0x143, "v_mad_u32_u24", False, False), + (0x144, 0x144, 0x1c4, 0x1c4, 0x144, "v_cubeid_f32", True, True), + (0x145, 0x145, 0x1c5, 0x1c5, 0x145, "v_cubesc_f32", True, True), + (0x146, 0x146, 0x1c6, 0x1c6, 0x146, "v_cubetc_f32", True, True), + (0x147, 0x147, 0x1c7, 0x1c7, 0x147, "v_cubema_f32", True, True), + (0x148, 0x148, 0x1c8, 0x1c8, 0x148, "v_bfe_u32", False, False), + (0x149, 0x149, 0x1c9, 0x1c9, 0x149, "v_bfe_i32", False, False), + (0x14a, 0x14a, 0x1ca, 0x1ca, 0x14a, "v_bfi_b32", False, False), + (0x14b, 0x14b, 0x1cb, 0x1cb, 0x14b, "v_fma_f32", True, True), + (0x14c, 0x14c, 0x1cc, 0x1cc, 0x14c, "v_fma_f64", True, True), + (0x14d, 0x14d, 0x1cd, 0x1cd, 0x14d, "v_lerp_u8", False, False), + (0x14e, 0x14e, 0x1ce, 0x1ce, 0x14e, "v_alignbit_b32", False, False), + (0x14f, 0x14f, 0x1cf, 0x1cf, 0x14f, "v_alignbyte_b32", False, False), + (0x150, 0x150, -1, -1, 0x150, "v_mullit_f32", True, True), + (0x151, 0x151, 0x1d0, 0x1d0, 0x151, "v_min3_f32", True, True), + (0x152, 0x152, 0x1d1, 0x1d1, 0x152, "v_min3_i32", False, False), + (0x153, 0x153, 0x1d2, 0x1d2, 0x153, "v_min3_u32", False, False), + (0x154, 0x154, 0x1d3, 0x1d3, 0x154, "v_max3_f32", True, True), + (0x155, 0x155, 0x1d4, 0x1d4, 0x155, "v_max3_i32", False, False), + (0x156, 0x156, 0x1d5, 0x1d5, 0x156, "v_max3_u32", False, False), + (0x157, 0x157, 0x1d6, 0x1d6, 0x157, "v_med3_f32", True, True), + (0x158, 0x158, 0x1d7, 0x1d7, 0x158, "v_med3_i32", False, False), + (0x159, 0x159, 0x1d8, 0x1d8, 0x159, "v_med3_u32", False, False), + (0x15a, 0x15a, 0x1d9, 0x1d9, 0x15a, "v_sad_u8", False, False), + (0x15b, 0x15b, 0x1da, 0x1da, 0x15b, "v_sad_hi_u8", False, False), + (0x15c, 0x15c, 0x1db, 0x1db, 0x15c, "v_sad_u16", False, False), + (0x15d, 0x15d, 0x1dc, 0x1dc, 0x15d, "v_sad_u32", False, False), + (0x15e, 0x15e, 0x1dd, 0x1dd, 0x15e, "v_cvt_pk_u8_f32", True, False), + (0x15f, 0x15f, 0x1de, 0x1de, 0x15f, "v_div_fixup_f32", True, True), + (0x160, 0x160, 0x1df, 0x1df, 0x160, "v_div_fixup_f64", True, True), + (0x161, 0x161, -1, -1, -1, "v_lshl_b64", False, False), + (0x162, 0x162, -1, -1, -1, "v_lshr_b64", False, False), + (0x163, 0x163, -1, -1, -1, "v_ashr_i64", False, False), + (0x164, 0x164, 0x280, 0x280, 0x164, "v_add_f64", True, True), + (0x165, 0x165, 0x281, 0x281, 0x165, "v_mul_f64", True, True), + (0x166, 0x166, 0x282, 0x282, 0x166, "v_min_f64", True, True), + (0x167, 0x167, 0x283, 0x283, 0x167, "v_max_f64", True, True), + (0x168, 0x168, 0x284, 0x284, 0x168, "v_ldexp_f64", False, True), # src1 can take input modifiers + (0x169, 0x169, 0x285, 0x285, 0x169, "v_mul_lo_u32", False, False), + (0x16a, 0x16a, 0x286, 0x286, 0x16a, "v_mul_hi_u32", False, False), + (0x16b, 0x16b, 0x285, 0x285, 0x16b, "v_mul_lo_i32", False, False), # identical to v_mul_lo_u32 + (0x16c, 0x16c, 0x287, 0x287, 0x16c, "v_mul_hi_i32", False, False), + (0x16d, 0x16d, 0x1e0, 0x1e0, 0x16d, "v_div_scale_f32", True, True), # writes to VCC + (0x16e, 0x16e, 0x1e1, 0x1e1, 0x16e, "v_div_scale_f64", True, True), # writes to VCC + (0x16f, 0x16f, 0x1e2, 0x1e2, 0x16f, "v_div_fmas_f32", True, True), # takes VCC input + (0x170, 0x170, 0x1e3, 0x1e3, 0x170, "v_div_fmas_f64", True, True), # takes VCC input + (0x171, 0x171, 0x1e4, 0x1e4, 0x171, "v_msad_u8", False, False), + (0x172, 0x172, 0x1e5, 0x1e5, 0x172, "v_qsad_pk_u16_u8", False, 
False), + (0x172, -1, -1, -1, -1, "v_qsad_u8", False, False), # what's the difference? + (0x173, 0x173, 0x1e6, 0x1e6, 0x173, "v_mqsad_pk_u16_u8", False, False), + (0x173, -1, -1, -1, -1, "v_mqsad_u8", False, False), # what's the difference? + (0x174, 0x174, 0x292, 0x292, 0x174, "v_trig_preop_f64", False, False), + ( -1, 0x175, 0x1e7, 0x1e7, 0x175, "v_mqsad_u32_u8", False, False), + ( -1, 0x176, 0x1e8, 0x1e8, 0x176, "v_mad_u64_u32", False, False), + ( -1, 0x177, 0x1e9, 0x1e9, 0x177, "v_mad_i64_i32", False, False), + ( -1, -1, 0x1ea, 0x1ea, -1, "v_mad_legacy_f16", True, True), + ( -1, -1, 0x1eb, 0x1eb, -1, "v_mad_legacy_u16", False, False), + ( -1, -1, 0x1ec, 0x1ec, -1, "v_mad_legacy_i16", False, False), + ( -1, -1, 0x1ed, 0x1ed, 0x344, "v_perm_b32", False, False), + ( -1, -1, 0x1ee, 0x1ee, -1, "v_fma_legacy_f16", True, True), + ( -1, -1, 0x1ef, 0x1ef, -1, "v_div_fixup_legacy_f16", True, True), + (0x12c, 0x12c, 0x1f0, 0x1f0, -1, "v_cvt_pkaccum_u8_f32", True, False), + ( -1, -1, -1, 0x1f1, 0x373, "v_mad_u32_u16", False, False), + ( -1, -1, -1, 0x1f2, 0x375, "v_mad_i32_i16", False, False), + ( -1, -1, -1, 0x1f2, 0x345, "v_xad_u32", False, False), + ( -1, -1, -1, 0x1f4, 0x351, "v_min3_f16", True, True), + ( -1, -1, -1, 0x1f5, 0x352, "v_min3_i16", False, False), + ( -1, -1, -1, 0x1f6, 0x353, "v_min3_u16", False, False), + ( -1, -1, -1, 0x1f7, 0x354, "v_max3_f16", True, True), + ( -1, -1, -1, 0x1f8, 0x355, "v_max3_i16", False, False), + ( -1, -1, -1, 0x1f9, 0x356, "v_max3_u16", False, False), + ( -1, -1, -1, 0x1fa, 0x357, "v_med3_f16", True, True), + ( -1, -1, -1, 0x1fb, 0x358, "v_med3_i16", False, False), + ( -1, -1, -1, 0x1fc, 0x359, "v_med3_u16", False, False), + ( -1, -1, -1, 0x1fd, 0x346, "v_lshl_add_u32", False, False), + ( -1, -1, -1, 0x1fe, 0x347, "v_add_lshl_u32", False, False), + ( -1, -1, -1, 0x1ff, 0x36d, "v_add3_u32", False, False), + ( -1, -1, -1, 0x200, 0x36f, "v_lshl_or_b32", False, False), + ( -1, -1, -1, 0x201, 0x371, "v_and_or_b32", False, False), + ( -1, -1, -1, 0x202, 0x372, "v_or3_b32", False, False), + ( -1, -1, -1, 0x203, -1, "v_mad_f16", True, True), + ( -1, -1, -1, 0x204, 0x340, "v_mad_u16", False, False), + ( -1, -1, -1, 0x205, 0x35e, "v_mad_i16", False, False), + ( -1, -1, -1, 0x206, 0x34b, "v_fma_f16", True, True), + ( -1, -1, -1, 0x207, 0x35f, "v_div_fixup_f16", True, True), + ( -1, -1, 0x274, 0x274, 0x342, "v_interp_p1ll_f16", True, True), + ( -1, -1, 0x275, 0x275, 0x343, "v_interp_p1lv_f16", True, True), + ( -1, -1, 0x276, 0x276, -1, "v_interp_p2_legacy_f16", True, True), + ( -1, -1, -1, 0x277, 0x35a, "v_interp_p2_f16", True, True), + (0x12b, 0x12b, 0x288, 0x288, 0x362, "v_ldexp_f32", False, True), + (0x101, 0x101, 0x289, 0x289, 0x360, "v_readlane_b32", False, False), + (0x102, 0x102, 0x28a, 0x28a, 0x361, "v_writelane_b32", False, False), + (0x122, 0x122, 0x28b, 0x28b, 0x364, "v_bcnt_u32_b32", False, False), + (0x123, 0x123, 0x28c, 0x28c, 0x365, "v_mbcnt_lo_u32_b32", False, False), + (0x124, 0x124, 0x28d, 0x28d, 0x366, "v_mbcnt_hi_u32_b32", False, False), + ( -1, -1, 0x28f, 0x28f, 0x2ff, "v_lshlrev_b64", False, False), + ( -1, -1, 0x290, 0x290, 0x300, "v_lshrrev_b64", False, False), + ( -1, -1, 0x291, 0x291, 0x301, "v_ashrrev_i64", False, False), + (0x11e, 0x11e, 0x293, 0x293, 0x363, "v_bfm_b32", False, False), + (0x12d, 0x12d, 0x294, 0x294, 0x368, "v_cvt_pknorm_i16_f32", True, False), + (0x12e, 0x12e, 0x295, 0x295, 0x369, "v_cvt_pknorm_u16_f32", True, False), + (0x12f, 0x12f, 0x296, 0x296, 0x12f, "v_cvt_pkrtz_f16_f32", True, False), # GFX6_7_10 is VOP2 with opcode 
0x02f + (0x130, 0x130, 0x297, 0x297, 0x36a, "v_cvt_pk_u16_u32", False, False), + (0x131, 0x131, 0x298, 0x298, 0x36b, "v_cvt_pk_i16_i32", False, False), + ( -1, -1, -1, 0x299, 0x312, "v_cvt_pknorm_i16_f16", True, False), + ( -1, -1, -1, 0x29a, 0x313, "v_cvt_pknorm_u16_f16", True, False), + ( -1, -1, -1, 0x29c, 0x37f, "v_add_i32", False, False), + ( -1, -1, -1, 0x29d, 0x376, "v_sub_i32", False, False), + ( -1, -1, -1, 0x29e, 0x30d, "v_add_i16", False, False), + ( -1, -1, -1, 0x29f, 0x30e, "v_sub_i16", False, False), + ( -1, -1, -1, 0x2a0, 0x311, "v_pack_b32_f16", True, False), + ( -1, -1, -1, -1, 0x178, "v_xor3_b32", False, False), + ( -1, -1, -1, -1, 0x377, "v_permlane16_b32", False, False), + ( -1, -1, -1, -1, 0x378, "v_permlanex16_b32", False, False), + ( -1, -1, -1, -1, 0x30f, "v_add_co_u32_e64", False, False), + ( -1, -1, -1, -1, 0x310, "v_sub_co_u32_e64", False, False), + ( -1, -1, -1, -1, 0x311, "v_subrev_co_u32_e64", False, False), +# TODO: many 16bit instructions moved from VOP2 to VOP3 on GFX10 +} +for (gfx6, gfx7, gfx8, gfx9, gfx10, name, in_mod, out_mod) in VOP3: + opcode(name, gfx9, gfx10, Format.VOP3A, in_mod, out_mod) + + +# DS instructions: 3 inputs (1 addr, 2 data), 1 output +DS = { + (0x00, 0x00, 0x00, 0x00, 0x00, "ds_add_u32"), + (0x01, 0x01, 0x01, 0x01, 0x01, "ds_sub_u32"), + (0x02, 0x02, 0x02, 0x02, 0x02, "ds_rsub_u32"), + (0x03, 0x03, 0x03, 0x03, 0x03, "ds_inc_u32"), + (0x04, 0x04, 0x04, 0x04, 0x04, "ds_dec_u32"), + (0x05, 0x05, 0x05, 0x05, 0x05, "ds_min_i32"), + (0x06, 0x06, 0x06, 0x06, 0x06, "ds_max_i32"), + (0x07, 0x07, 0x07, 0x07, 0x07, "ds_min_u32"), + (0x08, 0x08, 0x08, 0x08, 0x08, "ds_max_u32"), + (0x09, 0x09, 0x09, 0x09, 0x09, "ds_and_b32"), + (0x0a, 0x0a, 0x0a, 0x0a, 0x0a, "ds_or_b32"), + (0x0b, 0x0b, 0x0b, 0x0b, 0x0b, "ds_xor_b32"), + (0x0c, 0x0c, 0x0c, 0x0c, 0x0c, "ds_mskor_b32"), + (0x0d, 0x0d, 0x0d, 0x0d, 0x0d, "ds_write_b32"), + (0x0e, 0x0e, 0x0e, 0x0e, 0x0e, "ds_write2_b32"), + (0x0f, 0x0f, 0x0f, 0x0f, 0x0f, "ds_write2st64_b32"), + (0x10, 0x10, 0x10, 0x10, 0x10, "ds_cmpst_b32"), + (0x11, 0x11, 0x11, 0x11, 0x11, "ds_cmpst_f32"), + (0x12, 0x12, 0x12, 0x12, 0x12, "ds_min_f32"), + (0x13, 0x13, 0x13, 0x13, 0x13, "ds_max_f32"), + ( -1, 0x14, 0x14, 0x14, 0x14, "ds_nop"), + ( -1, -1, 0x15, 0x15, 0x15, "ds_add_f32"), + ( -1, -1, 0x1d, 0x1d, 0xb0, "ds_write_addtid_b32"), + (0x1e, 0x1e, 0x1e, 0x1e, 0x1e, "ds_write_b8"), + (0x1f, 0x1f, 0x1f, 0x1f, 0x1f, "ds_write_b16"), + (0x20, 0x20, 0x20, 0x20, 0x20, "ds_add_rtn_u32"), + (0x21, 0x21, 0x21, 0x21, 0x21, "ds_sub_rtn_u32"), + (0x22, 0x22, 0x22, 0x22, 0x22, "ds_rsub_rtn_u32"), + (0x23, 0x23, 0x23, 0x23, 0x23, "ds_inc_rtn_u32"), + (0x24, 0x24, 0x24, 0x24, 0x24, "ds_dec_rtn_u32"), + (0x25, 0x25, 0x25, 0x25, 0x25, "ds_min_rtn_i32"), + (0x26, 0x26, 0x26, 0x26, 0x26, "ds_max_rtn_i32"), + (0x27, 0x27, 0x27, 0x27, 0x27, "ds_min_rtn_u32"), + (0x28, 0x28, 0x28, 0x28, 0x28, "ds_max_rtn_u32"), + (0x29, 0x29, 0x29, 0x29, 0x29, "ds_and_rtn_b32"), + (0x2a, 0x2a, 0x2a, 0x2a, 0x2a, "ds_or_rtn_b32"), + (0x2b, 0x2b, 0x2b, 0x2b, 0x2b, "ds_xor_rtn_b32"), + (0x2c, 0x2c, 0x2c, 0x2c, 0x2c, "ds_mskor_rtn_b32"), + (0x2d, 0x2d, 0x2d, 0x2d, 0x2d, "ds_wrxchg_rtn_b32"), + (0x2e, 0x2e, 0x2e, 0x2e, 0x2e, "ds_wrxchg2_rtn_b32"), + (0x2f, 0x2f, 0x2f, 0x2f, 0x2f, "ds_wrxchg2st64_rtn_b32"), + (0x30, 0x30, 0x30, 0x30, 0x30, "ds_cmpst_rtn_b32"), + (0x31, 0x31, 0x31, 0x31, 0x31, "ds_cmpst_rtn_f32"), + (0x32, 0x32, 0x32, 0x32, 0x32, "ds_min_rtn_f32"), + (0x33, 0x33, 0x33, 0x33, 0x33, "ds_max_rtn_f32"), + ( -1, 0x34, 0x34, 0x34, 0x34, "ds_wrap_rtn_b32"), + ( 
-1, -1, 0x35, 0x35, 0x55, "ds_add_rtn_f32"), + (0x36, 0x36, 0x36, 0x36, 0x36, "ds_read_b32"), + (0x37, 0x37, 0x37, 0x37, 0x37, "ds_read2_b32"), + (0x38, 0x38, 0x38, 0x38, 0x38, "ds_read2st64_b32"), + (0x39, 0x39, 0x39, 0x39, 0x39, "ds_read_i8"), + (0x3a, 0x3a, 0x3a, 0x3a, 0x3a, "ds_read_u8"), + (0x3b, 0x3b, 0x3b, 0x3b, 0x3b, "ds_read_i16"), + (0x3c, 0x3c, 0x3c, 0x3c, 0x3c, "ds_read_u16"), + (0x35, 0x35, 0x3d, 0x3d, 0x35, "ds_swizzle_b32"), #data1 & offset, no addr/data2 + ( -1, -1, 0x3e, 0x3e, 0xb2, "ds_permute_b32"), + ( -1, -1, 0x3f, 0x3f, 0xb3, "ds_bpermute_b32"), + (0x40, 0x40, 0x40, 0x40, 0x40, "ds_add_u64"), + (0x41, 0x41, 0x41, 0x41, 0x41, "ds_sub_u64"), + (0x42, 0x42, 0x42, 0x42, 0x42, "ds_rsub_u64"), + (0x43, 0x43, 0x43, 0x43, 0x43, "ds_inc_u64"), + (0x44, 0x44, 0x44, 0x44, 0x44, "ds_dec_u64"), + (0x45, 0x45, 0x45, 0x45, 0x45, "ds_min_i64"), + (0x46, 0x46, 0x46, 0x46, 0x46, "ds_max_i64"), + (0x47, 0x47, 0x47, 0x47, 0x47, "ds_min_u64"), + (0x48, 0x48, 0x48, 0x48, 0x48, "ds_max_u64"), + (0x49, 0x49, 0x49, 0x49, 0x49, "ds_and_b64"), + (0x4a, 0x4a, 0x4a, 0x4a, 0x4a, "ds_or_b64"), + (0x4b, 0x4b, 0x4b, 0x4b, 0x4b, "ds_xor_b64"), + (0x4c, 0x4c, 0x4c, 0x4c, 0x4c, "ds_mskor_b64"), + (0x4d, 0x4d, 0x4d, 0x4d, 0x4d, "ds_write_b64"), + (0x4e, 0x4e, 0x4e, 0x4e, 0x4e, "ds_write2_b64"), + (0x4f, 0x4f, 0x4f, 0x4f, 0x4f, "ds_write2st64_b64"), + (0x50, 0x50, 0x50, 0x50, 0x50, "ds_cmpst_b64"), + (0x51, 0x51, 0x51, 0x51, 0x51, "ds_cmpst_f64"), + (0x52, 0x52, 0x52, 0x52, 0x52, "ds_min_f64"), + (0x53, 0x53, 0x53, 0x53, 0x53, "ds_max_f64"), + ( -1, -1, 0x54, 0x54, 0xa0, "ds_write_b8_d16_hi"), + ( -1, -1, 0x55, 0x55, 0xa1, "ds_write_b16_d16_hi"), + ( -1, -1, 0x56, 0x56, 0xa2, "ds_read_u8_d16"), + ( -1, -1, 0x57, 0x57, 0xa3, "ds_read_u8_d16_hi"), + ( -1, -1, 0x58, 0x58, 0xa4, "ds_read_i8_d16"), + ( -1, -1, 0x59, 0x59, 0xa5, "ds_read_i8_d16_hi"), + ( -1, -1, 0x5a, 0x5a, 0xa6, "ds_read_u16_d16"), + ( -1, -1, 0x5b, 0x5b, 0xa7, "ds_read_u16_d16_hi"), + (0x60, 0x60, 0x60, 0x60, 0x60, "ds_add_rtn_u64"), + (0x61, 0x61, 0x61, 0x61, 0x61, "ds_sub_rtn_u64"), + (0x62, 0x62, 0x62, 0x62, 0x62, "ds_rsub_rtn_u64"), + (0x63, 0x63, 0x63, 0x63, 0x63, "ds_inc_rtn_u64"), + (0x64, 0x64, 0x64, 0x64, 0x64, "ds_dec_rtn_u64"), + (0x65, 0x65, 0x65, 0x65, 0x65, "ds_min_rtn_i64"), + (0x66, 0x66, 0x66, 0x66, 0x66, "ds_max_rtn_i64"), + (0x67, 0x67, 0x67, 0x67, 0x67, "ds_min_rtn_u64"), + (0x68, 0x68, 0x68, 0x68, 0x68, "ds_max_rtn_u64"), + (0x69, 0x69, 0x69, 0x69, 0x69, "ds_and_rtn_b64"), + (0x6a, 0x6a, 0x6a, 0x6a, 0x6a, "ds_or_rtn_b64"), + (0x6b, 0x6b, 0x6b, 0x6b, 0x6b, "ds_xor_rtn_b64"), + (0x6c, 0x6c, 0x6c, 0x6c, 0x6c, "ds_mskor_rtn_b64"), + (0x6d, 0x6d, 0x6d, 0x6d, 0x6d, "ds_wrxchg_rtn_b64"), + (0x6e, 0x6e, 0x6e, 0x6e, 0x6e, "ds_wrxchg2_rtn_b64"), + (0x6f, 0x6f, 0x6f, 0x6f, 0x6f, "ds_wrxchg2st64_rtn_b64"), + (0x70, 0x70, 0x70, 0x70, 0x70, "ds_cmpst_rtn_b64"), + (0x71, 0x71, 0x71, 0x71, 0x71, "ds_cmpst_rtn_f64"), + (0x72, 0x72, 0x72, 0x72, 0x72, "ds_min_rtn_f64"), + (0x73, 0x73, 0x73, 0x73, 0x73, "ds_max_rtn_f64"), + (0x76, 0x76, 0x76, 0x76, 0x76, "ds_read_b64"), + (0x77, 0x77, 0x77, 0x77, 0x77, "ds_read2_b64"), + (0x78, 0x78, 0x78, 0x78, 0x78, "ds_read2st64_b64"), + ( -1, 0x7e, 0x7e, 0x7e, 0x7e, "ds_condxchg32_rtn_b64"), + (0x80, 0x80, 0x80, 0x80, 0x80, "ds_add_src2_u32"), + (0x81, 0x81, 0x81, 0x81, 0x81, "ds_sub_src2_u32"), + (0x82, 0x82, 0x82, 0x82, 0x82, "ds_rsub_src2_u32"), + (0x83, 0x83, 0x83, 0x83, 0x83, "ds_inc_src2_u32"), + (0x84, 0x84, 0x84, 0x84, 0x84, "ds_dec_src2_u32"), + (0x85, 0x85, 0x85, 0x85, 0x85, "ds_min_src2_i32"), 
+ (0x86, 0x86, 0x86, 0x86, 0x86, "ds_max_src2_i32"), + (0x87, 0x87, 0x87, 0x87, 0x87, "ds_min_src2_u32"), + (0x88, 0x88, 0x88, 0x88, 0x88, "ds_max_src2_u32"), + (0x89, 0x89, 0x89, 0x89, 0x89, "ds_and_src2_b32"), + (0x8a, 0x8a, 0x8a, 0x8a, 0x8a, "ds_or_src2_b32"), + (0x8b, 0x8b, 0x8b, 0x8b, 0x8b, "ds_xor_src2_b32"), + (0x8d, 0x8d, 0x8d, 0x8d, 0x8d, "ds_write_src2_b32"), + (0x92, 0x92, 0x92, 0x92, 0x92, "ds_min_src2_f32"), + (0x93, 0x93, 0x93, 0x93, 0x93, "ds_max_src2_f32"), + ( -1, -1, 0x95, 0x95, 0x95, "ds_add_src2_f32"), + ( -1, 0x18, 0x98, 0x98, 0x18, "ds_gws_sema_release_all"), + (0x19, 0x19, 0x99, 0x99, 0x19, "ds_gws_init"), + (0x1a, 0x1a, 0x9a, 0x9a, 0x1a, "ds_gws_sema_v"), + (0x1b, 0x1b, 0x9b, 0x9b, 0x1b, "ds_gws_sema_br"), + (0x1c, 0x1c, 0x9c, 0x9c, 0x1c, "ds_gws_sema_p"), + (0x1d, 0x1d, 0x9d, 0x9d, 0x1d, "ds_gws_barrier"), + ( -1, -1, 0xb6, 0xb6, 0xb1, "ds_read_addtid_b32"), + (0x3d, 0x3d, 0xbd, 0xbd, 0x3d, "ds_consume"), + (0x3e, 0x3e, 0xbe, 0xbe, 0x3e, "ds_append"), + (0x3f, 0x3f, 0xbf, 0xbf, 0x3f, "ds_ordered_count"), + (0xc0, 0xc0, 0xc0, 0xc0, 0xc0, "ds_add_src2_u64"), + (0xc1, 0xc1, 0xc1, 0xc1, 0xc1, "ds_sub_src2_u64"), + (0xc2, 0xc2, 0xc2, 0xc2, 0xc2, "ds_rsub_src2_u64"), + (0xc3, 0xc3, 0xc3, 0xc3, 0xc3, "ds_inc_src2_u64"), + (0xc4, 0xc4, 0xc4, 0xc4, 0xc4, "ds_dec_src2_u64"), + (0xc5, 0xc5, 0xc5, 0xc5, 0xc5, "ds_min_src2_i64"), + (0xc6, 0xc6, 0xc6, 0xc6, 0xc6, "ds_max_src2_i64"), + (0xc7, 0xc7, 0xc7, 0xc7, 0xc7, "ds_min_src2_u64"), + (0xc8, 0xc8, 0xc8, 0xc8, 0xc8, "ds_max_src2_u64"), + (0xc9, 0xc9, 0xc9, 0xc9, 0xc9, "ds_and_src2_b64"), + (0xca, 0xca, 0xca, 0xca, 0xca, "ds_or_src2_b64"), + (0xcb, 0xcb, 0xcb, 0xcb, 0xcb, "ds_xor_src2_b64"), + (0xcd, 0xcd, 0xcd, 0xcd, 0xcd, "ds_write_src2_b64"), + (0xd2, 0xd2, 0xd2, 0xd2, 0xd2, "ds_min_src2_f64"), + (0xd3, 0xd3, 0xd3, 0xd3, 0xd3, "ds_max_src2_f64"), + ( -1, 0xde, 0xde, 0xde, 0xde, "ds_write_b96"), + ( -1, 0xdf, 0xdf, 0xdf, 0xdf, "ds_write_b128"), + ( -1, 0xfd, 0xfd, -1, -1, "ds_condxchg32_rtn_b128"), + ( -1, 0xfe, 0xfe, 0xfe, 0xfe, "ds_read_b96"), + ( -1, 0xff, 0xff, 0xff, 0xff, "ds_read_b128"), +} +for (gfx6, gfx7, gfx8, gfx9, gfx10, name) in DS: + opcode(name, gfx9, gfx10, Format.DS) + +# MUBUF instructions: +MUBUF = { + (0x00, 0x00, 0x00, 0x00, 0x00, "buffer_load_format_x"), + (0x01, 0x01, 0x01, 0x01, 0x01, "buffer_load_format_xy"), + (0x02, 0x02, 0x02, 0x02, 0x02, "buffer_load_format_xyz"), + (0x03, 0x03, 0x03, 0x03, 0x03, "buffer_load_format_xyzw"), + (0x04, 0x04, 0x04, 0x04, 0x04, "buffer_store_format_x"), + (0x05, 0x05, 0x05, 0x05, 0x05, "buffer_store_format_xy"), + (0x06, 0x06, 0x06, 0x06, 0x06, "buffer_store_format_xyz"), + (0x07, 0x07, 0x07, 0x07, 0x07, "buffer_store_format_xyzw"), + ( -1, -1, 0x08, 0x08, 0x80, "buffer_load_format_d16_x"), + ( -1, -1, 0x09, 0x09, 0x81, "buffer_load_format_d16_xy"), + ( -1, -1, 0x0a, 0x0a, 0x82, "buffer_load_format_d16_xyz"), + ( -1, -1, 0x0b, 0x0b, 0x83, "buffer_load_format_d16_xyzw"), + ( -1, -1, 0x0c, 0x0c, 0x84, "buffer_store_format_d16_x"), + ( -1, -1, 0x0d, 0x0d, 0x85, "buffer_store_format_d16_xy"), + ( -1, -1, 0x0e, 0x0e, 0x86, "buffer_store_format_d16_xyz"), + ( -1, -1, 0x0f, 0x0f, 0x87, "buffer_store_format_d16_xyzw"), + (0x08, 0x08, 0x10, 0x10, 0x08, "buffer_load_ubyte"), + (0x09, 0x09, 0x11, 0x11, 0x09, "buffer_load_sbyte"), + (0x0a, 0x0a, 0x12, 0x12, 0x0a, "buffer_load_ushort"), + (0x0b, 0x0b, 0x13, 0x13, 0x0b, "buffer_load_sshort"), + (0x0c, 0x0c, 0x14, 0x14, 0x0c, "buffer_load_dword"), + (0x0d, 0x0d, 0x15, 0x15, 0x0d, "buffer_load_dwordx2"), + ( -1, 0x0f, 0x16, 0x16, 0x0f, 
"buffer_load_dwordx3"), + (0x0f, 0x0e, 0x17, 0x17, 0x0e, "buffer_load_dwordx4"), + (0x18, 0x18, 0x18, 0x18, 0x18, "buffer_store_byte"), + ( -1, -1, -1, 0x19, 0x19, "buffer_store_byte_d16_hi"), + (0x1a, 0x1a, 0x1a, 0x1a, 0x1a, "buffer_store_short"), + ( -1, -1, -1, 0x1b, 0x1b, "buffer_store_short_d16_hi"), + (0x1c, 0x1c, 0x1c, 0x1c, 0x1c, "buffer_store_dword"), + (0x1d, 0x1d, 0x1d, 0x1d, 0x1d, "buffer_store_dwordx2"), + ( -1, 0x1f, 0x1e, 0x1e, 0x1f, "buffer_store_dwordx3"), + (0x1e, 0x1e, 0x1f, 0x1f, 0x1e, "buffer_store_dwordx4"), + ( -1, -1, -1, 0x20, 0x20, "buffer_load_ubyte_d16"), + ( -1, -1, -1, 0x21, 0x21, "buffer_load_ubyte_d16_hi"), + ( -1, -1, -1, 0x22, 0x22, "buffer_load_sbyte_d16"), + ( -1, -1, -1, 0x23, 0x23, "buffer_load_sbyte_d16_hi"), + ( -1, -1, -1, 0x24, 0x24, "buffer_load_short_d16"), + ( -1, -1, -1, 0x25, 0x25, "buffer_load_short_d16_hi"), + ( -1, -1, -1, 0x26, 0x26, "buffer_load_format_d16_hi_x"), + ( -1, -1, -1, 0x27, 0x27, "buffer_store_format_d16_hi_x"), + ( -1, -1, 0x3d, 0x3d, -1, "buffer_store_lds_dword"), + (0x71, 0x71, 0x3e, 0x3e, -1, "buffer_wbinvl1"), + (0x70, 0x70, 0x3f, 0x3f, -1, "buffer_wbinvl1_vol"), + (0x30, 0x30, 0x40, 0x40, 0x30, "buffer_atomic_swap"), + (0x31, 0x31, 0x41, 0x41, 0x31, "buffer_atomic_cmpswap"), + (0x32, 0x32, 0x42, 0x42, 0x32, "buffer_atomic_add"), + (0x33, 0x33, 0x43, 0x43, 0x33, "buffer_atomic_sub"), + (0x34, -1, -1, -1, -1, "buffer_atomic_rsub"), + (0x35, 0x35, 0x44, 0x44, 0x35, "buffer_atomic_smin"), + (0x36, 0x36, 0x45, 0x45, 0x36, "buffer_atomic_umin"), + (0x37, 0x37, 0x46, 0x46, 0x37, "buffer_atomic_smax"), + (0x38, 0x38, 0x47, 0x47, 0x38, "buffer_atomic_umax"), + (0x39, 0x39, 0x48, 0x48, 0x39, "buffer_atomic_and"), + (0x3a, 0x3a, 0x49, 0x49, 0x3a, "buffer_atomic_or"), + (0x3b, 0x3b, 0x4a, 0x4a, 0x3b, "buffer_atomic_xor"), + (0x3c, 0x3c, 0x4b, 0x4b, 0x3c, "buffer_atomic_inc"), + (0x3d, 0x3d, 0x4c, 0x4c, 0x3d, "buffer_atomic_dec"), + (0x3e, 0x3e, -1, -1, 0x3e, "buffer_atomic_fcmpswap"), + (0x3f, 0x3f, -1, -1, 0x3f, "buffer_atomic_fmin"), + (0x40, 0x40, -1, -1, 0x40, "buffer_atomic_fmax"), + (0x50, 0x50, 0x60, 0x60, 0x50, "buffer_atomic_swap_x2"), + (0x51, 0x51, 0x61, 0x61, 0x51, "buffer_atomic_cmpswap_x2"), + (0x52, 0x52, 0x62, 0x62, 0x52, "buffer_atomic_add_x2"), + (0x53, 0x53, 0x63, 0x63, 0x53, "buffer_atomic_sub_x2"), + (0x54, -1, -1, -1, -1, "buffer_atomic_rsub_x2"), + (0x55, 0x55, 0x64, 0x64, 0x55, "buffer_atomic_smin_x2"), + (0x56, 0x56, 0x65, 0x65, 0x56, "buffer_atomic_umin_x2"), + (0x57, 0x57, 0x66, 0x66, 0x57, "buffer_atomic_smax_x2"), + (0x58, 0x58, 0x67, 0x67, 0x58, "buffer_atomic_umax_x2"), + (0x59, 0x59, 0x68, 0x68, 0x59, "buffer_atomic_and_x2"), + (0x5a, 0x5a, 0x69, 0x69, 0x5a, "buffer_atomic_or_x2"), + (0x5b, 0x5b, 0x6a, 0x6a, 0x5b, "buffer_atomic_xor_x2"), + (0x5c, 0x5c, 0x6b, 0x6b, 0x5c, "buffer_atomic_inc_x2"), + (0x5d, 0x5d, 0x6c, 0x6c, 0x5d, "buffer_atomic_dec_x2"), + (0x5e, 0x5e, -1, -1, 0x5e, "buffer_atomic_fcmpswap_x2"), + (0x5f, 0x5f, -1, -1, 0x5f, "buffer_atomic_fmin_x2"), + (0x60, 0x60, -1, -1, 0x60, "buffer_atomic_fmax_x2"), + ( -1, -1, -1, -1, 0x71, "buffer_gl0_inv"), + ( -1, -1, -1, -1, 0x72, "buffer_gl1_inv"), +} +for (gfx6, gfx7, gfx8, gfx9, gfx10, name) in MUBUF: + opcode(name, gfx9, gfx10, Format.MUBUF) + +MTBUF = { + (0x00, 0x00, 0x00, 0x00, 0x00, "tbuffer_load_format_x"), + (0x01, 0x01, 0x01, 0x01, 0x01, "tbuffer_load_format_xy"), + (0x02, 0x02, 0x02, 0x02, 0x02, "tbuffer_load_format_xyz"), + (0x03, 0x03, 0x03, 0x03, 0x03, "tbuffer_load_format_xyzw"), + (0x04, 0x04, 0x04, 0x04, 0x04, 
"tbuffer_store_format_x"), + (0x05, 0x05, 0x05, 0x05, 0x05, "tbuffer_store_format_xy"), + (0x06, 0x06, 0x06, 0x06, 0x06, "tbuffer_store_format_xyz"), + (0x07, 0x07, 0x07, 0x07, 0x07, "tbuffer_store_format_xyzw"), + ( -1, -1, 0x08, 0x08, 0x08, "tbuffer_load_format_d16_x"), + ( -1, -1, 0x09, 0x09, 0x09, "tbuffer_load_format_d16_xy"), + ( -1, -1, 0x0a, 0x0a, 0x0a, "tbuffer_load_format_d16_xyz"), + ( -1, -1, 0x0b, 0x0b, 0x0b, "tbuffer_load_format_d16_xyzw"), + ( -1, -1, 0x0c, 0x0c, 0x0c, "tbuffer_store_format_d16_x"), + ( -1, -1, 0x0d, 0x0d, 0x0d, "tbuffer_store_format_d16_xy"), + ( -1, -1, 0x0e, 0x0e, 0x0e, "tbuffer_store_format_d16_xyz"), + ( -1, -1, 0x0f, 0x0f, 0x0f, "tbuffer_store_format_d16_xyzw"), +} +for (gfx6, gfx7, gfx8, gfx9, gfx10, name) in MTBUF: + opcode(name, gfx9, gfx10, Format.MTBUF) + + +IMAGE = { + (0x00, "image_load"), + (0x01, "image_load_mip"), + (0x02, "image_load_pck"), + (0x03, "image_load_pck_sgn"), + (0x04, "image_load_mip_pck"), + (0x05, "image_load_mip_pck_sgn"), + (0x08, "image_store"), + (0x09, "image_store_mip"), + (0x0a, "image_store_pck"), + (0x0b, "image_store_mip_pck"), + (0x0e, "image_get_resinfo"), + (0x60, "image_get_lod"), +} +# (gfx6, gfx7, gfx8, gfx9, gfx10, name) = (code, code, code, code, code, name) +for (code, name) in IMAGE: + opcode(name, code, code, Format.MIMG) + +IMAGE_ATOMIC = { + (0x0f, 0x0f, 0x10, "image_atomic_swap"), + (0x10, 0x10, 0x11, "image_atomic_cmpswap"), + (0x11, 0x11, 0x12, "image_atomic_add"), + (0x12, 0x12, 0x13, "image_atomic_sub"), + (0x13, -1, -1, "image_atomic_rsub"), + (0x14, 0x14, 0x14, "image_atomic_smin"), + (0x15, 0x15, 0x15, "image_atomic_umin"), + (0x16, 0x16, 0x16, "image_atomic_smax"), + (0x17, 0x17, 0x17, "image_atomic_umax"), + (0x18, 0x18, 0x18, "image_atomic_and"), + (0x19, 0x19, 0x19, "image_atomic_or"), + (0x1a, 0x1a, 0x1a, "image_atomic_xor"), + (0x1b, 0x1b, 0x1b, "image_atomic_inc"), + (0x1c, 0x1c, 0x1c, "image_atomic_dec"), + (0x1d, 0x1d, -1, "image_atomic_fcmpswap"), + (0x1e, 0x1e, -1, "image_atomic_fmin"), + (0x1f, 0x1f, -1, "image_atomic_fmax"), +} +# (gfx6, gfx7, gfx8, gfx9, gfx10, name) = (gfx6, gfx7, gfx89, gfx89, ???, name) +# gfx7 and gfx10 opcodes are the same here +for (gfx6, gfx7, gfx89, name) in IMAGE_ATOMIC: + opcode(name, gfx89, gfx7, Format.MIMG) + +IMAGE_SAMPLE = { + (0x20, "image_sample"), + (0x21, "image_sample_cl"), + (0x22, "image_sample_d"), + (0x23, "image_sample_d_cl"), + (0x24, "image_sample_l"), + (0x25, "image_sample_b"), + (0x26, "image_sample_b_cl"), + (0x27, "image_sample_lz"), + (0x28, "image_sample_c"), + (0x29, "image_sample_c_cl"), + (0x2a, "image_sample_c_d"), + (0x2b, "image_sample_c_d_cl"), + (0x2c, "image_sample_c_l"), + (0x2d, "image_sample_c_b"), + (0x2e, "image_sample_c_b_cl"), + (0x2f, "image_sample_c_lz"), + (0x30, "image_sample_o"), + (0x31, "image_sample_cl_o"), + (0x32, "image_sample_d_o"), + (0x33, "image_sample_d_cl_o"), + (0x34, "image_sample_l_o"), + (0x35, "image_sample_b_o"), + (0x36, "image_sample_b_cl_o"), + (0x37, "image_sample_lz_o"), + (0x38, "image_sample_c_o"), + (0x39, "image_sample_c_cl_o"), + (0x3a, "image_sample_c_d_o"), + (0x3b, "image_sample_c_d_cl_o"), + (0x3c, "image_sample_c_l_o"), + (0x3d, "image_sample_c_b_o"), + (0x3e, "image_sample_c_b_cl_o"), + (0x3f, "image_sample_c_lz_o"), + (0x68, "image_sample_cd"), + (0x69, "image_sample_cd_cl"), + (0x6a, "image_sample_c_cd"), + (0x6b, "image_sample_c_cd_cl"), + (0x6c, "image_sample_cd_o"), + (0x6d, "image_sample_cd_cl_o"), + (0x6e, "image_sample_c_cd_o"), + (0x6f, "image_sample_c_cd_cl_o"), +} +# 
(gfx6, gfx7, gfx8, gfx9, gfx10, name) = (code, code, code, code, code, name) +for (code, name) in IMAGE_SAMPLE: + opcode(name, code, code, Format.MIMG) + +IMAGE_GATHER4 = { + (0x40, "image_gather4"), + (0x41, "image_gather4_cl"), + #(0x42, "image_gather4h"), VEGA only? + (0x44, "image_gather4_l"), # following instructions have different opcodes according to ISA sheet. + (0x45, "image_gather4_b"), + (0x46, "image_gather4_b_cl"), + (0x47, "image_gather4_lz"), + (0x48, "image_gather4_c"), + (0x49, "image_gather4_c_cl"), # previous instructions have different opcodes according to ISA sheet. + #(0x4a, "image_gather4h_pck"), VEGA only? + #(0x4b, "image_gather8h_pck"), VGEA only? + (0x4c, "image_gather4_c_l"), + (0x4d, "image_gather4_c_b"), + (0x4e, "image_gather4_c_b_cl"), + (0x4f, "image_gather4_c_lz"), + (0x50, "image_gather4_o"), + (0x51, "image_gather4_cl_o"), + (0x54, "image_gather4_l_o"), + (0x55, "image_gather4_b_o"), + (0x56, "image_gather4_b_cl_o"), + (0x57, "image_gather4_lz_o"), + (0x58, "image_gather4_c_o"), + (0x59, "image_gather4_c_cl_o"), + (0x5c, "image_gather4_c_l_o"), + (0x5d, "image_gather4_c_b_o"), + (0x5e, "image_gather4_c_b_cl_o"), + (0x5f, "image_gather4_c_lz_o"), +} +# (gfx6, gfx7, gfx8, gfx9, gfx10, name) = (code, code, code, code, code, name) +for (code, name) in IMAGE_GATHER4: + opcode(name, code, code, Format.MIMG) + + +FLAT = { + #GFX7, GFX8_9, GFX10 + (0x08, 0x10, 0x08, "flat_load_ubyte"), + (0x09, 0x11, 0x09, "flat_load_sbyte"), + (0x0a, 0x12, 0x0a, "flat_load_ushort"), + (0x0b, 0x13, 0x0b, "flat_load_sshort"), + (0x0c, 0x14, 0x0c, "flat_load_dword"), + (0x0d, 0x15, 0x0d, "flat_load_dwordx2"), + (0x0f, 0x16, 0x0f, "flat_load_dwordx3"), + (0x0e, 0x17, 0x0e, "flat_load_dwordx4"), + (0x18, 0x18, 0x18, "flat_store_byte"), + ( -1, 0x19, 0x19, "flat_store_byte_d16_hi"), + (0x1a, 0x1a, 0x1a, "flat_store_short"), + ( -1, 0x1b, 0x1b, "flat_store_short_d16_hi"), + (0x1c, 0x1c, 0x1c, "flat_store_dword"), + (0x1d, 0x1d, 0x1d, "flat_store_dwordx2"), + (0x1f, 0x1e, 0x1f, "flat_store_dwordx3"), + (0x1e, 0x1f, 0x1e, "flat_store_dwordx4"), + ( -1, 0x20, 0x20, "flat_load_ubyte_d16"), + ( -1, 0x21, 0x21, "flat_load_ubyte_d16_hi"), + ( -1, 0x22, 0x22, "flat_load_sbyte_d16"), + ( -1, 0x23, 0x23, "flat_load_sbyte_d16_hi"), + ( -1, 0x24, 0x24, "flat_load_short_d16"), + ( -1, 0x25, 0x25, "flat_load_short_d16_hi"), + (0x30, 0x40, 0x30, "flat_atomic_swap"), + (0x31, 0x41, 0x31, "flat_atomic_cmpswap"), + (0x32, 0x42, 0x32, "flat_atomic_add"), + (0x33, 0x43, 0x33, "flat_atomic_sub"), + (0x35, 0x44, 0x35, "flat_atomic_smin"), + (0x36, 0x45, 0x36, "flat_atomic_umin"), + (0x37, 0x46, 0x37, "flat_atomic_smax"), + (0x38, 0x47, 0x38, "flat_atomic_umax"), + (0x39, 0x48, 0x39, "flat_atomic_and"), + (0x3a, 0x49, 0x3a, "flat_atomic_or"), + (0x3b, 0x4a, 0x3b, "flat_atomic_xor"), + (0x3c, 0x4b, 0x3c, "flat_atomic_inc"), + (0x3d, 0x4c, 0x3d, "flat_atomic_dec"), + (0x3e, -1, 0x3e, "flat_atomic_fcmpswap"), + (0x3f, -1, 0x3f, "flat_atomic_fmin"), + (0x40, -1, 0x40, "flat_atomic_fmax"), + (0x50, 0x60, 0x50, "flat_atomic_swap_x2"), + (0x51, 0x61, 0x51, "flat_atomic_cmpswap_x2"), + (0x52, 0x62, 0x52, "flat_atomic_add_x2"), + (0x53, 0x63, 0x53, "flat_atomic_sub_x2"), + (0x55, 0x64, 0x54, "flat_atomic_smin_x2"), + (0x56, 0x65, 0x55, "flat_atomic_umin_x2"), + (0x57, 0x66, 0x56, "flat_atomic_smax_x2"), + (0x58, 0x67, 0x58, "flat_atomic_umax_x2"), + (0x59, 0x68, 0x59, "flat_atomic_and_x2"), + (0x5a, 0x69, 0x5a, "flat_atomic_or_x2"), + (0x5b, 0x6a, 0x5b, "flat_atomic_xor_x2"), + (0x5c, 0x6b, 0x5c, 
"flat_atomic_inc_x2"), + (0x5d, 0x6c, 0x5d, "flat_atomic_dec_x2"), + (0x5e, -1, 0x5e, "flat_atomic_fcmpswap_x2"), + (0x5f, -1, 0x5f, "flat_atomic_fmin_x2"), + (0x60, -1, 0x60, "flat_atomic_fmax_x2"), +} +for (gfx7, gfx8, gfx10, name) in FLAT: + opcode(name, gfx8, gfx10, Format.FLAT) + +GLOBAL = { + #GFX8_9, GFX10 + (0x10, 0x08, "global_load_ubyte"), + (0x11, 0x09, "global_load_sbyte"), + (0x12, 0x0a, "global_load_ushort"), + (0x13, 0x0b, "global_load_sshort"), + (0x14, 0x0c, "global_load_dword"), + (0x15, 0x0d, "global_load_dwordx2"), + (0x16, 0x0f, "global_load_dwordx3"), + (0x17, 0x0e, "global_load_dwordx4"), + (0x18, 0x18, "global_store_byte"), + (0x19, 0x19, "global_store_byte_d16_hi"), + (0x1a, 0x1a, "global_store_short"), + (0x1b, 0x1b, "global_store_short_d16_hi"), + (0x1c, 0x1c, "global_store_dword"), + (0x1d, 0x1d, "global_store_dwordx2"), + (0x1e, 0x1f, "global_store_dwordx3"), + (0x1f, 0x1e, "global_store_dwordx4"), + (0x20, 0x20, "global_load_ubyte_d16"), + (0x21, 0x21, "global_load_ubyte_d16_hi"), + (0x22, 0x22, "global_load_sbyte_d16"), + (0x23, 0x23, "global_load_sbyte_d16_hi"), + (0x24, 0x24, "global_load_short_d16"), + (0x25, 0x25, "global_load_short_d16_hi"), + (0x40, 0x30, "global_atomic_swap"), + (0x41, 0x31, "global_atomic_cmpswap"), + (0x42, 0x32, "global_atomic_add"), + (0x43, 0x33, "global_atomic_sub"), + (0x44, 0x35, "global_atomic_smin"), + (0x45, 0x36, "global_atomic_umin"), + (0x46, 0x37, "global_atomic_smax"), + (0x47, 0x38, "global_atomic_umax"), + (0x48, 0x39, "global_atomic_and"), + (0x49, 0x3a, "global_atomic_or"), + (0x4a, 0x3b, "global_atomic_xor"), + (0x4b, 0x3c, "global_atomic_inc"), + (0x4c, 0x3d, "global_atomic_dec"), + ( -1, 0x3e, "global_atomic_fcmpswap"), + ( -1, 0x3f, "global_atomic_fmin"), + ( -1, 0x40, "global_atomic_fmax"), + (0x60, 0x50, "global_atomic_swap_x2"), + (0x61, 0x51, "global_atomic_cmpswap_x2"), + (0x62, 0x52, "global_atomic_add_x2"), + (0x63, 0x53, "global_atomic_sub_x2"), + (0x64, 0x54, "global_atomic_smin_x2"), + (0x65, 0x55, "global_atomic_umin_x2"), + (0x66, 0x56, "global_atomic_smax_x2"), + (0x67, 0x58, "global_atomic_umax_x2"), + (0x68, 0x59, "global_atomic_and_x2"), + (0x69, 0x5a, "global_atomic_or_x2"), + (0x6a, 0x5b, "global_atomic_xor_x2"), + (0x6b, 0x5c, "global_atomic_inc_x2"), + (0x6c, 0x5d, "global_atomic_dec_x2"), + ( -1, 0x5e, "global_atomic_fcmpswap_x2"), + ( -1, 0x5f, "global_atomic_fmin_x2"), + ( -1, 0x60, "global_atomic_fmax_x2"), +} +for (gfx8, gfx10, name) in GLOBAL: + opcode(name, gfx8, gfx10, Format.GLOBAL) + +SCRATCH = { + #GFX8_9, GFX10 + (0x10, 0x08, "scratch_load_ubyte"), + (0x11, 0x09, "scratch_load_sbyte"), + (0x12, 0x0a, "scratch_load_ushort"), + (0x13, 0x0b, "scratch_load_sshort"), + (0x14, 0x0c, "scratch_load_dword"), + (0x15, 0x0d, "scratch_load_dwordx2"), + (0x16, 0x0f, "scratch_load_dwordx3"), + (0x17, 0x0e, "scratch_load_dwordx4"), + (0x18, 0x18, "scratch_store_byte"), + (0x19, 0x19, "scratch_store_byte_d16_hi"), + (0x1a, 0x1a, "scratch_store_short"), + (0x1b, 0x1b, "scratch_store_short_d16_hi"), + (0x1c, 0x1c, "scratch_store_dword"), + (0x1d, 0x1d, "scratch_store_dwordx2"), + (0x1e, 0x1f, "scratch_store_dwordx3"), + (0x1f, 0x1e, "scratch_store_dwordx4"), + (0x20, 0x20, "scratch_load_ubyte_d16"), + (0x21, 0x21, "scratch_load_ubyte_d16_hi"), + (0x22, 0x22, "scratch_load_sbyte_d16"), + (0x23, 0x23, "scratch_load_sbyte_d16_hi"), + (0x24, 0x24, "scratch_load_short_d16"), + (0x25, 0x25, "scratch_load_short_d16_hi"), +} +for (gfx8, gfx10, name) in SCRATCH: + opcode(name, gfx8, gfx10, Format.SCRATCH) 
diff --git a/src/amd/compiler/aco_opcodes_cpp.py b/src/amd/compiler/aco_opcodes_cpp.py new file mode 100644 index 00000000000..83c24e0eb44 --- /dev/null +++ b/src/amd/compiler/aco_opcodes_cpp.py @@ -0,0 +1,74 @@ + +template = """\ +/* + * Copyright (c) 2018 Valve Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + */ + +#include "aco_ir.h" + +namespace aco { + +const unsigned VOPC_to_GFX6[256] = { +% for code in VOPC_GFX6: + ${code}, +% endfor +}; + +<% +opcode_names = sorted(opcodes.keys()) +can_use_input_modifiers = "".join([opcodes[name].input_mod for name in reversed(opcode_names)]) +can_use_output_modifiers = "".join([opcodes[name].output_mod for name in reversed(opcode_names)]) +%> + +extern const aco::Info instr_info = { + .opcode_gfx9 = { + % for name in opcode_names: + ${opcodes[name].opcode_gfx9}, + % endfor + }, + .opcode_gfx10 = { + % for name in opcode_names: + ${opcodes[name].opcode_gfx10}, + % endfor + }, + .can_use_input_modifiers = std::bitset<${len(opcode_names)}>("${can_use_input_modifiers}"), + .can_use_output_modifiers = std::bitset<${len(opcode_names)}>("${can_use_output_modifiers}"), + .name = { + % for name in opcode_names: + "${name}", + % endfor + }, + .format = { + % for name in opcode_names: + aco::Format::${str(opcodes[name].format.name)}, + % endfor + }, +}; + +} +""" + +from aco_opcodes import opcodes, VOPC_GFX6 +from mako.template import Template + +print(Template(template).render(opcodes=opcodes, VOPC_GFX6=VOPC_GFX6)) diff --git a/src/amd/compiler/aco_opcodes_h.py b/src/amd/compiler/aco_opcodes_h.py new file mode 100644 index 00000000000..6b8bfc1ee07 --- /dev/null +++ b/src/amd/compiler/aco_opcodes_h.py @@ -0,0 +1,47 @@ + +template = """\ +/* + * Copyright (c) 2018 Valve Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + * Authors: + * Daniel Schuermann ([email protected]) + */ + +#ifndef _ACO_OPCODES_ +#define _ACO_OPCODES_ + +<% opcode_names = sorted(opcodes.keys()) %> + +enum class aco_opcode : std::uint16_t { +% for name in opcode_names: + ${name}, +% endfor + last_opcode = ${opcode_names[-1]}, + num_opcodes = last_opcode + 1 +}; + +#endif /* _ACO_OPCODES_ */""" + +from aco_opcodes import opcodes +from mako.template import Template + +print(Template(template).render(opcodes=opcodes)) diff --git a/src/amd/compiler/aco_opt_value_numbering.cpp b/src/amd/compiler/aco_opt_value_numbering.cpp new file mode 100644 index 00000000000..8071ace1f97 --- /dev/null +++ b/src/amd/compiler/aco_opt_value_numbering.cpp @@ -0,0 +1,327 @@ +/* + * Copyright © 2018 Valve Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + */ + +#include <map> +#include <unordered_set> + +#include "aco_ir.h" + +/* + * Implements the algorithm for dominator-tree value numbering + * from "Value Numbering" by Briggs, Cooper, and Simpson. + */ + +namespace aco { +namespace { + +struct InstrHash { + std::size_t operator()(Instruction* instr) const + { + uint64_t hash = (uint64_t) instr->opcode + (uint64_t) instr->format; + for (unsigned i = 0; i < instr->operands.size(); i++) { + Operand op = instr->operands[i]; + uint64_t val = op.isTemp() ? op.tempId() : op.isFixed() ? 
op.physReg() : op.constantValue(); + hash |= val << (i+1) * 8; + } + if (instr->isVOP3()) { + VOP3A_instruction* vop3 = static_cast<VOP3A_instruction*>(instr); + for (unsigned i = 0; i < 3; i++) { + hash ^= vop3->abs[i] << (i*3 + 0); + hash ^= vop3->opsel[i] << (i*3 + 1); + hash ^= vop3->neg[i] << (i*3 + 2); + } + hash ^= (vop3->clamp << 28) * 13; + hash += vop3->omod << 19; + } + switch (instr->format) { + case Format::SMEM: + break; + case Format::VINTRP: { + Interp_instruction* interp = static_cast<Interp_instruction*>(instr); + hash ^= interp->attribute << 13; + hash ^= interp->component << 27; + break; + } + case Format::DS: + break; + default: + break; + } + + return hash; + } +}; + +struct InstrPred { + bool operator()(Instruction* a, Instruction* b) const + { + if (a->format != b->format) + return false; + if (a->opcode != b->opcode) + return false; + if (a->operands.size() != b->operands.size() || a->definitions.size() != b->definitions.size()) + return false; /* possible with pseudo-instructions */ + for (unsigned i = 0; i < a->operands.size(); i++) { + if (a->operands[i].isConstant()) { + if (!b->operands[i].isConstant()) + return false; + if (a->operands[i].constantValue() != b->operands[i].constantValue()) + return false; + } + else if (a->operands[i].isTemp()) { + if (!b->operands[i].isTemp()) + return false; + if (a->operands[i].tempId() != b->operands[i].tempId()) + return false; + } + else if (a->operands[i].isUndefined() ^ b->operands[i].isUndefined()) + return false; + if (a->operands[i].isFixed()) { + if (a->operands[i].physReg() == exec) + return false; + if (!b->operands[i].isFixed()) + return false; + if (!(a->operands[i].physReg() == b->operands[i].physReg())) + return false; + } + } + for (unsigned i = 0; i < a->definitions.size(); i++) { + if (a->definitions[i].isTemp()) { + if (!b->definitions[i].isTemp()) + return false; + if (a->definitions[i].regClass() != b->definitions[i].regClass()) + return false; + } + if (a->definitions[i].isFixed()) { + if (!b->definitions[i].isFixed()) + return false; + if (!(a->definitions[i].physReg() == b->definitions[i].physReg())) + return false; + } + } + if (a->format == Format::PSEUDO_BRANCH) + return false; + if (a->isVOP3()) { + VOP3A_instruction* a3 = static_cast<VOP3A_instruction*>(a); + VOP3A_instruction* b3 = static_cast<VOP3A_instruction*>(b); + for (unsigned i = 0; i < 3; i++) { + if (a3->abs[i] != b3->abs[i] || + a3->opsel[i] != b3->opsel[i] || + a3->neg[i] != b3->neg[i]) + return false; + } + return a3->clamp == b3->clamp && + a3->omod == b3->omod; + } + if (a->isDPP()) { + DPP_instruction* aDPP = static_cast<DPP_instruction*>(a); + DPP_instruction* bDPP = static_cast<DPP_instruction*>(b); + return aDPP->dpp_ctrl == bDPP->dpp_ctrl && + aDPP->bank_mask == bDPP->bank_mask && + aDPP->row_mask == bDPP->row_mask && + aDPP->bound_ctrl == bDPP->bound_ctrl && + aDPP->abs[0] == bDPP->abs[0] && + aDPP->abs[1] == bDPP->abs[1] && + aDPP->neg[0] == bDPP->neg[0] && + aDPP->neg[1] == bDPP->neg[1]; + } + switch (a->format) { + case Format::VOPC: { + /* Since the results depend on the exec mask, these shouldn't + * be value numbered (this is especially useful for subgroupBallot()). 
*/ + return false; + } + case Format::SOPK: { + SOPK_instruction* aK = static_cast<SOPK_instruction*>(a); + SOPK_instruction* bK = static_cast<SOPK_instruction*>(b); + return aK->imm == bK->imm; + } + case Format::SMEM: { + SMEM_instruction* aS = static_cast<SMEM_instruction*>(a); + SMEM_instruction* bS = static_cast<SMEM_instruction*>(b); + return aS->can_reorder && bS->can_reorder && + aS->glc == bS->glc && aS->nv == bS->nv; + } + case Format::VINTRP: { + Interp_instruction* aI = static_cast<Interp_instruction*>(a); + Interp_instruction* bI = static_cast<Interp_instruction*>(b); + if (aI->attribute != bI->attribute) + return false; + if (aI->component != bI->component) + return false; + return true; + } + case Format::PSEUDO_REDUCTION: + return false; + case Format::MTBUF: { + /* this is fine since they are only used for vertex input fetches */ + MTBUF_instruction* aM = static_cast<MTBUF_instruction *>(a); + MTBUF_instruction* bM = static_cast<MTBUF_instruction *>(b); + return aM->dfmt == bM->dfmt && + aM->nfmt == bM->nfmt && + aM->offset == bM->offset && + aM->offen == bM->offen && + aM->idxen == bM->idxen && + aM->glc == bM->glc && + aM->slc == bM->slc && + aM->tfe == bM->tfe && + aM->disable_wqm == bM->disable_wqm; + } + /* we want to optimize these in NIR and don't hassle with load-store dependencies */ + case Format::MUBUF: + case Format::FLAT: + case Format::GLOBAL: + case Format::SCRATCH: + case Format::DS: + return false; + case Format::MIMG: { + MIMG_instruction* aM = static_cast<MIMG_instruction*>(a); + MIMG_instruction* bM = static_cast<MIMG_instruction*>(b); + return aM->can_reorder && bM->can_reorder && + aM->dmask == bM->dmask && + aM->unrm == bM->unrm && + aM->glc == bM->glc && + aM->slc == bM->slc && + aM->tfe == bM->tfe && + aM->da == bM->da && + aM->lwe == bM->lwe && + aM->r128 == bM->r128 && + aM->a16 == bM->a16 && + aM->d16 == bM->d16 && + aM->disable_wqm == bM->disable_wqm; + } + default: + return true; + } + } +}; + + +typedef std::unordered_set<Instruction*, InstrHash, InstrPred> expr_set; + +void process_block(Block& block, + expr_set& expr_values, + std::map<uint32_t, Temp>& renames) +{ + bool run = false; + std::vector<aco_ptr<Instruction>>::iterator it = block.instructions.begin(); + std::vector<aco_ptr<Instruction>> new_instructions; + new_instructions.reserve(block.instructions.size()); + expr_set phi_values; + + while (it != block.instructions.end()) { + aco_ptr<Instruction>& instr = *it; + /* first, rename operands */ + for (Operand& op : instr->operands) { + if (!op.isTemp()) + continue; + auto it = renames.find(op.tempId()); + if (it != renames.end()) + op.setTemp(it->second); + } + + if (instr->definitions.empty() || !run) { + if (instr->opcode == aco_opcode::p_logical_start) + run = true; + else if (instr->opcode == aco_opcode::p_logical_end) + run = false; + else if (instr->opcode == aco_opcode::p_phi || instr->opcode == aco_opcode::p_linear_phi) { + std::pair<expr_set::iterator, bool> res = phi_values.emplace(instr.get()); + if (!res.second) { + Instruction* orig_phi = *(res.first); + renames.emplace(instr->definitions[0].tempId(), orig_phi->definitions[0].getTemp()).second; + ++it; + continue; + } + } + new_instructions.emplace_back(std::move(instr)); + ++it; + continue; + } + + /* simple copy-propagation through renaming */ + if ((instr->opcode == aco_opcode::s_mov_b32 || instr->opcode == aco_opcode::s_mov_b64 || instr->opcode == aco_opcode::v_mov_b32) && + !instr->definitions[0].isFixed() && instr->operands[0].isTemp() && 
instr->operands[0].regClass() == instr->definitions[0].regClass() && + !instr->isDPP() && !((int)instr->format & (int)Format::SDWA)) { + renames[instr->definitions[0].tempId()] = instr->operands[0].getTemp(); + } + + std::pair<expr_set::iterator, bool> res = expr_values.emplace(instr.get()); + + /* if there was already an expression with the same value number */ + if (!res.second) { + Instruction* orig_instr = *(res.first); + assert(instr->definitions.size() == orig_instr->definitions.size()); + for (unsigned i = 0; i < instr->definitions.size(); i++) { + assert(instr->definitions[i].regClass() == orig_instr->definitions[i].regClass()); + renames.emplace(instr->definitions[i].tempId(), orig_instr->definitions[i].getTemp()).second; + } + } else { + new_instructions.emplace_back(std::move(instr)); + } + ++it; + } + + block.instructions.swap(new_instructions); +} + +void rename_phi_operands(Block& block, std::map<uint32_t, Temp>& renames) +{ + for (aco_ptr<Instruction>& phi : block.instructions) { + if (phi->opcode != aco_opcode::p_phi && phi->opcode != aco_opcode::p_linear_phi) + break; + + for (Operand& op : phi->operands) { + if (!op.isTemp()) + continue; + auto it = renames.find(op.tempId()); + if (it != renames.end()) + op.setTemp(it->second); + } + } +} +} /* end namespace */ + + +void value_numbering(Program* program) +{ + std::vector<expr_set> expr_values(program->blocks.size()); + std::map<uint32_t, Temp> renames; + + for (Block& block : program->blocks) { + if (block.logical_idom != -1) { + /* initialize expr_values from idom */ + expr_values[block.index] = expr_values[block.logical_idom]; + process_block(block, expr_values[block.index], renames); + } else { + expr_set empty; + process_block(block, empty, renames); + } + } + + for (Block& block : program->blocks) + rename_phi_operands(block, renames); +} + +} diff --git a/src/amd/compiler/aco_optimizer.cpp b/src/amd/compiler/aco_optimizer.cpp new file mode 100644 index 00000000000..fe05634c280 --- /dev/null +++ b/src/amd/compiler/aco_optimizer.cpp @@ -0,0 +1,2401 @@ +/* + * Copyright © 2018 Valve Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ * + * Authors: + * Daniel Schürmann ([email protected]) + * + */ + +#include <algorithm> +#include <math.h> + +#include "aco_ir.h" +#include "util/half_float.h" +#include "util/u_math.h" + +namespace aco { + +/** + * The optimizer works in 4 phases: + * (1) The first pass collects information for each ssa-def, + * propagates reg->reg operands of the same type, inline constants + * and neg/abs input modifiers. + * (2) The second pass combines instructions like mad, omod, clamp and + * propagates sgpr's on VALU instructions. + * This pass depends on information collected in the first pass. + * (3) The third pass goes backwards, and selects instructions, + * i.e. decides if a mad instruction is profitable and eliminates dead code. + * (4) The fourth pass cleans up the sequence: literals get applied and dead + * instructions are removed from the sequence. + */ + + +struct mad_info { + aco_ptr<Instruction> add_instr; + uint32_t mul_temp_id; + uint32_t literal_idx; + bool needs_vop3; + bool check_literal; + + mad_info(aco_ptr<Instruction> instr, uint32_t id, bool vop3) + : add_instr(std::move(instr)), mul_temp_id(id), needs_vop3(vop3), check_literal(false) {} +}; + +enum Label { + label_vec = 1 << 0, + label_constant = 1 << 1, + label_abs = 1 << 2, + label_neg = 1 << 3, + label_mul = 1 << 4, + label_temp = 1 << 5, + label_literal = 1 << 6, + label_mad = 1 << 7, + label_omod2 = 1 << 8, + label_omod4 = 1 << 9, + label_omod5 = 1 << 10, + label_omod_success = 1 << 11, + label_clamp = 1 << 12, + label_clamp_success = 1 << 13, + label_undefined = 1 << 14, + label_vcc = 1 << 15, + label_b2f = 1 << 16, + label_add_sub = 1 << 17, + label_bitwise = 1 << 18, + label_minmax = 1 << 19, + label_fcmp = 1 << 20, +}; + +static constexpr uint32_t instr_labels = label_vec | label_mul | label_mad | label_omod_success | label_clamp_success | label_add_sub | label_bitwise | label_minmax | label_fcmp; +static constexpr uint32_t temp_labels = label_abs | label_neg | label_temp | label_vcc | label_b2f; +static constexpr uint32_t val_labels = label_constant | label_literal | label_mad; + +struct ssa_info { + uint32_t val; + union { + Temp temp; + Instruction* instr; + }; + uint32_t label; + + void add_label(Label new_label) + { + /* Since all labels which use "instr" use it for the same thing + * (indicating the defining instruction), there is no need to clear + * any other instr labels. 
*/ + if (new_label & instr_labels) + label &= ~temp_labels; /* instr and temp alias */ + + if (new_label & temp_labels) { + label &= ~temp_labels; + label &= ~instr_labels; /* instr and temp alias */ + } + + if (new_label & val_labels) + label &= ~val_labels; + + label |= new_label; + } + + void set_vec(Instruction* vec) + { + add_label(label_vec); + instr = vec; + } + + bool is_vec() + { + return label & label_vec; + } + + void set_constant(uint32_t constant) + { + add_label(label_constant); + val = constant; + } + + bool is_constant() + { + return label & label_constant; + } + + void set_abs(Temp abs_temp) + { + add_label(label_abs); + temp = abs_temp; + } + + bool is_abs() + { + return label & label_abs; + } + + void set_neg(Temp neg_temp) + { + add_label(label_neg); + temp = neg_temp; + } + + bool is_neg() + { + return label & label_neg; + } + + void set_neg_abs(Temp neg_abs_temp) + { + add_label((Label)((uint32_t)label_abs | (uint32_t)label_neg)); + temp = neg_abs_temp; + } + + void set_mul(Instruction* mul) + { + add_label(label_mul); + instr = mul; + } + + bool is_mul() + { + return label & label_mul; + } + + void set_temp(Temp tmp) + { + add_label(label_temp); + temp = tmp; + } + + bool is_temp() + { + return label & label_temp; + } + + void set_literal(uint32_t lit) + { + add_label(label_literal); + val = lit; + } + + bool is_literal() + { + return label & label_literal; + } + + void set_mad(Instruction* mad, uint32_t mad_info_idx) + { + add_label(label_mad); + val = mad_info_idx; + instr = mad; + } + + bool is_mad() + { + return label & label_mad; + } + + void set_omod2() + { + add_label(label_omod2); + } + + bool is_omod2() + { + return label & label_omod2; + } + + void set_omod4() + { + add_label(label_omod4); + } + + bool is_omod4() + { + return label & label_omod4; + } + + void set_omod5() + { + add_label(label_omod5); + } + + bool is_omod5() + { + return label & label_omod5; + } + + void set_omod_success(Instruction* omod_instr) + { + add_label(label_omod_success); + instr = omod_instr; + } + + bool is_omod_success() + { + return label & label_omod_success; + } + + void set_clamp() + { + add_label(label_clamp); + } + + bool is_clamp() + { + return label & label_clamp; + } + + void set_clamp_success(Instruction* clamp_instr) + { + add_label(label_clamp_success); + instr = clamp_instr; + } + + bool is_clamp_success() + { + return label & label_clamp_success; + } + + void set_undefined() + { + add_label(label_undefined); + } + + bool is_undefined() + { + return label & label_undefined; + } + + void set_vcc(Temp vcc) + { + add_label(label_vcc); + temp = vcc; + } + + bool is_vcc() + { + return label & label_vcc; + } + + bool is_constant_or_literal() + { + return is_constant() || is_literal(); + } + + void set_b2f(Temp val) + { + add_label(label_b2f); + temp = val; + } + + bool is_b2f() + { + return label & label_b2f; + } + + void set_add_sub(Instruction *add_sub_instr) + { + add_label(label_add_sub); + instr = add_sub_instr; + } + + bool is_add_sub() + { + return label & label_add_sub; + } + + void set_bitwise(Instruction *bitwise_instr) + { + add_label(label_bitwise); + instr = bitwise_instr; + } + + bool is_bitwise() + { + return label & label_bitwise; + } + + void set_minmax(Instruction *minmax_instr) + { + add_label(label_minmax); + instr = minmax_instr; + } + + bool is_minmax() + { + return label & label_minmax; + } + + void set_fcmp(Instruction *fcmp_instr) + { + add_label(label_fcmp); + instr = fcmp_instr; + } + + bool is_fcmp() + { + return label & label_fcmp; + } + +}; + 
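The `ssa_info` struct above attaches facts ("labels") to an SSA definition: the `label` bitfield says which facts are set, while the payload lives either in `val` or in a union shared by `temp` and `instr`. Since several labels interpret that shared storage differently, `add_label()` groups them (`instr_labels`, `temp_labels`, `val_labels`) and clears whichever group would otherwise alias stale data. A standalone toy version of that rule, reduced to two labels (the types and names below are illustrative, not the actual aco ones):

```cpp
#include <cassert>
#include <cstdint>

/* Toy reduction of the labeling scheme used by ssa_info: label_mul stores a
 * pointer in the shared union, label_temp stores an id, so setting a label
 * from one group has to drop labels from the other group or is_mul()/is_temp()
 * would read reinterpreted, stale data. The bit values match the Label enum
 * above; everything else is an illustrative stand-in. */
enum Label : uint32_t {
   label_mul  = 1u << 4, /* payload: the defining instruction (pointer) */
   label_temp = 1u << 5, /* payload: the temp this value is a copy of   */
};
constexpr uint32_t instr_labels = label_mul;
constexpr uint32_t temp_labels  = label_temp;

struct toy_ssa_info {
   union { const void *instr; uint32_t temp; };
   uint32_t label;

   void add_label(Label new_label)
   {
      if (new_label & instr_labels)
         label &= ~temp_labels;                   /* instr and temp alias */
      if (new_label & temp_labels)
         label &= ~(temp_labels | instr_labels);
      label |= new_label;
   }
};

int main()
{
   toy_ssa_info info;
   info.label = 0;

   int some_mul = 0;
   info.add_label(label_mul);
   info.instr = &some_mul;      /* "defined by this multiply" */

   info.add_label(label_temp);
   info.temp = 42;              /* later turned into "is a copy of temp #42" */

   assert(!(info.label & label_mul)); /* the stale instr label was cleared */
   assert(info.label & label_temp);
   return 0;
}
```

In the real struct the same clearing also happens for `val_labels` against `val`, and setters such as `set_temp()` or `set_mul()` are thin wrappers that call `add_label()` and then store the payload.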
+struct opt_ctx { + Program* program; + std::vector<aco_ptr<Instruction>> instructions; + ssa_info* info; + std::pair<uint32_t,Temp> last_literal; + std::vector<mad_info> mad_infos; + std::vector<uint16_t> uses; +}; + +bool can_swap_operands(aco_ptr<Instruction>& instr) +{ + if (instr->operands[0].isConstant() || + (instr->operands[0].isTemp() && instr->operands[0].getTemp().type() == RegType::sgpr)) + return false; + + switch (instr->opcode) { + case aco_opcode::v_add_f32: + case aco_opcode::v_mul_f32: + case aco_opcode::v_or_b32: + case aco_opcode::v_and_b32: + case aco_opcode::v_xor_b32: + case aco_opcode::v_max_f32: + case aco_opcode::v_min_f32: + case aco_opcode::v_cmp_eq_f32: + case aco_opcode::v_cmp_lg_f32: + return true; + case aco_opcode::v_sub_f32: + instr->opcode = aco_opcode::v_subrev_f32; + return true; + case aco_opcode::v_cmp_lt_f32: + instr->opcode = aco_opcode::v_cmp_gt_f32; + return true; + case aco_opcode::v_cmp_ge_f32: + instr->opcode = aco_opcode::v_cmp_le_f32; + return true; + case aco_opcode::v_cmp_lt_i32: + instr->opcode = aco_opcode::v_cmp_gt_i32; + return true; + default: + return false; + } +} + +bool can_use_VOP3(aco_ptr<Instruction>& instr) +{ + if (instr->operands.size() && instr->operands[0].isLiteral()) + return false; + + if (instr->isDPP() || instr->isSDWA()) + return false; + + return instr->opcode != aco_opcode::v_madmk_f32 && + instr->opcode != aco_opcode::v_madak_f32 && + instr->opcode != aco_opcode::v_madmk_f16 && + instr->opcode != aco_opcode::v_madak_f16; +} + +void to_VOP3(opt_ctx& ctx, aco_ptr<Instruction>& instr) +{ + if (instr->isVOP3()) + return; + + assert(!instr->operands[0].isLiteral()); + aco_ptr<Instruction> tmp = std::move(instr); + Format format = asVOP3(tmp->format); + instr.reset(create_instruction<VOP3A_instruction>(tmp->opcode, format, tmp->operands.size(), tmp->definitions.size())); + std::copy(tmp->operands.cbegin(), tmp->operands.cend(), instr->operands.begin()); + for (unsigned i = 0; i < instr->definitions.size(); i++) { + instr->definitions[i] = tmp->definitions[i]; + if (instr->definitions[i].isTemp()) { + ssa_info& info = ctx.info[instr->definitions[i].tempId()]; + if (info.label & instr_labels && info.instr == tmp.get()) + info.instr = instr.get(); + } + } +} + +bool valu_can_accept_literal(opt_ctx& ctx, aco_ptr<Instruction>& instr) +{ + // TODO: VOP3 can take a literal on GFX10 + return !instr->isSDWA() && !instr->isDPP() && !instr->isVOP3(); +} + +/* only covers special cases */ +bool can_accept_constant(aco_ptr<Instruction>& instr, unsigned operand) +{ + switch (instr->opcode) { + case aco_opcode::v_interp_p2_f32: + case aco_opcode::v_mac_f32: + case aco_opcode::v_writelane_b32: + case aco_opcode::v_cndmask_b32: + return operand != 2; + case aco_opcode::s_addk_i32: + case aco_opcode::s_mulk_i32: + case aco_opcode::p_wqm: + case aco_opcode::p_extract_vector: + case aco_opcode::p_split_vector: + return operand != 0; + default: + if ((instr->format == Format::MUBUF || + instr->format == Format::MIMG) && + instr->definitions.size() == 1 && + instr->operands.size() == 4) { + return operand != 3; + } + return true; + } +} + +bool parse_base_offset(opt_ctx &ctx, Instruction* instr, unsigned op_index, Temp *base, uint32_t *offset) +{ + Operand op = instr->operands[op_index]; + + if (!op.isTemp()) + return false; + Temp tmp = op.getTemp(); + if (!ctx.info[tmp.id()].is_add_sub()) + return false; + + Instruction *add_instr = ctx.info[tmp.id()].instr; + + switch (add_instr->opcode) { + case aco_opcode::v_add_u32: + case 
aco_opcode::v_add_co_u32: + case aco_opcode::s_add_i32: + case aco_opcode::s_add_u32: + break; + default: + return false; + } + + for (unsigned i = 0; i < 2; i++) { + if (add_instr->operands[i].isConstant()) { + *offset = add_instr->operands[i].constantValue(); + } else if (add_instr->operands[i].isTemp() && + ctx.info[add_instr->operands[i].tempId()].is_constant_or_literal()) { + *offset = ctx.info[add_instr->operands[i].tempId()].val; + } else { + continue; + } + if (!add_instr->operands[!i].isTemp()) + continue; + + uint32_t offset2 = 0; + if (parse_base_offset(ctx, add_instr, !i, base, &offset2)) { + *offset += offset2; + } else { + *base = add_instr->operands[!i].getTemp(); + } + return true; + } + + return false; +} + +void label_instruction(opt_ctx &ctx, aco_ptr<Instruction>& instr) +{ + if (instr->isSALU() || instr->isVALU() || instr->format == Format::PSEUDO) { + ASSERTED bool all_const = false; + for (Operand& op : instr->operands) + all_const = all_const && (!op.isTemp() || ctx.info[op.tempId()].is_constant_or_literal()); + perfwarn(all_const, "All instruction operands are constant", instr.get()); + } + + for (unsigned i = 0; i < instr->operands.size(); i++) + { + if (!instr->operands[i].isTemp()) + continue; + + ssa_info info = ctx.info[instr->operands[i].tempId()]; + /* propagate undef */ + if (info.is_undefined() && is_phi(instr)) + instr->operands[i] = Operand(instr->operands[i].regClass()); + /* propagate reg->reg of same type */ + if (info.is_temp() && info.temp.regClass() == instr->operands[i].getTemp().regClass()) { + instr->operands[i].setTemp(ctx.info[instr->operands[i].tempId()].temp); + info = ctx.info[info.temp.id()]; + } + + /* SALU / PSEUDO: propagate inline constants */ + if (instr->isSALU() || instr->format == Format::PSEUDO) { + if (info.is_temp() && info.temp.type() == RegType::sgpr) { + instr->operands[i].setTemp(info.temp); + info = ctx.info[info.temp.id()]; + } else if (info.is_temp() && info.temp.type() == RegType::vgpr) { + /* propagate vgpr if it can take it */ + switch (instr->opcode) { + case aco_opcode::p_create_vector: + case aco_opcode::p_split_vector: + case aco_opcode::p_extract_vector: + case aco_opcode::p_phi: { + const bool all_vgpr = std::none_of(instr->definitions.begin(), instr->definitions.end(), + [] (const Definition& def) { return def.getTemp().type() != RegType::vgpr;}); + if (all_vgpr) { + instr->operands[i] = Operand(info.temp); + info = ctx.info[info.temp.id()]; + } + break; + } + default: + break; + } + } + if ((info.is_constant() || (info.is_literal() && instr->format == Format::PSEUDO)) && !instr->operands[i].isFixed() && can_accept_constant(instr, i)) { + instr->operands[i] = Operand(info.val); + continue; + } + } + + /* VALU: propagate neg, abs & inline constants */ + else if (instr->isVALU()) { + if (info.is_temp() && info.temp.type() == RegType::vgpr) { + instr->operands[i].setTemp(info.temp); + info = ctx.info[info.temp.id()]; + } + if (info.is_abs() && (can_use_VOP3(instr) || instr->isDPP()) && instr_info.can_use_input_modifiers[(int)instr->opcode]) { + if (!instr->isDPP()) + to_VOP3(ctx, instr); + instr->operands[i] = Operand(info.temp); + if (instr->isDPP()) + static_cast<DPP_instruction*>(instr.get())->abs[i] = true; + else + static_cast<VOP3A_instruction*>(instr.get())->abs[i] = true; + } + if (info.is_neg() && instr->opcode == aco_opcode::v_add_f32) { + instr->opcode = i ? 
aco_opcode::v_sub_f32 : aco_opcode::v_subrev_f32; + instr->operands[i].setTemp(info.temp); + continue; + } else if (info.is_neg() && (can_use_VOP3(instr) || instr->isDPP()) && instr_info.can_use_input_modifiers[(int)instr->opcode]) { + if (!instr->isDPP()) + to_VOP3(ctx, instr); + instr->operands[i].setTemp(info.temp); + if (instr->isDPP()) + static_cast<DPP_instruction*>(instr.get())->neg[i] = true; + else + static_cast<VOP3A_instruction*>(instr.get())->neg[i] = true; + continue; + } + if (info.is_constant() && can_accept_constant(instr, i)) { + perfwarn(instr->opcode == aco_opcode::v_cndmask_b32 && i == 2, "v_cndmask_b32 with a constant selector", instr.get()); + if (i == 0) { + instr->operands[i] = Operand(info.val); + continue; + } else if (!instr->isVOP3() && can_swap_operands(instr)) { + instr->operands[i] = instr->operands[0]; + instr->operands[0] = Operand(info.val); + continue; + } else if (can_use_VOP3(instr)) { + to_VOP3(ctx, instr); + instr->operands[i] = Operand(info.val); + continue; + } + } + } + + /* MUBUF: propagate constants and combine additions */ + else if (instr->format == Format::MUBUF) { + MUBUF_instruction *mubuf = static_cast<MUBUF_instruction *>(instr.get()); + Temp base; + uint32_t offset; + while (info.is_temp()) + info = ctx.info[info.temp.id()]; + + if (mubuf->offen && i == 0 && info.is_constant_or_literal() && mubuf->offset + info.val < 4096) { + assert(!mubuf->idxen); + instr->operands[i] = Operand(v1); + mubuf->offset += info.val; + mubuf->offen = false; + continue; + } else if (i == 2 && info.is_constant_or_literal() && mubuf->offset + info.val < 4096) { + instr->operands[2] = Operand((uint32_t) 0); + mubuf->offset += info.val; + continue; + } else if (mubuf->offen && i == 0 && parse_base_offset(ctx, instr.get(), i, &base, &offset) && base.regClass() == v1 && mubuf->offset + offset < 4096) { + assert(!mubuf->idxen); + instr->operands[i].setTemp(base); + mubuf->offset += offset; + continue; + } else if (i == 2 && parse_base_offset(ctx, instr.get(), i, &base, &offset) && base.regClass() == s1 && mubuf->offset + offset < 4096) { + instr->operands[i].setTemp(base); + mubuf->offset += offset; + continue; + } + } + + /* DS: combine additions */ + else if (instr->format == Format::DS) { + + DS_instruction *ds = static_cast<DS_instruction *>(instr.get()); + Temp base; + uint32_t offset; + if (i == 0 && parse_base_offset(ctx, instr.get(), i, &base, &offset) && base.regClass() == instr->operands[i].regClass()) { + if (instr->opcode == aco_opcode::ds_write2_b32 || instr->opcode == aco_opcode::ds_read2_b32) { + if (offset % 4 == 0 && + ds->offset0 + (offset >> 2) <= 255 && + ds->offset1 + (offset >> 2) <= 255) { + instr->operands[i].setTemp(base); + ds->offset0 += offset >> 2; + ds->offset1 += offset >> 2; + } + } else { + if (ds->offset0 + offset <= 65535) { + instr->operands[i].setTemp(base); + ds->offset0 += offset; + } + } + } + } + + /* SMEM: propagate constants and combine additions */ + else if (instr->format == Format::SMEM) { + + SMEM_instruction *smem = static_cast<SMEM_instruction *>(instr.get()); + Temp base; + uint32_t offset; + if (i == 1 && info.is_constant_or_literal() && info.val <= 0xFFFFF) { + instr->operands[i] = Operand(info.val); + continue; + } else if (i == 1 && parse_base_offset(ctx, instr.get(), i, &base, &offset) && base.regClass() == s1 && offset <= 0xFFFFF && ctx.program->chip_class >= GFX9) { + bool soe = smem->operands.size() >= (!smem->definitions.empty() ? 
3 : 4); + if (soe && + (!ctx.info[smem->operands.back().tempId()].is_constant_or_literal() || + ctx.info[smem->operands.back().tempId()].val != 0)) { + continue; + } + if (soe) { + smem->operands[1] = Operand(offset); + smem->operands.back() = Operand(base); + } else { + SMEM_instruction *new_instr = create_instruction<SMEM_instruction>(smem->opcode, Format::SMEM, smem->operands.size() + 1, smem->definitions.size()); + new_instr->operands[0] = smem->operands[0]; + new_instr->operands[1] = Operand(offset); + if (smem->definitions.empty()) + new_instr->operands[2] = smem->operands[2]; + new_instr->operands.back() = Operand(base); + if (!smem->definitions.empty()) + new_instr->definitions[0] = smem->definitions[0]; + instr.reset(new_instr); + smem = static_cast<SMEM_instruction *>(instr.get()); + } + continue; + } + } + } + + /* if this instruction doesn't define anything, return */ + if (instr->definitions.empty()) + return; + + switch (instr->opcode) { + case aco_opcode::p_create_vector: { + unsigned num_ops = instr->operands.size(); + for (const Operand& op : instr->operands) { + if (op.isTemp() && ctx.info[op.tempId()].is_vec()) + num_ops += ctx.info[op.tempId()].instr->operands.size() - 1; + } + if (num_ops != instr->operands.size()) { + aco_ptr<Instruction> old_vec = std::move(instr); + instr.reset(create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, num_ops, 1)); + instr->definitions[0] = old_vec->definitions[0]; + unsigned k = 0; + for (Operand& old_op : old_vec->operands) { + if (old_op.isTemp() && ctx.info[old_op.tempId()].is_vec()) { + for (unsigned j = 0; j < ctx.info[old_op.tempId()].instr->operands.size(); j++) + instr->operands[k++] = ctx.info[old_op.tempId()].instr->operands[j]; + } else { + instr->operands[k++] = old_op; + } + } + assert(k == num_ops); + } + if (instr->operands.size() == 1 && instr->operands[0].isTemp()) + ctx.info[instr->definitions[0].tempId()].set_temp(instr->operands[0].getTemp()); + else if (instr->definitions[0].getTemp().size() == instr->operands.size()) + ctx.info[instr->definitions[0].tempId()].set_vec(instr.get()); + break; + } + case aco_opcode::p_split_vector: { + if (!ctx.info[instr->operands[0].tempId()].is_vec()) + break; + Instruction* vec = ctx.info[instr->operands[0].tempId()].instr; + assert(instr->definitions.size() == vec->operands.size()); + for (unsigned i = 0; i < instr->definitions.size(); i++) { + Operand vec_op = vec->operands[i]; + if (vec_op.isConstant()) { + if (vec_op.isLiteral()) + ctx.info[instr->definitions[i].tempId()].set_literal(vec_op.constantValue()); + else + ctx.info[instr->definitions[i].tempId()].set_constant(vec_op.constantValue()); + } else { + assert(vec_op.isTemp()); + ctx.info[instr->definitions[i].tempId()].set_temp(vec_op.getTemp()); + } + } + break; + } + case aco_opcode::p_extract_vector: { /* mov */ + if (!ctx.info[instr->operands[0].tempId()].is_vec()) + break; + Instruction* vec = ctx.info[instr->operands[0].tempId()].instr; + if (vec->definitions[0].getTemp().size() == vec->operands.size()) { /* TODO: what about 64bit or other combinations? */ + + /* convert this extract into a mov instruction */ + Operand vec_op = vec->operands[instr->operands[1].constantValue()]; + bool is_vgpr = instr->definitions[0].getTemp().type() == RegType::vgpr; + aco_opcode opcode = is_vgpr ? aco_opcode::v_mov_b32 : aco_opcode::s_mov_b32; + Format format = is_vgpr ? 
Format::VOP1 : Format::SOP1; + instr->opcode = opcode; + instr->format = format; + instr->operands = {instr->operands.begin(), 1 }; + instr->operands[0] = vec_op; + + if (vec_op.isConstant()) { + if (vec_op.isLiteral()) + ctx.info[instr->definitions[0].tempId()].set_literal(vec_op.constantValue()); + else + ctx.info[instr->definitions[0].tempId()].set_constant(vec_op.constantValue()); + } else { + assert(vec_op.isTemp()); + ctx.info[instr->definitions[0].tempId()].set_temp(vec_op.getTemp()); + } + } + break; + } + case aco_opcode::s_mov_b32: /* propagate */ + case aco_opcode::s_mov_b64: + case aco_opcode::v_mov_b32: + case aco_opcode::p_as_uniform: + if (instr->definitions[0].isFixed()) { + /* don't copy-propagate copies into fixed registers */ + } else if (instr->operands[0].isConstant()) { + if (instr->operands[0].isLiteral()) + ctx.info[instr->definitions[0].tempId()].set_literal(instr->operands[0].constantValue()); + else + ctx.info[instr->definitions[0].tempId()].set_constant(instr->operands[0].constantValue()); + } else if (instr->isDPP()) { + // TODO + } else if (instr->operands[0].isTemp()) { + ctx.info[instr->definitions[0].tempId()].set_temp(instr->operands[0].getTemp()); + } else { + assert(instr->operands[0].isFixed()); + } + break; + case aco_opcode::p_is_helper: + if (!ctx.program->needs_wqm) + ctx.info[instr->definitions[0].tempId()].set_constant(0u); + break; + case aco_opcode::s_movk_i32: { + uint32_t v = static_cast<SOPK_instruction*>(instr.get())->imm; + v = v & 0x8000 ? (v | 0xffff0000) : v; + if (v <= 64 || v >= 0xfffffff0) + ctx.info[instr->definitions[0].tempId()].set_constant(v); + else + ctx.info[instr->definitions[0].tempId()].set_literal(v); + break; + } + case aco_opcode::v_bfrev_b32: + case aco_opcode::s_brev_b32: { + if (instr->operands[0].isConstant()) { + uint32_t v = util_bitreverse(instr->operands[0].constantValue()); + if (v <= 64 || v >= 0xfffffff0) + ctx.info[instr->definitions[0].tempId()].set_constant(v); + else + ctx.info[instr->definitions[0].tempId()].set_literal(v); + } + break; + } + case aco_opcode::s_bfm_b32: { + if (instr->operands[0].isConstant() && instr->operands[1].isConstant()) { + unsigned size = instr->operands[0].constantValue() & 0x1f; + unsigned start = instr->operands[1].constantValue() & 0x1f; + uint32_t v = ((1u << size) - 1u) << start; + if (v <= 64 || v >= 0xfffffff0) + ctx.info[instr->definitions[0].tempId()].set_constant(v); + else + ctx.info[instr->definitions[0].tempId()].set_literal(v); + } + } + case aco_opcode::v_mul_f32: { /* omod */ + /* TODO: try to move the negate/abs modifier to the consumer instead */ + if (instr->isVOP3()) { + VOP3A_instruction *vop3 = static_cast<VOP3A_instruction*>(instr.get()); + if (vop3->abs[0] || vop3->abs[1] || vop3->neg[0] || vop3->neg[1] || vop3->omod || vop3->clamp) + break; + } + + for (unsigned i = 0; i < 2; i++) { + if (instr->operands[!i].isConstant() && instr->operands[i].isTemp()) { + if (instr->operands[!i].constantValue() == 0x40000000) { /* 2.0 */ + ctx.info[instr->operands[i].tempId()].set_omod2(); + } else if (instr->operands[!i].constantValue() == 0x40800000) { /* 4.0 */ + ctx.info[instr->operands[i].tempId()].set_omod4(); + } else if (instr->operands[!i].constantValue() == 0x3f000000) { /* 0.5 */ + ctx.info[instr->operands[i].tempId()].set_omod5(); + } else if (instr->operands[!i].constantValue() == 0x3f800000) { /* 1.0 */ + ctx.info[instr->definitions[0].tempId()].set_temp(instr->operands[i].getTemp()); + } else { + continue; + } + break; + } + } + break; + } + case 
aco_opcode::v_and_b32: /* abs */ + if (instr->operands[0].constantEquals(0x7FFFFFFF) && instr->operands[1].isTemp()) + ctx.info[instr->definitions[0].tempId()].set_abs(instr->operands[1].getTemp()); + else + ctx.info[instr->definitions[0].tempId()].set_bitwise(instr.get()); + break; + case aco_opcode::v_xor_b32: { /* neg */ + if (instr->operands[0].constantEquals(0x80000000u) && instr->operands[1].isTemp()) { + if (ctx.info[instr->operands[1].tempId()].is_neg()) { + ctx.info[instr->definitions[0].tempId()].set_temp(ctx.info[instr->operands[1].tempId()].temp); + } else { + if (ctx.info[instr->operands[1].tempId()].is_abs()) { /* neg(abs(x)) */ + instr->operands[1].setTemp(ctx.info[instr->operands[1].tempId()].temp); + instr->opcode = aco_opcode::v_or_b32; + ctx.info[instr->definitions[0].tempId()].set_neg_abs(instr->operands[1].getTemp()); + } else { + ctx.info[instr->definitions[0].tempId()].set_neg(instr->operands[1].getTemp()); + } + } + } else { + ctx.info[instr->definitions[0].tempId()].set_bitwise(instr.get()); + } + break; + } + case aco_opcode::v_med3_f32: { /* clamp */ + VOP3A_instruction* vop3 = static_cast<VOP3A_instruction*>(instr.get()); + if (vop3->abs[0] || vop3->neg[0] || vop3->opsel[0] || + vop3->abs[1] || vop3->neg[1] || vop3->opsel[1] || + vop3->abs[2] || vop3->neg[2] || vop3->opsel[2] || + vop3->omod != 0) + break; + + unsigned idx = 0; + bool found_zero = false, found_one = false; + for (unsigned i = 0; i < 3; i++) + { + if (instr->operands[i].constantEquals(0)) + found_zero = true; + else if (instr->operands[i].constantEquals(0x3f800000)) /* 1.0 */ + found_one = true; + else + idx = i; + } + if (found_zero && found_one && instr->operands[idx].isTemp()) { + ctx.info[instr->operands[idx].tempId()].set_clamp(); + } + break; + } + case aco_opcode::v_cndmask_b32: + if (instr->operands[0].constantEquals(0) && + instr->operands[1].constantEquals(0xFFFFFFFF) && + instr->operands[2].isTemp()) + ctx.info[instr->definitions[0].tempId()].set_vcc(instr->operands[2].getTemp()); + else if (instr->operands[0].constantEquals(0) && + instr->operands[1].constantEquals(0x3f800000u) && + instr->operands[2].isTemp()) + ctx.info[instr->definitions[0].tempId()].set_b2f(instr->operands[2].getTemp()); + break; + case aco_opcode::v_cmp_lg_u32: + if (instr->format == Format::VOPC && /* don't optimize VOP3 / SDWA / DPP */ + instr->operands[0].constantEquals(0) && + instr->operands[1].isTemp() && ctx.info[instr->operands[1].tempId()].is_vcc()) + ctx.info[instr->definitions[0].tempId()].set_temp(ctx.info[instr->operands[1].tempId()].temp); + break; + case aco_opcode::p_phi: + case aco_opcode::p_linear_phi: { + /* lower_bool_phis() can create phis like this */ + bool all_same_temp = instr->operands[0].isTemp(); + /* this check is needed when moving uniform loop counters out of a divergent loop */ + if (all_same_temp) + all_same_temp = instr->definitions[0].regClass() == instr->operands[0].regClass(); + for (unsigned i = 1; all_same_temp && (i < instr->operands.size()); i++) { + if (!instr->operands[i].isTemp() || instr->operands[i].tempId() != instr->operands[0].tempId()) + all_same_temp = false; + } + if (all_same_temp) { + ctx.info[instr->definitions[0].tempId()].set_temp(instr->operands[0].getTemp()); + } else { + bool all_undef = instr->operands[0].isUndefined(); + for (unsigned i = 1; all_undef && (i < instr->operands.size()); i++) { + if (!instr->operands[i].isUndefined()) + all_undef = false; + } + if (all_undef) + ctx.info[instr->definitions[0].tempId()].set_undefined(); + } + break; + } + 
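+      /* The remaining cases only attach labels that point back at the defining
+       * instruction (add_sub, bitwise, minmax, fcmp); they do not rewrite the
+       * instruction here. Later steps follow these labels through the operands,
+       * e.g. parse_base_offset() checks is_add_sub(), and the s_or_b64/s_and_b64
+       * ordering-test combine below reaches the compares via follow_operand(). */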
case aco_opcode::v_add_u32: + case aco_opcode::v_add_co_u32: + case aco_opcode::s_add_i32: + case aco_opcode::s_add_u32: + ctx.info[instr->definitions[0].tempId()].set_add_sub(instr.get()); + break; + case aco_opcode::s_not_b32: + case aco_opcode::s_not_b64: + case aco_opcode::s_and_b32: + case aco_opcode::s_and_b64: + case aco_opcode::s_or_b32: + case aco_opcode::s_or_b64: + case aco_opcode::s_xor_b32: + case aco_opcode::s_xor_b64: + case aco_opcode::s_lshl_b32: + case aco_opcode::v_or_b32: + case aco_opcode::v_lshlrev_b32: + ctx.info[instr->definitions[0].tempId()].set_bitwise(instr.get()); + break; + case aco_opcode::v_min_f32: + case aco_opcode::v_min_f16: + case aco_opcode::v_min_u32: + case aco_opcode::v_min_i32: + case aco_opcode::v_min_u16: + case aco_opcode::v_min_i16: + case aco_opcode::v_max_f32: + case aco_opcode::v_max_f16: + case aco_opcode::v_max_u32: + case aco_opcode::v_max_i32: + case aco_opcode::v_max_u16: + case aco_opcode::v_max_i16: + ctx.info[instr->definitions[0].tempId()].set_minmax(instr.get()); + break; + case aco_opcode::v_cmp_lt_f32: + case aco_opcode::v_cmp_eq_f32: + case aco_opcode::v_cmp_le_f32: + case aco_opcode::v_cmp_gt_f32: + case aco_opcode::v_cmp_lg_f32: + case aco_opcode::v_cmp_ge_f32: + case aco_opcode::v_cmp_o_f32: + case aco_opcode::v_cmp_u_f32: + case aco_opcode::v_cmp_nge_f32: + case aco_opcode::v_cmp_nlg_f32: + case aco_opcode::v_cmp_ngt_f32: + case aco_opcode::v_cmp_nle_f32: + case aco_opcode::v_cmp_neq_f32: + case aco_opcode::v_cmp_nlt_f32: + ctx.info[instr->definitions[0].tempId()].set_fcmp(instr.get()); + break; + default: + break; + } +} + +ALWAYS_INLINE bool get_cmp_info(aco_opcode op, aco_opcode *ordered, aco_opcode *unordered, aco_opcode *inverse) +{ + *ordered = *unordered = op; + switch (op) { + #define CMP(ord, unord) \ + case aco_opcode::v_cmp_##ord##_f32:\ + case aco_opcode::v_cmp_n##unord##_f32:\ + *ordered = aco_opcode::v_cmp_##ord##_f32;\ + *unordered = aco_opcode::v_cmp_n##unord##_f32;\ + *inverse = op == aco_opcode::v_cmp_n##unord##_f32 ? aco_opcode::v_cmp_##unord##_f32 : aco_opcode::v_cmp_n##ord##_f32;\ + return true; + CMP(lt, /*n*/ge) + CMP(eq, /*n*/lg) + CMP(le, /*n*/gt) + CMP(gt, /*n*/le) + CMP(lg, /*n*/eq) + CMP(ge, /*n*/lt) + #undef CMP + default: + return false; + } +} + +aco_opcode get_ordered(aco_opcode op) +{ + aco_opcode ordered, unordered, inverse; + return get_cmp_info(op, &ordered, &unordered, &inverse) ? ordered : aco_opcode::last_opcode; +} + +aco_opcode get_unordered(aco_opcode op) +{ + aco_opcode ordered, unordered, inverse; + return get_cmp_info(op, &ordered, &unordered, &inverse) ? unordered : aco_opcode::last_opcode; +} + +aco_opcode get_inverse(aco_opcode op) +{ + aco_opcode ordered, unordered, inverse; + return get_cmp_info(op, &ordered, &unordered, &inverse) ? 
inverse : aco_opcode::last_opcode; +} + +bool is_cmp(aco_opcode op) +{ + aco_opcode ordered, unordered, inverse; + return get_cmp_info(op, &ordered, &unordered, &inverse); +} + +unsigned original_temp_id(opt_ctx &ctx, Temp tmp) +{ + if (ctx.info[tmp.id()].is_temp()) + return ctx.info[tmp.id()].temp.id(); + else + return tmp.id(); +} + +void decrease_uses(opt_ctx &ctx, Instruction* instr) +{ + if (!--ctx.uses[instr->definitions[0].tempId()]) { + for (const Operand& op : instr->operands) { + if (op.isTemp()) + ctx.uses[op.tempId()]--; + } + } +} + +Instruction *follow_operand(opt_ctx &ctx, Operand op, bool ignore_uses=false) +{ + if (!op.isTemp() || !(ctx.info[op.tempId()].label & instr_labels)) + return nullptr; + if (!ignore_uses && ctx.uses[op.tempId()] > 1) + return nullptr; + + Instruction *instr = ctx.info[op.tempId()].instr; + + if (instr->definitions.size() == 2) { + assert(instr->definitions[0].isTemp() && instr->definitions[0].tempId() == op.tempId()); + if (instr->definitions[1].isTemp() && ctx.uses[instr->definitions[1].tempId()]) + return nullptr; + } + + return instr; +} + +/* s_or_b64(neq(a, a), neq(b, b)) -> v_cmp_u_f32(a, b) + * s_and_b64(eq(a, a), eq(b, b)) -> v_cmp_o_f32(a, b) */ +bool combine_ordering_test(opt_ctx &ctx, aco_ptr<Instruction>& instr) +{ + if (instr->opcode != aco_opcode::s_or_b64 && instr->opcode != aco_opcode::s_and_b64) + return false; + if (instr->definitions[1].isTemp() && ctx.uses[instr->definitions[1].tempId()]) + return false; + + bool neg[2] = {false, false}; + bool abs[2] = {false, false}; + bool opsel[2] = {false, false}; + Instruction *op_instr[2]; + Temp op[2]; + + for (unsigned i = 0; i < 2; i++) { + op_instr[i] = follow_operand(ctx, instr->operands[i], true); + if (!op_instr[i]) + return false; + + aco_opcode expected_cmp = instr->opcode == aco_opcode::s_or_b64 ? + aco_opcode::v_cmp_neq_f32 : aco_opcode::v_cmp_eq_f32; + + if (op_instr[i]->opcode != expected_cmp) + return false; + if (!op_instr[i]->operands[0].isTemp() || !op_instr[i]->operands[1].isTemp()) + return false; + + if (op_instr[i]->isVOP3()) { + VOP3A_instruction *vop3 = static_cast<VOP3A_instruction*>(op_instr[i]); + if (vop3->neg[0] != vop3->neg[1] || vop3->abs[0] != vop3->abs[1] || vop3->opsel[0] != vop3->opsel[1]) + return false; + neg[i] = vop3->neg[0]; + abs[i] = vop3->abs[0]; + opsel[i] = vop3->opsel[0]; + } + + Temp op0 = op_instr[i]->operands[0].getTemp(); + Temp op1 = op_instr[i]->operands[1].getTemp(); + if (original_temp_id(ctx, op0) != original_temp_id(ctx, op1)) + return false; + /* shouldn't happen yet, but best to be safe */ + if (op1.type() != RegType::vgpr) + return false; + + op[i] = op1; + } + + ctx.uses[op[0].id()]++; + ctx.uses[op[1].id()]++; + decrease_uses(ctx, op_instr[0]); + decrease_uses(ctx, op_instr[1]); + + aco_opcode new_op = instr->opcode == aco_opcode::s_or_b64 ? 
+ aco_opcode::v_cmp_u_f32 : aco_opcode::v_cmp_o_f32; + Instruction *new_instr; + if (neg[0] || neg[1] || abs[0] || abs[1] || opsel[0] || opsel[1]) { + VOP3A_instruction *vop3 = create_instruction<VOP3A_instruction>(new_op, asVOP3(Format::VOPC), 2, 1); + for (unsigned i = 0; i < 2; i++) { + vop3->neg[i] = neg[i]; + vop3->abs[i] = abs[i]; + vop3->opsel[i] = opsel[i]; + } + new_instr = static_cast<Instruction *>(vop3); + } else { + new_instr = create_instruction<VOPC_instruction>(new_op, Format::VOPC, 2, 1); + } + new_instr->operands[0] = Operand(op[0]); + new_instr->operands[1] = Operand(op[1]); + new_instr->definitions[0] = instr->definitions[0]; + + ctx.info[instr->definitions[0].tempId()].label = 0; + ctx.info[instr->definitions[0].tempId()].set_fcmp(new_instr); + + instr.reset(new_instr); + + return true; +} + +/* s_or_b64(v_cmp_u_f32(a, b), cmp(a, b)) -> get_unordered(cmp)(a, b) + * s_and_b64(v_cmp_o_f32(a, b), cmp(a, b)) -> get_ordered(cmp)(a, b) */ +bool combine_comparison_ordering(opt_ctx &ctx, aco_ptr<Instruction>& instr) +{ + if (instr->opcode != aco_opcode::s_or_b64 && instr->opcode != aco_opcode::s_and_b64) + return false; + if (instr->definitions[1].isTemp() && ctx.uses[instr->definitions[1].tempId()]) + return false; + + aco_opcode expected_nan_test = instr->opcode == aco_opcode::s_or_b64 ? + aco_opcode::v_cmp_u_f32 : aco_opcode::v_cmp_o_f32; + + Instruction *nan_test = follow_operand(ctx, instr->operands[0], true); + Instruction *cmp = follow_operand(ctx, instr->operands[1], true); + if (!nan_test || !cmp) + return false; + + if (cmp->opcode == expected_nan_test) + std::swap(nan_test, cmp); + else if (nan_test->opcode != expected_nan_test) + return false; + + if (!is_cmp(cmp->opcode)) + return false; + + if (!nan_test->operands[0].isTemp() || !nan_test->operands[1].isTemp()) + return false; + if (!cmp->operands[0].isTemp() || !cmp->operands[1].isTemp()) + return false; + + unsigned prop_cmp0 = original_temp_id(ctx, cmp->operands[0].getTemp()); + unsigned prop_cmp1 = original_temp_id(ctx, cmp->operands[1].getTemp()); + unsigned prop_nan0 = original_temp_id(ctx, nan_test->operands[0].getTemp()); + unsigned prop_nan1 = original_temp_id(ctx, nan_test->operands[1].getTemp()); + if (prop_cmp0 != prop_nan0 && prop_cmp0 != prop_nan1) + return false; + if (prop_cmp1 != prop_nan0 && prop_cmp1 != prop_nan1) + return false; + + ctx.uses[cmp->operands[0].tempId()]++; + ctx.uses[cmp->operands[1].tempId()]++; + decrease_uses(ctx, nan_test); + decrease_uses(ctx, cmp); + + aco_opcode new_op = instr->opcode == aco_opcode::s_or_b64 ? 
+ get_unordered(cmp->opcode) : get_ordered(cmp->opcode); + Instruction *new_instr; + if (cmp->isVOP3()) { + VOP3A_instruction *new_vop3 = create_instruction<VOP3A_instruction>(new_op, asVOP3(Format::VOPC), 2, 1); + VOP3A_instruction *cmp_vop3 = static_cast<VOP3A_instruction*>(cmp); + memcpy(new_vop3->abs, cmp_vop3->abs, sizeof(new_vop3->abs)); + memcpy(new_vop3->opsel, cmp_vop3->opsel, sizeof(new_vop3->opsel)); + memcpy(new_vop3->neg, cmp_vop3->neg, sizeof(new_vop3->neg)); + new_vop3->clamp = cmp_vop3->clamp; + new_vop3->omod = cmp_vop3->omod; + new_instr = new_vop3; + } else { + new_instr = create_instruction<VOPC_instruction>(new_op, Format::VOPC, 2, 1); + } + new_instr->operands[0] = cmp->operands[0]; + new_instr->operands[1] = cmp->operands[1]; + new_instr->definitions[0] = instr->definitions[0]; + + ctx.info[instr->definitions[0].tempId()].label = 0; + ctx.info[instr->definitions[0].tempId()].set_fcmp(new_instr); + + instr.reset(new_instr); + + return true; +} + +/* s_or_b64(v_cmp_neq_f32(a, a), cmp(a, #b)) and b is not NaN -> get_unordered(cmp)(a, b) + * s_and_b64(v_cmp_eq_f32(a, a), cmp(a, #b)) and b is not NaN -> get_ordered(cmp)(a, b) */ +bool combine_constant_comparison_ordering(opt_ctx &ctx, aco_ptr<Instruction>& instr) +{ + if (instr->opcode != aco_opcode::s_or_b64 && instr->opcode != aco_opcode::s_and_b64) + return false; + if (instr->definitions[1].isTemp() && ctx.uses[instr->definitions[1].tempId()]) + return false; + + Instruction *nan_test = follow_operand(ctx, instr->operands[0], true); + Instruction *cmp = follow_operand(ctx, instr->operands[1], true); + + if (!nan_test || !cmp) + return false; + + aco_opcode expected_nan_test = instr->opcode == aco_opcode::s_or_b64 ? + aco_opcode::v_cmp_neq_f32 : aco_opcode::v_cmp_eq_f32; + if (cmp->opcode == expected_nan_test) + std::swap(nan_test, cmp); + else if (nan_test->opcode != expected_nan_test) + return false; + + if (!is_cmp(cmp->opcode)) + return false; + + if (!nan_test->operands[0].isTemp() || !nan_test->operands[1].isTemp()) + return false; + if (!cmp->operands[0].isTemp() && !cmp->operands[1].isTemp()) + return false; + + unsigned prop_nan0 = original_temp_id(ctx, nan_test->operands[0].getTemp()); + unsigned prop_nan1 = original_temp_id(ctx, nan_test->operands[1].getTemp()); + if (prop_nan0 != prop_nan1) + return false; + + int constant_operand = -1; + for (unsigned i = 0; i < 2; i++) { + if (cmp->operands[i].isTemp() && original_temp_id(ctx, cmp->operands[i].getTemp()) == prop_nan0) { + constant_operand = !i; + break; + } + } + if (constant_operand == -1) + return false; + + uint32_t constant; + if (cmp->operands[constant_operand].isConstant()) { + constant = cmp->operands[constant_operand].constantValue(); + } else if (cmp->operands[constant_operand].isTemp()) { + unsigned id = cmp->operands[constant_operand].tempId(); + if (!ctx.info[id].is_constant() && !ctx.info[id].is_literal()) + return false; + constant = ctx.info[id].val; + } else { + return false; + } + + float constantf; + memcpy(&constantf, &constant, 4); + if (isnan(constantf)) + return false; + + if (cmp->operands[0].isTemp()) + ctx.uses[cmp->operands[0].tempId()]++; + if (cmp->operands[1].isTemp()) + ctx.uses[cmp->operands[1].tempId()]++; + decrease_uses(ctx, nan_test); + decrease_uses(ctx, cmp); + + aco_opcode new_op = instr->opcode == aco_opcode::s_or_b64 ? 
+ get_unordered(cmp->opcode) : get_ordered(cmp->opcode); + Instruction *new_instr; + if (cmp->isVOP3()) { + VOP3A_instruction *new_vop3 = create_instruction<VOP3A_instruction>(new_op, asVOP3(Format::VOPC), 2, 1); + VOP3A_instruction *cmp_vop3 = static_cast<VOP3A_instruction*>(cmp); + memcpy(new_vop3->abs, cmp_vop3->abs, sizeof(new_vop3->abs)); + memcpy(new_vop3->opsel, cmp_vop3->opsel, sizeof(new_vop3->opsel)); + memcpy(new_vop3->neg, cmp_vop3->neg, sizeof(new_vop3->neg)); + new_vop3->clamp = cmp_vop3->clamp; + new_vop3->omod = cmp_vop3->omod; + new_instr = new_vop3; + } else { + new_instr = create_instruction<VOPC_instruction>(new_op, Format::VOPC, 2, 1); + } + new_instr->operands[0] = cmp->operands[0]; + new_instr->operands[1] = cmp->operands[1]; + new_instr->definitions[0] = instr->definitions[0]; + + ctx.info[instr->definitions[0].tempId()].label = 0; + ctx.info[instr->definitions[0].tempId()].set_fcmp(new_instr); + + instr.reset(new_instr); + + return true; +} + +/* s_not_b64(cmp(a, b) -> get_inverse(cmp)(a, b) */ +bool combine_inverse_comparison(opt_ctx &ctx, aco_ptr<Instruction>& instr) +{ + if (instr->opcode != aco_opcode::s_not_b64) + return false; + if (instr->definitions[1].isTemp() && ctx.uses[instr->definitions[1].tempId()]) + return false; + if (!instr->operands[0].isTemp()) + return false; + + Instruction *cmp = follow_operand(ctx, instr->operands[0]); + if (!cmp) + return false; + + aco_opcode new_opcode = get_inverse(cmp->opcode); + if (new_opcode == aco_opcode::last_opcode) + return false; + + if (cmp->operands[0].isTemp()) + ctx.uses[cmp->operands[0].tempId()]++; + if (cmp->operands[1].isTemp()) + ctx.uses[cmp->operands[1].tempId()]++; + decrease_uses(ctx, cmp); + + Instruction *new_instr; + if (cmp->isVOP3()) { + VOP3A_instruction *new_vop3 = create_instruction<VOP3A_instruction>(new_opcode, asVOP3(Format::VOPC), 2, 1); + VOP3A_instruction *cmp_vop3 = static_cast<VOP3A_instruction*>(cmp); + memcpy(new_vop3->abs, cmp_vop3->abs, sizeof(new_vop3->abs)); + memcpy(new_vop3->opsel, cmp_vop3->opsel, sizeof(new_vop3->opsel)); + memcpy(new_vop3->neg, cmp_vop3->neg, sizeof(new_vop3->neg)); + new_vop3->clamp = cmp_vop3->clamp; + new_vop3->omod = cmp_vop3->omod; + new_instr = new_vop3; + } else { + new_instr = create_instruction<VOPC_instruction>(new_opcode, Format::VOPC, 2, 1); + } + new_instr->operands[0] = cmp->operands[0]; + new_instr->operands[1] = cmp->operands[1]; + new_instr->definitions[0] = instr->definitions[0]; + + ctx.info[instr->definitions[0].tempId()].label = 0; + ctx.info[instr->definitions[0].tempId()].set_fcmp(new_instr); + + instr.reset(new_instr); + + return true; +} + +/* op1(op2(1, 2), 0) if swap = false + * op1(0, op2(1, 2)) if swap = true */ +bool match_op3_for_vop3(opt_ctx &ctx, aco_opcode op1, aco_opcode op2, + Instruction* op1_instr, bool swap, const char *shuffle_str, + Operand operands[3], bool neg[3], bool abs[3], bool opsel[3], + bool *op1_clamp, unsigned *op1_omod, + bool *inbetween_neg, bool *inbetween_abs, bool *inbetween_opsel) +{ + /* checks */ + if (op1_instr->opcode != op1) + return false; + + Instruction *op2_instr = follow_operand(ctx, op1_instr->operands[swap]); + if (!op2_instr || op2_instr->opcode != op2) + return false; + + VOP3A_instruction *op1_vop3 = op1_instr->isVOP3() ? static_cast<VOP3A_instruction *>(op1_instr) : NULL; + VOP3A_instruction *op2_vop3 = op2_instr->isVOP3() ? 
static_cast<VOP3A_instruction *>(op2_instr) : NULL; + + /* don't support inbetween clamp/omod */ + if (op2_vop3 && (op2_vop3->clamp || op2_vop3->omod)) + return false; + + /* get operands and modifiers and check inbetween modifiers */ + *op1_clamp = op1_vop3 ? op1_vop3->clamp : false; + *op1_omod = op1_vop3 ? op1_vop3->omod : 0u; + + if (inbetween_neg) + *inbetween_neg = op1_vop3 ? op1_vop3->neg[swap] : false; + else if (op1_vop3 && op1_vop3->neg[swap]) + return false; + + if (inbetween_abs) + *inbetween_abs = op1_vop3 ? op1_vop3->abs[swap] : false; + else if (op1_vop3 && op1_vop3->abs[swap]) + return false; + + if (inbetween_opsel) + *inbetween_opsel = op1_vop3 ? op1_vop3->opsel[swap] : false; + else if (op1_vop3 && op1_vop3->opsel[swap]) + return false; + + int shuffle[3]; + shuffle[shuffle_str[0] - '0'] = 0; + shuffle[shuffle_str[1] - '0'] = 1; + shuffle[shuffle_str[2] - '0'] = 2; + + operands[shuffle[0]] = op1_instr->operands[!swap]; + neg[shuffle[0]] = op1_vop3 ? op1_vop3->neg[!swap] : false; + abs[shuffle[0]] = op1_vop3 ? op1_vop3->abs[!swap] : false; + opsel[shuffle[0]] = op1_vop3 ? op1_vop3->opsel[!swap] : false; + + for (unsigned i = 0; i < 2; i++) { + operands[shuffle[i + 1]] = op2_instr->operands[i]; + neg[shuffle[i + 1]] = op2_vop3 ? op2_vop3->neg[i] : false; + abs[shuffle[i + 1]] = op2_vop3 ? op2_vop3->abs[i] : false; + opsel[shuffle[i + 1]] = op2_vop3 ? op2_vop3->opsel[i] : false; + } + + /* check operands */ + unsigned sgpr_id = 0; + for (unsigned i = 0; i < 3; i++) { + Operand op = operands[i]; + if (op.isLiteral()) { + return false; + } else if (op.isTemp() && op.getTemp().type() == RegType::sgpr) { + if (sgpr_id && sgpr_id != op.tempId()) + return false; + sgpr_id = op.tempId(); + } + } + + return true; +} + +void create_vop3_for_op3(opt_ctx& ctx, aco_opcode opcode, aco_ptr<Instruction>& instr, + Operand operands[3], bool neg[3], bool abs[3], bool opsel[3], + bool clamp, unsigned omod) +{ + VOP3A_instruction *new_instr = create_instruction<VOP3A_instruction>(opcode, Format::VOP3A, 3, 1); + memcpy(new_instr->abs, abs, sizeof(bool[3])); + memcpy(new_instr->opsel, opsel, sizeof(bool[3])); + memcpy(new_instr->neg, neg, sizeof(bool[3])); + new_instr->clamp = clamp; + new_instr->omod = omod; + new_instr->operands[0] = operands[0]; + new_instr->operands[1] = operands[1]; + new_instr->operands[2] = operands[2]; + new_instr->definitions[0] = instr->definitions[0]; + ctx.info[instr->definitions[0].tempId()].label = 0; + + instr.reset(new_instr); +} + +bool combine_minmax3(opt_ctx& ctx, aco_ptr<Instruction>& instr, aco_opcode new_op) +{ + uint32_t omod_clamp = ctx.info[instr->definitions[0].tempId()].label & + (label_omod_success | label_clamp_success); + + for (unsigned swap = 0; swap < 2; swap++) { + Operand operands[3]; + bool neg[3], abs[3], opsel[3], clamp, inbetween_neg, inbetween_abs; + unsigned omod; + if (match_op3_for_vop3(ctx, instr->opcode, instr->opcode, instr.get(), swap, + "012", operands, neg, abs, opsel, + &clamp, &omod, &inbetween_neg, &inbetween_abs, NULL)) { + ctx.uses[instr->operands[swap].tempId()]--; + neg[1] ^= inbetween_neg; + neg[2] ^= inbetween_neg; + abs[1] |= inbetween_abs; + abs[2] |= inbetween_abs; + create_vop3_for_op3(ctx, new_op, instr, operands, neg, abs, opsel, clamp, omod); + if (omod_clamp & label_omod_success) + ctx.info[instr->definitions[0].tempId()].set_omod_success(instr.get()); + if (omod_clamp & label_clamp_success) + ctx.info[instr->definitions[0].tempId()].set_clamp_success(instr.get()); + return true; + } + } + return false; +} + +bool 
combine_three_valu_op(opt_ctx& ctx, aco_ptr<Instruction>& instr, aco_opcode op2, aco_opcode new_op, const char *shuffle, uint8_t ops) +{ + uint32_t omod_clamp = ctx.info[instr->definitions[0].tempId()].label & + (label_omod_success | label_clamp_success); + + for (unsigned swap = 0; swap < 2; swap++) { + if (!((1 << swap) & ops)) + continue; + + Operand operands[3]; + bool neg[3], abs[3], opsel[3], clamp; + unsigned omod; + if (match_op3_for_vop3(ctx, instr->opcode, op2, + instr.get(), swap, shuffle, + operands, neg, abs, opsel, + &clamp, &omod, NULL, NULL, NULL)) { + ctx.uses[instr->operands[swap].tempId()]--; + create_vop3_for_op3(ctx, new_op, instr, operands, neg, abs, opsel, clamp, omod); + if (omod_clamp & label_omod_success) + ctx.info[instr->definitions[0].tempId()].set_omod_success(instr.get()); + if (omod_clamp & label_clamp_success) + ctx.info[instr->definitions[0].tempId()].set_clamp_success(instr.get()); + return true; + } + } + return false; +} + +/* s_not_b32(s_and_b32(a, b)) -> s_nand_b32(a, b) + * s_not_b32(s_or_b32(a, b)) -> s_nor_b32(a, b) + * s_not_b32(s_xor_b32(a, b)) -> s_xnor_b32(a, b) + * s_not_b64(s_and_b64(a, b)) -> s_nand_b64(a, b) + * s_not_b64(s_or_b64(a, b)) -> s_nor_b64(a, b) + * s_not_b64(s_xor_b64(a, b)) -> s_xnor_b64(a, b) */ +bool combine_salu_not_bitwise(opt_ctx& ctx, aco_ptr<Instruction>& instr) +{ + /* checks */ + if (!instr->operands[0].isTemp()) + return false; + if (instr->definitions[1].isTemp() && ctx.uses[instr->definitions[1].tempId()]) + return false; + + Instruction *op2_instr = follow_operand(ctx, instr->operands[0]); + if (!op2_instr) + return false; + switch (op2_instr->opcode) { + case aco_opcode::s_and_b32: + case aco_opcode::s_or_b32: + case aco_opcode::s_xor_b32: + case aco_opcode::s_and_b64: + case aco_opcode::s_or_b64: + case aco_opcode::s_xor_b64: + break; + default: + return false; + } + + /* create instruction */ + std::swap(instr->definitions[0], op2_instr->definitions[0]); + ctx.uses[instr->operands[0].tempId()]--; + ctx.info[op2_instr->definitions[0].tempId()].label = 0; + + switch (op2_instr->opcode) { + case aco_opcode::s_and_b32: + op2_instr->opcode = aco_opcode::s_nand_b32; + break; + case aco_opcode::s_or_b32: + op2_instr->opcode = aco_opcode::s_nor_b32; + break; + case aco_opcode::s_xor_b32: + op2_instr->opcode = aco_opcode::s_xnor_b32; + break; + case aco_opcode::s_and_b64: + op2_instr->opcode = aco_opcode::s_nand_b64; + break; + case aco_opcode::s_or_b64: + op2_instr->opcode = aco_opcode::s_nor_b64; + break; + case aco_opcode::s_xor_b64: + op2_instr->opcode = aco_opcode::s_xnor_b64; + break; + default: + break; + } + + return true; +} + +/* s_and_b32(a, s_not_b32(b)) -> s_andn2_b32(a, b) + * s_or_b32(a, s_not_b32(b)) -> s_orn2_b32(a, b) + * s_and_b64(a, s_not_b64(b)) -> s_andn2_b64(a, b) + * s_or_b64(a, s_not_b64(b)) -> s_orn2_b64(a, b) */ +bool combine_salu_n2(opt_ctx& ctx, aco_ptr<Instruction>& instr) +{ + if (instr->definitions[1].isTemp() && ctx.uses[instr->definitions[1].tempId()]) + return false; + + for (unsigned i = 0; i < 2; i++) { + Instruction *op2_instr = follow_operand(ctx, instr->operands[i]); + if (!op2_instr || (op2_instr->opcode != aco_opcode::s_not_b32 && op2_instr->opcode != aco_opcode::s_not_b64)) + continue; + + ctx.uses[instr->operands[i].tempId()]--; + instr->operands[0] = instr->operands[!i]; + instr->operands[1] = op2_instr->operands[0]; + ctx.info[instr->definitions[0].tempId()].label = 0; + + switch (instr->opcode) { + case aco_opcode::s_and_b32: + instr->opcode = aco_opcode::s_andn2_b32; + 
break; + case aco_opcode::s_or_b32: + instr->opcode = aco_opcode::s_orn2_b32; + break; + case aco_opcode::s_and_b64: + instr->opcode = aco_opcode::s_andn2_b64; + break; + case aco_opcode::s_or_b64: + instr->opcode = aco_opcode::s_orn2_b64; + break; + default: + break; + } + + return true; + } + return false; +} + +/* s_add_{i32,u32}(a, s_lshl_b32(b, <n>)) -> s_lshl<n>_add_u32(a, b) */ +bool combine_salu_lshl_add(opt_ctx& ctx, aco_ptr<Instruction>& instr) +{ + if (instr->definitions[1].isTemp() && ctx.uses[instr->definitions[1].tempId()]) + return false; + + for (unsigned i = 0; i < 2; i++) { + Instruction *op2_instr = follow_operand(ctx, instr->operands[i]); + if (!op2_instr || op2_instr->opcode != aco_opcode::s_lshl_b32 || !op2_instr->operands[1].isConstant()) + continue; + + uint32_t shift = op2_instr->operands[1].constantValue(); + if (shift < 1 || shift > 4) + continue; + + ctx.uses[instr->operands[i].tempId()]--; + instr->operands[1] = instr->operands[!i]; + instr->operands[0] = op2_instr->operands[0]; + ctx.info[instr->definitions[0].tempId()].label = 0; + + instr->opcode = ((aco_opcode[]){aco_opcode::s_lshl1_add_u32, + aco_opcode::s_lshl2_add_u32, + aco_opcode::s_lshl3_add_u32, + aco_opcode::s_lshl4_add_u32})[shift - 1]; + + return true; + } + return false; +} + +bool get_minmax_info(aco_opcode op, aco_opcode *min, aco_opcode *max, aco_opcode *min3, aco_opcode *max3, aco_opcode *med3, bool *some_gfx9_only) +{ + switch (op) { + #define MINMAX(type, gfx9) \ + case aco_opcode::v_min_##type:\ + case aco_opcode::v_max_##type:\ + case aco_opcode::v_med3_##type:\ + *min = aco_opcode::v_min_##type;\ + *max = aco_opcode::v_max_##type;\ + *med3 = aco_opcode::v_med3_##type;\ + *min3 = aco_opcode::v_min3_##type;\ + *max3 = aco_opcode::v_max3_##type;\ + *some_gfx9_only = gfx9;\ + return true; + MINMAX(f32, false) + MINMAX(u32, false) + MINMAX(i32, false) + MINMAX(f16, true) + MINMAX(u16, true) + MINMAX(i16, true) + #undef MINMAX + default: + return false; + } +} + +/* v_min_{f,u,i}{16,32}(v_max_{f,u,i}{16,32}(a, lb), ub) -> v_med3_{f,u,i}{16,32}(a, lb, ub) when ub > lb + * v_max_{f,u,i}{16,32}(v_min_{f,u,i}{16,32}(a, ub), lb) -> v_med3_{f,u,i}{16,32}(a, lb, ub) when ub > lb */ +bool combine_clamp(opt_ctx& ctx, aco_ptr<Instruction>& instr, + aco_opcode min, aco_opcode max, aco_opcode med) +{ + aco_opcode other_op; + if (instr->opcode == min) + other_op = max; + else if (instr->opcode == max) + other_op = min; + else + return false; + + uint32_t omod_clamp = ctx.info[instr->definitions[0].tempId()].label & + (label_omod_success | label_clamp_success); + + for (unsigned swap = 0; swap < 2; swap++) { + Operand operands[3]; + bool neg[3], abs[3], opsel[3], clamp, inbetween_neg, inbetween_abs; + unsigned omod; + if (match_op3_for_vop3(ctx, instr->opcode, other_op, instr.get(), swap, + "012", operands, neg, abs, opsel, + &clamp, &omod, &inbetween_neg, &inbetween_abs, NULL)) { + int const0_idx = -1, const1_idx = -1; + uint32_t const0 = 0, const1 = 0; + for (int i = 0; i < 3; i++) { + uint32_t val; + if (operands[i].isConstant()) { + val = operands[i].constantValue(); + } else if (operands[i].isTemp() && ctx.uses[operands[i].tempId()] == 1 && + ctx.info[operands[i].tempId()].is_constant_or_literal()) { + val = ctx.info[operands[i].tempId()].val; + } else { + continue; + } + if (const0_idx >= 0) { + const1_idx = i; + const1 = val; + } else { + const0_idx = i; + const0 = val; + } + } + if (const0_idx < 0 || const1_idx < 0) + continue; + + if (opsel[const0_idx]) + const0 >>= 16; + if (opsel[const1_idx]) + 
const1 >>= 16; + + int lower_idx = const0_idx; + switch (min) { + case aco_opcode::v_min_f32: + case aco_opcode::v_min_f16: { + float const0_f, const1_f; + if (min == aco_opcode::v_min_f32) { + memcpy(&const0_f, &const0, 4); + memcpy(&const1_f, &const1, 4); + } else { + const0_f = _mesa_half_to_float(const0); + const1_f = _mesa_half_to_float(const1); + } + if (abs[const0_idx]) const0_f = fabsf(const0_f); + if (abs[const1_idx]) const1_f = fabsf(const1_f); + if (neg[const0_idx]) const0_f = -const0_f; + if (neg[const1_idx]) const1_f = -const1_f; + lower_idx = const0_f < const1_f ? const0_idx : const1_idx; + break; + } + case aco_opcode::v_min_u32: { + lower_idx = const0 < const1 ? const0_idx : const1_idx; + break; + } + case aco_opcode::v_min_u16: { + lower_idx = (uint16_t)const0 < (uint16_t)const1 ? const0_idx : const1_idx; + break; + } + case aco_opcode::v_min_i32: { + int32_t const0_i = const0 & 0x80000000u ? -2147483648 + (int32_t)(const0 & 0x7fffffffu) : const0; + int32_t const1_i = const1 & 0x80000000u ? -2147483648 + (int32_t)(const1 & 0x7fffffffu) : const1; + lower_idx = const0_i < const1_i ? const0_idx : const1_idx; + break; + } + case aco_opcode::v_min_i16: { + int16_t const0_i = const0 & 0x8000u ? -32768 + (int16_t)(const0 & 0x7fffu) : const0; + int16_t const1_i = const1 & 0x8000u ? -32768 + (int16_t)(const1 & 0x7fffu) : const1; + lower_idx = const0_i < const1_i ? const0_idx : const1_idx; + break; + } + default: + break; + } + int upper_idx = lower_idx == const0_idx ? const1_idx : const0_idx; + + if (instr->opcode == min) { + if (upper_idx != 0 || lower_idx == 0) + return false; + } else { + if (upper_idx == 0 || lower_idx != 0) + return false; + } + + neg[1] ^= inbetween_neg; + neg[2] ^= inbetween_neg; + abs[1] |= inbetween_abs; + abs[2] |= inbetween_abs; + + ctx.uses[instr->operands[swap].tempId()]--; + create_vop3_for_op3(ctx, med, instr, operands, neg, abs, opsel, clamp, omod); + if (omod_clamp & label_omod_success) + ctx.info[instr->definitions[0].tempId()].set_omod_success(instr.get()); + if (omod_clamp & label_clamp_success) + ctx.info[instr->definitions[0].tempId()].set_clamp_success(instr.get()); + + return true; + } + } + + return false; +} + + +void apply_sgprs(opt_ctx &ctx, aco_ptr<Instruction>& instr) +{ + /* apply sgprs */ + uint32_t sgpr_idx = 0; + uint32_t sgpr_info_id = 0; + bool has_sgpr = false; + uint32_t sgpr_ssa_id = 0; + /* find 'best' possible sgpr */ + for (unsigned i = 0; i < instr->operands.size(); i++) + { + if (instr->operands[i].isLiteral()) { + has_sgpr = true; + break; + } + if (!instr->operands[i].isTemp()) + continue; + if (instr->operands[i].getTemp().type() == RegType::sgpr) { + has_sgpr = true; + sgpr_ssa_id = instr->operands[i].tempId(); + continue; + } + ssa_info& info = ctx.info[instr->operands[i].tempId()]; + if (info.is_temp() && info.temp.type() == RegType::sgpr) { + uint16_t uses = ctx.uses[instr->operands[i].tempId()]; + if (sgpr_info_id == 0 || uses < ctx.uses[sgpr_info_id]) { + sgpr_idx = i; + sgpr_info_id = instr->operands[i].tempId(); + } + } + } + if (!has_sgpr && sgpr_info_id != 0) { + ssa_info& info = ctx.info[sgpr_info_id]; + if (sgpr_idx == 0 || instr->isVOP3()) { + instr->operands[sgpr_idx] = Operand(info.temp); + ctx.uses[sgpr_info_id]--; + ctx.uses[info.temp.id()]++; + } else if (can_swap_operands(instr)) { + instr->operands[sgpr_idx] = instr->operands[0]; + instr->operands[0] = Operand(info.temp); + ctx.uses[sgpr_info_id]--; + ctx.uses[info.temp.id()]++; + } else if (can_use_VOP3(instr)) { + to_VOP3(ctx, instr); + 
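+         /* As VOP3, source slots other than src0 may also hold an SGPR, so the
+          * propagated SGPR can stay at sgpr_idx instead of being swapped into
+          * src0; has_sgpr is false on this path, so the instruction still reads
+          * at most one SGPR, which the VOP3 encoding allows. */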
instr->operands[sgpr_idx] = Operand(info.temp); + ctx.uses[sgpr_info_id]--; + ctx.uses[info.temp.id()]++; + } + + /* we can have two sgprs on one instruction if it is the same sgpr! */ + } else if (sgpr_info_id != 0 && + sgpr_ssa_id == sgpr_info_id && + ctx.uses[sgpr_info_id] == 1 && + can_use_VOP3(instr)) { + to_VOP3(ctx, instr); + instr->operands[sgpr_idx] = Operand(ctx.info[sgpr_info_id].temp); + ctx.uses[sgpr_info_id]--; + ctx.uses[ctx.info[sgpr_info_id].temp.id()]++; + } +} + +bool apply_omod_clamp(opt_ctx &ctx, aco_ptr<Instruction>& instr) +{ + /* check if we could apply omod on predecessor */ + if (instr->opcode == aco_opcode::v_mul_f32) { + if (instr->operands[1].isTemp() && ctx.info[instr->operands[1].tempId()].is_omod_success()) { + + /* omod was successfully applied */ + /* if the omod instruction is v_mad, we also have to change the original add */ + if (ctx.info[instr->operands[1].tempId()].is_mad()) { + Instruction* add_instr = ctx.mad_infos[ctx.info[instr->operands[1].tempId()].val].add_instr.get(); + if (ctx.info[instr->definitions[0].tempId()].is_clamp()) + static_cast<VOP3A_instruction*>(add_instr)->clamp = true; + add_instr->definitions[0] = instr->definitions[0]; + } + + Instruction* omod_instr = ctx.info[instr->operands[1].tempId()].instr; + /* check if we have an additional clamp modifier */ + if (ctx.info[instr->definitions[0].tempId()].is_clamp() && ctx.uses[instr->definitions[0].tempId()] == 1) { + static_cast<VOP3A_instruction*>(omod_instr)->clamp = true; + ctx.info[instr->definitions[0].tempId()].set_clamp_success(omod_instr); + } + /* change definition ssa-id of modified instruction */ + omod_instr->definitions[0] = instr->definitions[0]; + + /* change the definition of instr to something unused, e.g. the original omod def */ + instr->definitions[0] = Definition(instr->operands[1].getTemp()); + ctx.uses[instr->definitions[0].tempId()] = 0; + return true; + } + if (!ctx.info[instr->definitions[0].tempId()].label) { + /* in all other cases, label this instruction as option for multiply-add */ + ctx.info[instr->definitions[0].tempId()].set_mul(instr.get()); + } + } + + /* check if we could apply clamp on predecessor */ + if (instr->opcode == aco_opcode::v_med3_f32) { + unsigned idx = 0; + bool found_zero = false, found_one = false; + for (unsigned i = 0; i < 3; i++) + { + if (instr->operands[i].constantEquals(0)) + found_zero = true; + else if (instr->operands[i].constantEquals(0x3f800000)) /* 1.0 */ + found_one = true; + else + idx = i; + } + if (found_zero && found_one && instr->operands[idx].isTemp() && + ctx.info[instr->operands[idx].tempId()].is_clamp_success()) { + /* clamp was successfully applied */ + /* if the clamp instruction is v_mad, we also have to change the original add */ + if (ctx.info[instr->operands[idx].tempId()].is_mad()) { + Instruction* add_instr = ctx.mad_infos[ctx.info[instr->operands[idx].tempId()].val].add_instr.get(); + add_instr->definitions[0] = instr->definitions[0]; + } + Instruction* clamp_instr = ctx.info[instr->operands[idx].tempId()].instr; + /* change definition ssa-id of modified instruction */ + clamp_instr->definitions[0] = instr->definitions[0]; + + /* change the definition of instr to something unused, e.g. 
the original omod def */ + instr->definitions[0] = Definition(instr->operands[idx].getTemp()); + ctx.uses[instr->definitions[0].tempId()] = 0; + return true; + } + } + + /* apply omod / clamp modifiers if the def is used only once and the instruction can have modifiers */ + if (!instr->definitions.empty() && ctx.uses[instr->definitions[0].tempId()] == 1 && + can_use_VOP3(instr) && instr_info.can_use_output_modifiers[(int)instr->opcode]) { + if(ctx.info[instr->definitions[0].tempId()].is_omod2()) { + to_VOP3(ctx, instr); + static_cast<VOP3A_instruction*>(instr.get())->omod = 1; + ctx.info[instr->definitions[0].tempId()].set_omod_success(instr.get()); + } else if (ctx.info[instr->definitions[0].tempId()].is_omod4()) { + to_VOP3(ctx, instr); + static_cast<VOP3A_instruction*>(instr.get())->omod = 2; + ctx.info[instr->definitions[0].tempId()].set_omod_success(instr.get()); + } else if (ctx.info[instr->definitions[0].tempId()].is_omod5()) { + to_VOP3(ctx, instr); + static_cast<VOP3A_instruction*>(instr.get())->omod = 3; + ctx.info[instr->definitions[0].tempId()].set_omod_success(instr.get()); + } else if (ctx.info[instr->definitions[0].tempId()].is_clamp()) { + to_VOP3(ctx, instr); + static_cast<VOP3A_instruction*>(instr.get())->clamp = true; + ctx.info[instr->definitions[0].tempId()].set_clamp_success(instr.get()); + } + } + + return false; +} + +// TODO: we could possibly move the whole label_instruction pass to combine_instruction: +// this would mean that we'd have to fix the instruction uses while value propagation + +void combine_instruction(opt_ctx &ctx, aco_ptr<Instruction>& instr) +{ + if (instr->definitions.empty() || !ctx.uses[instr->definitions[0].tempId()]) + return; + + if (instr->isVALU()) { + apply_sgprs(ctx, instr); + if (apply_omod_clamp(ctx, instr)) + return; + } + + /* TODO: There are still some peephole optimizations that could be done: + * - abs(a - b) -> s_absdiff_i32 + * - various patterns for s_bitcmp{0,1}_b32 and s_bitset{0,1}_b32 + * - patterns for v_alignbit_b32 and v_alignbyte_b32 + * These aren't probably too interesting though. + * There are also patterns for v_cmp_class_f{16,32,64}. This is difficult but + * probably more useful than the previously mentioned optimizations. + * The various comparison optimizations also currently only work with 32-bit + * floats. 
*/ + + /* neg(mul(a, b)) -> mul(neg(a), b) */ + if (ctx.info[instr->definitions[0].tempId()].is_neg() && ctx.uses[instr->operands[1].tempId()] == 1) { + Temp val = ctx.info[instr->definitions[0].tempId()].temp; + + if (!ctx.info[val.id()].is_mul()) + return; + + Instruction* mul_instr = ctx.info[val.id()].instr; + + if (mul_instr->operands[0].isLiteral()) + return; + if (mul_instr->isVOP3() && static_cast<VOP3A_instruction*>(mul_instr)->clamp) + return; + + /* convert to mul(neg(a), b) */ + ctx.uses[mul_instr->definitions[0].tempId()]--; + Definition def = instr->definitions[0]; + /* neg(abs(mul(a, b))) -> mul(neg(abs(a)), abs(b)) */ + bool is_abs = ctx.info[instr->definitions[0].tempId()].is_abs(); + instr.reset(create_instruction<VOP3A_instruction>(aco_opcode::v_mul_f32, asVOP3(Format::VOP2), 2, 1)); + instr->operands[0] = mul_instr->operands[0]; + instr->operands[1] = mul_instr->operands[1]; + instr->definitions[0] = def; + VOP3A_instruction* new_mul = static_cast<VOP3A_instruction*>(instr.get()); + if (mul_instr->isVOP3()) { + VOP3A_instruction* mul = static_cast<VOP3A_instruction*>(mul_instr); + new_mul->neg[0] = mul->neg[0] && !is_abs; + new_mul->neg[1] = mul->neg[1] && !is_abs; + new_mul->abs[0] = mul->abs[0] || is_abs; + new_mul->abs[1] = mul->abs[1] || is_abs; + new_mul->omod = mul->omod; + } + new_mul->neg[0] ^= true; + new_mul->clamp = false; + + ctx.info[instr->definitions[0].tempId()].set_mul(instr.get()); + return; + } + /* combine mul+add -> mad */ + else if (instr->opcode == aco_opcode::v_add_f32 || + instr->opcode == aco_opcode::v_sub_f32 || + instr->opcode == aco_opcode::v_subrev_f32) { + + uint32_t uses_src0 = UINT32_MAX; + uint32_t uses_src1 = UINT32_MAX; + Instruction* mul_instr = nullptr; + unsigned add_op_idx; + /* check if any of the operands is a multiplication */ + if (instr->operands[0].isTemp() && ctx.info[instr->operands[0].tempId()].is_mul()) + uses_src0 = ctx.uses[instr->operands[0].tempId()]; + if (instr->operands[1].isTemp() && ctx.info[instr->operands[1].tempId()].is_mul()) + uses_src1 = ctx.uses[instr->operands[1].tempId()]; + + /* find the 'best' mul instruction to combine with the add */ + if (uses_src0 < uses_src1) { + mul_instr = ctx.info[instr->operands[0].tempId()].instr; + add_op_idx = 1; + } else if (uses_src1 < uses_src0) { + mul_instr = ctx.info[instr->operands[1].tempId()].instr; + add_op_idx = 0; + } else if (uses_src0 != UINT32_MAX) { + /* tiebreaker: quite random what to pick */ + if (ctx.info[instr->operands[0].tempId()].instr->operands[0].isLiteral()) { + mul_instr = ctx.info[instr->operands[1].tempId()].instr; + add_op_idx = 0; + } else { + mul_instr = ctx.info[instr->operands[0].tempId()].instr; + add_op_idx = 1; + } + } + if (mul_instr) { + Operand op[3] = {Operand(v1), Operand(v1), Operand(v1)}; + bool neg[3] = {false, false, false}; + bool abs[3] = {false, false, false}; + unsigned omod = 0; + bool clamp = false; + bool need_vop3 = false; + int num_sgpr = 0; + op[0] = mul_instr->operands[0]; + op[1] = mul_instr->operands[1]; + op[2] = instr->operands[add_op_idx]; + for (unsigned i = 0; i < 3; i++) + { + if (op[i].isLiteral()) + return; + if (op[i].isTemp() && op[i].getTemp().type() == RegType::sgpr) + num_sgpr++; + if (!(i == 0 || (op[i].isTemp() && op[i].getTemp().type() == RegType::vgpr))) + need_vop3 = true; + } + // TODO: would be better to check this before selecting a mul instr? 
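+         /* Roughly why these checks exist: v_mad_f32 uses the VOP3 encoding, which
+          * cannot carry a 32-bit literal and can read at most one SGPR, hence the
+          * literal bail-out above and the num_sgpr check below. */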
+ if (num_sgpr > 1) + return; + + if (mul_instr->isVOP3()) { + VOP3A_instruction* vop3 = static_cast<VOP3A_instruction*> (mul_instr); + neg[0] = vop3->neg[0]; + neg[1] = vop3->neg[1]; + abs[0] = vop3->abs[0]; + abs[1] = vop3->abs[1]; + need_vop3 = true; + /* we cannot use these modifiers between mul and add */ + if (vop3->clamp || vop3->omod) + return; + } + + /* convert to mad */ + ctx.uses[mul_instr->definitions[0].tempId()]--; + if (ctx.uses[mul_instr->definitions[0].tempId()]) { + if (op[0].isTemp()) + ctx.uses[op[0].tempId()]++; + if (op[1].isTemp()) + ctx.uses[op[1].tempId()]++; + } + + if (instr->isVOP3()) { + VOP3A_instruction* vop3 = static_cast<VOP3A_instruction*> (instr.get()); + neg[2] = vop3->neg[add_op_idx]; + abs[2] = vop3->abs[add_op_idx]; + omod = vop3->omod; + clamp = vop3->clamp; + /* abs of the multiplication result */ + if (vop3->abs[1 - add_op_idx]) { + neg[0] = false; + neg[1] = false; + abs[0] = true; + abs[1] = true; + } + /* neg of the multiplication result */ + neg[1] = neg[1] ^ vop3->neg[1 - add_op_idx]; + need_vop3 = true; + } + if (instr->opcode == aco_opcode::v_sub_f32) { + neg[1 + add_op_idx] = neg[1 + add_op_idx] ^ true; + need_vop3 = true; + } else if (instr->opcode == aco_opcode::v_subrev_f32) { + neg[2 - add_op_idx] = neg[2 - add_op_idx] ^ true; + need_vop3 = true; + } + + aco_ptr<VOP3A_instruction> mad{create_instruction<VOP3A_instruction>(aco_opcode::v_mad_f32, Format::VOP3A, 3, 1)}; + for (unsigned i = 0; i < 3; i++) + { + mad->operands[i] = op[i]; + mad->neg[i] = neg[i]; + mad->abs[i] = abs[i]; + } + mad->omod = omod; + mad->clamp = clamp; + mad->definitions[0] = instr->definitions[0]; + + /* mark this ssa_def to be re-checked for profitability and literals */ + ctx.mad_infos.emplace_back(std::move(instr), mul_instr->definitions[0].tempId(), need_vop3); + ctx.info[mad->definitions[0].tempId()].set_mad(mad.get(), ctx.mad_infos.size() - 1); + instr.reset(mad.release()); + return; + } + } + /* v_mul_f32(v_cndmask_b32(0, 1.0, cond), a) -> v_cndmask_b32(0, a, cond) */ + else if (instr->opcode == aco_opcode::v_mul_f32 && !instr->isVOP3()) { + for (unsigned i = 0; i < 2; i++) { + if (instr->operands[i].isTemp() && ctx.info[instr->operands[i].tempId()].is_b2f() && + ctx.uses[instr->operands[i].tempId()] == 1 && + instr->operands[!i].isTemp() && instr->operands[!i].getTemp().type() == RegType::vgpr) { + ctx.uses[instr->operands[i].tempId()]--; + ctx.uses[ctx.info[instr->operands[i].tempId()].temp.id()]++; + + aco_ptr<VOP2_instruction> new_instr{create_instruction<VOP2_instruction>(aco_opcode::v_cndmask_b32, Format::VOP2, 3, 1)}; + new_instr->operands[0] = Operand(0u); + new_instr->operands[1] = instr->operands[!i]; + new_instr->operands[2] = Operand(ctx.info[instr->operands[i].tempId()].temp); + new_instr->definitions[0] = instr->definitions[0]; + instr.reset(new_instr.release()); + ctx.info[instr->definitions[0].tempId()].label = 0; + return; + } + } + } else if (instr->opcode == aco_opcode::v_or_b32 && ctx.program->chip_class >= GFX9) { + if (combine_three_valu_op(ctx, instr, aco_opcode::v_or_b32, aco_opcode::v_or3_b32, "012", 1 | 2)) ; + else if (combine_three_valu_op(ctx, instr, aco_opcode::v_and_b32, aco_opcode::v_and_or_b32, "120", 1 | 2)) ; + else combine_three_valu_op(ctx, instr, aco_opcode::v_lshlrev_b32, aco_opcode::v_lshl_or_b32, "210", 1 | 2); + } else if (instr->opcode == aco_opcode::v_add_u32 && ctx.program->chip_class >= GFX9) { + if (combine_three_valu_op(ctx, instr, aco_opcode::v_xor_b32, aco_opcode::v_xad_u32, "120", 1 | 2)) ; + else if 
(combine_three_valu_op(ctx, instr, aco_opcode::v_add_u32, aco_opcode::v_add3_u32, "012", 1 | 2)) ; + else combine_three_valu_op(ctx, instr, aco_opcode::v_lshlrev_b32, aco_opcode::v_lshl_add_u32, "210", 1 | 2); + } else if (instr->opcode == aco_opcode::v_lshlrev_b32 && ctx.program->chip_class >= GFX9) { + combine_three_valu_op(ctx, instr, aco_opcode::v_add_u32, aco_opcode::v_add_lshl_u32, "120", 2); + } else if ((instr->opcode == aco_opcode::s_add_u32 || instr->opcode == aco_opcode::s_add_i32) && ctx.program->chip_class >= GFX9) { + combine_salu_lshl_add(ctx, instr); + } else if (instr->opcode == aco_opcode::s_not_b32) { + combine_salu_not_bitwise(ctx, instr); + } else if (instr->opcode == aco_opcode::s_not_b64) { + if (combine_inverse_comparison(ctx, instr)) ; + else combine_salu_not_bitwise(ctx, instr); + } else if (instr->opcode == aco_opcode::s_and_b32 || instr->opcode == aco_opcode::s_or_b32) { + combine_salu_n2(ctx, instr); + } else if (instr->opcode == aco_opcode::s_and_b64 || instr->opcode == aco_opcode::s_or_b64) { + if (combine_ordering_test(ctx, instr)) ; + else if (combine_comparison_ordering(ctx, instr)) ; + else if (combine_constant_comparison_ordering(ctx, instr)) ; + else combine_salu_n2(ctx, instr); + } else { + aco_opcode min, max, min3, max3, med3; + bool some_gfx9_only; + if (get_minmax_info(instr->opcode, &min, &max, &min3, &max3, &med3, &some_gfx9_only) && + (!some_gfx9_only || ctx.program->chip_class >= GFX9)) { + if (combine_minmax3(ctx, instr, instr->opcode == min ? min3 : max3)) ; + else combine_clamp(ctx, instr, min, max, med3); + } + } +} + + +void select_instruction(opt_ctx &ctx, aco_ptr<Instruction>& instr) +{ + const uint32_t threshold = 4; + + /* Dead Code Elimination: + * We remove instructions if they define temporaries which all are unused */ + const bool is_used = instr->definitions.empty() || + std::any_of(instr->definitions.begin(), instr->definitions.end(), + [&ctx](const Definition& def) { return ctx.uses[def.tempId()]; }); + if (!is_used) { + instr.reset(); + return; + } + + /* convert split_vector into extract_vector if only one definition is ever used */ + if (instr->opcode == aco_opcode::p_split_vector) { + unsigned num_used = 0; + unsigned idx = 0; + for (unsigned i = 0; i < instr->definitions.size(); i++) { + if (ctx.uses[instr->definitions[i].tempId()]) { + num_used++; + idx = i; + } + } + if (num_used == 1) { + aco_ptr<Pseudo_instruction> extract{create_instruction<Pseudo_instruction>(aco_opcode::p_extract_vector, Format::PSEUDO, 2, 1)}; + extract->operands[0] = instr->operands[0]; + extract->operands[1] = Operand((uint32_t) idx); + extract->definitions[0] = instr->definitions[idx]; + instr.reset(extract.release()); + } + } + + /* re-check mad instructions */ + if (instr->opcode == aco_opcode::v_mad_f32 && ctx.info[instr->definitions[0].tempId()].is_mad()) { + mad_info* info = &ctx.mad_infos[ctx.info[instr->definitions[0].tempId()].val]; + /* first, check profitability */ + if (ctx.uses[info->mul_temp_id]) { + ctx.uses[info->mul_temp_id]++; + instr.swap(info->add_instr); + + /* second, check possible literals */ + } else if (!info->needs_vop3) { + uint32_t literal_idx = 0; + uint32_t literal_uses = UINT32_MAX; + for (unsigned i = 0; i < instr->operands.size(); i++) + { + if (!instr->operands[i].isTemp()) + continue; + /* if one of the operands is sgpr, we cannot add a literal somewhere else */ + if (instr->operands[i].getTemp().type() == RegType::sgpr) { + if (ctx.info[instr->operands[i].tempId()].is_literal()) { + literal_uses = 
ctx.uses[instr->operands[i].tempId()]; + literal_idx = i; + } else { + literal_uses = UINT32_MAX; + } + break; + } + else if (ctx.info[instr->operands[i].tempId()].is_literal() && + ctx.uses[instr->operands[i].tempId()] < literal_uses) { + literal_uses = ctx.uses[instr->operands[i].tempId()]; + literal_idx = i; + } + } + if (literal_uses < threshold) { + ctx.uses[instr->operands[literal_idx].tempId()]--; + info->check_literal = true; + info->literal_idx = literal_idx; + } + } + return; + } + + /* check for literals */ + /* we do not apply the literals yet as we don't know if it is profitable */ + if (instr->isSALU()) { + uint32_t literal_idx = 0; + uint32_t literal_uses = UINT32_MAX; + bool has_literal = false; + for (unsigned i = 0; i < instr->operands.size(); i++) + { + if (instr->operands[i].isLiteral()) { + has_literal = true; + break; + } + if (!instr->operands[i].isTemp()) + continue; + if (ctx.info[instr->operands[i].tempId()].is_literal() && + ctx.uses[instr->operands[i].tempId()] < literal_uses) { + literal_uses = ctx.uses[instr->operands[i].tempId()]; + literal_idx = i; + } + } + if (!has_literal && literal_uses < threshold) { + ctx.uses[instr->operands[literal_idx].tempId()]--; + if (ctx.uses[instr->operands[literal_idx].tempId()] == 0) + instr->operands[literal_idx] = Operand(ctx.info[instr->operands[literal_idx].tempId()].val); + } + } else if (instr->isVALU() && valu_can_accept_literal(ctx, instr) && + instr->operands[0].isTemp() && + ctx.info[instr->operands[0].tempId()].is_literal() && + ctx.uses[instr->operands[0].tempId()] < threshold) { + ctx.uses[instr->operands[0].tempId()]--; + if (ctx.uses[instr->operands[0].tempId()] == 0) + instr->operands[0] = Operand(ctx.info[instr->operands[0].tempId()].val); + } + +} + + +void apply_literals(opt_ctx &ctx, aco_ptr<Instruction>& instr) +{ + /* Cleanup Dead Instructions */ + if (!instr) + return; + + /* apply literals on SALU */ + if (instr->isSALU()) { + for (Operand& op : instr->operands) { + if (!op.isTemp()) + continue; + if (op.isLiteral()) + break; + if (ctx.info[op.tempId()].is_literal() && + ctx.uses[op.tempId()] == 0) + op = Operand(ctx.info[op.tempId()].val); + } + } + + /* apply literals on VALU */ + else if (instr->isVALU() && !instr->isVOP3() && + instr->operands[0].isTemp() && + ctx.info[instr->operands[0].tempId()].is_literal() && + ctx.uses[instr->operands[0].tempId()] == 0) { + instr->operands[0] = Operand(ctx.info[instr->operands[0].tempId()].val); + } + + /* apply literals on MAD */ + else if (instr->opcode == aco_opcode::v_mad_f32 && ctx.info[instr->definitions[0].tempId()].is_mad()) { + mad_info* info = &ctx.mad_infos[ctx.info[instr->definitions[0].tempId()].val]; + aco_ptr<Instruction> new_mad; + if (info->check_literal && ctx.uses[instr->operands[info->literal_idx].tempId()] == 0) { + if (info->literal_idx == 2) { /* add literal -> madak */ + new_mad.reset(create_instruction<VOP2_instruction>(aco_opcode::v_madak_f32, Format::VOP2, 3, 1)); + new_mad->operands[0] = instr->operands[0]; + new_mad->operands[1] = instr->operands[1]; + } else { /* mul literal -> madmk */ + new_mad.reset(create_instruction<VOP2_instruction>(aco_opcode::v_madmk_f32, Format::VOP2, 3, 1)); + new_mad->operands[0] = instr->operands[1 - info->literal_idx]; + new_mad->operands[1] = instr->operands[2]; + } + new_mad->operands[2] = Operand(ctx.info[instr->operands[info->literal_idx].tempId()].val); + new_mad->definitions[0] = instr->definitions[0]; + instr.swap(new_mad); + } + } + + ctx.instructions.emplace_back(std::move(instr)); +} + + 
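+/* Illustrative example of the literal folding in apply_literals() above
+ * (temp names and the literal value made up):
+ *   v1: %d = v_mad_f32 %a, %b, %k        with %k known to be the literal 0x40490fdb
+ * becomes, once %k has no remaining uses,
+ *   v1: %d = v_madak_f32 %a, %b, 0x40490fdb
+ * while a literal in one of the two multiplied operands yields v_madmk_f32 instead. */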
+void optimize(Program* program) +{ + opt_ctx ctx; + ctx.program = program; + std::vector<ssa_info> info(program->peekAllocationId()); + ctx.info = info.data(); + + /* 1. Bottom-Up DAG pass (forward) to label all ssa-defs */ + for (Block& block : program->blocks) { + for (aco_ptr<Instruction>& instr : block.instructions) + label_instruction(ctx, instr); + } + + ctx.uses = std::move(dead_code_analysis(program)); + + /* 2. Combine v_mad, omod, clamp and propagate sgpr on VALU instructions */ + for (Block& block : program->blocks) { + for (aco_ptr<Instruction>& instr : block.instructions) + combine_instruction(ctx, instr); + } + + /* 3. Top-Down DAG pass (backward) to select instructions (includes DCE) */ + for (std::vector<Block>::reverse_iterator it = program->blocks.rbegin(); it != program->blocks.rend(); ++it) { + Block* block = &(*it); + for (std::vector<aco_ptr<Instruction>>::reverse_iterator it = block->instructions.rbegin(); it != block->instructions.rend(); ++it) + select_instruction(ctx, *it); + } + + /* 4. Add literals to instructions */ + for (Block& block : program->blocks) { + ctx.instructions.clear(); + for (aco_ptr<Instruction>& instr : block.instructions) + apply_literals(ctx, instr); + block.instructions.swap(ctx.instructions); + } + +} + +} diff --git a/src/amd/compiler/aco_print_asm.cpp b/src/amd/compiler/aco_print_asm.cpp new file mode 100644 index 00000000000..31079aa1c4a --- /dev/null +++ b/src/amd/compiler/aco_print_asm.cpp @@ -0,0 +1,104 @@ + +#include <iomanip> +#include "aco_ir.h" +#include "llvm-c/Disassembler.h" +#include "ac_llvm_util.h" + +#include <llvm/ADT/StringRef.h> + +namespace aco { + +void print_asm(Program *program, std::vector<uint32_t>& binary, + unsigned exec_size, enum radeon_family family, std::ostream& out) +{ + std::vector<bool> referenced_blocks(program->blocks.size()); + referenced_blocks[0] = true; + for (Block& block : program->blocks) { + for (unsigned succ : block.linear_succs) + referenced_blocks[succ] = true; + } + + std::vector<std::tuple<uint64_t, llvm::StringRef, uint8_t>> symbols; + std::vector<std::array<char,16>> block_names; + block_names.reserve(program->blocks.size()); + for (Block& block : program->blocks) { + if (!referenced_blocks[block.index]) + continue; + std::array<char, 16> name; + sprintf(name.data(), "BB%u", block.index); + block_names.push_back(name); + symbols.emplace_back(block.offset * 4, llvm::StringRef(block_names[block_names.size() - 1].data()), 0); + } + + LLVMDisasmContextRef disasm = LLVMCreateDisasmCPU("amdgcn-mesa-mesa3d", + ac_get_llvm_processor_name(family), + &symbols, 0, NULL, NULL); + + char outline[1024]; + size_t pos = 0; + bool invalid = false; + unsigned next_block = 0; + while (pos < exec_size) { + while (next_block < program->blocks.size() && pos == program->blocks[next_block].offset) { + if (referenced_blocks[next_block]) + out << "BB" << std::dec << next_block << ":" << std::endl; + next_block++; + } + + size_t l = LLVMDisasmInstruction(disasm, (uint8_t *) &binary[pos], + (exec_size - pos) * sizeof(uint32_t), pos * 4, + outline, sizeof(outline)); + + size_t new_pos; + const int align_width = 60; + if (program->chip_class == GFX9 && !l && ((binary[pos] & 0xffff8000) == 0xd1348000)) { /* not actually an invalid instruction */ + out << std::left << std::setw(align_width) << std::setfill(' ') << "\tv_add_u32_e64 + clamp"; + new_pos = pos + 2; + } else if (!l) { + out << std::left << std::setw(align_width) << std::setfill(' ') << "(invalid instruction)"; + new_pos = pos + 1; + invalid = true; + } 
else { + out << std::left << std::setw(align_width) << std::setfill(' ') << outline; + assert(l % 4 == 0); + new_pos = pos + l / 4; + } + out << std::right; + + out << " ;"; + for (; pos < new_pos; pos++) + out << " " << std::setfill('0') << std::setw(8) << std::hex << binary[pos]; + out << std::endl; + } + out << std::setfill(' ') << std::setw(0) << std::dec; + assert(next_block == program->blocks.size()); + + LLVMDisasmDispose(disasm); + + if (program->constant_data.size()) { + out << std::endl << "/* constant data */" << std::endl; + for (unsigned i = 0; i < program->constant_data.size(); i += 32) { + out << '[' << std::setw(6) << std::setfill('0') << std::dec << i << ']'; + unsigned line_size = std::min<size_t>(program->constant_data.size() - i, 32); + for (unsigned j = 0; j < line_size; j += 4) { + unsigned size = std::min<size_t>(program->constant_data.size() - (i + j), 4); + uint32_t v = 0; + memcpy(&v, &program->constant_data[i + j], size); + out << " " << std::setw(8) << std::setfill('0') << std::hex << v; + } + out << std::endl; + } + } + + out << std::setfill(' ') << std::setw(0) << std::dec; + + if (invalid) { + /* Invalid instructions usually lead to GPU hangs, which can make + * getting the actual invalid instruction hard. Abort here so that we + * can find the problem. + */ + abort(); + } +} + +} diff --git a/src/amd/compiler/aco_print_ir.cpp b/src/amd/compiler/aco_print_ir.cpp new file mode 100644 index 00000000000..517ddc235ce --- /dev/null +++ b/src/amd/compiler/aco_print_ir.cpp @@ -0,0 +1,575 @@ +#include "aco_ir.h" +#include "aco_builder.h" + +#include "sid.h" + +namespace aco { + +static const char *reduce_ops[] = { + [iadd32] = "iadd32", + [iadd64] = "iadd64", + [imul32] = "imul32", + [imul64] = "imul64", + [fadd32] = "fadd32", + [fadd64] = "fadd64", + [fmul32] = "fmul32", + [fmul64] = "fmul64", + [imin32] = "imin32", + [imin64] = "imin64", + [imax32] = "imax32", + [imax64] = "imax64", + [umin32] = "umin32", + [umin64] = "umin64", + [umax32] = "umax32", + [umax64] = "umax64", + [fmin32] = "fmin32", + [fmin64] = "fmin64", + [fmax32] = "fmax32", + [fmax64] = "fmax64", + [iand32] = "iand32", + [iand64] = "iand64", + [ior32] = "ior32", + [ior64] = "ior64", + [ixor32] = "ixor32", + [ixor64] = "ixor64", +}; + +static void print_reg_class(const RegClass rc, FILE *output) +{ + switch (rc) { + case RegClass::s1: fprintf(output, " s1: "); return; + case RegClass::s2: fprintf(output, " s2: "); return; + case RegClass::s3: fprintf(output, " s3: "); return; + case RegClass::s4: fprintf(output, " s4: "); return; + case RegClass::s6: fprintf(output, " s6: "); return; + case RegClass::s8: fprintf(output, " s8: "); return; + case RegClass::s16: fprintf(output, "s16: "); return; + case RegClass::v1: fprintf(output, " v1: "); return; + case RegClass::v2: fprintf(output, " v2: "); return; + case RegClass::v3: fprintf(output, " v3: "); return; + case RegClass::v4: fprintf(output, " v4: "); return; + case RegClass::v5: fprintf(output, " v5: "); return; + case RegClass::v6: fprintf(output, " v6: "); return; + case RegClass::v7: fprintf(output, " v7: "); return; + case RegClass::v8: fprintf(output, " v8: "); return; + case RegClass::v1_linear: fprintf(output, " v1: "); return; + case RegClass::v2_linear: fprintf(output, " v2: "); return; + } +} + +void print_physReg(unsigned reg, unsigned size, FILE *output) +{ + if (reg == 124) { + fprintf(output, ":m0"); + } else if (reg == 106) { + fprintf(output, ":vcc"); + } else if (reg == 253) { + fprintf(output, ":scc"); + } else if (reg == 126) { + 
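+      /* 126 is exec (exec_lo) in this numbering; m0 (124), vcc (106) and scc (253)
+       * are handled above, everything else is printed as a plain s/v register range. */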
fprintf(output, ":exec"); + } else { + bool is_vgpr = reg / 256; + reg = reg % 256; + fprintf(output, ":%c[%d", is_vgpr ? 'v' : 's', reg); + if (size > 1) + fprintf(output, "-%d]", reg + size -1); + else + fprintf(output, "]"); + } +} + +static void print_constant(uint8_t reg, FILE *output) +{ + if (reg >= 128 && reg <= 192) { + fprintf(output, "%d", reg - 128); + return; + } else if (reg >= 192 && reg <= 208) { + fprintf(output, "%d", 192 - reg); + return; + } + + switch (reg) { + case 240: + fprintf(output, "0.5"); + break; + case 241: + fprintf(output, "-0.5"); + break; + case 242: + fprintf(output, "1.0"); + break; + case 243: + fprintf(output, "-1.0"); + break; + case 244: + fprintf(output, "2.0"); + break; + case 245: + fprintf(output, "-2.0"); + break; + case 246: + fprintf(output, "4.0"); + break; + case 247: + fprintf(output, "-4.0"); + break; + case 248: + fprintf(output, "1/(2*PI)"); + break; + } +} + +static void print_operand(const Operand *operand, FILE *output) +{ + if (operand->isLiteral()) { + fprintf(output, "0x%x", operand->constantValue()); + } else if (operand->isConstant()) { + print_constant(operand->physReg().reg, output); + } else if (operand->isUndefined()) { + print_reg_class(operand->regClass(), output); + fprintf(output, "undef"); + } else { + fprintf(output, "%%%d", operand->tempId()); + + if (operand->isFixed()) + print_physReg(operand->physReg(), operand->size(), output); + } +} + +static void print_definition(const Definition *definition, FILE *output) +{ + print_reg_class(definition->regClass(), output); + fprintf(output, "%%%d", definition->tempId()); + + if (definition->isFixed()) + print_physReg(definition->physReg(), definition->size(), output); +} + +static void print_barrier_reorder(bool can_reorder, barrier_interaction barrier, FILE *output) +{ + if (can_reorder) + fprintf(output, " reorder"); + + if (barrier & barrier_buffer) + fprintf(output, " buffer"); + if (barrier & barrier_image) + fprintf(output, " image"); + if (barrier & barrier_atomic) + fprintf(output, " atomic"); + if (barrier & barrier_shared) + fprintf(output, " shared"); +} + +static void print_instr_format_specific(struct Instruction *instr, FILE *output) +{ + switch (instr->format) { + case Format::SOPK: { + SOPK_instruction* sopk = static_cast<SOPK_instruction*>(instr); + fprintf(output, " imm:%d", sopk->imm & 0x8000 ? (sopk->imm - 65536) : sopk->imm); + break; + } + case Format::SOPP: { + SOPP_instruction* sopp = static_cast<SOPP_instruction*>(instr); + uint16_t imm = sopp->imm; + switch (instr->opcode) { + case aco_opcode::s_waitcnt: { + /* we usually should check the chip class for vmcnt/lgkm, but + * insert_waitcnt() should fill it in regardless. 
*/ + unsigned vmcnt = (imm & 0xF) | ((imm & (0x3 << 14)) >> 10); + if (vmcnt != 63) fprintf(output, " vmcnt(%d)", vmcnt); + if (((imm >> 4) & 0x7) < 0x7) fprintf(output, " expcnt(%d)", (imm >> 4) & 0x7); + if (((imm >> 8) & 0x3F) < 0x3F) fprintf(output, " lgkmcnt(%d)", (imm >> 8) & 0x3F); + break; + } + case aco_opcode::s_endpgm: + case aco_opcode::s_endpgm_saved: + case aco_opcode::s_endpgm_ordered_ps_done: + case aco_opcode::s_wakeup: + case aco_opcode::s_barrier: + case aco_opcode::s_icache_inv: + case aco_opcode::s_ttracedata: + case aco_opcode::s_set_gpr_idx_off: { + break; + } + default: { + if (imm) + fprintf(output, " imm:%u", imm); + break; + } + } + if (sopp->block != -1) + fprintf(output, " block:BB%d", sopp->block); + break; + } + case Format::SMEM: { + SMEM_instruction* smem = static_cast<SMEM_instruction*>(instr); + if (smem->glc) + fprintf(output, " glc"); + if (smem->nv) + fprintf(output, " nv"); + print_barrier_reorder(smem->can_reorder, smem->barrier, output); + break; + } + case Format::VINTRP: { + Interp_instruction* vintrp = static_cast<Interp_instruction*>(instr); + fprintf(output, " attr%d.%c", vintrp->attribute, "xyzw"[vintrp->component]); + break; + } + case Format::DS: { + DS_instruction* ds = static_cast<DS_instruction*>(instr); + if (ds->offset0) + fprintf(output, " offset0:%u", ds->offset0); + if (ds->offset1) + fprintf(output, " offset1:%u", ds->offset1); + if (ds->gds) + fprintf(output, " gds"); + break; + } + case Format::MUBUF: { + MUBUF_instruction* mubuf = static_cast<MUBUF_instruction*>(instr); + if (mubuf->offset) + fprintf(output, " offset:%u", mubuf->offset); + if (mubuf->offen) + fprintf(output, " offen"); + if (mubuf->idxen) + fprintf(output, " idxen"); + if (mubuf->glc) + fprintf(output, " glc"); + if (mubuf->slc) + fprintf(output, " slc"); + if (mubuf->tfe) + fprintf(output, " tfe"); + if (mubuf->lds) + fprintf(output, " lds"); + if (mubuf->disable_wqm) + fprintf(output, " disable_wqm"); + print_barrier_reorder(mubuf->can_reorder, mubuf->barrier, output); + break; + } + case Format::MIMG: { + MIMG_instruction* mimg = static_cast<MIMG_instruction*>(instr); + unsigned identity_dmask = !instr->definitions.empty() ? + (1 << instr->definitions[0].size()) - 1 : + 0xf; + if ((mimg->dmask & identity_dmask) != identity_dmask) + fprintf(output, " dmask:%s%s%s%s", + mimg->dmask & 0x1 ? "x" : "", + mimg->dmask & 0x2 ? "y" : "", + mimg->dmask & 0x4 ? "z" : "", + mimg->dmask & 0x8 ? "w" : ""); + if (mimg->unrm) + fprintf(output, " unrm"); + if (mimg->glc) + fprintf(output, " glc"); + if (mimg->slc) + fprintf(output, " slc"); + if (mimg->tfe) + fprintf(output, " tfe"); + if (mimg->da) + fprintf(output, " da"); + if (mimg->lwe) + fprintf(output, " lwe"); + if (mimg->r128 || mimg->a16) + fprintf(output, " r128/a16"); + if (mimg->d16) + fprintf(output, " d16"); + if (mimg->disable_wqm) + fprintf(output, " disable_wqm"); + print_barrier_reorder(mimg->can_reorder, mimg->barrier, output); + break; + } + case Format::EXP: { + Export_instruction* exp = static_cast<Export_instruction*>(instr); + unsigned identity_mask = exp->compressed ? 0x5 : 0xf; + if ((exp->enabled_mask & identity_mask) != identity_mask) + fprintf(output, " en:%c%c%c%c", + exp->enabled_mask & 0x1 ? 'r' : '*', + exp->enabled_mask & 0x2 ? 'g' : '*', + exp->enabled_mask & 0x4 ? 'b' : '*', + exp->enabled_mask & 0x8 ? 
'a' : '*'); + if (exp->compressed) + fprintf(output, " compr"); + if (exp->done) + fprintf(output, " done"); + if (exp->valid_mask) + fprintf(output, " vm"); + + if (exp->dest <= V_008DFC_SQ_EXP_MRT + 7) + fprintf(output, " mrt%d", exp->dest - V_008DFC_SQ_EXP_MRT); + else if (exp->dest == V_008DFC_SQ_EXP_MRTZ) + fprintf(output, " mrtz"); + else if (exp->dest == V_008DFC_SQ_EXP_NULL) + fprintf(output, " null"); + else if (exp->dest >= V_008DFC_SQ_EXP_POS && exp->dest <= V_008DFC_SQ_EXP_POS + 3) + fprintf(output, " pos%d", exp->dest - V_008DFC_SQ_EXP_POS); + else if (exp->dest >= V_008DFC_SQ_EXP_PARAM && exp->dest <= V_008DFC_SQ_EXP_PARAM + 31) + fprintf(output, " param%d", exp->dest - V_008DFC_SQ_EXP_PARAM); + break; + } + case Format::PSEUDO_BRANCH: { + Pseudo_branch_instruction* branch = static_cast<Pseudo_branch_instruction*>(instr); + /* Note: BB0 cannot be a branch target */ + if (branch->target[0] != 0) + fprintf(output, " BB%d", branch->target[0]); + if (branch->target[1] != 0) + fprintf(output, ", BB%d", branch->target[1]); + break; + } + case Format::PSEUDO_REDUCTION: { + Pseudo_reduction_instruction* reduce = static_cast<Pseudo_reduction_instruction*>(instr); + fprintf(output, " op:%s", reduce_ops[reduce->reduce_op]); + if (reduce->cluster_size) + fprintf(output, " cluster_size:%u", reduce->cluster_size); + break; + } + case Format::FLAT: + case Format::GLOBAL: + case Format::SCRATCH: { + FLAT_instruction* flat = static_cast<FLAT_instruction*>(instr); + if (flat->offset) + fprintf(output, " offset:%u", flat->offset); + if (flat->glc) + fprintf(output, " glc"); + if (flat->slc) + fprintf(output, " slc"); + if (flat->lds) + fprintf(output, " lds"); + if (flat->nv) + fprintf(output, " nv"); + break; + } + case Format::MTBUF: { + MTBUF_instruction* mtbuf = static_cast<MTBUF_instruction*>(instr); + fprintf(output, " dfmt:"); + switch (mtbuf->dfmt) { + case V_008F0C_BUF_DATA_FORMAT_8: fprintf(output, "8"); break; + case V_008F0C_BUF_DATA_FORMAT_16: fprintf(output, "16"); break; + case V_008F0C_BUF_DATA_FORMAT_8_8: fprintf(output, "8_8"); break; + case V_008F0C_BUF_DATA_FORMAT_32: fprintf(output, "32"); break; + case V_008F0C_BUF_DATA_FORMAT_16_16: fprintf(output, "16_16"); break; + case V_008F0C_BUF_DATA_FORMAT_10_11_11: fprintf(output, "10_11_11"); break; + case V_008F0C_BUF_DATA_FORMAT_11_11_10: fprintf(output, "11_11_10"); break; + case V_008F0C_BUF_DATA_FORMAT_10_10_10_2: fprintf(output, "10_10_10_2"); break; + case V_008F0C_BUF_DATA_FORMAT_2_10_10_10: fprintf(output, "2_10_10_10"); break; + case V_008F0C_BUF_DATA_FORMAT_8_8_8_8: fprintf(output, "8_8_8_8"); break; + case V_008F0C_BUF_DATA_FORMAT_32_32: fprintf(output, "32_32"); break; + case V_008F0C_BUF_DATA_FORMAT_16_16_16_16: fprintf(output, "16_16_16_16"); break; + case V_008F0C_BUF_DATA_FORMAT_32_32_32: fprintf(output, "32_32_32"); break; + case V_008F0C_BUF_DATA_FORMAT_32_32_32_32: fprintf(output, "32_32_32_32"); break; + case V_008F0C_BUF_DATA_FORMAT_RESERVED_15: fprintf(output, "reserved15"); break; + } + fprintf(output, " nfmt:"); + switch (mtbuf->nfmt) { + case V_008F0C_BUF_NUM_FORMAT_UNORM: fprintf(output, "unorm"); break; + case V_008F0C_BUF_NUM_FORMAT_SNORM: fprintf(output, "snorm"); break; + case V_008F0C_BUF_NUM_FORMAT_USCALED: fprintf(output, "uscaled"); break; + case V_008F0C_BUF_NUM_FORMAT_SSCALED: fprintf(output, "sscaled"); break; + case V_008F0C_BUF_NUM_FORMAT_UINT: fprintf(output, "uint"); break; + case V_008F0C_BUF_NUM_FORMAT_SINT: fprintf(output, "sint"); break; + case V_008F0C_BUF_NUM_FORMAT_SNORM_OGL: 
fprintf(output, "snorm"); break; + case V_008F0C_BUF_NUM_FORMAT_FLOAT: fprintf(output, "float"); break; + } + if (mtbuf->offset) + fprintf(output, " offset:%u", mtbuf->offset); + if (mtbuf->offen) + fprintf(output, " offen"); + if (mtbuf->idxen) + fprintf(output, " idxen"); + if (mtbuf->glc) + fprintf(output, " glc"); + if (mtbuf->slc) + fprintf(output, " slc"); + if (mtbuf->tfe) + fprintf(output, " tfe"); + if (mtbuf->disable_wqm) + fprintf(output, " disable_wqm"); + print_barrier_reorder(mtbuf->can_reorder, mtbuf->barrier, output); + break; + } + default: { + break; + } + } + if (instr->isVOP3()) { + VOP3A_instruction* vop3 = static_cast<VOP3A_instruction*>(instr); + switch (vop3->omod) { + case 1: + fprintf(output, " *2"); + break; + case 2: + fprintf(output, " *4"); + break; + case 3: + fprintf(output, " *0.5"); + break; + } + if (vop3->clamp) + fprintf(output, " clamp"); + } else if (instr->isDPP()) { + DPP_instruction* dpp = static_cast<DPP_instruction*>(instr); + if (dpp->dpp_ctrl <= 0xff) { + fprintf(output, " quad_perm:[%d,%d,%d,%d]", + dpp->dpp_ctrl & 0x3, (dpp->dpp_ctrl >> 2) & 0x3, + (dpp->dpp_ctrl >> 4) & 0x3, (dpp->dpp_ctrl >> 6) & 0x3); + } else if (dpp->dpp_ctrl >= 0x101 && dpp->dpp_ctrl <= 0x10f) { + fprintf(output, " row_shl:%d", dpp->dpp_ctrl & 0xf); + } else if (dpp->dpp_ctrl >= 0x111 && dpp->dpp_ctrl <= 0x11f) { + fprintf(output, " row_shr:%d", dpp->dpp_ctrl & 0xf); + } else if (dpp->dpp_ctrl >= 0x121 && dpp->dpp_ctrl <= 0x12f) { + fprintf(output, " row_ror:%d", dpp->dpp_ctrl & 0xf); + } else if (dpp->dpp_ctrl == dpp_wf_sl1) { + fprintf(output, " wave_shl:1"); + } else if (dpp->dpp_ctrl == dpp_wf_rl1) { + fprintf(output, " wave_rol:1"); + } else if (dpp->dpp_ctrl == dpp_wf_sr1) { + fprintf(output, " wave_shr:1"); + } else if (dpp->dpp_ctrl == dpp_wf_rr1) { + fprintf(output, " wave_ror:1"); + } else if (dpp->dpp_ctrl == dpp_row_mirror) { + fprintf(output, " row_mirror"); + } else if (dpp->dpp_ctrl == dpp_row_half_mirror) { + fprintf(output, " row_half_mirror"); + } else if (dpp->dpp_ctrl == dpp_row_bcast15) { + fprintf(output, " row_bcast:15"); + } else if (dpp->dpp_ctrl == dpp_row_bcast31) { + fprintf(output, " row_bcast:31"); + } else { + fprintf(output, " dpp_ctrl:0x%.3x", dpp->dpp_ctrl); + } + if (dpp->row_mask != 0xf) + fprintf(output, " row_mask:0x%.1x", dpp->row_mask); + if (dpp->bank_mask != 0xf) + fprintf(output, " bank_mask:0x%.1x", dpp->bank_mask); + if (dpp->bound_ctrl) + fprintf(output, " bound_ctrl:1"); + } else if ((int)instr->format & (int)Format::SDWA) { + fprintf(output, " (printing unimplemented)"); + } +} + +void aco_print_instr(struct Instruction *instr, FILE *output) +{ + if (!instr->definitions.empty()) { + for (unsigned i = 0; i < instr->definitions.size(); ++i) { + print_definition(&instr->definitions[i], output); + if (i + 1 != instr->definitions.size()) + fprintf(output, ", "); + } + fprintf(output, " = "); + } + fprintf(output, "%s", instr_info.name[(int)instr->opcode]); + if (instr->operands.size()) { + bool abs[instr->operands.size()]; + bool neg[instr->operands.size()]; + if ((int)instr->format & (int)Format::VOP3A) { + VOP3A_instruction* vop3 = static_cast<VOP3A_instruction*>(instr); + for (unsigned i = 0; i < instr->operands.size(); ++i) { + abs[i] = vop3->abs[i]; + neg[i] = vop3->neg[i]; + } + } else if (instr->isDPP()) { + DPP_instruction* dpp = static_cast<DPP_instruction*>(instr); + assert(instr->operands.size() <= 2); + for (unsigned i = 0; i < instr->operands.size(); ++i) { + abs[i] = dpp->abs[i]; + neg[i] = dpp->neg[i]; + } + } 
else { + for (unsigned i = 0; i < instr->operands.size(); ++i) { + abs[i] = false; + neg[i] = false; + } + } + for (unsigned i = 0; i < instr->operands.size(); ++i) { + if (i) + fprintf(output, ", "); + else + fprintf(output, " "); + + if (neg[i]) + fprintf(output, "-"); + if (abs[i]) + fprintf(output, "|"); + print_operand(&instr->operands[i], output); + if (abs[i]) + fprintf(output, "|"); + } + } + print_instr_format_specific(instr, output); +} + +static void print_block_kind(uint16_t kind, FILE *output) +{ + if (kind & block_kind_uniform) + fprintf(output, "uniform, "); + if (kind & block_kind_top_level) + fprintf(output, "top-level, "); + if (kind & block_kind_loop_preheader) + fprintf(output, "loop-preheader, "); + if (kind & block_kind_loop_header) + fprintf(output, "loop-header, "); + if (kind & block_kind_loop_exit) + fprintf(output, "loop-exit, "); + if (kind & block_kind_continue) + fprintf(output, "continue, "); + if (kind & block_kind_break) + fprintf(output, "break, "); + if (kind & block_kind_continue_or_break) + fprintf(output, "continue_or_break, "); + if (kind & block_kind_discard) + fprintf(output, "discard, "); + if (kind & block_kind_branch) + fprintf(output, "branch, "); + if (kind & block_kind_merge) + fprintf(output, "merge, "); + if (kind & block_kind_invert) + fprintf(output, "invert, "); + if (kind & block_kind_uses_discard_if) + fprintf(output, "discard_if, "); + if (kind & block_kind_needs_lowering) + fprintf(output, "needs_lowering, "); +} + +void aco_print_block(const struct Block* block, FILE *output) +{ + fprintf(output, "BB%d\n", block->index); + fprintf(output, "/* logical preds: "); + for (unsigned pred : block->logical_preds) + fprintf(output, "BB%d, ", pred); + fprintf(output, "/ linear preds: "); + for (unsigned pred : block->linear_preds) + fprintf(output, "BB%d, ", pred); + fprintf(output, "/ kind: "); + print_block_kind(block->kind, output); + fprintf(output, "*/\n"); + for (auto const& instr : block->instructions) { + fprintf(output, "\t"); + aco_print_instr(instr.get(), output); + fprintf(output, "\n"); + } +} + +void aco_print_program(Program *program, FILE *output) +{ + for (Block const& block : program->blocks) + aco_print_block(&block, output); + + if (program->constant_data.size()) { + fprintf(output, "\n/* constant data */\n"); + for (unsigned i = 0; i < program->constant_data.size(); i += 32) { + fprintf(output, "[%06d] ", i); + unsigned line_size = std::min<size_t>(program->constant_data.size() - i, 32); + for (unsigned j = 0; j < line_size; j += 4) { + unsigned size = std::min<size_t>(program->constant_data.size() - (i + j), 4); + uint32_t v = 0; + memcpy(&v, &program->constant_data[i + j], size); + fprintf(output, " %08x", v); + } + fprintf(output, "\n"); + } + } + + fprintf(output, "\n"); +} + +} diff --git a/src/amd/compiler/aco_reduce_assign.cpp b/src/amd/compiler/aco_reduce_assign.cpp new file mode 100644 index 00000000000..663a43c539a --- /dev/null +++ b/src/amd/compiler/aco_reduce_assign.cpp @@ -0,0 +1,164 @@ +/* + * Copyright © 2018 Valve Corporation + * Copyright © 2018 Google + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: 
+ * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + */ + +#include "aco_ir.h" +#include "aco_builder.h" + +/* + * Insert p_linear_start instructions right before RA to correctly allocate + * temporaries for reductions that have to disrespect EXEC by executing in + * WWM. + */ + +namespace aco { + +void setup_reduce_temp(Program* program) +{ + unsigned last_top_level_block_idx = 0; + unsigned maxSize = 0; + + std::vector<bool> hasReductions(program->blocks.size()); + for (Block& block : program->blocks) { + for (aco_ptr<Instruction>& instr : block.instructions) { + if (instr->format != Format::PSEUDO_REDUCTION) + continue; + + maxSize = MAX2(maxSize, instr->operands[0].size()); + hasReductions[block.index] = true; + } + } + + if (maxSize == 0) + return; + + assert(maxSize == 1 || maxSize == 2); + Temp reduceTmp(0, RegClass(RegType::vgpr, maxSize).as_linear()); + Temp vtmp(0, RegClass(RegType::vgpr, maxSize).as_linear()); + int inserted_at = -1; + int vtmp_inserted_at = -1; + bool reduceTmp_in_loop = false; + bool vtmp_in_loop = false; + + for (Block& block : program->blocks) { + + /* insert p_end_linear_vgpr after the outermost loop */ + if (reduceTmp_in_loop && block.loop_nest_depth == 0) { + assert(inserted_at == (int)last_top_level_block_idx); + + aco_ptr<Instruction> end{create_instruction<Instruction>(aco_opcode::p_end_linear_vgpr, Format::PSEUDO, vtmp_in_loop ? 
2 : 1, 0)}; + end->operands[0] = Operand(reduceTmp); + if (vtmp_in_loop) + end->operands[1] = Operand(vtmp); + /* insert after the phis of the loop exit block */ + std::vector<aco_ptr<Instruction>>::iterator it = block.instructions.begin(); + while ((*it)->opcode == aco_opcode::p_linear_phi || (*it)->opcode == aco_opcode::p_phi) + ++it; + block.instructions.insert(it, std::move(end)); + reduceTmp_in_loop = false; + } + + if (block.kind & block_kind_top_level) + last_top_level_block_idx = block.index; + + if (!hasReductions[block.index]) + continue; + + std::vector<aco_ptr<Instruction>>::iterator it; + for (it = block.instructions.begin(); it != block.instructions.end(); ++it) { + Instruction *instr = (*it).get(); + if (instr->format != Format::PSEUDO_REDUCTION) + continue; + + ReduceOp op = static_cast<Pseudo_reduction_instruction *>(instr)->reduce_op; + reduceTmp_in_loop |= block.loop_nest_depth > 0; + + if ((int)last_top_level_block_idx != inserted_at) { + reduceTmp = {program->allocateId(), reduceTmp.regClass()}; + aco_ptr<Pseudo_instruction> create{create_instruction<Pseudo_instruction>(aco_opcode::p_start_linear_vgpr, Format::PSEUDO, 0, 1)}; + create->definitions[0] = Definition(reduceTmp); + /* find the right place to insert this definition */ + if (last_top_level_block_idx == block.index) { + /* insert right before the current instruction */ + it = block.instructions.insert(it, std::move(create)); + it++; + /* inserted_at is intentionally not updated here, so later blocks + * would insert at the end instead of using this one. */ + } else { + assert(last_top_level_block_idx < block.index); + /* insert before the branch at last top level block */ + std::vector<aco_ptr<Instruction>>& instructions = program->blocks[last_top_level_block_idx].instructions; + instructions.insert(std::next(instructions.begin(), instructions.size() - 1), std::move(create)); + inserted_at = last_top_level_block_idx; + } + } + + /* same as before, except for the vector temporary instead of the reduce temporary */ + bool need_vtmp = op == imul32 || op == fadd64 || op == fmul64 || + op == fmin64 || op == fmax64; + + need_vtmp |= static_cast<Pseudo_reduction_instruction *>(instr)->cluster_size == 32; + vtmp_in_loop |= need_vtmp && block.loop_nest_depth > 0; + if (need_vtmp && (int)last_top_level_block_idx != vtmp_inserted_at) { + vtmp = {program->allocateId(), vtmp.regClass()}; + aco_ptr<Pseudo_instruction> create{create_instruction<Pseudo_instruction>(aco_opcode::p_start_linear_vgpr, Format::PSEUDO, 0, 1)}; + create->definitions[0] = Definition(vtmp); + if (last_top_level_block_idx == block.index) { + it = block.instructions.insert(it, std::move(create)); + it++; + } else { + assert(last_top_level_block_idx < block.index); + std::vector<aco_ptr<Instruction>>& instructions = program->blocks[last_top_level_block_idx].instructions; + instructions.insert(std::next(instructions.begin(), instructions.size() - 1), std::move(create)); + vtmp_inserted_at = last_top_level_block_idx; + } + } + + instr->operands[1] = Operand(reduceTmp); + if (need_vtmp) + instr->operands[2] = Operand(vtmp); + + /* scalar temporary */ + Builder bld(program); + instr->definitions[1] = bld.def(s2); + + /* scalar identity temporary */ + if (instr->opcode == aco_opcode::p_exclusive_scan && + (op == imin32 || op == imin64 || + op == imax32 || op == imax64 || + op == fmin32 || op == fmin64 || + op == fmax32 || op == fmax64 || + op == fmul64)) { + instr->definitions[2] = bld.def(RegClass(RegType::sgpr, instr->operands[0].size())); + } + + /* vcc 
clobber */ + if (op == iadd32 && program->chip_class < GFX9) + instr->definitions[4] = Definition(vcc, s2); + } + } +} + +}; + diff --git a/src/amd/compiler/aco_register_allocation.cpp b/src/amd/compiler/aco_register_allocation.cpp new file mode 100644 index 00000000000..d55f1febc65 --- /dev/null +++ b/src/amd/compiler/aco_register_allocation.cpp @@ -0,0 +1,1924 @@ +/* + * Copyright © 2018 Valve Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + * Authors: + * Daniel Schürmann ([email protected]) + * Bas Nieuwenhuizen ([email protected]) + * + */ + +#include <algorithm> +#include <map> +#include <unordered_map> +#include <functional> + +#include "aco_ir.h" +#include "sid.h" + +namespace aco { +namespace { + +struct ra_ctx { + std::bitset<512> war_hint; + Program* program; + std::unordered_map<unsigned, std::pair<PhysReg, RegClass>> assignments; + std::map<unsigned, Temp> orig_names; + unsigned max_used_sgpr = 0; + unsigned max_used_vgpr = 0; + std::bitset<64> defs_done; /* see MAX_ARGS in aco_instruction_selection_setup.cpp */ + + ra_ctx(Program* program) : program(program) {} +}; + + +/* helper function for debugging */ +#if 0 +void print_regs(ra_ctx& ctx, bool vgprs, std::array<uint32_t, 512>& reg_file) +{ + unsigned max = vgprs ? ctx.program->max_reg_demand.vgpr : ctx.program->max_reg_demand.sgpr; + unsigned lb = vgprs ? 256 : 0; + unsigned ub = lb + max; + char reg_char = vgprs ? 'v' : 's'; + + /* print markers */ + printf(" "); + for (unsigned i = lb; i < ub; i += 3) { + printf("%.2u ", i - lb); + } + printf("\n"); + + /* print usage */ + printf("%cgprs: ", reg_char); + unsigned free_regs = 0; + unsigned prev = 0; + bool char_select = false; + for (unsigned i = lb; i < ub; i++) { + if (reg_file[i] == 0xFFFF) { + printf("~"); + } else if (reg_file[i]) { + if (reg_file[i] != prev) { + prev = reg_file[i]; + char_select = !char_select; + } + printf(char_select ? 
"#" : "@"); + } else { + free_regs++; + printf("."); + } + } + printf("\n"); + + printf("%u/%u used, %u/%u free\n", max - free_regs, max, free_regs, max); + + /* print assignments */ + prev = 0; + unsigned size = 0; + for (unsigned i = lb; i < ub; i++) { + if (reg_file[i] != prev) { + if (prev && size > 1) + printf("-%d]\n", i - 1 - lb); + else if (prev) + printf("]\n"); + prev = reg_file[i]; + if (prev && prev != 0xFFFF) { + if (ctx.orig_names.count(reg_file[i]) && ctx.orig_names[reg_file[i]].id() != reg_file[i]) + printf("%%%u (was %%%d) = %c[%d", reg_file[i], ctx.orig_names[reg_file[i]].id(), reg_char, i - lb); + else + printf("%%%u = %c[%d", reg_file[i], reg_char, i - lb); + } + size = 1; + } else { + size++; + } + } + if (prev && size > 1) + printf("-%d]\n", ub - lb - 1); + else if (prev) + printf("]\n"); +} +#endif + + +void adjust_max_used_regs(ra_ctx& ctx, RegClass rc, unsigned reg) +{ + unsigned max_addressible_sgpr = ctx.program->sgpr_limit; + unsigned size = rc.size(); + if (rc.type() == RegType::vgpr) { + assert(reg >= 256); + unsigned hi = reg - 256 + size - 1; + ctx.max_used_vgpr = std::max(ctx.max_used_vgpr, hi); + } else if (reg + rc.size() <= max_addressible_sgpr) { + unsigned hi = reg + size - 1; + ctx.max_used_sgpr = std::max(ctx.max_used_sgpr, std::min(hi, max_addressible_sgpr)); + } +} + + +void update_renames(ra_ctx& ctx, std::array<uint32_t, 512>& reg_file, + std::vector<std::pair<Operand, Definition>>& parallelcopies, + aco_ptr<Instruction>& instr) +{ + /* allocate id's and rename operands: this is done transparently here */ + for (std::pair<Operand, Definition>& copy : parallelcopies) { + /* the definitions with id are not from this function and already handled */ + if (copy.second.isTemp()) + continue; + + // FIXME: if a definition got moved, change the target location and remove the parallelcopy + copy.second.setTemp(Temp(ctx.program->allocateId(), copy.second.regClass())); + ctx.assignments[copy.second.tempId()] = {copy.second.physReg(), copy.second.regClass()}; + for (unsigned i = copy.second.physReg().reg; i < copy.second.physReg() + copy.second.size(); i++) + reg_file[i] = copy.second.tempId(); + /* check if we moved an operand */ + for (Operand& op : instr->operands) { + if (!op.isTemp()) + continue; + if (op.tempId() == copy.first.tempId()) { + bool omit_renaming = instr->opcode == aco_opcode::p_create_vector && !op.isKill(); + for (std::pair<Operand, Definition>& pc : parallelcopies) { + PhysReg def_reg = pc.second.physReg(); + omit_renaming &= def_reg > copy.first.physReg() ? 
+ (copy.first.physReg() + copy.first.size() <= def_reg.reg) : + (def_reg + pc.second.size() <= copy.first.physReg().reg); + } + if (omit_renaming) + continue; + op.setTemp(copy.second.getTemp()); + op.setFixed(copy.second.physReg()); + } + } + } +} + +std::pair<PhysReg, bool> get_reg_simple(ra_ctx& ctx, + std::array<uint32_t, 512>& reg_file, + uint32_t lb, uint32_t ub, + uint32_t size, uint32_t stride, + RegClass rc) +{ + /* best fit algorithm: find the smallest gap to fit in the variable */ + if (stride == 1) { + unsigned best_pos = 0xFFFF; + unsigned gap_size = 0xFFFF; + unsigned next_pos = 0xFFFF; + + for (unsigned current_reg = lb; current_reg < ub; current_reg++) { + if (reg_file[current_reg] != 0 || ctx.war_hint[current_reg]) { + if (next_pos == 0xFFFF) + continue; + + /* check if the variable fits */ + if (next_pos + size > current_reg) { + next_pos = 0xFFFF; + continue; + } + + /* check if the tested gap is smaller */ + if (current_reg - next_pos < gap_size) { + best_pos = next_pos; + gap_size = current_reg - next_pos; + } + next_pos = 0xFFFF; + continue; + } + + if (next_pos == 0xFFFF) + next_pos = current_reg; + } + + /* final check */ + if (next_pos != 0xFFFF && + next_pos + size <= ub && + ub - next_pos < gap_size) { + best_pos = next_pos; + gap_size = ub - next_pos; + } + if (best_pos != 0xFFFF) { + adjust_max_used_regs(ctx, rc, best_pos); + return {PhysReg{best_pos}, true}; + } + return {{}, false}; + } + + bool found = false; + unsigned reg_lo = lb; + unsigned reg_hi = lb + size - 1; + while (!found && reg_lo + size <= ub) { + if (reg_file[reg_lo] != 0) { + reg_lo += stride; + continue; + } + reg_hi = reg_lo + size - 1; + found = true; + for (unsigned reg = reg_lo + 1; found && reg <= reg_hi; reg++) { + if (reg_file[reg] != 0 || ctx.war_hint[reg]) + found = false; + } + if (found) { + adjust_max_used_regs(ctx, rc, reg_lo); + return {PhysReg{reg_lo}, true}; + } + + reg_lo += stride; + } + + return {{}, false}; +} + +bool get_regs_for_copies(ra_ctx& ctx, + std::array<uint32_t, 512>& reg_file, + std::vector<std::pair<Operand, Definition>>& parallelcopies, + std::set<std::pair<unsigned, unsigned>> vars, + uint32_t lb, uint32_t ub, + aco_ptr<Instruction>& instr, + uint32_t def_reg_lo, + uint32_t def_reg_hi) +{ + + /* variables are sorted from small sized to large */ + /* NOTE: variables are also sorted by ID. this only affects a very small number of shaders slightly though. 
*/ + for (std::set<std::pair<unsigned, unsigned>>::reverse_iterator it = vars.rbegin(); it != vars.rend(); ++it) { + unsigned id = it->second; + std::pair<PhysReg, RegClass> var = ctx.assignments[id]; + uint32_t size = it->first; + uint32_t stride = 1; + if (var.second.type() == RegType::sgpr) { + if (size == 2) + stride = 2; + if (size > 3) + stride = 4; + } + + /* check if this is a dead operand, then we can re-use the space from the definition */ + bool is_dead_operand = false; + for (unsigned i = 0; !is_phi(instr) && !is_dead_operand && i < instr->operands.size(); i++) { + if (instr->operands[i].isTemp() && instr->operands[i].isKill() && instr->operands[i].tempId() == id) + is_dead_operand = true; + } + + std::pair<PhysReg, bool> res; + if (is_dead_operand) { + if (instr->opcode == aco_opcode::p_create_vector) { + for (unsigned i = 0, offset = 0; i < instr->operands.size(); offset += instr->operands[i].size(), i++) { + if (instr->operands[i].isTemp() && instr->operands[i].tempId() == id) { + for (unsigned j = 0; j < size; j++) + assert(reg_file[def_reg_lo + offset + j] == 0); + res = {PhysReg{def_reg_lo + offset}, true}; + break; + } + } + } else { + res = get_reg_simple(ctx, reg_file, def_reg_lo, def_reg_hi + 1, size, stride, var.second); + } + } else { + res = get_reg_simple(ctx, reg_file, lb, def_reg_lo, size, stride, var.second); + if (!res.second) { + unsigned lb = (def_reg_hi + stride) & ~(stride - 1); + res = get_reg_simple(ctx, reg_file, lb, ub, size, stride, var.second); + } + } + + if (res.second) { + /* mark the area as blocked */ + for (unsigned i = res.first.reg; i < res.first + size; i++) + reg_file[i] = 0xFFFFFFFF; + /* create parallelcopy pair (without definition id) */ + Temp tmp = Temp(id, var.second); + Operand pc_op = Operand(tmp); + pc_op.setFixed(var.first); + Definition pc_def = Definition(res.first, pc_op.regClass()); + parallelcopies.emplace_back(pc_op, pc_def); + continue; + } + + unsigned best_pos = lb; + unsigned num_moves = 0xFF; + unsigned num_vars = 0; + + /* we use a sliding window to find potential positions */ + unsigned reg_lo = lb; + unsigned reg_hi = lb + size - 1; + for (reg_lo = lb, reg_hi = lb + size - 1; reg_hi < ub; reg_lo += stride, reg_hi += stride) { + if (!is_dead_operand && ((reg_lo >= def_reg_lo && reg_lo <= def_reg_hi) || + (reg_hi >= def_reg_lo && reg_hi <= def_reg_hi))) + continue; + + /* second, check that we have at most k=num_moves elements in the window + * and no element is larger than the currently processed one */ + unsigned k = 0; + unsigned n = 0; + unsigned last_var = 0; + bool found = true; + for (unsigned j = reg_lo; found && j <= reg_hi; j++) { + if (reg_file[j] == 0 || reg_file[j] == last_var) + continue; + + /* 0xFFFF signals that this area is already blocked! 
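(reg_file entries are set to 0xFFFFFFFF to mark blocked areas)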
*/ + if (reg_file[j] == 0xFFFFFFFF || k > num_moves) { + found = false; + break; + } + /* we cannot split live ranges of linear vgprs */ + if (ctx.assignments[reg_file[j]].second & (1 << 6)) { + found = false; + break; + } + bool is_kill = false; + for (const Operand& op : instr->operands) { + if (op.isTemp() && op.isKill() && op.tempId() == reg_file[j]) { + is_kill = true; + break; + } + } + if (!is_kill && ctx.assignments[reg_file[j]].second.size() >= size) { + found = false; + break; + } + + k += ctx.assignments[reg_file[j]].second.size(); + last_var = reg_file[j]; + n++; + if (k > num_moves || (k == num_moves && n <= num_vars)) { + found = false; + break; + } + } + + if (found) { + best_pos = reg_lo; + num_moves = k; + num_vars = n; + } + } + + /* FIXME: we messed up and couldn't find space for the variables to be copied */ + if (num_moves == 0xFF) + return false; + + reg_lo = best_pos; + reg_hi = best_pos + size - 1; + + /* collect variables and block reg file */ + std::set<std::pair<unsigned, unsigned>> new_vars; + for (unsigned j = reg_lo; j <= reg_hi; j++) { + if (reg_file[j] != 0) { + unsigned size = ctx.assignments[reg_file[j]].second.size(); + unsigned id = reg_file[j]; + new_vars.emplace(size, id); + for (unsigned k = 0; k < size; k++) + reg_file[ctx.assignments[id].first + k] = 0; + } + } + + /* mark the area as blocked */ + for (unsigned i = reg_lo; i <= reg_hi; i++) + reg_file[i] = 0xFFFFFFFF; + + if (!get_regs_for_copies(ctx, reg_file, parallelcopies, new_vars, lb, ub, instr, def_reg_lo, def_reg_hi)) + return false; + + adjust_max_used_regs(ctx, var.second, reg_lo); + + /* create parallelcopy pair (without definition id) */ + Temp tmp = Temp(id, var.second); + Operand pc_op = Operand(tmp); + pc_op.setFixed(var.first); + Definition pc_def = Definition(PhysReg{reg_lo}, pc_op.regClass()); + parallelcopies.emplace_back(pc_op, pc_def); + } + + return true; +} + + +std::pair<PhysReg, bool> get_reg_impl(ra_ctx& ctx, + std::array<uint32_t, 512>& reg_file, + std::vector<std::pair<Operand, Definition>>& parallelcopies, + uint32_t lb, uint32_t ub, + uint32_t size, uint32_t stride, + RegClass rc, + aco_ptr<Instruction>& instr) +{ + unsigned regs_free = 0; + /* check how many free regs we have */ + for (unsigned j = lb; j < ub; j++) { + if (reg_file[j] == 0) + regs_free++; + } + + /* mark and count killed operands */ + unsigned killed_ops = 0; + for (unsigned j = 0; !is_phi(instr) && j < instr->operands.size(); j++) { + if (instr->operands[j].isTemp() && + instr->operands[j].isFirstKill() && + instr->operands[j].physReg() >= lb && + instr->operands[j].physReg() < ub) { + assert(instr->operands[j].isFixed()); + assert(reg_file[instr->operands[j].physReg().reg] == 0); + for (unsigned k = 0; k < instr->operands[j].size(); k++) + reg_file[instr->operands[j].physReg() + k] = 0xFFFFFFFF; + killed_ops += instr->operands[j].getTemp().size(); + } + } + + assert(regs_free >= size); + /* we might have to move dead operands to dst in order to make space */ + unsigned op_moves = 0; + + if (size > (regs_free - killed_ops)) + op_moves = size - (regs_free - killed_ops); + + /* find the best position to place the definition */ + unsigned best_pos = lb; + unsigned num_moves = 0xFF; + unsigned num_vars = 0; + + /* we use a sliding window to check potential positions */ + unsigned reg_lo = lb; + unsigned reg_hi = lb + size - 1; + for (reg_lo = lb, reg_hi = lb + size - 1; reg_hi < ub; reg_lo += stride, reg_hi += stride) { + /* first check the edges: this is what we have to fix to allow for num_moves > size 
*/ + if (reg_lo > lb && reg_file[reg_lo] != 0 && reg_file[reg_lo] == reg_file[reg_lo - 1]) + continue; + if (reg_hi < ub - 1 && reg_file[reg_hi] != 0 && reg_file[reg_hi] == reg_file[reg_hi + 1]) + continue; + + /* second, check that we have at most k=num_moves elements in the window + * and no element is larger than the currently processed one */ + unsigned k = op_moves; + unsigned n = 0; + unsigned remaining_op_moves = op_moves; + unsigned last_var = 0; + bool found = true; + bool aligned = rc == RegClass::v4 && reg_lo % 4 == 0; + for (unsigned j = reg_lo; found && j <= reg_hi; j++) { + if (reg_file[j] == 0 || reg_file[j] == last_var) + continue; + + /* dead operands effectively reduce the number of estimated moves */ + if (remaining_op_moves && reg_file[j] == 0xFFFFFFFF) { + k--; + remaining_op_moves--; + continue; + } + + if (ctx.assignments[reg_file[j]].second.size() >= size) { + found = false; + break; + } + + + /* we cannot split live ranges of linear vgprs */ + if (ctx.assignments[reg_file[j]].second & (1 << 6)) { + found = false; + break; + } + + k += ctx.assignments[reg_file[j]].second.size(); + n++; + last_var = reg_file[j]; + } + + if (!found || k > num_moves) + continue; + if (k == num_moves && n < num_vars) + continue; + if (!aligned && k == num_moves && n == num_vars) + continue; + + if (found) { + best_pos = reg_lo; + num_moves = k; + num_vars = n; + } + } + + if (num_moves == 0xFF) { + /* remove killed operands from reg_file once again */ + for (unsigned i = 0; !is_phi(instr) && i < instr->operands.size(); i++) { + if (instr->operands[i].isTemp() && instr->operands[i].isFirstKill()) { + for (unsigned k = 0; k < instr->operands[i].getTemp().size(); k++) + reg_file[instr->operands[i].physReg() + k] = 0; + } + } + for (unsigned i = 0; i < instr->definitions.size(); i++) { + Definition def = instr->definitions[i]; + if (def.isTemp() && def.isFixed() && ctx.defs_done.test(i)) { + for (unsigned k = 0; k < def.getTemp().size(); k++) + reg_file[def.physReg() + k] = def.tempId(); + } + } + return {{}, false}; + } + + std::array<uint32_t, 512> register_file = reg_file; + + /* now, we figured the placement for our definition */ + std::set<std::pair<unsigned, unsigned>> vars; + for (unsigned j = best_pos; j < best_pos + size; j++) { + if (reg_file[j] != 0xFFFFFFFF && reg_file[j] != 0) + vars.emplace(ctx.assignments[reg_file[j]].second.size(), reg_file[j]); + reg_file[j] = 0; + } + + if (instr->opcode == aco_opcode::p_create_vector) { + /* move killed operands which aren't yet at the correct position */ + for (unsigned i = 0, offset = 0; i < instr->operands.size(); offset += instr->operands[i].size(), i++) { + if (instr->operands[i].isTemp() && instr->operands[i].isFirstKill() && + instr->operands[i].getTemp().type() == rc.type()) { + + if (instr->operands[i].physReg() != best_pos + offset) { + vars.emplace(instr->operands[i].size(), instr->operands[i].tempId()); + for (unsigned j = 0; j < instr->operands[i].size(); j++) + reg_file[instr->operands[i].physReg() + j] = 0; + } else { + for (unsigned j = 0; j < instr->operands[i].size(); j++) + reg_file[instr->operands[i].physReg() + j] = instr->operands[i].tempId(); + } + } + } + } else { + /* re-enable the killed operands */ + for (unsigned j = 0; !is_phi(instr) && j < instr->operands.size(); j++) { + if (instr->operands[j].isTemp() && instr->operands[j].isFirstKill()) { + for (unsigned k = 0; k < instr->operands[j].getTemp().size(); k++) + reg_file[instr->operands[j].physReg() + k] = instr->operands[j].tempId(); + } + } + } + + 
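/* find new positions for the variables occupying the chosen window; if this fails, restore the saved register file and give up on this placement */ +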
std::vector<std::pair<Operand, Definition>> pc; + if (!get_regs_for_copies(ctx, reg_file, pc, vars, lb, ub, instr, best_pos, best_pos + size - 1)) { + reg_file = std::move(register_file); + /* remove killed operands from reg_file once again */ + if (!is_phi(instr)) { + for (const Operand& op : instr->operands) { + if (op.isTemp() && op.isFirstKill()) { + for (unsigned k = 0; k < op.getTemp().size(); k++) + reg_file[op.physReg() + k] = 0; + } + } + } + for (unsigned i = 0; i < instr->definitions.size(); i++) { + Definition& def = instr->definitions[i]; + if (def.isTemp() && def.isFixed() && ctx.defs_done.test(i)) { + for (unsigned k = 0; k < def.getTemp().size(); k++) + reg_file[def.physReg() + k] = def.tempId(); + } + } + return {{}, false}; + } + + parallelcopies.insert(parallelcopies.end(), pc.begin(), pc.end()); + + /* we set the definition regs == 0. the actual caller is responsible for correct setting */ + for (unsigned i = 0; i < size; i++) + reg_file[best_pos + i] = 0; + + update_renames(ctx, reg_file, parallelcopies, instr); + + /* remove killed operands from reg_file once again */ + for (unsigned i = 0; !is_phi(instr) && i < instr->operands.size(); i++) { + if (!instr->operands[i].isTemp() || !instr->operands[i].isFixed()) + continue; + assert(!instr->operands[i].isUndefined()); + if (instr->operands[i].isFirstKill()) { + for (unsigned j = 0; j < instr->operands[i].getTemp().size(); j++) + reg_file[instr->operands[i].physReg() + j] = 0; + } + } + for (unsigned i = 0; i < instr->definitions.size(); i++) { + Definition def = instr->definitions[i]; + if (def.isTemp() && def.isFixed() && ctx.defs_done.test(i)) { + for (unsigned k = 0; k < def.getTemp().size(); k++) + reg_file[def.physReg() + k] = def.tempId(); + } + } + + adjust_max_used_regs(ctx, rc, best_pos); + return {PhysReg{best_pos}, true}; +} + +PhysReg get_reg(ra_ctx& ctx, + std::array<uint32_t, 512>& reg_file, + RegClass rc, + std::vector<std::pair<Operand, Definition>>& parallelcopies, + aco_ptr<Instruction>& instr) +{ + uint32_t size = rc.size(); + uint32_t stride = 1; + uint32_t lb, ub; + if (rc.type() == RegType::vgpr) { + lb = 256; + ub = 256 + ctx.program->max_reg_demand.vgpr; + } else { + lb = 0; + ub = ctx.program->max_reg_demand.sgpr; + if (size == 2) + stride = 2; + else if (size >= 4) + stride = 4; + } + + std::pair<PhysReg, bool> res = {{}, false}; + /* try to find space without live-range splits */ + if (rc.type() == RegType::vgpr && (size == 4 || size == 8)) + res = get_reg_simple(ctx, reg_file, lb, ub, size, 4, rc); + if (!res.second) + res = get_reg_simple(ctx, reg_file, lb, ub, size, stride, rc); + if (res.second) + return res.first; + + /* try to find space with live-range splits */ + res = get_reg_impl(ctx, reg_file, parallelcopies, lb, ub, size, stride, rc, instr); + + if (res.second) + return res.first; + + unsigned regs_free = 0; + for (unsigned i = lb; i < ub; i++) { + if (!reg_file[i]) + regs_free++; + } + + /* We should only fail here because keeping under the limit would require + * too many moves. 
*/ + assert(regs_free >= size); + + /* try using more registers */ + uint16_t max_addressible_sgpr = ctx.program->sgpr_limit; + if (rc.type() == RegType::vgpr && ctx.program->max_reg_demand.vgpr < 256) { + update_vgpr_sgpr_demand(ctx.program, RegisterDemand(ctx.program->max_reg_demand.vgpr + 1, ctx.program->max_reg_demand.sgpr)); + return get_reg(ctx, reg_file, rc, parallelcopies, instr); + } else if (rc.type() == RegType::sgpr && ctx.program->max_reg_demand.sgpr < max_addressible_sgpr) { + update_vgpr_sgpr_demand(ctx.program, RegisterDemand(ctx.program->max_reg_demand.vgpr, ctx.program->max_reg_demand.sgpr + 1)); + return get_reg(ctx, reg_file, rc, parallelcopies, instr); + } + + //FIXME: if nothing helps, shift-rotate the registers to make space + + unreachable("did not find a register"); +} + + +std::pair<PhysReg, bool> get_reg_vec(ra_ctx& ctx, + std::array<uint32_t, 512>& reg_file, + RegClass rc) +{ + uint32_t size = rc.size(); + uint32_t stride = 1; + uint32_t lb, ub; + if (rc.type() == RegType::vgpr) { + lb = 256; + ub = 256 + ctx.program->max_reg_demand.vgpr; + } else { + lb = 0; + ub = ctx.program->max_reg_demand.sgpr; + if (size == 2) + stride = 2; + else if (size >= 4) + stride = 4; + } + return get_reg_simple(ctx, reg_file, lb, ub, size, stride, rc); +} + + +PhysReg get_reg_create_vector(ra_ctx& ctx, + std::array<uint32_t, 512>& reg_file, + RegClass rc, + std::vector<std::pair<Operand, Definition>>& parallelcopies, + aco_ptr<Instruction>& instr) +{ + /* create_vector instructions have different costs w.r.t. register coalescing */ + uint32_t size = rc.size(); + uint32_t stride = 1; + uint32_t lb, ub; + if (rc.type() == RegType::vgpr) { + lb = 256; + ub = 256 + ctx.program->max_reg_demand.vgpr; + } else { + lb = 0; + ub = ctx.program->max_reg_demand.sgpr; + if (size == 2) + stride = 2; + else if (size >= 4) + stride = 4; + } + + unsigned best_pos = -1; + unsigned num_moves = 0xFF; + bool best_war_hint = true; + + /* test for each operand which definition placement causes the least shuffle instructions */ + for (unsigned i = 0, offset = 0; i < instr->operands.size(); offset += instr->operands[i].size(), i++) { + // TODO: think about, if we can alias live operands on the same register + if (!instr->operands[i].isTemp() || !instr->operands[i].isKill() || instr->operands[i].getTemp().type() != rc.type()) + continue; + + if (offset > instr->operands[i].physReg()) + continue; + + unsigned reg_lo = instr->operands[i].physReg() - offset; + unsigned reg_hi = reg_lo + size - 1; + unsigned k = 0; + + /* no need to check multiple times */ + if (reg_lo == best_pos) + continue; + + /* check borders */ + // TODO: this can be improved */ + if (reg_lo < lb || reg_hi >= ub || reg_lo % stride != 0) + continue; + if (reg_lo > lb && reg_file[reg_lo] != 0 && reg_file[reg_lo] == reg_file[reg_lo - 1]) + continue; + if (reg_hi < ub - 1 && reg_file[reg_hi] != 0 && reg_file[reg_hi] == reg_file[reg_hi + 1]) + continue; + + /* count variables to be moved and check war_hint */ + bool war_hint = false; + for (unsigned j = reg_lo; j <= reg_hi; j++) { + if (reg_file[j] != 0) + k++; + war_hint |= ctx.war_hint[j]; + } + + /* count operands in wrong positions */ + for (unsigned j = 0, offset = 0; j < instr->operands.size(); offset += instr->operands[j].size(), j++) { + if (j == i || + !instr->operands[j].isTemp() || + instr->operands[j].getTemp().type() != rc.type()) + continue; + if (instr->operands[j].physReg() != reg_lo + offset) + k += instr->operands[j].size(); + } + bool aligned = rc == RegClass::v4 && reg_lo 
% 4 == 0; + if (k > num_moves || (!aligned && k == num_moves) || (war_hint && !best_war_hint)) + continue; + + best_pos = reg_lo; + num_moves = k; + best_war_hint = war_hint; + } + + if (num_moves >= size) + return get_reg(ctx, reg_file, rc, parallelcopies, instr); + + /* collect variables to be moved */ + std::set<std::pair<unsigned, unsigned>> vars; + for (unsigned i = best_pos; i < best_pos + size; i++) { + if (reg_file[i] != 0) + vars.emplace(ctx.assignments[reg_file[i]].second.size(), reg_file[i]); + reg_file[i] = 0; + } + + /* move killed operands which aren't yet at the correct position */ + for (unsigned i = 0, offset = 0; i < instr->operands.size(); offset += instr->operands[i].size(), i++) { + if (instr->operands[i].isTemp() && instr->operands[i].isFirstKill() && instr->operands[i].getTemp().type() == rc.type()) { + if (instr->operands[i].physReg() != best_pos + offset) { + vars.emplace(instr->operands[i].size(), instr->operands[i].tempId()); + } else { + for (unsigned j = 0; j < instr->operands[i].size(); j++) + reg_file[instr->operands[i].physReg() + j] = instr->operands[i].tempId(); + } + } + } + + ASSERTED bool success = false; + success = get_regs_for_copies(ctx, reg_file, parallelcopies, vars, lb, ub, instr, best_pos, best_pos + size - 1); + assert(success); + + update_renames(ctx, reg_file, parallelcopies, instr); + adjust_max_used_regs(ctx, rc, best_pos); + return PhysReg{best_pos}; +} + +bool get_reg_specified(ra_ctx& ctx, + std::array<uint32_t, 512>& reg_file, + RegClass rc, + std::vector<std::pair<Operand, Definition>>& parallelcopies, + aco_ptr<Instruction>& instr, + PhysReg reg) +{ + uint32_t size = rc.size(); + uint32_t stride = 1; + uint32_t lb, ub; + + if (rc.type() == RegType::vgpr) { + lb = 256; + ub = 256 + ctx.program->max_reg_demand.vgpr; + } else { + if (size == 2) + stride = 2; + else if (size >= 4) + stride = 4; + if (reg % stride != 0) + return false; + lb = 0; + ub = ctx.program->max_reg_demand.sgpr; + } + + uint32_t reg_lo = reg.reg; + uint32_t reg_hi = reg + (size - 1); + + if (reg_lo < lb || reg_hi >= ub || reg_lo > reg_hi) + return false; + + for (unsigned i = reg_lo; i <= reg_hi; i++) { + if (reg_file[i] != 0) + return false; + } + adjust_max_used_regs(ctx, rc, reg_lo); + return true; +} + +void handle_pseudo(ra_ctx& ctx, + const std::array<uint32_t, 512>& reg_file, + Instruction* instr) +{ + if (instr->format != Format::PSEUDO) + return; + + /* all instructions which use handle_operands() need this information */ + switch (instr->opcode) { + case aco_opcode::p_extract_vector: + case aco_opcode::p_create_vector: + case aco_opcode::p_split_vector: + case aco_opcode::p_parallelcopy: + case aco_opcode::p_wqm: + break; + default: + return; + } + + /* if all definitions are vgpr, no need to care for SCC */ + bool writes_sgpr = false; + for (Definition& def : instr->definitions) { + if (def.getTemp().type() == RegType::sgpr) { + writes_sgpr = true; + break; + } + } + if (!writes_sgpr) + return; + + Pseudo_instruction *pi = (Pseudo_instruction *)instr; + if (reg_file[scc.reg]) { + pi->tmp_in_scc = true; + + int reg = ctx.max_used_sgpr; + for (; reg >= 0 && reg_file[reg]; reg--) + ; + if (reg < 0) { + reg = ctx.max_used_sgpr + 1; + for (; reg < ctx.program->max_reg_demand.sgpr && reg_file[reg]; reg++) + ; + assert(reg < ctx.program->max_reg_demand.sgpr); + } + + adjust_max_used_regs(ctx, s1, reg); + pi->scratch_sgpr = PhysReg{(unsigned)reg}; + } else { + pi->tmp_in_scc = false; + } +} + +bool operand_can_use_reg(aco_ptr<Instruction>& instr, unsigned idx, 
PhysReg reg) +{ + switch (instr->format) { + case Format::SMEM: + return reg != scc && + reg != exec && + (reg != m0 || idx == 1 || idx == 3) && /* offset can be m0 */ + (reg != vcc || (instr->definitions.empty() && idx == 2)); /* sdata can be vcc */ + default: + // TODO: there are more instructions with restrictions on registers + return true; + } +} + +} /* end namespace */ + + +void register_allocation(Program *program, std::vector<std::set<Temp>> live_out_per_block) +{ + ra_ctx ctx(program); + + std::vector<std::unordered_map<unsigned, Temp>> renames(program->blocks.size()); + + struct phi_info { + Instruction* phi; + unsigned block_idx; + std::set<Instruction*> uses; + }; + + bool filled[program->blocks.size()]; + bool sealed[program->blocks.size()]; + memset(filled, 0, sizeof filled); + memset(sealed, 0, sizeof sealed); + std::vector<std::vector<Instruction*>> incomplete_phis(program->blocks.size()); + std::map<unsigned, phi_info> phi_map; + std::map<unsigned, unsigned> affinities; + std::function<Temp(Temp,unsigned)> read_variable; + std::function<Temp(Temp,Block*)> handle_live_in; + std::function<Temp(std::map<unsigned, phi_info>::iterator)> try_remove_trivial_phi; + + read_variable = [&](Temp val, unsigned block_idx) -> Temp { + std::unordered_map<unsigned, Temp>::iterator it = renames[block_idx].find(val.id()); + assert(it != renames[block_idx].end()); + return it->second; + }; + + handle_live_in = [&](Temp val, Block *block) -> Temp { + std::vector<unsigned>& preds = val.is_linear() ? block->linear_preds : block->logical_preds; + if (preds.size() == 0 && block->index != 0) { + renames[block->index][val.id()] = val; + return val; + } + assert(preds.size() > 0); + + Temp new_val; + if (!sealed[block->index]) { + /* consider rename from already processed predecessor */ + Temp tmp = read_variable(val, preds[0]); + + /* if the block is not sealed yet, we create an incomplete phi (which might later get removed again) */ + new_val = Temp{program->allocateId(), val.regClass()}; + aco_opcode opcode = val.is_linear() ? aco_opcode::p_linear_phi : aco_opcode::p_phi; + aco_ptr<Instruction> phi{create_instruction<Pseudo_instruction>(opcode, Format::PSEUDO, preds.size(), 1)}; + phi->definitions[0] = Definition(new_val); + for (unsigned i = 0; i < preds.size(); i++) + phi->operands[i] = Operand(val); + if (tmp.regClass() == new_val.regClass()) + affinities[new_val.id()] = tmp.id(); + + phi_map.emplace(new_val.id(), phi_info{phi.get(), block->index}); + incomplete_phis[block->index].emplace_back(phi.get()); + block->instructions.insert(block->instructions.begin(), std::move(phi)); + + } else if (preds.size() == 1) { + /* if the block has only one predecessor, just look there for the name */ + new_val = read_variable(val, preds[0]); + } else { + /* there are multiple predecessors and the block is sealed */ + Temp ops[preds.size()]; + + /* we start assuming that the name is the same from all predecessors */ + renames[block->index][val.id()] = val; + bool needs_phi = false; + + /* get the rename from each predecessor and check if they are the same */ + for (unsigned i = 0; i < preds.size(); i++) { + ops[i] = read_variable(val, preds[i]); + if (i == 0) + new_val = ops[i]; + else + needs_phi |= !(new_val == ops[i]); + } + + if (needs_phi) { + /* the variable has been renamed differently in the predecessors: we need to insert a phi */ + aco_opcode opcode = val.is_linear() ? 
aco_opcode::p_linear_phi : aco_opcode::p_phi; + aco_ptr<Instruction> phi{create_instruction<Pseudo_instruction>(opcode, Format::PSEUDO, preds.size(), 1)}; + new_val = Temp{program->allocateId(), val.regClass()}; + phi->definitions[0] = Definition(new_val); + for (unsigned i = 0; i < preds.size(); i++) { + phi->operands[i] = Operand(ops[i]); + phi->operands[i].setFixed(ctx.assignments[ops[i].id()].first); + if (ops[i].regClass() == new_val.regClass()) + affinities[new_val.id()] = ops[i].id(); + } + phi_map.emplace(new_val.id(), phi_info{phi.get(), block->index}); + block->instructions.insert(block->instructions.begin(), std::move(phi)); + } + } + + renames[block->index][val.id()] = new_val; + renames[block->index][new_val.id()] = new_val; + ctx.orig_names[new_val.id()] = val; + return new_val; + }; + + try_remove_trivial_phi = [&] (std::map<unsigned, phi_info>::iterator info) -> Temp { + assert(info->second.block_idx != 0); + Instruction* phi = info->second.phi; + Temp same = Temp(); + + Definition def = phi->definitions[0]; + /* a phi node is trivial if all operands are the same as the definition of the phi */ + for (const Operand& op : phi->operands) { + const Temp t = op.getTemp(); + if (t == same || t == def.getTemp()) + continue; + if (!(same == Temp()) || !(op.physReg() == def.physReg())) { + /* phi is not trivial */ + return def.getTemp(); + } + same = t; + } + assert(!(same == Temp() || same == def.getTemp())); + + /* reroute all uses to same and remove phi */ + std::vector<std::map<unsigned, phi_info>::iterator> phi_users; + std::map<unsigned, phi_info>::iterator same_phi_info = phi_map.find(same.id()); + for (Instruction* instr : info->second.uses) { + assert(phi != instr); + /* recursively try to remove trivial phis */ + if (is_phi(instr)) { + /* ignore if the phi was already flagged trivial */ + if (instr->definitions.empty()) + continue; + + std::map<unsigned, phi_info>::iterator it = phi_map.find(instr->definitions[0].tempId()); + if (it != phi_map.end() && it != info) + phi_users.emplace_back(it); + } + for (Operand& op : instr->operands) { + if (op.isTemp() && op.tempId() == def.tempId()) { + op.setTemp(same); + if (same_phi_info != phi_map.end()) + same_phi_info->second.uses.emplace(instr); + } + } + } + + auto it = ctx.orig_names.find(same.id()); + unsigned orig_var = it != ctx.orig_names.end() ? it->second.id() : same.id(); + for (unsigned i = 0; i < program->blocks.size(); i++) { + auto it = renames[i].find(orig_var); + if (it != renames[i].end() && it->second == def.getTemp()) + renames[i][orig_var] = same; + } + + unsigned block_idx = info->second.block_idx; + phi->definitions.clear(); /* this indicates that the phi can be removed */ + phi_map.erase(info); + for (auto it : phi_users) { + if (sealed[it->second.block_idx]) + try_remove_trivial_phi(it); + } + + /* due to the removal of other phis, the name might have changed once again! 
*/ + return renames[block_idx][orig_var]; + }; + + std::map<unsigned, Instruction*> vectors; + std::vector<std::vector<Temp>> phi_ressources; + std::map<unsigned, unsigned> temp_to_phi_ressources; + + for (std::vector<Block>::reverse_iterator it = program->blocks.rbegin(); it != program->blocks.rend(); it++) { + Block& block = *it; + + /* first, compute the death points of all live vars within the block */ + std::set<Temp>& live = live_out_per_block[block.index]; + + std::vector<aco_ptr<Instruction>>::reverse_iterator rit; + for (rit = block.instructions.rbegin(); rit != block.instructions.rend(); ++rit) { + aco_ptr<Instruction>& instr = *rit; + if (!is_phi(instr)) { + for (const Operand& op : instr->operands) { + if (op.isTemp()) + live.emplace(op.getTemp()); + } + if (instr->opcode == aco_opcode::p_create_vector) { + for (const Operand& op : instr->operands) { + if (op.isTemp() && op.getTemp().type() == instr->definitions[0].getTemp().type()) + vectors[op.tempId()] = instr.get(); + } + } + } else if (!instr->definitions[0].isKill() && !instr->definitions[0].isFixed()) { + /* collect information about affinity-related temporaries */ + std::vector<Temp> affinity_related; + /* affinity_related[0] is the last seen affinity-related temp */ + affinity_related.emplace_back(instr->definitions[0].getTemp()); + affinity_related.emplace_back(instr->definitions[0].getTemp()); + for (const Operand& op : instr->operands) { + if (op.isTemp() && op.regClass() == instr->definitions[0].regClass()) { + affinity_related.emplace_back(op.getTemp()); + temp_to_phi_ressources[op.tempId()] = phi_ressources.size(); + } + } + phi_ressources.emplace_back(std::move(affinity_related)); + } + + /* erase from live */ + for (const Definition& def : instr->definitions) { + if (def.isTemp()) { + live.erase(def.getTemp()); + std::map<unsigned, unsigned>::iterator it = temp_to_phi_ressources.find(def.tempId()); + if (it != temp_to_phi_ressources.end() && def.regClass() == phi_ressources[it->second][0].regClass()) + phi_ressources[it->second][0] = def.getTemp(); + } + } + } + } + /* create affinities */ + for (std::vector<Temp>& vec : phi_ressources) { + assert(vec.size() > 1); + for (unsigned i = 1; i < vec.size(); i++) + if (vec[i].id() != vec[0].id()) + affinities[vec[i].id()] = vec[0].id(); + } + + /* state of register file after phis */ + std::vector<std::bitset<128>> sgpr_live_in(program->blocks.size()); + + for (Block& block : program->blocks) { + std::set<Temp>& live = live_out_per_block[block.index]; + /* initialize register file */ + assert(block.index != 0 || live.empty()); + std::array<uint32_t, 512> register_file = {0}; + ctx.war_hint.reset(); + + for (Temp t : live) { + Temp renamed = handle_live_in(t, &block); + if (ctx.assignments.find(renamed.id()) != ctx.assignments.end()) { + for (unsigned i = 0; i < t.size(); i++) + register_file[ctx.assignments[renamed.id()].first + i] = renamed.id(); + } + } + + std::vector<aco_ptr<Instruction>> instructions; + std::vector<aco_ptr<Instruction>>::iterator it; + + /* this is a slight adjustment from the paper as we already have phi nodes: + * We consider them incomplete phis and only handle the definition. 
*/ + + /* handle fixed phi definitions */ + for (it = block.instructions.begin(); it != block.instructions.end(); ++it) { + aco_ptr<Instruction>& phi = *it; + if (!is_phi(phi)) + break; + Definition& definition = phi->definitions[0]; + if (!definition.isFixed()) + continue; + + /* check if a dead exec mask phi is needed */ + if (definition.isKill()) { + for (Operand& op : phi->operands) { + assert(op.isTemp()); + if (ctx.assignments.find(op.tempId()) == ctx.assignments.end() || + ctx.assignments[op.tempId()].first != exec) { + definition.setKill(false); + break; + } + } + } + + if (definition.isKill()) + continue; + + assert(definition.physReg() == exec); + for (unsigned i = 0; i < definition.size(); i++) { + assert(register_file[definition.physReg() + i] == 0); + register_file[definition.physReg() + i] = definition.tempId(); + } + ctx.assignments[definition.tempId()] = {definition.physReg(), definition.regClass()}; + } + + /* look up the affinities */ + for (it = block.instructions.begin(); it != block.instructions.end(); ++it) { + aco_ptr<Instruction>& phi = *it; + if (!is_phi(phi)) + break; + Definition& definition = phi->definitions[0]; + if (definition.isKill() || definition.isFixed()) + continue; + + if (affinities.find(definition.tempId()) != affinities.end() && + ctx.assignments.find(affinities[definition.tempId()]) != ctx.assignments.end()) { + assert(ctx.assignments[affinities[definition.tempId()]].second == definition.regClass()); + PhysReg reg = ctx.assignments[affinities[definition.tempId()]].first; + bool try_use_special_reg = reg == scc || reg == exec; + if (try_use_special_reg) { + for (const Operand& op : phi->operands) { + if (!op.isTemp() || + ctx.assignments.find(op.tempId()) == ctx.assignments.end() || + !(ctx.assignments[op.tempId()].first == reg)) { + try_use_special_reg = false; + break; + } + } + if (!try_use_special_reg) + continue; + } + bool reg_free = true; + for (unsigned i = reg.reg; reg_free && i < reg + definition.size(); i++) { + if (register_file[i] != 0) + reg_free = false; + } + /* only assign if register is still free */ + if (reg_free) { + definition.setFixed(reg); + for (unsigned i = 0; i < definition.size(); i++) + register_file[definition.physReg() + i] = definition.tempId(); + ctx.assignments[definition.tempId()] = {definition.physReg(), definition.regClass()}; + } + } + } + + /* find registers for phis without affinity or where the register was blocked */ + for (it = block.instructions.begin();it != block.instructions.end(); ++it) { + aco_ptr<Instruction>& phi = *it; + if (!is_phi(phi)) + break; + + Definition& definition = phi->definitions[0]; + if (definition.isKill()) + continue; + + renames[block.index][definition.tempId()] = definition.getTemp(); + + if (!definition.isFixed()) { + std::vector<std::pair<Operand, Definition>> parallelcopy; + /* try to find a register that is used by at least one operand */ + for (const Operand& op : phi->operands) { + if (!op.isTemp() || + ctx.assignments.find(op.tempId()) == ctx.assignments.end()) + continue; + PhysReg reg = ctx.assignments[op.tempId()].first; + /* we tried this already on the previous loop */ + if (reg == scc || reg == exec) + continue; + if (get_reg_specified(ctx, register_file, definition.regClass(), parallelcopy, phi, reg)) { + definition.setFixed(reg); + break; + } + } + if (!definition.isFixed()) + definition.setFixed(get_reg(ctx, register_file, definition.regClass(), parallelcopy, phi)); + + /* process parallelcopy */ + for (std::pair<Operand, Definition> pc : parallelcopy) { + /* 
rename */ + std::map<unsigned, Temp>::iterator orig_it = ctx.orig_names.find(pc.first.tempId()); + Temp orig = pc.first.getTemp(); + if (orig_it != ctx.orig_names.end()) + orig = orig_it->second; + else + ctx.orig_names[pc.second.tempId()] = orig; + renames[block.index][orig.id()] = pc.second.getTemp(); + renames[block.index][pc.second.tempId()] = pc.second.getTemp(); + + /* see if it's a copy from a previous phi */ + //TODO: prefer moving some previous phis over live-ins + //TODO: somehow prevent phis fixed before the RA from being updated (shouldn't be a problem in practice since they can only be fixed to exec) + Instruction *prev_phi = NULL; + for (auto it2 = instructions.begin(); it2 != instructions.end(); ++it2) { + if ((*it2)->definitions[0].tempId() == pc.first.tempId()) + prev_phi = it2->get(); + } + if (prev_phi) { + /* if so, just update that phi */ + prev_phi->definitions[0] = pc.second; + continue; + } + + /* otherwise, this is a live-in and we need to create a new phi + * to move it in this block's predecessors */ + aco_opcode opcode = pc.first.getTemp().is_linear() ? aco_opcode::p_linear_phi : aco_opcode::p_phi; + std::vector<unsigned>& preds = pc.first.getTemp().is_linear() ? block.linear_preds : block.logical_preds; + aco_ptr<Instruction> new_phi{create_instruction<Pseudo_instruction>(opcode, Format::PSEUDO, preds.size(), 1)}; + new_phi->definitions[0] = pc.second; + for (unsigned i = 0; i < preds.size(); i++) + new_phi->operands[i] = Operand(pc.first); + instructions.emplace_back(std::move(new_phi)); + } + + for (unsigned i = 0; i < definition.size(); i++) + register_file[definition.physReg() + i] = definition.tempId(); + ctx.assignments[definition.tempId()] = {definition.physReg(), definition.regClass()}; + } + live.emplace(definition.getTemp()); + + /* update phi affinities */ + for (const Operand& op : phi->operands) { + if (op.isTemp() && op.regClass() == phi->definitions[0].regClass()) + affinities[op.tempId()] = definition.tempId(); + } + + instructions.emplace_back(std::move(*it)); + } + + /* fill in sgpr_live_in */ + for (unsigned i = 0; i < ctx.max_used_sgpr; i++) + sgpr_live_in[block.index][i] = register_file[i]; + sgpr_live_in[block.index][127] = register_file[scc.reg]; + + /* Handle all other instructions of the block */ + for (; it != block.instructions.end(); ++it) { + aco_ptr<Instruction>& instr = *it; + + /* parallelcopies from p_phi are inserted here which means + * live ranges of killed operands end here as well */ + if (instr->opcode == aco_opcode::p_logical_end) { + /* no need to process this instruction any further */ + if (block.logical_succs.size() != 1) { + instructions.emplace_back(std::move(instr)); + continue; + } + + Block& succ = program->blocks[block.logical_succs[0]]; + unsigned idx = 0; + for (; idx < succ.logical_preds.size(); idx++) { + if (succ.logical_preds[idx] == block.index) + break; + } + for (aco_ptr<Instruction>& phi : succ.instructions) { + if (phi->opcode == aco_opcode::p_phi) { + if (phi->operands[idx].isTemp() && + phi->operands[idx].getTemp().type() == RegType::sgpr && + phi->operands[idx].isFirstKill()) { + Temp phi_op = read_variable(phi->operands[idx].getTemp(), block.index); + PhysReg reg = ctx.assignments[phi_op.id()].first; + assert(register_file[reg] == phi_op.id()); + register_file[reg] = 0; + } + } else if (phi->opcode != aco_opcode::p_linear_phi) { + break; + } + } + instructions.emplace_back(std::move(instr)); + continue; + } + + std::vector<std::pair<Operand, Definition>> parallelcopy; + + assert(!is_phi(instr)); + 
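The operand handling below resolves register conflicts by collecting (Operand, Definition) pairs in `parallelcopy`; after operands and definitions are processed, the collected pairs are emitted together as a single `p_parallelcopy` placed in front of the instruction. The following is a minimal standalone sketch of the eviction step for a blocked, fixed operand; `RegFile`, `Copy`, `find_free` and `make_room` are invented names used purely for illustration, and the sketch deliberately ignores multi-register temporaries and the `ctx.assignments` bookkeeping of the real pass.
```
/* Illustrative sketch only (not ACO code): evict a temporary that blocks a
 * register an operand is fixed to, and record the move as a parallelcopy. */
#include <array>
#include <cstdint>
#include <cstdio>
#include <vector>

struct Copy { unsigned src_reg, dst_reg; uint32_t temp_id; };

/* maps each physical register to the id of the temporary living there, 0 = free */
using RegFile = std::array<uint32_t, 16>;

unsigned find_free(const RegFile& rf)
{
   for (unsigned r = 0; r < rf.size(); r++)
      if (rf[r] == 0)
         return r;
   return ~0u; /* no free register: the real pass would shuffle or spill instead */
}

void make_room(RegFile& rf, unsigned fixed_reg, std::vector<Copy>& parallelcopy)
{
   uint32_t blocker = rf[fixed_reg];
   if (!blocker)
      return; /* target register already free */
   unsigned dst = find_free(rf);
   parallelcopy.push_back({fixed_reg, dst, blocker});
   rf[dst] = blocker;
   rf[fixed_reg] = 0;
}

int main()
{
   RegFile rf = {};
   rf[3] = 42; /* temporary %42 currently lives in register 3 */
   std::vector<Copy> pc;
   make_room(rf, 3, pc); /* an operand of the current instruction is fixed to register 3 */
   for (const Copy& c : pc)
      printf("copy %%%u: r%u -> r%u\n", c.temp_id, c.src_reg, c.dst_reg);
   return 0;
}
```
The real pass additionally fixes up already-processed operands of the same instruction that referred to the evicted temporary and records the new location in `ctx.assignments`; the sketch only captures the register-file update and the recorded copy.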
+ /* handle operands */ + for (unsigned i = 0; i < instr->operands.size(); ++i) { + auto& operand = instr->operands[i]; + if (!operand.isTemp()) + continue; + + /* rename operands */ + operand.setTemp(read_variable(operand.getTemp(), block.index)); + + /* check if the operand is fixed */ + if (operand.isFixed()) { + + if (operand.physReg() == ctx.assignments[operand.tempId()].first) { + /* we are fine: the operand is already assigned the correct reg */ + + } else { + /* check if target reg is blocked, and move away the blocking var */ + if (register_file[operand.physReg().reg]) { + uint32_t blocking_id = register_file[operand.physReg().reg]; + RegClass rc = ctx.assignments[blocking_id].second; + Operand pc_op = Operand(Temp{blocking_id, rc}); + pc_op.setFixed(operand.physReg()); + Definition pc_def = Definition(Temp{program->allocateId(), pc_op.regClass()}); + /* find free reg */ + PhysReg reg = get_reg(ctx, register_file, pc_op.regClass(), parallelcopy, instr); + pc_def.setFixed(reg); + ctx.assignments[pc_def.tempId()] = {reg, pc_def.regClass()}; + for (unsigned i = 0; i < operand.size(); i++) { + register_file[pc_op.physReg() + i] = 0; + register_file[pc_def.physReg() + i] = pc_def.tempId(); + } + parallelcopy.emplace_back(pc_op, pc_def); + + /* handle renames of previous operands */ + for (unsigned j = 0; j < i; j++) { + Operand& op = instr->operands[j]; + if (op.isTemp() && op.tempId() == blocking_id) { + op = Operand(pc_def.getTemp()); + op.setFixed(reg); + } + } + } + /* move operand to fixed reg and create parallelcopy pair */ + Operand pc_op = operand; + Temp tmp = Temp{program->allocateId(), operand.regClass()}; + Definition pc_def = Definition(tmp); + pc_def.setFixed(operand.physReg()); + pc_op.setFixed(ctx.assignments[operand.tempId()].first); + operand.setTemp(tmp); + ctx.assignments[tmp.id()] = {pc_def.physReg(), pc_def.regClass()}; + operand.setFixed(pc_def.physReg()); + for (unsigned i = 0; i < operand.size(); i++) { + register_file[pc_op.physReg() + i] = 0; + register_file[pc_def.physReg() + i] = tmp.id(); + } + parallelcopy.emplace_back(pc_op, pc_def); + } + } else { + assert(ctx.assignments.find(operand.tempId()) != ctx.assignments.end()); + PhysReg reg = ctx.assignments[operand.tempId()].first; + + if (operand_can_use_reg(instr, i, reg)) { + operand.setFixed(ctx.assignments[operand.tempId()].first); + } else { + Operand pc_op = operand; + pc_op.setFixed(reg); + PhysReg new_reg = get_reg(ctx, register_file, operand.regClass(), parallelcopy, instr); + Definition pc_def = Definition(program->allocateId(), new_reg, pc_op.regClass()); + ctx.assignments[pc_def.tempId()] = {reg, pc_def.regClass()}; + for (unsigned i = 0; i < operand.size(); i++) { + register_file[pc_op.physReg() + i] = 0; + register_file[pc_def.physReg() + i] = pc_def.tempId(); + } + parallelcopy.emplace_back(pc_op, pc_def); + operand.setFixed(new_reg); + } + + if (instr->format == Format::EXP || + (instr->isVMEM() && i == 3 && program->chip_class == GFX6) || + (instr->format == Format::DS && static_cast<DS_instruction*>(instr.get())->gds)) { + for (unsigned j = 0; j < operand.size(); j++) + ctx.war_hint.set(operand.physReg().reg + j); + } + } + std::map<unsigned, phi_info>::iterator phi = phi_map.find(operand.getTemp().id()); + if (phi != phi_map.end()) + phi->second.uses.emplace(instr.get()); + + } + /* remove dead vars from register file */ + for (const Operand& op : instr->operands) { + if (op.isTemp() && op.isFirstKill()) + for (unsigned j = 0; j < op.size(); j++) + register_file[op.physReg() + j] = 0; + } 
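+ /* the registers of first-kill operands were just cleared, so the definitions handled below are free to reuse them */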
+ + /* try to optimize v_mad_f32 -> v_mac_f32 */ + if (instr->opcode == aco_opcode::v_mad_f32 && + instr->operands[2].isTemp() && + instr->operands[2].isKill() && + instr->operands[2].getTemp().type() == RegType::vgpr && + instr->operands[1].isTemp() && + instr->operands[1].getTemp().type() == RegType::vgpr) { /* TODO: swap src0 and src1 in this case */ + VOP3A_instruction* vop3 = static_cast<VOP3A_instruction*>(instr.get()); + bool can_use_mac = !(vop3->abs[0] || vop3->abs[1] || vop3->abs[2] || + vop3->opsel[0] || vop3->opsel[1] || vop3->opsel[2] || + vop3->neg[0] || vop3->neg[1] || vop3->neg[2] || + vop3->clamp || vop3->omod); + if (can_use_mac) { + instr->format = Format::VOP2; + instr->opcode = aco_opcode::v_mac_f32; + } + } + + /* handle definitions which must have the same register as an operand */ + if (instr->opcode == aco_opcode::v_interp_p2_f32 || + instr->opcode == aco_opcode::v_mac_f32 || + instr->opcode == aco_opcode::v_writelane_b32) { + instr->definitions[0].setFixed(instr->operands[2].physReg()); + } else if (instr->opcode == aco_opcode::s_addk_i32 || + instr->opcode == aco_opcode::s_mulk_i32) { + instr->definitions[0].setFixed(instr->operands[0].physReg()); + } else if ((instr->format == Format::MUBUF || + instr->format == Format::MIMG) && + instr->definitions.size() == 1 && + instr->operands.size() == 4) { + instr->definitions[0].setFixed(instr->operands[3].physReg()); + } + + ctx.defs_done.reset(); + + /* handle fixed definitions first */ + for (unsigned i = 0; i < instr->definitions.size(); ++i) { + auto& definition = instr->definitions[i]; + if (!definition.isFixed()) + continue; + + adjust_max_used_regs(ctx, definition.regClass(), definition.physReg()); + /* check if the target register is blocked */ + if (register_file[definition.physReg().reg] != 0) { + /* create parallelcopy pair to move blocking var */ + Temp tmp = {register_file[definition.physReg()], ctx.assignments[register_file[definition.physReg()]].second}; + Operand pc_op = Operand(tmp); + pc_op.setFixed(ctx.assignments[register_file[definition.physReg().reg]].first); + RegClass rc = pc_op.regClass(); + tmp = Temp{program->allocateId(), rc}; + Definition pc_def = Definition(tmp); + + /* re-enable the killed operands, so that we don't move the blocking var there */ + for (const Operand& op : instr->operands) { + if (op.isTemp() && op.isFirstKill()) + for (unsigned j = 0; j < op.size(); j++) + register_file[op.physReg() + j] = 0xFFFF; + } + + /* find a new register for the blocking variable */ + PhysReg reg = get_reg(ctx, register_file, rc, parallelcopy, instr); + /* once again, disable killed operands */ + for (const Operand& op : instr->operands) { + if (op.isTemp() && op.isFirstKill()) + for (unsigned j = 0; j < op.size(); j++) + register_file[op.physReg() + j] = 0; + } + for (unsigned k = 0; k < i; k++) { + if (instr->definitions[k].isTemp() && ctx.defs_done.test(k) && !instr->definitions[k].isKill()) + for (unsigned j = 0; j < instr->definitions[k].size(); j++) + register_file[instr->definitions[k].physReg() + j] = instr->definitions[k].tempId(); + } + pc_def.setFixed(reg); + + /* finish assignment of parallelcopy */ + ctx.assignments[pc_def.tempId()] = {reg, pc_def.regClass()}; + parallelcopy.emplace_back(pc_op, pc_def); + + /* add changes to reg_file */ + for (unsigned i = 0; i < pc_op.size(); i++) { + register_file[pc_op.physReg() + i] = 0x0; + register_file[pc_def.physReg() + i] = pc_def.tempId(); + } + } + ctx.defs_done.set(i); + + if (!definition.isTemp()) + continue; + + /* set live if it has a 
kill point */ + if (!definition.isKill()) + live.emplace(definition.getTemp()); + + ctx.assignments[definition.tempId()] = {definition.physReg(), definition.regClass()}; + renames[block.index][definition.tempId()] = definition.getTemp(); + for (unsigned j = 0; j < definition.size(); j++) + register_file[definition.physReg() + j] = definition.tempId(); + } + + /* handle all other definitions */ + for (unsigned i = 0; i < instr->definitions.size(); ++i) { + auto& definition = instr->definitions[i]; + + if (definition.isFixed() || !definition.isTemp()) + continue; + + /* find free reg */ + if (definition.hasHint() && register_file[definition.physReg().reg] == 0) + definition.setFixed(definition.physReg()); + else if (instr->opcode == aco_opcode::p_split_vector) { + PhysReg reg = PhysReg{instr->operands[0].physReg() + i * definition.size()}; + if (!get_reg_specified(ctx, register_file, definition.regClass(), parallelcopy, instr, reg)) + reg = get_reg(ctx, register_file, definition.regClass(), parallelcopy, instr); + definition.setFixed(reg); + } else if (instr->opcode == aco_opcode::p_wqm) { + PhysReg reg; + if (instr->operands[0].isKill() && instr->operands[0].getTemp().type() == definition.getTemp().type()) { + reg = instr->operands[0].physReg(); + assert(register_file[reg.reg] == 0); + } else { + reg = get_reg(ctx, register_file, definition.regClass(), parallelcopy, instr); + } + definition.setFixed(reg); + } else if (instr->opcode == aco_opcode::p_extract_vector) { + PhysReg reg; + if (instr->operands[0].isKill() && + instr->operands[0].getTemp().type() == definition.getTemp().type()) { + reg = instr->operands[0].physReg(); + reg.reg += definition.size() * instr->operands[1].constantValue(); + assert(register_file[reg.reg] == 0); + } else { + reg = get_reg(ctx, register_file, definition.regClass(), parallelcopy, instr); + } + definition.setFixed(reg); + } else if (instr->opcode == aco_opcode::p_create_vector) { + PhysReg reg = get_reg_create_vector(ctx, register_file, definition.regClass(), + parallelcopy, instr); + definition.setFixed(reg); + } else if (affinities.find(definition.tempId()) != affinities.end() && + ctx.assignments.find(affinities[definition.tempId()]) != ctx.assignments.end()) { + PhysReg reg = ctx.assignments[affinities[definition.tempId()]].first; + if (get_reg_specified(ctx, register_file, definition.regClass(), parallelcopy, instr, reg)) + definition.setFixed(reg); + else + definition.setFixed(get_reg(ctx, register_file, definition.regClass(), parallelcopy, instr)); + + } else if (vectors.find(definition.tempId()) != vectors.end()) { + Instruction* vec = vectors[definition.tempId()]; + unsigned offset = 0; + for (const Operand& op : vec->operands) { + if (op.isTemp() && op.tempId() == definition.tempId()) + break; + else + offset += op.size(); + } + unsigned k = 0; + for (const Operand& op : vec->operands) { + if (op.isTemp() && + op.tempId() != definition.tempId() && + op.getTemp().type() == definition.getTemp().type() && + ctx.assignments.find(op.tempId()) != ctx.assignments.end()) { + PhysReg reg = ctx.assignments[op.tempId()].first; + reg.reg = reg - k + offset; + if (get_reg_specified(ctx, register_file, definition.regClass(), parallelcopy, instr, reg)) { + definition.setFixed(reg); + break; + } + } + k += op.size(); + } + if (!definition.isFixed()) { + std::pair<PhysReg, bool> res = get_reg_vec(ctx, register_file, vec->definitions[0].regClass()); + PhysReg reg = res.first; + if (res.second) { + reg.reg += offset; + } else { + reg = get_reg(ctx, register_file, 
definition.regClass(), parallelcopy, instr); + } + definition.setFixed(reg); + } + } else + definition.setFixed(get_reg(ctx, register_file, definition.regClass(), parallelcopy, instr)); + + assert(definition.isFixed() && ((definition.getTemp().type() == RegType::vgpr && definition.physReg() >= 256) || + (definition.getTemp().type() != RegType::vgpr && definition.physReg() < 256))); + ctx.defs_done.set(i); + + /* set live if it has a kill point */ + if (!definition.isKill()) + live.emplace(definition.getTemp()); + + ctx.assignments[definition.tempId()] = {definition.physReg(), definition.regClass()}; + renames[block.index][definition.tempId()] = definition.getTemp(); + for (unsigned j = 0; j < definition.size(); j++) + register_file[definition.physReg() + j] = definition.tempId(); + } + + handle_pseudo(ctx, register_file, instr.get()); + + /* kill definitions */ + for (const Definition& def : instr->definitions) { + if (def.isTemp() && def.isKill()) { + for (unsigned j = 0; j < def.size(); j++) { + register_file[def.physReg() + j] = 0; + } + } + } + + /* emit parallelcopy */ + if (!parallelcopy.empty()) { + aco_ptr<Pseudo_instruction> pc; + pc.reset(create_instruction<Pseudo_instruction>(aco_opcode::p_parallelcopy, Format::PSEUDO, parallelcopy.size(), parallelcopy.size())); + bool temp_in_scc = register_file[scc.reg]; + bool sgpr_operands_alias_defs = false; + uint64_t sgpr_operands[4] = {0, 0, 0, 0}; + for (unsigned i = 0; i < parallelcopy.size(); i++) { + if (temp_in_scc && parallelcopy[i].first.isTemp() && parallelcopy[i].first.getTemp().type() == RegType::sgpr) { + if (!sgpr_operands_alias_defs) { + unsigned reg = parallelcopy[i].first.physReg().reg; + unsigned size = parallelcopy[i].first.getTemp().size(); + sgpr_operands[reg / 64u] |= ((1u << size) - 1) << (reg % 64u); + + reg = parallelcopy[i].second.physReg().reg; + size = parallelcopy[i].second.getTemp().size(); + if (sgpr_operands[reg / 64u] & ((1u << size) - 1) << (reg % 64u)) + sgpr_operands_alias_defs = true; + } + } + + pc->operands[i] = parallelcopy[i].first; + pc->definitions[i] = parallelcopy[i].second; + + /* it might happen that the operand is already renamed. we have to restore the original name. 
*/ + std::map<unsigned, Temp>::iterator it = ctx.orig_names.find(pc->operands[i].tempId()); + if (it != ctx.orig_names.end()) + pc->operands[i].setTemp(it->second); + unsigned orig_id = pc->operands[i].tempId(); + ctx.orig_names[pc->definitions[i].tempId()] = pc->operands[i].getTemp(); + + pc->operands[i].setTemp(read_variable(pc->operands[i].getTemp(), block.index)); + renames[block.index][orig_id] = pc->definitions[i].getTemp(); + renames[block.index][pc->definitions[i].tempId()] = pc->definitions[i].getTemp(); + std::map<unsigned, phi_info>::iterator phi = phi_map.find(pc->operands[i].tempId()); + if (phi != phi_map.end()) + phi->second.uses.emplace(pc.get()); + } + + if (temp_in_scc && sgpr_operands_alias_defs) { + /* disable definitions and re-enable operands */ + for (const Definition& def : instr->definitions) { + if (def.isTemp() && !def.isKill()) { + for (unsigned j = 0; j < def.size(); j++) { + register_file[def.physReg() + j] = 0x0; + } + } + } + for (const Operand& op : instr->operands) { + if (op.isTemp() && op.isFirstKill()) { + for (unsigned j = 0; j < op.size(); j++) + register_file[op.physReg() + j] = 0xFFFF; + } + } + + handle_pseudo(ctx, register_file, pc.get()); + + /* re-enable live vars */ + for (const Operand& op : instr->operands) { + if (op.isTemp() && op.isFirstKill()) + for (unsigned j = 0; j < op.size(); j++) + register_file[op.physReg() + j] = 0x0; + } + for (const Definition& def : instr->definitions) { + if (def.isTemp() && !def.isKill()) { + for (unsigned j = 0; j < def.size(); j++) { + register_file[def.physReg() + j] = def.tempId(); + } + } + } + } else { + pc->tmp_in_scc = false; + } + + instructions.emplace_back(std::move(pc)); + } + + /* some instructions need VOP3 encoding if operand/definition is not assigned to VCC */ + bool instr_needs_vop3 = !instr->isVOP3() && + ((instr->format == Format::VOPC && !(instr->definitions[0].physReg() == vcc)) || + (instr->opcode == aco_opcode::v_cndmask_b32 && !(instr->operands[2].physReg() == vcc)) || + ((instr->opcode == aco_opcode::v_add_co_u32 || + instr->opcode == aco_opcode::v_addc_co_u32 || + instr->opcode == aco_opcode::v_sub_co_u32 || + instr->opcode == aco_opcode::v_subb_co_u32 || + instr->opcode == aco_opcode::v_subrev_co_u32 || + instr->opcode == aco_opcode::v_subbrev_co_u32) && + !(instr->definitions[1].physReg() == vcc)) || + ((instr->opcode == aco_opcode::v_addc_co_u32 || + instr->opcode == aco_opcode::v_subb_co_u32 || + instr->opcode == aco_opcode::v_subbrev_co_u32) && + !(instr->operands[2].physReg() == vcc))); + if (instr_needs_vop3) { + + /* if the first operand is a literal, we have to move it to a reg */ + if (instr->operands.size() && instr->operands[0].isLiteral()) { + bool can_sgpr = true; + /* check, if we have to move to vgpr */ + for (const Operand& op : instr->operands) { + if (op.isTemp() && op.getTemp().type() == RegType::sgpr) { + can_sgpr = false; + break; + } + } + aco_ptr<Instruction> mov; + if (can_sgpr) + mov.reset(create_instruction<SOP1_instruction>(aco_opcode::s_mov_b32, Format::SOP1, 1, 1)); + else + mov.reset(create_instruction<VOP1_instruction>(aco_opcode::v_mov_b32, Format::VOP1, 1, 1)); + mov->operands[0] = instr->operands[0]; + Temp tmp = {program->allocateId(), can_sgpr ? 
s1 : v1}; + mov->definitions[0] = Definition(tmp); + /* disable definitions and re-enable operands */ + for (const Definition& def : instr->definitions) { + for (unsigned j = 0; j < def.size(); j++) { + register_file[def.physReg() + j] = 0x0; + } + } + for (const Operand& op : instr->operands) { + if (op.isTemp() && op.isFirstKill()) { + for (unsigned j = 0; j < op.size(); j++) + register_file[op.physReg() + j] = 0xFFFF; + } + } + mov->definitions[0].setFixed(get_reg(ctx, register_file, tmp.regClass(), parallelcopy, mov)); + instr->operands[0] = Operand(tmp); + instr->operands[0].setFixed(mov->definitions[0].physReg()); + instructions.emplace_back(std::move(mov)); + /* re-enable live vars */ + for (const Operand& op : instr->operands) { + if (op.isTemp() && op.isFirstKill()) + for (unsigned j = 0; j < op.size(); j++) + register_file[op.physReg() + j] = 0x0; + } + for (const Definition& def : instr->definitions) { + if (def.isTemp() && !def.isKill()) { + for (unsigned j = 0; j < def.size(); j++) { + register_file[def.physReg() + j] = def.tempId(); + } + } + } + } + + /* change the instruction to VOP3 to enable an arbitrary register pair as dst */ + aco_ptr<Instruction> tmp = std::move(instr); + Format format = asVOP3(tmp->format); + instr.reset(create_instruction<VOP3A_instruction>(tmp->opcode, format, tmp->operands.size(), tmp->definitions.size())); + for (unsigned i = 0; i < instr->operands.size(); i++) { + Operand& operand = tmp->operands[i]; + instr->operands[i] = operand; + /* keep phi_map up to date */ + if (operand.isTemp()) { + std::map<unsigned, phi_info>::iterator phi = phi_map.find(operand.tempId()); + if (phi != phi_map.end()) { + phi->second.uses.erase(tmp.get()); + phi->second.uses.emplace(instr.get()); + } + } + } + std::copy(tmp->definitions.begin(), tmp->definitions.end(), instr->definitions.begin()); + } + instructions.emplace_back(std::move(*it)); + + } /* end for Instr */ + + block.instructions = std::move(instructions); + + filled[block.index] = true; + for (unsigned succ_idx : block.linear_succs) { + Block& succ = program->blocks[succ_idx]; + /* seal block if all predecessors are filled */ + bool all_filled = true; + for (unsigned pred_idx : succ.linear_preds) { + if (!filled[pred_idx]) { + all_filled = false; + break; + } + } + if (all_filled) { + /* finish incomplete phis and check if they became trivial */ + for (Instruction* phi : incomplete_phis[succ_idx]) { + std::vector<unsigned> preds = phi->definitions[0].getTemp().is_linear() ? succ.linear_preds : succ.logical_preds; + for (unsigned i = 0; i < phi->operands.size(); i++) { + phi->operands[i].setTemp(read_variable(phi->operands[i].getTemp(), preds[i])); + phi->operands[i].setFixed(ctx.assignments[phi->operands[i].tempId()].first); + } + try_remove_trivial_phi(phi_map.find(phi->definitions[0].tempId())); + } + /* complete the original phi nodes, but no need to check triviality */ + for (aco_ptr<Instruction>& instr : succ.instructions) { + if (!is_phi(instr)) + break; + std::vector<unsigned> preds = instr->opcode == aco_opcode::p_phi ? 
succ.logical_preds : succ.linear_preds; + + for (unsigned i = 0; i < instr->operands.size(); i++) { + auto& operand = instr->operands[i]; + if (!operand.isTemp()) + continue; + operand.setTemp(read_variable(operand.getTemp(), preds[i])); + operand.setFixed(ctx.assignments[operand.tempId()].first); + std::map<unsigned, phi_info>::iterator phi = phi_map.find(operand.getTemp().id()); + if (phi != phi_map.end()) + phi->second.uses.emplace(instr.get()); + } + } + sealed[succ_idx] = true; + } + } + } /* end for BB */ + + /* remove trivial phis */ + for (Block& block : program->blocks) { + auto end = std::find_if(block.instructions.begin(), block.instructions.end(), + [](aco_ptr<Instruction>& instr) { return !is_phi(instr);}); + auto middle = std::remove_if(block.instructions.begin(), end, + [](const aco_ptr<Instruction>& instr) { return instr->definitions.empty();}); + block.instructions.erase(middle, end); + } + + /* find scc spill registers which may be needed for parallelcopies created by phis */ + for (Block& block : program->blocks) { + if (block.linear_preds.size() <= 1) + continue; + + std::bitset<128> regs = sgpr_live_in[block.index]; + if (!regs[127]) + continue; + + /* choose a register */ + int16_t reg = 0; + for (; reg < ctx.program->max_reg_demand.sgpr && regs[reg]; reg++) + ; + assert(reg < ctx.program->max_reg_demand.sgpr); + adjust_max_used_regs(ctx, s1, reg); + + /* update predecessors */ + for (unsigned& pred_index : block.linear_preds) { + Block& pred = program->blocks[pred_index]; + pred.scc_live_out = true; + pred.scratch_sgpr = PhysReg{(uint16_t)reg}; + } + } + + /* num_gpr = rnd_up(max_used_gpr + 1) */ + program->config->num_vgprs = (ctx.max_used_vgpr + 1 + 3) & ~3; + if (program->family == CHIP_TONGA || program->family == CHIP_ICELAND) { + assert(ctx.max_used_sgpr <= 93); + ctx.max_used_sgpr = 93; /* workaround hardware bug */ + } + program->config->num_sgprs = (ctx.max_used_sgpr + 1 + 2 + 7) & ~7; /* + 2 sgprs for vcc */ +} + +} diff --git a/src/amd/compiler/aco_scheduler.cpp b/src/amd/compiler/aco_scheduler.cpp new file mode 100644 index 00000000000..0cd67a979e0 --- /dev/null +++ b/src/amd/compiler/aco_scheduler.cpp @@ -0,0 +1,835 @@ +/* + * Copyright © 2018 Valve Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ * + */ + +#include "aco_ir.h" +#include <unordered_set> +#include <algorithm> + +#include "vulkan/radv_shader.h" // for radv_nir_compiler_options +#include "amdgfxregs.h" + +#define SMEM_WINDOW_SIZE (350 - ctx.num_waves * 35) +#define VMEM_WINDOW_SIZE (1024 - ctx.num_waves * 64) +#define POS_EXP_WINDOW_SIZE 512 +#define SMEM_MAX_MOVES (80 - ctx.num_waves * 8) +#define VMEM_MAX_MOVES (128 - ctx.num_waves * 4) +#define POS_EXP_MAX_MOVES 512 + +namespace aco { + +struct sched_ctx { + std::vector<bool> depends_on; + std::vector<bool> RAR_dependencies; + RegisterDemand max_registers; + int16_t num_waves; + int16_t last_SMEM_stall; + int last_SMEM_dep_idx; +}; + +/* This scheduler is a simple bottom-up pass based on ideas from + * "A Novel Lightweight Instruction Scheduling Algorithm for Just-In-Time Compiler" + * from Xiaohua Shi and Peng Guo. + * The basic approach is to iterate over all instructions. When a memory instruction + * is encountered it tries to move independent instructions from above and below + * between the memory instruction and it's first user. + * The novelty is that this scheduler cares for the current register pressure: + * Instructions will only be moved if the register pressure won't exceed a certain bound. + */ + +template <typename T> +void move_element(T& list, size_t idx, size_t before) { + if (idx < before) { + auto begin = std::next(list.begin(), idx); + auto end = std::next(list.begin(), before); + std::rotate(begin, begin + 1, end); + } else if (idx > before) { + auto begin = std::next(list.begin(), before); + auto end = std::next(list.begin(), idx + 1); + std::rotate(begin, end - 1, end); + } +} + +static RegisterDemand getLiveChanges(aco_ptr<Instruction>& instr) +{ + RegisterDemand changes; + for (const Definition& def : instr->definitions) { + if (!def.isTemp() || def.isKill()) + continue; + changes += def.getTemp(); + } + + for (const Operand& op : instr->operands) { + if (!op.isTemp() || !op.isFirstKill()) + continue; + changes -= op.getTemp(); + } + + return changes; +} + +static RegisterDemand getTempRegisters(aco_ptr<Instruction>& instr) +{ + RegisterDemand temp_registers; + for (const Definition& def : instr->definitions) { + if (!def.isTemp() || !def.isKill()) + continue; + temp_registers += def.getTemp(); + } + return temp_registers; +} + +static bool is_spill_reload(aco_ptr<Instruction>& instr) +{ + return instr->opcode == aco_opcode::p_spill || instr->opcode == aco_opcode::p_reload; +} + +bool can_move_instr(aco_ptr<Instruction>& instr, Instruction* current, int moving_interaction) +{ + /* don't move exports so that they stay closer together */ + if (instr->format == Format::EXP) + return false; + + /* handle barriers */ + + /* TODO: instead of stopping, maybe try to move the barriers and any + * instructions interacting with them instead? 
*/ + if (instr->format != Format::PSEUDO_BARRIER) { + if (instr->opcode == aco_opcode::s_barrier) { + bool can_reorder = false; + switch (current->format) { + case Format::SMEM: + can_reorder = static_cast<SMEM_instruction*>(current)->can_reorder; + break; + case Format::MUBUF: + can_reorder = static_cast<MUBUF_instruction*>(current)->can_reorder; + break; + case Format::MIMG: + can_reorder = static_cast<MIMG_instruction*>(current)->can_reorder; + break; + default: + break; + } + return can_reorder && moving_interaction == barrier_none; + } else { + return true; + } + } + + int interaction = get_barrier_interaction(current); + interaction |= moving_interaction; + + switch (instr->opcode) { + case aco_opcode::p_memory_barrier_atomic: + return !(interaction & barrier_atomic); + /* For now, buffer and image barriers are treated the same. this is because of + * dEQP-VK.memory_model.message_passing.core11.u32.coherent.fence_fence.atomicwrite.device.payload_nonlocal.buffer.guard_nonlocal.image.comp + * which seems to use an image load to determine if the result of a buffer load is valid. So the ordering of the two loads is important. + * I /think/ we should probably eventually expand the meaning of a buffer barrier so that all buffer operations before it, must stay before it + * and that both image and buffer operations after it, must stay after it. We should also do the same for image barriers. + * Or perhaps the problem is that we don't have a combined barrier instruction for both buffers and images, but the CTS test expects us to? + * Either way, this solution should work. */ + case aco_opcode::p_memory_barrier_buffer: + case aco_opcode::p_memory_barrier_image: + return !(interaction & (barrier_image | barrier_buffer)); + case aco_opcode::p_memory_barrier_shared: + return !(interaction & barrier_shared); + case aco_opcode::p_memory_barrier_all: + return interaction == barrier_none; + default: + return false; + } +} + +bool can_reorder(Instruction* candidate, bool allow_smem) +{ + switch (candidate->format) { + case Format::SMEM: + return allow_smem || static_cast<SMEM_instruction*>(candidate)->can_reorder; + case Format::MUBUF: + return static_cast<MUBUF_instruction*>(candidate)->can_reorder; + case Format::MIMG: + return static_cast<MIMG_instruction*>(candidate)->can_reorder; + case Format::MTBUF: + return static_cast<MTBUF_instruction*>(candidate)->can_reorder; + case Format::FLAT: + case Format::GLOBAL: + case Format::SCRATCH: + return false; + default: + return true; + } +} + +void schedule_SMEM(sched_ctx& ctx, Block* block, + std::vector<RegisterDemand>& register_demand, + Instruction* current, int idx) +{ + assert(idx != 0); + int window_size = SMEM_WINDOW_SIZE; + int max_moves = SMEM_MAX_MOVES; + int16_t k = 0; + bool can_reorder_cur = can_reorder(current, false); + + /* create the initial set of values which current depends on */ + std::fill(ctx.depends_on.begin(), ctx.depends_on.end(), false); + for (const Operand& op : current->operands) { + if (op.isTemp()) + ctx.depends_on[op.tempId()] = true; + } + + /* maintain how many registers remain free when moving instructions */ + RegisterDemand register_pressure = register_demand[idx]; + + /* first, check if we have instructions before current to move down */ + int insert_idx = idx + 1; + int moving_interaction = barrier_none; + bool moving_spill = false; + + for (int candidate_idx = idx - 1; k < max_moves && candidate_idx > (int) idx - window_size; candidate_idx--) { + assert(candidate_idx >= 0); + aco_ptr<Instruction>& candidate = 
block->instructions[candidate_idx]; + + /* break if we'd make the previous SMEM instruction stall */ + bool can_stall_prev_smem = idx <= ctx.last_SMEM_dep_idx && candidate_idx < ctx.last_SMEM_dep_idx; + if (can_stall_prev_smem && ctx.last_SMEM_stall >= 0) + break; + + /* break when encountering another MEM instruction, logical_start or barriers */ + if (!can_reorder(candidate.get(), false) && !can_reorder_cur) + break; + if (candidate->opcode == aco_opcode::p_logical_start) + break; + if (!can_move_instr(candidate, current, moving_interaction)) + break; + register_pressure.update(register_demand[candidate_idx]); + + /* if current depends on candidate, add additional dependencies and continue */ + bool can_move_down = true; + bool writes_exec = false; + for (const Definition& def : candidate->definitions) { + if (def.isTemp() && ctx.depends_on[def.tempId()]) + can_move_down = false; + if (def.isFixed() && def.physReg() == exec) + writes_exec = true; + } + if (writes_exec) + break; + + if (moving_spill && is_spill_reload(candidate)) + can_move_down = false; + if ((moving_interaction & barrier_shared) && candidate->format == Format::DS) + can_move_down = false; + moving_interaction |= get_barrier_interaction(candidate.get()); + moving_spill |= is_spill_reload(candidate); + if (!can_move_down) { + for (const Operand& op : candidate->operands) { + if (op.isTemp()) + ctx.depends_on[op.tempId()] = true; + } + continue; + } + + bool register_pressure_unknown = false; + /* check if one of candidate's operands is killed by depending instruction */ + for (const Operand& op : candidate->operands) { + if (op.isTemp() && ctx.depends_on[op.tempId()]) { + // FIXME: account for difference in register pressure + register_pressure_unknown = true; + } + } + if (register_pressure_unknown) { + for (const Operand& op : candidate->operands) { + if (op.isTemp()) + ctx.depends_on[op.tempId()] = true; + } + continue; + } + + /* check if register pressure is low enough: the diff is negative if register pressure is decreased */ + const RegisterDemand candidate_diff = getLiveChanges(candidate); + const RegisterDemand tempDemand = getTempRegisters(candidate); + if (RegisterDemand(register_pressure - candidate_diff).exceeds(ctx.max_registers)) + break; + const RegisterDemand tempDemand2 = getTempRegisters(block->instructions[insert_idx - 1]); + const RegisterDemand new_demand = register_demand[insert_idx - 1] - tempDemand2 + tempDemand; + if (new_demand.exceeds(ctx.max_registers)) + break; + // TODO: we might want to look further to find a sequence of instructions to move down which doesn't exceed reg pressure + + /* move the candidate below the memory load */ + move_element(block->instructions, candidate_idx, insert_idx); + + /* update register pressure */ + move_element(register_demand, candidate_idx, insert_idx); + for (int i = candidate_idx; i < insert_idx - 1; i++) { + register_demand[i] -= candidate_diff; + } + register_demand[insert_idx - 1] = new_demand; + register_pressure -= candidate_diff; + + if (candidate_idx < ctx.last_SMEM_dep_idx) + ctx.last_SMEM_stall++; + insert_idx--; + k++; + } + + /* create the initial set of values which depend on current */ + std::fill(ctx.depends_on.begin(), ctx.depends_on.end(), false); + std::fill(ctx.RAR_dependencies.begin(), ctx.RAR_dependencies.end(), false); + for (const Definition& def : current->definitions) { + if (def.isTemp()) + ctx.depends_on[def.tempId()] = true; + } + + /* find the first instruction depending on current or find another MEM */ + insert_idx = idx + 1; 
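Both scan directions in `schedule_SMEM` (and the VMEM variant further down) relocate a candidate with `move_element`, defined above as a `std::rotate` over the affected range; the same rotation is applied to the per-instruction `register_demand` vector so the two stay in sync. A tiny standalone demonstration of the rotation, using plain integers in place of instructions:
```
/* Standalone demonstration of the move_element() rotation used by the scheduler
 * (same helper as above, instantiated with a vector of ints instead of instructions). */
#include <algorithm>
#include <cstddef>
#include <cstdio>
#include <iterator>
#include <vector>

template <typename T>
void move_element(T& list, size_t idx, size_t before)
{
   if (idx < before) {
      auto begin = std::next(list.begin(), idx);
      auto end = std::next(list.begin(), before);
      std::rotate(begin, begin + 1, end); /* moving down: element at idx ends up at before - 1 */
   } else if (idx > before) {
      auto begin = std::next(list.begin(), before);
      auto end = std::next(list.begin(), idx + 1);
      std::rotate(begin, end - 1, end); /* moving up: element at idx ends up at before */
   }
}

int main()
{
   std::vector<int> insns = {0, 1, 2, 3, 4, 5};
   move_element(insns, 1, 5); /* move the element at index 1 down, directly in front of index 5 */
   for (int i : insns)
      printf("%d ", i); /* prints: 0 2 3 4 1 5 */
   printf("\n");
   return 0;
}
```
Only the elements between the two positions shift by one place, which is why the scheduling loops only need to adjust the `register_demand` entries inside the rotated window.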
+ moving_interaction = barrier_none; + moving_spill = false; + + bool found_dependency = false; + /* second, check if we have instructions after current to move up */ + for (int candidate_idx = idx + 1; k < max_moves && candidate_idx < (int) idx + window_size; candidate_idx++) { + assert(candidate_idx < (int) block->instructions.size()); + aco_ptr<Instruction>& candidate = block->instructions[candidate_idx]; + + if (candidate->opcode == aco_opcode::p_logical_end) + break; + if (!can_move_instr(candidate, current, moving_interaction)) + break; + + const bool writes_exec = std::any_of(candidate->definitions.begin(), candidate->definitions.end(), + [](const Definition& def) { return def.isFixed() && def.physReg() == exec;}); + if (writes_exec) + break; + + /* check if candidate depends on current */ + bool is_dependency = std::any_of(candidate->operands.begin(), candidate->operands.end(), + [&ctx](const Operand& op) { return op.isTemp() && ctx.depends_on[op.tempId()];}); + if (moving_spill && is_spill_reload(candidate)) + is_dependency = true; + if ((moving_interaction & barrier_shared) && candidate->format == Format::DS) + is_dependency = true; + moving_interaction |= get_barrier_interaction(candidate.get()); + moving_spill |= is_spill_reload(candidate); + if (is_dependency) { + for (const Definition& def : candidate->definitions) { + if (def.isTemp()) + ctx.depends_on[def.tempId()] = true; + } + for (const Operand& op : candidate->operands) { + if (op.isTemp()) + ctx.RAR_dependencies[op.tempId()] = true; + } + if (!found_dependency) { + insert_idx = candidate_idx; + found_dependency = true; + /* init register pressure */ + register_pressure = register_demand[insert_idx - 1]; + } + } + + if (!can_reorder(candidate.get(), false) && !can_reorder_cur) + break; + + if (!found_dependency) { + k++; + continue; + } + + /* update register pressure */ + register_pressure.update(register_demand[candidate_idx - 1]); + + if (is_dependency) + continue; + assert(insert_idx != idx); + + // TODO: correctly calculate register pressure for this case + bool register_pressure_unknown = false; + /* check if candidate uses/kills an operand which is used by a dependency */ + for (const Operand& op : candidate->operands) { + if (op.isTemp() && ctx.RAR_dependencies[op.tempId()]) + register_pressure_unknown = true; + } + if (register_pressure_unknown) { + for (const Definition& def : candidate->definitions) { + if (def.isTemp()) + ctx.RAR_dependencies[def.tempId()] = true; + } + for (const Operand& op : candidate->operands) { + if (op.isTemp()) + ctx.RAR_dependencies[op.tempId()] = true; + } + continue; + } + + /* check if register pressure is low enough: the diff is negative if register pressure is decreased */ + const RegisterDemand candidate_diff = getLiveChanges(candidate); + const RegisterDemand temp = getTempRegisters(candidate); + if (RegisterDemand(register_pressure + candidate_diff).exceeds(ctx.max_registers)) + break; + const RegisterDemand temp2 = getTempRegisters(block->instructions[insert_idx - 1]); + const RegisterDemand new_demand = register_demand[insert_idx - 1] - temp2 + candidate_diff + temp; + if (new_demand.exceeds(ctx.max_registers)) + break; + + /* move the candidate above the insert_idx */ + move_element(block->instructions, candidate_idx, insert_idx); + + /* update register pressure */ + move_element(register_demand, candidate_idx, insert_idx); + for (int i = insert_idx + 1; i <= candidate_idx; i++) { + register_demand[i] += candidate_diff; + } + register_demand[insert_idx] = new_demand; + 
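/* the candidate now occupies insert_idx: fold its demand delta into the running pressure and step the insertion point past it */ +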
register_pressure += candidate_diff; + insert_idx++; + k++; + } + + ctx.last_SMEM_dep_idx = found_dependency ? insert_idx : 0; + ctx.last_SMEM_stall = 10 - ctx.num_waves - k; +} + +void schedule_VMEM(sched_ctx& ctx, Block* block, + std::vector<RegisterDemand>& register_demand, + Instruction* current, int idx) +{ + assert(idx != 0); + int window_size = VMEM_WINDOW_SIZE; + int max_moves = VMEM_MAX_MOVES; + int16_t k = 0; + bool can_reorder_cur = can_reorder(current, false); + + /* create the initial set of values which current depends on */ + std::fill(ctx.depends_on.begin(), ctx.depends_on.end(), false); + for (const Operand& op : current->operands) { + if (op.isTemp()) + ctx.depends_on[op.tempId()] = true; + } + + /* maintain how many registers remain free when moving instructions */ + RegisterDemand register_pressure = register_demand[idx]; + + /* first, check if we have instructions before current to move down */ + int insert_idx = idx + 1; + int moving_interaction = barrier_none; + bool moving_spill = false; + + for (int candidate_idx = idx - 1; k < max_moves && candidate_idx > (int) idx - window_size; candidate_idx--) { + assert(candidate_idx >= 0); + aco_ptr<Instruction>& candidate = block->instructions[candidate_idx]; + + /* break when encountering another VMEM instruction, logical_start or barriers */ + if (!can_reorder(candidate.get(), true) && !can_reorder_cur) + break; + if (candidate->opcode == aco_opcode::p_logical_start) + break; + if (!can_move_instr(candidate, current, moving_interaction)) + break; + + /* break if we'd make the previous SMEM instruction stall */ + bool can_stall_prev_smem = idx <= ctx.last_SMEM_dep_idx && candidate_idx < ctx.last_SMEM_dep_idx; + if (can_stall_prev_smem && ctx.last_SMEM_stall >= 0) + break; + register_pressure.update(register_demand[candidate_idx]); + + /* if current depends on candidate, add additional dependencies and continue */ + bool can_move_down = true; + bool writes_exec = false; + for (const Definition& def : candidate->definitions) { + if (def.isTemp() && ctx.depends_on[def.tempId()]) + can_move_down = false; + if (def.isFixed() && def.physReg() == exec) + writes_exec = true; + } + if (writes_exec) + break; + + if (moving_spill && is_spill_reload(candidate)) + can_move_down = false; + if ((moving_interaction & barrier_shared) && candidate->format == Format::DS) + can_move_down = false; + moving_interaction |= get_barrier_interaction(candidate.get()); + moving_spill |= is_spill_reload(candidate); + if (!can_move_down) { + for (const Operand& op : candidate->operands) { + if (op.isTemp()) + ctx.depends_on[op.tempId()] = true; + } + continue; + } + + bool register_pressure_unknown = false; + /* check if one of candidate's operands is killed by depending instruction */ + for (const Operand& op : candidate->operands) { + if (op.isTemp() && ctx.depends_on[op.tempId()]) { + // FIXME: account for difference in register pressure + register_pressure_unknown = true; + } + } + if (register_pressure_unknown) { + for (const Operand& op : candidate->operands) { + if (op.isTemp()) + ctx.depends_on[op.tempId()] = true; + } + continue; + } + + /* check if register pressure is low enough: the diff is negative if register pressure is decreased */ + const RegisterDemand candidate_diff = getLiveChanges(candidate); + const RegisterDemand temp = getTempRegisters(candidate);; + if (RegisterDemand(register_pressure - candidate_diff).exceeds(ctx.max_registers)) + break; + const RegisterDemand temp2 = getTempRegisters(block->instructions[insert_idx - 1]); + 
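/* demand at the candidate's new slot: the demand previously at insert_idx - 1, minus that instruction's temporaries, plus the candidate's temporaries */ +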
const RegisterDemand new_demand = register_demand[insert_idx - 1] - temp2 + temp; + if (new_demand.exceeds(ctx.max_registers)) + break; + // TODO: we might want to look further to find a sequence of instructions to move down which doesn't exceed reg pressure + + /* move the candidate below the memory load */ + move_element(block->instructions, candidate_idx, insert_idx); + + /* update register pressure */ + move_element(register_demand, candidate_idx, insert_idx); + for (int i = candidate_idx; i < insert_idx - 1; i++) { + register_demand[i] -= candidate_diff; + } + register_demand[insert_idx - 1] = new_demand; + register_pressure -= candidate_diff; + insert_idx--; + k++; + if (candidate_idx < ctx.last_SMEM_dep_idx) + ctx.last_SMEM_stall++; + } + + /* create the initial set of values which depend on current */ + std::fill(ctx.depends_on.begin(), ctx.depends_on.end(), false); + std::fill(ctx.RAR_dependencies.begin(), ctx.RAR_dependencies.end(), false); + for (const Definition& def : current->definitions) { + if (def.isTemp()) + ctx.depends_on[def.tempId()] = true; + } + + /* find the first instruction depending on current or find another VMEM */ + insert_idx = idx; + moving_interaction = barrier_none; + moving_spill = false; + + bool found_dependency = false; + /* second, check if we have instructions after current to move up */ + for (int candidate_idx = idx + 1; k < max_moves && candidate_idx < (int) idx + window_size; candidate_idx++) { + assert(candidate_idx < (int) block->instructions.size()); + aco_ptr<Instruction>& candidate = block->instructions[candidate_idx]; + + if (candidate->opcode == aco_opcode::p_logical_end) + break; + if (!can_move_instr(candidate, current, moving_interaction)) + break; + + const bool writes_exec = std::any_of(candidate->definitions.begin(), candidate->definitions.end(), + [](const Definition& def) {return def.isFixed() && def.physReg() == exec; }); + if (writes_exec) + break; + + /* check if candidate depends on current */ + bool is_dependency = !can_reorder(candidate.get(), true) && !can_reorder_cur; + for (const Operand& op : candidate->operands) { + if (op.isTemp() && ctx.depends_on[op.tempId()]) { + is_dependency = true; + break; + } + } + if (moving_spill && is_spill_reload(candidate)) + is_dependency = true; + if ((moving_interaction & barrier_shared) && candidate->format == Format::DS) + is_dependency = true; + moving_interaction |= get_barrier_interaction(candidate.get()); + moving_spill |= is_spill_reload(candidate); + if (is_dependency) { + for (const Definition& def : candidate->definitions) { + if (def.isTemp()) + ctx.depends_on[def.tempId()] = true; + } + for (const Operand& op : candidate->operands) { + if (op.isTemp()) + ctx.RAR_dependencies[op.tempId()] = true; + } + if (!found_dependency) { + insert_idx = candidate_idx; + found_dependency = true; + /* init register pressure */ + register_pressure = register_demand[insert_idx - 1]; + continue; + } + } + + /* update register pressure */ + register_pressure.update(register_demand[candidate_idx - 1]); + + if (is_dependency || !found_dependency) + continue; + assert(insert_idx != idx); + + bool register_pressure_unknown = false; + /* check if candidate uses/kills an operand which is used by a dependency */ + for (const Operand& op : candidate->operands) { + if (op.isTemp() && ctx.RAR_dependencies[op.tempId()]) + register_pressure_unknown = true; + } + if (register_pressure_unknown) { + for (const Definition& def : candidate->definitions) { + if (def.isTemp()) + ctx.RAR_dependencies[def.tempId()] 
= true; + } + for (const Operand& op : candidate->operands) { + if (op.isTemp()) + ctx.RAR_dependencies[op.tempId()] = true; + } + continue; + } + + /* check if register pressure is low enough: the diff is negative if register pressure is decreased */ + const RegisterDemand candidate_diff = getLiveChanges(candidate); + const RegisterDemand temp = getTempRegisters(candidate); + if (RegisterDemand(register_pressure + candidate_diff).exceeds(ctx.max_registers)) + break; + const RegisterDemand temp2 = getTempRegisters(block->instructions[insert_idx - 1]); + const RegisterDemand new_demand = register_demand[insert_idx - 1] - temp2 + candidate_diff + temp; + if (new_demand.exceeds(ctx.max_registers)) + break; + + /* move the candidate above the insert_idx */ + move_element(block->instructions, candidate_idx, insert_idx); + + /* update register pressure */ + move_element(register_demand, candidate_idx, insert_idx); + for (int i = insert_idx + 1; i <= candidate_idx; i++) { + register_demand[i] += candidate_diff; + } + register_demand[insert_idx] = new_demand; + register_pressure += candidate_diff; + insert_idx++; + k++; + } +} + +void schedule_position_export(sched_ctx& ctx, Block* block, + std::vector<RegisterDemand>& register_demand, + Instruction* current, int idx) +{ + assert(idx != 0); + int window_size = POS_EXP_WINDOW_SIZE; + int max_moves = POS_EXP_MAX_MOVES; + int16_t k = 0; + + /* create the initial set of values which current depends on */ + std::fill(ctx.depends_on.begin(), ctx.depends_on.end(), false); + for (unsigned i = 0; i < current->operands.size(); i++) { + if (current->operands[i].isTemp()) + ctx.depends_on[current->operands[i].tempId()] = true; + } + + /* maintain how many registers remain free when moving instructions */ + RegisterDemand register_pressure = register_demand[idx]; + + /* first, check if we have instructions before current to move down */ + int insert_idx = idx + 1; + int moving_interaction = barrier_none; + bool moving_spill = false; + + for (int candidate_idx = idx - 1; k < max_moves && candidate_idx > (int) idx - window_size; candidate_idx--) { + assert(candidate_idx >= 0); + aco_ptr<Instruction>& candidate = block->instructions[candidate_idx]; + + /* break when encountering logical_start or barriers */ + if (candidate->opcode == aco_opcode::p_logical_start) + break; + if (candidate->isVMEM() || candidate->format == Format::SMEM) + break; + if (!can_move_instr(candidate, current, moving_interaction)) + break; + + register_pressure.update(register_demand[candidate_idx]); + + /* if current depends on candidate, add additional dependencies and continue */ + bool can_move_down = true; + bool writes_exec = false; + for (unsigned i = 0; i < candidate->definitions.size(); i++) { + if (candidate->definitions[i].isTemp() && ctx.depends_on[candidate->definitions[i].tempId()]) + can_move_down = false; + if (candidate->definitions[i].isFixed() && candidate->definitions[i].physReg() == exec) + writes_exec = true; + } + if (writes_exec) + break; + + if (moving_spill && is_spill_reload(candidate)) + can_move_down = false; + if ((moving_interaction & barrier_shared) && candidate->format == Format::DS) + can_move_down = false; + moving_interaction |= get_barrier_interaction(candidate.get()); + moving_spill |= is_spill_reload(candidate); + if (!can_move_down) { + for (unsigned i = 0; i < candidate->operands.size(); i++) { + if (candidate->operands[i].isTemp()) + ctx.depends_on[candidate->operands[i].tempId()] = true; + } + continue; + } + + bool register_pressure_unknown = 
false; + /* check if one of candidate's operands is killed by depending instruction */ + for (unsigned i = 0; i < candidate->operands.size(); i++) { + if (candidate->operands[i].isTemp() && ctx.depends_on[candidate->operands[i].tempId()]) { + // FIXME: account for difference in register pressure + register_pressure_unknown = true; + } + } + if (register_pressure_unknown) { + for (unsigned i = 0; i < candidate->operands.size(); i++) { + if (candidate->operands[i].isTemp()) + ctx.depends_on[candidate->operands[i].tempId()] = true; + } + continue; + } + + /* check if register pressure is low enough: the diff is negative if register pressure is decreased */ + const RegisterDemand candidate_diff = getLiveChanges(candidate); + const RegisterDemand temp = getTempRegisters(candidate);; + if (RegisterDemand(register_pressure - candidate_diff).exceeds(ctx.max_registers)) + break; + const RegisterDemand temp2 = getTempRegisters(block->instructions[insert_idx - 1]); + const RegisterDemand new_demand = register_demand[insert_idx - 1] - temp2 + temp; + if (new_demand.exceeds(ctx.max_registers)) + break; + // TODO: we might want to look further to find a sequence of instructions to move down which doesn't exceed reg pressure + + /* move the candidate below the export */ + move_element(block->instructions, candidate_idx, insert_idx); + + /* update register pressure */ + move_element(register_demand, candidate_idx, insert_idx); + for (int i = candidate_idx; i < insert_idx - 1; i++) { + register_demand[i] -= candidate_diff; + } + register_demand[insert_idx - 1] = new_demand; + register_pressure -= candidate_diff; + insert_idx--; + k++; + } +} + +void schedule_block(sched_ctx& ctx, Program *program, Block* block, live& live_vars) +{ + ctx.last_SMEM_dep_idx = 0; + ctx.last_SMEM_stall = INT16_MIN; + + /* go through all instructions and find memory loads */ + for (unsigned idx = 0; idx < block->instructions.size(); idx++) { + Instruction* current = block->instructions[idx].get(); + + if (current->definitions.empty()) + continue; + + if (current->isVMEM()) + schedule_VMEM(ctx, block, live_vars.register_demand[block->index], current, idx); + if (current->format == Format::SMEM) + schedule_SMEM(ctx, block, live_vars.register_demand[block->index], current, idx); + } + + if ((program->stage & hw_vs) && block->index == program->blocks.size() - 1) { + /* Try to move position exports as far up as possible, to reduce register + * usage and because ISA reference guides say so. */ + for (unsigned idx = 0; idx < block->instructions.size(); idx++) { + Instruction* current = block->instructions[idx].get(); + + if (current->format == Format::EXP) { + unsigned target = static_cast<Export_instruction*>(current)->dest; + if (target >= V_008DFC_SQ_EXP_POS && target < V_008DFC_SQ_EXP_PARAM) + schedule_position_export(ctx, block, live_vars.register_demand[block->index], current, idx); + } + } + } + + /* resummarize the block's register demand */ + block->register_demand = RegisterDemand(); + for (unsigned idx = 0; idx < block->instructions.size(); idx++) { + block->register_demand.update(live_vars.register_demand[block->index][idx]); + } +} + + +void schedule_program(Program *program, live& live_vars) +{ + sched_ctx ctx; + ctx.depends_on.resize(program->peekAllocationId()); + ctx.RAR_dependencies.resize(program->peekAllocationId()); + /* Allowing the scheduler to reduce the number of waves to as low as 5 + * improves performance of Thrones of Britannia significantly and doesn't + * seem to hurt anything else. 
*/ + //TODO: maybe use some sort of heuristic instead + //TODO: this also increases window-size/max-moves? did I realize that at the time? + ctx.num_waves = std::min<uint16_t>(program->num_waves, 5); + assert(ctx.num_waves); + uint16_t total_sgpr_regs = program->chip_class >= GFX8 ? 800 : 512; + uint16_t max_addressible_sgpr = program->sgpr_limit; + ctx.max_registers = { int16_t(((256 / ctx.num_waves) & ~3) - 2), std::min<int16_t>(((total_sgpr_regs / ctx.num_waves) & ~7) - 2, max_addressible_sgpr)}; + + for (Block& block : program->blocks) + schedule_block(ctx, program, &block, live_vars); + + /* update max_reg_demand and num_waves */ + RegisterDemand new_demand; + for (Block& block : program->blocks) { + new_demand.update(block.register_demand); + } + update_vgpr_sgpr_demand(program, new_demand); + + /* if enabled, this code asserts that register_demand is updated correctly */ + #if 0 + int prev_num_waves = program->num_waves; + const RegisterDemand prev_max_demand = program->max_reg_demand; + + std::vector<RegisterDemand> demands(program->blocks.size()); + for (unsigned j = 0; j < program->blocks.size(); j++) { + demands[j] = program->blocks[j].register_demand; + } + + struct radv_nir_compiler_options options; + options.chip_class = program->chip_class; + live live_vars2 = aco::live_var_analysis(program, &options); + + for (unsigned j = 0; j < program->blocks.size(); j++) { + Block &b = program->blocks[j]; + for (unsigned i = 0; i < b.instructions.size(); i++) + assert(live_vars.register_demand[b.index][i] == live_vars2.register_demand[b.index][i]); + assert(b.register_demand == demands[j]); + } + + assert(program->max_reg_demand == prev_max_demand); + assert(program->num_waves == prev_num_waves); + #endif +} + +} diff --git a/src/amd/compiler/aco_spill.cpp b/src/amd/compiler/aco_spill.cpp new file mode 100644 index 00000000000..92a23bb355c --- /dev/null +++ b/src/amd/compiler/aco_spill.cpp @@ -0,0 +1,1630 @@ +/* + * Copyright © 2018 Valve Corporation + * Copyright © 2018 Google + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + */ + +#include "aco_ir.h" +#include <map> +#include <stack> +#include "vulkan/radv_shader.h" + + +/* + * Implements the spilling algorithm on SSA-form from + * "Register Spilling and Live-Range Splitting for SSA-Form Programs" + * by Matthias Braun and Sebastian Hack. 
+ */ + +namespace aco { + +namespace { + +struct remat_info { + Instruction *instr; +}; + +struct spill_ctx { + RegisterDemand target_pressure; + Program* program; + std::vector<std::vector<RegisterDemand>> register_demand; + std::vector<std::map<Temp, Temp>> renames; + std::vector<std::map<Temp, uint32_t>> spills_entry; + std::vector<std::map<Temp, uint32_t>> spills_exit; + std::vector<bool> processed; + std::stack<Block*> loop_header; + std::vector<std::map<Temp, std::pair<uint32_t, uint32_t>>> next_use_distances_start; + std::vector<std::map<Temp, std::pair<uint32_t, uint32_t>>> next_use_distances_end; + std::vector<std::pair<RegClass, std::set<uint32_t>>> interferences; + std::vector<std::pair<uint32_t, uint32_t>> affinities; + std::vector<bool> is_reloaded; + std::map<Temp, remat_info> remat; + std::map<Instruction *, bool> remat_used; + + spill_ctx(const RegisterDemand target_pressure, Program* program, + std::vector<std::vector<RegisterDemand>> register_demand) + : target_pressure(target_pressure), program(program), + register_demand(register_demand), renames(program->blocks.size()), + spills_entry(program->blocks.size()), spills_exit(program->blocks.size()), + processed(program->blocks.size(), false) {} + + uint32_t allocate_spill_id(RegClass rc) + { + interferences.emplace_back(rc, std::set<uint32_t>()); + is_reloaded.push_back(false); + return next_spill_id++; + } + + uint32_t next_spill_id = 0; +}; + +int32_t get_dominator(int idx_a, int idx_b, Program* program, bool is_linear) +{ + + if (idx_a == -1) + return idx_b; + if (idx_b == -1) + return idx_a; + if (is_linear) { + while (idx_a != idx_b) { + if (idx_a > idx_b) + idx_a = program->blocks[idx_a].linear_idom; + else + idx_b = program->blocks[idx_b].linear_idom; + } + } else { + while (idx_a != idx_b) { + if (idx_a > idx_b) + idx_a = program->blocks[idx_a].logical_idom; + else + idx_b = program->blocks[idx_b].logical_idom; + } + } + assert(idx_a != -1); + return idx_a; +} + +void next_uses_per_block(spill_ctx& ctx, unsigned block_idx, std::set<uint32_t>& worklist) +{ + Block* block = &ctx.program->blocks[block_idx]; + std::map<Temp, std::pair<uint32_t, uint32_t>> next_uses = ctx.next_use_distances_end[block_idx]; + + /* to compute the next use distance at the beginning of the block, we have to add the block's size */ + for (std::map<Temp, std::pair<uint32_t, uint32_t>>::iterator it = next_uses.begin(); it != next_uses.end();) { + it->second.second = it->second.second + block->instructions.size(); + + /* remove the live out exec mask as we really don't want to spill it */ + if (it->first == block->live_out_exec) + it = next_uses.erase(it); + else + ++it; + } + + int idx = block->instructions.size() - 1; + while (idx >= 0) { + aco_ptr<Instruction>& instr = block->instructions[idx]; + + if (instr->opcode == aco_opcode::p_linear_phi || + instr->opcode == aco_opcode::p_phi) + break; + + for (const Definition& def : instr->definitions) { + if (def.isTemp()) + next_uses.erase(def.getTemp()); + } + + for (const Operand& op : instr->operands) { + /* omit exec mask */ + if (op.isFixed() && op.physReg() == exec) + continue; + if (op.isTemp()) + next_uses[op.getTemp()] = {block_idx, idx}; + } + idx--; + } + + assert(block_idx != 0 || next_uses.empty()); + ctx.next_use_distances_start[block_idx] = next_uses; + while (idx >= 0) { + aco_ptr<Instruction>& instr = block->instructions[idx]; + assert(instr->opcode == aco_opcode::p_linear_phi || instr->opcode == aco_opcode::p_phi); + + for (unsigned i = 0; i < instr->operands.size(); i++) { + 
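The backward walk in next_uses_per_block() above is the core of the Braun/Hack heuristic: scanning a block bottom-up, a definition drops its temp from the map and a use (re)inserts it with the current index, so the entries that survive to the top give every live-in temp its next-use distance. A minimal standalone sketch of that scan, with plain strings standing in for ACO's Temp and the (block, distance) pair reduced to a single index:
```
#include <cstdio>
#include <map>
#include <string>
#include <vector>

/* Illustrative stand-ins: ACO tracks Temp -> (block, distance) pairs. */
struct Ins { std::vector<std::string> defs, ops; };

std::map<std::string, int> next_use_at_block_start(const std::vector<Ins>& block)
{
    std::map<std::string, int> next_use;            /* temp -> index of its next use */
    for (int idx = (int)block.size() - 1; idx >= 0; idx--) {
        for (const std::string& d : block[idx].defs)
            next_use.erase(d);                      /* defined here: not live above this point */
        for (const std::string& o : block[idx].ops)
            next_use[o] = idx;                      /* used here: closest use seen so far */
    }
    return next_use;
}

int main()
{
    std::vector<Ins> block = {
        {{"t2"}, {"t0", "t1"}},   /* 0: t2 = op t0, t1 */
        {{"t3"}, {"t2"}},         /* 1: t3 = op t2     */
        {{},     {"t0", "t3"}},   /* 2: use t0, t3     */
    };
    for (const auto& p : next_use_at_block_start(block))
        printf("%s -> %d\n", p.first.c_str(), p.second);
    /* prints: t0 -> 0, t1 -> 0 (t2 and t3 are defined inside the block) */
    return 0;
}
```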
unsigned pred_idx = instr->opcode == aco_opcode::p_phi ? + block->logical_preds[i] : + block->linear_preds[i]; + if (instr->operands[i].isTemp()) { + if (ctx.next_use_distances_end[pred_idx].find(instr->operands[i].getTemp()) == ctx.next_use_distances_end[pred_idx].end() || + ctx.next_use_distances_end[pred_idx][instr->operands[i].getTemp()] != std::pair<uint32_t, uint32_t>{block_idx, 0}) + worklist.insert(pred_idx); + ctx.next_use_distances_end[pred_idx][instr->operands[i].getTemp()] = {block_idx, 0}; + } + } + next_uses.erase(instr->definitions[0].getTemp()); + idx--; + } + + /* all remaining live vars must be live-out at the predecessors */ + for (std::pair<Temp, std::pair<uint32_t, uint32_t>> pair : next_uses) { + Temp temp = pair.first; + uint32_t distance = pair.second.second; + uint32_t dom = pair.second.first; + std::vector<unsigned>& preds = temp.is_linear() ? block->linear_preds : block->logical_preds; + for (unsigned pred_idx : preds) { + if (ctx.program->blocks[pred_idx].loop_nest_depth > block->loop_nest_depth) + distance += 0xFFFF; + if (ctx.next_use_distances_end[pred_idx].find(temp) != ctx.next_use_distances_end[pred_idx].end()) { + dom = get_dominator(dom, ctx.next_use_distances_end[pred_idx][temp].first, ctx.program, temp.is_linear()); + distance = std::min(ctx.next_use_distances_end[pred_idx][temp].second, distance); + } + if (ctx.next_use_distances_end[pred_idx][temp] != std::pair<uint32_t, uint32_t>{dom, distance}) + worklist.insert(pred_idx); + ctx.next_use_distances_end[pred_idx][temp] = {dom, distance}; + } + } + +} + +void compute_global_next_uses(spill_ctx& ctx, std::vector<std::set<Temp>>& live_out) +{ + ctx.next_use_distances_start.resize(ctx.program->blocks.size()); + ctx.next_use_distances_end.resize(ctx.program->blocks.size()); + std::set<uint32_t> worklist; + for (Block& block : ctx.program->blocks) + worklist.insert(block.index); + + while (!worklist.empty()) { + std::set<unsigned>::reverse_iterator b_it = worklist.rbegin(); + unsigned block_idx = *b_it; + worklist.erase(block_idx); + next_uses_per_block(ctx, block_idx, worklist); + } +} + +bool should_rematerialize(aco_ptr<Instruction>& instr) +{ + /* TODO: rematerialization is only supported for VOP1, SOP1 and PSEUDO */ + if (instr->format != Format::VOP1 && instr->format != Format::SOP1 && instr->format != Format::PSEUDO) + return false; + /* TODO: pseudo-instruction rematerialization is only supported for p_create_vector */ + if (instr->format == Format::PSEUDO && instr->opcode != aco_opcode::p_create_vector) + return false; + + for (const Operand& op : instr->operands) { + /* TODO: rematerialization using temporaries isn't yet supported */ + if (op.isTemp()) + return false; + } + + /* TODO: rematerialization with multiple definitions isn't yet supported */ + if (instr->definitions.size() > 1) + return false; + + return true; +} + +aco_ptr<Instruction> do_reload(spill_ctx& ctx, Temp tmp, Temp new_name, uint32_t spill_id) +{ + std::map<Temp, remat_info>::iterator remat = ctx.remat.find(tmp); + if (remat != ctx.remat.end()) { + Instruction *instr = remat->second.instr; + assert((instr->format == Format::VOP1 || instr->format == Format::SOP1 || instr->format == Format::PSEUDO) && "unsupported"); + assert((instr->format != Format::PSEUDO || instr->opcode == aco_opcode::p_create_vector) && "unsupported"); + assert(instr->definitions.size() == 1 && "unsupported"); + + aco_ptr<Instruction> res; + if (instr->format == Format::VOP1) { + res.reset(create_instruction<VOP1_instruction>(instr->opcode, instr->format, 
instr->operands.size(), instr->definitions.size())); + } else if (instr->format == Format::SOP1) { + res.reset(create_instruction<SOP1_instruction>(instr->opcode, instr->format, instr->operands.size(), instr->definitions.size())); + } else if (instr->format == Format::PSEUDO) { + res.reset(create_instruction<Instruction>(instr->opcode, instr->format, instr->operands.size(), instr->definitions.size())); + } + for (unsigned i = 0; i < instr->operands.size(); i++) { + res->operands[i] = instr->operands[i]; + if (instr->operands[i].isTemp()) { + assert(false && "unsupported"); + if (ctx.remat.count(instr->operands[i].getTemp())) + ctx.remat_used[ctx.remat[instr->operands[i].getTemp()].instr] = true; + } + } + res->definitions[0] = Definition(new_name); + return res; + } else { + aco_ptr<Pseudo_instruction> reload{create_instruction<Pseudo_instruction>(aco_opcode::p_reload, Format::PSEUDO, 1, 1)}; + reload->operands[0] = Operand(spill_id); + reload->definitions[0] = Definition(new_name); + ctx.is_reloaded[spill_id] = true; + return reload; + } +} + +void get_rematerialize_info(spill_ctx& ctx) +{ + for (Block& block : ctx.program->blocks) { + bool logical = false; + for (aco_ptr<Instruction>& instr : block.instructions) { + if (instr->opcode == aco_opcode::p_logical_start) + logical = true; + else if (instr->opcode == aco_opcode::p_logical_end) + logical = false; + if (logical && should_rematerialize(instr)) { + for (const Definition& def : instr->definitions) { + if (def.isTemp()) { + ctx.remat[def.getTemp()] = (remat_info){instr.get()}; + ctx.remat_used[instr.get()] = false; + } + } + } + } + } +} + +std::vector<std::map<Temp, uint32_t>> local_next_uses(spill_ctx& ctx, Block* block) +{ + std::vector<std::map<Temp, uint32_t>> local_next_uses(block->instructions.size()); + + std::map<Temp, uint32_t> next_uses; + for (std::pair<Temp, std::pair<uint32_t, uint32_t>> pair : ctx.next_use_distances_end[block->index]) { + /* omit live out exec mask */ + if (pair.first == block->live_out_exec) + continue; + + next_uses[pair.first] = pair.second.second + block->instructions.size(); + } + + for (int idx = block->instructions.size() - 1; idx >= 0; idx--) { + aco_ptr<Instruction>& instr = block->instructions[idx]; + if (!instr) + break; + if (instr->opcode == aco_opcode::p_phi || instr->opcode == aco_opcode::p_linear_phi) + break; + + for (const Operand& op : instr->operands) { + if (op.isFixed() && op.physReg() == exec) + continue; + if (op.isTemp()) + next_uses[op.getTemp()] = idx; + } + for (const Definition& def : instr->definitions) { + if (def.isTemp()) + next_uses.erase(def.getTemp()); + } + local_next_uses[idx] = next_uses; + } + return local_next_uses; +} + + +RegisterDemand init_live_in_vars(spill_ctx& ctx, Block* block, unsigned block_idx) +{ + RegisterDemand spilled_registers; + + /* first block, nothing was spilled before */ + if (block_idx == 0) + return {0, 0}; + + /* loop header block */ + if (block->loop_nest_depth > ctx.program->blocks[block_idx - 1].loop_nest_depth) { + assert(block->linear_preds[0] == block_idx - 1); + assert(block->logical_preds[0] == block_idx - 1); + + /* create new loop_info */ + ctx.loop_header.emplace(block); + + /* check how many live-through variables should be spilled */ + RegisterDemand new_demand; + unsigned i = block_idx; + while (ctx.program->blocks[i].loop_nest_depth >= block->loop_nest_depth) { + assert(ctx.program->blocks.size() > i); + new_demand.update(ctx.program->blocks[i].register_demand); + i++; + } + unsigned loop_end = i; + + /* select 
live-through vgpr variables */ + while (new_demand.vgpr - spilled_registers.vgpr > ctx.target_pressure.vgpr) { + unsigned distance = 0; + Temp to_spill; + for (std::pair<Temp, std::pair<uint32_t, uint32_t>> pair : ctx.next_use_distances_end[block_idx - 1]) { + if (pair.first.type() == RegType::vgpr && + pair.second.first >= loop_end && + pair.second.second > distance && + ctx.spills_entry[block_idx].find(pair.first) == ctx.spills_entry[block_idx].end()) { + to_spill = pair.first; + distance = pair.second.second; + } + } + if (distance == 0) + break; + + uint32_t spill_id; + if (ctx.spills_exit[block_idx - 1].find(to_spill) == ctx.spills_exit[block_idx - 1].end()) { + spill_id = ctx.allocate_spill_id(to_spill.regClass()); + } else { + spill_id = ctx.spills_exit[block_idx - 1][to_spill]; + } + + ctx.spills_entry[block_idx][to_spill] = spill_id; + spilled_registers.vgpr += to_spill.size(); + } + + /* select live-through sgpr variables */ + while (new_demand.sgpr - spilled_registers.sgpr > ctx.target_pressure.sgpr) { + unsigned distance = 0; + Temp to_spill; + for (std::pair<Temp, std::pair<uint32_t, uint32_t>> pair : ctx.next_use_distances_end[block_idx - 1]) { + if (pair.first.type() == RegType::sgpr && + pair.second.first >= loop_end && + pair.second.second > distance && + ctx.spills_entry[block_idx].find(pair.first) == ctx.spills_entry[block_idx].end()) { + to_spill = pair.first; + distance = pair.second.second; + } + } + if (distance == 0) + break; + + uint32_t spill_id; + if (ctx.spills_exit[block_idx - 1].find(to_spill) == ctx.spills_exit[block_idx - 1].end()) { + spill_id = ctx.allocate_spill_id(to_spill.regClass()); + } else { + spill_id = ctx.spills_exit[block_idx - 1][to_spill]; + } + + ctx.spills_entry[block_idx][to_spill] = spill_id; + spilled_registers.sgpr += to_spill.size(); + } + + + + /* shortcut */ + if (!RegisterDemand(new_demand - spilled_registers).exceeds(ctx.target_pressure)) + return spilled_registers; + + /* if reg pressure is too high at beginning of loop, add variables with furthest use */ + unsigned idx = 0; + while (block->instructions[idx]->opcode == aco_opcode::p_phi || block->instructions[idx]->opcode == aco_opcode::p_linear_phi) + idx++; + + assert(idx != 0 && "loop without phis: TODO"); + idx--; + RegisterDemand reg_pressure = ctx.register_demand[block_idx][idx] - spilled_registers; + while (reg_pressure.sgpr > ctx.target_pressure.sgpr) { + unsigned distance = 0; + Temp to_spill; + for (std::pair<Temp, std::pair<uint32_t, uint32_t>> pair : ctx.next_use_distances_start[block_idx]) { + if (pair.first.type() == RegType::sgpr && + pair.second.second > distance && + ctx.spills_entry[block_idx].find(pair.first) == ctx.spills_entry[block_idx].end()) { + to_spill = pair.first; + distance = pair.second.second; + } + } + assert(distance != 0); + + ctx.spills_entry[block_idx][to_spill] = ctx.allocate_spill_id(to_spill.regClass()); + spilled_registers.sgpr += to_spill.size(); + reg_pressure.sgpr -= to_spill.size(); + } + while (reg_pressure.vgpr > ctx.target_pressure.vgpr) { + unsigned distance = 0; + Temp to_spill; + for (std::pair<Temp, std::pair<uint32_t, uint32_t>> pair : ctx.next_use_distances_start[block_idx]) { + if (pair.first.type() == RegType::vgpr && + pair.second.second > distance && + ctx.spills_entry[block_idx].find(pair.first) == ctx.spills_entry[block_idx].end()) { + to_spill = pair.first; + distance = pair.second.second; + } + } + assert(distance != 0); + ctx.spills_entry[block_idx][to_spill] = ctx.allocate_spill_id(to_spill.regClass()); + 
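Both selection loops above implement the same Belady-style rule: among the live-through variables of the right register type that are not already spilled, spill the one whose next use is furthest away, and give up once no candidate is found (distance stays 0). A self-contained sketch of that choice, with strings and plain distances in place of Temp and the (block, distance) pairs:
```
#include <cstdio>
#include <map>
#include <set>
#include <string>

/* Belady-style choice: spill whatever is needed again latest. */
std::string pick_spill_candidate(const std::map<std::string, unsigned>& next_use_distance,
                                 const std::set<std::string>& already_spilled)
{
    std::string to_spill;
    unsigned furthest = 0;
    for (const auto& p : next_use_distance) {
        if (already_spilled.count(p.first))
            continue;
        if (p.second > furthest) {
            furthest = p.second;
            to_spill = p.first;
        }
    }
    return to_spill;   /* empty if nothing qualifies, i.e. the distance stayed 0 */
}

int main()
{
    std::map<std::string, unsigned> dist = {{"a", 3}, {"b", 120}, {"c", 45}};
    std::set<std::string> spilled = {"b"};
    printf("spill %s\n", pick_spill_candidate(dist, spilled).c_str());   /* spill c */
    return 0;
}
```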
spilled_registers.vgpr += to_spill.size(); + reg_pressure.vgpr -= to_spill.size(); + } + + return spilled_registers; + } + + /* branch block */ + if (block->linear_preds.size() == 1) { + /* keep variables spilled if they are alive and not used in the current block */ + unsigned pred_idx = block->linear_preds[0]; + for (std::pair<Temp, uint32_t> pair : ctx.spills_exit[pred_idx]) { + if (pair.first.type() == RegType::sgpr && + ctx.next_use_distances_start[block_idx].find(pair.first) != ctx.next_use_distances_start[block_idx].end() && + ctx.next_use_distances_start[block_idx][pair.first].second > block_idx) { + ctx.spills_entry[block_idx].insert(pair); + spilled_registers.sgpr += pair.first.size(); + } + } + if (block->logical_preds.size() == 1) { + pred_idx = block->logical_preds[0]; + for (std::pair<Temp, uint32_t> pair : ctx.spills_exit[pred_idx]) { + if (pair.first.type() == RegType::vgpr && + ctx.next_use_distances_start[block_idx].find(pair.first) != ctx.next_use_distances_start[block_idx].end() && + ctx.next_use_distances_end[pred_idx][pair.first].second > block_idx) { + ctx.spills_entry[block_idx].insert(pair); + spilled_registers.vgpr += pair.first.size(); + } + } + } + + /* if register demand is still too high, we just keep all spilled live vars and process the block */ + if (block->register_demand.sgpr - spilled_registers.sgpr > ctx.target_pressure.sgpr) { + pred_idx = block->linear_preds[0]; + for (std::pair<Temp, uint32_t> pair : ctx.spills_exit[pred_idx]) { + if (pair.first.type() == RegType::sgpr && + ctx.next_use_distances_start[block_idx].find(pair.first) != ctx.next_use_distances_start[block_idx].end() && + ctx.spills_entry[block_idx].insert(pair).second) { + spilled_registers.sgpr += pair.first.size(); + } + } + } + if (block->register_demand.vgpr - spilled_registers.vgpr > ctx.target_pressure.vgpr && block->logical_preds.size() == 1) { + pred_idx = block->logical_preds[0]; + for (std::pair<Temp, uint32_t> pair : ctx.spills_exit[pred_idx]) { + if (pair.first.type() == RegType::vgpr && + ctx.next_use_distances_start[block_idx].find(pair.first) != ctx.next_use_distances_start[block_idx].end() && + ctx.spills_entry[block_idx].insert(pair).second) { + spilled_registers.vgpr += pair.first.size(); + } + } + } + + return spilled_registers; + } + + /* else: merge block */ + std::set<Temp> partial_spills; + + /* keep variables spilled on all incoming paths */ + for (std::pair<Temp, std::pair<uint32_t, uint32_t>> pair : ctx.next_use_distances_start[block_idx]) { + std::vector<unsigned>& preds = pair.first.type() == RegType::vgpr ? block->logical_preds : block->linear_preds; + /* If it can be rematerialized, keep the variable spilled if all predecessors do not reload it. + * Otherwise, if any predecessor reloads it, ensure it's reloaded on all other predecessors. + * The idea is that it's better in practice to rematerialize redundantly than to create lots of phis. 
*/ + /* TODO: test this idea with more than Dawn of War III shaders (the current pipeline-db doesn't seem to exercise this path much) */ + bool remat = ctx.remat.count(pair.first); + bool spill = !remat; + uint32_t spill_id = 0; + for (unsigned pred_idx : preds) { + /* variable is not even live at the predecessor: probably from a phi */ + if (ctx.next_use_distances_end[pred_idx].find(pair.first) == ctx.next_use_distances_end[pred_idx].end()) { + spill = false; + break; + } + if (ctx.spills_exit[pred_idx].find(pair.first) == ctx.spills_exit[pred_idx].end()) { + if (!remat) + spill = false; + } else { + partial_spills.insert(pair.first); + /* it might be that on one incoming path, the variable has a different spill_id, but add_couple_code() will take care of that. */ + spill_id = ctx.spills_exit[pred_idx][pair.first]; + if (remat) + spill = true; + } + } + if (spill) { + ctx.spills_entry[block_idx][pair.first] = spill_id; + partial_spills.erase(pair.first); + spilled_registers += pair.first; + } + } + + /* same for phis */ + unsigned idx = 0; + while (block->instructions[idx]->opcode == aco_opcode::p_linear_phi || + block->instructions[idx]->opcode == aco_opcode::p_phi) { + aco_ptr<Instruction>& phi = block->instructions[idx]; + std::vector<unsigned>& preds = phi->opcode == aco_opcode::p_phi ? block->logical_preds : block->linear_preds; + bool spill = true; + + for (unsigned i = 0; i < phi->operands.size(); i++) { + if (!phi->operands[i].isTemp()) + spill = false; + else if (ctx.spills_exit[preds[i]].find(phi->operands[i].getTemp()) == ctx.spills_exit[preds[i]].end()) + spill = false; + else + partial_spills.insert(phi->definitions[0].getTemp()); + } + if (spill) { + ctx.spills_entry[block_idx][phi->definitions[0].getTemp()] = ctx.allocate_spill_id(phi->definitions[0].regClass()); + partial_spills.erase(phi->definitions[0].getTemp()); + spilled_registers += phi->definitions[0].getTemp(); + } + + idx++; + } + + /* if reg pressure at first instruction is still too high, add partially spilled variables */ + RegisterDemand reg_pressure; + if (idx == 0) { + for (const Definition& def : block->instructions[idx]->definitions) { + if (def.isTemp()) { + reg_pressure -= def.getTemp(); + } + } + for (const Operand& op : block->instructions[idx]->operands) { + if (op.isTemp() && op.isFirstKill()) { + reg_pressure += op.getTemp(); + } + } + } else { + idx--; + } + reg_pressure += ctx.register_demand[block_idx][idx] - spilled_registers; + + while (reg_pressure.sgpr > ctx.target_pressure.sgpr) { + assert(!partial_spills.empty()); + + std::set<Temp>::iterator it = partial_spills.begin(); + Temp to_spill = *it; + unsigned distance = ctx.next_use_distances_start[block_idx][*it].second; + while (it != partial_spills.end()) { + assert(ctx.spills_entry[block_idx].find(*it) == ctx.spills_entry[block_idx].end()); + + if (it->type() == RegType::sgpr && ctx.next_use_distances_start[block_idx][*it].second > distance) { + distance = ctx.next_use_distances_start[block_idx][*it].second; + to_spill = *it; + } + ++it; + } + assert(distance != 0); + + ctx.spills_entry[block_idx][to_spill] = ctx.allocate_spill_id(to_spill.regClass()); + partial_spills.erase(to_spill); + spilled_registers.sgpr += to_spill.size(); + reg_pressure.sgpr -= to_spill.size(); + } + + while (reg_pressure.vgpr > ctx.target_pressure.vgpr) { + assert(!partial_spills.empty()); + + std::set<Temp>::iterator it = partial_spills.begin(); + Temp to_spill = *it; + unsigned distance = ctx.next_use_distances_start[block_idx][*it].second; + while (it != 
partial_spills.end()) { + assert(ctx.spills_entry[block_idx].find(*it) == ctx.spills_entry[block_idx].end()); + + if (it->type() == RegType::vgpr && ctx.next_use_distances_start[block_idx][*it].second > distance) { + distance = ctx.next_use_distances_start[block_idx][*it].second; + to_spill = *it; + } + ++it; + } + assert(distance != 0); + + ctx.spills_entry[block_idx][to_spill] = ctx.allocate_spill_id(to_spill.regClass()); + partial_spills.erase(to_spill); + spilled_registers.vgpr += to_spill.size(); + reg_pressure.vgpr -= to_spill.size(); + } + + return spilled_registers; +} + + +void add_coupling_code(spill_ctx& ctx, Block* block, unsigned block_idx) +{ + /* no coupling code necessary */ + if (block->linear_preds.size() == 0) + return; + + std::vector<aco_ptr<Instruction>> instructions; + /* branch block: TODO take other branch into consideration */ + if (block->linear_preds.size() == 1) { + assert(ctx.processed[block->linear_preds[0]]); + + if (block->logical_preds.size() == 1) { + unsigned pred_idx = block->logical_preds[0]; + for (std::pair<Temp, std::pair<uint32_t, uint32_t>> live : ctx.next_use_distances_start[block_idx]) { + if (live.first.type() == RegType::sgpr) + continue; + /* still spilled */ + if (ctx.spills_entry[block_idx].find(live.first) != ctx.spills_entry[block_idx].end()) + continue; + + /* in register at end of predecessor */ + if (ctx.spills_exit[pred_idx].find(live.first) == ctx.spills_exit[pred_idx].end()) { + std::map<Temp, Temp>::iterator it = ctx.renames[pred_idx].find(live.first); + if (it != ctx.renames[pred_idx].end()) + ctx.renames[block_idx].insert(*it); + continue; + } + + /* variable is spilled at predecessor and live at current block: create reload instruction */ + Temp new_name = {ctx.program->allocateId(), live.first.regClass()}; + aco_ptr<Instruction> reload = do_reload(ctx, live.first, new_name, ctx.spills_exit[pred_idx][live.first]); + instructions.emplace_back(std::move(reload)); + ctx.renames[block_idx][live.first] = new_name; + } + } + + unsigned pred_idx = block->linear_preds[0]; + for (std::pair<Temp, std::pair<uint32_t, uint32_t>> live : ctx.next_use_distances_start[block_idx]) { + if (live.first.type() == RegType::vgpr) + continue; + /* still spilled */ + if (ctx.spills_entry[block_idx].find(live.first) != ctx.spills_entry[block_idx].end()) + continue; + + /* in register at end of predecessor */ + if (ctx.spills_exit[pred_idx].find(live.first) == ctx.spills_exit[pred_idx].end()) { + std::map<Temp, Temp>::iterator it = ctx.renames[pred_idx].find(live.first); + if (it != ctx.renames[pred_idx].end()) + ctx.renames[block_idx].insert(*it); + continue; + } + + /* variable is spilled at predecessor and live at current block: create reload instruction */ + Temp new_name = {ctx.program->allocateId(), live.first.regClass()}; + aco_ptr<Instruction> reload = do_reload(ctx, live.first, new_name, ctx.spills_exit[pred_idx][live.first]); + instructions.emplace_back(std::move(reload)); + ctx.renames[block_idx][live.first] = new_name; + } + + /* combine new reload instructions with original block */ + if (!instructions.empty()) { + unsigned insert_idx = 0; + while (block->instructions[insert_idx]->opcode == aco_opcode::p_phi || + block->instructions[insert_idx]->opcode == aco_opcode::p_linear_phi) { + insert_idx++; + } + ctx.register_demand[block->index].insert(std::next(ctx.register_demand[block->index].begin(), insert_idx), + instructions.size(), RegisterDemand()); + block->instructions.insert(std::next(block->instructions.begin(), insert_idx), + 
std::move_iterator<std::vector<aco_ptr<Instruction>>::iterator>(instructions.begin()), + std::move_iterator<std::vector<aco_ptr<Instruction>>::iterator>(instructions.end())); + } + return; + } + + /* loop header and merge blocks: check if all (linear) predecessors have been processed */ + for (ASSERTED unsigned pred : block->linear_preds) + assert(ctx.processed[pred]); + + /* iterate the phi nodes for which operands to spill at the predecessor */ + for (aco_ptr<Instruction>& phi : block->instructions) { + if (phi->opcode != aco_opcode::p_phi && + phi->opcode != aco_opcode::p_linear_phi) + break; + + /* if the phi is not spilled, add to instructions */ + if (ctx.spills_entry[block_idx].find(phi->definitions[0].getTemp()) == ctx.spills_entry[block_idx].end()) { + instructions.emplace_back(std::move(phi)); + continue; + } + + std::vector<unsigned>& preds = phi->opcode == aco_opcode::p_phi ? block->logical_preds : block->linear_preds; + uint32_t def_spill_id = ctx.spills_entry[block_idx][phi->definitions[0].getTemp()]; + + for (unsigned i = 0; i < phi->operands.size(); i++) { + unsigned pred_idx = preds[i]; + + /* we have to spill constants to the same memory address */ + if (phi->operands[i].isConstant()) { + uint32_t spill_id = ctx.allocate_spill_id(phi->definitions[0].regClass()); + for (std::pair<Temp, uint32_t> pair : ctx.spills_exit[pred_idx]) { + ctx.interferences[def_spill_id].second.emplace(pair.second); + ctx.interferences[pair.second].second.emplace(def_spill_id); + } + ctx.affinities.emplace_back(std::pair<uint32_t, uint32_t>{def_spill_id, spill_id}); + + aco_ptr<Pseudo_instruction> spill{create_instruction<Pseudo_instruction>(aco_opcode::p_spill, Format::PSEUDO, 2, 0)}; + spill->operands[0] = phi->operands[i]; + spill->operands[1] = Operand(spill_id); + Block& pred = ctx.program->blocks[pred_idx]; + unsigned idx = pred.instructions.size(); + do { + assert(idx != 0); + idx--; + } while (phi->opcode == aco_opcode::p_phi && pred.instructions[idx]->opcode != aco_opcode::p_logical_end); + std::vector<aco_ptr<Instruction>>::iterator it = std::next(pred.instructions.begin(), idx); + pred.instructions.insert(it, std::move(spill)); + continue; + } + if (!phi->operands[i].isTemp()) + continue; + + /* build interferences between the phi def and all spilled variables at the predecessor blocks */ + for (std::pair<Temp, uint32_t> pair : ctx.spills_exit[pred_idx]) { + if (phi->operands[i].getTemp() == pair.first) + continue; + ctx.interferences[def_spill_id].second.emplace(pair.second); + ctx.interferences[pair.second].second.emplace(def_spill_id); + } + + /* variable is already spilled at predecessor */ + std::map<Temp, uint32_t>::iterator spilled = ctx.spills_exit[pred_idx].find(phi->operands[i].getTemp()); + if (spilled != ctx.spills_exit[pred_idx].end()) { + if (spilled->second != def_spill_id) + ctx.affinities.emplace_back(std::pair<uint32_t, uint32_t>{def_spill_id, spilled->second}); + continue; + } + + /* rename if necessary */ + Temp var = phi->operands[i].getTemp(); + std::map<Temp, Temp>::iterator rename_it = ctx.renames[pred_idx].find(var); + if (rename_it != ctx.renames[pred_idx].end()) { + var = rename_it->second; + ctx.renames[pred_idx].erase(rename_it); + } + + uint32_t spill_id = ctx.allocate_spill_id(phi->definitions[0].regClass()); + ctx.affinities.emplace_back(std::pair<uint32_t, uint32_t>{def_spill_id, spill_id}); + aco_ptr<Pseudo_instruction> spill{create_instruction<Pseudo_instruction>(aco_opcode::p_spill, Format::PSEUDO, 2, 0)}; + spill->operands[0] = Operand(var); + 
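Spill ids that sit in memory at the same time may not share a spill slot, so every interference is recorded in both adjacency sets, exactly like the paired emplace() calls above. A minimal adjacency-set sketch of that bookkeeping, with spill ids reduced to plain indices (the struct name is illustrative, not ACO API):
```
#include <cstdint>
#include <cstdio>
#include <set>
#include <vector>

struct InterferenceGraph {
    std::vector<std::set<uint32_t>> adj;

    uint32_t add_spill_id() { adj.emplace_back(); return uint32_t(adj.size() - 1); }

    void add_interference(uint32_t a, uint32_t b) {
        if (a == b)
            return;
        adj[a].insert(b);   /* symmetric insertion */
        adj[b].insert(a);
    }
};

int main()
{
    InterferenceGraph g;
    uint32_t s0 = g.add_spill_id(), s1 = g.add_spill_id(), s2 = g.add_spill_id();
    /* s2 gets spilled while s0 and s1 are still sitting in memory */
    g.add_interference(s2, s0);
    g.add_interference(s2, s1);
    printf("s2 interferes with %zu spill ids\n", g.adj[s2].size());   /* 2 */
    return 0;
}
```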
spill->operands[1] = Operand(spill_id); + Block& pred = ctx.program->blocks[pred_idx]; + unsigned idx = pred.instructions.size(); + do { + assert(idx != 0); + idx--; + } while (phi->opcode == aco_opcode::p_phi && pred.instructions[idx]->opcode != aco_opcode::p_logical_end); + std::vector<aco_ptr<Instruction>>::iterator it = std::next(pred.instructions.begin(), idx); + pred.instructions.insert(it, std::move(spill)); + ctx.spills_exit[pred_idx][phi->operands[i].getTemp()] = spill_id; + } + + /* remove phi from instructions */ + phi.reset(); + } + + /* iterate all (other) spilled variables for which to spill at the predecessor */ + // TODO: would be better to have them sorted: first vgprs and first with longest distance + for (std::pair<Temp, uint32_t> pair : ctx.spills_entry[block_idx]) { + std::vector<unsigned> preds = pair.first.type() == RegType::vgpr ? block->logical_preds : block->linear_preds; + + for (unsigned pred_idx : preds) { + /* add interferences between spilled variable and predecessors exit spills */ + for (std::pair<Temp, uint32_t> exit_spill : ctx.spills_exit[pred_idx]) { + if (exit_spill.first == pair.first) + continue; + ctx.interferences[exit_spill.second].second.emplace(pair.second); + ctx.interferences[pair.second].second.emplace(exit_spill.second); + } + + /* variable is already spilled at predecessor */ + std::map<Temp, uint32_t>::iterator spilled = ctx.spills_exit[pred_idx].find(pair.first); + if (spilled != ctx.spills_exit[pred_idx].end()) { + if (spilled->second != pair.second) + ctx.affinities.emplace_back(std::pair<uint32_t, uint32_t>{pair.second, spilled->second}); + continue; + } + + /* variable is dead at predecessor, it must be from a phi: this works because of CSSA form */ // FIXME: lower_to_cssa() + if (ctx.next_use_distances_end[pred_idx].find(pair.first) == ctx.next_use_distances_end[pred_idx].end()) + continue; + + /* variable is in register at predecessor and has to be spilled */ + /* rename if necessary */ + Temp var = pair.first; + std::map<Temp, Temp>::iterator rename_it = ctx.renames[pred_idx].find(var); + if (rename_it != ctx.renames[pred_idx].end()) { + var = rename_it->second; + ctx.renames[pred_idx].erase(rename_it); + } + + aco_ptr<Pseudo_instruction> spill{create_instruction<Pseudo_instruction>(aco_opcode::p_spill, Format::PSEUDO, 2, 0)}; + spill->operands[0] = Operand(var); + spill->operands[1] = Operand(pair.second); + Block& pred = ctx.program->blocks[pred_idx]; + unsigned idx = pred.instructions.size(); + do { + assert(idx != 0); + idx--; + } while (pair.first.type() == RegType::vgpr && pred.instructions[idx]->opcode != aco_opcode::p_logical_end); + std::vector<aco_ptr<Instruction>>::iterator it = std::next(pred.instructions.begin(), idx); + pred.instructions.insert(it, std::move(spill)); + ctx.spills_exit[pred.index][pair.first] = pair.second; + } + } + + /* iterate phis for which operands to reload */ + for (aco_ptr<Instruction>& phi : instructions) { + assert(phi->opcode == aco_opcode::p_phi || phi->opcode == aco_opcode::p_linear_phi); + assert(ctx.spills_entry[block_idx].find(phi->definitions[0].getTemp()) == ctx.spills_entry[block_idx].end()); + + std::vector<unsigned>& preds = phi->opcode == aco_opcode::p_phi ? 
block->logical_preds : block->linear_preds; + for (unsigned i = 0; i < phi->operands.size(); i++) { + if (!phi->operands[i].isTemp()) + continue; + unsigned pred_idx = preds[i]; + + /* rename operand */ + if (ctx.spills_exit[pred_idx].find(phi->operands[i].getTemp()) == ctx.spills_exit[pred_idx].end()) { + std::map<Temp, Temp>::iterator it = ctx.renames[pred_idx].find(phi->operands[i].getTemp()); + if (it != ctx.renames[pred_idx].end()) + phi->operands[i].setTemp(it->second); + continue; + } + + Temp tmp = phi->operands[i].getTemp(); + + /* reload phi operand at end of predecessor block */ + Temp new_name = {ctx.program->allocateId(), tmp.regClass()}; + Block& pred = ctx.program->blocks[pred_idx]; + unsigned idx = pred.instructions.size(); + do { + assert(idx != 0); + idx--; + } while (phi->opcode == aco_opcode::p_phi && pred.instructions[idx]->opcode != aco_opcode::p_logical_end); + std::vector<aco_ptr<Instruction>>::iterator it = std::next(pred.instructions.begin(), idx); + + aco_ptr<Instruction> reload = do_reload(ctx, tmp, new_name, ctx.spills_exit[pred_idx][tmp]); + pred.instructions.insert(it, std::move(reload)); + + ctx.spills_exit[pred_idx].erase(tmp); + ctx.renames[pred_idx][tmp] = new_name; + phi->operands[i].setTemp(new_name); + } + } + + /* iterate live variables for which to reload */ + // TODO: reload at current block if variable is spilled on all predecessors + for (std::pair<Temp, std::pair<uint32_t, uint32_t>> pair : ctx.next_use_distances_start[block_idx]) { + /* skip spilled variables */ + if (ctx.spills_entry[block_idx].find(pair.first) != ctx.spills_entry[block_idx].end()) + continue; + std::vector<unsigned> preds = pair.first.type() == RegType::vgpr ? block->logical_preds : block->linear_preds; + + /* variable is dead at predecessor, it must be from a phi */ + bool is_dead = false; + for (unsigned pred_idx : preds) { + if (ctx.next_use_distances_end[pred_idx].find(pair.first) == ctx.next_use_distances_end[pred_idx].end()) + is_dead = true; + } + if (is_dead) + continue; + for (unsigned pred_idx : preds) { + /* the variable is not spilled at the predecessor */ + if (ctx.spills_exit[pred_idx].find(pair.first) == ctx.spills_exit[pred_idx].end()) + continue; + + /* variable is spilled at predecessor and has to be reloaded */ + Temp new_name = {ctx.program->allocateId(), pair.first.regClass()}; + Block& pred = ctx.program->blocks[pred_idx]; + unsigned idx = pred.instructions.size(); + do { + assert(idx != 0); + idx--; + } while (pair.first.type() == RegType::vgpr && pred.instructions[idx]->opcode != aco_opcode::p_logical_end); + std::vector<aco_ptr<Instruction>>::iterator it = std::next(pred.instructions.begin(), idx); + + aco_ptr<Instruction> reload = do_reload(ctx, pair.first, new_name, ctx.spills_exit[pred.index][pair.first]); + pred.instructions.insert(it, std::move(reload)); + + ctx.spills_exit[pred.index].erase(pair.first); + ctx.renames[pred.index][pair.first] = new_name; + } + + /* check if we have to create a new phi for this variable */ + Temp rename = Temp(); + bool is_same = true; + for (unsigned pred_idx : preds) { + if (ctx.renames[pred_idx].find(pair.first) == ctx.renames[pred_idx].end()) { + if (rename == Temp()) + rename = pair.first; + else + is_same = rename == pair.first; + } else { + if (rename == Temp()) + rename = ctx.renames[pred_idx][pair.first]; + else + is_same = rename == ctx.renames[pred_idx][pair.first]; + } + + if (!is_same) + break; + } + + if (!is_same) { + /* the variable was renamed differently in the predecessors: we have to create a phi 
*/ + aco_opcode opcode = pair.first.type() == RegType::vgpr ? aco_opcode::p_phi : aco_opcode::p_linear_phi; + aco_ptr<Pseudo_instruction> phi{create_instruction<Pseudo_instruction>(opcode, Format::PSEUDO, preds.size(), 1)}; + rename = {ctx.program->allocateId(), pair.first.regClass()}; + for (unsigned i = 0; i < phi->operands.size(); i++) { + Temp tmp; + if (ctx.renames[preds[i]].find(pair.first) != ctx.renames[preds[i]].end()) + tmp = ctx.renames[preds[i]][pair.first]; + else if (preds[i] >= block_idx) + tmp = rename; + else + tmp = pair.first; + phi->operands[i] = Operand(tmp); + } + phi->definitions[0] = Definition(rename); + instructions.emplace_back(std::move(phi)); + } + + /* the variable was renamed: add new name to renames */ + if (!(rename == Temp() || rename == pair.first)) + ctx.renames[block_idx][pair.first] = rename; + } + + /* combine phis with instructions */ + unsigned idx = 0; + while (!block->instructions[idx]) { + idx++; + } + + ctx.register_demand[block->index].erase(ctx.register_demand[block->index].begin(), ctx.register_demand[block->index].begin() + idx); + ctx.register_demand[block->index].insert(ctx.register_demand[block->index].begin(), instructions.size(), RegisterDemand()); + + std::vector<aco_ptr<Instruction>>::iterator start = std::next(block->instructions.begin(), idx); + instructions.insert(instructions.end(), std::move_iterator<std::vector<aco_ptr<Instruction>>::iterator>(start), + std::move_iterator<std::vector<aco_ptr<Instruction>>::iterator>(block->instructions.end())); + block->instructions = std::move(instructions); +} + +void process_block(spill_ctx& ctx, unsigned block_idx, Block* block, + std::map<Temp, uint32_t> &current_spills, RegisterDemand spilled_registers) +{ + std::vector<std::map<Temp, uint32_t>> local_next_use_distance; + std::vector<aco_ptr<Instruction>> instructions; + unsigned idx = 0; + + /* phis are handled separately */ + while (block->instructions[idx]->opcode == aco_opcode::p_phi || + block->instructions[idx]->opcode == aco_opcode::p_linear_phi) { + aco_ptr<Instruction>& instr = block->instructions[idx]; + for (const Operand& op : instr->operands) { + /* prevent its defining instruction from being DCE'd if it could be rematerialized */ + if (op.isTemp() && ctx.remat.count(op.getTemp())) + ctx.remat_used[ctx.remat[op.getTemp()].instr] = true; + } + instructions.emplace_back(std::move(instr)); + idx++; + } + + if (block->register_demand.exceeds(ctx.target_pressure)) + local_next_use_distance = local_next_uses(ctx, block); + + while (idx < block->instructions.size()) { + aco_ptr<Instruction>& instr = block->instructions[idx]; + + std::map<Temp, std::pair<Temp, uint32_t>> reloads; + std::map<Temp, uint32_t> spills; + /* rename and reload operands */ + for (Operand& op : instr->operands) { + if (!op.isTemp()) + continue; + if (current_spills.find(op.getTemp()) == current_spills.end()) { + /* the Operand is in register: check if it was renamed */ + if (ctx.renames[block_idx].find(op.getTemp()) != ctx.renames[block_idx].end()) + op.setTemp(ctx.renames[block_idx][op.getTemp()]); + /* prevent its defining instruction from being DCE'd if it could be rematerialized */ + if (ctx.remat.count(op.getTemp())) + ctx.remat_used[ctx.remat[op.getTemp()].instr] = true; + continue; + } + /* the Operand is spilled: add it to reloads */ + Temp new_tmp = {ctx.program->allocateId(), op.regClass()}; + ctx.renames[block_idx][op.getTemp()] = new_tmp; + reloads[new_tmp] = std::make_pair(op.getTemp(), current_spills[op.getTemp()]); +
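When the predecessors of a merge block disagree on a variable's current name (for example because a reload created a fresh copy on one edge only), add_coupling_code() above has to materialize a new phi; if they all agree, the single surviving name is reused and only recorded as a rename. The test reduces to the following sketch, where the per-predecessor names are assumed to be already resolved (unlike ACO, which treats "no rename recorded" as the original name):
```
#include <cstdio>
#include <string>
#include <vector>

/* Returns true if at least two incoming edges carry different names for the
 * same original variable; `agreed` receives the common name otherwise. */
bool needs_phi(const std::vector<std::string>& incoming_names, std::string* agreed)
{
    for (const std::string& name : incoming_names) {
        if (agreed->empty())
            *agreed = name;
        else if (*agreed != name)
            return true;
    }
    return false;
}

int main()
{
    std::string agreed;
    std::vector<std::string> names = {"%5", "%17"};   /* a reload renamed the value on one edge */
    if (needs_phi(names, &agreed))
        printf("create a new phi\n");
    else
        printf("reuse %s on every edge\n", agreed.c_str());
    return 0;
}
```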
current_spills.erase(op.getTemp()); + op.setTemp(new_tmp); + spilled_registers -= new_tmp; + } + + /* check if register demand is low enough before and after the current instruction */ + if (block->register_demand.exceeds(ctx.target_pressure)) { + + RegisterDemand new_demand = ctx.register_demand[block_idx][idx]; + if (idx == 0) { + for (const Definition& def : instr->definitions) { + if (!def.isTemp()) + continue; + new_demand += def.getTemp(); + } + } else { + new_demand.update(ctx.register_demand[block_idx][idx - 1]); + } + + assert(!local_next_use_distance.empty()); + + /* if reg pressure is too high, spill variable with furthest next use */ + while (RegisterDemand(new_demand - spilled_registers).exceeds(ctx.target_pressure)) { + unsigned distance = 0; + Temp to_spill; + bool do_rematerialize = false; + if (new_demand.vgpr - spilled_registers.vgpr > ctx.target_pressure.vgpr) { + for (std::pair<Temp, uint32_t> pair : local_next_use_distance[idx]) { + bool can_rematerialize = ctx.remat.count(pair.first); + if (pair.first.type() == RegType::vgpr && + ((pair.second > distance && can_rematerialize == do_rematerialize) || + (can_rematerialize && !do_rematerialize && pair.second > idx)) && + current_spills.find(pair.first) == current_spills.end() && + ctx.spills_exit[block_idx].find(pair.first) == ctx.spills_exit[block_idx].end()) { + to_spill = pair.first; + distance = pair.second; + do_rematerialize = can_rematerialize; + } + } + } else { + for (std::pair<Temp, uint32_t> pair : local_next_use_distance[idx]) { + bool can_rematerialize = ctx.remat.count(pair.first); + if (pair.first.type() == RegType::sgpr && + ((pair.second > distance && can_rematerialize == do_rematerialize) || + (can_rematerialize && !do_rematerialize && pair.second > idx)) && + current_spills.find(pair.first) == current_spills.end() && + ctx.spills_exit[block_idx].find(pair.first) == ctx.spills_exit[block_idx].end()) { + to_spill = pair.first; + distance = pair.second; + do_rematerialize = can_rematerialize; + } + } + } + + assert(distance != 0 && distance > idx); + uint32_t spill_id = ctx.allocate_spill_id(to_spill.regClass()); + + /* add interferences with currently spilled variables */ + for (std::pair<Temp, uint32_t> pair : current_spills) { + ctx.interferences[spill_id].second.emplace(pair.second); + ctx.interferences[pair.second].second.emplace(spill_id); + } + for (std::pair<Temp, std::pair<Temp, uint32_t>> pair : reloads) { + ctx.interferences[spill_id].second.emplace(pair.second.second); + ctx.interferences[pair.second.second].second.emplace(spill_id); + } + + current_spills[to_spill] = spill_id; + spilled_registers += to_spill; + + /* rename if necessary */ + if (ctx.renames[block_idx].find(to_spill) != ctx.renames[block_idx].end()) { + to_spill = ctx.renames[block_idx][to_spill]; + } + + /* add spill to new instructions */ + aco_ptr<Pseudo_instruction> spill{create_instruction<Pseudo_instruction>(aco_opcode::p_spill, Format::PSEUDO, 2, 0)}; + spill->operands[0] = Operand(to_spill); + spill->operands[1] = Operand(spill_id); + instructions.emplace_back(std::move(spill)); + } + } + + /* add reloads and instruction to new instructions */ + for (std::pair<Temp, std::pair<Temp, uint32_t>> pair : reloads) { + aco_ptr<Instruction> reload = do_reload(ctx, pair.second.first, pair.first, pair.second.second); + instructions.emplace_back(std::move(reload)); + } + instructions.emplace_back(std::move(instr)); + idx++; + } + + block->instructions = std::move(instructions); + 
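The candidate loop above refines plain furthest-next-use selection: once a rematerializable temp is available, only other rematerializable temps can displace it, because reloading those costs a single ALU instruction instead of a memory round trip. A standalone sketch of that preference, with an illustrative Candidate struct in place of ACO's Temp and next-use maps:
```
#include <cstdio>
#include <string>
#include <vector>

struct Candidate { std::string name; unsigned next_use; bool can_remat; };

const Candidate* pick(const std::vector<Candidate>& live, unsigned current_idx)
{
    const Candidate* best = nullptr;
    bool best_remat = false;
    unsigned best_dist = 0;
    for (const Candidate& c : live) {
        /* same rule as above: a non-remat candidate never beats a remat one */
        bool better = (c.next_use > best_dist && c.can_remat == best_remat) ||
                      (c.can_remat && !best_remat && c.next_use > current_idx);
        if (better) {
            best = &c;
            best_dist = c.next_use;
            best_remat = c.can_remat;
        }
    }
    return best;
}

int main()
{
    std::vector<Candidate> live = {
        {"t0", 200, false},   /* furthest use, but needs a real reload */
        {"t1",  90, true},    /* closer use, but can simply be recomputed */
    };
    const Candidate* c = pick(live, 10);
    printf("spill %s\n", c ? c->name.c_str() : "<none>");   /* spill t1 */
    return 0;
}
```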
ctx.spills_exit[block_idx].insert(current_spills.begin(), current_spills.end()); +} + +void spill_block(spill_ctx& ctx, unsigned block_idx) +{ + Block* block = &ctx.program->blocks[block_idx]; + ctx.processed[block_idx] = true; + + /* determine set of variables which are spilled at the beginning of the block */ + RegisterDemand spilled_registers = init_live_in_vars(ctx, block, block_idx); + + /* add interferences for spilled variables */ + for (std::pair<Temp, uint32_t> x : ctx.spills_entry[block_idx]) { + for (std::pair<Temp, uint32_t> y : ctx.spills_entry[block_idx]) + if (x.second != y.second) + ctx.interferences[x.second].second.emplace(y.second); + } + + bool is_loop_header = block->loop_nest_depth && ctx.loop_header.top()->index == block_idx; + if (!is_loop_header) { + /* add spill/reload code on incoming control flow edges */ + add_coupling_code(ctx, block, block_idx); + } + + std::map<Temp, uint32_t> current_spills = ctx.spills_entry[block_idx]; + + /* check conditions to process this block */ + bool process = RegisterDemand(block->register_demand - spilled_registers).exceeds(ctx.target_pressure) || + !ctx.renames[block_idx].empty() || + ctx.remat_used.size(); + + std::map<Temp, uint32_t>::iterator it = current_spills.begin(); + while (!process && it != current_spills.end()) { + if (ctx.next_use_distances_start[block_idx][it->first].first == block_idx) + process = true; + ++it; + } + + if (process) + process_block(ctx, block_idx, block, current_spills, spilled_registers); + else + ctx.spills_exit[block_idx].insert(current_spills.begin(), current_spills.end()); + + /* check if the next block leaves the current loop */ + if (block->loop_nest_depth == 0 || ctx.program->blocks[block_idx + 1].loop_nest_depth >= block->loop_nest_depth) + return; + + Block* loop_header = ctx.loop_header.top(); + + /* preserve original renames at end of loop header block */ + std::map<Temp, Temp> renames = std::move(ctx.renames[loop_header->index]); + + /* add coupling code to all loop header predecessors */ + add_coupling_code(ctx, loop_header, loop_header->index); + + /* update remat_used for phis added in add_coupling_code() */ + for (aco_ptr<Instruction>& instr : loop_header->instructions) { + if (!is_phi(instr)) + break; + for (const Operand& op : instr->operands) { + if (op.isTemp() && ctx.remat.count(op.getTemp())) + ctx.remat_used[ctx.remat[op.getTemp()].instr] = true; + } + } + + /* propagate new renames through loop: i.e. repair the SSA */ + renames.swap(ctx.renames[loop_header->index]); + for (std::pair<Temp, Temp> rename : renames) { + for (unsigned idx = loop_header->index; idx <= block_idx; idx++) { + Block& current = ctx.program->blocks[idx]; + std::vector<aco_ptr<Instruction>>::iterator instr_it = current.instructions.begin(); + + /* first rename phis */ + while (instr_it != current.instructions.end()) { + aco_ptr<Instruction>& phi = *instr_it; + if (phi->opcode != aco_opcode::p_phi && phi->opcode != aco_opcode::p_linear_phi) + break; + /* no need to rename the loop header phis once again. 
this happened in add_coupling_code() */ + if (idx == loop_header->index) { + instr_it++; + continue; + } + + for (Operand& op : phi->operands) { + if (!op.isTemp()) + continue; + if (op.getTemp() == rename.first) + op.setTemp(rename.second); + } + instr_it++; + } + + std::map<Temp, std::pair<uint32_t, uint32_t>>::iterator it = ctx.next_use_distances_start[idx].find(rename.first); + + /* variable is not live at beginning of this block */ + if (it == ctx.next_use_distances_start[idx].end()) + continue; + + /* if the variable is live at the block's exit, add rename */ + if (ctx.next_use_distances_end[idx].find(rename.first) != ctx.next_use_distances_end[idx].end()) + ctx.renames[idx].insert(rename); + + /* rename all uses in this block */ + bool renamed = false; + while (!renamed && instr_it != current.instructions.end()) { + aco_ptr<Instruction>& instr = *instr_it; + for (Operand& op : instr->operands) { + if (!op.isTemp()) + continue; + if (op.getTemp() == rename.first) { + op.setTemp(rename.second); + /* we can stop with this block as soon as the variable is spilled */ + if (instr->opcode == aco_opcode::p_spill) + renamed = true; + } + } + instr_it++; + } + } + } + + /* remove loop header info from stack */ + ctx.loop_header.pop(); +} + +void assign_spill_slots(spill_ctx& ctx, unsigned spills_to_vgpr) { + std::map<uint32_t, uint32_t> sgpr_slot; + std::map<uint32_t, uint32_t> vgpr_slot; + std::vector<bool> is_assigned(ctx.interferences.size()); + + /* first, handle affinities: just merge all interferences into both spill ids */ + for (std::pair<uint32_t, uint32_t> pair : ctx.affinities) { + assert(pair.first != pair.second); + for (uint32_t id : ctx.interferences[pair.first].second) + ctx.interferences[id].second.insert(pair.second); + for (uint32_t id : ctx.interferences[pair.second].second) + ctx.interferences[id].second.insert(pair.first); + ctx.interferences[pair.first].second.insert(ctx.interferences[pair.second].second.begin(), ctx.interferences[pair.second].second.end()); + ctx.interferences[pair.second].second.insert(ctx.interferences[pair.first].second.begin(), ctx.interferences[pair.first].second.end()); + + bool reloaded = ctx.is_reloaded[pair.first] || ctx.is_reloaded[pair.second]; + ctx.is_reloaded[pair.first] = ctx.is_reloaded[pair.second] = reloaded; + } + for (ASSERTED uint32_t i = 0; i < ctx.interferences.size(); i++) + for (ASSERTED uint32_t id : ctx.interferences[i].second) + assert(i != id); + + /* for each spill slot, assign as many spill ids as possible */ + std::vector<std::set<uint32_t>> spill_slot_interferences; + unsigned slot_idx = 0; + bool done = false; + + /* assign sgpr spill slots */ + while (!done) { + done = true; + for (unsigned id = 0; id < ctx.interferences.size(); id++) { + if (is_assigned[id] || !ctx.is_reloaded[id]) + continue; + if (ctx.interferences[id].first.type() != RegType::sgpr) + continue; + + /* check interferences */ + bool interferes = false; + for (unsigned i = slot_idx; i < slot_idx + ctx.interferences[id].first.size(); i++) { + if (i == spill_slot_interferences.size()) + spill_slot_interferences.emplace_back(std::set<uint32_t>()); + if (spill_slot_interferences[i].find(id) != spill_slot_interferences[i].end() || i / 64 != slot_idx / 64) { + interferes = true; + break; + } + } + if (interferes) { + done = false; + continue; + } + + /* we found a spill id which can be assigned to current spill slot */ + sgpr_slot[id] = slot_idx; + is_assigned[id] = true; + for (unsigned i = slot_idx; i < slot_idx + ctx.interferences[id].first.size(); i++) 
+ spill_slot_interferences[i].insert(ctx.interferences[id].second.begin(), ctx.interferences[id].second.end()); + } + slot_idx++; + } + + slot_idx = 0; + done = false; + + /* assign vgpr spill slots */ + while (!done) { + done = true; + for (unsigned id = 0; id < ctx.interferences.size(); id++) { + if (is_assigned[id] || !ctx.is_reloaded[id]) + continue; + if (ctx.interferences[id].first.type() != RegType::vgpr) + continue; + + /* check interferences */ + bool interferes = false; + for (unsigned i = slot_idx; i < slot_idx + ctx.interferences[id].first.size(); i++) { + if (i == spill_slot_interferences.size()) + spill_slot_interferences.emplace_back(std::set<uint32_t>()); + /* check for interference and ensure that vector regs are stored next to each other */ + if (spill_slot_interferences[i].find(id) != spill_slot_interferences[i].end() || i / 64 != slot_idx / 64) { + interferes = true; + break; + } + } + if (interferes) { + done = false; + continue; + } + + /* we found a spill id which can be assigned to current spill slot */ + vgpr_slot[id] = slot_idx; + is_assigned[id] = true; + for (unsigned i = slot_idx; i < slot_idx + ctx.interferences[id].first.size(); i++) + spill_slot_interferences[i].insert(ctx.interferences[id].second.begin(), ctx.interferences[id].second.end()); + } + slot_idx++; + } + + for (unsigned id = 0; id < is_assigned.size(); id++) + assert(is_assigned[id] || !ctx.is_reloaded[id]); + + for (std::pair<uint32_t, uint32_t> pair : ctx.affinities) { + assert(is_assigned[pair.first] == is_assigned[pair.second]); + if (!is_assigned[pair.first]) + continue; + assert(ctx.is_reloaded[pair.first] == ctx.is_reloaded[pair.second]); + assert(ctx.interferences[pair.first].first.type() == ctx.interferences[pair.second].first.type()); + if (ctx.interferences[pair.first].first.type() == RegType::sgpr) + assert(sgpr_slot[pair.first] == sgpr_slot[pair.second]); + else + assert(vgpr_slot[pair.first] == vgpr_slot[pair.second]); + } + + /* hope, we didn't mess up */ + std::vector<Temp> vgpr_spill_temps((spill_slot_interferences.size() + 63) / 64); + assert(vgpr_spill_temps.size() <= spills_to_vgpr); + + /* replace pseudo instructions with actual hardware instructions */ + unsigned last_top_level_block_idx = 0; + std::vector<bool> reload_in_loop(vgpr_spill_temps.size()); + for (Block& block : ctx.program->blocks) { + + /* after loops, we insert a user if there was a reload inside the loop */ + if (block.loop_nest_depth == 0) { + int end_vgprs = 0; + for (unsigned i = 0; i < vgpr_spill_temps.size(); i++) { + if (reload_in_loop[i]) + end_vgprs++; + } + + if (end_vgprs > 0) { + aco_ptr<Instruction> destr{create_instruction<Pseudo_instruction>(aco_opcode::p_end_linear_vgpr, Format::PSEUDO, end_vgprs, 0)}; + int k = 0; + for (unsigned i = 0; i < vgpr_spill_temps.size(); i++) { + if (reload_in_loop[i]) + destr->operands[k++] = Operand(vgpr_spill_temps[i]); + reload_in_loop[i] = false; + } + /* find insertion point */ + std::vector<aco_ptr<Instruction>>::iterator it = block.instructions.begin(); + while ((*it)->opcode == aco_opcode::p_linear_phi || (*it)->opcode == aco_opcode::p_phi) + ++it; + block.instructions.insert(it, std::move(destr)); + } + } + + if (block.kind & block_kind_top_level && !block.linear_preds.empty()) { + last_top_level_block_idx = block.index; + + /* check if any spilled variables use a created linear vgpr, otherwise destroy them */ + for (unsigned i = 0; i < vgpr_spill_temps.size(); i++) { + if (vgpr_spill_temps[i] == Temp()) + continue; + + bool can_destroy = true; + for 
(std::pair<Temp, uint32_t> pair : ctx.spills_exit[block.linear_preds[0]]) { + + if (sgpr_slot.find(pair.second) != sgpr_slot.end() && + sgpr_slot[pair.second] / 64 == i) { + can_destroy = false; + break; + } + } + if (can_destroy) + vgpr_spill_temps[i] = Temp(); + } + } + + std::vector<aco_ptr<Instruction>>::iterator it; + std::vector<aco_ptr<Instruction>> instructions; + instructions.reserve(block.instructions.size()); + for (it = block.instructions.begin(); it != block.instructions.end(); ++it) { + + if ((*it)->opcode == aco_opcode::p_spill) { + uint32_t spill_id = (*it)->operands[1].constantValue(); + + if (!ctx.is_reloaded[spill_id]) { + /* never reloaded, so don't spill */ + } else if (vgpr_slot.find(spill_id) != vgpr_slot.end()) { + /* spill vgpr */ + ctx.program->config->spilled_vgprs += (*it)->operands[0].size(); + + assert(false && "vgpr spilling not yet implemented."); + } else if (sgpr_slot.find(spill_id) != sgpr_slot.end()) { + ctx.program->config->spilled_sgprs += (*it)->operands[0].size(); + + uint32_t spill_slot = sgpr_slot[spill_id]; + + /* check if the linear vgpr already exists */ + if (vgpr_spill_temps[spill_slot / 64] == Temp()) { + Temp linear_vgpr = {ctx.program->allocateId(), v1.as_linear()}; + vgpr_spill_temps[spill_slot / 64] = linear_vgpr; + aco_ptr<Pseudo_instruction> create{create_instruction<Pseudo_instruction>(aco_opcode::p_start_linear_vgpr, Format::PSEUDO, 0, 1)}; + create->definitions[0] = Definition(linear_vgpr); + /* find the right place to insert this definition */ + if (last_top_level_block_idx == block.index) { + /* insert right before the current instruction */ + instructions.emplace_back(std::move(create)); + } else { + assert(last_top_level_block_idx < block.index); + /* insert before the branch at last top level block */ + std::vector<aco_ptr<Instruction>>& instructions = ctx.program->blocks[last_top_level_block_idx].instructions; + instructions.insert(std::next(instructions.begin(), instructions.size() - 1), std::move(create)); + } + } + + /* spill sgpr: just add the vgpr temp to operands */ + Pseudo_instruction* spill = create_instruction<Pseudo_instruction>(aco_opcode::p_spill, Format::PSEUDO, 3, 0); + spill->operands[0] = Operand(vgpr_spill_temps[spill_slot / 64]); + spill->operands[1] = Operand(spill_slot % 64); + spill->operands[2] = (*it)->operands[0]; + instructions.emplace_back(aco_ptr<Instruction>(spill)); + } else { + unreachable("No spill slot assigned for spill id"); + } + + } else if ((*it)->opcode == aco_opcode::p_reload) { + uint32_t spill_id = (*it)->operands[0].constantValue(); + assert(ctx.is_reloaded[spill_id]); + + if (vgpr_slot.find(spill_id) != vgpr_slot.end()) { + /* reload vgpr */ + assert(false && "vgpr spilling not yet implemented."); + + } else if (sgpr_slot.find(spill_id) != sgpr_slot.end()) { + uint32_t spill_slot = sgpr_slot[spill_id]; + reload_in_loop[spill_slot / 64] = block.loop_nest_depth > 0; + + /* check if the linear vgpr already exists */ + if (vgpr_spill_temps[spill_slot / 64] == Temp()) { + Temp linear_vgpr = {ctx.program->allocateId(), v1.as_linear()}; + vgpr_spill_temps[spill_slot / 64] = linear_vgpr; + aco_ptr<Pseudo_instruction> create{create_instruction<Pseudo_instruction>(aco_opcode::p_start_linear_vgpr, Format::PSEUDO, 0, 1)}; + create->definitions[0] = Definition(linear_vgpr); + /* find the right place to insert this definition */ + if (last_top_level_block_idx == block.index) { + /* insert right before the current instruction */ + instructions.emplace_back(std::move(create)); + } else { + 
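Each SGPR spill slot ends up as one lane of a linear VGPR: spill_slot / 64 selects the vector register and spill_slot % 64 the lane inside it, which is presumably turned into a writelane/readlane pair by the later lowering. That mapping is also why the slot assignment above rejects candidates whose lanes would cross a 64-slot boundary. A toy model under those assumptions (SpillStorage and fits_in_lane_group are illustrative names, not ACO API):
```
#include <array>
#include <cassert>
#include <cstdint>
#include <cstdio>
#include <vector>

/* Toy model: each linear VGPR stores 64 spill lanes. */
struct SpillStorage {
    std::vector<std::array<uint32_t, 64>> vgprs;

    void spill(unsigned slot, uint32_t value) {
        if (slot / 64 >= vgprs.size())
            vgprs.resize(slot / 64 + 1);        /* "p_start_linear_vgpr" equivalent */
        vgprs[slot / 64][slot % 64] = value;    /* writelane-like store */
    }
    uint32_t reload(unsigned slot) const {
        return vgprs[slot / 64][slot % 64];     /* readlane-like load */
    }
};

/* A spill of `size` dwords starting at `slot` must stay inside one 64-lane group. */
bool fits_in_lane_group(unsigned slot, unsigned size)
{
    assert(size > 0);
    return slot / 64 == (slot + size - 1) / 64;
}

int main()
{
    SpillStorage storage;
    storage.spill(70, 0x1234);                                  /* second VGPR, lane 6 */
    printf("reload -> 0x%x\n", (unsigned)storage.reload(70));
    printf("slot 62, size 2: %d\n", fits_in_lane_group(62, 2)); /* 1: lanes 62-63 */
    printf("slot 63, size 2: %d\n", fits_in_lane_group(63, 2)); /* 0: would straddle the boundary */
    return 0;
}
```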
assert(last_top_level_block_idx < block.index); + /* insert before the branch at last top level block */ + std::vector<aco_ptr<Instruction>>& instructions = ctx.program->blocks[last_top_level_block_idx].instructions; + instructions.insert(std::next(instructions.begin(), instructions.size() - 1), std::move(create)); + } + } + + /* reload sgpr: just add the vgpr temp to operands */ + Pseudo_instruction* reload = create_instruction<Pseudo_instruction>(aco_opcode::p_reload, Format::PSEUDO, 2, 1); + reload->operands[0] = Operand(vgpr_spill_temps[spill_slot / 64]); + reload->operands[1] = Operand(spill_slot % 64); + reload->definitions[0] = (*it)->definitions[0]; + instructions.emplace_back(aco_ptr<Instruction>(reload)); + } else { + unreachable("No spill slot assigned for spill id"); + } + } else if (!ctx.remat_used.count(it->get()) || ctx.remat_used[it->get()]) { + instructions.emplace_back(std::move(*it)); + } + + } + block.instructions = std::move(instructions); + } + + /* SSA elimination inserts copies for logical phis right before p_logical_end + * So if a linear vgpr is used between that p_logical_end and the branch, + * we need to ensure logical phis don't choose a definition which aliases + * the linear vgpr. + * TODO: Moving the spills and reloads to before p_logical_end might produce + * slightly better code. */ + for (Block& block : ctx.program->blocks) { + /* loops exits are already handled */ + if (block.logical_preds.size() <= 1) + continue; + + bool has_logical_phis = false; + for (aco_ptr<Instruction>& instr : block.instructions) { + if (instr->opcode == aco_opcode::p_phi) { + has_logical_phis = true; + break; + } else if (instr->opcode != aco_opcode::p_linear_phi) { + break; + } + } + if (!has_logical_phis) + continue; + + std::set<Temp> vgprs; + for (unsigned pred_idx : block.logical_preds) { + Block& pred = ctx.program->blocks[pred_idx]; + for (int i = pred.instructions.size() - 1; i >= 0; i--) { + aco_ptr<Instruction>& pred_instr = pred.instructions[i]; + if (pred_instr->opcode == aco_opcode::p_logical_end) { + break; + } else if (pred_instr->opcode == aco_opcode::p_spill || + pred_instr->opcode == aco_opcode::p_reload) { + vgprs.insert(pred_instr->operands[0].getTemp()); + } + } + } + if (!vgprs.size()) + continue; + + aco_ptr<Instruction> destr{create_instruction<Pseudo_instruction>(aco_opcode::p_end_linear_vgpr, Format::PSEUDO, vgprs.size(), 0)}; + int k = 0; + for (Temp tmp : vgprs) { + destr->operands[k++] = Operand(tmp); + } + /* find insertion point */ + std::vector<aco_ptr<Instruction>>::iterator it = block.instructions.begin(); + while ((*it)->opcode == aco_opcode::p_linear_phi || (*it)->opcode == aco_opcode::p_phi) + ++it; + block.instructions.insert(it, std::move(destr)); + } +} + +} /* end namespace */ + + +void spill(Program* program, live& live_vars, const struct radv_nir_compiler_options *options) +{ + program->config->spilled_vgprs = 0; + program->config->spilled_sgprs = 0; + + /* no spilling when wave count is already high */ + if (program->num_waves >= 6) + return; + + /* else, we check if we can improve things a bit */ + uint16_t total_sgpr_regs = options->chip_class >= GFX8 ? 
800 : 512; + uint16_t max_addressible_sgpr = program->sgpr_limit; + + /* calculate target register demand */ + RegisterDemand max_reg_demand; + for (Block& block : program->blocks) { + max_reg_demand.update(block.register_demand); + } + + RegisterDemand target_pressure = {256, int16_t(max_addressible_sgpr)}; + unsigned num_waves = 1; + int spills_to_vgpr = (max_reg_demand.sgpr - max_addressible_sgpr + 63) / 64; + + /* test if it possible to increase occupancy with little spilling */ + for (unsigned num_waves_next = 2; num_waves_next <= 8; num_waves_next++) { + RegisterDemand target_pressure_next = {int16_t((256 / num_waves_next) & ~3), + int16_t(std::min<uint16_t>(((total_sgpr_regs / num_waves_next) & ~7) - 2, max_addressible_sgpr))}; + + /* Currently no vgpr spilling supported. + * Spill as many sgprs as necessary to not hinder occupancy */ + if (max_reg_demand.vgpr > target_pressure_next.vgpr) + break; + /* check that we have enough free vgprs to spill sgprs to */ + if (max_reg_demand.sgpr > target_pressure_next.sgpr) { + /* add some buffer in case graph coloring is not perfect ... */ + const int spills_to_vgpr_next = (max_reg_demand.sgpr - target_pressure_next.sgpr + 63 + 32) / 64; + if (spills_to_vgpr_next + max_reg_demand.vgpr > target_pressure_next.vgpr) + break; + spills_to_vgpr = spills_to_vgpr_next; + } + + target_pressure = target_pressure_next; + num_waves = num_waves_next; + } + + assert(max_reg_demand.vgpr <= target_pressure.vgpr && "VGPR spilling not yet supported."); + /* nothing to do */ + if (num_waves == program->num_waves) + return; + + /* initialize ctx */ + spill_ctx ctx(target_pressure, program, live_vars.register_demand); + compute_global_next_uses(ctx, live_vars.live_out); + get_rematerialize_info(ctx); + + /* create spills and reloads */ + for (unsigned i = 0; i < program->blocks.size(); i++) + spill_block(ctx, i); + + /* assign spill slots and DCE rematerialized code */ + assign_spill_slots(ctx, spills_to_vgpr); + + /* update live variable information */ + live_vars = live_var_analysis(program, options); + + assert(program->num_waves >= num_waves); +} + +} + diff --git a/src/amd/compiler/aco_ssa_elimination.cpp b/src/amd/compiler/aco_ssa_elimination.cpp new file mode 100644 index 00000000000..3d76dcd8867 --- /dev/null +++ b/src/amd/compiler/aco_ssa_elimination.cpp @@ -0,0 +1,291 @@ +/* + * Copyright © 2018 Valve Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ * + */ + + +#include "aco_ir.h" + +#include <map> + +namespace aco { +namespace { + +/* map: block-id -> pair (dest, src) to store phi information */ +typedef std::map<uint32_t, std::vector<std::pair<Definition, Operand>>> phi_info; + +struct ssa_elimination_ctx { + phi_info logical_phi_info; + phi_info linear_phi_info; + std::vector<bool> empty_blocks; + Program* program; + + ssa_elimination_ctx(Program* program) : empty_blocks(program->blocks.size(), true), program(program) {} +}; + +void collect_phi_info(ssa_elimination_ctx& ctx) +{ + for (Block& block : ctx.program->blocks) { + for (aco_ptr<Instruction>& phi : block.instructions) { + if (phi->opcode != aco_opcode::p_phi && phi->opcode != aco_opcode::p_linear_phi) + break; + + for (unsigned i = 0; i < phi->operands.size(); i++) { + if (phi->operands[i].isUndefined()) + continue; + if (phi->operands[i].isTemp() && phi->operands[i].physReg() == phi->definitions[0].physReg()) + continue; + + std::vector<unsigned>& preds = phi->opcode == aco_opcode::p_phi ? block.logical_preds : block.linear_preds; + phi_info& info = phi->opcode == aco_opcode::p_phi ? ctx.logical_phi_info : ctx.linear_phi_info; + const auto result = info.emplace(preds[i], std::vector<std::pair<Definition, Operand>>()); + result.first->second.emplace_back(phi->definitions[0], phi->operands[i]); + ctx.empty_blocks[preds[i]] = false; + } + } + } +} + +void insert_parallelcopies(ssa_elimination_ctx& ctx) +{ + /* insert the parallelcopies from logical phis before p_logical_end */ + for (auto&& entry : ctx.logical_phi_info) { + Block& block = ctx.program->blocks[entry.first]; + unsigned idx = block.instructions.size() - 1; + while (block.instructions[idx]->opcode != aco_opcode::p_logical_end) { + assert(idx > 0); + idx--; + } + + std::vector<aco_ptr<Instruction>>::iterator it = std::next(block.instructions.begin(), idx); + aco_ptr<Pseudo_instruction> pc{create_instruction<Pseudo_instruction>(aco_opcode::p_parallelcopy, Format::PSEUDO, entry.second.size(), entry.second.size())}; + unsigned i = 0; + for (std::pair<Definition, Operand>& pair : entry.second) + { + pc->definitions[i] = pair.first; + pc->operands[i] = pair.second; + i++; + } + /* this shouldn't be needed since we're only copying vgprs */ + pc->tmp_in_scc = false; + block.instructions.insert(it, std::move(pc)); + } + + /* insert parallelcopies for the linear phis at the end of blocks just before the branch */ + for (auto&& entry : ctx.linear_phi_info) { + Block& block = ctx.program->blocks[entry.first]; + std::vector<aco_ptr<Instruction>>::iterator it = block.instructions.end(); + --it; + assert((*it)->format == Format::PSEUDO_BRANCH); + aco_ptr<Pseudo_instruction> pc{create_instruction<Pseudo_instruction>(aco_opcode::p_parallelcopy, Format::PSEUDO, entry.second.size(), entry.second.size())}; + unsigned i = 0; + for (std::pair<Definition, Operand>& pair : entry.second) + { + pc->definitions[i] = pair.first; + pc->operands[i] = pair.second; + i++; + } + pc->tmp_in_scc = block.scc_live_out; + pc->scratch_sgpr = block.scratch_sgpr; + block.instructions.insert(it, std::move(pc)); + } +} + + +void try_remove_merge_block(ssa_elimination_ctx& ctx, Block* block) +{ + /* check if the successor is another merge block which restores exec */ + // TODO: divergent loops also restore exec + if (block->linear_succs.size() != 1 || + !(ctx.program->blocks[block->linear_succs[0]].kind & block_kind_merge)) + return; + + /* check if this block is empty and the exec mask is not needed */ + for (aco_ptr<Instruction>& instr : 
block->instructions) { + if (instr->opcode == aco_opcode::p_parallelcopy) { + if (instr->definitions[0].physReg() == exec) + continue; + else + return; + } + + if (instr->opcode != aco_opcode::p_linear_phi && + instr->opcode != aco_opcode::p_phi && + instr->opcode != aco_opcode::p_logical_start && + instr->opcode != aco_opcode::p_logical_end && + instr->opcode != aco_opcode::p_branch) + return; + } + + /* keep the branch instruction and remove the rest */ + aco_ptr<Instruction> branch = std::move(block->instructions.back()); + block->instructions.clear(); + block->instructions.emplace_back(std::move(branch)); +} + +void try_remove_invert_block(ssa_elimination_ctx& ctx, Block* block) +{ + assert(block->linear_succs.size() == 2); + if (block->linear_succs[0] != block->linear_succs[1]) + return; + + /* check if we can remove this block */ + for (aco_ptr<Instruction>& instr : block->instructions) { + if (instr->opcode != aco_opcode::p_linear_phi && + instr->opcode != aco_opcode::p_phi && + instr->opcode != aco_opcode::s_andn2_b64 && + instr->opcode != aco_opcode::p_branch) + return; + } + + unsigned succ_idx = block->linear_succs[0]; + assert(block->linear_preds.size() == 2); + for (unsigned i = 0; i < 2; i++) { + Block *pred = &ctx.program->blocks[block->linear_preds[i]]; + pred->linear_succs[0] = succ_idx; + ctx.program->blocks[succ_idx].linear_preds[i] = pred->index; + + Pseudo_branch_instruction *branch = static_cast<Pseudo_branch_instruction*>(pred->instructions.back().get()); + assert(branch->format == Format::PSEUDO_BRANCH); + branch->target[0] = succ_idx; + branch->target[1] = succ_idx; + } + + block->instructions.clear(); + block->linear_preds.clear(); + block->linear_succs.clear(); +} + +void try_remove_simple_block(ssa_elimination_ctx& ctx, Block* block) +{ + for (aco_ptr<Instruction>& instr : block->instructions) { + if (instr->opcode != aco_opcode::p_logical_start && + instr->opcode != aco_opcode::p_logical_end && + instr->opcode != aco_opcode::p_branch) + return; + } + + Block& pred = ctx.program->blocks[block->linear_preds[0]]; + Block& succ = ctx.program->blocks[block->linear_succs[0]]; + Pseudo_branch_instruction* branch = static_cast<Pseudo_branch_instruction*>(pred.instructions.back().get()); + if (branch->opcode == aco_opcode::p_branch) { + branch->target[0] = succ.index; + branch->target[1] = succ.index; + } else if (branch->target[0] == block->index) { + branch->target[0] = succ.index; + } else if (branch->target[0] == succ.index) { + assert(branch->target[1] == block->index); + branch->target[1] = succ.index; + branch->opcode = aco_opcode::p_branch; + } else if (branch->target[1] == block->index) { + /* check if there is a fall-through path from block to succ */ + bool falls_through = true; + for (unsigned j = block->index + 1; falls_through && j < succ.index; j++) { + assert(ctx.program->blocks[j].index == j); + if (!ctx.program->blocks[j].instructions.empty()) + falls_through = false; + } + if (falls_through) { + branch->target[1] = succ.index; + } else { + /* check if there is a fall-through path for the alternative target */ + for (unsigned j = block->index + 1; j < branch->target[0]; j++) { + if (!ctx.program->blocks[j].instructions.empty()) + return; + } + + /* This is a (uniform) break or continue block. The branch condition has to be inverted. 
*/ + if (branch->opcode == aco_opcode::p_cbranch_z) + branch->opcode = aco_opcode::p_cbranch_nz; + else if (branch->opcode == aco_opcode::p_cbranch_nz) + branch->opcode = aco_opcode::p_cbranch_z; + else + assert(false); + /* also invert the linear successors */ + pred.linear_succs[0] = pred.linear_succs[1]; + pred.linear_succs[1] = succ.index; + branch->target[1] = branch->target[0]; + branch->target[0] = succ.index; + } + } else { + assert(false); + } + + if (branch->target[0] == branch->target[1]) + branch->opcode = aco_opcode::p_branch; + + for (unsigned i = 0; i < pred.linear_succs.size(); i++) + if (pred.linear_succs[i] == block->index) + pred.linear_succs[i] = succ.index; + + for (unsigned i = 0; i < succ.linear_preds.size(); i++) + if (succ.linear_preds[i] == block->index) + succ.linear_preds[i] = pred.index; + + block->instructions.clear(); + block->linear_preds.clear(); + block->linear_succs.clear(); +} + +void jump_threading(ssa_elimination_ctx& ctx) +{ + for (int i = ctx.program->blocks.size() - 1; i >= 0; i--) { + Block* block = &ctx.program->blocks[i]; + + if (!ctx.empty_blocks[i]) + continue; + + if (block->kind & block_kind_invert) { + try_remove_invert_block(ctx, block); + continue; + } + + if (block->linear_succs.size() > 1) + continue; + + if (block->kind & block_kind_merge || + block->kind & block_kind_loop_exit) + try_remove_merge_block(ctx, block); + + if (block->linear_preds.size() == 1) + try_remove_simple_block(ctx, block); + } +} + +} /* end namespace */ + + +void ssa_elimination(Program* program) +{ + ssa_elimination_ctx ctx(program); + + /* Collect information about every phi-instruction */ + collect_phi_info(ctx); + + /* eliminate empty blocks */ + jump_threading(ctx); + + /* insert parallelcopies from SSA elimination */ + insert_parallelcopies(ctx); + +} +} diff --git a/src/amd/compiler/aco_util.h b/src/amd/compiler/aco_util.h new file mode 100644 index 00000000000..ec77ba55716 --- /dev/null +++ b/src/amd/compiler/aco_util.h @@ -0,0 +1,233 @@ +/* + * Copyright Michael Schellenberger Costa + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + */ + +#ifndef ACO_UTIL_H +#define ACO_UTIL_H + +#include <cassert> +#include <iterator> + +namespace aco { + +/*! \brief Definition of a span object +* +* \details A "span" is an "array view" type for holding a view of contiguous +* data. The "span" object does not own the data itself. 
+*/ +template <typename T> +class span { +public: + using value_type = T; + using pointer = value_type*; + using const_pointer = const value_type*; + using reference = value_type&; + using const_reference = const value_type&; + using iterator = pointer; + using const_iterator = const_pointer; + using reverse_iterator = std::reverse_iterator<iterator>; + using const_reverse_iterator = std::reverse_iterator<const_iterator>; + using size_type = std::size_t; + using difference_type = std::ptrdiff_t; + + /*! \brief Compiler generated default constructor + */ + constexpr span() = default; + + /*! \brief Constructor taking a pointer and the length of the span + * \param[in] data Pointer to the underlying data array + * \param[in] length The size of the span + */ + constexpr span(pointer data, const size_type length) + : data{ data } , length{ length } {} + + /*! \brief Returns an iterator to the begin of the span + * \return data + */ + constexpr iterator begin() noexcept { + return data; + } + + /*! \brief Returns a const_iterator to the begin of the span + * \return data + */ + constexpr const_iterator begin() const noexcept { + return data; + } + + /*! \brief Returns an iterator to the end of the span + * \return data + length + */ + constexpr iterator end() noexcept { + return std::next(data, length); + } + + /*! \brief Returns a const_iterator to the end of the span + * \return data + length + */ + constexpr const_iterator end() const noexcept { + return std::next(data, length); + } + + /*! \brief Returns a const_iterator to the begin of the span + * \return data + */ + constexpr const_iterator cbegin() const noexcept { + return data; + } + + /*! \brief Returns a const_iterator to the end of the span + * \return data + length + */ + constexpr const_iterator cend() const noexcept { + return std::next(data, length); + } + + /*! \brief Returns a reverse_iterator to the end of the span + * \return reverse_iterator(end()) + */ + constexpr reverse_iterator rbegin() noexcept { + return reverse_iterator(end()); + } + + /*! \brief Returns a const_reverse_iterator to the end of the span + * \return reverse_iterator(end()) + */ + constexpr const_reverse_iterator rbegin() const noexcept { + return const_reverse_iterator(end()); + } + + /*! \brief Returns a reverse_iterator to the begin of the span + * \return reverse_iterator(begin()) + */ + constexpr reverse_iterator rend() noexcept { + return reverse_iterator(begin()); + } + + /*! \brief Returns a const_reverse_iterator to the begin of the span + * \return reverse_iterator(begin()) + */ + constexpr const_reverse_iterator rend() const noexcept { + return const_reverse_iterator(begin()); + } + + /*! \brief Returns a const_reverse_iterator to the end of the span + * \return rbegin() + */ + constexpr const_reverse_iterator crbegin() const noexcept { + return const_reverse_iterator(cend()); + } + + /*! \brief Returns a const_reverse_iterator to the begin of the span + * \return rend() + */ + constexpr const_reverse_iterator crend() const noexcept { + return const_reverse_iterator(cbegin()); + } + + /*! \brief Unchecked access operator + * \param[in] index Index of the element we want to access + * \return *(std::next(data, index)) + */ + constexpr reference operator[](const size_type index) noexcept { + assert(length > index); + return *(std::next(data, index)); + } + + /*! 
\brief Unchecked const access operator + * \param[in] index Index of the element we want to access + * \return *(std::next(data, index)) + */ + constexpr const_reference operator[](const size_type index) const noexcept { + assert(length > index); + return *(std::next(data, index)); + } + + /*! \brief Returns a reference to the last element of the span + * \return *(std::next(data, length - 1)) + */ + constexpr reference back() noexcept { + assert(length > 0); + return *(std::next(data, length - 1)); + } + + /*! \brief Returns a const_reference to the last element of the span + * \return *(std::next(data, length - 1)) + */ + constexpr const_reference back() const noexcept { + assert(length > 0); + return *(std::next(data, length - 1)); + } + + /*! \brief Returns a reference to the first element of the span + * \return *begin() + */ + constexpr reference front() noexcept { + assert(length > 0); + return *begin(); + } + + /*! \brief Returns a const_reference to the first element of the span + * \return *cbegin() + */ + constexpr const_reference front() const noexcept { + assert(length > 0); + return *cbegin(); + } + + /*! \brief Returns true if the span is empty + * \return length == 0 + */ + constexpr bool empty() const noexcept { + return length == 0; + } + + /*! \brief Returns the size of the span + * \return length == 0 + */ + constexpr size_type size() const noexcept { + return length; + } + + /*! \brief Decreases the size of the span by 1 + */ + constexpr void pop_back() noexcept { + assert(length > 0); + --length; + } + + /*! \brief Clears the span + */ + constexpr void clear() noexcept { + data = nullptr; + length = 0; + } + +private: + pointer data{ nullptr }; //!> Pointer to the underlying data array + size_type length{ 0 }; //!> Size of the span +}; + +} // namespace aco + +#endif // ACO_UTIL_H
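The `span` class above is a non-owning (pointer, length) view, so it can be passed around by value without copying or freeing the underlying storage. A minimal usage sketch, assuming only that `aco_util.h` is on the include path; the array and values are illustrative:
```
#include "aco_util.h"

#include <cstdio>

int main()
{
   unsigned regs[4] = {10, 20, 30, 40};

   /* view over the stack array; span never owns or frees the data */
   aco::span<unsigned> view(regs, 4);

   printf("size=%zu front=%u back=%u\n", view.size(), view.front(), view.back());

   /* begin()/end() are raw pointers, so range-based for loops just work */
   for (unsigned r : view)
      printf("%u ", r);
   printf("\n");

   /* pop_back()/clear() only shrink the view, regs[] itself is untouched */
   view.pop_back();
   printf("after pop_back: size=%zu back=%u\n", view.size(), view.back());

   return 0;
}
```
Because the iterators are plain pointers, a `span` stays valid only as long as the storage it points into does.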
\ No newline at end of file diff --git a/src/amd/compiler/aco_validate.cpp b/src/amd/compiler/aco_validate.cpp new file mode 100644 index 00000000000..0988d66df3a --- /dev/null +++ b/src/amd/compiler/aco_validate.cpp @@ -0,0 +1,460 @@ +/* + * Copyright © 2018 Valve Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + */ + +#include "aco_ir.h" + +#include <map> + +namespace aco { + +#ifndef NDEBUG +void perfwarn(bool cond, const char *msg, Instruction *instr) +{ + if (cond) { + fprintf(stderr, "ACO performance warning: %s\n", msg); + if (instr) { + fprintf(stderr, "instruction: "); + aco_print_instr(instr, stderr); + fprintf(stderr, "\n"); + } + + if (debug_flags & DEBUG_PERFWARN) + exit(1); + } +} +#endif + +void validate(Program* program, FILE * output) +{ + if (!(debug_flags & DEBUG_VALIDATE)) + return; + + bool is_valid = true; + auto check = [&output, &is_valid](bool check, const char * msg, aco::Instruction * instr) -> void { + if (!check) { + fprintf(output, "%s: ", msg); + aco_print_instr(instr, output); + fprintf(output, "\n"); + is_valid = false; + } + }; + + for (Block& block : program->blocks) { + for (aco_ptr<Instruction>& instr : block.instructions) { + + /* check base format */ + Format base_format = instr->format; + base_format = (Format)((uint32_t)base_format & ~(uint32_t)Format::SDWA); + base_format = (Format)((uint32_t)base_format & ~(uint32_t)Format::DPP); + if ((uint32_t)base_format & (uint32_t)Format::VOP1) + base_format = Format::VOP1; + else if ((uint32_t)base_format & (uint32_t)Format::VOP2) + base_format = Format::VOP2; + else if ((uint32_t)base_format & (uint32_t)Format::VOPC) + base_format = Format::VOPC; + else if ((uint32_t)base_format & (uint32_t)Format::VINTRP) + base_format = Format::VINTRP; + check(base_format == instr_info.format[(int)instr->opcode], "Wrong base format for instruction", instr.get()); + + /* check VOP3 modifiers */ + if (((uint32_t)instr->format & (uint32_t)Format::VOP3) && instr->format != Format::VOP3) { + check(base_format == Format::VOP2 || + base_format == Format::VOP1 || + base_format == Format::VOPC || + base_format == Format::VINTRP, + "Format cannot have VOP3A/VOP3B applied", instr.get()); + } + + /* check for undefs */ + for (unsigned i = 0; i < instr->operands.size(); i++) { + if (instr->operands[i].isUndefined()) { + bool flat = instr->format == Format::FLAT || instr->format == Format::SCRATCH || instr->format == Format::GLOBAL; + bool can_be_undef = 
is_phi(instr) || instr->format == Format::EXP || + instr->format == Format::PSEUDO_REDUCTION || + (flat && i == 1) || (instr->format == Format::MIMG && i == 2) || + ((instr->format == Format::MUBUF || instr->format == Format::MTBUF) && i == 0); + check(can_be_undef, "Undefs can only be used in certain operands", instr.get()); + } + } + + /* check num literals */ + if (instr->isSALU() || instr->isVALU()) { + unsigned num_literals = 0; + for (unsigned i = 0; i < instr->operands.size(); i++) + { + if (instr->operands[i].isLiteral()) { + check(instr->format == Format::SOP1 || + instr->format == Format::SOP2 || + instr->format == Format::SOPC || + instr->format == Format::VOP1 || + instr->format == Format::VOP2 || + instr->format == Format::VOPC, + "Literal applied on wrong instruction format", instr.get()); + + num_literals++; + check(!instr->isVALU() || i == 0 || i == 2, "Wrong source position for Literal argument", instr.get()); + } + } + check(num_literals <= 1, "Only 1 Literal allowed", instr.get()); + + /* check num sgprs for VALU */ + if (instr->isVALU()) { + check(instr->definitions[0].getTemp().type() == RegType::vgpr || + (int) instr->format & (int) Format::VOPC || + instr->opcode == aco_opcode::v_readfirstlane_b32 || + instr->opcode == aco_opcode::v_readlane_b32, + "Wrong Definition type for VALU instruction", instr.get()); + unsigned num_sgpr = 0; + unsigned sgpr_idx = instr->operands.size(); + for (unsigned i = 0; i < instr->operands.size(); i++) + { + if (instr->operands[i].isTemp() && instr->operands[i].regClass().type() == RegType::sgpr) { + check(i != 1 || (int) instr->format & (int) Format::VOP3A, "Wrong source position for SGPR argument", instr.get()); + + if (sgpr_idx == instr->operands.size() || instr->operands[sgpr_idx].tempId() != instr->operands[i].tempId()) + num_sgpr++; + sgpr_idx = i; + } + + if (instr->operands[i].isConstant() && !instr->operands[i].isLiteral()) + check(i == 0 || (int) instr->format & (int) Format::VOP3A, "Wrong source position for constant argument", instr.get()); + } + check(num_sgpr + num_literals <= 1, "Only 1 Literal OR 1 SGPR allowed", instr.get()); + } + + if (instr->format == Format::SOP1 || instr->format == Format::SOP2) { + check(instr->definitions[0].getTemp().type() == RegType::sgpr, "Wrong Definition type for SALU instruction", instr.get()); + for (const Operand& op : instr->operands) { + check(op.isConstant() || op.regClass().type() <= RegType::sgpr, + "Wrong Operand type for SALU instruction", instr.get()); + } + } + } + + switch (instr->format) { + case Format::PSEUDO: { + if (instr->opcode == aco_opcode::p_create_vector) { + unsigned size = 0; + for (const Operand& op : instr->operands) { + size += op.size(); + } + check(size == instr->definitions[0].size(), "Definition size does not match operand sizes", instr.get()); + if (instr->definitions[0].getTemp().type() == RegType::sgpr) { + for (const Operand& op : instr->operands) { + check(op.isConstant() || op.regClass().type() == RegType::sgpr, + "Wrong Operand type for scalar vector", instr.get()); + } + } + } else if (instr->opcode == aco_opcode::p_extract_vector) { + check((instr->operands[0].isTemp()) && instr->operands[1].isConstant(), "Wrong Operand types", instr.get()); + check(instr->operands[1].constantValue() < instr->operands[0].size(), "Index out of range", instr.get()); + check(instr->definitions[0].getTemp().type() == RegType::vgpr || instr->operands[0].regClass().type() == RegType::sgpr, + "Cannot extract SGPR value from VGPR vector", instr.get()); + } else if 
(instr->opcode == aco_opcode::p_parallelcopy) { + check(instr->definitions.size() == instr->operands.size(), "Number of Operands does not match number of Definitions", instr.get()); + for (unsigned i = 0; i < instr->operands.size(); i++) { + if (instr->operands[i].isTemp()) + check((instr->definitions[i].getTemp().type() == instr->operands[i].regClass().type()) || + (instr->definitions[i].getTemp().type() == RegType::vgpr && instr->operands[i].regClass().type() == RegType::sgpr), + "Operand and Definition types do not match", instr.get()); + } + } else if (instr->opcode == aco_opcode::p_phi) { + check(instr->operands.size() == block.logical_preds.size(), "Number of Operands does not match number of predecessors", instr.get()); + check(instr->definitions[0].getTemp().type() == RegType::vgpr || instr->definitions[0].getTemp().regClass() == s2, "Logical Phi Definition must be vgpr or divergent boolean", instr.get()); + } else if (instr->opcode == aco_opcode::p_linear_phi) { + for (const Operand& op : instr->operands) + check(!op.isTemp() || op.getTemp().is_linear(), "Wrong Operand type", instr.get()); + check(instr->operands.size() == block.linear_preds.size(), "Number of Operands does not match number of predecessors", instr.get()); + } + break; + } + case Format::SMEM: { + if (instr->operands.size() >= 1) + check(instr->operands[0].isTemp() && instr->operands[0].regClass().type() == RegType::sgpr, "SMEM operands must be sgpr", instr.get()); + if (instr->operands.size() >= 2) + check(instr->operands[1].isConstant() || (instr->operands[1].isTemp() && instr->operands[1].regClass().type() == RegType::sgpr), + "SMEM offset must be constant or sgpr", instr.get()); + if (!instr->definitions.empty()) + check(instr->definitions[0].getTemp().type() == RegType::sgpr, "SMEM result must be sgpr", instr.get()); + break; + } + case Format::MTBUF: + case Format::MUBUF: + case Format::MIMG: { + check(instr->operands.size() > 1, "VMEM instructions must have at least one operand", instr.get()); + check(instr->operands[0].hasRegClass() && instr->operands[0].regClass().type() == RegType::vgpr, + "VADDR must be in vgpr for VMEM instructions", instr.get()); + check(instr->operands[1].isTemp() && instr->operands[1].regClass().type() == RegType::sgpr, "VMEM resource constant must be sgpr", instr.get()); + check(instr->operands.size() < 4 || (instr->operands[3].isTemp() && instr->operands[3].regClass().type() == RegType::vgpr), "VMEM write data must be vgpr", instr.get()); + break; + } + case Format::DS: { + for (const Operand& op : instr->operands) { + check((op.isTemp() && op.regClass().type() == RegType::vgpr) || op.physReg() == m0, + "Only VGPRs are valid DS instruction operands", instr.get()); + } + if (!instr->definitions.empty()) + check(instr->definitions[0].getTemp().type() == RegType::vgpr, "DS instruction must return VGPR", instr.get()); + break; + } + case Format::EXP: { + for (unsigned i = 0; i < 4; i++) + check(instr->operands[i].hasRegClass() && instr->operands[i].regClass().type() == RegType::vgpr, + "Only VGPRs are valid Export arguments", instr.get()); + break; + } + case Format::FLAT: + check(instr->operands[1].isUndefined(), "Flat instructions don't support SADDR", instr.get()); + /* fallthrough */ + case Format::GLOBAL: + case Format::SCRATCH: { + check(instr->operands[0].isTemp() && instr->operands[0].regClass().type() == RegType::vgpr, "FLAT/GLOBAL/SCRATCH address must be vgpr", instr.get()); + check(instr->operands[1].hasRegClass() && instr->operands[1].regClass().type() == RegType::sgpr, + 
"FLAT/GLOBAL/SCRATCH sgpr address must be undefined or sgpr", instr.get()); + if (!instr->definitions.empty()) + check(instr->definitions[0].getTemp().type() == RegType::vgpr, "FLAT/GLOBAL/SCRATCH result must be vgpr", instr.get()); + else + check(instr->operands[2].regClass().type() == RegType::vgpr, "FLAT/GLOBAL/SCRATCH data must be vgpr", instr.get()); + break; + } + default: + break; + } + } + } + assert(is_valid); +} + +/* RA validation */ +namespace { + +struct Location { + Location() : block(NULL), instr(NULL) {} + + Block *block; + Instruction *instr; //NULL if it's the block's live-in +}; + +struct Assignment { + Location defloc; + Location firstloc; + PhysReg reg; +}; + +bool ra_fail(FILE *output, Location loc, Location loc2, const char *fmt, ...) { + va_list args; + va_start(args, fmt); + char msg[1024]; + vsprintf(msg, fmt, args); + va_end(args); + + fprintf(stderr, "RA error found at instruction in BB%d:\n", loc.block->index); + if (loc.instr) { + aco_print_instr(loc.instr, stderr); + fprintf(stderr, "\n%s", msg); + } else { + fprintf(stderr, "%s", msg); + } + if (loc2.block) { + fprintf(stderr, " in BB%d:\n", loc2.block->index); + aco_print_instr(loc2.instr, stderr); + } + fprintf(stderr, "\n\n"); + + return true; +} + +} /* end namespace */ + +bool validate_ra(Program *program, const struct radv_nir_compiler_options *options, FILE *output) { + if (!(debug_flags & DEBUG_VALIDATE_RA)) + return false; + + bool err = false; + aco::live live_vars = aco::live_var_analysis(program, options); + std::vector<std::vector<Temp>> phi_sgpr_ops(program->blocks.size()); + + std::map<unsigned, Assignment> assignments; + for (Block& block : program->blocks) { + Location loc; + loc.block = █ + for (aco_ptr<Instruction>& instr : block.instructions) { + if (instr->opcode == aco_opcode::p_phi) { + for (unsigned i = 0; i < instr->operands.size(); i++) { + if (instr->operands[i].isTemp() && + instr->operands[i].getTemp().type() == RegType::sgpr && + instr->operands[i].isFirstKill()) + phi_sgpr_ops[block.logical_preds[i]].emplace_back(instr->operands[i].getTemp()); + } + } + + loc.instr = instr.get(); + for (unsigned i = 0; i < instr->operands.size(); i++) { + Operand& op = instr->operands[i]; + if (!op.isTemp()) + continue; + if (!op.isFixed()) + err |= ra_fail(output, loc, Location(), "Operand %d is not assigned a register", i); + if (assignments.count(op.tempId()) && assignments[op.tempId()].reg != op.physReg()) + err |= ra_fail(output, loc, assignments.at(op.tempId()).firstloc, "Operand %d has an inconsistent register assignment with instruction", i); + if ((op.getTemp().type() == RegType::vgpr && op.physReg() + op.size() > 256 + program->config->num_vgprs) || + (op.getTemp().type() == RegType::sgpr && op.physReg() + op.size() > program->config->num_sgprs && op.physReg() < program->sgpr_limit)) + err |= ra_fail(output, loc, assignments.at(op.tempId()).firstloc, "Operand %d has an out-of-bounds register assignment", i); + if (!assignments[op.tempId()].firstloc.block) + assignments[op.tempId()].firstloc = loc; + if (!assignments[op.tempId()].defloc.block) + assignments[op.tempId()].reg = op.physReg(); + } + + for (unsigned i = 0; i < instr->definitions.size(); i++) { + Definition& def = instr->definitions[i]; + if (!def.isTemp()) + continue; + if (!def.isFixed()) + err |= ra_fail(output, loc, Location(), "Definition %d is not assigned a register", i); + if (assignments[def.tempId()].defloc.block) + err |= ra_fail(output, loc, assignments.at(def.tempId()).defloc, "Temporary %%%d also defined by 
instruction", def.tempId()); + if ((def.getTemp().type() == RegType::vgpr && def.physReg() + def.size() > 256 + program->config->num_vgprs) || + (def.getTemp().type() == RegType::sgpr && def.physReg() + def.size() > program->config->num_sgprs && def.physReg() < program->sgpr_limit)) + err |= ra_fail(output, loc, assignments.at(def.tempId()).firstloc, "Definition %d has an out-of-bounds register assignment", i); + if (!assignments[def.tempId()].firstloc.block) + assignments[def.tempId()].firstloc = loc; + assignments[def.tempId()].defloc = loc; + assignments[def.tempId()].reg = def.physReg(); + } + } + } + + for (Block& block : program->blocks) { + Location loc; + loc.block = █ + + std::array<unsigned, 512> regs; + regs.fill(0); + + std::set<Temp> live; + live.insert(live_vars.live_out[block.index].begin(), live_vars.live_out[block.index].end()); + /* remove killed p_phi sgpr operands */ + for (Temp tmp : phi_sgpr_ops[block.index]) + live.erase(tmp); + + /* check live out */ + for (Temp tmp : live) { + PhysReg reg = assignments.at(tmp.id()).reg; + for (unsigned i = 0; i < tmp.size(); i++) { + if (regs[reg + i]) { + err |= ra_fail(output, loc, Location(), "Assignment of element %d of %%%d already taken by %%%d in live-out", i, tmp.id(), regs[reg + i]); + } + regs[reg + i] = tmp.id(); + } + } + regs.fill(0); + + for (auto it = block.instructions.rbegin(); it != block.instructions.rend(); ++it) { + aco_ptr<Instruction>& instr = *it; + + /* check killed p_phi sgpr operands */ + if (instr->opcode == aco_opcode::p_logical_end) { + for (Temp tmp : phi_sgpr_ops[block.index]) { + PhysReg reg = assignments.at(tmp.id()).reg; + for (unsigned i = 0; i < tmp.size(); i++) { + if (regs[reg + i]) + err |= ra_fail(output, loc, Location(), "Assignment of element %d of %%%d already taken by %%%d in live-out", i, tmp.id(), regs[reg + i]); + } + live.emplace(tmp); + } + } + + for (const Definition& def : instr->definitions) { + if (!def.isTemp()) + continue; + live.erase(def.getTemp()); + } + + /* don't count phi operands as live-in, since they are actually + * killed when they are copied at the predecessor */ + if (instr->opcode != aco_opcode::p_phi && instr->opcode != aco_opcode::p_linear_phi) { + for (const Operand& op : instr->operands) { + if (!op.isTemp()) + continue; + live.insert(op.getTemp()); + } + } + } + + for (Temp tmp : live) { + PhysReg reg = assignments.at(tmp.id()).reg; + for (unsigned i = 0; i < tmp.size(); i++) + regs[reg + i] = tmp.id(); + } + + for (aco_ptr<Instruction>& instr : block.instructions) { + loc.instr = instr.get(); + + /* remove killed p_phi operands from regs */ + if (instr->opcode == aco_opcode::p_logical_end) { + for (Temp tmp : phi_sgpr_ops[block.index]) { + PhysReg reg = assignments.at(tmp.id()).reg; + regs[reg] = 0; + } + } + + if (instr->opcode != aco_opcode::p_phi && instr->opcode != aco_opcode::p_linear_phi) { + for (const Operand& op : instr->operands) { + if (!op.isTemp()) + continue; + if (op.isFirstKill()) { + for (unsigned j = 0; j < op.getTemp().size(); j++) + regs[op.physReg() + j] = 0; + } + } + } + + for (unsigned i = 0; i < instr->definitions.size(); i++) { + Definition& def = instr->definitions[i]; + if (!def.isTemp()) + continue; + Temp tmp = def.getTemp(); + PhysReg reg = assignments.at(tmp.id()).reg; + for (unsigned j = 0; j < tmp.size(); j++) { + if (regs[reg + j]) + err |= ra_fail(output, loc, assignments.at(regs[reg + i]).defloc, "Assignment of element %d of %%%d already taken by %%%d from instruction", i, tmp.id(), regs[reg + j]); + regs[reg + j] = 
tmp.id(); + } + } + + for (const Definition& def : instr->definitions) { + if (!def.isTemp()) + continue; + if (def.isKill()) { + for (unsigned j = 0; j < def.getTemp().size(); j++) + regs[def.physReg() + j] = 0; + } + } + } + } + + return err; +} +} diff --git a/src/amd/compiler/meson.build b/src/amd/compiler/meson.build new file mode 100644 index 00000000000..73151cad6eb --- /dev/null +++ b/src/amd/compiler/meson.build @@ -0,0 +1,103 @@ +# Copyright © 2018 Valve Corporation + +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: + +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. + +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +aco_depends = files('aco_opcodes.py') + +aco_opcodes_h = custom_target( + 'aco_opcodes.h', + input : 'aco_opcodes_h.py', + output : 'aco_opcodes.h', + command : [prog_python, '@INPUT@'], + capture : true, + depend_files : aco_depends, +) + +aco_opcodes_c = custom_target( + 'aco_opcodes.cpp', + input : 'aco_opcodes_cpp.py', + output : 'aco_opcodes.cpp', + command : [prog_python, '@INPUT@'], + capture : true, + depend_files : aco_depends, +) + +aco_builder_h = custom_target( + 'aco_builder.h', + input : 'aco_builder_h.py', + output : 'aco_builder.h', + command : [prog_python, '@INPUT@'], + capture : true, + depend_files : aco_depends, +) + +# Headers-only dependency +idep_aco_headers = declare_dependency( + sources : [aco_opcodes_h], + include_directories : include_directories('.'), +) + +libaco_files = files( + 'aco_dead_code_analysis.cpp', + 'aco_dominance.cpp', + 'aco_instruction_selection.cpp', + 'aco_instruction_selection_setup.cpp', + 'aco_interface.cpp', + 'aco_interface.h', + 'aco_ir.h', + 'aco_assembler.cpp', + 'aco_insert_exec_mask.cpp', + 'aco_insert_NOPs.cpp', + 'aco_insert_waitcnt.cpp', + 'aco_reduce_assign.cpp', + 'aco_register_allocation.cpp', + 'aco_live_var_analysis.cpp', + 'aco_lower_bool_phis.cpp', + 'aco_lower_to_hw_instr.cpp', + 'aco_optimizer.cpp', + 'aco_opt_value_numbering.cpp', + 'aco_print_asm.cpp', + 'aco_print_ir.cpp', + 'aco_scheduler.cpp', + 'aco_ssa_elimination.cpp', + 'aco_spill.cpp', + 'aco_util.h', + 'aco_validate.cpp', +) + +_libaco = static_library( + 'aco', + [libaco_files, aco_opcodes_c, aco_opcodes_h, aco_builder_h], + include_directories : [ + inc_common, inc_compiler, inc_mesa, inc_mapi, inc_amd, inc_amd_common, + ], + dependencies : [ + dep_llvm, dep_thread, dep_elf, dep_libdrm_amdgpu, dep_valgrind, + idep_nir_headers, idep_amdgfxregs_h, + ], + c_args : [c_vis_args], + cpp_args : [cpp_vis_args], + build_by_default : true, +) + +# Also link with aco +idep_aco = declare_dependency( + dependencies : idep_aco_headers, + 
link_with : _libaco, +)
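For reference, the occupancy heuristic in `spill()` (aco_spill.cpp above) is a small piece of standalone arithmetic: for each candidate wave count it derives an aligned per-wave VGPR/SGPR budget and only accepts the step if the SGPR overshoot still fits into free VGPRs used as spill space. A sketch of that calculation with made-up inputs (the register demands and the 102 SGPR limit are illustrative assumptions, not taken from a real shader; the 800 SGPRs per SIMD matches the GFX8+ case in the code):
```
#include <algorithm>
#include <cstdint>
#include <cstdio>

int main()
{
   /* illustrative inputs: GFX8+ register file, a shader needing 40 VGPRs / 120 SGPRs */
   const uint16_t total_sgpr_regs = 800;
   const uint16_t max_addressible_sgpr = 102; /* assumed sgpr_limit */
   const int demand_vgpr = 40, demand_sgpr = 120;

   int target_vgpr = 256, target_sgpr = max_addressible_sgpr;
   unsigned num_waves = 1;
   int spills_to_vgpr = (demand_sgpr - max_addressible_sgpr + 63) / 64;

   for (unsigned next = 2; next <= 8; next++) {
      /* per-wave budgets, aligned down to the allocation granularity */
      int vgpr_budget = (256 / next) & ~3;
      int sgpr_budget = std::min<int>(((total_sgpr_regs / next) & ~7) - 2, max_addressible_sgpr);

      if (demand_vgpr > vgpr_budget)
         break; /* VGPR spilling is not supported, stop here */

      if (demand_sgpr > sgpr_budget) {
         /* linear VGPRs needed to hold the spilled SGPRs (63 rounds up, 32 is slack) */
         int next_spills = (demand_sgpr - sgpr_budget + 63 + 32) / 64;
         if (next_spills + demand_vgpr > vgpr_budget)
            break;
         spills_to_vgpr = next_spills;
      }

      target_vgpr = vgpr_budget;
      target_sgpr = sgpr_budget;
      num_waves = next;
   }

   printf("waves=%u target=(%d vgpr, %d sgpr) spill vgprs=%d\n",
          num_waves, target_vgpr, target_sgpr, spills_to_vgpr);
   return 0;
}
```
With these made-up numbers the loop settles on 5 waves and reserves one linear VGPR for SGPR spill slots; the next step (6 waves) is rejected because the extra spill VGPR would push the shader over the 40-VGPR budget.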