diff options
Diffstat (limited to 'src/amd/compiler/aco_ir.h')
-rw-r--r-- | src/amd/compiler/aco_ir.h | 1169 |
1 files changed, 1169 insertions, 0 deletions
diff --git a/src/amd/compiler/aco_ir.h b/src/amd/compiler/aco_ir.h new file mode 100644 index 00000000000..663635e5b93 --- /dev/null +++ b/src/amd/compiler/aco_ir.h @@ -0,0 +1,1169 @@ +/* + * Copyright © 2018 Valve Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + */ + +#ifndef ACO_IR_H +#define ACO_IR_H + +#include <vector> +#include <set> +#include <bitset> +#include <memory> + +#include "nir.h" +#include "ac_binary.h" +#include "amd_family.h" +#include "aco_opcodes.h" +#include "aco_util.h" + +struct radv_nir_compiler_options; +struct radv_shader_info; + +namespace aco { + +extern uint64_t debug_flags; + +enum { + DEBUG_VALIDATE = 0x1, + DEBUG_VALIDATE_RA = 0x2, + DEBUG_PERFWARN = 0x4, +}; + +/** + * Representation of the instruction's microcode encoding format + * Note: Some Vector ALU Formats can be combined, such that: + * - VOP2* | VOP3A represents a VOP2 instruction in VOP3A encoding + * - VOP2* | DPP represents a VOP2 instruction with data parallel primitive. + * - VOP2* | SDWA represents a VOP2 instruction with sub-dword addressing. + * + * (*) The same is applicable for VOP1 and VOPC instructions. + */ +enum class Format : std::uint16_t { + /* Pseudo Instruction Format */ + PSEUDO = 0, + /* Scalar ALU & Control Formats */ + SOP1 = 1, + SOP2 = 2, + SOPK = 3, + SOPP = 4, + SOPC = 5, + /* Scalar Memory Format */ + SMEM = 6, + /* LDS/GDS Format */ + DS = 8, + /* Vector Memory Buffer Formats */ + MTBUF = 9, + MUBUF = 10, + /* Vector Memory Image Format */ + MIMG = 11, + /* Export Format */ + EXP = 12, + /* Flat Formats */ + FLAT = 13, + GLOBAL = 14, + SCRATCH = 15, + + PSEUDO_BRANCH = 16, + PSEUDO_BARRIER = 17, + PSEUDO_REDUCTION = 18, + + /* Vector ALU Formats */ + VOP1 = 1 << 8, + VOP2 = 1 << 9, + VOPC = 1 << 10, + VOP3 = 1 << 11, + VOP3A = 1 << 11, + VOP3B = 1 << 11, + VOP3P = 1 << 12, + /* Vector Parameter Interpolation Format */ + VINTRP = 1 << 13, + DPP = 1 << 14, + SDWA = 1 << 15, +}; + +enum barrier_interaction { + barrier_none = 0, + barrier_buffer = 0x1, + barrier_image = 0x2, + barrier_atomic = 0x4, + barrier_shared = 0x8, + barrier_count = 4, +}; + +constexpr Format asVOP3(Format format) { + return (Format) ((uint32_t) Format::VOP3 | (uint32_t) format); +}; + +enum class RegType { + none = 0, + sgpr, + vgpr, + linear_vgpr, +}; + +struct RegClass { + + enum RC : uint8_t { + s1 = 1, + s2 = 2, + s3 = 3, + s4 = 4, + s6 = 6, + s8 = 8, + s16 = 16, + v1 = s1 | (1 << 5), + v2 = s2 | (1 << 5), + v3 = s3 | (1 << 5), + v4 = s4 | (1 << 5), + v5 = 5 | (1 << 5), + v6 = 6 | (1 << 5), + v7 = 7 | (1 << 5), + v8 = 8 | (1 << 5), + /* these are used for WWM and spills to vgpr */ + v1_linear = v1 | (1 << 6), + v2_linear = v2 | (1 << 6), + }; + + RegClass() = default; + constexpr RegClass(RC rc) + : rc(rc) {} + constexpr RegClass(RegType type, unsigned size) + : rc((RC) ((type == RegType::vgpr ? 1 << 5 : 0) | size)) {} + + constexpr operator RC() const { return rc; } + explicit operator bool() = delete; + + constexpr RegType type() const { return rc <= RC::s16 ? RegType::sgpr : RegType::vgpr; } + constexpr unsigned size() const { return (unsigned) rc & 0x1F; } + constexpr bool is_linear() const { return rc <= RC::s16 || rc & (1 << 6); } + constexpr RegClass as_linear() const { return RegClass((RC) (rc | (1 << 6))); } + +private: + RC rc; +}; + +/* transitional helper expressions */ +static constexpr RegClass s1{RegClass::s1}; +static constexpr RegClass s2{RegClass::s2}; +static constexpr RegClass s3{RegClass::s3}; +static constexpr RegClass s4{RegClass::s4}; +static constexpr RegClass s8{RegClass::s8}; +static constexpr RegClass s16{RegClass::s16}; +static constexpr RegClass v1{RegClass::v1}; +static constexpr RegClass v2{RegClass::v2}; +static constexpr RegClass v3{RegClass::v3}; +static constexpr RegClass v4{RegClass::v4}; +static constexpr RegClass v5{RegClass::v5}; +static constexpr RegClass v6{RegClass::v6}; +static constexpr RegClass v7{RegClass::v7}; +static constexpr RegClass v8{RegClass::v8}; + +/** + * Temp Class + * Each temporary virtual register has a + * register class (i.e. size and type) + * and SSA id. + */ +struct Temp { + Temp() = default; + constexpr Temp(uint32_t id, RegClass cls) noexcept + : id_(id), reg_class(cls) {} + + constexpr uint32_t id() const noexcept { return id_; } + constexpr RegClass regClass() const noexcept { return reg_class; } + + constexpr unsigned size() const noexcept { return reg_class.size(); } + constexpr RegType type() const noexcept { return reg_class.type(); } + constexpr bool is_linear() const noexcept { return reg_class.is_linear(); } + + constexpr bool operator <(Temp other) const noexcept { return id() < other.id(); } + constexpr bool operator==(Temp other) const noexcept { return id() == other.id(); } + constexpr bool operator!=(Temp other) const noexcept { return id() != other.id(); } + +private: + uint32_t id_:24; + RegClass reg_class; +}; + +/** + * PhysReg + * Represents the physical register for each + * Operand and Definition. + */ +struct PhysReg { + constexpr PhysReg() = default; + explicit constexpr PhysReg(unsigned r) : reg(r) {} + constexpr operator unsigned() const { return reg; } + + uint16_t reg = 0; +}; + +/* helper expressions for special registers */ +static constexpr PhysReg m0{124}; +static constexpr PhysReg vcc{106}; +static constexpr PhysReg exec{126}; +static constexpr PhysReg exec_lo{126}; +static constexpr PhysReg exec_hi{127}; +static constexpr PhysReg scc{253}; + +/** + * Operand Class + * Initially, each Operand refers to either + * a temporary virtual register + * or to a constant value + * Temporary registers get mapped to physical register during RA + * Constant values are inlined into the instruction sequence. + */ +class Operand final +{ +public: + constexpr Operand() + : reg_(PhysReg{128}), isTemp_(false), isFixed_(true), isConstant_(false), + isKill_(false), isUndef_(true), isFirstKill_(false), is64BitConst_(false) {} + + explicit Operand(Temp r) noexcept + { + data_.temp = r; + if (r.id()) { + isTemp_ = true; + } else { + isUndef_ = true; + setFixed(PhysReg{128}); + } + }; + explicit Operand(uint32_t v) noexcept + { + data_.i = v; + isConstant_ = true; + if (v <= 64) + setFixed(PhysReg{128 + v}); + else if (v >= 0xFFFFFFF0) /* [-16 .. -1] */ + setFixed(PhysReg{192 - v}); + else if (v == 0x3f000000) /* 0.5 */ + setFixed(PhysReg{240}); + else if (v == 0xbf000000) /* -0.5 */ + setFixed(PhysReg{241}); + else if (v == 0x3f800000) /* 1.0 */ + setFixed(PhysReg{242}); + else if (v == 0xbf800000) /* -1.0 */ + setFixed(PhysReg{243}); + else if (v == 0x40000000) /* 2.0 */ + setFixed(PhysReg{244}); + else if (v == 0xc0000000) /* -2.0 */ + setFixed(PhysReg{245}); + else if (v == 0x40800000) /* 4.0 */ + setFixed(PhysReg{246}); + else if (v == 0xc0800000) /* -4.0 */ + setFixed(PhysReg{247}); + else if (v == 0x3e22f983) /* 1/(2*PI) */ + setFixed(PhysReg{248}); + else /* Literal Constant */ + setFixed(PhysReg{255}); + }; + explicit Operand(uint64_t v) noexcept + { + isConstant_ = true; + is64BitConst_ = true; + if (v <= 64) + setFixed(PhysReg{128 + (uint32_t) v}); + else if (v >= 0xFFFFFFFFFFFFFFF0) /* [-16 .. -1] */ + setFixed(PhysReg{192 - (uint32_t) v}); + else if (v == 0x3FE0000000000000) /* 0.5 */ + setFixed(PhysReg{240}); + else if (v == 0xBFE0000000000000) /* -0.5 */ + setFixed(PhysReg{241}); + else if (v == 0x3FF0000000000000) /* 1.0 */ + setFixed(PhysReg{242}); + else if (v == 0xBFF0000000000000) /* -1.0 */ + setFixed(PhysReg{243}); + else if (v == 0x4000000000000000) /* 2.0 */ + setFixed(PhysReg{244}); + else if (v == 0xC000000000000000) /* -2.0 */ + setFixed(PhysReg{245}); + else if (v == 0x4010000000000000) /* 4.0 */ + setFixed(PhysReg{246}); + else if (v == 0xC010000000000000) /* -4.0 */ + setFixed(PhysReg{247}); + else if (v == 0x3fc45f306dc9c882) /* 1/(2*PI) */ + setFixed(PhysReg{248}); + else { /* Literal Constant: we don't know if it is a long or double.*/ + isConstant_ = 0; + assert(false && "attempt to create a 64-bit literal constant"); + } + }; + explicit Operand(RegClass type) noexcept + { + isUndef_ = true; + data_.temp = Temp(0, type); + setFixed(PhysReg{128}); + }; + explicit Operand(PhysReg reg, RegClass type) noexcept + { + data_.temp = Temp(0, type); + setFixed(reg); + } + + constexpr bool isTemp() const noexcept + { + return isTemp_; + } + + constexpr void setTemp(Temp t) noexcept { + assert(!isConstant_); + isTemp_ = true; + data_.temp = t; + } + + constexpr Temp getTemp() const noexcept + { + return data_.temp; + } + + constexpr uint32_t tempId() const noexcept + { + return data_.temp.id(); + } + + constexpr bool hasRegClass() const noexcept + { + return isTemp() || isUndefined(); + } + + constexpr RegClass regClass() const noexcept + { + return data_.temp.regClass(); + } + + constexpr unsigned size() const noexcept + { + if (isConstant()) + return is64BitConst_ ? 2 : 1; + else + return data_.temp.size(); + } + + constexpr bool isFixed() const noexcept + { + return isFixed_; + } + + constexpr PhysReg physReg() const noexcept + { + return reg_; + } + + constexpr void setFixed(PhysReg reg) noexcept + { + isFixed_ = reg != unsigned(-1); + reg_ = reg; + } + + constexpr bool isConstant() const noexcept + { + return isConstant_; + } + + constexpr bool isLiteral() const noexcept + { + return isConstant() && reg_ == 255; + } + + constexpr bool isUndefined() const noexcept + { + return isUndef_; + } + + constexpr uint32_t constantValue() const noexcept + { + return data_.i; + } + + constexpr bool constantEquals(uint32_t cmp) const noexcept + { + return isConstant() && constantValue() == cmp; + } + + constexpr void setKill(bool flag) noexcept + { + isKill_ = flag; + if (!flag) + setFirstKill(false); + } + + constexpr bool isKill() const noexcept + { + return isKill_ || isFirstKill(); + } + + constexpr void setFirstKill(bool flag) noexcept + { + isFirstKill_ = flag; + if (flag) + setKill(flag); + } + + /* When there are multiple operands killing the same temporary, + * isFirstKill() is only returns true for the first one. */ + constexpr bool isFirstKill() const noexcept + { + return isFirstKill_; + } + +private: + union { + uint32_t i; + float f; + Temp temp = Temp(0, s1); + } data_; + PhysReg reg_; + union { + struct { + uint8_t isTemp_:1; + uint8_t isFixed_:1; + uint8_t isConstant_:1; + uint8_t isKill_:1; + uint8_t isUndef_:1; + uint8_t isFirstKill_:1; + uint8_t is64BitConst_:1; + }; + /* can't initialize bit-fields in c++11, so work around using a union */ + uint8_t control_ = 0; + }; +}; + +/** + * Definition Class + * Definitions are the results of Instructions + * and refer to temporary virtual registers + * which are later mapped to physical registers + */ +class Definition final +{ +public: + constexpr Definition() : temp(Temp(0, s1)), reg_(0), isFixed_(0), hasHint_(0), isKill_(0) {} + Definition(uint32_t index, RegClass type) noexcept + : temp(index, type) {} + explicit Definition(Temp tmp) noexcept + : temp(tmp) {} + Definition(PhysReg reg, RegClass type) noexcept + : temp(Temp(0, type)) + { + setFixed(reg); + } + Definition(uint32_t tmpId, PhysReg reg, RegClass type) noexcept + : temp(Temp(tmpId, type)) + { + setFixed(reg); + } + + constexpr bool isTemp() const noexcept + { + return tempId() > 0; + } + + constexpr Temp getTemp() const noexcept + { + return temp; + } + + constexpr uint32_t tempId() const noexcept + { + return temp.id(); + } + + constexpr void setTemp(Temp t) noexcept { + temp = t; + } + + constexpr RegClass regClass() const noexcept + { + return temp.regClass(); + } + + constexpr unsigned size() const noexcept + { + return temp.size(); + } + + constexpr bool isFixed() const noexcept + { + return isFixed_; + } + + constexpr PhysReg physReg() const noexcept + { + return reg_; + } + + constexpr void setFixed(PhysReg reg) noexcept + { + isFixed_ = 1; + reg_ = reg; + } + + constexpr void setHint(PhysReg reg) noexcept + { + hasHint_ = 1; + reg_ = reg; + } + + constexpr bool hasHint() const noexcept + { + return hasHint_; + } + + constexpr void setKill(bool flag) noexcept + { + isKill_ = flag; + } + + constexpr bool isKill() const noexcept + { + return isKill_; + } + +private: + Temp temp = Temp(0, s1); + PhysReg reg_; + union { + struct { + uint8_t isFixed_:1; + uint8_t hasHint_:1; + uint8_t isKill_:1; + }; + /* can't initialize bit-fields in c++11, so work around using a union */ + uint8_t control_ = 0; + }; +}; + +class Block; + +struct Instruction { + aco_opcode opcode; + Format format; + + aco::span<Operand> operands; + aco::span<Definition> definitions; + + constexpr bool isVALU() const noexcept + { + return ((uint16_t) format & (uint16_t) Format::VOP1) == (uint16_t) Format::VOP1 + || ((uint16_t) format & (uint16_t) Format::VOP2) == (uint16_t) Format::VOP2 + || ((uint16_t) format & (uint16_t) Format::VOPC) == (uint16_t) Format::VOPC + || ((uint16_t) format & (uint16_t) Format::VOP3A) == (uint16_t) Format::VOP3A + || ((uint16_t) format & (uint16_t) Format::VOP3B) == (uint16_t) Format::VOP3B + || ((uint16_t) format & (uint16_t) Format::VOP3P) == (uint16_t) Format::VOP3P; + } + + constexpr bool isSALU() const noexcept + { + return format == Format::SOP1 || + format == Format::SOP2 || + format == Format::SOPC || + format == Format::SOPK || + format == Format::SOPP; + } + + constexpr bool isVMEM() const noexcept + { + return format == Format::MTBUF || + format == Format::MUBUF || + format == Format::MIMG; + } + + constexpr bool isDPP() const noexcept + { + return (uint16_t) format & (uint16_t) Format::DPP; + } + + constexpr bool isVOP3() const noexcept + { + return ((uint16_t) format & (uint16_t) Format::VOP3A) || + ((uint16_t) format & (uint16_t) Format::VOP3B) || + format == Format::VOP3P; + } + + constexpr bool isSDWA() const noexcept + { + return (uint16_t) format & (uint16_t) Format::SDWA; + } + + constexpr bool isFlatOrGlobal() const noexcept + { + return format == Format::FLAT || format == Format::GLOBAL; + } +}; + +struct SOPK_instruction : public Instruction { + uint16_t imm; +}; + +struct SOPP_instruction : public Instruction { + uint32_t imm; + int block; +}; + +struct SOPC_instruction : public Instruction { +}; + +struct SOP1_instruction : public Instruction { +}; + +struct SOP2_instruction : public Instruction { +}; + +/** + * Scalar Memory Format: + * For s_(buffer_)load_dword*: + * Operand(0): SBASE - SGPR-pair which provides base address + * Operand(1): Offset - immediate (un)signed offset or SGPR + * Operand(2) / Definition(0): SDATA - SGPR for read / write result + * Operand(n-1): SOffset - SGPR offset (Vega only) + * + * Having no operands is also valid for instructions such as s_dcache_inv. + * + */ +struct SMEM_instruction : public Instruction { + bool glc; /* VI+: globally coherent */ + bool dlc; /* NAVI: device level coherent */ + bool nv; /* VEGA only: Non-volatile */ + bool can_reorder; + bool disable_wqm; + barrier_interaction barrier; +}; + +struct VOP1_instruction : public Instruction { +}; + +struct VOP2_instruction : public Instruction { +}; + +struct VOPC_instruction : public Instruction { +}; + +struct VOP3A_instruction : public Instruction { + bool abs[3]; + bool opsel[3]; + bool clamp; + unsigned omod; + bool neg[3]; +}; + +/** + * Data Parallel Primitives Format: + * This format can be used for VOP1, VOP2 or VOPC instructions. + * The swizzle applies to the src0 operand. + * + */ +struct DPP_instruction : public Instruction { + uint16_t dpp_ctrl; + uint8_t row_mask; + uint8_t bank_mask; + bool abs[2]; + bool neg[2]; + bool bound_ctrl; +}; + +struct Interp_instruction : public Instruction { + unsigned attribute; + unsigned component; +}; + +/** + * Local and Global Data Sharing instructions + * Operand(0): ADDR - VGPR which supplies the address. + * Operand(1): DATA0 - First data VGPR. + * Operand(2): DATA1 - Second data VGPR. + * Operand(n-1): M0 - LDS size. + * Definition(0): VDST - Destination VGPR when results returned to VGPRs. + * + */ +struct DS_instruction : public Instruction { + int16_t offset0; + int8_t offset1; + bool gds; +}; + +/** + * Vector Memory Untyped-buffer Instructions + * Operand(0): VADDR - Address source. Can carry an index and/or offset + * Operand(1): SRSRC - Specifies which SGPR supplies T# (resource constant) + * Operand(2): SOFFSET - SGPR to supply unsigned byte offset. (SGPR, M0, or inline constant) + * Operand(3) / Definition(0): VDATA - Vector GPR for write result / read data + * + */ +struct MUBUF_instruction : public Instruction { + unsigned offset; /* Unsigned byte offset - 12 bit */ + bool offen; /* Supply an offset from VGPR (VADDR) */ + bool idxen; /* Supply an index from VGPR (VADDR) */ + bool glc; /* globally coherent */ + bool dlc; /* NAVI: device level coherent */ + bool slc; /* system level coherent */ + bool tfe; /* texture fail enable */ + bool lds; /* Return read-data to LDS instead of VGPRs */ + bool disable_wqm; /* Require an exec mask without helper invocations */ + bool can_reorder; + barrier_interaction barrier; +}; + +/** + * Vector Memory Typed-buffer Instructions + * Operand(0): VADDR - Address source. Can carry an index and/or offset + * Operand(1): SRSRC - Specifies which SGPR supplies T# (resource constant) + * Operand(2): SOFFSET - SGPR to supply unsigned byte offset. (SGPR, M0, or inline constant) + * Operand(3) / Definition(0): VDATA - Vector GPR for write result / read data + * + */ +struct MTBUF_instruction : public Instruction { + union { + struct { + uint8_t dfmt : 4; /* Data Format of data in memory buffer */ + uint8_t nfmt : 3; /* Numeric format of data in memory */ + }; + uint8_t img_format; /* Buffer or image format as used by GFX10 */ + }; + unsigned offset; /* Unsigned byte offset - 12 bit */ + bool offen; /* Supply an offset from VGPR (VADDR) */ + bool idxen; /* Supply an index from VGPR (VADDR) */ + bool glc; /* globally coherent */ + bool dlc; /* NAVI: device level coherent */ + bool slc; /* system level coherent */ + bool tfe; /* texture fail enable */ + bool disable_wqm; /* Require an exec mask without helper invocations */ + bool can_reorder; + barrier_interaction barrier; +}; + +/** + * Vector Memory Image Instructions + * Operand(0): VADDR - Address source. Can carry an offset or an index. + * Operand(1): SRSRC - Scalar GPR that specifies the resource constant. + * Operand(2): SSAMP - Scalar GPR that specifies sampler constant. + * Operand(3) / Definition(0): VDATA - Vector GPR for read / write result. + * + */ +struct MIMG_instruction : public Instruction { + unsigned dmask; /* Data VGPR enable mask */ + bool unrm; /* Force address to be un-normalized */ + bool dlc; /* NAVI: device level coherent */ + bool glc; /* globally coherent */ + bool slc; /* system level coherent */ + bool tfe; /* texture fail enable */ + bool da; /* declare an array */ + bool lwe; /* Force data to be un-normalized */ + bool r128; /* NAVI: Texture resource size */ + bool a16; /* VEGA, NAVI: Address components are 16-bits */ + bool d16; /* Convert 32-bit data to 16-bit data */ + bool disable_wqm; /* Require an exec mask without helper invocations */ + bool can_reorder; + barrier_interaction barrier; +}; + +/** + * Flat/Scratch/Global Instructions + * Operand(0): ADDR + * Operand(1): SADDR + * Operand(2) / Definition(0): DATA/VDST + * + */ +struct FLAT_instruction : public Instruction { + uint16_t offset; /* Vega only */ + bool slc; + bool glc; + bool lds; + bool nv; +}; + +struct Export_instruction : public Instruction { + unsigned enabled_mask; + unsigned dest; + bool compressed; + bool done; + bool valid_mask; +}; + +struct Pseudo_instruction : public Instruction { + bool tmp_in_scc; + PhysReg scratch_sgpr; /* might not be valid if it's not needed */ +}; + +struct Pseudo_branch_instruction : public Instruction { + /* target[0] is the block index of the branch target. + * For conditional branches, target[1] contains the fall-through alternative. + * A value of 0 means the target has not been initialized (BB0 cannot be a branch target). + */ + uint32_t target[2]; +}; + +struct Pseudo_barrier_instruction : public Instruction { +}; + +enum ReduceOp { + iadd32, iadd64, + imul32, imul64, + fadd32, fadd64, + fmul32, fmul64, + imin32, imin64, + imax32, imax64, + umin32, umin64, + umax32, umax64, + fmin32, fmin64, + fmax32, fmax64, + iand32, iand64, + ior32, ior64, + ixor32, ixor64, +}; + +/** + * Subgroup Reduction Instructions, everything except for the data to be + * reduced and the result as inserted by setup_reduce_temp(). + * Operand(0): data to be reduced + * Operand(1): reduce temporary + * Operand(2): vector temporary + * Definition(0): result + * Definition(1): scalar temporary + * Definition(2): scalar identity temporary + * Definition(3): scc clobber + * Definition(4): vcc clobber + * + */ +struct Pseudo_reduction_instruction : public Instruction { + ReduceOp reduce_op; + unsigned cluster_size; // must be 0 for scans +}; + +struct instr_deleter_functor { + void operator()(void* p) { + free(p); + } +}; + +template<typename T> +using aco_ptr = std::unique_ptr<T, instr_deleter_functor>; + +template<typename T> +T* create_instruction(aco_opcode opcode, Format format, uint32_t num_operands, uint32_t num_definitions) +{ + std::size_t size = sizeof(T) + num_operands * sizeof(Operand) + num_definitions * sizeof(Definition); + char *data = (char*) calloc(1, size); + T* inst = (T*) data; + + inst->opcode = opcode; + inst->format = format; + + inst->operands = aco::span<Operand>((Operand*)(data + sizeof(T)), num_operands); + inst->definitions = aco::span<Definition>((Definition*)inst->operands.end(), num_definitions); + + return inst; +} + +constexpr bool is_phi(Instruction* instr) +{ + return instr->opcode == aco_opcode::p_phi || instr->opcode == aco_opcode::p_linear_phi; +} + +static inline bool is_phi(aco_ptr<Instruction>& instr) +{ + return is_phi(instr.get()); +} + +constexpr barrier_interaction get_barrier_interaction(Instruction* instr) +{ + switch (instr->format) { + case Format::SMEM: + return static_cast<SMEM_instruction*>(instr)->barrier; + case Format::MUBUF: + return static_cast<MUBUF_instruction*>(instr)->barrier; + case Format::MIMG: + return static_cast<MIMG_instruction*>(instr)->barrier; + case Format::FLAT: + case Format::GLOBAL: + return barrier_buffer; + case Format::DS: + return barrier_shared; + default: + return barrier_none; + } +} + +enum block_kind { + /* uniform indicates that leaving this block, + * all actives lanes stay active */ + block_kind_uniform = 1 << 0, + block_kind_top_level = 1 << 1, + block_kind_loop_preheader = 1 << 2, + block_kind_loop_header = 1 << 3, + block_kind_loop_exit = 1 << 4, + block_kind_continue = 1 << 5, + block_kind_break = 1 << 6, + block_kind_continue_or_break = 1 << 7, + block_kind_discard = 1 << 8, + block_kind_branch = 1 << 9, + block_kind_merge = 1 << 10, + block_kind_invert = 1 << 11, + block_kind_uses_discard_if = 1 << 12, + block_kind_needs_lowering = 1 << 13, +}; + + +struct RegisterDemand { + constexpr RegisterDemand() = default; + constexpr RegisterDemand(const int16_t v, const int16_t s) noexcept + : vgpr{v}, sgpr{s} {} + int16_t vgpr = 0; + int16_t sgpr = 0; + + constexpr friend bool operator==(const RegisterDemand a, const RegisterDemand b) noexcept { + return a.vgpr == b.vgpr && a.sgpr == b.sgpr; + } + + constexpr bool exceeds(const RegisterDemand other) const noexcept { + return vgpr > other.vgpr || sgpr > other.sgpr; + } + + constexpr RegisterDemand operator+(const Temp t) const noexcept { + if (t.type() == RegType::sgpr) + return RegisterDemand( vgpr, sgpr + t.size() ); + else + return RegisterDemand( vgpr + t.size(), sgpr ); + } + + constexpr RegisterDemand operator+(const RegisterDemand other) const noexcept { + return RegisterDemand(vgpr + other.vgpr, sgpr + other.sgpr); + } + + constexpr RegisterDemand operator-(const RegisterDemand other) const noexcept { + return RegisterDemand(vgpr - other.vgpr, sgpr - other.sgpr); + } + + constexpr RegisterDemand& operator+=(const RegisterDemand other) noexcept { + vgpr += other.vgpr; + sgpr += other.sgpr; + return *this; + } + + constexpr RegisterDemand& operator-=(const RegisterDemand other) noexcept { + vgpr -= other.vgpr; + sgpr -= other.sgpr; + return *this; + } + + constexpr RegisterDemand& operator+=(const Temp t) noexcept { + if (t.type() == RegType::sgpr) + sgpr += t.size(); + else + vgpr += t.size(); + return *this; + } + + constexpr RegisterDemand& operator-=(const Temp t) noexcept { + if (t.type() == RegType::sgpr) + sgpr -= t.size(); + else + vgpr -= t.size(); + return *this; + } + + constexpr void update(const RegisterDemand other) noexcept { + vgpr = std::max(vgpr, other.vgpr); + sgpr = std::max(sgpr, other.sgpr); + } + +}; + +/* CFG */ +struct Block { + unsigned index; + unsigned offset = 0; + std::vector<aco_ptr<Instruction>> instructions; + std::vector<unsigned> logical_preds; + std::vector<unsigned> linear_preds; + std::vector<unsigned> logical_succs; + std::vector<unsigned> linear_succs; + RegisterDemand register_demand = RegisterDemand(); + uint16_t loop_nest_depth = 0; + uint16_t kind = 0; + int logical_idom = -1; + int linear_idom = -1; + Temp live_out_exec = Temp(); + + /* this information is needed for predecessors to blocks with phis when + * moving out of ssa */ + bool scc_live_out = false; + PhysReg scratch_sgpr = PhysReg(); /* only needs to be valid if scc_live_out != false */ + + Block(unsigned idx) : index(idx) {} + Block() : index(0) {} +}; + +using Stage = uint16_t; + +/* software stages */ +static constexpr Stage sw_vs = 1 << 0; +static constexpr Stage sw_gs = 1 << 1; +static constexpr Stage sw_tcs = 1 << 2; +static constexpr Stage sw_tes = 1 << 3; +static constexpr Stage sw_fs = 1 << 4; +static constexpr Stage sw_cs = 1 << 5; +static constexpr Stage sw_mask = 0x3f; + +/* hardware stages (can't be OR'd, just a mask for convenience when testing multiple) */ +static constexpr Stage hw_vs = 1 << 6; +static constexpr Stage hw_es = 1 << 7; +static constexpr Stage hw_gs = 1 << 8; /* not on GFX9. combined into ES on GFX9 (and GFX10/legacy). */ +static constexpr Stage hw_ls = 1 << 9; +static constexpr Stage hw_hs = 1 << 10; /* not on GFX9. combined into LS on GFX9 (and GFX10/legacy). */ +static constexpr Stage hw_fs = 1 << 11; +static constexpr Stage hw_cs = 1 << 12; +static constexpr Stage hw_mask = 0x7f << 6; + +/* possible settings of Program::stage */ +static constexpr Stage vertex_vs = sw_vs | hw_vs; +static constexpr Stage fragment_fs = sw_fs | hw_fs; +static constexpr Stage compute_cs = sw_cs | hw_cs; +static constexpr Stage tess_eval_vs = sw_tes | hw_vs; +/* GFX10/NGG */ +static constexpr Stage ngg_vertex_gs = sw_vs | hw_gs; +static constexpr Stage ngg_vertex_geometry_gs = sw_vs | sw_gs | hw_gs; +static constexpr Stage ngg_tess_eval_geometry_gs = sw_tes | sw_gs | hw_gs; +static constexpr Stage ngg_vertex_tess_control_hs = sw_vs | sw_tcs | hw_hs; +/* GFX9 (and GFX10 if NGG isn't used) */ +static constexpr Stage vertex_geometry_es = sw_vs | sw_gs | hw_es; +static constexpr Stage vertex_tess_control_ls = sw_vs | sw_tcs | hw_ls; +static constexpr Stage tess_eval_geometry_es = sw_tes | sw_gs | hw_es; +/* pre-GFX9 */ +static constexpr Stage vertex_ls = sw_vs | hw_ls; /* vertex before tesselation control */ +static constexpr Stage tess_control_hs = sw_tcs | hw_hs; +static constexpr Stage tess_eval_es = sw_tes | hw_gs; /* tesselation evaluation before GS */ +static constexpr Stage geometry_gs = sw_gs | hw_gs; + +class Program final { +public: + std::vector<Block> blocks; + RegisterDemand max_reg_demand = RegisterDemand(); + uint16_t sgpr_limit = 0; + uint16_t num_waves = 0; + ac_shader_config* config; + struct radv_shader_info *info; + enum chip_class chip_class; + enum radeon_family family; + Stage stage; /* Stage */ + bool needs_exact = false; /* there exists an instruction with disable_wqm = true */ + bool needs_wqm = false; /* there exists a p_wqm instruction */ + bool wb_smem_l1_on_end = false; + + std::vector<uint8_t> constant_data; + + uint32_t allocateId() + { + assert(allocationID <= 16777215); + return allocationID++; + } + + uint32_t peekAllocationId() + { + return allocationID; + } + + void setAllocationId(uint32_t id) + { + allocationID = id; + } + + Block* create_and_insert_block() { + blocks.emplace_back(blocks.size()); + return &blocks.back(); + } + + Block* insert_block(Block&& block) { + block.index = blocks.size(); + blocks.emplace_back(std::move(block)); + return &blocks.back(); + } + +private: + uint32_t allocationID = 1; +}; + +struct live { + /* live temps out per block */ + std::vector<std::set<Temp>> live_out; + /* register demand (sgpr/vgpr) per instruction per block */ + std::vector<std::vector<RegisterDemand>> register_demand; +}; + +void select_program(Program *program, + unsigned shader_count, + struct nir_shader *const *shaders, + ac_shader_config* config, + struct radv_shader_info *info, + struct radv_nir_compiler_options *options); + +void lower_wqm(Program* program, live& live_vars, + const struct radv_nir_compiler_options *options); +void lower_bool_phis(Program* program); +void update_vgpr_sgpr_demand(Program* program, const RegisterDemand new_demand); +live live_var_analysis(Program* program, const struct radv_nir_compiler_options *options); +std::vector<uint16_t> dead_code_analysis(Program *program); +void dominator_tree(Program* program); +void insert_exec_mask(Program *program); +void value_numbering(Program* program); +void optimize(Program* program); +void setup_reduce_temp(Program* program); +void lower_to_cssa(Program* program, live& live_vars, const struct radv_nir_compiler_options *options); +void register_allocation(Program *program, std::vector<std::set<Temp>> live_out_per_block); +void ssa_elimination(Program* program); +void lower_to_hw_instr(Program* program); +void schedule_program(Program* program, live& live_vars); +void spill(Program* program, live& live_vars, const struct radv_nir_compiler_options *options); +void insert_wait_states(Program* program); +void insert_NOPs(Program* program); +unsigned emit_program(Program* program, std::vector<uint32_t>& code); +void print_asm(Program *program, std::vector<uint32_t>& binary, unsigned exec_size, + enum radeon_family family, std::ostream& out); +void validate(Program* program, FILE *output); +bool validate_ra(Program* program, const struct radv_nir_compiler_options *options, FILE *output); +#ifndef NDEBUG +void perfwarn(bool cond, const char *msg, Instruction *instr=NULL); +#else +#define perfwarn(program, cond, msg, ...) +#endif + +void aco_print_instr(Instruction *instr, FILE *output); +void aco_print_program(Program *program, FILE *output); + +typedef struct { + const int16_t opcode_gfx9[static_cast<int>(aco_opcode::num_opcodes)]; + const int16_t opcode_gfx10[static_cast<int>(aco_opcode::num_opcodes)]; + const std::bitset<static_cast<int>(aco_opcode::num_opcodes)> can_use_input_modifiers; + const std::bitset<static_cast<int>(aco_opcode::num_opcodes)> can_use_output_modifiers; + const char *name[static_cast<int>(aco_opcode::num_opcodes)]; + const aco::Format format[static_cast<int>(aco_opcode::num_opcodes)]; +} Info; + +extern const Info instr_info; + +} + +#endif /* ACO_IR_H */ + |