diff options
author | Christoph Bumiller <[email protected]> | 2011-09-14 16:18:23 +0200 |
---|---|---|
committer | Christoph Bumiller <[email protected]> | 2011-09-14 16:19:52 +0200 |
commit | 57594065c30feec9376be9b2132659f7d87362ee (patch) | |
tree | 7e6808e0c5240b513851b7925c5be6678663b5e5 /src/gallium/drivers/nvc0/codegen | |
parent | a42eca84c56f6860e67c0c57f4765a5530cc5f81 (diff) |
nv50/ir: import new shader backend code
Diffstat (limited to 'src/gallium/drivers/nvc0/codegen')
4 files changed, 3033 insertions, 0 deletions
diff --git a/src/gallium/drivers/nvc0/codegen/nv50_ir_emit_nvc0.cpp b/src/gallium/drivers/nvc0/codegen/nv50_ir_emit_nvc0.cpp new file mode 100644 index 00000000000..2ab06f426e5 --- /dev/null +++ b/src/gallium/drivers/nvc0/codegen/nv50_ir_emit_nvc0.cpp @@ -0,0 +1,1714 @@ + +#include "nv50_ir_target_nvc0.h" + +namespace nv50_ir { + +// Argh, all these assertions ... + +class CodeEmitterNVC0 : public CodeEmitter +{ +public: + CodeEmitterNVC0(const TargetNVC0 *); + + virtual bool emitInstruction(Instruction *); + virtual uint32_t getMinEncodingSize(const Instruction *) const; + + inline void setProgramType(Program::Type pType) { progType = pType; } + +private: + const TargetNVC0 *targ; + + Program::Type progType; + +private: + void emitForm_A(const Instruction *, uint64_t); + void emitForm_B(const Instruction *, uint64_t); + void emitForm_S(const Instruction *, uint32_t, bool pred); + + void emitPredicate(const Instruction *); + + void setAddress16(const ValueRef&); + void setImmediate(const Instruction *, const int s); // needs op already set + void setImmediateS8(const ValueRef&); + + void emitCondCode(CondCode cc, int pos); + void emitInterpMode(const Instruction *); + void emitLoadStoreType(DataType ty); + void emitCachingMode(CacheMode c); + + void emitShortSrc2(const ValueRef&); + + inline uint8_t getSRegEncoding(const ValueRef&); + + void roundMode_A(const Instruction *); + void roundMode_C(const Instruction *); + void roundMode_CS(const Instruction *); + + void emitNegAbs12(const Instruction *); + + void emitNOP(const Instruction *); + + void emitLOAD(const Instruction *); + void emitSTORE(const Instruction *); + void emitMOV(const Instruction *); + + void emitINTERP(const Instruction *); + void emitPFETCH(const Instruction *); + void emitVFETCH(const Instruction *); + void emitEXPORT(const Instruction *); + void emitOUT(const Instruction *); + + void emitUADD(const Instruction *); + void emitFADD(const Instruction *); + void emitUMUL(const Instruction *); + void emitFMUL(const Instruction *); + void emitIMAD(const Instruction *); + void emitFMAD(const Instruction *); + + void emitNOT(Instruction *); + void emitLogicOp(const Instruction *, uint8_t subOp); + void emitPOPC(const Instruction *); + void emitINSBF(const Instruction *); + void emitShift(const Instruction *); + + void emitSFnOp(const Instruction *, uint8_t subOp); + + void emitCVT(Instruction *); + void emitMINMAX(const Instruction *); + void emitPreOp(const Instruction *); + + void emitSET(const CmpInstruction *); + void emitSLCT(const CmpInstruction *); + void emitSELP(const Instruction *); + + void emitTEX(const TexInstruction *); + void emitTEXCSAA(const TexInstruction *); + void emitTXQ(const TexInstruction *); + void emitPIXLD(const TexInstruction *); + + void emitQUADOP(const Instruction *, uint8_t qOp, uint8_t laneMask); + + void emitFlow(const Instruction *); + + inline void defId(const ValueDef&, const int pos); + inline void srcId(const ValueRef&, const int pos); + + inline void srcAddr32(const ValueRef&, const int pos); // address / 4 + + inline void srcId(const ValueRef *, const int pos); + + inline bool isLIMM(const ValueRef&, DataType ty); +}; + +// for better visibility +#define HEX64(h, l) 0x##h##l##ULL + +#define SDATA(a) ((a).rep()->reg.data) +#define DDATA(a) ((a).rep()->reg.data) + +void CodeEmitterNVC0::srcId(const ValueRef& src, const int pos) +{ + code[pos / 32] |= (src.get() ? SDATA(src).id : 63) << (pos % 32); +} + +void CodeEmitterNVC0::srcId(const ValueRef *src, const int pos) +{ + code[pos / 32] |= (src ? SDATA(*src).id : 63) << (pos % 32); +} + +void CodeEmitterNVC0::srcAddr32(const ValueRef& src, const int pos) +{ + code[pos / 32] |= (SDATA(src).offset >> 2) << (pos % 32); +} + +void CodeEmitterNVC0::defId(const ValueDef& def, const int pos) +{ + code[pos / 32] |= (def.get() ? DDATA(def).id : 63) << (pos % 32); +} + +bool CodeEmitterNVC0::isLIMM(const ValueRef& ref, DataType ty) +{ + const ImmediateValue *imm = ref.get()->asImm(); + + return imm && (imm->reg.data.u32 & ((ty == TYPE_F32) ? 0xfff : 0xfff00000)); +} + +void +CodeEmitterNVC0::roundMode_A(const Instruction *insn) +{ + switch (insn->rnd) { + case ROUND_M: code[1] |= 1 << 23; break; + case ROUND_P: code[1] |= 2 << 23; break; + case ROUND_Z: code[1] |= 3 << 23; break; + default: + assert(insn->rnd == ROUND_N); + break; + } +} + +void +CodeEmitterNVC0::emitNegAbs12(const Instruction *i) +{ + if (i->src[1].mod.abs()) code[0] |= 1 << 6; + if (i->src[0].mod.abs()) code[0] |= 1 << 7; + if (i->src[1].mod.neg()) code[0] |= 1 << 8; + if (i->src[0].mod.neg()) code[0] |= 1 << 9; +} + +void CodeEmitterNVC0::emitCondCode(CondCode cc, int pos) +{ + uint8_t val; + + switch (cc) { + case CC_LT: val = 0x1; break; + case CC_LTU: val = 0x9; break; + case CC_EQ: val = 0x2; break; + case CC_EQU: val = 0xa; break; + case CC_LE: val = 0x3; break; + case CC_LEU: val = 0xb; break; + case CC_GT: val = 0x4; break; + case CC_GTU: val = 0xc; break; + case CC_NE: val = 0x5; break; + case CC_NEU: val = 0xd; break; + case CC_GE: val = 0x6; break; + case CC_GEU: val = 0xe; break; + case CC_TR: val = 0xf; break; + case CC_FL: val = 0x0; break; + + case CC_A: val = 0x14; break; + case CC_NA: val = 0x13; break; + case CC_S: val = 0x15; break; + case CC_NS: val = 0x12; break; + case CC_C: val = 0x16; break; + case CC_NC: val = 0x11; break; + case CC_O: val = 0x17; break; + case CC_NO: val = 0x10; break; + + default: + val = 0; + assert(!"invalid condition code"); + break; + } + code[pos / 32] |= val << (pos % 32); +} + +void +CodeEmitterNVC0::emitPredicate(const Instruction *i) +{ + if (i->predSrc >= 0) { + assert(i->getPredicate()->reg.file == FILE_PREDICATE); + srcId(i->src[i->predSrc], 10); + if (i->cc == CC_NOT_P) + code[0] |= 0x2000; // negate + } else { + code[0] |= 0x1c00; + } +} + +void +CodeEmitterNVC0::setAddress16(const ValueRef& src) +{ + Symbol *sym = src.get()->asSym(); + + assert(sym); + + code[0] |= (sym->reg.data.offset & 0x003f) << 26; + code[1] |= (sym->reg.data.offset & 0xffc0) >> 6; +} + +void +CodeEmitterNVC0::setImmediate(const Instruction *i, const int s) +{ + const ImmediateValue *imm = i->src[s].get()->asImm(); + uint32_t u32; + + assert(imm); + u32 = imm->reg.data.u32; + + if ((code[0] & 0xf) == 0x2) { + // LIMM + code[0] |= (u32 & 0x3f) << 26; + code[1] |= u32 >> 6; + } else + if ((code[0] & 0xf) == 0x3 || (code[0] & 0xf) == 4) { + // integer immediate + assert((u32 & 0xfff00000) == 0 || (u32 & 0xfff00000) == 0xfff00000); + assert(!(code[1] & 0xc000)); + u32 &= 0xfffff; + code[0] |= (u32 & 0x3f) << 26; + code[1] |= 0xc000 | (u32 >> 6); + } else { + // float immediate + assert(!(u32 & 0x00000fff)); + assert(!(code[1] & 0xc000)); + code[0] |= ((u32 >> 12) & 0x3f) << 26; + code[1] |= 0xc000 | (u32 >> 18); + } +} + +void CodeEmitterNVC0::setImmediateS8(const ValueRef &ref) +{ + const ImmediateValue *imm = ref.get()->asImm(); + + int8_t s8 = static_cast<int8_t>(imm->reg.data.s32); + + assert(s8 == imm->reg.data.s32); + + code[0] |= (s8 & 0x3f) << 26; + code[0] |= (s8 >> 6) << 8; +} + +void +CodeEmitterNVC0::emitForm_A(const Instruction *i, uint64_t opc) +{ + code[0] = opc; + code[1] = opc >> 32; + + emitPredicate(i); + + defId(i->def[0], 14); + + int s1 = 26; + if (i->srcExists(2) && i->getSrc(2)->reg.file == FILE_MEMORY_CONST) + s1 = 49; + + for (int s = 0; s < 3 && i->srcExists(s); ++s) { + switch (i->getSrc(s)->reg.file) { + case FILE_MEMORY_CONST: + assert(!(code[1] & 0xc000)); + code[1] |= (s == 2) ? 0x8000 : 0x4000; + code[1] |= i->getSrc(s)->reg.fileIndex << 10; + setAddress16(i->src[s]); + break; + case FILE_IMMEDIATE: + assert(s == 1 || + i->op == OP_MOV || i->op == OP_PRESIN || i->op == OP_PREEX2); + assert(!(code[1] & 0xc000)); + setImmediate(i, s); + break; + case FILE_GPR: + if ((s == 2) && ((code[0] & 0x7) == 2)) // LIMM: 3rd src == dst + break; + srcId(i->src[s], s ? ((s == 2) ? 49 : s1) : 20); + break; + default: + // ignore here, can be predicate or flags, but must not be address + break; + } + } +} + +void +CodeEmitterNVC0::emitForm_B(const Instruction *i, uint64_t opc) +{ + code[0] = opc; + code[1] = opc >> 32; + + emitPredicate(i); + + defId(i->def[0], 14); + + switch (i->src[0].getFile()) { + case FILE_MEMORY_CONST: + assert(!(code[1] & 0xc000)); + code[1] |= 0x4000 | (i->src[0].get()->reg.fileIndex << 10); + setAddress16(i->src[0]); + break; + case FILE_IMMEDIATE: + assert(!(code[1] & 0xc000)); + setImmediate(i, 0); + break; + case FILE_GPR: + srcId(i->src[0], 26); + break; + default: + // ignore here, can be predicate or flags, but must not be address + break; + } +} + +void +CodeEmitterNVC0::emitForm_S(const Instruction *i, uint32_t opc, bool pred) +{ + code[0] = opc; + + int ss2a = 0; + if (opc == 0x0d || opc == 0x0e) + ss2a = 2; + + defId(i->def[0], 14); + srcId(i->src[0], 20); + + assert(pred || (i->predSrc < 0)); + if (pred) + emitPredicate(i); + + for (int s = 1; s < 3 && i->srcExists(s); ++s) { + if (i->src[s].get()->reg.file == FILE_MEMORY_CONST) { + assert(!(code[0] & (0x300 >> ss2a))); + switch (i->src[s].get()->reg.fileIndex) { + case 0: code[0] |= 0x100 >> ss2a; break; + case 1: code[0] |= 0x200 >> ss2a; break; + case 16: code[0] |= 0x300 >> ss2a; break; + default: + ERROR("invalid c[] space for short form\n"); + break; + } + if (s == 1) + code[0] |= i->getSrc(s)->reg.data.offset << 24; + else + code[0] |= i->getSrc(s)->reg.data.offset << 6; + } else + if (i->src[s].getFile() == FILE_IMMEDIATE) { + assert(s == 1); + setImmediateS8(i->src[s]); + } else + if (i->src[s].getFile() == FILE_GPR) { + srcId(i->src[s], (s == 1) ? 26 : 8); + } + } +} + +void +CodeEmitterNVC0::emitShortSrc2(const ValueRef &src) +{ + if (src.getFile() == FILE_MEMORY_CONST) { + switch (src.get()->reg.fileIndex) { + case 0: code[0] |= 0x100; break; + case 1: code[0] |= 0x200; break; + case 16: code[0] |= 0x300; break; + default: + assert(!"unsupported file index for short op"); + break; + } + srcAddr32(src, 20); + } else { + srcId(src, 20); + assert(src.getFile() == FILE_GPR); + } +} + +void +CodeEmitterNVC0::emitNOP(const Instruction *i) +{ + code[0] = 0x000001e4; + code[1] = 0x40000000; + emitPredicate(i); +} + +void +CodeEmitterNVC0::emitFMAD(const Instruction *i) +{ + bool neg1 = (i->src[0].mod ^ i->src[1].mod).neg(); + + if (i->encSize == 8) { + if (isLIMM(i->src[1], TYPE_F32)) { + emitForm_A(i, HEX64(20000000, 00000002)); + } else { + emitForm_A(i, HEX64(30000000, 00000000)); + + if (i->src[2].mod.neg()) + code[0] |= 1 << 8; + } + roundMode_A(i); + + if (neg1) + code[0] |= 1 << 9; + + if (i->saturate) + code[0] |= 1 << 5; + if (i->ftz) + code[0] |= 1 << 6; + } else { + assert(!i->saturate && !i->src[2].mod.neg()); + emitForm_S(i, (i->src[2].getFile() == FILE_MEMORY_CONST) ? 0x2e : 0x0e, + false); + if (neg1) + code[0] |= 1 << 4; + } +} + +void +CodeEmitterNVC0::emitFMUL(const Instruction *i) +{ + bool neg = (i->src[0].mod ^ i->src[1].mod).neg(); + + assert(i->postFactor >= -3 && i->postFactor <= 3); + + if (i->encSize == 8) { + if (isLIMM(i->src[1], TYPE_F32)) { + assert(i->postFactor == 0); // constant folded, hopefully + emitForm_A(i, HEX64(30000000, 00000002)); + } else { + emitForm_A(i, HEX64(58000000, 00000000)); + roundMode_A(i); + code[1] |= ((i->postFactor > 0) ? + (7 - i->postFactor) : (0 - i->postFactor)) << 17; + } + if (neg) + code[1] ^= 1 << 25; // aliases with LIMM sign bit + + if (i->saturate) + code[0] |= 1 << 5; + + if (i->dnz) + code[0] |= 1 << 7; + else + if (i->ftz) + code[0] |= 1 << 6; + } else { + assert(!neg && !i->saturate && !i->ftz && !i->postFactor); + emitForm_S(i, 0xa8, true); + } +} + +void +CodeEmitterNVC0::emitUMUL(const Instruction *i) +{ + if (i->encSize == 8) { + if (i->src[1].getFile() == FILE_IMMEDIATE) { + emitForm_A(i, HEX64(10000000, 00000002)); + } else { + emitForm_A(i, HEX64(50000000, 00000003)); + } + if (i->subOp == NV50_IR_SUBOP_MUL_HIGH) + code[0] |= 1 << 6; + if (i->sType == TYPE_S32) + code[0] |= 1 << 5; + if (i->dType == TYPE_S32) + code[0] |= 1 << 7; + } else { + emitForm_S(i, i->src[1].getFile() == FILE_IMMEDIATE ? 0xaa : 0x2a, true); + + if (i->sType == TYPE_S32) + code[0] |= 1 << 6; + } +} + +void +CodeEmitterNVC0::emitFADD(const Instruction *i) +{ + if (i->encSize == 8) { + if (isLIMM(i->src[1], TYPE_F32)) { + emitForm_A(i, HEX64(28000000, 00000002)); + + assert(!i->src[1].mod.neg() && !i->src[1].mod.abs() && !i->saturate); + } else { + emitForm_A(i, HEX64(50000000, 00000000)); + + roundMode_A(i); + if (i->saturate) + code[1] |= 1 << 17; + } + emitNegAbs12(i); + + if (i->op == OP_SUB) code[0] ^= 1 << 8; + + if (i->ftz) + code[0] |= 1 << 5; + } else { + assert(!i->saturate && i->op != OP_SUB && + !i->src[0].mod.abs() && + !i->src[1].mod.neg() && !i->src[1].mod.abs()); + + emitForm_S(i, 0x49, true); + + if (i->src[0].mod.neg()) + code[0] |= 1 << 7; + } +} + +void +CodeEmitterNVC0::emitUADD(const Instruction *i) +{ + uint32_t addOp = 0; + + assert(!i->src[0].mod.abs() && !i->src[1].mod.abs()); + assert(!i->src[0].mod.neg() || !i->src[1].mod.neg()); + + if (i->src[0].mod.neg()) + addOp |= 0x200; + if (i->src[1].mod.neg()) + addOp |= 0x100; + if (i->op == OP_SUB) { + addOp ^= 0x100; + assert(addOp != 0x300); // would be add-plus-one + } + + if (i->encSize == 8) { + if (isLIMM(i->src[1], TYPE_U32)) { + emitForm_A(i, HEX64(08000000, 00000002)); + if (i->def[1].exists()) + code[1] |= 1 << 26; // write carry + } else { + emitForm_A(i, HEX64(48000000, 00000003)); + if (i->def[1].exists()) + code[1] |= 1 << 16; // write carry + } + code[0] |= addOp; + + if (i->saturate) + code[0] |= 1 << 5; + if (i->flagsSrc >= 0) // add carry + code[0] |= 1 << 6; + } else { + assert(!(addOp & 0x100)); + emitForm_S(i, (addOp >> 3) | + ((i->src[1].getFile() == FILE_IMMEDIATE) ? 0xac : 0x2c), true); + } +} + +// TODO: shl-add +void +CodeEmitterNVC0::emitIMAD(const Instruction *i) +{ + assert(i->encSize == 8); + emitForm_A(i, HEX64(20000000, 00000003)); + + if (isSignedType(i->dType)) + code[0] |= 1 << 7; + if (isSignedType(i->sType)) + code[0] |= 1 << 5; + + code[1] |= i->saturate << 24; + + if (i->flagsDef >= 0) code[1] |= 1 << 16; + if (i->flagsSrc >= 0) code[1] |= 1 << 23; + + if (i->src[2].mod.neg()) code[0] |= 0x10; + if (i->src[1].mod.neg() ^ + i->src[0].mod.neg()) code[0] |= 0x20; + + if (i->subOp == NV50_IR_SUBOP_MUL_HIGH) + code[0] |= 1 << 6; +} + +void +CodeEmitterNVC0::emitNOT(Instruction *i) +{ + assert(i->encSize == 8); + i->src[1].set(i->src[0]); + emitForm_A(i, HEX64(68000000, 000001c3)); +} + +void +CodeEmitterNVC0::emitLogicOp(const Instruction *i, uint8_t subOp) +{ + if (i->encSize == 8) { + if (isLIMM(i->src[1], TYPE_U32)) { + emitForm_A(i, HEX64(38000000, 00000002)); + + if (i->src[2].exists()) + code[1] |= 1 << 26; + } else { + emitForm_A(i, HEX64(68000000, 00000003)); + + if (i->src[2].exists()) + code[1] |= 1 << 16; + } + code[0] |= subOp << 6; + + if (i->src[2].exists()) // carry + code[0] |= 1 << 5; + + if (i->src[0].mod & Modifier(NV50_IR_MOD_NOT)) code[0] |= 1 << 9; + if (i->src[1].mod & Modifier(NV50_IR_MOD_NOT)) code[0] |= 1 << 8; + } else { + emitForm_S(i, (subOp << 5) | + ((i->src[1].getFile() == FILE_IMMEDIATE) ? 0x1d : 0x8d), true); + } +} + +void +CodeEmitterNVC0::emitPOPC(const Instruction *i) +{ + emitForm_A(i, HEX64(54000000, 00000004)); + + if (i->src[0].mod & Modifier(NV50_IR_MOD_NOT)) code[0] |= 1 << 9; + if (i->src[1].mod & Modifier(NV50_IR_MOD_NOT)) code[0] |= 1 << 8; +} + +void +CodeEmitterNVC0::emitINSBF(const Instruction *i) +{ + emitForm_A(i, HEX64(28000000, 30000000)); +} + +void +CodeEmitterNVC0::emitShift(const Instruction *i) +{ + if (i->op == OP_SHR) { + emitForm_A(i, HEX64(58000000, 00000003) + | (isSignedType(i->dType) ? 0x20 : 0x00)); + } else { + emitForm_A(i, HEX64(60000000, 00000003)); + } + + if (0) + code[0] |= 1 << 9; // clamp shift amount +} + +void +CodeEmitterNVC0::emitPreOp(const Instruction *i) +{ + if (i->encSize == 8) { + emitForm_B(i, HEX64(60000000, 00000000)); + + if (i->op == OP_PREEX2) + code[0] |= 0x20; + + if (i->src[0].mod.abs()) code[0] |= 1 << 6; + if (i->src[0].mod.neg()) code[0] |= 1 << 8; + } else { + emitForm_S(i, i->op == OP_PREEX2 ? 0x74000008 : 0x70000008, true); + } +} + +void +CodeEmitterNVC0::emitSFnOp(const Instruction *i, uint8_t subOp) +{ + if (i->encSize == 8) { + code[0] = 0x00000000 | (subOp << 26); + code[1] = 0xc8000000; + + emitPredicate(i); + + defId(i->def[0], 14); + srcId(i->src[0], 20); + + assert(i->src[0].getFile() == FILE_GPR); + + if (i->saturate) code[0] |= 1 << 5; + + if (i->src[0].mod.abs()) code[0] |= 1 << 7; + if (i->src[0].mod.neg()) code[0] |= 1 << 9; + } else { + emitForm_S(i, 0x80000008 | (subOp << 26), true); + + assert(!i->src[0].mod.neg()); + if (i->src[0].mod.abs()) code[0] |= 1 << 30; + } +} + +void +CodeEmitterNVC0::emitMINMAX(const Instruction *i) +{ + uint64_t op; + + assert(i->encSize == 8); + + op = (i->op == OP_MIN) ? 0x080e000000000000ULL : 0x081e000000000000ULL; + + if (i->ftz) + op |= 1 << 5; + else + if (!isFloatType(i->dType)) + op |= isSignedType(i->dType) ? 0x23 : 0x03; + + emitForm_A(i, op); + emitNegAbs12(i); +} + +void +CodeEmitterNVC0::roundMode_C(const Instruction *i) +{ + switch (i->rnd) { + case ROUND_M: code[1] |= 1 << 17; break; + case ROUND_P: code[1] |= 2 << 17; break; + case ROUND_Z: code[1] |= 3 << 17; break; + case ROUND_NI: code[0] |= 1 << 7; break; + case ROUND_MI: code[0] |= 1 << 7; code[1] |= 1 << 17; break; + case ROUND_PI: code[0] |= 1 << 7; code[1] |= 2 << 17; break; + case ROUND_ZI: code[0] |= 1 << 7; code[1] |= 3 << 17; break; + case ROUND_N: break; + default: + assert(!"invalid round mode"); + break; + } +} + +void +CodeEmitterNVC0::roundMode_CS(const Instruction *i) +{ + switch (i->rnd) { + case ROUND_M: + case ROUND_MI: code[0] |= 1 << 16; break; + case ROUND_P: + case ROUND_PI: code[0] |= 2 << 16; break; + case ROUND_Z: + case ROUND_ZI: code[0] |= 3 << 16; break; + default: + break; + } +} + +void +CodeEmitterNVC0::emitCVT(Instruction *i) +{ + const bool f2f = isFloatType(i->dType) && isFloatType(i->sType); + + switch (i->op) { + case OP_CEIL: i->rnd = f2f ? ROUND_PI : ROUND_P; break; + case OP_FLOOR: i->rnd = f2f ? ROUND_MI : ROUND_M; break; + case OP_TRUNC: i->rnd = f2f ? ROUND_ZI : ROUND_Z; break; + default: + break; + } + + const bool sat = (i->op == OP_SAT) || i->saturate; + const bool abs = (i->op == OP_ABS) || i->src[0].mod.abs(); + const bool neg = (i->op == OP_NEG) || i->src[0].mod.neg(); + + if (i->encSize == 8) { + emitForm_B(i, HEX64(10000000, 00000004)); + + roundMode_C(i); + + code[0] |= util_logbase2(i->def[0].getSize()) << 20; + code[0] |= util_logbase2(i->src[0].getSize()) << 23; + + if (sat) + code[0] |= 0x20; + if (abs) + code[0] |= 1 << 6; + if (neg && i->op != OP_ABS) + code[0] |= 1 << 8; + + if (i->ftz) + code[1] |= 1 << 23; + + if (isSignedIntType(i->dType)) + code[0] |= 0x080; + if (isSignedIntType(i->sType)) + code[0] |= 0x200; + + if (isFloatType(i->dType)) { + if (!isFloatType(i->sType)) + code[1] |= 0x08000000; + } else { + if (isFloatType(i->sType)) + code[1] |= 0x04000000; + else + code[1] |= 0x0c000000; + } + } else { + if (i->op == OP_CEIL || i->op == OP_FLOOR || i->op == OP_TRUNC) { + code[0] = 0x298; + } else + if (isFloatType(i->dType)) { + if (isFloatType(i->sType)) + code[0] = 0x098; + else + code[0] = 0x088 | (isSignedType(i->sType) ? (1 << 8) : 0); + } else { + assert(isFloatType(i->sType)); + + code[0] = 0x288 | (isSignedType(i->sType) ? (1 << 8) : 0); + } + + if (neg) code[0] |= 1 << 16; + if (sat) code[0] |= 1 << 18; + if (abs) code[0] |= 1 << 19; + + roundMode_CS(i); + } +} + +void +CodeEmitterNVC0::emitSET(const CmpInstruction *i) +{ + uint32_t hi; + uint32_t lo = 0; + + if (i->sType == TYPE_F64) + lo = 0x1; + else + if (!isFloatType(i->sType)) + lo = 0x3; + + if (isFloatType(i->dType) || isSignedIntType(i->sType)) + lo |= 0x20; + + switch (i->op) { + case OP_SET_AND: hi = 0x10000000; break; + case OP_SET_OR: hi = 0x10200000; break; + case OP_SET_XOR: hi = 0x10400000; break; + default: + hi = 0x100e0000; + break; + } + emitForm_A(i, (static_cast<uint64_t>(hi) << 32) | lo); + + if (i->def[0].getFile() == FILE_PREDICATE) { + if (i->sType == TYPE_F32) + code[1] += 0x10000000; + else + code[1] += 0x08000000; + + code[0] &= ~0xfc000; + defId(i->def[0], 17); + if (i->defExists(1)) + defId(i->def[1], 14); + else + code[0] |= 0x1c000; + } + + if (i->ftz) + code[1] |= 1 << 27; + + emitCondCode(i->setCond, 32 + 23); + emitNegAbs12(i); +} + +void +CodeEmitterNVC0::emitSLCT(const CmpInstruction *i) +{ + uint64_t op; + + switch (i->dType) { + case TYPE_S32: + op = HEX64(30000000, 00000023); + break; + case TYPE_U32: + op = HEX64(30000000, 00000003); + break; + case TYPE_F32: + op = HEX64(38000000, 00000000); + break; + default: + assert(!"invalid type for SLCT"); + op = 0; + break; + } + emitForm_A(i, op); + + CondCode cc = i->setCond; + + if (i->src[2].mod.neg()) + cc = reverseCondCode(cc); + + emitCondCode(cc, 32 + 23); + + if (i->ftz) + code[0] |= 1 << 5; +} + +void CodeEmitterNVC0::emitSELP(const Instruction *i) +{ + emitForm_A(i, HEX64(20000000, 00000004)); + + if (i->cc == CC_NOT_P || i->src[2].mod & Modifier(NV50_IR_MOD_NOT)) + code[1] |= 1 << 20; +} + +void CodeEmitterNVC0::emitTEXCSAA(const TexInstruction *i) +{ + code[0] = 0x00000086; + code[1] = 0xd0000000; + + code[1] |= i->tex.r; + code[1] |= i->tex.s << 8; + + if (i->tex.liveOnly) + code[0] |= 1 << 9; + + defId(i->def[0], 14); + srcId(i->src[0], 20); +} + +void +CodeEmitterNVC0::emitTEX(const TexInstruction *i) +{ + code[0] = 0x00000006; + + if (1) + code[0] |= 0x80; // normal/t/p mode = t, XXX: what is this ? + + if (i->tex.liveOnly) + code[0] |= 1 << 9; + + switch (i->op) { + case OP_TEX: code[1] = 0x80000000; break; + case OP_TXB: code[1] = 0x84000000; break; + case OP_TXL: code[1] = 0x86000000; break; + case OP_TXF: code[1] = 0x92000000; break; + case OP_TXG: code[1] = 0xa0000000; break; + case OP_TXD: code[1] = 0xe0000000; break; + default: + assert(!"invalid texture op"); + break; + } + defId(i->def[0], 14); + srcId(i->src[0], 20); + + emitPredicate(i); + + if (i->op == OP_TXG) code[0] |= i->tex.gatherComp << 5; + + code[1] |= i->tex.mask << 14; + + code[1] |= i->tex.r; + code[1] |= i->tex.s << 8; + if (i->tex.rIndirectSrc >= 0 || i->tex.sIndirectSrc >= 0) + code[1] |= 1 << 18; // in 1st source (with array index) + + // texture target: + code[1] |= (i->tex.target.getDim() - 1) << 20; + if (i->tex.target.isCube()) + code[1] += 2 << 20; + if (i->tex.target.isArray()) + code[1] |= 1 << 19; + if (i->tex.target.isShadow()) + code[1] |= 1 << 24; + + int src1 = i->tex.target.getArgCount(); + + if (i->src[src1].getFile() == FILE_IMMEDIATE) { // lzero + if (i->op == OP_TXL) + code[1] &= ~(1 << 26); + else + if (i->op == OP_TXF) + code[1] &= ~(1 << 25); + } + if (i->tex.target == TEX_TARGET_2D_MS || + i->tex.target == TEX_TARGET_2D_MS_ARRAY) + code[1] |= 1 << 23; + + if (i->tex.useOffsets) // in vecSrc0.w + code[1] |= 1 << 22; + + srcId(i->src[src1], 26); +} + +void +CodeEmitterNVC0::emitTXQ(const TexInstruction *i) +{ + code[0] = 0x00000086; + code[1] = 0xc0000000; + + switch (i->tex.query) { + case TXQ_DIMS: code[1] |= 0 << 22; break; + case TXQ_TYPE: code[1] |= 1 << 22; break; + case TXQ_SAMPLE_POSITION: code[1] |= 2 << 22; break; + case TXQ_FILTER: code[1] |= 3 << 22; break; + case TXQ_LOD: code[1] |= 4 << 22; break; + case TXQ_BORDER_COLOUR: code[1] |= 5 << 22; break; + default: + assert(!"invalid texture query"); + break; + } + + code[1] |= i->tex.mask << 14; + + code[1] |= i->tex.r; + code[1] |= i->tex.s << 8; + if (i->tex.sIndirectSrc >= 0 || i->tex.rIndirectSrc >= 0) + code[1] |= 1 << 18; + + defId(i->def[0], 14); + srcId(i->src[0], 20); + srcId(i->src[1], 26); + + emitPredicate(i); +} + +void +CodeEmitterNVC0::emitQUADOP(const Instruction *i, uint8_t qOp, uint8_t laneMask) +{ + code[0] = 0x00000000 | (laneMask << 6); + code[1] = 0x48000000 | qOp; + + defId(i->def[0], 14); + srcId(i->src[0], 20); + srcId(i->srcExists(1) ? i->src[1] : i->src[0], 26); + + emitPredicate(i); +} + +void +CodeEmitterNVC0::emitFlow(const Instruction *i) +{ + const FlowInstruction *f = i->asFlow(); + + unsigned mask; // bit 0: predicate, bit 1: target + + code[0] = 0x00000007; + + switch (i->op) { + case OP_BRA: + code[1] = f->absolute ? 0x00000000 : 0x40000000; + if (i->src[0].getFile() == FILE_MEMORY_CONST || + i->src[1].getFile() == FILE_MEMORY_CONST) + code[1] |= 0x4000; + mask = 3; + break; + case OP_CALL: + code[1] = f->absolute ? 0x10000000 : 0x50000000; + if (i->src[0].getFile() == FILE_MEMORY_CONST) + code[1] |= 0x4000; + mask = 2; + break; + + case OP_EXIT: code[1] = 0x80000000; mask = 1; break; + case OP_RET: code[1] = 0x90000000; mask = 1; break; + case OP_DISCARD: code[1] = 0x98000000; mask = 1; break; + case OP_BREAK: code[1] = 0xa8000000; mask = 1; break; + case OP_CONT: code[1] = 0xb0000000; mask = 1; break; + + case OP_JOINAT: code[1] = 0x60000000; mask = 2; break; + case OP_PREBREAK: code[1] = 0x68000000; mask = 2; break; + case OP_PRECONT: code[1] = 0x70000000; mask = 2; break; + case OP_PRERET: code[1] = 0x78000000; mask = 2; break; + + case OP_QUADON: code[1] = 0xc0000000; mask = 0; break; + case OP_QUADPOP: code[1] = 0xc8000000; mask = 0; break; + case OP_BRKPT: code[1] = 0xd0000000; mask = 0; break; + default: + assert(!"invalid flow operation"); + return; + } + + if (mask & 1) { + emitPredicate(i); + if (i->flagsSrc < 0) + code[0] |= 0x1e0; + } + + if (!f) + return; + + if (f->allWarp) + code[0] |= 1 << 15; + if (f->limit) + code[0] |= 1 << 16; + + if (f->op == OP_CALL) { + if (f->builtin) { + assert(f->absolute); + uint32_t pcAbs = targ->getBuiltinOffset(f->target.builtin); + addReloc(RelocEntry::TYPE_BUILTIN, 0, pcAbs, 0xfc000000, 26); + addReloc(RelocEntry::TYPE_BUILTIN, 1, pcAbs, 0x03ffffff, -6); + } else { + assert(!f->absolute); + int32_t pcRel = f->target.fn->binPos - (codeSize + 8); + code[0] |= (pcRel & 0x3f) << 26; + code[1] |= (pcRel >> 6) & 0x3ffff; + } + } else + if (mask & 2) { + int32_t pcRel = f->target.bb->binPos - (codeSize + 8); + // currently we don't want absolute branches + assert(!f->absolute); + code[0] |= (pcRel & 0x3f) << 26; + code[1] |= (pcRel >> 6) & 0x3ffff; + } +} + +void +CodeEmitterNVC0::emitPFETCH(const Instruction *i) +{ + uint32_t prim = i->src[0].get()->reg.data.u32; + + code[0] = 0x00000006 | ((prim & 0x3f) << 26); + code[1] = 0x00000000 | (prim >> 6); + + emitPredicate(i); + + defId(i->def[0], 14); + srcId(i->src[1], 20); +} + +void +CodeEmitterNVC0::emitVFETCH(const Instruction *i) +{ + code[0] = 0x00000006; + code[1] = 0x06000000 | i->src[0].get()->reg.data.offset; + + if (i->perPatch) + code[0] |= 0x100; + if (i->getSrc(0)->reg.file == FILE_SHADER_OUTPUT) + code[0] |= 0x200; // yes, TCPs can read from *outputs* of other threads + + emitPredicate(i); + + code[0] |= (i->defCount(0xf) - 1) << 5; + + defId(i->def[0], 14); + srcId(i->src[0].getIndirect(0), 20); + srcId(i->src[0].getIndirect(1), 26); // vertex address +} + +void +CodeEmitterNVC0::emitEXPORT(const Instruction *i) +{ + unsigned int size = typeSizeof(i->dType); + + code[0] = 0x00000006 | ((size / 4 - 1) << 5); + code[1] = 0x0a000000 | i->src[0].get()->reg.data.offset; + + assert(size != 12 && !(code[1] & (size - 1))); + + if (i->perPatch) + code[0] |= 0x100; + + emitPredicate(i); + + assert(i->src[1].getFile() == FILE_GPR); + + srcId(i->src[0].getIndirect(0), 20); + srcId(i->src[0].getIndirect(1), 32 + 17); // vertex base address + srcId(i->src[1], 26); +} + +void +CodeEmitterNVC0::emitOUT(const Instruction *i) +{ + code[0] = 0x00000006; + code[1] = 0x1c000000; + + emitPredicate(i); + + defId(i->def[0], 14); // new secret address + srcId(i->src[0], 20); // old secret address, should be 0 initially + + assert(i->src[0].getFile() == FILE_GPR); + + if (i->op == OP_EMIT) + code[0] |= 1 << 5; + if (i->op == OP_RESTART || i->subOp == NV50_IR_SUBOP_EMIT_RESTART) + code[0] |= 1 << 6; + + // vertex stream + if (i->src[1].getFile() == FILE_IMMEDIATE) { + code[1] |= 0xc000; + code[0] |= SDATA(i->src[1]).u32 << 26; + } else { + srcId(i->src[1], 26); + } +} + +void +CodeEmitterNVC0::emitInterpMode(const Instruction *i) +{ + if (i->encSize == 8) { + code[0] |= i->ipa << 6; // TODO: INTERP_SAMPLEID + } else { + if (i->getInterpMode() == NV50_IR_INTERP_SC) + code[0] |= 0x80; + assert(i->op == OP_PINTERP && i->getSampleMode() == 0); + } +} + +void +CodeEmitterNVC0::emitINTERP(const Instruction *i) +{ + const uint32_t base = i->getSrc(0)->reg.data.offset; + + if (i->encSize == 8) { + code[0] = 0x00000000; + code[1] = 0xc0000000 | (base & 0xffff); + + if (i->saturate) + code[0] |= 1 << 5; + + if (i->op == OP_PINTERP) + srcId(i->src[1], 26); + else + code[0] |= 0x3f << 26; + + srcId(i->src[0].getIndirect(0), 20); + } else { + assert(i->op == OP_PINTERP); + code[0] = 0x00000009 | ((base & 0xc) << 6) | ((base >> 4) << 26); + srcId(i->src[1], 20); + } + emitInterpMode(i); + + emitPredicate(i); + defId(i->def[0], 14); + + if (i->getSampleMode() == NV50_IR_INTERP_OFFSET) + srcId(i->src[i->op == OP_PINTERP ? 2 : 1], 17); + else + code[1] |= 0x3f << 17; +} + +void +CodeEmitterNVC0::emitLoadStoreType(DataType ty) +{ + uint8_t val; + + switch (ty) { + case TYPE_U8: + val = 0x00; + break; + case TYPE_S8: + val = 0x20; + break; + case TYPE_F16: + case TYPE_U16: + val = 0x40; + break; + case TYPE_S16: + val = 0x60; + break; + case TYPE_F32: + case TYPE_U32: + case TYPE_S32: + val = 0x80; + break; + case TYPE_F64: + case TYPE_U64: + case TYPE_S64: + val = 0xa0; + break; + case TYPE_B128: + val = 0xc0; + break; + default: + val = 0x80; + assert(!"invalid type"); + break; + } + code[0] |= val; +} + +void +CodeEmitterNVC0::emitCachingMode(CacheMode c) +{ + uint32_t val; + + switch (c) { + case CACHE_CA: +// case CACHE_WB: + val = 0x000; + break; + case CACHE_CG: + val = 0x100; + break; + case CACHE_CS: + val = 0x200; + break; + case CACHE_CV: +// case CACHE_WT: + val = 0x300; + break; + default: + val = 0; + assert(!"invalid caching mode"); + break; + } + code[0] |= val; +} + +void +CodeEmitterNVC0::emitSTORE(const Instruction *i) +{ + uint32_t opc; + + switch (i->src[0].getFile()) { + case FILE_MEMORY_GLOBAL: opc = 0x90000000; break; + case FILE_MEMORY_LOCAL: opc = 0xc8000000; break; + case FILE_MEMORY_SHARED: opc = 0xc9000000; break; + default: + assert(!"invalid memory file"); + opc = 0; + break; + } + code[0] = 0x00000005; + code[1] = opc; + + setAddress16(i->src[0]); + srcId(i->src[1], 14); + srcId(i->src[0].getIndirect(0), 20); + + emitPredicate(i); + + emitLoadStoreType(i->dType); + emitCachingMode(i->cache); +} + +void +CodeEmitterNVC0::emitLOAD(const Instruction *i) +{ + uint32_t opc; + + code[0] = 0x00000005; + + switch (i->src[0].getFile()) { + case FILE_MEMORY_GLOBAL: opc = 0x80000000; break; + case FILE_MEMORY_LOCAL: opc = 0xc0000000; break; + case FILE_MEMORY_SHARED: opc = 0xc1000000; break; + case FILE_MEMORY_CONST: + if (!i->src[0].isIndirect(0) && typeSizeof(i->dType) == 4) { + emitMOV(i); // not sure if this is any better + return; + } + opc = 0x14000000 | (i->src[0].get()->reg.fileIndex << 10); + code[0] = 0x00000006 | (i->subOp << 8); + break; + default: + assert(!"invalid memory file"); + opc = 0; + break; + } + code[1] = opc; + + defId(i->def[0], 14); + + setAddress16(i->src[0]); + srcId(i->src[0].getIndirect(0), 20); + + emitPredicate(i); + + emitLoadStoreType(i->dType); + emitCachingMode(i->cache); +} + +uint8_t +CodeEmitterNVC0::getSRegEncoding(const ValueRef& ref) +{ + switch (SDATA(ref).sv.sv) { + case SV_LANEID: return 0x00; + case SV_PHYSID: return 0x03; + case SV_VERTEX_COUNT: return 0x10; + case SV_INVOCATION_ID: return 0x11; + case SV_YDIR: return 0x12; + case SV_TID: return 0x21 + SDATA(ref).sv.index; + case SV_CTAID: return 0x25 + SDATA(ref).sv.index; + case SV_NTID: return 0x29 + SDATA(ref).sv.index; + case SV_GRIDID: return 0x2c; + case SV_NCTAID: return 0x2d + SDATA(ref).sv.index; + case SV_LBASE: return 0x34; + case SV_SBASE: return 0x30; + case SV_CLOCK: return 0x50 + SDATA(ref).sv.index; + default: + assert(!"no sreg for system value"); + return 0; + } +} + +void +CodeEmitterNVC0::emitMOV(const Instruction *i) +{ + if (i->src[0].getFile() == FILE_SYSTEM_VALUE) { + uint8_t sr = getSRegEncoding(i->src[0]); + + if (i->encSize == 8) { + code[0] = 0x00000004 | (sr << 26); + code[1] = 0x2c000000; + } else { + code[0] = 0x40000008 | (sr << 20); + } + defId(i->def[0], 14); + + emitPredicate(i); + } else + if (i->encSize == 8) { + uint64_t opc; + + if (i->src[0].getFile() == FILE_IMMEDIATE) + opc = HEX64(18000000, 000001e2); + else + if (i->src[0].getFile() == FILE_PREDICATE) + opc = HEX64(080e0000, 1c000004); + else + opc = HEX64(28000000, 00000004); + + opc |= i->lanes << 5; + + emitForm_B(i, opc); + } else { + uint32_t imm; + + if (i->src[0].getFile() == FILE_IMMEDIATE) { + imm = SDATA(i->src[0]).u32; + if (imm & 0xfff00000) { + assert(!(imm & 0x000fffff)); + code[0] = 0x00000318 | imm; + } else { + assert(imm < 0x800 || ((int32_t)imm >= -0x800)); + code[0] = 0x00000118 | (imm << 20); + } + } else { + code[0] = 0x0028; + emitShortSrc2(i->src[0]); + } + defId(i->def[0], 14); + + emitPredicate(i); + } +} + +bool +CodeEmitterNVC0::emitInstruction(Instruction *insn) +{ + if (!insn->encSize) { + ERROR("skipping unencodable instruction: "); insn->print(); + return false; + } else + if (codeSize + insn->encSize > codeSizeLimit) { + ERROR("code emitter output buffer too small\n"); + return false; + } + + // assert that instructions with multiple defs don't corrupt registers + for (int d = 0; insn->defExists(d); ++d) + assert(insn->asTex() || insn->def[d].rep()->reg.data.id >= 0); + + switch (insn->op) { + case OP_MOV: + case OP_RDSV: + emitMOV(insn); + break; + case OP_NOP: + break; + case OP_LOAD: + emitLOAD(insn); + break; + case OP_STORE: + emitSTORE(insn); + break; + case OP_LINTERP: + case OP_PINTERP: + emitINTERP(insn); + break; + case OP_VFETCH: + emitVFETCH(insn); + break; + case OP_EXPORT: + emitEXPORT(insn); + break; + case OP_PFETCH: + emitPFETCH(insn); + break; + case OP_EMIT: + case OP_RESTART: + emitOUT(insn); + break; + case OP_ADD: + case OP_SUB: + if (isFloatType(insn->dType)) + emitFADD(insn); + else + emitUADD(insn); + break; + case OP_MUL: + if (isFloatType(insn->dType)) + emitFMUL(insn); + else + emitUMUL(insn); + break; + case OP_MAD: + case OP_FMA: + if (isFloatType(insn->dType)) + emitFMAD(insn); + else + emitIMAD(insn); + break; + case OP_NOT: + emitNOT(insn); + break; + case OP_AND: + emitLogicOp(insn, 0); + break; + case OP_OR: + emitLogicOp(insn, 1); + break; + case OP_XOR: + emitLogicOp(insn, 2); + break; + case OP_SHL: + case OP_SHR: + emitShift(insn); + break; + case OP_SET: + case OP_SET_AND: + case OP_SET_OR: + case OP_SET_XOR: + emitSET(insn->asCmp()); + break; + case OP_SELP: + emitSELP(insn); + break; + case OP_SLCT: + emitSLCT(insn->asCmp()); + break; + case OP_MIN: + case OP_MAX: + emitMINMAX(insn); + break; + case OP_ABS: + case OP_NEG: + case OP_CEIL: + case OP_FLOOR: + case OP_TRUNC: + case OP_CVT: + case OP_SAT: + emitCVT(insn); + break; + case OP_RSQ: + emitSFnOp(insn, 5); + break; + case OP_RCP: + emitSFnOp(insn, 4); + break; + case OP_LG2: + emitSFnOp(insn, 3); + break; + case OP_EX2: + emitSFnOp(insn, 2); + break; + case OP_SIN: + emitSFnOp(insn, 1); + break; + case OP_COS: + emitSFnOp(insn, 0); + break; + case OP_PRESIN: + case OP_PREEX2: + emitPreOp(insn); + break; + case OP_TEX: + case OP_TXB: + case OP_TXL: + case OP_TXD: + case OP_TXF: + emitTEX(insn->asTex()); + break; + case OP_TXQ: + emitTXQ(insn->asTex()); + break; + case OP_BRA: + case OP_CALL: + case OP_PRERET: + case OP_RET: + case OP_DISCARD: + case OP_EXIT: + case OP_PRECONT: + case OP_CONT: + case OP_PREBREAK: + case OP_BREAK: + case OP_JOINAT: + case OP_BRKPT: + case OP_QUADON: + case OP_QUADPOP: + emitFlow(insn); + break; + case OP_QUADOP: + emitQUADOP(insn, insn->subOp, insn->lanes); + break; + case OP_DFDX: + emitQUADOP(insn, insn->src[0].mod.neg() ? 0x66 : 0x99, 0x4); + break; + case OP_DFDY: + emitQUADOP(insn, insn->src[0].mod.neg() ? 0x5a : 0xa5, 0x5); + break; + case OP_POPCNT: + emitPOPC(insn); + break; + case OP_JOIN: + emitNOP(insn); + insn->join = 1; + break; + case OP_PHI: + case OP_UNION: + case OP_CONSTRAINT: + ERROR("operation should have been eliminated"); + return false; + case OP_EXP: + case OP_LOG: + case OP_SQRT: + case OP_POW: + ERROR("operation should have been lowered\n"); + return false; + default: + ERROR("unknow op\n"); + return false; + } + + if (insn->join) { + code[0] |= 0x10; + assert(insn->encSize == 8); + } + + code += insn->encSize / 4; + codeSize += insn->encSize; + return true; +} + +uint32_t +CodeEmitterNVC0::getMinEncodingSize(const Instruction *i) const +{ + const Target::OpInfo &info = targ->getOpInfo(i); + + if (info.minEncSize == 8 || 1) + return 8; + + if (i->ftz || i->saturate || i->join) + return 8; + if (i->rnd != ROUND_N) + return 8; + if (i->predSrc >= 0 && i->op == OP_MAD) + return 8; + + if (i->op == OP_PINTERP) { + if (i->getSampleMode() || 1) // XXX: grr, short op doesn't work + return 8; + } else + if (i->op == OP_MOV && i->lanes != 0xf) { + return 8; + } + + for (int s = 0; i->srcExists(s); ++s) { + if (i->src[s].isIndirect(0)) + return 8; + + if (i->src[s].getFile() == FILE_MEMORY_CONST) { + if (SDATA(i->src[s]).offset >= 0x100) + return 8; + if (i->getSrc(s)->reg.fileIndex > 1 && + i->getSrc(s)->reg.fileIndex != 16) + return 8; + } else + if (i->src[s].getFile() == FILE_IMMEDIATE) { + if (i->dType == TYPE_F32) { + if (SDATA(i->src[s]).u32 >= 0x100) + return 8; + } else { + if (SDATA(i->src[s]).u32 > 0xff) + return 8; + } + } + + if (i->op == OP_CVT) + continue; + if (i->src[s].mod != Modifier(0)) { + if (i->src[s].mod == Modifier(NV50_IR_MOD_ABS)) + if (i->op != OP_RSQ) + return 8; + if (i->src[s].mod == Modifier(NV50_IR_MOD_NEG)) + if (i->op != OP_ADD || s != 0) + return 8; + } + } + + return 4; +} + +CodeEmitterNVC0::CodeEmitterNVC0(const TargetNVC0 *target) : targ(target) +{ + code = NULL; + codeSize = codeSizeLimit = 0; + relocInfo = NULL; +} + +CodeEmitter * +TargetNVC0::getCodeEmitter(Program::Type type) +{ + CodeEmitterNVC0 *emit = new CodeEmitterNVC0(this); + emit->setProgramType(type); + return emit; +} + +} // namespace nv50_ir diff --git a/src/gallium/drivers/nvc0/codegen/nv50_ir_lowering_nvc0.cpp b/src/gallium/drivers/nvc0/codegen/nv50_ir_lowering_nvc0.cpp new file mode 100644 index 00000000000..de73efcc56a --- /dev/null +++ b/src/gallium/drivers/nvc0/codegen/nv50_ir_lowering_nvc0.cpp @@ -0,0 +1,705 @@ + +#include "nv50/codegen/nv50_ir.h" +#include "nv50/codegen/nv50_ir_build_util.h" + +#include "nv50_ir_target_nvc0.h" + +namespace nv50_ir { + +#define QOP_ADD 0 +#define QOP_SUBR 1 +#define QOP_SUB 2 +#define QOP_MOV2 3 + +#define QUADOP(q, r, s, t) \ + ((QOP_##q << 0) | (QOP_##r << 2) | \ + (QOP_##s << 4) | (QOP_##t << 6)) + +class NVC0LegalizeSSA : public Pass +{ +private: + virtual bool visit(BasicBlock *); + virtual bool visit(Function *); + + // we want to insert calls to the builtin library only after optimization + void handleDIV(Instruction *); // integer division, modulus + void handleRCPRSQ(Instruction *); // double precision float recip/rsqrt + +private: + BuildUtil bld; +}; + +void +NVC0LegalizeSSA::handleDIV(Instruction *i) +{ + FlowInstruction *call; + int builtin; + Value *def[2]; + + bld.setPosition(i, false); + def[0] = bld.mkMovToReg(0, i->getSrc(0))->getDef(0); + def[1] = bld.mkMovToReg(1, i->getSrc(1))->getDef(0); + switch (i->dType) { + case TYPE_U32: builtin = NVC0_BUILTIN_DIV_U32; break; + case TYPE_S32: builtin = NVC0_BUILTIN_DIV_S32; break; + default: + return; + } + call = bld.mkFlow(OP_CALL, NULL, CC_ALWAYS, NULL); + bld.mkMov(i->getDef(0), def[(i->op == OP_DIV) ? 0 : 1]); + bld.mkClobber(FILE_GPR, (i->op == OP_DIV) ? 0xe : 0xd, 2); + bld.mkClobber(FILE_PREDICATE, (i->dType == TYPE_S32) ? 0xf : 0x3, 0); + + call->fixed = 1; + call->absolute = call->builtin = 1; + call->target.builtin = builtin; + delete_Instruction(prog, i); +} + +void +NVC0LegalizeSSA::handleRCPRSQ(Instruction *i) +{ + // TODO +} + +bool +NVC0LegalizeSSA::visit(Function *fn) +{ + bld.setProgram(fn->getProgram()); + return true; +} + +bool +NVC0LegalizeSSA::visit(BasicBlock *bb) +{ + Instruction *next; + for (Instruction *i = bb->getEntry(); i; i = next) { + next = i->next; + if (i->dType == TYPE_F32) + continue; + switch (i->op) { + case OP_DIV: + case OP_MOD: + handleDIV(i); + break; + case OP_RCP: + case OP_RSQ: + if (i->dType == TYPE_F64) + handleRCPRSQ(i); + break; + default: + break; + } + } + return true; +} + +class NVC0LegalizePostRA : public Pass +{ +private: + virtual bool visit(Function *); + virtual bool visit(BasicBlock *); + + void replaceZero(Instruction *); + void split64BitOp(Instruction *); + bool tryReplaceContWithBra(BasicBlock *); + void propagateJoin(BasicBlock *); + + LValue *r63; +}; + +bool +NVC0LegalizePostRA::visit(Function *fn) +{ + r63 = new_LValue(fn, FILE_GPR); + r63->reg.data.id = 63; + return true; +} + +void +NVC0LegalizePostRA::replaceZero(Instruction *i) +{ + for (int s = 0; i->srcExists(s); ++s) { + ImmediateValue *imm = i->getSrc(s)->asImm(); + if (imm && imm->reg.data.u64 == 0) + i->setSrc(s, r63); + } +} + +void +NVC0LegalizePostRA::split64BitOp(Instruction *i) +{ + if (i->dType == TYPE_F64) { + if (i->op == OP_MAD) + i->op = OP_FMA; + if (i->op == OP_ADD || i->op == OP_MUL || i->op == OP_FMA || + i->op == OP_CVT || i->op == OP_MIN || i->op == OP_MAX || + i->op == OP_SET) + return; + i->dType = i->sType = TYPE_U32; + + i->bb->insertAfter(i, i->clone(true)); // deep cloning + } +} + +// replace CONT with BRA for single unconditional continue +bool +NVC0LegalizePostRA::tryReplaceContWithBra(BasicBlock *bb) +{ + if (bb->cfg.incidentCount() != 2 || bb->getEntry()->op != OP_PRECONT) + return false; + Graph::EdgeIterator ei = bb->cfg.incident(); + if (ei.getType() != Graph::Edge::BACK) + ei.next(); + if (ei.getType() != Graph::Edge::BACK) + return false; + BasicBlock *contBB = BasicBlock::get(ei.getNode()); + + if (!contBB->getExit() || contBB->getExit()->op != OP_CONT || + contBB->getExit()->getPredicate()) + return false; + contBB->getExit()->op = OP_BRA; + bb->remove(bb->getEntry()); // delete PRECONT + + ei.next(); + assert(ei.end() || ei.getType() != Graph::Edge::BACK); + return true; +} + +// replace branches to join blocks with join ops +void +NVC0LegalizePostRA::propagateJoin(BasicBlock *bb) +{ + if (bb->getEntry()->op != OP_JOIN || bb->getEntry()->asFlow()->limit) + return; + for (Graph::EdgeIterator ei = bb->cfg.incident(); !ei.end(); ei.next()) { + BasicBlock *in = BasicBlock::get(ei.getNode()); + Instruction *exit = in->getExit(); + if (!exit) { + in->insertTail(new FlowInstruction(func, OP_JOIN, bb)); + // there should always be a terminator instruction + WARN("inserted missing terminator in BB:%i\n", in->getId()); + } else + if (exit->op == OP_BRA) { + exit->op = OP_JOIN; + exit->asFlow()->limit = 1; // must-not-propagate marker + } + } + bb->remove(bb->getEntry()); +} + +bool +NVC0LegalizePostRA::visit(BasicBlock *bb) +{ + Instruction *i, *next; + + // remove pseudo operations and non-fixed no-ops, split 64 bit operations + for (i = bb->getFirst(); i; i = next) { + next = i->next; + if (i->op == OP_EMIT || i->op == OP_RESTART) { + if (!i->getDef(0)->refCount()) + i->setDef(0, NULL); + if (i->src[0].getFile() == FILE_IMMEDIATE) + i->setSrc(0, r63); // initial value must be 0 + } else + if (i->isNop()) { + bb->remove(i); + } else { + if (i->op != OP_MOV && i->op != OP_PFETCH) + replaceZero(i); + if (typeSizeof(i->dType) == 8) + split64BitOp(i); + } + } + if (!bb->getEntry()) + return true; + + if (!tryReplaceContWithBra(bb)) + propagateJoin(bb); + + return true; +} + +class NVC0LoweringPass : public Pass +{ +public: + NVC0LoweringPass(Program *); + +private: + virtual bool visit(Function *); + virtual bool visit(BasicBlock *); + virtual bool visit(Instruction *); + + bool handleRDSV(Instruction *); + bool handleWRSV(Instruction *); + bool handleEXPORT(Instruction *); + bool handleOUT(Instruction *); + bool handleDIV(Instruction *); + bool handleMOD(Instruction *); + bool handleSQRT(Instruction *); + bool handlePOW(Instruction *); + bool handleTEX(TexInstruction *); + bool handleTXD(TexInstruction *); + bool handleManualTXD(TexInstruction *); + + void checkPredicate(Instruction *); + + void readTessCoord(LValue *dst, int c); + +private: + const Target *const targ; + + BuildUtil bld; + + LValue *gpEmitAddress; +}; + +NVC0LoweringPass::NVC0LoweringPass(Program *prog) : targ(prog->getTarget()) +{ + bld.setProgram(prog); +} + +bool +NVC0LoweringPass::visit(Function *fn) +{ + if (prog->getType() == Program::TYPE_GEOMETRY) { + assert(!strncmp(fn->getName(), "MAIN", 4)); + // TODO: when we generate actual functions pass this value along somehow + bld.setPosition(BasicBlock::get(fn->cfg.getRoot()), false); + gpEmitAddress = bld.loadImm(NULL, 0)->asLValue(); + } + return true; +} + +bool +NVC0LoweringPass::visit(BasicBlock *bb) +{ + return true; +} + +// move array source to first slot, convert to u16, add indirections +bool +NVC0LoweringPass::handleTEX(TexInstruction *i) +{ + const int dim = i->tex.target.getDim(); + const int arg = i->tex.target.getDim() + i->tex.target.isArray(); + + // generate and move the tsc/tic/array source to the front + if (dim != arg || i->tex.rIndirectSrc >= 0 || i->tex.sIndirectSrc >= 0) { + LValue *src = new_LValue(func, FILE_GPR); // 0xttxsaaaa + + Value *arrayIndex = i->tex.target.isArray() ? i->getSrc(dim) : NULL; + for (int s = dim; s >= 1; --s) + i->setSrc(s, i->getSrc(s - 1)); + i->setSrc(0, arrayIndex); + + Value *ticRel = i->getIndirectR(); + Value *tscRel = i->getIndirectS(); + + if (arrayIndex) + bld.mkCvt(OP_CVT, TYPE_U16, src, TYPE_F32, arrayIndex); + else + bld.loadImm(src, 0); + + if (ticRel) { + i->setSrc(i->tex.rIndirectSrc, NULL); + bld.mkOp3(OP_INSBF, TYPE_U32, src, ticRel, bld.mkImm(0x0917), src); + } + if (tscRel) { + i->setSrc(i->tex.sIndirectSrc, NULL); + bld.mkOp3(OP_INSBF, TYPE_U32, src, tscRel, bld.mkImm(0x0710), src); + } + + i->setSrc(0, src); + } + + // offset is last source (lod 1st, dc 2nd) + if (i->tex.useOffsets) { + uint32_t value = 0; + int n, c; + int s = i->srcCount(0xff); + for (n = 0; n < i->tex.useOffsets; ++n) + for (c = 0; c < 3; ++c) + value |= (i->tex.offset[n][c] & 0xf) << (n * 12 + c * 4); + i->setSrc(s, bld.loadImm(NULL, value)); + } + + return true; +} + +bool +NVC0LoweringPass::handleManualTXD(TexInstruction *i) +{ + static const uint8_t qOps[4][2] = + { + { QUADOP(MOV2, ADD, MOV2, ADD), QUADOP(MOV2, MOV2, ADD, ADD) }, // l0 + { QUADOP(SUBR, MOV2, SUBR, MOV2), QUADOP(MOV2, MOV2, ADD, ADD) }, // l1 + { QUADOP(MOV2, ADD, MOV2, ADD), QUADOP(SUBR, SUBR, MOV2, MOV2) }, // l2 + { QUADOP(SUBR, MOV2, SUBR, MOV2), QUADOP(SUBR, SUBR, MOV2, MOV2) }, // l3 + }; + Value *def[4][4]; + Value *crd[3]; + Instruction *tex; + Value *zero = bld.loadImm(bld.getSSA(), 0); + int l, c; + const int dim = i->tex.target.getDim(); + + i->op = OP_TEX; // no need to clone dPdx/dPdy later + + for (c = 0; c < dim; ++c) + crd[c] = bld.getScratch(); + + bld.mkOp(OP_QUADON, TYPE_NONE, NULL); + for (l = 0; l < 4; ++l) { + // mov coordinates from lane l to all lanes + for (c = 0; c < dim; ++c) + bld.mkQuadop(0x00, crd[c], l, i->getSrc(c), zero); + // add dPdx from lane l to lanes dx + for (c = 0; c < dim; ++c) + bld.mkQuadop(qOps[l][0], crd[c], l, i->dPdx[c].get(), crd[c]); + // add dPdy from lane l to lanes dy + for (c = 0; c < dim; ++c) + bld.mkQuadop(qOps[l][1], crd[c], l, i->dPdy[c].get(), crd[c]); + // texture + bld.insert(tex = i->clone(true)); + for (c = 0; c < dim; ++c) + tex->setSrc(c, crd[c]); + // save results + for (c = 0; i->defExists(c); ++c) { + Instruction *mov; + def[c][l] = bld.getSSA(); + mov = bld.mkMov(def[c][l], tex->getDef(c)); + mov->fixed = 1; + mov->lanes = 1 << l; + } + } + bld.mkOp(OP_QUADPOP, TYPE_NONE, NULL); + + for (c = 0; i->defExists(c); ++c) { + Instruction *u = bld.mkOp(OP_UNION, TYPE_U32, i->getDef(c)); + for (l = 0; l < 4; ++l) + u->setSrc(l, def[c][l]); + } + + i->bb->remove(i); + return true; +} + +bool +NVC0LoweringPass::handleTXD(TexInstruction *txd) +{ + int dim = txd->tex.target.getDim(); + int arg = txd->tex.target.getDim() + txd->tex.target.isArray(); + + handleTEX(txd); + if (txd->src[arg].exists()) + ++arg; + + if (dim > 2 || txd->tex.target.isShadow()) + return handleManualTXD(txd); + + // at most s/t/array, x, y, offset + assert(arg <= 4 && !txd->src[arg].exists()); + + for (int c = 0; c < dim; ++c) { + txd->src[arg + c * 2 + 0].set(txd->dPdx[c]); + txd->src[arg + c * 2 + 1].set(txd->dPdy[c]); + txd->dPdx[c] = NULL; + txd->dPdy[c] = NULL; + } + return true; +} + +bool +NVC0LoweringPass::handleWRSV(Instruction *i) +{ + Instruction *st; + Symbol *sym; + uint32_t addr; + + // must replace, $sreg are not writeable + addr = targ->getSVAddress(FILE_SHADER_OUTPUT, i->getSrc(0)->asSym()); + if (addr >= 0x400) + return false; + sym = bld.mkSymbol(FILE_SHADER_OUTPUT, 0, i->sType, addr); + + st = bld.mkStore(OP_EXPORT, i->dType, sym, i->getIndirect(0, 0), + i->getSrc(1)); + st->perPatch = i->perPatch; + + bld.getBB()->remove(i); + return true; +} + +void +NVC0LoweringPass::readTessCoord(LValue *dst, int c) +{ + Value *laneid = bld.getSSA(); + Value *x, *y; + + bld.mkOp1(OP_RDSV, TYPE_U32, laneid, bld.mkSysVal(SV_LANEID, 0)); + + if (c == 0) { + x = dst; + y = NULL; + } else + if (c == 1) { + x = NULL; + y = dst; + } else { + assert(c == 2); + x = bld.getSSA(); + y = bld.getSSA(); + } + if (x) + bld.mkFetch(x, TYPE_F32, FILE_SHADER_OUTPUT, 0x2f0, NULL, laneid); + if (y) + bld.mkFetch(x, TYPE_F32, FILE_SHADER_OUTPUT, 0x2f4, NULL, laneid); + + if (c == 2) { + bld.mkOp2(OP_ADD, TYPE_F32, dst, x, y); + bld.mkOp2(OP_SUB, TYPE_F32, dst, bld.loadImm(NULL, 1.0f), dst); + } +} + +bool +NVC0LoweringPass::handleRDSV(Instruction *i) +{ + Symbol *sym = i->getSrc(0)->asSym(); + Value *vtx = NULL; + Instruction *ld; + uint32_t addr = targ->getSVAddress(FILE_SHADER_INPUT, sym); + + if (addr >= 0x400) // mov $sreg + return true; + + switch (i->getSrc(0)->reg.data.sv.sv) { + case SV_POSITION: + assert(prog->getType() == Program::TYPE_FRAGMENT); + ld = new_Instruction(func, OP_LINTERP, TYPE_F32); + ld->setDef(0, i->getDef(0)); + ld->setSrc(0, bld.mkSymbol(FILE_SHADER_INPUT, 0, TYPE_F32, addr)); + ld->setInterpolate(NV50_IR_INTERP_LINEAR); + bld.getBB()->insertAfter(i, ld); + break; + case SV_TESS_COORD: + assert(prog->getType() == Program::TYPE_TESSELLATION_EVAL); + readTessCoord(i->getDef(0)->asLValue(), i->getSrc(0)->reg.data.sv.index); + break; + default: + if (prog->getType() == Program::TYPE_TESSELLATION_EVAL) + vtx = bld.mkOp1v(OP_PFETCH, TYPE_U32, bld.getSSA(), bld.mkImm(0)); + ld = bld.mkFetch(i->getDef(0), i->dType, + FILE_SHADER_INPUT, addr, i->getIndirect(0, 0), vtx); + ld->perPatch = i->perPatch; + break; + } + bld.getBB()->remove(i); + return true; +} + +bool +NVC0LoweringPass::handleDIV(Instruction *i) +{ + if (!isFloatType(i->dType)) + return true; + Instruction *rcp = bld.mkOp1(OP_RCP, i->dType, bld.getSSA(), i->getSrc(1)); + i->op = OP_MUL; + i->setSrc(1, rcp->getDef(0)); + return true; +} + +bool +NVC0LoweringPass::handleMOD(Instruction *i) +{ + if (i->dType != TYPE_F32) + return true; + LValue *value = bld.getScratch(); + bld.mkOp1(OP_RCP, TYPE_F32, value, i->getSrc(1)); + bld.mkOp2(OP_MUL, TYPE_F32, value, i->getSrc(0), value); + bld.mkOp1(OP_TRUNC, TYPE_F32, value, value); + bld.mkOp2(OP_MUL, TYPE_F32, value, i->getSrc(1), value); + i->op = OP_SUB; + i->setSrc(1, value); + return true; +} + +bool +NVC0LoweringPass::handleSQRT(Instruction *i) +{ + Instruction *rsq = bld.mkOp1(OP_RSQ, TYPE_F32, + bld.getSSA(), i->getSrc(0)); + i->op = OP_MUL; + i->setSrc(1, rsq->getDef(0)); + + return true; +} + +bool +NVC0LoweringPass::handlePOW(Instruction *i) +{ + LValue *val = bld.getScratch(); + + bld.mkOp1(OP_LG2, TYPE_F32, val, i->getSrc(0)); + bld.mkOp2(OP_MUL, TYPE_F32, val, i->getSrc(1), val)->dnz = 1; + bld.mkOp1(OP_PREEX2, TYPE_F32, val, val); + + i->op = OP_EX2; + i->setSrc(0, val); + i->setSrc(1, NULL); + + return true; +} + +bool +NVC0LoweringPass::handleEXPORT(Instruction *i) +{ + if (prog->getType() == Program::TYPE_FRAGMENT) { + int id = i->getSrc(0)->reg.data.offset / 4; + + if (i->src[0].isIndirect(0)) // TODO, ugly + return false; + i->op = OP_MOV; + i->src[0].set(i->src[1]); + i->setSrc(1, NULL); + i->setDef(0, new_LValue(func, FILE_GPR)); + i->getDef(0)->reg.data.id = id; + + prog->maxGPR = MAX2(prog->maxGPR, id); + } else + if (prog->getType() == Program::TYPE_GEOMETRY) { + i->setIndirect(0, 1, gpEmitAddress); + } + return true; +} + +bool +NVC0LoweringPass::handleOUT(Instruction *i) +{ + if (i->op == OP_RESTART && i->prev && i->prev->op == OP_EMIT) { + i->prev->subOp = NV50_IR_SUBOP_EMIT_RESTART; + delete_Instruction(prog, i); + } else { + assert(gpEmitAddress); + i->setDef(0, gpEmitAddress); + if (i->srcExists(0)) + i->setSrc(1, i->getSrc(0)); + i->setSrc(0, gpEmitAddress); + } + return true; +} + +// Generate a binary predicate if an instruction is predicated by +// e.g. an f32 value. +void +NVC0LoweringPass::checkPredicate(Instruction *insn) +{ + Value *pred = insn->getPredicate(); + Value *pdst; + + if (!pred || pred->reg.file == FILE_PREDICATE) + return; + pdst = new_LValue(func, FILE_PREDICATE); + + // CAUTION: don't use pdst->getInsn, the definition might not be unique, + // delay turning PSET(FSET(x,y),0) into PSET(x,y) to a later pass + + bld.mkCmp(OP_SET, CC_NEU, TYPE_U32, pdst, bld.mkImm(0), pred); + + insn->setPredicate(insn->cc, pdst); +} + +// +// - add quadop dance for texturing +// - put FP outputs in GPRs +// - convert instruction sequences +// +bool +NVC0LoweringPass::visit(Instruction *i) +{ + if (i->prev) + bld.setPosition(i->prev, true); + else + if (i->next) + bld.setPosition(i->next, false); + else + bld.setPosition(i->bb, true); + + if (i->cc != CC_ALWAYS) + checkPredicate(i); + + switch (i->op) { + case OP_TEX: + case OP_TXB: + case OP_TXL: + case OP_TXF: + case OP_TXQ: + case OP_TXG: + return handleTEX(i->asTex()); + case OP_TXD: + return handleTXD(i->asTex()); + case OP_EX2: + bld.mkOp1(OP_PREEX2, TYPE_F32, i->getDef(0), i->getSrc(0)); + i->setSrc(0, i->getDef(0)); + break; + case OP_POW: + return handlePOW(i); + case OP_DIV: + return handleDIV(i); + case OP_MOD: + return handleMOD(i); + case OP_SQRT: + return handleSQRT(i); + case OP_EXPORT: + return handleEXPORT(i); + case OP_EMIT: + case OP_RESTART: + return handleOUT(i); + case OP_RDSV: + return handleRDSV(i); + case OP_WRSV: + return handleWRSV(i); + case OP_LOAD: + if (i->src[0].getFile() == FILE_SHADER_INPUT) { + i->op = OP_VFETCH; + assert(prog->getType() != Program::TYPE_FRAGMENT); + } + break; + case OP_PINTERP: + if (i->getSrc(0)->reg.data.offset >= 0x280 && + i->getSrc(0)->reg.data.offset < 0x2c0) + i->setInterpolate(i->getSampleMode() | NV50_IR_INTERP_SC); + break; + case OP_LINTERP: + if (i->getSrc(0)->reg.data.offset == 0x3fc) { + Value *face = i->getDef(0); + bld.setPosition(i, true); + bld.mkOp2(OP_SHL, TYPE_U32, face, face, bld.mkImm(31)); + bld.mkOp2(OP_XOR, TYPE_U32, face, face, bld.mkImm(0xbf800000)); + } + break; + default: + break; + } + return true; +} + +bool +TargetNVC0::runLegalizePass(Program *prog, CGStage stage) const +{ + if (stage == CG_STAGE_PRE_SSA) { + NVC0LoweringPass pass(prog); + return pass.run(prog, false, true); + } else + if (stage == CG_STAGE_POST_RA) { + NVC0LegalizePostRA pass; + return pass.run(prog, false, true); + } else + if (stage == CG_STAGE_SSA) { + NVC0LegalizeSSA pass; + return pass.run(prog, false, true); + } + return false; +} + +} // namespace nv50_ir diff --git a/src/gallium/drivers/nvc0/codegen/nv50_ir_target_nvc0.cpp b/src/gallium/drivers/nvc0/codegen/nv50_ir_target_nvc0.cpp new file mode 100644 index 00000000000..60b2016878e --- /dev/null +++ b/src/gallium/drivers/nvc0/codegen/nv50_ir_target_nvc0.cpp @@ -0,0 +1,568 @@ + +#include "nv50_ir_target_nvc0.h" + +namespace nv50_ir { + +Target *getTargetNVC0(unsigned int chipset) +{ + return new TargetNVC0(chipset); +} + +TargetNVC0::TargetNVC0(unsigned int card) +{ + chipset = card; + initOpInfo(); +} + +// BULTINS / LIBRARY FUNCTIONS: + +// lazyness -> will just hardcode everything for the time being + +// Will probably make this nicer once we support subroutines properly, +// i.e. when we have an input IR that provides function declarations. + +static const uint32_t nvc0_builtin_code[] = +{ +// DIV U32: slow unsigned integer division +// +// UNR recurrence (q = a / b): +// look for z such that 2^32 - b <= b * z < 2^32 +// then q - 1 <= (a * z) / 2^32 <= q +// +// INPUT: $r0: dividend, $r1: divisor +// OUTPUT: $r0: result, $r1: modulus +// CLOBBER: $r2 - $r3, $p0 - $p1 +// SIZE: 22 / 14 * 8 bytes +// +#if 1 + 0x04009c03, 0x78000000, + 0x7c209cdd, + 0x0010dd18, + 0x08309c03, 0x60000000, + 0x05605c18, + 0x0810dc2a, + 0x0c209c43, 0x20040000, + 0x0810dc03, 0x50000000, + 0x0c209c43, 0x20040000, + 0x0810dc03, 0x50000000, + 0x0c209c43, 0x20040000, + 0x0810dc03, 0x50000000, + 0x0c209c43, 0x20040000, + 0x0810dc03, 0x50000000, + 0x0c209c43, 0x20040000, + 0x0000dde4, 0x28000000, + 0x08001c43, 0x50000000, + 0x05609c18, + 0x0010430d, + 0x0811dc03, 0x1b0e0000, + 0x08104103, 0x48000000, + 0x04000002, 0x08000000, + 0x0811c003, 0x1b0e0000, + 0x08104103, 0x48000000, + 0x040000ac, + 0x90001dff, +#else + 0x0401dc03, 0x1b0e0000, + 0x00008003, 0x78000000, + 0x0400c003, 0x78000000, + 0x0c20c103, 0x48000000, + 0x0c108003, 0x60000000, + 0x00005c28, + 0x00001d18, + 0x0031c023, 0x1b0ec000, + 0xb000a1e7, 0x40000000, + 0x04000003, 0x6000c000, + 0x0813dc03, 0x1b000000, + 0x0420446c, + 0x040004bd, + 0x04208003, 0x5800c000, + 0x0430c103, 0x4800c000, + 0x0ffc5dff, + 0x90001dff, +#endif + +// DIV S32: slow signed integer division +// +// INPUT: $r0: dividend, $r1: divisor +// OUTPUT: $r0: result, $r1: modulus +// CLOBBER: $r2 - $r3, $p0 - $p3 +// SIZE: 18 * 8 bytes +// + 0xfc05dc23, 0x188e0000, + 0xfc17dc23, 0x18c40000, + 0x03301e18, + 0x07305e18, + 0x0401dc03, 0x1b0e0000, + 0x00008003, 0x78000000, + 0x0400c003, 0x78000000, + 0x0c20c103, 0x48000000, + 0x0c108003, 0x60000000, + 0x00005c28, + 0x00001d18, + 0x0031c023, 0x1b0ec000, + 0xb000a1e7, 0x40000000, + 0x04000003, 0x6000c000, + 0x0813dc03, 0x1b000000, + 0x0420446c, + 0x040004bd, + 0x04208003, 0x5800c000, + 0x0430c103, 0x4800c000, + 0x0ffc5dff, + 0x01700e18, + 0x05704a18, + 0x90001dff, + +// RCP F64: Newton Raphson reciprocal(x): r_{i+1} = r_i * (2.0 - x * r_i) +// +// INPUT: $r0d (x) +// OUTPUT: $r0d (rcp(x)) +// CLOBBER: $r2 - $r7 +// SIZE: 9 * 8 bytes +// + 0x9810dc08, + 0x00009c28, + 0x4001df18, + 0x00019d18, + 0x08011e01, 0x200c0000, + 0x10209c01, 0x50000000, + 0x08011e01, 0x200c0000, + 0x10209c01, 0x50000000, + 0x08011e01, 0x200c0000, + 0x10201c01, 0x50000000, + 0x00001de7, 0x90000000, + +// RSQ F64: Newton Raphson rsqrt(x): r_{i+1} = r_i * (1.5 - 0.5 * x * r_i * r_i) +// +// INPUT: $r0d (x) +// OUTPUT: $r0d (rsqrt(x)) +// CLOBBER: $r2 - $r7 +// SIZE: 14 * 8 bytes +// + 0x9c10dc08, + 0x00009c28, + 0x00019d18, + 0x3fe1df18, + 0x18001c01, 0x50000000, + 0x0001dde2, 0x18ffe000, + 0x08211c01, 0x50000000, + 0x10011e01, 0x200c0000, + 0x10209c01, 0x50000000, + 0x08211c01, 0x50000000, + 0x10011e01, 0x200c0000, + 0x10209c01, 0x50000000, + 0x08211c01, 0x50000000, + 0x10011e01, 0x200c0000, + 0x10201c01, 0x50000000, + 0x00001de7, 0x90000000, +}; + +static const uint16_t nvc0_builtin_offsets[NVC0_BUILTIN_COUNT] = +{ + 0, + 8 * (22), + 8 * (22 + 18), + 8 * (22 + 18 + 9) +}; + +void +TargetNVC0::getBuiltinCode(const uint32_t **code, uint32_t *size) const +{ + *code = &nvc0_builtin_code[0]; + *size = sizeof(nvc0_builtin_code); +} + +uint32_t +TargetNVC0::getBuiltinOffset(int builtin) const +{ + assert(builtin < NVC0_BUILTIN_COUNT); + return nvc0_builtin_offsets[builtin]; +} + +struct opProperties +{ + operation op; + unsigned int mNeg : 4; + unsigned int mAbs : 4; + unsigned int mNot : 4; + unsigned int mSat : 4; + unsigned int fConst : 3; + unsigned int fImmd : 4; // last bit indicates if full immediate is suppoted +}; + +static const struct opProperties _initProps[] = +{ + // neg abs not sat c[] imm + { OP_ADD, 0x3, 0x3, 0x0, 0x8, 0x2, 0x2 | 0x8 }, + { OP_SUB, 0x3, 0x3, 0x0, 0x0, 0x2, 0x2 | 0x8 }, + { OP_MUL, 0x3, 0x0, 0x0, 0x8, 0x2, 0x2 | 0x8 }, + { OP_MAX, 0x3, 0x3, 0x0, 0x0, 0x2, 0x2 }, + { OP_MIN, 0x3, 0x3, 0x0, 0x0, 0x2, 0x2 }, + { OP_MAD, 0x7, 0x0, 0x0, 0x8, 0x6, 0x2 | 0x8 }, // special c[] constraint + { OP_ABS, 0x0, 0x0, 0x0, 0x0, 0x1, 0x0 }, + { OP_NEG, 0x0, 0x1, 0x0, 0x0, 0x1, 0x0 }, + { OP_CVT, 0x1, 0x1, 0x0, 0x8, 0x1, 0x0 }, + { OP_AND, 0x0, 0x0, 0x3, 0x0, 0x2, 0x2 | 0x8 }, + { OP_OR, 0x0, 0x0, 0x3, 0x0, 0x2, 0x2 | 0x8 }, + { OP_XOR, 0x0, 0x0, 0x3, 0x0, 0x2, 0x2 | 0x8 }, + { OP_SHL, 0x0, 0x0, 0x0, 0x0, 0x2, 0x2 }, + { OP_SHR, 0x0, 0x0, 0x0, 0x0, 0x2, 0x2 }, + { OP_SET, 0x3, 0x3, 0x0, 0x0, 0x2, 0x2 }, + { OP_SLCT, 0x4, 0x0, 0x0, 0x0, 0x6, 0x2 }, // special c[] constraint + { OP_PREEX2, 0x1, 0x1, 0x0, 0x0, 0x1, 0x1 }, + { OP_PRESIN, 0x1, 0x1, 0x0, 0x0, 0x1, 0x1 }, + { OP_COS, 0x1, 0x1, 0x0, 0x8, 0x0, 0x0 }, + { OP_SIN, 0x1, 0x1, 0x0, 0x8, 0x0, 0x0 }, + { OP_EX2, 0x1, 0x1, 0x0, 0x8, 0x0, 0x0 }, + { OP_LG2, 0x1, 0x1, 0x0, 0x8, 0x0, 0x0 }, + { OP_RCP, 0x1, 0x1, 0x0, 0x8, 0x0, 0x0 }, + { OP_RSQ, 0x1, 0x1, 0x0, 0x8, 0x0, 0x0 }, + { OP_DFDX, 0x1, 0x0, 0x0, 0x0, 0x0, 0x0 }, + { OP_DFDY, 0x1, 0x0, 0x0, 0x0, 0x0, 0x0 }, + { OP_CALL, 0x0, 0x0, 0x0, 0x0, 0x1, 0x0 }, + { OP_INSBF, 0x0, 0x0, 0x0, 0x0, 0x0, 0x4 }, + { OP_SET_AND, 0x3, 0x3, 0x0, 0x0, 0x2, 0x2 }, + { OP_SET_OR, 0x3, 0x3, 0x0, 0x0, 0x2, 0x2 }, + { OP_SET_XOR, 0x3, 0x3, 0x0, 0x0, 0x2, 0x2 }, + // saturate only: + { OP_LINTERP, 0x0, 0x0, 0x0, 0x8, 0x0, 0x0 }, + { OP_PINTERP, 0x0, 0x0, 0x0, 0x8, 0x0, 0x0 }, +}; + +void TargetNVC0::initOpInfo() +{ + unsigned int i, j; + + static const uint32_t commutative[(OP_LAST + 31) / 32] = + { + // ADD, MAD, MUL, AND, OR, XOR, MAX, MIN + 0x0670ca00, 0x0000003f, 0x00000000 + }; + + static const uint32_t shortForm[(OP_LAST + 31) / 32] = + { + // ADD, MAD, MUL, AND, OR, XOR, PRESIN, PREEX2, SFN, CVT, PINTERP, MOV + 0x0670ca00, 0x00000000, 0x00000000 + }; + + static const operation noDest[] = + { + OP_STORE, OP_WRSV, OP_EXPORT, OP_BRA, OP_CALL, OP_RET, OP_EXIT, + OP_DISCARD, OP_CONT, OP_BREAK, OP_PRECONT, OP_PREBREAK, OP_PRERET, + OP_JOIN, OP_JOINAT, OP_BRKPT, OP_MEMBAR, OP_EMIT, OP_RESTART, + OP_QUADON, OP_QUADPOP + }; + + joinAnterior = false; + + for (i = 0; i < DATA_FILE_COUNT; ++i) + nativeFileMap[i] = (DataFile)i; + nativeFileMap[FILE_ADDRESS] = FILE_GPR; + + for (i = 0; i < OP_LAST; ++i) { + opInfo[i].variants = NULL; + opInfo[i].op = (operation)i; + opInfo[i].srcTypes = 1 << (int)TYPE_F32; + opInfo[i].dstTypes = 1 << (int)TYPE_F32; + opInfo[i].immdBits = 0; + opInfo[i].srcNr = operationSrcNr[i]; + + for (j = 0; j < opInfo[i].srcNr; ++j) { + opInfo[i].srcMods[j] = 0; + opInfo[i].srcFiles[j] = 1 << (int)FILE_GPR; + } + opInfo[i].dstMods = 0; + opInfo[i].dstFiles = 1 << (int)FILE_GPR; + + opInfo[i].hasDest = 1; + opInfo[i].vector = (i >= OP_TEX && i <= OP_TEXCSAA); + opInfo[i].commutative = (commutative[i / 32] >> (i % 32)) & 1; + opInfo[i].pseudo = (i < OP_MOV); + opInfo[i].predicate = !opInfo[i].pseudo; + opInfo[i].flow = (i >= OP_BRA && i <= OP_JOIN); + opInfo[i].minEncSize = (shortForm[i / 32] & (1 << (i % 32))) ? 4 : 8; + } + for (i = 0; i < sizeof(noDest) / sizeof(noDest[0]); ++i) + opInfo[noDest[i]].hasDest = 0; + + for (i = 0; i < sizeof(_initProps) / sizeof(_initProps[0]); ++i) { + const struct opProperties *prop = &_initProps[i]; + + for (int s = 0; s < 3; ++s) { + if (prop->mNeg & (1 << s)) + opInfo[prop->op].srcMods[s] |= NV50_IR_MOD_NEG; + if (prop->mAbs & (1 << s)) + opInfo[prop->op].srcMods[s] |= NV50_IR_MOD_ABS; + if (prop->mNot & (1 << s)) + opInfo[prop->op].srcMods[s] |= NV50_IR_MOD_NOT; + if (prop->fConst & (1 << s)) + opInfo[prop->op].srcFiles[s] |= 1 << (int)FILE_MEMORY_CONST; + if (prop->fImmd & (1 << s)) + opInfo[prop->op].srcFiles[s] |= 1 << (int)FILE_IMMEDIATE; + if (prop->fImmd & 8) + opInfo[prop->op].immdBits = 0xffffffff; + } + if (prop->mSat & 8) + opInfo[prop->op].dstMods = NV50_IR_MOD_SAT; + } +} + +unsigned int +TargetNVC0::getFileSize(DataFile file) const +{ + switch (file) { + case FILE_NULL: return 0; + case FILE_GPR: return 63; + case FILE_PREDICATE: return 7; + case FILE_FLAGS: return 1; + case FILE_ADDRESS: return 0; + case FILE_IMMEDIATE: return 0; + case FILE_MEMORY_CONST: return 65536; + case FILE_SHADER_INPUT: return 0x400; + case FILE_SHADER_OUTPUT: return 0x400; + case FILE_MEMORY_GLOBAL: return 0xffffffff; + case FILE_MEMORY_SHARED: return 16 << 10; + case FILE_MEMORY_LOCAL: return 48 << 10; + case FILE_SYSTEM_VALUE: return 32; + default: + assert(!"invalid file"); + return 0; + } +} + +unsigned int +TargetNVC0::getFileUnit(DataFile file) const +{ + if (file == FILE_GPR || file == FILE_ADDRESS || file == FILE_SYSTEM_VALUE) + return 2; + return 0; +} + +uint32_t +TargetNVC0::getSVAddress(DataFile shaderFile, const Symbol *sym) const +{ + const int idx = sym->reg.data.sv.index; + const SVSemantic sv = sym->reg.data.sv.sv; + + const bool isInput = shaderFile == FILE_SHADER_INPUT; + + switch (sv) { + case SV_POSITION: return 0x070 + idx * 4; + case SV_INSTANCE_ID: return 0x2f8; + case SV_VERTEX_ID: return 0x2fc; + case SV_PRIMITIVE_ID: return isInput ? 0x060 : 0x040; + case SV_LAYER: return 0x064; + case SV_VIEWPORT_INDEX: return 0x068; + case SV_POINT_SIZE: return 0x06c; + case SV_CLIP_DISTANCE: return 0x2c0 + idx * 4; + case SV_POINT_COORD: return 0x2e0 + idx * 4; + case SV_FACE: return 0x3fc; + case SV_TESS_FACTOR: return 0x000 + idx * 4; + case SV_TESS_COORD: return 0x2f0 + idx * 4; + default: + return 0xffffffff; + } +} + +bool +TargetNVC0::insnCanLoad(const Instruction *i, int s, + const Instruction *ld) const +{ + DataFile sf = ld->src[0].getFile(); + + // immediate 0 can be represented by GPR $r63 + if (sf == FILE_IMMEDIATE && ld->getSrc(0)->reg.data.u64 == 0) + return (!i->asTex() && i->op != OP_EXPORT && i->op != OP_STORE); + + if (s > opInfo[i->op].srcNr) + return false; + if (!(opInfo[i->op].srcFiles[s] & (1 << (int)sf))) + return false; + + // indirect loads can only be done by OP_LOAD/VFETCH/INTERP on nvc0 + if (ld->src[0].isIndirect(0)) + return false; + + for (int k = 0; i->srcExists(k); ++k) { + if (i->src[k].getFile() == FILE_IMMEDIATE) { + if (i->getSrc(k)->reg.data.u64 != 0) + return false; + } else + if (i->src[k].getFile() != FILE_GPR && + i->src[k].getFile() != FILE_PREDICATE) { + return false; + } + } + + // not all instructions support full 32 bit immediates + if (sf == FILE_IMMEDIATE) { + Storage ® = ld->getSrc(0)->asImm()->reg; + + if (opInfo[i->op].immdBits != 0xffffffff) { + if (i->sType == TYPE_F32) { + if (reg.data.u32 & 0xfff) + return false; + } else + if (i->sType == TYPE_S32 || i->sType == TYPE_U32) { + // with u32, 0xfffff counts as 0xffffffff as well + if (reg.data.s32 > 0x7ffff || reg.data.s32 < -0x80000) + return false; + } + } else + if (i->op == OP_MAD || i->op == OP_FMA) { + // requires src == dst, cannot decide before RA + // (except if we implement more constraints) + if (ld->getSrc(0)->asImm()->reg.data.u32 & 0xfff) + return false; + } + } + + return true; +} + +bool +TargetNVC0::isOpSupported(operation op, DataType ty) const +{ + if ((op == OP_MAD || op == OP_FMA) && (ty != TYPE_F32)) + return false; + if (op == OP_SAD && ty != TYPE_S32) + return false; + if (op == OP_POW || op == OP_SQRT || op == OP_DIV || op == OP_MOD) + return false; + return true; +} + +bool +TargetNVC0::isModSupported(const Instruction *insn, int s, Modifier mod) const +{ + if (!isFloatType(insn->dType)) { + switch (insn->op) { + case OP_ABS: + case OP_NEG: + case OP_CVT: + case OP_CEIL: + case OP_FLOOR: + case OP_TRUNC: + case OP_AND: + case OP_OR: + case OP_XOR: + break; + case OP_ADD: + if (insn->src[s ? 0 : 1].mod.neg()) + return false; + break; + case OP_SUB: + if (s == 0) + return insn->src[1].mod.neg() ? false : true; + break; + default: + return false; + } + } + if (s > 3) + return false; + return (mod & Modifier(opInfo[insn->op].srcMods[s])) == mod; +} + +bool +TargetNVC0::mayPredicate(const Instruction *insn, const Value *pred) const +{ + if (insn->getPredicate()) + return false; + return opInfo[insn->op].predicate; +} + +bool +TargetNVC0::isSatSupported(const Instruction *insn) const +{ + if (insn->op == OP_CVT) + return true; + if (!(opInfo[insn->op].dstMods & NV50_IR_MOD_SAT)) + return false; + + if (insn->dType == TYPE_U32) + return (insn->op == OP_ADD) || (insn->op == OP_MAD); + + return insn->dType == TYPE_F32; +} + +// TODO: better values +int TargetNVC0::getLatency(const Instruction *i) const +{ + if (i->op == OP_LOAD) { + if (i->cache == CACHE_CV) + return 700; + return 48; + } + return 24; +} + +// These are "inverse" throughput values, i.e. the number of cycles required +// to issue a specific instruction for a full warp (32 threads). +// +// Assuming we have more than 1 warp in flight, a higher issue latency results +// in a lower result latency since the MP will have spent more time with other +// warps. +// This also helps to determine the number of cycles between instructions in +// a single warp. +// +int TargetNVC0::getThroughput(const Instruction *i) const +{ + // TODO: better values + if (i->dType == TYPE_F32) { + switch (i->op) { + case OP_ADD: + case OP_MUL: + case OP_MAD: + case OP_FMA: + return 1; + case OP_CVT: + case OP_CEIL: + case OP_FLOOR: + case OP_TRUNC: + case OP_SET: + case OP_SLCT: + case OP_MIN: + case OP_MAX: + return 2; + case OP_RCP: + case OP_RSQ: + case OP_LG2: + case OP_SIN: + case OP_COS: + case OP_PRESIN: + case OP_PREEX2: + default: + return 8; + } + } else + if (i->dType == TYPE_U32 || i->dType == TYPE_S32) { + switch (i->op) { + case OP_ADD: + case OP_AND: + case OP_OR: + case OP_XOR: + case OP_NOT: + return 1; + case OP_MUL: + case OP_MAD: + case OP_CVT: + case OP_SET: + case OP_SLCT: + case OP_SHL: + case OP_SHR: + case OP_NEG: + case OP_ABS: + case OP_MIN: + case OP_MAX: + default: + return 2; + } + } else + if (i->dType == TYPE_F64) { + return 2; + } else { + return 1; + } +} + +} // namespace nv50_ir diff --git a/src/gallium/drivers/nvc0/codegen/nv50_ir_target_nvc0.h b/src/gallium/drivers/nvc0/codegen/nv50_ir_target_nvc0.h new file mode 100644 index 00000000000..f96bfbeaa6a --- /dev/null +++ b/src/gallium/drivers/nvc0/codegen/nv50_ir_target_nvc0.h @@ -0,0 +1,46 @@ + +#include "nv50/codegen/nv50_ir_target.h" + +namespace nv50_ir { + +#define NVC0_BUILTIN_DIV_U32 0 +#define NVC0_BUILTIN_DIV_S32 1 +#define NVC0_BUILTIN_RCP_F64 2 +#define NVC0_BUILTIN_RSQ_F64 3 + +#define NVC0_BUILTIN_COUNT 4 + +class TargetNVC0 : public Target +{ +public: + TargetNVC0(unsigned int chipset); + + virtual CodeEmitter *getCodeEmitter(Program::Type); + + virtual bool runLegalizePass(Program *, CGStage stage) const; + + virtual void getBuiltinCode(const uint32_t **code, uint32_t *size) const; + + virtual bool insnCanLoad(const Instruction *insn, int s, + const Instruction *ld) const; + virtual bool isOpSupported(operation, DataType) const; + virtual bool isModSupported(const Instruction *, int s, Modifier) const; + virtual bool isSatSupported(const Instruction *) const; + virtual bool mayPredicate(const Instruction *, const Value *) const; + + virtual int getLatency(const Instruction *) const; + virtual int getThroughput(const Instruction *) const; + + virtual unsigned int getFileSize(DataFile) const; + virtual unsigned int getFileUnit(DataFile) const; + + virtual uint32_t getSVAddress(DataFile shaderFile, const Symbol *sv) const; + + uint32_t getBuiltinOffset(int builtin) const; + +private: + void initOpInfo(); + +}; + +} // namespace nv50_ir |