diff options
author | Christoph Bumiller <[email protected]> | 2012-04-14 21:40:35 +0200 |
---|---|---|
committer | Christoph Bumiller <[email protected]> | 2012-04-14 21:54:04 +0200 |
commit | 322bc7ed68ed92233c97168c036d0aa50c11a20e (patch) | |
tree | 0ecfa903b0950c71455b6420c4f3550e4abb2007 /src/gallium/drivers/nv50/codegen | |
parent | 15ce0f76e2e014374a292550505f58da88333fb7 (diff) |
nv50/ir: import nv50 target
Diffstat (limited to 'src/gallium/drivers/nv50/codegen')
-rw-r--r-- | src/gallium/drivers/nv50/codegen/nv50_ir.cpp | 12 | ||||
-rw-r--r-- | src/gallium/drivers/nv50/codegen/nv50_ir.h | 4 | ||||
-rw-r--r-- | src/gallium/drivers/nv50/codegen/nv50_ir_build_util.h | 13 | ||||
-rw-r--r-- | src/gallium/drivers/nv50/codegen/nv50_ir_driver.h | 2 | ||||
-rw-r--r-- | src/gallium/drivers/nv50/codegen/nv50_ir_emit_nv50.cpp | 896 | ||||
-rw-r--r-- | src/gallium/drivers/nv50/codegen/nv50_ir_lowering_nv50.cpp | 1118 | ||||
-rw-r--r-- | src/gallium/drivers/nv50/codegen/nv50_ir_print.cpp | 25 | ||||
-rw-r--r-- | src/gallium/drivers/nv50/codegen/nv50_ir_target.cpp | 10 | ||||
-rw-r--r-- | src/gallium/drivers/nv50/codegen/nv50_ir_target.h | 8 | ||||
-rw-r--r-- | src/gallium/drivers/nv50/codegen/nv50_ir_target_nv50.cpp | 531 | ||||
-rw-r--r-- | src/gallium/drivers/nv50/codegen/nv50_ir_target_nv50.h | 72 |
11 files changed, 2472 insertions, 219 deletions
diff --git a/src/gallium/drivers/nv50/codegen/nv50_ir.cpp b/src/gallium/drivers/nv50/codegen/nv50_ir.cpp index 19a90806c70..048759060ad 100644 --- a/src/gallium/drivers/nv50/codegen/nv50_ir.cpp +++ b/src/gallium/drivers/nv50/codegen/nv50_ir.cpp @@ -559,8 +559,11 @@ void Instruction::init() subOp = 0; saturate = 0; - join = terminator = 0; - ftz = dnz = 0; + join = 0; + exit = 0; + terminator = 0; + ftz = 0; + dnz = 0; atomic = 0; perPatch = 0; fixed = 0; @@ -982,6 +985,9 @@ Program::Program(Type type, Target *arch) calls.insert(&main->call); dbgFlags = 0; + optLevel = 0; + + targetPriv = NULL; } Program::~Program() @@ -1085,6 +1091,7 @@ nv50_ir_generate_code(struct nv50_ir_prog_info *info) if (!prog) return -1; prog->dbgFlags = info->dbgFlags; + prog->optLevel = info->optLevel; switch (info->bin.sourceRep) { #if 0 @@ -1105,6 +1112,7 @@ nv50_ir_generate_code(struct nv50_ir_prog_info *info) if (prog->dbgFlags & NV50_IR_DEBUG_VERBOSE) prog->print(); + targ->parseDriverInfo(info); prog->getTarget()->runLegalizePass(prog, nv50_ir::CG_STAGE_PRE_SSA); prog->convertToSSA(); diff --git a/src/gallium/drivers/nv50/codegen/nv50_ir.h b/src/gallium/drivers/nv50/codegen/nv50_ir.h index c0a867d9552..6ec4fc95441 100644 --- a/src/gallium/drivers/nv50/codegen/nv50_ir.h +++ b/src/gallium/drivers/nv50/codegen/nv50_ir.h @@ -140,6 +140,7 @@ enum operation #define NV50_IR_SUBOP_LDC_IS 2 #define NV50_IR_SUBOP_LDC_ISL 3 #define NV50_IR_SUBOP_SHIFT_WRAP 1 +#define NV50_IR_SUBOP_EMU_PRERET 1 enum DataType { @@ -1060,6 +1061,9 @@ public: MemoryPool mem_ImmediateValue; uint32_t dbgFlags; + uint8_t optLevel; + + void *targetPriv; // e.g. to carry information between passes void releaseInstruction(Instruction *); void releaseValue(Value *); diff --git a/src/gallium/drivers/nv50/codegen/nv50_ir_build_util.h b/src/gallium/drivers/nv50/codegen/nv50_ir_build_util.h index 022a27f1748..9ee04dbcd12 100644 --- a/src/gallium/drivers/nv50/codegen/nv50_ir_build_util.h +++ b/src/gallium/drivers/nv50/codegen/nv50_ir_build_util.h @@ -46,7 +46,8 @@ public: inline void remove(Instruction *i) { assert(i->bb == bb); bb->remove(i); } inline LValue *getScratch(int size = 4, DataFile = FILE_GPR); - inline LValue *getSSA(int size = 4); // scratch value for a single assignment + // scratch value for a single assignment: + inline LValue *getSSA(int size = 4, DataFile = FILE_GPR); inline Instruction *mkOp(operation, DataType, Value *); Instruction *mkOp1(operation, DataType, Value *, Value *); @@ -215,18 +216,16 @@ LValue * BuildUtil::getScratch(int size, DataFile f) { LValue *lval = new_LValue(func, f); - if (size != 4) - lval->reg.size = size; + lval->reg.size = size; return lval; } LValue * -BuildUtil::getSSA(int size) +BuildUtil::getSSA(int size, DataFile f) { - LValue *lval = new_LValue(func, FILE_GPR); + LValue *lval = new_LValue(func, f); lval->ssa = 1; - if (size != 4) - lval->reg.size = size; + lval->reg.size = size; return lval; } diff --git a/src/gallium/drivers/nv50/codegen/nv50_ir_driver.h b/src/gallium/drivers/nv50/codegen/nv50_ir_driver.h index ae733a1a924..dc42b8295e9 100644 --- a/src/gallium/drivers/nv50/codegen/nv50_ir_driver.h +++ b/src/gallium/drivers/nv50/codegen/nv50_ir_driver.h @@ -175,6 +175,8 @@ struct nv50_ir_prog_info /* driver callback to assign input/output locations */ int (*assignSlots)(struct nv50_ir_prog_info *); + + void *driverPriv; }; #ifdef __cplusplus diff --git a/src/gallium/drivers/nv50/codegen/nv50_ir_emit_nv50.cpp b/src/gallium/drivers/nv50/codegen/nv50_ir_emit_nv50.cpp index 1c09494f46d..c534d4a0c5e 100644 --- a/src/gallium/drivers/nv50/codegen/nv50_ir_emit_nv50.cpp +++ b/src/gallium/drivers/nv50/codegen/nv50_ir_emit_nv50.cpp @@ -21,14 +21,19 @@ */ #include "nv50_ir.h" -#include "nv50_ir_target.h" +#include "nv50_ir_target_nv50.h" namespace nv50_ir { +#define NV50_OP_ENC_LONG 0 +#define NV50_OP_ENC_SHORT 1 +#define NV50_OP_ENC_IMM 2 +#define NV50_OP_ENC_LONG_ALT 3 + class CodeEmitterNV50 : public CodeEmitter { public: - CodeEmitterNV50(const Target *); + CodeEmitterNV50(const TargetNV50 *); virtual bool emitInstruction(Instruction *); @@ -36,23 +41,25 @@ public: inline void setProgramType(Program::Type pType) { progType = pType; } -private: - const Target *targ; + virtual void prepareEmission(Function *); +private: Program::Type progType; + const TargetNV50 *targ; + private: inline void defId(const ValueDef&, const int pos); inline void srcId(const ValueRef&, const int pos); inline void srcId(const ValueRef *, const int pos); - inline void srcAddr16(const ValueRef&, const int pos); + inline void srcAddr16(const ValueRef&, bool adj, const int pos); inline void srcAddr8(const ValueRef&, const int pos); void emitFlagsRd(const Instruction *); void emitFlagsWr(const Instruction *); - void emitCondCode(CondCode cc, int pos); + void emitCondCode(CondCode cc, DataType ty, int pos); inline void setARegBits(unsigned int); @@ -61,16 +68,16 @@ private: void setDst(const Value *); void setDst(const Instruction *, int d); - void emitSrc0(const ValueRef&); - void emitSrc1(const ValueRef&); - void emitSrc2(const ValueRef&); + void setSrcFileBits(const Instruction *, int enc); + void setSrc(const Instruction *, unsigned int s, int slot); void emitForm_MAD(const Instruction *); void emitForm_ADD(const Instruction *); void emitForm_MUL(const Instruction *); void emitForm_IMM(const Instruction *); - void emitLoadStoreSize(DataType ty, int pos); + void emitLoadStoreSizeLG(DataType ty, int pos); + void emitLoadStoreSizeCS(DataType ty); void roundMode_MAD(const Instruction *); void roundMode_CVT(RoundMode); @@ -88,9 +95,10 @@ private: void emitUADD(const Instruction *); void emitAADD(const Instruction *); void emitFADD(const Instruction *); - void emitUMUL(const Instruction *); + void emitIMUL(const Instruction *); void emitFMUL(const Instruction *); void emitFMAD(const Instruction *); + void emitIMAD(const Instruction *); void emitMINMAX(const Instruction *); @@ -98,17 +106,20 @@ private: void emitSFnOp(const Instruction *, uint8_t subOp); void emitShift(const Instruction *); - void emitARL(const Instruction *); + void emitARL(const Instruction *, unsigned int shl); void emitLogicOp(const Instruction *); + void emitNOT(const Instruction *); void emitCVT(const Instruction *); void emitSET(const Instruction *); void emitTEX(const TexInstruction *); + void emitTXQ(const TexInstruction *); void emitQUADOP(const Instruction *, uint8_t lane, uint8_t quOp); void emitFlow(const Instruction *, uint8_t flowOp); + void emitPRERETEmu(const FlowInstruction *); }; #define SDATA(a) ((a).rep()->reg.data) @@ -126,13 +137,20 @@ void CodeEmitterNV50::srcId(const ValueRef *src, const int pos) code[pos / 32] |= SDATA(*src).id << (pos % 32); } -void CodeEmitterNV50::srcAddr16(const ValueRef& src, const int pos) +void CodeEmitterNV50::srcAddr16(const ValueRef& src, bool adj, const int pos) { assert(src.get()); - uint32_t offset = SDATA(src).offset; + int32_t offset = SDATA(src).offset; + + assert(!adj || src.get()->reg.size <= 4); + if (adj) + offset /= src.get()->reg.size; - assert(offset <= 0xffff && (pos % 32) <= 16); + assert(offset <= 0x7fff && offset >= (int32_t)-0x8000 && (pos % 32) <= 16); + + if (offset < 0) + offset &= adj ? (0xffff >> (src.get()->reg.size >> 1)) : 0xffff; code[pos / 32] |= offset << (pos % 32); } @@ -143,14 +161,15 @@ void CodeEmitterNV50::srcAddr8(const ValueRef& src, const int pos) uint32_t offset = SDATA(src).offset; - assert(offset <= 0x1fc && !(offset & 0x3)); + assert((offset <= 0x1fc || offset == 0x3fc) && !(offset & 0x3)); code[pos / 32] |= (offset >> 2) << (pos % 32); } void CodeEmitterNV50::defId(const ValueDef& def, const int pos) { - assert(def.get()); + assert(def.get() && def.getFile() != FILE_SHADER_OUTPUT); + code[pos / 32] |= DDATA(def).id << (pos % 32); } @@ -170,11 +189,11 @@ CodeEmitterNV50::roundMode_MAD(const Instruction *insn) void CodeEmitterNV50::emitMNeg12(const Instruction *i) { - code[1] |= i->src[0].mod.neg() << 26; - code[1] |= i->src[1].mod.neg() << 27; + code[1] |= i->src(0).mod.neg() << 26; + code[1] |= i->src(1).mod.neg() << 27; } -void CodeEmitterNV50::emitCondCode(CondCode cc, int pos) +void CodeEmitterNV50::emitCondCode(CondCode cc, DataType ty, int pos) { uint8_t enc; @@ -210,6 +229,9 @@ void CodeEmitterNV50::emitCondCode(CondCode cc, int pos) assert(!"invalid condition code"); break; } + if (ty != TYPE_NONE && !isFloatType(ty)) + enc &= ~0x8; // unordered only exists for float types + code[pos / 32] |= enc << (pos % 32); } @@ -222,8 +244,8 @@ CodeEmitterNV50::emitFlagsRd(const Instruction *i) if (s >= 0) { assert(i->getSrc(s)->reg.file == FILE_FLAGS); - emitCondCode(i->cc, 32 + 7); - srcId(i->src[s], 32 + 12); + emitCondCode(i->cc, TYPE_NONE, 32 + 7); + srcId(i->src(s), 32 + 12); } else { code[1] |= 0x0780; } @@ -234,8 +256,22 @@ CodeEmitterNV50::emitFlagsWr(const Instruction *i) { assert(!(code[1] & 0x70)); - if (i->flagsDef >= 0) - code[1] |= (DDATA(i->def[i->flagsDef]).id << 4) | 0x40; + int flagsDef = i->flagsDef; + + // find flags definition and check that it is the last def + if (flagsDef < 0) { + for (int d = 0; i->defExists(d); ++d) + if (i->def(d).getFile() == FILE_FLAGS) + flagsDef = d; + if (flagsDef >= 0 && 0) // TODO: enforce use of flagsDef at some point + WARN("Instruction::flagsDef was not set properly\n"); + } + if (flagsDef == 0 && i->defExists(1)) + WARN("flags def should not be the primary definition\n"); + + if (flagsDef >= 0) + code[1] |= (DDATA(i->def(flagsDef)).id << 4) | 0x40; + } void @@ -248,20 +284,27 @@ CodeEmitterNV50::setARegBits(unsigned int u) void CodeEmitterNV50::setAReg16(const Instruction *i, int s) { - s = i->src[s].indirect[0]; - if (s >= 0) - setARegBits(SDATA(i->src[s]).id + 1); + if (i->srcExists(s)) { + s = i->src(s).indirect[0]; + if (s >= 0) + setARegBits(SDATA(i->src(s)).id + 1); + } } void CodeEmitterNV50::setImmediate(const Instruction *i, int s) { - const ImmediateValue *imm = i->src[s].get()->asImm(); + const ImmediateValue *imm = i->src(s).get()->asImm(); assert(imm); + uint32_t u = imm->reg.data.u32; + + if (i->src(s).mod & Modifier(NV50_IR_MOD_NOT)) + u = ~u; + code[1] |= 3; - code[0] |= (imm->reg.data.u32 & 0x3f) << 16; - code[1] |= (imm->reg.data.u32 >> 6) << 2; + code[0] |= (u & 0x3f) << 16; + code[1] |= (u >> 6) << 2; } void @@ -271,13 +314,18 @@ CodeEmitterNV50::setDst(const Value *dst) assert(reg->file != FILE_ADDRESS); - if (reg->data.id < 0) { + if (reg->data.id < 0 || reg->file == FILE_FLAGS) { code[0] |= (127 << 2) | 1; code[1] |= 8; } else { - if (reg->file == FILE_SHADER_OUTPUT) + int id; + if (reg->file == FILE_SHADER_OUTPUT) { code[1] |= 8; - code[0] |= reg->data.id << 2; + id = reg->data.offset / 4; + } else { + id = reg->data.id; + } + code[0] |= id << 2; } } @@ -293,60 +341,135 @@ CodeEmitterNV50::setDst(const Instruction *i, int d) } } +// 3 * 2 bits: +// 0: r +// 1: a/s +// 2: c +// 3: i void -CodeEmitterNV50::emitSrc0(const ValueRef& ref) +CodeEmitterNV50::setSrcFileBits(const Instruction *i, int enc) { - const Storage *reg = &ref.rep()->reg; - - if (reg->file == FILE_SHADER_INPUT) - code[1] |= 0x00200000; - else - if (reg->file != FILE_GPR) - ERROR("invalid src0 register file: %d\n", reg->file); + uint8_t mode = 0; - assert(reg->data.id < 128); - code[0] |= reg->data.id << 9; -} + for (unsigned int s = 0; s < Target::operationSrcNr[i->op]; ++s) { + switch (i->src(s).getFile()) { + case FILE_GPR: + break; + case FILE_MEMORY_SHARED: + case FILE_SHADER_INPUT: + mode |= 1 << (s * 2); + break; + case FILE_MEMORY_CONST: + mode |= 2 << (s * 2); + break; + case FILE_IMMEDIATE: + mode |= 3 << (s * 2); + break; + default: + ERROR("invalid file on source %i: %u\n", s, i->src(s).getFile()); + assert(0); + break; + } + } + switch (mode) { + case 0x00: // rrr + break; + case 0x01: // arr/grr + if (progType == Program::TYPE_GEOMETRY) { + code[0] |= 0x01800000; + if (enc == NV50_OP_ENC_LONG || enc == NV50_OP_ENC_LONG_ALT) + code[1] |= 0x00200000; + } else { + if (enc == NV50_OP_ENC_SHORT) + code[0] |= 0x01000000; + else + code[1] |= 0x00200000; + } + break; + case 0x03: // irr + assert(i->op == OP_MOV); + return; + case 0x0c: // rir + break; + case 0x0d: // gir + code[0] |= 0x01000000; + assert(progType == Program::TYPE_GEOMETRY || + progType == Program::TYPE_COMPUTE); + break; + case 0x08: // rcr + code[0] |= (enc == NV50_OP_ENC_LONG_ALT) ? 0x01000000 : 0x00800000; + code[1] |= (i->getSrc(1)->reg.fileIndex << 22); + break; + case 0x09: // acr/gcr + if (progType == Program::TYPE_GEOMETRY) { + code[0] |= 0x01800000; + } else { + code[0] |= (enc == NV50_OP_ENC_LONG_ALT) ? 0x01000000 : 0x00800000; + code[1] |= 0x00200000; + } + code[1] |= (i->getSrc(1)->reg.fileIndex << 22); + break; + case 0x20: // rrc + code[0] |= 0x01000000; + code[1] |= (i->getSrc(2)->reg.fileIndex << 22); + break; + case 0x21: // arc + code[0] |= 0x01000000; + code[1] |= 0x00200000 | (i->getSrc(2)->reg.fileIndex << 22); + assert(progType != Program::TYPE_GEOMETRY); + break; + default: + ERROR("not encodable: %x\n", mode); + assert(0); + break; + } + if (progType != Program::TYPE_COMPUTE) + return; -void -CodeEmitterNV50::emitSrc1(const ValueRef& ref) -{ - const Storage *reg = &ref.rep()->reg; + if ((mode & 3) == 1) { + const int pos = i->src(1).getFile() == FILE_IMMEDIATE ? 13 : 14; - if (reg->file == FILE_MEMORY_CONST) { - assert(!(code[1] & 0x01800000)); - code[0] |= 1 << 23; - code[1] |= reg->fileIndex << 22; - } else - if (reg->file != FILE_GPR) { - ERROR("invalid src1 register file: %d\n", reg->file); + switch (i->getSrc(0)->reg.type) { + case TYPE_U8: + break; + case TYPE_U16: + code[0] |= 1 << pos; + break; + case TYPE_S16: + code[0] |= 2 << pos; + break; + default: + code[0] |= 3 << pos; + assert(i->getSrc(0)->reg.size == 4); + break; + } } - - assert(reg->data.id < 128); - code[0] |= reg->data.id << 16; } void -CodeEmitterNV50::emitSrc2(const ValueRef& ref) +CodeEmitterNV50::setSrc(const Instruction *i, unsigned int s, int slot) { - const Storage *reg = &ref.rep()->reg; - - if (reg->file == FILE_MEMORY_CONST) { - assert(!(code[1] & 0x01800000)); - code[0] |= 1 << 24; - code[1] |= reg->fileIndex << 22; - } else - if (reg->file != FILE_GPR) { - ERROR("invalid src1 register file: %d\n", reg->file); + if (Target::operationSrcNr[i->op] <= s) + return; + const Storage *reg = &i->src(s).rep()->reg; + + unsigned int id = (reg->file == FILE_GPR) ? + reg->data.id : + reg->data.offset >> (reg->size >> 1); // no > 4 byte sources here + + switch (slot) { + case 0: code[0] |= id << 9; break; + case 1: code[0] |= id << 16; break; + case 2: code[1] |= id << 14; break; + default: + assert(0); + break; } - - assert(reg->data.id < 128); - code[1] |= reg->data.id << 14; } // the default form: // - long instruction -// - 1 to 3 sources in slots 0, 1, 2 +// - 1 to 3 sources in slots 0, 1, 2 (rrr, arr, rcr, acr, rrc, arc, gcr, grr) // - address & flags void CodeEmitterNV50::emitForm_MAD(const Instruction *i) @@ -359,14 +482,10 @@ CodeEmitterNV50::emitForm_MAD(const Instruction *i) setDst(i, 0); - if (i->srcExists(0)) - emitSrc0(i->src[0]); - - if (i->srcExists(1)) - emitSrc1(i->src[1]); - - if (i->srcExists(2)) - emitSrc2(i->src[2]); + setSrcFileBits(i, NV50_OP_ENC_LONG); + setSrc(i, 0, 0); + setSrc(i, 1, 1); + setSrc(i, 2, 2); setAReg16(i, 1); } @@ -383,16 +502,14 @@ CodeEmitterNV50::emitForm_ADD(const Instruction *i) setDst(i, 0); - if (i->srcExists(0)) - emitSrc0(i->src[0]); - - if (i->srcExists(1)) - emitSrc2(i->src[1]); + setSrcFileBits(i, NV50_OP_ENC_LONG_ALT); + setSrc(i, 0, 0); + setSrc(i, 1, 2); setAReg16(i, 1); } -// default short form +// default short form (rr, ar, rc, gr) void CodeEmitterNV50::emitForm_MUL(const Instruction *i) { @@ -402,15 +519,13 @@ CodeEmitterNV50::emitForm_MUL(const Instruction *i) setDst(i, 0); - if (i->srcExists(0)) - emitSrc0(i->src[0]); - - if (i->srcExists(1)) - emitSrc1(i->src[1]); + setSrcFileBits(i, NV50_OP_ENC_SHORT); + setSrc(i, 0, 0); + setSrc(i, 1, 1); } // usual immediate form -// - 1 to 3 sources where last is immediate +// - 1 to 3 sources where last is immediate (rir, gir) // - no address or predicate possible void CodeEmitterNV50::emitForm_IMM(const Instruction *i) @@ -422,21 +537,18 @@ CodeEmitterNV50::emitForm_IMM(const Instruction *i) setDst(i, 0); - if (i->srcExists(2)) { - emitSrc0(i->src[0]); - emitSrc1(i->src[1]); - setImmediate(i, 2); - } else - if (i->srcExists(1)) { - emitSrc0(i->src[0]); + setSrcFileBits(i, NV50_OP_ENC_IMM); + if (Target::operationSrcNr[i->op] > 1) { + setSrc(i, 0, 0); setImmediate(i, 1); + setSrc(i, 2, 1); } else { setImmediate(i, 0); } } void -CodeEmitterNV50::emitLoadStoreSize(DataType ty, int pos) +CodeEmitterNV50::emitLoadStoreSizeLG(DataType ty, int pos) { uint8_t enc; @@ -445,7 +557,9 @@ CodeEmitterNV50::emitLoadStoreSize(DataType ty, int pos) case TYPE_S32: // fall through case TYPE_U32: enc = 0x6; break; case TYPE_B128: enc = 0x5; break; - case TYPE_F64: enc = 0x4; break; + case TYPE_F64: // fall through + case TYPE_S64: // fall through + case TYPE_U64: enc = 0x4; break; case TYPE_S16: enc = 0x3; break; case TYPE_U16: enc = 0x2; break; case TYPE_S8: enc = 0x1; break; @@ -459,18 +573,58 @@ CodeEmitterNV50::emitLoadStoreSize(DataType ty, int pos) } void +CodeEmitterNV50::emitLoadStoreSizeCS(DataType ty) +{ + switch (ty) { + case TYPE_U8: break; + case TYPE_U16: code[1] |= 0x4000; break; + case TYPE_S16: code[1] |= 0x8000; break; + case TYPE_F32: + case TYPE_S32: + case TYPE_U32: code[1] |= 0xc000; break; + default: + assert(0); + break; + } +} + +void CodeEmitterNV50::emitLOAD(const Instruction *i) { - DataFile sf = i->src[0].getFile(); + DataFile sf = i->src(0).getFile(); + int32_t offset = i->getSrc(0)->reg.data.offset; switch (sf) { case FILE_SHADER_INPUT: - code[0] = 0x10000001; - code[1] = 0x04200000 | (i->lanes << 14); + // use 'mov' where we can + code[0] = i->src(0).isIndirect(0) ? 0x00000001 : 0x10000001; + code[1] = 0x00200000 | (i->lanes << 14); + if (typeSizeof(i->dType) == 4) + code[1] |= 0x04000000; + break; + case FILE_MEMORY_SHARED: + if (targ->getChipset() >= 0x84) { + assert(offset <= (int32_t)(0x3fff * typeSizeof(i->sType))); + code[0] = 0x10000001; + code[1] = 0x40000000; + + if (typeSizeof(i->dType) == 4) + code[1] |= 0x04000000; + + emitLoadStoreSizeCS(i->sType); + } else { + assert(offset <= (int32_t)(0x1f * typeSizeof(i->sType))); + code[0] = 0x10000001; + code[1] = 0x00200000 | (i->lanes << 14); + emitLoadStoreSizeCS(i->sType); + } break; case FILE_MEMORY_CONST: code[0] = 0x10000001; - code[1] = 0x24000000 | (i->getSrc(0)->reg.fileIndex << 22); + code[1] = 0x20000000 | (i->getSrc(0)->reg.fileIndex << 22); + if (typeSizeof(i->dType) == 4) + code[1] |= 0x04000000; + emitLoadStoreSizeCS(i->sType); break; case FILE_MEMORY_LOCAL: code[0] = 0xd0000001; @@ -486,18 +640,18 @@ CodeEmitterNV50::emitLOAD(const Instruction *i) } if (sf == FILE_MEMORY_LOCAL || sf == FILE_MEMORY_GLOBAL) - emitLoadStoreSize(i->sType, 21 + 32); + emitLoadStoreSizeLG(i->sType, 21 + 32); setDst(i, 0); emitFlagsRd(i); emitFlagsWr(i); - if (i->src[0].getFile() == FILE_MEMORY_GLOBAL) { - srcId(*i->src[0].getIndirect(0), 9); + if (i->src(0).getFile() == FILE_MEMORY_GLOBAL) { + srcId(*i->src(0).getIndirect(0), 9); } else { setAReg16(i, 0); - srcAddr16(i->src[0], 9); + srcAddr16(i->src(0), i->src(0).getFile() != FILE_MEMORY_LOCAL, 9); } } @@ -509,19 +663,21 @@ CodeEmitterNV50::emitSTORE(const Instruction *i) switch (f) { case FILE_SHADER_OUTPUT: - code[0] = 0x00000001 | ((offset >> 2) << 2); + code[0] = 0x00000001 | ((offset >> 2) << 9); code[1] = 0x80c00000; - srcId(i->src[1], 32 + 15); + srcId(i->src(1), 32 + 14); break; case FILE_MEMORY_GLOBAL: - code[0] = 0xd0000000; + code[0] = 0xd0000001 | (i->getSrc(0)->reg.fileIndex << 16); code[1] = 0xa0000000; - emitLoadStoreSize(i->dType, 21 + 32); + emitLoadStoreSizeLG(i->dType, 21 + 32); + srcId(i->src(1), 2); break; case FILE_MEMORY_LOCAL: code[0] = 0xd0000001; code[1] = 0x60000000; - emitLoadStoreSize(i->dType, 21 + 32); + emitLoadStoreSizeLG(i->dType, 21 + 32); + srcId(i->src(1), 2); break; case FILE_MEMORY_SHARED: code[0] = 0x00000001; @@ -536,28 +692,27 @@ CodeEmitterNV50::emitSTORE(const Instruction *i) break; case 4: code[0] |= (offset >> 2) << 9; - code[1] |= 0x04000000; + code[1] |= 0x04200000; break; default: assert(0); break; } + srcId(i->src(1), 32 + 14); break; default: assert(!"invalid store destination file"); break; } - if (f != FILE_SHADER_OUTPUT) { - srcId(i->src[1], 2); - if (f == FILE_MEMORY_GLOBAL) - srcId(*i->src[0].getIndirect(0), 9); - if (f == FILE_MEMORY_LOCAL) - srcAddr16(i->src[0], 9); - } - if (f != FILE_MEMORY_GLOBAL) + if (f == FILE_MEMORY_GLOBAL) + srcId(*i->src(0).getIndirect(0), 9); + else setAReg16(i, 0); + if (f == FILE_MEMORY_LOCAL) + srcAddr16(i->src(0), false, 9); + emitFlagsRd(i); } @@ -572,21 +727,22 @@ CodeEmitterNV50::emitMOV(const Instruction *i) if (sf == FILE_FLAGS) { code[0] = 0x00000001; code[1] = 0x20000000; - defId(i->def[0], 2); - srcId(i->src[0], 12); + defId(i->def(0), 2); + srcId(i->src(0), 12); emitFlagsRd(i); } else if (sf == FILE_ADDRESS) { code[0] = 0x00000001; code[1] = 0x40000000; - defId(i->def[0], 2); - setARegBits(SDATA(i->src[0]).id + 1); + defId(i->def(0), 2); + setARegBits(SDATA(i->src(0)).id + 1); + emitFlagsRd(i); } else if (df == FILE_FLAGS) { code[0] = 0x00000001; code[1] = 0xa0000000; - defId(i->def[0], 4); - srcId(i->src[0], 9); + defId(i->def(0), 4); + srcId(i->src(0), 9); emitFlagsRd(i); } else if (sf == FILE_IMMEDIATE) { @@ -598,10 +754,12 @@ CodeEmitterNV50::emitMOV(const Instruction *i) code[0] = 0x10008000; } else { code[0] = 0x10000001; - code[1] = 0x04000000 | (i->lanes << 14); + code[1] = (typeSizeof(i->dType) == 2) ? 0 : 0x04000000; + code[1] |= (i->lanes << 14); + emitFlagsRd(i); } - defId(i->def[0], 2); - srcId(i->src[0], 9); + defId(i->def(0), 2); + srcId(i->src(0), 9); } if (df == FILE_SHADER_OUTPUT) { assert(i->encSize == 8); @@ -628,7 +786,7 @@ CodeEmitterNV50::emitQUADOP(const Instruction *i, uint8_t lane, uint8_t quOp) emitForm_ADD(i); if (!i->srcExists(1)) - srcId(i->src[0], 32 + 14); + srcId(i->src(0), 32 + 14); } void @@ -637,8 +795,8 @@ CodeEmitterNV50::emitPFETCH(const Instruction *i) code[0] = 0x11800001; code[1] = 0x04200000 | (0xf << 14); - defId(i->def[0], 2); - srcAddr8(i->src[0], 9); + defId(i->def(0), 2); + srcAddr8(i->src(0), 9); setAReg16(i, 0); } @@ -647,27 +805,27 @@ CodeEmitterNV50::emitINTERP(const Instruction *i) { code[0] = 0x80000000; - defId(i->def[0], 2); - srcAddr8(i->src[0], 16); + defId(i->def(0), 2); + srcAddr8(i->src(0), 16); if (i->getInterpMode() == NV50_IR_INTERP_FLAT) { code[0] |= 1 << 8; } else { if (i->op == OP_PINTERP) { code[0] |= 1 << 25; - srcId(i->src[1], 9); + srcId(i->src(1), 9); } if (i->getSampleMode() == NV50_IR_INTERP_CENTROID) code[0] |= 1 << 24; } if (i->encSize == 8) { - emitFlagsRd(i); - code[1] |= + code[1] = (code[0] & (3 << 24)) >> (24 - 16) | - (code[0] & (1 << 8)) >> (18 - 8); + (code[0] & (1 << 8)) << (18 - 8); code[0] &= ~0x03000100; code[0] |= 1; + emitFlagsRd(i); } } @@ -693,8 +851,8 @@ CodeEmitterNV50::emitMINMAX(const Instruction *i) assert(0); break; } - code[1] |= i->src[0].mod.abs() << 20; - code[1] |= i->src[1].mod.abs() << 19; + code[1] |= i->src(0).mod.abs() << 20; + code[1] |= i->src(1).mod.abs() << 19; } emitForm_MAD(i); } @@ -702,8 +860,8 @@ CodeEmitterNV50::emitMINMAX(const Instruction *i) void CodeEmitterNV50::emitFMAD(const Instruction *i) { - const int neg_mul = i->src[0].mod.neg() ^ i->src[1].mod.neg(); - const int neg_add = i->src[2].mod.neg(); + const int neg_mul = i->src(0).mod.neg() ^ i->src(1).mod.neg(); + const int neg_add = i->src(2).mod.neg(); code[0] = 0xe0000000; @@ -711,30 +869,32 @@ CodeEmitterNV50::emitFMAD(const Instruction *i) emitForm_MUL(i); assert(!neg_mul && !neg_add); } else { - emitForm_MAD(i); - code[1] |= neg_mul << 26; + code[1] = neg_mul << 26; code[1] |= neg_add << 27; if (i->saturate) code[1] |= 1 << 29; + emitForm_MAD(i); } } void CodeEmitterNV50::emitFADD(const Instruction *i) { - const int neg0 = i->src[0].mod.neg(); - const int neg1 = i->src[1].mod.neg() ^ ((i->op == OP_SUB) ? 1 : 0); + const int neg0 = i->src(0).mod.neg(); + const int neg1 = i->src(1).mod.neg() ^ ((i->op == OP_SUB) ? 1 : 0); code[0] = 0xb0000000; - assert(!(i->src[0].mod | i->src[1].mod).abs()); + assert(!(i->src(0).mod | i->src(1).mod).abs()); - if (i->src[1].getFile() == FILE_IMMEDIATE) { + if (i->src(1).getFile() == FILE_IMMEDIATE) { + code[1] = 0; emitForm_IMM(i); code[0] |= neg0 << 15; code[0] |= neg1 << 22; } else if (i->encSize == 8) { + code[1] = 0; emitForm_ADD(i); code[1] |= neg0 << 26; code[1] |= neg1 << 27; @@ -744,27 +904,40 @@ CodeEmitterNV50::emitFADD(const Instruction *i) emitForm_MUL(i); code[0] |= neg0 << 15; code[0] |= neg1 << 22; + if (i->saturate) + code[0] |= 1 << 8; } } void CodeEmitterNV50::emitUADD(const Instruction *i) { + const int neg0 = i->src(0).mod.neg(); + const int neg1 = i->src(1).mod.neg() ^ ((i->op == OP_SUB) ? 1 : 0); + code[0] = 0x20008000; - if (i->src[0].getFile() == FILE_IMMEDIATE) { + if (i->src(1).getFile() == FILE_IMMEDIATE) { + code[1] = 0; emitForm_IMM(i); } else if (i->encSize == 8) { code[0] = 0x20000000; - code[1] = 0x04000000; + code[1] = (typeSizeof(i->dType) == 2) ? 0 : 0x04000000; emitForm_ADD(i); } else { emitForm_MUL(i); } - assert(!(i->src[0].mod.neg() && i->src[1].mod.neg())); - code[0] |= i->src[0].mod.neg() << 28; - code[0] |= i->src[1].mod.neg() << 22; + assert(!(neg0 && neg1)); + code[0] |= neg0 << 28; + code[0] |= neg1 << 22; + + if (i->flagsSrc >= 0) { + // addc == sub | subr + assert(!(code[0] & 0x10400000) && !i->getPredicate()); + code[0] |= 0x10400000; + srcId(i->src(i->flagsSrc), 32 + 12); + } } void @@ -775,30 +948,47 @@ CodeEmitterNV50::emitAADD(const Instruction *i) code[0] = 0xd0000001 | (i->getSrc(s)->reg.data.u16 << 9); code[1] = 0x20000000; - code[0] |= (DDATA(i->def[0]).id + 1) << 2; + code[0] |= (DDATA(i->def(0)).id + 1) << 2; emitFlagsRd(i); if (s && i->srcExists(0)) - setARegBits(SDATA(i->src[0]).id + 1); + setARegBits(SDATA(i->src(0)).id + 1); +} + +void +CodeEmitterNV50::emitIMUL(const Instruction *i) +{ + code[0] = 0x40000000; + + if (i->encSize == 8) { + code[1] = (i->sType == TYPE_S16) ? (0x8000 | 0x4000) : 0x0000; + emitForm_MAD(i); + } else { + if (i->sType == TYPE_S16) + code[0] |= 0x8100; + emitForm_MUL(i); + } } void CodeEmitterNV50::emitFMUL(const Instruction *i) { - const int neg = (i->src[0].mod ^ i->src[1].mod).neg(); + const int neg = (i->src(0).mod ^ i->src(1).mod).neg(); code[0] = 0xc0000000; - if (i->src[0].getFile() == FILE_IMMEDIATE) { + if (i->src(1).getFile() == FILE_IMMEDIATE) { + code[1] = 0; emitForm_IMM(i); if (neg) code[0] |= 0x8000; } else if (i->encSize == 8) { - emitForm_MAD(i); + code[1] = i->rnd == ROUND_Z ? 0x0000c000 : 0; if (neg) code[1] |= 0x08000000; + emitForm_MAD(i); } else { emitForm_MUL(i); if (neg) @@ -807,12 +997,38 @@ CodeEmitterNV50::emitFMUL(const Instruction *i) } void +CodeEmitterNV50::emitIMAD(const Instruction *i) +{ + code[0] = 0x60000000; + if (isSignedType(i->sType)) + code[1] = i->saturate ? 0x40000000 : 0x20000000; + else + code[1] = 0x00000000; + + int neg1 = i->src(0).mod.neg() ^ i->src(1).mod.neg(); + int neg2 = i->src(2).mod.neg(); + + assert(!(neg1 & neg2)); + code[1] |= neg1 << 27; + code[1] |= neg2 << 26; + + emitForm_MAD(i); + + if (i->flagsSrc >= 0) { + // add with carry from $cX + assert(!(code[1] & 0x0c000000) && !i->getPredicate()); + code[1] |= 0xc << 24; + srcId(i->src(i->flagsSrc), 32 + 12); + } +} + +void CodeEmitterNV50::emitSET(const Instruction *i) { code[0] = 0x30000000; code[1] = 0x60000000; - emitCondCode(i->asCmp()->setCond, 32 + 14); + emitCondCode(i->asCmp()->setCond, i->sType, 32 + 14); switch (i->sType) { case TYPE_F32: code[0] |= 0x80000000; break; @@ -824,6 +1040,11 @@ CodeEmitterNV50::emitSET(const Instruction *i) assert(0); break; } + if (i->src(0).mod.neg()) code[1] |= 0x04000000; + if (i->src(1).mod.neg()) code[1] |= 0x08000000; + if (i->src(0).mod.abs()) code[1] |= 0x00100000; + if (i->src(1).mod.abs()) code[1] |= 0x00080000; + emitForm_MAD(i); } @@ -938,6 +1159,7 @@ CodeEmitterNV50::emitCVT(const Instruction *i) assert(0); break; } + break; case TYPE_S16: case TYPE_U16: case TYPE_S8: @@ -958,12 +1180,12 @@ CodeEmitterNV50::emitCVT(const Instruction *i) default: break; } - code[1] ^= i->src[0].mod.neg() << 29; - code[1] |= i->src[0].mod.abs() << 20; + code[1] ^= i->src(0).mod.neg() << 29; + code[1] |= i->src(0).mod.abs() << 20; if (i->saturate) code[1] |= 1 << 19; - assert(i->op != OP_ABS || !i->src[0].mod.neg()); + assert(i->op != OP_ABS || !i->src(0).mod.neg()); emitForm_MAD(i); } @@ -974,8 +1196,8 @@ CodeEmitterNV50::emitPreOp(const Instruction *i) code[0] = 0xb0000000; code[1] = (i->op == OP_PREEX2) ? 0xc0004000 : 0xc0000000; - code[1] |= i->src[0].mod.abs() << 20; - code[1] |= i->src[0].mod.neg() << 26; + code[1] |= i->src(0).mod.abs() << 20; + code[1] |= i->src(0).mod.neg() << 26; emitForm_MAD(i); } @@ -990,18 +1212,37 @@ CodeEmitterNV50::emitSFnOp(const Instruction *i, uint8_t subOp) emitForm_MUL(i); } else { code[1] = subOp << 29; - code[1] |= i->src[0].mod.abs() << 20; - code[1] |= i->src[0].mod.neg() << 26; + code[1] |= i->src(0).mod.abs() << 20; + code[1] |= i->src(0).mod.neg() << 26; emitForm_MAD(i); } } void +CodeEmitterNV50::emitNOT(const Instruction *i) +{ + code[0] = 0xd0000000; + code[1] = 0x0002c000; + + switch (i->sType) { + case TYPE_U32: + case TYPE_S32: + code[1] |= 0x04000000; + break; + default: + break; + } + emitForm_MAD(i); + setSrc(i, 0, 1); +} + +void CodeEmitterNV50::emitLogicOp(const Instruction *i) { code[0] = 0xd0000000; + code[1] = 0; - if (i->src[1].getFile() == FILE_IMMEDIATE) { + if (i->src(1).getFile() == FILE_IMMEDIATE) { switch (i->op) { case OP_OR: code[0] |= 0x0100; break; case OP_XOR: code[0] |= 0x8000; break; @@ -1019,37 +1260,45 @@ CodeEmitterNV50::emitLogicOp(const Instruction *i) assert(0); break; } + if (i->src(0).mod & Modifier(NV50_IR_MOD_NOT)) + code[1] |= 1 << 16; + if (i->src(1).mod & Modifier(NV50_IR_MOD_NOT)) + code[1] |= 1 << 17; + emitForm_MAD(i); } } void -CodeEmitterNV50::emitARL(const Instruction *i) +CodeEmitterNV50::emitARL(const Instruction *i, unsigned int shl) { - assert(i->src[1].getFile() == FILE_IMMEDIATE); - - code[0] = 0x00000001 | (i->getSrc(1)->reg.data.u32 & 0x3f) << 16; + code[0] = 0x00000001 | (shl << 16); code[1] = 0xc0000000; - code[0] |= (DDATA(i->def[0]).id + 1) << 2; - emitSrc0(i->src[0]); + code[0] |= (DDATA(i->def(0)).id + 1) << 2; + + setSrcFileBits(i, NV50_OP_ENC_IMM); + setSrc(i, 0, 0); emitFlagsRd(i); } void CodeEmitterNV50::emitShift(const Instruction *i) { - if (i->def[0].getFile() == FILE_ADDRESS) { - emitARL(i); + if (i->def(0).getFile() == FILE_ADDRESS) { + assert(i->srcExists(1) && i->src(1).getFile() == FILE_IMMEDIATE); + emitARL(i, i->getSrc(1)->reg.data.u32 & 0x3f); } else { code[0] = 0x30000001; code[1] = (i->op == OP_SHR) ? 0xe4000000 : 0xc4000000; - if (isSignedType(i->sType)) + if (i->op == OP_SHR && isSignedType(i->sType)) code[1] |= 1 << 27; - if (i->src[1].getFile() == FILE_IMMEDIATE) { + if (i->src(1).getFile() == FILE_IMMEDIATE) { code[1] |= 1 << 20; code[0] |= (i->getSrc(1)->reg.data.u32 & 0x7f) << 16; + defId(i->def(0), 2); + srcId(i->src(0), 9); emitFlagsRd(i); } else { emitForm_MAD(i); @@ -1080,7 +1329,7 @@ CodeEmitterNV50::emitTEX(const TexInstruction *i) code[1] = 0x40000000; break; case OP_TXF: - code[0] = 0x01000000; + code[0] |= 0x01000000; break; case OP_TXG: code[0] = 0x01000000; @@ -1096,7 +1345,7 @@ CodeEmitterNV50::emitTEX(const TexInstruction *i) int argc = i->tex.target.getArgCount(); - if (i->op == OP_TXB || i->op == OP_TXL) + if (i->op == OP_TXB || i->op == OP_TXL || i->op == OP_TXF) argc += 1; if (i->tex.target.isShadow()) argc += 1; @@ -1108,9 +1357,9 @@ CodeEmitterNV50::emitTEX(const TexInstruction *i) code[0] |= 0x08000000; } else if (i->tex.useOffsets) { - code[1] |= (i->tex.offset[0][0] & 0xf) << 16; + code[1] |= (i->tex.offset[0][0] & 0xf) << 24; code[1] |= (i->tex.offset[0][1] & 0xf) << 20; - code[1] |= (i->tex.offset[0][2] & 0xf) << 24; + code[1] |= (i->tex.offset[0][2] & 0xf) << 16; } code[0] |= (i->tex.mask & 0x3) << 25; @@ -1119,27 +1368,100 @@ CodeEmitterNV50::emitTEX(const TexInstruction *i) if (i->tex.liveOnly) code[1] |= 4; - defId(i->def[0], 2); + defId(i->def(0), 2); + + emitFlagsRd(i); +} + +void +CodeEmitterNV50::emitTXQ(const TexInstruction *i) +{ + assert(i->tex.query == TXQ_DIMS); + + code[0] = 0xf0000001; + code[1] = 0x60000000; + + code[0] |= i->tex.r << 9; + code[0] |= i->tex.s << 17; + + code[0] |= (i->tex.mask & 0x3) << 25; + code[1] |= (i->tex.mask & 0xc) << 12; + + defId(i->def(0), 2); emitFlagsRd(i); } void +CodeEmitterNV50::emitPRERETEmu(const FlowInstruction *i) +{ + uint32_t pos = i->target.bb->binPos + 8; // +8 to skip an op */ + + code[0] = 0x10000003; // bra + code[1] = 0x00000780; // always + + switch (i->subOp) { + case NV50_IR_SUBOP_EMU_PRERET + 0: // bra to the call + break; + case NV50_IR_SUBOP_EMU_PRERET + 1: // bra to skip the call + pos += 8; + break; + default: + assert(i->subOp == (NV50_IR_SUBOP_EMU_PRERET + 2)); + code[0] = 0x20000003; // call + code[1] = 0x00000000; // no predicate + break; + } + addReloc(RelocEntry::TYPE_CODE, 0, pos, 0x07fff800, 9); + addReloc(RelocEntry::TYPE_CODE, 1, pos, 0x000fc000, -4); +} + +void CodeEmitterNV50::emitFlow(const Instruction *i, uint8_t flowOp) { const FlowInstruction *f = i->asFlow(); + bool hasPred = false; + bool hasTarg = false; code[0] = 0x00000003 | (flowOp << 28); code[1] = 0x00000000; - emitFlagsRd(i); + switch (i->op) { + case OP_BRA: + hasPred = true; + hasTarg = true; + break; + case OP_BREAK: + case OP_BRKPT: + case OP_DISCARD: + case OP_RET: + hasPred = true; + break; + case OP_CALL: + case OP_PREBREAK: + case OP_JOINAT: + hasTarg = true; + break; + case OP_PRERET: + hasTarg = true; + if (i->subOp >= NV50_IR_SUBOP_EMU_PRERET) { + emitPRERETEmu(f); + return; + } + break; + default: + break; + } + + if (hasPred) + emitFlagsRd(i); - if (f && f->target.bb) { + if (hasTarg && f) { uint32_t pos; if (f->op == OP_CALL) { if (f->builtin) { - pos = 0; // XXX: TODO + pos = targ->getBuiltinOffset(f->target.builtin); } else { pos = f->target.fn->binPos; } @@ -1149,6 +1471,13 @@ CodeEmitterNV50::emitFlow(const Instruction *i, uint8_t flowOp) code[0] |= ((pos >> 2) & 0xffff) << 11; code[1] |= ((pos >> 18) & 0x003f) << 14; + + RelocEntry::Type relocTy; + + relocTy = f->builtin ? RelocEntry::TYPE_BUILTIN : RelocEntry::TYPE_CODE; + + addReloc(relocTy, 0, pos, 0x07fff800, 9); + addReloc(relocTy, 1, pos, 0x000fc000, -4); } } @@ -1164,10 +1493,15 @@ CodeEmitterNV50::emitInstruction(Instruction *insn) return false; } + if (insn->bb->getProgram()->dbgFlags & NV50_IR_DEBUG_BASIC) { + INFO("EMIT: "); insn->print(); + } + switch (insn->op) { case OP_MOV: emitMOV(insn); break; + case OP_EXIT: case OP_NOP: case OP_JOIN: emitNOP(); @@ -1191,6 +1525,8 @@ CodeEmitterNV50::emitInstruction(Instruction *insn) case OP_SUB: if (isFloatType(insn->dType)) emitFADD(insn); + else if (insn->getDef(0)->reg.file == FILE_ADDRESS) + emitAADD(insn); else emitUADD(insn); break; @@ -1198,18 +1534,30 @@ CodeEmitterNV50::emitInstruction(Instruction *insn) if (isFloatType(insn->dType)) emitFMUL(insn); else - emitUMUL(insn); + emitIMUL(insn); break; case OP_MAD: case OP_FMA: - emitFMAD(insn); + if (isFloatType(insn->dType)) + emitFMAD(insn); + else + emitIMAD(insn); break; + case OP_NOT: + emitNOT(insn); break; case OP_AND: case OP_OR: case OP_XOR: emitLogicOp(insn); break; + case OP_SHL: + case OP_SHR: + emitShift(insn); + break; + case OP_SET: + emitSET(insn); + break; case OP_MIN: case OP_MAX: emitMINMAX(insn); @@ -1217,9 +1565,22 @@ CodeEmitterNV50::emitInstruction(Instruction *insn) case OP_CEIL: case OP_FLOOR: case OP_TRUNC: - case OP_CVT: + case OP_ABS: + case OP_NEG: + case OP_SAT: emitCVT(insn); break; + case OP_CVT: + if (insn->def(0).getFile() == FILE_ADDRESS) + emitARL(insn, 0); + else + if (insn->def(0).getFile() == FILE_FLAGS || + insn->src(0).getFile() == FILE_FLAGS || + insn->src(0).getFile() == FILE_ADDRESS) + emitMOV(insn); + else + emitCVT(insn); + break; case OP_RCP: emitSFnOp(insn, 0); break; @@ -1245,8 +1606,12 @@ CodeEmitterNV50::emitInstruction(Instruction *insn) case OP_TEX: case OP_TXB: case OP_TXL: + case OP_TXF: emitTEX(insn->asTex()); break; + case OP_TXQ: + emitTXQ(insn->asTex()); + break; case OP_EMIT: case OP_RESTART: emitOUT(insn); @@ -1285,15 +1650,15 @@ CodeEmitterNV50::emitInstruction(Instruction *insn) emitQUADOP(insn, insn->lanes, insn->subOp); break; case OP_DFDX: - emitQUADOP(insn, 4, insn->src[0].mod.neg() ? 0x66 : 0x99); + emitQUADOP(insn, 4, insn->src(0).mod.neg() ? 0x66 : 0x99); break; case OP_DFDY: - emitQUADOP(insn, 5, insn->src[0].mod.neg() ? 0x5a : 0xa5); + emitQUADOP(insn, 5, insn->src(0).mod.neg() ? 0x5a : 0xa5); break; case OP_PHI: case OP_UNION: case OP_CONSTRAINT: - ERROR("operation should have been eliminated"); + ERROR("operation should have been eliminated\n"); return false; case OP_EXP: case OP_LOG: @@ -1310,16 +1675,16 @@ CodeEmitterNV50::emitInstruction(Instruction *insn) ERROR("operation should have been lowered\n"); return false; default: - ERROR("unknow op\n"); + ERROR("unknown op: %u\n", insn->op); return false; } - if (insn->join) + if (insn->join || insn->op == OP_JOIN) code[1] |= 0x2; else - if (insn->exit) + if (insn->exit || insn->op == OP_EXIT) code[1] |= 0x1; - assert((insn->encSize == 8) == (code[1] & 1)); + assert((insn->encSize == 8) == (code[0] & 1)); code += insn->encSize / 4; codeSize += insn->encSize; @@ -1331,20 +1696,147 @@ CodeEmitterNV50::getMinEncodingSize(const Instruction *i) const { const Target::OpInfo &info = targ->getOpInfo(i); - if (info.minEncSize == 8) + if (info.minEncSize > 4) + return 8; + + // check constraints on dst and src operands + for (int d = 0; i->defExists(d); ++d) { + if (i->def(d).rep()->reg.data.id > 63 || + i->def(d).rep()->reg.file != FILE_GPR) + return 8; + } + + for (int s = 0; i->srcExists(s); ++s) { + DataFile sf = i->src(s).getFile(); + if (sf != FILE_GPR) + if (sf != FILE_SHADER_INPUT || progType != Program::TYPE_FRAGMENT) + return 8; + if (i->src(s).rep()->reg.data.id > 63) + return 8; + } + + // check modifiers & rounding + if (i->join || i->lanes != 0xf || i->exit) + return 8; + if (i->op == OP_MUL && i->rnd != ROUND_N) return 8; - return 4; + if (i->asTex()) + return 8; // TODO: short tex encoding + + // check constraints on short MAD + if (info.srcNr >= 2 && i->srcExists(2)) { + if (i->saturate || i->src(2).mod) + return 8; + if ((i->src(0).mod ^ i->src(1).mod) || + (i->src(0).mod | i->src(1).mod).abs()) + return 8; + if (!i->defExists(0) || + i->def(0).rep()->reg.data.id != i->src(2).rep()->reg.data.id) + return 8; + } + + return info.minEncSize; +} + +// Change the encoding size of an instruction after BBs have been scheduled. +static void +makeInstructionLong(Instruction *insn) +{ + if (insn->encSize == 8) + return; + Function *fn = insn->bb->getFunction(); + int n = 0; + int adj = 4; + + for (Instruction *i = insn->next; i && i->encSize == 4; ++n, i = i->next); + + if (n & 1) { + adj = 8; + insn->next->encSize = 8; + } else + if (insn->prev && insn->prev->encSize == 4) { + adj = 8; + insn->prev->encSize = 8; + } + insn->encSize = 8; + + for (int i = fn->bbCount - 1; i >= 0 && fn->bbArray[i] != insn->bb; --i) { + fn->bbArray[i]->binPos += 4; + } + fn->binSize += adj; + insn->bb->binSize += adj; +} + +static bool +trySetExitModifier(Instruction *insn) +{ + if (insn->op == OP_DISCARD || + insn->op == OP_QUADON || + insn->op == OP_QUADPOP) + return false; + for (int s = 0; insn->srcExists(s); ++s) + if (insn->src(s).getFile() == FILE_IMMEDIATE) + return false; + if (insn->asFlow()) { + if (insn->op == OP_CALL) // side effects ! + return false; + if (insn->getPredicate()) // cannot do conditional exit (or can we ?) + return false; + insn->op = OP_EXIT; + } + insn->exit = 1; + makeInstructionLong(insn); + return true; +} + +static void +replaceExitWithModifier(Function *func) +{ + BasicBlock *epilogue = BasicBlock::get(func->cfgExit); + + if (!epilogue->getExit() || + epilogue->getExit()->op != OP_EXIT) // only main will use OP_EXIT + return; + + if (epilogue->getEntry()->op != OP_EXIT) { + Instruction *insn = epilogue->getExit()->prev; + if (!insn || !trySetExitModifier(insn)) + return; + insn->exit = 1; + } else { + for (Graph::EdgeIterator ei = func->cfgExit->incident(); + !ei.end(); ei.next()) { + BasicBlock *bb = BasicBlock::get(ei.getNode()); + Instruction *i = bb->getExit(); + + if (!i || !trySetExitModifier(i)) + return; + } + } + epilogue->binSize -= 8; + func->binSize -= 8; + delete_Instruction(func->getProgram(), epilogue->getExit()); +} + +void +CodeEmitterNV50::prepareEmission(Function *func) +{ + CodeEmitter::prepareEmission(func); + + replaceExitWithModifier(func); } -CodeEmitterNV50::CodeEmitterNV50(const Target *target) : targ(target) +CodeEmitterNV50::CodeEmitterNV50(const TargetNV50 *target) : CodeEmitter(target) { + targ = target; // specialized code = NULL; codeSize = codeSizeLimit = 0; + relocInfo = NULL; } CodeEmitter * -Target::getCodeEmitter(Program::Type type) +TargetNV50::getCodeEmitter(Program::Type type) { CodeEmitterNV50 *emit = new CodeEmitterNV50(this); emit->setProgramType(type); diff --git a/src/gallium/drivers/nv50/codegen/nv50_ir_lowering_nv50.cpp b/src/gallium/drivers/nv50/codegen/nv50_ir_lowering_nv50.cpp new file mode 100644 index 00000000000..30d8acee3bc --- /dev/null +++ b/src/gallium/drivers/nv50/codegen/nv50_ir_lowering_nv50.cpp @@ -0,0 +1,1118 @@ +/* + * Copyright 2011 Christoph Bumiller + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF + * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "nv50/codegen/nv50_ir.h" +#include "nv50/codegen/nv50_ir_build_util.h" + +#include "nv50_ir_target_nv50.h" + +namespace nv50_ir { + +// nv50 doesn't support 32 bit integer multiplication +// +// ah al * bh bl = LO32: (al * bh + ah * bl) << 16 + (al * bl) +// ------------------- +// al*bh 00 HI32: (al * bh + ah * bl) >> 16 + (ah * bh) + +// ah*bh 00 00 ( carry1) << 16 + ( carry2) +// al*bl +// ah*bl 00 +// +// fffe0001 + fffe0001 +static bool +expandIntegerMUL(BuildUtil *bld, Instruction *mul) +{ + const bool highResult = mul->subOp == NV50_IR_SUBOP_MUL_HIGH; + + DataType fTy = mul->sType; // full type + DataType hTy; + switch (fTy) { + case TYPE_S32: hTy = TYPE_S16; break; + case TYPE_U32: hTy = TYPE_U16; break; + case TYPE_U64: hTy = TYPE_U32; break; + case TYPE_S64: hTy = TYPE_S32; break; + default: + return false; + } + unsigned int fullSize = typeSizeof(fTy); + unsigned int halfSize = typeSizeof(hTy); + + Instruction *i[9]; + + Value *a[2] = { bld->getSSA(halfSize), bld->getSSA(halfSize) }; + Value *b[2] = { bld->getSSA(halfSize), bld->getSSA(halfSize) }; + Value *c[2]; + Value *t[4]; + for (int j = 0; j < 4; ++j) + t[j] = bld->getSSA(fullSize); + + (i[0] = bld->mkOp1(OP_SPLIT, fTy, a[0], mul->getSrc(0)))->setDef(1, a[1]); + (i[1] = bld->mkOp1(OP_SPLIT, fTy, b[0], mul->getSrc(1)))->setDef(1, b[1]); + + i[2] = bld->mkOp2(OP_MUL, fTy, t[0], a[0], b[1]); + i[3] = bld->mkOp3(OP_MAD, fTy, t[1], a[1], b[0], t[0]); + i[7] = bld->mkOp2(OP_SHL, fTy, t[2], t[1], bld->mkImm(halfSize * 8)); + i[4] = bld->mkOp3(OP_MAD, fTy, t[3], a[0], b[0], t[2]); + + if (highResult) { + Value *r[3]; + Value *imm = bld->loadImm(NULL, 1 << (halfSize * 8)); + c[0] = bld->getSSA(1, FILE_FLAGS); + c[1] = bld->getSSA(1, FILE_FLAGS); + for (int j = 0; j < 3; ++j) + r[j] = bld->getSSA(fullSize); + + i[8] = bld->mkOp2(OP_SHR, fTy, r[0], t[1], bld->mkImm(halfSize * 8)); + i[6] = bld->mkOp2(OP_ADD, fTy, r[1], r[0], imm); + bld->mkOp2(OP_UNION, TYPE_U32, r[2], r[1], r[0]); + i[5] = bld->mkOp3(OP_MAD, fTy, mul->getDef(0), a[1], b[1], r[2]); + + // set carry defs / sources + i[3]->setFlagsDef(1, c[0]); + i[4]->setFlagsDef(0, c[1]); // actual result not required, just the carry + i[6]->setPredicate(CC_C, c[0]); + i[5]->setFlagsSrc(3, c[1]); + } else { + bld->mkMov(mul->getDef(0), t[3]); + } + delete_Instruction(bld->getProgram(), mul); + + for (int j = 2; j <= (highResult ? 5 : 4); ++j) + i[j]->sType = hTy; + + return true; +} + +#define QOP_ADD 0 +#define QOP_SUBR 1 +#define QOP_SUB 2 +#define QOP_MOV2 3 + +#define QUADOP(q, r, s, t) \ + ((QOP_##q << 0) | (QOP_##r << 2) | \ + (QOP_##s << 4) | (QOP_##t << 6)) + +class NV50LegalizePostRA : public Pass +{ +private: + virtual bool visit(Function *); + virtual bool visit(BasicBlock *); + + void handlePRERET(FlowInstruction *); + void replaceZero(Instruction *); + void split64BitOp(Instruction *); + + LValue *r63; +}; + +bool +NV50LegalizePostRA::visit(Function *fn) +{ + Program *prog = fn->getProgram(); + + r63 = new_LValue(fn, FILE_GPR); + r63->reg.data.id = 63; + + // this is actually per-program, but we can do it all on visiting main() + std::list<Instruction *> *outWrites = + reinterpret_cast<std::list<Instruction *> *>(prog->targetPriv); + + if (outWrites) { + for (std::list<Instruction *>::iterator it = outWrites->begin(); + it != outWrites->end(); ++it) + (*it)->getSrc(1)->defs.front()->getInsn()->setDef(0, (*it)->getSrc(0)); + // instructions will be deleted on exit + outWrites->clear(); + } + + return true; +} + +void +NV50LegalizePostRA::replaceZero(Instruction *i) +{ + for (int s = 0; i->srcExists(s); ++s) { + ImmediateValue *imm = i->getSrc(s)->asImm(); + if (imm && imm->reg.data.u64 == 0) + i->setSrc(s, r63); + } +} + +void +NV50LegalizePostRA::split64BitOp(Instruction *i) +{ + if (i->dType == TYPE_F64) { + if (i->op == OP_MAD) + i->op = OP_FMA; + if (i->op == OP_ADD || i->op == OP_MUL || i->op == OP_FMA || + i->op == OP_CVT || i->op == OP_MIN || i->op == OP_MAX || + i->op == OP_SET) + return; + i->dType = i->sType = TYPE_U32; + + i->bb->insertAfter(i, cloneForward(func, i)); + } +} + +// Emulate PRERET: jump to the target and call to the origin from there +// +// WARNING: atm only works if BBs are affected by at most a single PRERET +// +// BB:0 +// preret BB:3 +// (...) +// BB:3 +// (...) +// ---> +// BB:0 +// bra BB:3 + n0 (directly to the call; move to beginning of BB and fixate) +// (...) +// BB:3 +// bra BB:3 + n1 (skip the call) +// call BB:0 + n2 (skip bra at beginning of BB:0) +// (...) +void +NV50LegalizePostRA::handlePRERET(FlowInstruction *pre) +{ + BasicBlock *bbE = pre->bb; + BasicBlock *bbT = pre->target.bb; + + pre->subOp = NV50_IR_SUBOP_EMU_PRERET + 0; + bbE->remove(pre); + bbE->insertHead(pre); + + Instruction *skip = new_FlowInstruction(func, OP_PRERET, bbT); + Instruction *call = new_FlowInstruction(func, OP_PRERET, bbE); + + bbT->insertHead(call); + bbT->insertHead(skip); + + // NOTE: maybe split blocks to prevent the instructions from moving ? + + skip->subOp = NV50_IR_SUBOP_EMU_PRERET + 1; + call->subOp = NV50_IR_SUBOP_EMU_PRERET + 2; +} + +bool +NV50LegalizePostRA::visit(BasicBlock *bb) +{ + Instruction *i, *next; + + // remove pseudo operations and non-fixed no-ops, split 64 bit operations + for (i = bb->getFirst(); i; i = next) { + next = i->next; + if (i->isNop()) { + bb->remove(i); + } else + if (i->op == OP_PRERET && prog->getTarget()->getChipset() < 0xa0) { + handlePRERET(i->asFlow()); + } else { + if (i->op != OP_MOV && i->op != OP_PFETCH && + (!i->defExists(0) || i->def(0).getFile() != FILE_ADDRESS)) + replaceZero(i); + if (typeSizeof(i->dType) == 8) + split64BitOp(i); + } + } + if (!bb->getEntry()) + return true; + + return true; +} + +class NV50LegalizeSSA : public Pass +{ +public: + NV50LegalizeSSA(Program *); + + virtual bool visit(BasicBlock *bb); + +private: + void propagateWriteToOutput(Instruction *); + void handleDIV(Instruction *); + void handleMOD(Instruction *); + void handleMUL(Instruction *); + void handleAddrDef(Instruction *); + + inline bool isARL(const Instruction *) const; + + BuildUtil bld; + + std::list<Instruction *> *outWrites; +}; + +NV50LegalizeSSA::NV50LegalizeSSA(Program *prog) +{ + bld.setProgram(prog); + + if (prog->optLevel >= 2 && + (prog->getType() == Program::TYPE_GEOMETRY || + prog->getType() == Program::TYPE_VERTEX)) + outWrites = + reinterpret_cast<std::list<Instruction *> *>(prog->targetPriv); + else + outWrites = NULL; +} + +void +NV50LegalizeSSA::propagateWriteToOutput(Instruction *st) +{ + if (st->src(0).isIndirect(0) || st->getSrc(1)->refCount() != 1) + return; + + // check def instruction can store + Instruction *di = st->getSrc(1)->defs.front()->getInsn(); + + // TODO: move exports (if beneficial) in common opt pass + if (di->isPseudo() || isTextureOp(di->op) || di->defCount(0xff, true) > 1) + return; + for (int s = 0; di->srcExists(s); ++s) + if (di->src(s).getFile() == FILE_IMMEDIATE) + return; + + // We cannot set defs to non-lvalues before register allocation, so + // save & remove (to save registers) the exports and replace later. + outWrites->push_back(st); + st->bb->remove(st); +} + +bool +NV50LegalizeSSA::isARL(const Instruction *i) const +{ + ImmediateValue imm; + + if (i->op != OP_SHL || i->src(0).getFile() != FILE_GPR) + return false; + if (!i->src(1).getImmediate(imm)) + return false; + return imm.isInteger(0); +} + +void +NV50LegalizeSSA::handleAddrDef(Instruction *i) +{ + Instruction *arl; + + i->getDef(0)->reg.size = 2; // $aX are only 16 bit + + // only ADDR <- SHL(GPR, IMM) and ADDR <- ADD(ADDR, IMM) are valid + if (i->srcExists(1) && i->src(1).getFile() == FILE_IMMEDIATE) { + if (i->op == OP_SHL && i->src(0).getFile() == FILE_GPR) + return; + if (i->op == OP_ADD && i->src(0).getFile() == FILE_ADDRESS) + return; + } + + // turn $a sources into $r sources (can't operate on $a) + for (int s = 0; i->srcExists(s); ++s) { + Value *a = i->getSrc(s); + Value *r; + if (a->reg.file == FILE_ADDRESS) { + if (a->getInsn() && isARL(a->getInsn())) { + i->setSrc(s, a->getInsn()->getSrc(0)); + } else { + bld.setPosition(i, false); + r = bld.getSSA(); + bld.mkMov(r, a); + i->setSrc(s, r); + } + } + } + if (i->op == OP_SHL && i->src(1).getFile() == FILE_IMMEDIATE) + return; + + // turn result back into $a + bld.setPosition(i, true); + arl = bld.mkOp2(OP_SHL, TYPE_U32, i->getDef(0), bld.getSSA(), bld.mkImm(0)); + i->setDef(0, arl->getSrc(0)); +} + +void +NV50LegalizeSSA::handleMUL(Instruction *mul) +{ + if (isFloatType(mul->sType) || typeSizeof(mul->sType) <= 2) + return; + Value *def = mul->getDef(0); + Value *pred = mul->getPredicate(); + CondCode cc = mul->cc; + if (pred) + mul->setPredicate(CC_ALWAYS, NULL); + + if (mul->op == OP_MAD) { + Instruction *add = mul; + bld.setPosition(add, false); + Value *res = cloneShallow(func, mul->getDef(0)); + mul = bld.mkOp2(OP_MUL, add->sType, res, add->getSrc(0), add->getSrc(1)); + add->op = OP_ADD; + add->setSrc(0, mul->getDef(0)); + add->setSrc(1, add->getSrc(2)); + for (int s = 2; add->srcExists(s); ++s) + add->setSrc(s, NULL); + mul->subOp = add->subOp; + add->subOp = 0; + } + expandIntegerMUL(&bld, mul); + if (pred) + def->getInsn()->setPredicate(cc, pred); +} + +// Use f32 division: first compute an approximate result, use it to reduce +// the dividend, which should then be representable as f32, divide the reduced +// dividend, and add the quotients. +void +NV50LegalizeSSA::handleDIV(Instruction *div) +{ + const DataType ty = div->sType; + + if (ty != TYPE_U32 && ty != TYPE_S32) + return; + + Value *q, *q0, *qf, *aR, *aRf, *qRf, *qR, *t, *s, *m, *cond; + + bld.setPosition(div, false); + + Value *a, *af = bld.getSSA(); + Value *b, *bf = bld.getSSA(); + + bld.mkCvt(OP_CVT, TYPE_F32, af, ty, div->getSrc(0)); + bld.mkCvt(OP_CVT, TYPE_F32, bf, ty, div->getSrc(1)); + + if (isSignedType(ty)) { + af->getInsn()->src(0).mod = Modifier(NV50_IR_MOD_ABS); + bf->getInsn()->src(0).mod = Modifier(NV50_IR_MOD_ABS); + a = bld.getSSA(); + b = bld.getSSA(); + bld.mkOp1(OP_ABS, ty, a, div->getSrc(0)); + bld.mkOp1(OP_ABS, ty, b, div->getSrc(1)); + } else { + a = div->getSrc(0); + b = div->getSrc(1); + } + + bf = bld.mkOp1v(OP_RCP, TYPE_F32, bld.getSSA(), bf); + bf = bld.mkOp2v(OP_ADD, TYPE_U32, bld.getSSA(), bf, bld.mkImm(-2)); + + bld.mkOp2(OP_MUL, TYPE_F32, (qf = bld.getSSA()), af, bf)->rnd = ROUND_Z; + bld.mkCvt(OP_CVT, ty, (q0 = bld.getSSA()), TYPE_F32, qf)->rnd = ROUND_Z; + + // get error of 1st result + expandIntegerMUL(&bld, + bld.mkOp2(OP_MUL, TYPE_U32, (t = bld.getSSA()), q0, b)); + bld.mkOp2(OP_SUB, TYPE_U32, (aRf = bld.getSSA()), a, t); + + bld.mkCvt(OP_CVT, TYPE_F32, (aR = bld.getSSA()), TYPE_U32, aRf); + + bld.mkOp2(OP_MUL, TYPE_F32, (qRf = bld.getSSA()), aR, bf)->rnd = ROUND_Z; + bld.mkCvt(OP_CVT, TYPE_U32, (qR = bld.getSSA()), TYPE_F32, qRf) + ->rnd = ROUND_Z; + bld.mkOp2(OP_ADD, ty, (q = bld.getSSA()), q0, qR); // add quotients + + // correction: if modulus >= divisor, add 1 + expandIntegerMUL(&bld, + bld.mkOp2(OP_MUL, TYPE_U32, (t = bld.getSSA()), q, b)); + bld.mkOp2(OP_SUB, TYPE_U32, (m = bld.getSSA()), a, t); + bld.mkCmp(OP_SET, CC_GE, TYPE_U32, (s = bld.getSSA()), m, b); + if (!isSignedType(ty)) { + div->op = OP_SUB; + div->setSrc(0, q); + div->setSrc(1, s); + } else { + t = q; + bld.mkOp2(OP_SUB, TYPE_U32, (q = bld.getSSA()), t, s); + s = bld.getSSA(); + t = bld.getSSA(); + // fix the sign + bld.mkOp2(OP_XOR, TYPE_U32, NULL, div->getSrc(0), div->getSrc(1)) + ->setFlagsDef(0, (cond = bld.getSSA(1, FILE_FLAGS))); + bld.mkOp1(OP_NEG, ty, s, q)->setPredicate(CC_S, cond); + bld.mkOp1(OP_MOV, ty, t, q)->setPredicate(CC_NS, cond); + + div->op = OP_UNION; + div->setSrc(0, s); + div->setSrc(1, t); + } +} + +void +NV50LegalizeSSA::handleMOD(Instruction *mod) +{ + if (mod->dType != TYPE_U32 && mod->dType != TYPE_S32) + return; + bld.setPosition(mod, false); + + Value *q = bld.getSSA(); + Value *m = bld.getSSA(); + + bld.mkOp2(OP_DIV, mod->dType, q, mod->getSrc(0), mod->getSrc(1)); + handleDIV(q->getInsn()); + + bld.setPosition(mod, false); + expandIntegerMUL(&bld, bld.mkOp2(OP_MUL, TYPE_U32, m, q, mod->getSrc(1))); + + mod->op = OP_SUB; + mod->setSrc(1, m); +} + +bool +NV50LegalizeSSA::visit(BasicBlock *bb) +{ + Instruction *insn, *next; + // skipping PHIs (don't pass them to handleAddrDef) ! + for (insn = bb->getEntry(); insn; insn = next) { + next = insn->next; + + switch (insn->op) { + case OP_EXPORT: + if (outWrites) + propagateWriteToOutput(insn); + break; + case OP_DIV: + handleDIV(insn); + break; + case OP_MOD: + handleMOD(insn); + break; + case OP_MAD: + case OP_MUL: + handleMUL(insn); + break; + default: + break; + } + + if (insn->defExists(0) && insn->getDef(0)->reg.file == FILE_ADDRESS) + handleAddrDef(insn); + } + return true; +} + +class NV50LoweringPreSSA : public Pass +{ +public: + NV50LoweringPreSSA(Program *); + +private: + virtual bool visit(Instruction *); + virtual bool visit(Function *); + + bool handleRDSV(Instruction *); + bool handleWRSV(Instruction *); + + bool handleEXPORT(Instruction *); + + bool handleMUL(Instruction *); + bool handleDIV(Instruction *); + bool handleSQRT(Instruction *); + bool handlePOW(Instruction *); + + bool handleSET(Instruction *); + bool handleSLCT(CmpInstruction *); + bool handleSELP(Instruction *); + + bool handleTEX(TexInstruction *); + bool handleTXB(TexInstruction *); // I really + bool handleTXL(TexInstruction *); // hate + bool handleTXD(TexInstruction *); // these 3 + + bool handleCALL(Instruction *); + bool handlePRECONT(Instruction *); + bool handleCONT(Instruction *); + + void checkPredicate(Instruction *); + +private: + const Target *const targ; + + BuildUtil bld; + + Value *tid; +}; + +NV50LoweringPreSSA::NV50LoweringPreSSA(Program *prog) : + targ(prog->getTarget()), tid(NULL) +{ + bld.setProgram(prog); +} + +bool +NV50LoweringPreSSA::visit(Function *f) +{ + BasicBlock *root = BasicBlock::get(func->cfg.getRoot()); + + if (prog->getType() == Program::TYPE_COMPUTE) { + // Add implicit "thread id" argument in $r0 to the function + Value *arg = new_LValue(func, FILE_GPR); + arg->reg.data.id = 0; + f->ins.push_back(arg); + + bld.setPosition(root, false); + tid = bld.mkMov(bld.getScratch(), arg, TYPE_U32)->getDef(0); + } + + return true; +} + +// move array source to first slot, convert to u16, add indirections +bool +NV50LoweringPreSSA::handleTEX(TexInstruction *i) +{ + const int arg = i->tex.target.getArgCount(); + const int dref = arg; + const int lod = i->tex.target.isShadow() ? (arg + 1) : arg; + + // dref comes before bias/lod + if (i->tex.target.isShadow()) + if (i->op == OP_TXB || i->op == OP_TXL) + i->swapSources(dref, lod); + + // array index must be converted to u32 + if (i->tex.target.isArray()) { + Value *layer = i->getSrc(arg - 1); + LValue *src = new_LValue(func, FILE_GPR); + bld.mkCvt(OP_CVT, TYPE_U16, src, TYPE_F32, layer); + i->setSrc(arg - 1, src); + + if (i->tex.target.isCube()) { + // Value *face = layer; + Value *x, *y; + x = new_LValue(func, FILE_GPR); + y = new_LValue(func, FILE_GPR); + layer = new_LValue(func, FILE_GPR); + + i->tex.target = TEX_TARGET_2D_ARRAY; + + // TODO: use TEXPREP to convert x,y,z,face -> x,y,layer + bld.mkMov(x, i->getSrc(0)); + bld.mkMov(y, i->getSrc(1)); + bld.mkMov(layer, i->getSrc(3)); + + i->setSrc(0, x); + i->setSrc(1, y); + i->setSrc(2, layer); + i->setSrc(3, i->getSrc(4)); + i->setSrc(4, NULL); + } + } + + // texel offsets are 3 immediate fields in the instruction, + // nv50 cannot do textureGatherOffsets + assert(i->tex.useOffsets <= 1); + + return true; +} + +// Bias must be equal for all threads of a quad or lod calculation will fail. +// +// The lanes of a quad are grouped by the bit in the condition register they +// have set, which is selected by differing bias values. +// Move the input values for TEX into a new register set for each group and +// execute TEX only for a specific group. +// We always need to use 4 new registers for the inputs/outputs because the +// implicitly calculated derivatives must be correct. +// +// TODO: move to SSA phase so we can easily determine whether bias is constant +bool +NV50LoweringPreSSA::handleTXB(TexInstruction *i) +{ + const CondCode cc[4] = { CC_EQU, CC_S, CC_C, CC_O }; + int l, d; + + handleTEX(i); + Value *bias = i->getSrc(i->tex.target.getArgCount()); + if (bias->isUniform()) + return true; + + Instruction *cond = bld.mkOp1(OP_UNION, TYPE_U32, bld.getScratch(), + bld.loadImm(NULL, 1)); + bld.setPosition(cond, false); + + for (l = 1; l < 4; ++l) { + const uint8_t qop = QUADOP(SUBR, SUBR, SUBR, SUBR); + Value *bit = bld.getSSA(); + Value *pred = bld.getScratch(1, FILE_FLAGS); + Value *imm = bld.loadImm(NULL, (1 << l)); + bld.mkQuadop(qop, pred, l, bias, bias)->flagsDef = 0; + bld.mkMov(bit, imm)->setPredicate(CC_EQ, pred); + cond->setSrc(l, bit); + } + Value *flags = bld.getScratch(1, FILE_FLAGS); + bld.setPosition(cond, true); + bld.mkCvt(OP_CVT, TYPE_U8, flags, TYPE_U32, cond->getDef(0)); + + Instruction *tex[4]; + for (l = 0; l < 4; ++l) { + (tex[l] = cloneForward(func, i))->setPredicate(cc[l], flags); + bld.insert(tex[l]); + } + + Value *res[4][4]; + for (d = 0; i->defExists(d); ++d) + res[0][d] = tex[0]->getDef(d); + for (l = 1; l < 4; ++l) { + for (d = 0; tex[l]->defExists(d); ++d) { + res[l][d] = cloneShallow(func, res[0][d]); + bld.mkMov(res[l][d], tex[l]->getDef(d))->setPredicate(cc[l], flags); + } + } + + for (d = 0; i->defExists(d); ++d) { + Instruction *dst = bld.mkOp(OP_UNION, TYPE_U32, i->getDef(d)); + for (l = 0; l < 4; ++l) + dst->setSrc(l, res[l][d]); + } + delete_Instruction(prog, i); + return true; +} + +// LOD must be equal for all threads of a quad. +// Unlike with TXB, here we can just diverge since there's no LOD calculation +// that would require all 4 threads' sources to be set up properly. +bool +NV50LoweringPreSSA::handleTXL(TexInstruction *i) +{ + handleTEX(i); + Value *lod = i->getSrc(i->tex.target.getArgCount()); + if (lod->isUniform()) + return true; + + BasicBlock *currBB = i->bb; + BasicBlock *texiBB = i->bb->splitBefore(i, false); + BasicBlock *joinBB = i->bb->splitAfter(i); + + currBB->joinAt = bld.mkFlow(OP_JOINAT, joinBB, CC_ALWAYS, NULL); + + for (int l = 0; l <= 3; ++l) { + const uint8_t qop = QUADOP(SUBR, SUBR, SUBR, SUBR); + Value *pred = bld.getScratch(1, FILE_FLAGS); + bld.setPosition(currBB, true); + bld.mkQuadop(qop, pred, l, lod, lod)->flagsDef = 0; + bld.mkFlow(OP_BRA, texiBB, CC_EQ, pred)->fixed = 1; + currBB->cfg.attach(&texiBB->cfg, Graph::Edge::FORWARD); + if (l <= 2) { + BasicBlock *laneBB = new BasicBlock(func); + currBB->cfg.attach(&laneBB->cfg, Graph::Edge::TREE); + currBB = laneBB; + } + } + bld.setPosition(joinBB, false); + bld.mkOp(OP_JOIN, TYPE_NONE, NULL); + return true; +} + +bool +NV50LoweringPreSSA::handleTXD(TexInstruction *i) +{ + static const uint8_t qOps[4][2] = + { + { QUADOP(MOV2, ADD, MOV2, ADD), QUADOP(MOV2, MOV2, ADD, ADD) }, // l0 + { QUADOP(SUBR, MOV2, SUBR, MOV2), QUADOP(MOV2, MOV2, ADD, ADD) }, // l1 + { QUADOP(MOV2, ADD, MOV2, ADD), QUADOP(SUBR, SUBR, MOV2, MOV2) }, // l2 + { QUADOP(SUBR, MOV2, SUBR, MOV2), QUADOP(SUBR, SUBR, MOV2, MOV2) }, // l3 + }; + Value *def[4][4]; + Value *crd[3]; + Instruction *tex; + Value *zero = bld.loadImm(bld.getSSA(), 0); + int l, c; + const int dim = i->tex.target.getDim(); + + handleTEX(i); + i->op = OP_TEX; // no need to clone dPdx/dPdy later + + for (c = 0; c < dim; ++c) + crd[c] = bld.getScratch(); + + bld.mkOp(OP_QUADON, TYPE_NONE, NULL); + for (l = 0; l < 4; ++l) { + // mov coordinates from lane l to all lanes + for (c = 0; c < dim; ++c) + bld.mkQuadop(0x00, crd[c], l, i->getSrc(c), zero); + // add dPdx from lane l to lanes dx + for (c = 0; c < dim; ++c) + bld.mkQuadop(qOps[l][0], crd[c], l, i->dPdx[c].get(), crd[c]); + // add dPdy from lane l to lanes dy + for (c = 0; c < dim; ++c) + bld.mkQuadop(qOps[l][1], crd[c], l, i->dPdy[c].get(), crd[c]); + // texture + bld.insert(tex = cloneForward(func, i)); + for (c = 0; c < dim; ++c) + tex->setSrc(c, crd[c]); + // save results + for (c = 0; i->defExists(c); ++c) { + Instruction *mov; + def[c][l] = bld.getSSA(); + mov = bld.mkMov(def[c][l], tex->getDef(c)); + mov->fixed = 1; + mov->lanes = 1 << l; + } + } + bld.mkOp(OP_QUADPOP, TYPE_NONE, NULL); + + for (c = 0; i->defExists(c); ++c) { + Instruction *u = bld.mkOp(OP_UNION, TYPE_U32, i->getDef(c)); + for (l = 0; l < 4; ++l) + u->setSrc(l, def[c][l]); + } + + i->bb->remove(i); + return true; +} + +bool +NV50LoweringPreSSA::handleSET(Instruction *i) +{ + if (i->dType == TYPE_F32) { + bld.setPosition(i, true); + i->dType = TYPE_U32; + bld.mkOp1(OP_ABS, TYPE_S32, i->getDef(0), i->getDef(0)); + bld.mkCvt(OP_CVT, TYPE_F32, i->getDef(0), TYPE_S32, i->getDef(0)); + } + return true; +} + +bool +NV50LoweringPreSSA::handleSLCT(CmpInstruction *i) +{ + Value *src0 = bld.getSSA(); + Value *src1 = bld.getSSA(); + Value *pred = bld.getScratch(1, FILE_FLAGS); + + Value *v0 = i->getSrc(0); + Value *v1 = i->getSrc(1); + // XXX: these probably shouldn't be immediates in the first place ... + if (v0->asImm()) + v0 = bld.mkMov(bld.getSSA(), v0)->getDef(0); + if (v1->asImm()) + v1 = bld.mkMov(bld.getSSA(), v1)->getDef(0); + + bld.setPosition(i, true); + bld.mkMov(src0, v0)->setPredicate(CC_NE, pred); + bld.mkMov(src1, v1)->setPredicate(CC_EQ, pred); + bld.mkOp2(OP_UNION, i->dType, i->getDef(0), src0, src1); + + bld.setPosition(i, false); + i->op = OP_SET; + i->setFlagsDef(0, pred); + i->dType = TYPE_U8; + i->setSrc(0, i->getSrc(2)); + i->setSrc(2, NULL); + i->setSrc(1, bld.loadImm(NULL, 0)); + + return true; +} + +bool +NV50LoweringPreSSA::handleSELP(Instruction *i) +{ + Value *src0 = bld.getSSA(); + Value *src1 = bld.getSSA(); + + Value *v0 = i->getSrc(0); + Value *v1 = i->getSrc(1); + if (v0->asImm()) + v0 = bld.mkMov(bld.getSSA(), v0)->getDef(0); + if (v1->asImm()) + v1 = bld.mkMov(bld.getSSA(), v1)->getDef(0); + + bld.mkMov(src0, v0)->setPredicate(CC_NE, i->getSrc(2)); + bld.mkMov(src1, v1)->setPredicate(CC_EQ, i->getSrc(2)); + bld.mkOp2(OP_UNION, i->dType, i->getDef(0), src0, src1); + delete_Instruction(prog, i); + return true; +} + +bool +NV50LoweringPreSSA::handleWRSV(Instruction *i) +{ + Symbol *sym = i->getSrc(0)->asSym(); + + // these are all shader outputs, $sreg are not writeable + uint32_t addr = targ->getSVAddress(FILE_SHADER_OUTPUT, sym); + if (addr >= 0x400) + return false; + sym = bld.mkSymbol(FILE_SHADER_OUTPUT, 0, i->sType, addr); + + bld.mkStore(OP_EXPORT, i->dType, sym, i->getIndirect(0, 0), i->getSrc(1)); + + bld.getBB()->remove(i); + return true; +} + +bool +NV50LoweringPreSSA::handleCALL(Instruction *i) +{ + if (prog->getType() == Program::TYPE_COMPUTE) { + // Add implicit "thread id" argument in $r0 to the function + i->setSrc(i->srcCount(), tid); + } + return true; +} + +bool +NV50LoweringPreSSA::handlePRECONT(Instruction *i) +{ + delete_Instruction(prog, i); + return true; +} + +bool +NV50LoweringPreSSA::handleCONT(Instruction *i) +{ + i->op = OP_BRA; + return true; +} + +bool +NV50LoweringPreSSA::handleRDSV(Instruction *i) +{ + Symbol *sym = i->getSrc(0)->asSym(); + uint32_t addr = targ->getSVAddress(FILE_SHADER_INPUT, sym); + Value *def = i->getDef(0); + SVSemantic sv = sym->reg.data.sv.sv; + int idx = sym->reg.data.sv.index; + + if (addr >= 0x400) // mov $sreg + return true; + + switch (sv) { + case SV_POSITION: + assert(prog->getType() == Program::TYPE_FRAGMENT); + bld.mkInterp(NV50_IR_INTERP_LINEAR, i->getDef(0), addr, NULL); + break; + case SV_FACE: + bld.mkInterp(NV50_IR_INTERP_FLAT, def, addr, NULL); + if (i->dType == TYPE_F32) { + bld.mkOp2(OP_AND, TYPE_U32, def, def, bld.mkImm(0x80000000)); + bld.mkOp2(OP_XOR, TYPE_U32, def, def, bld.mkImm(0xbf800000)); + } + break; + case SV_NCTAID: + case SV_CTAID: + case SV_NTID: + if ((sv == SV_NCTAID && idx >= 2) || + (sv == SV_NTID && idx >= 3)) { + bld.mkMov(def, bld.mkImm(1)); + } else if (sv == SV_CTAID && idx >= 2) { + bld.mkMov(def, bld.mkImm(0)); + } else { + Value *x = bld.getSSA(2); + bld.mkOp1(OP_LOAD, TYPE_U16, x, + bld.mkSymbol(FILE_MEMORY_SHARED, 0, TYPE_U16, addr)); + bld.mkCvt(OP_CVT, TYPE_U32, def, TYPE_U16, x); + } + break; + case SV_TID: + if (idx == 0) { + bld.mkOp2(OP_AND, TYPE_U32, def, tid, bld.mkImm(0x0000ffff)); + } else if (idx == 1) { + bld.mkOp2(OP_AND, TYPE_U32, def, tid, bld.mkImm(0x03ff0000)); + bld.mkOp2(OP_SHR, TYPE_U32, def, def, bld.mkImm(16)); + } else if (idx == 2) { + bld.mkOp2(OP_SHR, TYPE_U32, def, tid, bld.mkImm(26)); + } else { + bld.mkMov(def, bld.mkImm(0)); + } + break; + default: + bld.mkFetch(i->getDef(0), i->dType, + FILE_SHADER_INPUT, addr, i->getIndirect(0, 0), NULL); + break; + } + bld.getBB()->remove(i); + return true; +} + +bool +NV50LoweringPreSSA::handleMUL(Instruction *i) +{ + if (!isFloatType(i->dType) && typeSizeof(i->sType) > 2) + return expandIntegerMUL(&bld, i); + return true; +} + +bool +NV50LoweringPreSSA::handleDIV(Instruction *i) +{ + if (!isFloatType(i->dType)) + return true; + bld.setPosition(i, false); + Instruction *rcp = bld.mkOp1(OP_RCP, i->dType, bld.getSSA(), i->getSrc(1)); + i->op = OP_MUL; + i->setSrc(1, rcp->getDef(0)); + return true; +} + +bool +NV50LoweringPreSSA::handleSQRT(Instruction *i) +{ + Instruction *rsq = bld.mkOp1(OP_RSQ, TYPE_F32, + bld.getSSA(), i->getSrc(0)); + i->op = OP_MUL; + i->setSrc(1, rsq->getDef(0)); + + return true; +} + +bool +NV50LoweringPreSSA::handlePOW(Instruction *i) +{ + LValue *val = bld.getScratch(); + + bld.mkOp1(OP_LG2, TYPE_F32, val, i->getSrc(0)); + bld.mkOp2(OP_MUL, TYPE_F32, val, i->getSrc(1), val)->dnz = 1; + bld.mkOp1(OP_PREEX2, TYPE_F32, val, val); + + i->op = OP_EX2; + i->setSrc(0, val); + i->setSrc(1, NULL); + + return true; +} + +bool +NV50LoweringPreSSA::handleEXPORT(Instruction *i) +{ + if (prog->getType() == Program::TYPE_FRAGMENT) { + if (i->getIndirect(0, 0)) { + // TODO: redirect to l[] here, load to GPRs at exit + return false; + } else { + int id = i->getSrc(0)->reg.data.offset / 4; // in 32 bit reg units + + i->op = OP_MOV; + i->src(0).set(i->src(1)); + i->setSrc(1, NULL); + i->setDef(0, new_LValue(func, FILE_GPR)); + i->getDef(0)->reg.data.id = id; + + prog->maxGPR = MAX2(prog->maxGPR, id); + } + } + return true; +} + +// Set flags according to predicate and make the instruction read $cX. +void +NV50LoweringPreSSA::checkPredicate(Instruction *insn) +{ + Value *pred = insn->getPredicate(); + Value *cdst; + + if (!pred || pred->reg.file == FILE_FLAGS) + return; + cdst = bld.getSSA(1, FILE_FLAGS); + + bld.mkCmp(OP_SET, CC_NEU, TYPE_U32, cdst, bld.loadImm(NULL, 0), pred); + + insn->setPredicate(insn->cc, cdst); +} + +// +// - add quadop dance for texturing +// - put FP outputs in GPRs +// - convert instruction sequences +// +bool +NV50LoweringPreSSA::visit(Instruction *i) +{ + if (i->prev) + bld.setPosition(i->prev, true); + else + if (i->next) + bld.setPosition(i->next, false); + else + bld.setPosition(i->bb, true); + + if (i->cc != CC_ALWAYS) + checkPredicate(i); + + switch (i->op) { + case OP_TEX: + case OP_TXF: + case OP_TXG: + return handleTEX(i->asTex()); + case OP_TXB: + return handleTXB(i->asTex()); + case OP_TXL: + return handleTXL(i->asTex()); + case OP_TXD: + return handleTXD(i->asTex()); + case OP_EX2: + bld.mkOp1(OP_PREEX2, TYPE_F32, i->getDef(0), i->getSrc(0)); + i->setSrc(0, i->getDef(0)); + break; + case OP_SET: + return handleSET(i); + case OP_SLCT: + return handleSLCT(i->asCmp()); + case OP_SELP: + return handleSELP(i); + case OP_POW: + return handlePOW(i); + case OP_MUL: + return handleMUL(i); + case OP_DIV: + return handleDIV(i); + case OP_SQRT: + return handleSQRT(i); + case OP_EXPORT: + return handleEXPORT(i); + case OP_RDSV: + return handleRDSV(i); + case OP_WRSV: + return handleWRSV(i); + case OP_CALL: + return handleCALL(i); + case OP_PRECONT: + return handlePRECONT(i); + case OP_CONT: + return handleCONT(i); + default: + break; + } + return true; +} + +bool +TargetNV50::runLegalizePass(Program *prog, CGStage stage) const +{ + bool ret = false; + + if (stage == CG_STAGE_PRE_SSA) { + NV50LoweringPreSSA pass(prog); + ret = pass.run(prog, false, true); + } else + if (stage == CG_STAGE_SSA) { + if (!prog->targetPriv) + prog->targetPriv = new std::list<Instruction *>(); + NV50LegalizeSSA pass(prog); + ret = pass.run(prog, false, true); + } else + if (stage == CG_STAGE_POST_RA) { + NV50LegalizePostRA pass; + ret = pass.run(prog, false, true); + if (prog->targetPriv) + delete reinterpret_cast<std::list<Instruction *> *>(prog->targetPriv); + } + return ret; +} + +} // namespace nv50_ir diff --git a/src/gallium/drivers/nv50/codegen/nv50_ir_print.cpp b/src/gallium/drivers/nv50/codegen/nv50_ir_print.cpp index 61382336bc4..c6134465996 100644 --- a/src/gallium/drivers/nv50/codegen/nv50_ir_print.cpp +++ b/src/gallium/drivers/nv50/codegen/nv50_ir_print.cpp @@ -265,7 +265,7 @@ int Modifier::print(char *buf, size_t size) const return pos; } - + int LValue::print(char *buf, size_t size, DataType ty) const { const char *postFix = ""; @@ -278,14 +278,23 @@ int LValue::print(char *buf, size_t size, DataType ty) const switch (reg.file) { case FILE_GPR: r = 'r'; col = TXT_GPR; - if (reg.size == 8) + if (reg.size == 2) { + if (p == '$') { + postFix = (idx & 1) ? "h" : "l"; + idx /= 2; + } else { + postFix = "s"; + } + } else + if (reg.size == 8) { postFix = "d"; - else - if (reg.size == 16) + } else + if (reg.size == 16) { postFix = "q"; - else - if (reg.size == 12) + } else + if (reg.size == 12) { postFix = "t"; + } break; case FILE_PREDICATE: r = 'p'; col = TXT_REGISTER; @@ -419,7 +428,7 @@ void Instruction::print() const } else { PRINT("%s", CondCodeStr[cc]); } - if (pos > pre + 1) + if (pos > pre) SPACE(); pos += getSrc(predSrc)->print(&buf[pos], BUFSZ - pos); PRINT(" %s", colour[TXT_INSN]); @@ -489,6 +498,8 @@ void Instruction::print() const else pos += getSrc(s)->print(&buf[pos], BUFSZ - pos, sType); } + if (exit) + PRINT("%s exit", colour[TXT_INSN]); PRINT("%s", colour[TXT_DEFAULT]); diff --git a/src/gallium/drivers/nv50/codegen/nv50_ir_target.cpp b/src/gallium/drivers/nv50/codegen/nv50_ir_target.cpp index 598e0d26384..27b9610ed52 100644 --- a/src/gallium/drivers/nv50/codegen/nv50_ir_target.cpp +++ b/src/gallium/drivers/nv50/codegen/nv50_ir_target.cpp @@ -54,6 +54,7 @@ const uint8_t Target::operationSrcNr[OP_LAST + 1] = extern Target *getTargetNVC0(unsigned int chipset); +extern Target *getTargetNV50(unsigned int chipset); Target *Target::create(unsigned int chipset) { @@ -65,6 +66,7 @@ Target *Target::create(unsigned int chipset) case 0x80: case 0x90: case 0xa0: + return getTargetNV50(chipset); default: ERROR("unsupported target: NV%x\n", chipset); return 0; @@ -76,6 +78,10 @@ void Target::destroy(Target *targ) delete targ; } +CodeEmitter::CodeEmitter(const Target *target) : targ(target) +{ +} + void CodeEmitter::setCodeLocation(void *ptr, uint32_t size) { @@ -261,6 +267,10 @@ Program::emitBinary(struct nv50_ir_prog_info *info) emitSymbolTable(info); + // the nvc0 driver will print the binary iself together with the header + if ((dbgFlags & NV50_IR_DEBUG_BASIC) && getTarget()->getChipset() < 0xc0) + emit->printBinary(); + delete emit; return true; } diff --git a/src/gallium/drivers/nv50/codegen/nv50_ir_target.h b/src/gallium/drivers/nv50/codegen/nv50_ir_target.h index 6640198f090..88996ebbde3 100644 --- a/src/gallium/drivers/nv50/codegen/nv50_ir_target.h +++ b/src/gallium/drivers/nv50/codegen/nv50_ir_target.h @@ -61,6 +61,8 @@ struct RelocInfo class CodeEmitter { public: + CodeEmitter(const Target *); + // returns whether the instruction was encodable and written virtual bool emitInstruction(Instruction *) = 0; @@ -76,12 +78,14 @@ public: inline void *getRelocInfo() const { return relocInfo; } void prepareEmission(Program *); - void prepareEmission(Function *); + virtual void prepareEmission(Function *); virtual void prepareEmission(BasicBlock *); void printBinary() const; protected: + const Target *targ; + uint32_t *code; uint32_t codeSize; uint32_t codeSizeLimit; @@ -105,6 +109,8 @@ public: // The address chosen is supplied to the relocation routine. virtual void getBuiltinCode(const uint32_t **code, uint32_t *size) const = 0; + virtual void parseDriverInfo(const struct nv50_ir_prog_info *info) { } + virtual bool runLegalizePass(Program *, CGStage stage) const = 0; public: diff --git a/src/gallium/drivers/nv50/codegen/nv50_ir_target_nv50.cpp b/src/gallium/drivers/nv50/codegen/nv50_ir_target_nv50.cpp new file mode 100644 index 00000000000..a64f7f72255 --- /dev/null +++ b/src/gallium/drivers/nv50/codegen/nv50_ir_target_nv50.cpp @@ -0,0 +1,531 @@ +/* + * Copyright 2011 Christoph Bumiller + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF + * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "nv50_ir_target_nv50.h" + +namespace nv50_ir { + +Target *getTargetNV50(unsigned int chipset) +{ + return new TargetNV50(chipset); +} + +TargetNV50::TargetNV50(unsigned int card) +{ + chipset = card; + + wposMask = 0; + for (unsigned int i = 0; i <= SV_LAST; ++i) + sysvalLocation[i] = ~0; + + initOpInfo(); +} + +#if 0 +// BULTINS / LIBRARY FUNCTIONS: + +// TODO +static const uint32_t nvc0_builtin_code[] = +{ +}; + +static const uint16_t nvc0_builtin_offsets[NV50_BUILTIN_COUNT] = +{ +}; +#endif + +void +TargetNV50::getBuiltinCode(const uint32_t **code, uint32_t *size) const +{ + *code = NULL; + *size = 0; +} + +uint32_t +TargetNV50::getBuiltinOffset(int builtin) const +{ + return 0; +} + +struct opProperties +{ + operation op; + unsigned int mNeg : 4; + unsigned int mAbs : 4; + unsigned int mNot : 4; + unsigned int mSat : 4; + unsigned int fConst : 3; + unsigned int fShared : 3; + unsigned int fAttrib : 3; + unsigned int fImm : 3; +}; + +static const struct opProperties _initProps[] = +{ + // neg abs not sat c[] s[], a[], imm + { OP_ADD, 0x3, 0x0, 0x0, 0x8, 0x2, 0x1, 0x1, 0x2 }, + { OP_SUB, 0x3, 0x0, 0x0, 0x0, 0x2, 0x1, 0x1, 0x2 }, + { OP_MUL, 0x3, 0x0, 0x0, 0x0, 0x2, 0x1, 0x1, 0x2 }, + { OP_MAX, 0x3, 0x3, 0x0, 0x0, 0x2, 0x1, 0x1, 0x0 }, + { OP_MIN, 0x3, 0x3, 0x0, 0x0, 0x2, 0x1, 0x1, 0x0 }, + { OP_MAD, 0x7, 0x0, 0x0, 0x0, 0x6, 0x1, 0x1, 0x0 }, // special constraint + { OP_ABS, 0x0, 0x0, 0x0, 0x0, 0x0, 0x1, 0x1, 0x0 }, + { OP_NEG, 0x0, 0x1, 0x0, 0x0, 0x0, 0x1, 0x1, 0x0 }, + { OP_CVT, 0x1, 0x1, 0x0, 0x8, 0x0, 0x1, 0x1, 0x0 }, + { OP_AND, 0x0, 0x0, 0x3, 0x0, 0x0, 0x0, 0x0, 0x2 }, + { OP_OR, 0x0, 0x0, 0x3, 0x0, 0x0, 0x0, 0x0, 0x2 }, + { OP_XOR, 0x0, 0x0, 0x3, 0x0, 0x0, 0x0, 0x0, 0x2 }, + { OP_SHL, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x2 }, + { OP_SHR, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x2 }, + { OP_SET, 0x3, 0x3, 0x0, 0x0, 0x2, 0x1, 0x1, 0x0 }, + { OP_PREEX2, 0x1, 0x1, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0 }, + { OP_PRESIN, 0x1, 0x1, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0 }, + { OP_LG2, 0x1, 0x1, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0 }, + { OP_RCP, 0x1, 0x1, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0 }, + { OP_RSQ, 0x1, 0x1, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0 }, + { OP_DFDX, 0x1, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0 }, + { OP_DFDY, 0x1, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0 }, +}; + +void TargetNV50::initOpInfo() +{ + unsigned int i, j; + + static const uint32_t commutative[(OP_LAST + 31) / 32] = + { + // ADD,MAD,MUL,AND,OR,XOR,MAX,MIN + 0x0670ca00, 0x0000003f, 0x00000000 + }; + static const uint32_t shortForm[(OP_LAST + 31) / 32] = + { + // MOV,ADD,SUB,MUL,SAD,L/PINTERP,RCP,TEX,TXF + 0x00010e40, 0x00000040, 0x00000498 + }; + static const operation noDestList[] = + { + OP_STORE, OP_WRSV, OP_EXPORT, OP_BRA, OP_CALL, OP_RET, OP_EXIT, + OP_DISCARD, OP_CONT, OP_BREAK, OP_PRECONT, OP_PREBREAK, OP_PRERET, + OP_JOIN, OP_JOINAT, OP_BRKPT, OP_MEMBAR, OP_EMIT, OP_RESTART, + OP_QUADON, OP_QUADPOP + }; + static const operation noPredList[] = + { + OP_CALL, OP_PREBREAK, OP_PRERET, OP_QUADON, OP_QUADPOP, OP_JOINAT + }; + + joinAnterior = true; + + for (i = 0; i < DATA_FILE_COUNT; ++i) + nativeFileMap[i] = (DataFile)i; + nativeFileMap[FILE_PREDICATE] = FILE_FLAGS; + + for (i = 0; i < OP_LAST; ++i) { + opInfo[i].variants = NULL; + opInfo[i].op = (operation)i; + opInfo[i].srcTypes = 1 << (int)TYPE_F32; + opInfo[i].dstTypes = 1 << (int)TYPE_F32; + opInfo[i].immdBits = 0xffffffff; + opInfo[i].srcNr = operationSrcNr[i]; + + for (j = 0; j < opInfo[i].srcNr; ++j) { + opInfo[i].srcMods[j] = 0; + opInfo[i].srcFiles[j] = 1 << (int)FILE_GPR; + } + opInfo[i].dstMods = 0; + opInfo[i].dstFiles = 1 << (int)FILE_GPR; + + opInfo[i].hasDest = 1; + opInfo[i].vector = (i >= OP_TEX && i <= OP_TEXCSAA); + opInfo[i].commutative = (commutative[i / 32] >> (i % 32)) & 1; + opInfo[i].pseudo = (i < OP_MOV); + opInfo[i].predicate = !opInfo[i].pseudo; + opInfo[i].flow = (i >= OP_BRA && i <= OP_JOIN); + opInfo[i].minEncSize = (shortForm[i / 32] & (1 << (i % 32))) ? 4 : 8; + } + for (i = 0; i < sizeof(noDestList) / sizeof(noDestList[0]); ++i) + opInfo[noDestList[i]].hasDest = 0; + for (i = 0; i < sizeof(noPredList) / sizeof(noPredList[0]); ++i) + opInfo[noPredList[i]].predicate = 0; + + for (i = 0; i < sizeof(_initProps) / sizeof(_initProps[0]); ++i) { + const struct opProperties *prop = &_initProps[i]; + + for (int s = 0; s < 3; ++s) { + if (prop->mNeg & (1 << s)) + opInfo[prop->op].srcMods[s] |= NV50_IR_MOD_NEG; + if (prop->mAbs & (1 << s)) + opInfo[prop->op].srcMods[s] |= NV50_IR_MOD_ABS; + if (prop->mNot & (1 << s)) + opInfo[prop->op].srcMods[s] |= NV50_IR_MOD_NOT; + if (prop->fConst & (1 << s)) + opInfo[prop->op].srcFiles[s] |= 1 << (int)FILE_MEMORY_CONST; + if (prop->fShared & (1 << s)) + opInfo[prop->op].srcFiles[s] |= 1 << (int)FILE_MEMORY_SHARED; + if (prop->fAttrib & (1 << s)) + opInfo[prop->op].srcFiles[s] |= 1 << (int)FILE_SHADER_INPUT; + if (prop->fImm & (1 << s)) + opInfo[prop->op].srcFiles[s] |= 1 << (int)FILE_IMMEDIATE; + } + if (prop->mSat & 8) + opInfo[prop->op].dstMods = NV50_IR_MOD_SAT; + } +} + +unsigned int +TargetNV50::getFileSize(DataFile file) const +{ + switch (file) { + case FILE_NULL: return 0; + case FILE_GPR: return 256; // in 16-bit units ** + case FILE_PREDICATE: return 0; + case FILE_FLAGS: return 4; + case FILE_ADDRESS: return 4; + case FILE_IMMEDIATE: return 0; + case FILE_MEMORY_CONST: return 65536; + case FILE_SHADER_INPUT: return 0x200; + case FILE_SHADER_OUTPUT: return 0x200; + case FILE_MEMORY_GLOBAL: return 0xffffffff; + case FILE_MEMORY_SHARED: return 16 << 10; + case FILE_MEMORY_LOCAL: return 48 << 10; + case FILE_SYSTEM_VALUE: return 16; + default: + assert(!"invalid file"); + return 0; + } + // ** only first 128 units encodable for 16-bit regs +} + +unsigned int +TargetNV50::getFileUnit(DataFile file) const +{ + if (file == FILE_GPR || file == FILE_ADDRESS) + return 1; + if (file == FILE_SYSTEM_VALUE) + return 2; + return 0; +} + +uint32_t +TargetNV50::getSVAddress(DataFile shaderFile, const Symbol *sym) const +{ + switch (sym->reg.data.sv.sv) { + case SV_FACE: + return 0x3fc; + case SV_POSITION: + { + uint32_t addr = sysvalLocation[sym->reg.data.sv.sv]; + for (int c = 0; c < sym->reg.data.sv.index; ++c) + if (wposMask & (1 << c)) + addr += 4; + return addr; + } + case SV_NCTAID: + return 0x8 + 2 * sym->reg.data.sv.index; + case SV_CTAID: + return 0xc + 2 * sym->reg.data.sv.index; + case SV_NTID: + return 0x2 + 2 * sym->reg.data.sv.index; + case SV_TID: + return 0; + default: + return sysvalLocation[sym->reg.data.sv.sv]; + } +} + +// long: rrr, arr, rcr, acr, rrc, arc, gcr, grr +// short: rr, ar, rc, gr +// immd: ri, gi +bool +TargetNV50::insnCanLoad(const Instruction *i, int s, + const Instruction *ld) const +{ + DataFile sf = ld->src(0).getFile(); + + if (sf == FILE_IMMEDIATE && (i->predSrc >= 0 || i->flagsDef >= 0)) + return false; + if (s >= opInfo[i->op].srcNr) + return false; + if (!(opInfo[i->op].srcFiles[s] & (1 << (int)sf))) + return false; + if (s == 2 && i->src(1).getFile() != FILE_GPR) + return false; + + // NOTE: don't rely on flagsDef + for (int d = 0; i->defExists(d); ++d) + if (i->def(d).getFile() == FILE_FLAGS) + return false; + + unsigned mode = 0; + + for (int z = 0; z < Target::operationSrcNr[i->op]; ++z) { + DataFile zf = (z == s) ? sf : i->src(z).getFile(); + switch (zf) { + case FILE_GPR: + break; + case FILE_MEMORY_SHARED: + case FILE_SHADER_INPUT: + mode |= 1 << (z * 2); + break; + case FILE_MEMORY_CONST: + mode |= 2 << (z * 2); + break; + case FILE_IMMEDIATE: + mode |= 3 << (z * 2); + default: + break; + } + } + + switch (mode) { + case 0x00: + case 0x01: + case 0x03: + case 0x08: + case 0x09: + case 0x0c: + case 0x20: + case 0x21: + break; + case 0x0d: + if (ld->bb->getProgram()->getType() != Program::TYPE_GEOMETRY) + return false; + default: + return false; + } + + if (ld->getSrc(0)->reg.data.offset > (int32_t)(127 * typeSizeof(ld->dType))) + return false; + + if (ld->src(0).isIndirect(0)) { + for (int z = 0; i->srcExists(z); ++z) + if (i->src(z).isIndirect(0)) + return false; + + // s[] access only possible in CP, $aX always applies + if (sf == FILE_MEMORY_SHARED) + return true; + if (!ld->bb) // can't check type ... + return false; + Program::Type pt = ld->bb->getProgram()->getType(); + + // $aX applies to c[] only in VP, FP, GP if p[] is not accessed + if (pt == Program::TYPE_COMPUTE) + return false; + if (pt == Program::TYPE_GEOMETRY) { + if (sf == FILE_MEMORY_CONST) + return i->src(s).getFile() != FILE_SHADER_INPUT; + return sf == FILE_SHADER_INPUT; + } + return sf == FILE_MEMORY_CONST; + } + return true; +} + +bool +TargetNV50::isAccessSupported(DataFile file, DataType ty) const +{ + if (ty == TYPE_B96 || ty == TYPE_NONE) + return false; + if (typeSizeof(ty) > 4) + return (file == FILE_MEMORY_LOCAL) || (file == FILE_MEMORY_GLOBAL); + return true; +} + +bool +TargetNV50::isOpSupported(operation op, DataType ty) const +{ + if (ty == TYPE_F64 && chipset < 0xa0) + return false; + + switch (op) { + case OP_PRERET: + return chipset >= 0xa0; + case OP_TXG: + return chipset >= 0xa3; + case OP_POW: + case OP_SQRT: + case OP_DIV: + case OP_MOD: + case OP_SET_AND: + case OP_SET_OR: + case OP_SET_XOR: + case OP_SLCT: + case OP_SELP: + case OP_POPCNT: + case OP_INSBF: + case OP_EXTBF: + case OP_EXIT: // want exit modifier instead (on NOP if required) + return false; + case OP_SAD: + return ty == TYPE_S32; + default: + return true; + } +} + +bool +TargetNV50::isModSupported(const Instruction *insn, int s, Modifier mod) const +{ + if (!isFloatType(insn->dType)) { + switch (insn->op) { + case OP_ABS: + case OP_NEG: + case OP_CVT: + case OP_CEIL: + case OP_FLOOR: + case OP_TRUNC: + case OP_AND: + case OP_OR: + case OP_XOR: + break; + case OP_ADD: + if (insn->src(s ? 0 : 1).mod.neg()) + return false; + break; + case OP_SUB: + if (s == 0) + return insn->src(1).mod.neg() ? false : true; + break; + case OP_SET: + if (insn->sType != TYPE_F32) + return false; + break; + default: + return false; + } + } + if (s > 3) + return false; + return (mod & Modifier(opInfo[insn->op].srcMods[s])) == mod; +} + +bool +TargetNV50::mayPredicate(const Instruction *insn, const Value *pred) const +{ + if (insn->getPredicate() || insn->flagsSrc >= 0) + return false; + for (int s = 0; insn->srcExists(s); ++s) + if (insn->src(s).getFile() == FILE_IMMEDIATE) + return false; + return opInfo[insn->op].predicate; +} + +bool +TargetNV50::isSatSupported(const Instruction *insn) const +{ + if (insn->op == OP_CVT) + return true; + if (insn->dType != TYPE_F32) + return false; + return opInfo[insn->op].dstMods & NV50_IR_MOD_SAT; +} + +int TargetNV50::getLatency(const Instruction *i) const +{ + // TODO: tune these values + if (i->op == OP_LOAD) { + switch (i->src(0).getFile()) { + case FILE_MEMORY_LOCAL: + case FILE_MEMORY_GLOBAL: + return 100; // really 400 to 800 + default: + return 22; + } + } + return 22; +} + +// These are "inverse" throughput values, i.e. the number of cycles required +// to issue a specific instruction for a full warp (32 threads). +// +// Assuming we have more than 1 warp in flight, a higher issue latency results +// in a lower result latency since the MP will have spent more time with other +// warps. +// This also helps to determine the number of cycles between instructions in +// a single warp. +// +int TargetNV50::getThroughput(const Instruction *i) const +{ + // TODO: tune these values + if (i->dType == TYPE_F32) { + switch (i->op) { + case OP_RCP: + case OP_RSQ: + case OP_LG2: + case OP_SIN: + case OP_COS: + case OP_PRESIN: + case OP_PREEX2: + return 16; + default: + return 4; + } + } else + if (i->dType == TYPE_U32 || i->dType == TYPE_S32) { + return 4; + } else + if (i->dType == TYPE_F64) { + return 32; + } else { + return 1; + } +} + +static void +recordLocation(uint16_t *locs, uint8_t *masks, + const struct nv50_ir_varying *var) +{ + uint16_t addr = var->slot[0] * 4; + + switch (var->sn) { + case TGSI_SEMANTIC_POSITION: locs[SV_POSITION] = addr; break; + case TGSI_SEMANTIC_INSTANCEID: locs[SV_INSTANCE_ID] = addr; break; + case TGSI_SEMANTIC_VERTEXID: locs[SV_VERTEX_ID] = addr; break; + case TGSI_SEMANTIC_PRIMID: locs[SV_PRIMITIVE_ID] = addr; break; + case NV50_SEMANTIC_LAYER: locs[SV_LAYER] = addr; break; + case NV50_SEMANTIC_VIEWPORTINDEX: locs[SV_VIEWPORT_INDEX] = addr; break; + default: + break; + } + if (var->sn == TGSI_SEMANTIC_POSITION && masks) + masks[0] = var->mask; +} + +void +TargetNV50::parseDriverInfo(const struct nv50_ir_prog_info *info) +{ + unsigned int i; + for (i = 0; i < info->numOutputs; ++i) + recordLocation(sysvalLocation, NULL, &info->out[i]); + for (i = 0; i < info->numInputs; ++i) + recordLocation(sysvalLocation, &wposMask, &info->in[i]); + for (i = 0; i < info->numSysVals; ++i) + recordLocation(sysvalLocation, NULL, &info->sv[i]); + + if (sysvalLocation[SV_POSITION] >= 0x200) { + // not assigned by driver, but we need it internally + wposMask = 0x8; + sysvalLocation[SV_POSITION] = 0; + } +} + +} // namespace nv50_ir diff --git a/src/gallium/drivers/nv50/codegen/nv50_ir_target_nv50.h b/src/gallium/drivers/nv50/codegen/nv50_ir_target_nv50.h new file mode 100644 index 00000000000..99e6f565612 --- /dev/null +++ b/src/gallium/drivers/nv50/codegen/nv50_ir_target_nv50.h @@ -0,0 +1,72 @@ +/* + * Copyright 2011 Christoph Bumiller + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF + * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "nv50/codegen/nv50_ir_target.h" + +namespace nv50_ir { + +#define NVC0_BUILTIN_DIV_U32 0 +#define NVC0_BUILTIN_DIV_S32 1 +#define NVC0_BUILTIN_RCP_F64 2 +#define NVC0_BUILTIN_RSQ_F64 3 + +#define NVC0_BUILTIN_COUNT 4 + +class TargetNV50 : public Target +{ +public: + TargetNV50(unsigned int chipset); + + virtual CodeEmitter *getCodeEmitter(Program::Type); + + virtual bool runLegalizePass(Program *, CGStage stage) const; + + virtual void getBuiltinCode(const uint32_t **code, uint32_t *size) const; + + virtual void parseDriverInfo(const struct nv50_ir_prog_info *); + + virtual bool insnCanLoad(const Instruction *insn, int s, + const Instruction *ld) const; + virtual bool isOpSupported(operation, DataType) const; + virtual bool isAccessSupported(DataFile, DataType) const; + virtual bool isModSupported(const Instruction *, int s, Modifier) const; + virtual bool isSatSupported(const Instruction *) const; + virtual bool mayPredicate(const Instruction *, const Value *) const; + + virtual int getLatency(const Instruction *) const; + virtual int getThroughput(const Instruction *) const; + + virtual unsigned int getFileSize(DataFile) const; + virtual unsigned int getFileUnit(DataFile) const; + + virtual uint32_t getSVAddress(DataFile shaderFile, const Symbol *sv) const; + + uint32_t getBuiltinOffset(int builtin) const; + +private: + void initOpInfo(); + + uint16_t sysvalLocation[SV_LAST + 1]; + uint8_t wposMask; +}; + +} // namespace nv50_ir |