nv50/ir: import new shader backend code

author: Christoph Bumiller <[email protected]> 2011-09-14 16:18:23 +0200
committer: Christoph Bumiller <[email protected]> 2011-09-14 16:19:52 +0200
commit: 57594065c30feec9376be9b2132659f7d87362ee (patch)
tree: 7e6808e0c5240b513851b7925c5be6678663b5e5 /src/gallium/drivers/nvc0
parent: a42eca84c56f6860e67c0c57f4765a5530cc5f81 (diff)
6 files changed, 3039 insertions, 1 deletions
diff --git a/src/gallium/drivers/nvc0/Makefile b/src/gallium/drivers/nvc0/Makefile
index 3a5314625e6..c41262559cd 100644
--- a/src/gallium/drivers/nvc0/Makefile
+++ b/src/gallium/drivers/nvc0/Makefile
@@ -3,7 +3,7 @@ include $(TOP)/configs/current
 
 LIBNAME = nvc0
 
-# get C_SOURCES
+# get C/CPP_SOURCES
 include Makefile.sources
 
 LIBRARY_INCLUDES = \
diff --git a/src/gallium/drivers/nvc0/Makefile.sources b/src/gallium/drivers/nvc0/Makefile.sources
index a057f060130..9b1fb97f0cb 100644
--- a/src/gallium/drivers/nvc0/Makefile.sources
+++ b/src/gallium/drivers/nvc0/Makefile.sources
@@ -22,3 +22,8 @@ C_SOURCES := \
 	nvc0_push.c \
 	nvc0_push2.c \
 	nvc0_query.c
+
+CPP_SOURCES := \
+	codegen/nv50_ir_emit_nvc0.cpp \
+	codegen/nv50_ir_lowering_nvc0.cpp \
+	codegen/nv50_ir_target_nvc0.cpp
diff --git a/src/gallium/drivers/nvc0/codegen/nv50_ir_emit_nvc0.cpp b/src/gallium/drivers/nvc0/codegen/nv50_ir_emit_nvc0.cpp
new file mode 100644
index 00000000000..2ab06f426e5
--- /dev/null
+++ b/src/gallium/drivers/nvc0/codegen/nv50_ir_emit_nvc0.cpp
@@ -0,0 +1,1714 @@
+
+#include "nv50_ir_target_nvc0.h"
+
+namespace nv50_ir {
+
+// Argh, all these assertions ...
+
+class CodeEmitterNVC0 : public CodeEmitter // nvc0 encoder; the emit* methods fill code[0]/code[1]
+{
+public:
+   CodeEmitterNVC0(const TargetNVC0 *);
+
+   virtual bool emitInstruction(Instruction *);
+   virtual uint32_t getMinEncodingSize(const Instruction *) const;
+
+   inline void setProgramType(Program::Type pType) { progType = pType; }
+
+private:
+   const TargetNVC0 *targ; // used by emitFlow to look up builtin offsets
+
+   Program::Type progType; // set through setProgramType()
+
+private:
+   void emitForm_A(const Instruction *, uint64_t); // def 0, predicate, up to 3 sources
+   void emitForm_B(const Instruction *, uint64_t); // def 0, predicate, source 0 only
+   void emitForm_S(const Instruction *, uint32_t, bool pred); // short form: writes code[0] only
+
+   void emitPredicate(const Instruction *);
+
+   void setAddress16(const ValueRef&); // symbol offset, split across code[0]/code[1]
+   void setImmediate(const Instruction *, const int s); // needs op already set
+   void setImmediateS8(const ValueRef&);
+
+   void emitCondCode(CondCode cc, int pos); // pos is an absolute bit position (32+ = code[1])
+   void emitInterpMode(const Instruction *);
+   void emitLoadStoreType(DataType ty);
+   void emitCachingMode(CacheMode c);
+
+   void emitShortSrc2(const ValueRef&);
+
+   inline uint8_t getSRegEncoding(const ValueRef&);
+
+   void roundMode_A(const Instruction *); // mode at code[1] bit 23
+   void roundMode_C(const Instruction *); // mode at code[1] bit 17, plus code[0] bit 7
+   void roundMode_CS(const Instruction *); // mode at code[0] bit 16
+
+   void emitNegAbs12(const Instruction *); // neg/abs of sources 0 and 1 -> code[0] bits 6..9
+
+   void emitNOP(const Instruction *);
+
+   void emitLOAD(const Instruction *);
+   void emitSTORE(const Instruction *);
+   void emitMOV(const Instruction *);
+
+   void emitINTERP(const Instruction *);
+   void emitPFETCH(const Instruction *);
+   void emitVFETCH(const Instruction *);
+   void emitEXPORT(const Instruction *);
+   void emitOUT(const Instruction *);
+
+   void emitUADD(const Instruction *);
+   void emitFADD(const Instruction *);
+   void emitUMUL(const Instruction *);
+   void emitFMUL(const Instruction *);
+   void emitIMAD(const Instruction *);
+   void emitFMAD(const Instruction *);
+
+   void emitNOT(Instruction *); // non-const: overwrites src[1] of the instruction
+   void emitLogicOp(const Instruction *, uint8_t subOp);
+   void emitPOPC(const Instruction *);
+   void emitINSBF(const Instruction *);
+   void emitShift(const Instruction *);
+
+   void emitSFnOp(const Instruction *, uint8_t subOp);
+
+   void emitCVT(Instruction *); // non-const: may overwrite i->rnd
+   void emitMINMAX(const Instruction *);
+   void emitPreOp(const Instruction *);
+
+   void emitSET(const CmpInstruction *);
+   void emitSLCT(const CmpInstruction *);
+   void emitSELP(const Instruction *);
+
+   void emitTEX(const TexInstruction *);
+   void emitTEXCSAA(const TexInstruction *);
+   void emitTXQ(const TexInstruction *);
+   void emitPIXLD(const TexInstruction *);
+
+   void emitQUADOP(const Instruction *, uint8_t qOp, uint8_t laneMask);
+
+   void emitFlow(const Instruction *);
+
+   inline void defId(const ValueDef&, const int pos); // register id (63 if unset) at bit pos
+   inline void srcId(const ValueRef&, const int pos); // register id (63 if unset) at bit pos
+
+   inline void srcAddr32(const ValueRef&, const int pos); // address / 4
+
+   inline void srcId(const ValueRef *, const int pos); // a NULL pointer encodes 63
+
+   inline bool isLIMM(const ValueRef&, DataType ty);
+};
+
+// for better visibility: HEX64(h, l) pastes the hex halves h and l into one ULL literal
+#define HEX64(h, l) 0x##h##l##ULL
+// both macros expand to reg.data of the value returned by rep()
+#define SDATA(a) ((a).rep()->reg.data)
+#define DDATA(a) ((a).rep()->reg.data)
+
+void CodeEmitterNVC0::srcId(const ValueRef& src, const int pos)
+{  // ORs the source's register id into the code word at absolute bit position pos
+   code[pos / 32] |= (src.get() ? SDATA(src).id : 63) << (pos % 32); // 63 if src is unset
+}
+
+void CodeEmitterNVC0::srcId(const ValueRef *src, const int pos)
+{  // pointer overload: a NULL src encodes the id 63
+   code[pos / 32] |= (src ? SDATA(*src).id : 63) << (pos % 32); // pos / 32 picks the word
+}
+
+void CodeEmitterNVC0::srcAddr32(const ValueRef& src, const int pos)
+{  // ORs the source's offset divided by 4 into the code word at bit position pos
+   code[pos / 32] |= (SDATA(src).offset >> 2) << (pos % 32); // no mask is applied here
+}
+
+void CodeEmitterNVC0::defId(const ValueDef& def, const int pos)
+{  // ORs the destination's register id into the code word at absolute bit position pos
+   code[pos / 32] |= (def.get() ? DDATA(def).id : 63) << (pos % 32); // 63 if def is unset
+}
+
+bool CodeEmitterNVC0::isLIMM(const ValueRef& ref, DataType ty)
+{  // true if ref is an immediate with a bit set under the mask chosen by ty
+   const ImmediateValue *const value = ref.get()->asImm();
+   const uint32_t mask = (ty == TYPE_F32) ? 0xfff : 0xfff00000; // low 12 vs. high 12 bits
+   return value && (value->reg.data.u32 & mask) != 0;
+}
+
+// Writes insn->rnd as a value 1..3 at code[1] bit 23; ROUND_N sets nothing.
+void CodeEmitterNVC0::roundMode_A(const Instruction *insn)
+{
+   if (insn->rnd == ROUND_M)
+      code[1] |= 1 << 23;
+   else if (insn->rnd == ROUND_P)
+      code[1] |= 2 << 23;
+   else if (insn->rnd == ROUND_Z)
+      code[1] |= 3 << 23;
+   else
+      assert(insn->rnd == ROUND_N); // no other mode is accepted here
+}
+
+// Maps the neg/abs modifiers of sources 0 and 1 onto code[0] bits 6..9.
+void CodeEmitterNVC0::emitNegAbs12(const Instruction *i)
+{
+   if (i->src[0].mod.neg()) code[0] |= 0x200; // bit 9
+   if (i->src[1].mod.neg()) code[0] |= 0x100; // bit 8
+   if (i->src[0].mod.abs()) code[0] |= 0x080; // bit 7
+   if (i->src[1].mod.abs()) code[0] |= 0x040; // bit 6
+}
+
+void CodeEmitterNVC0::emitCondCode(CondCode cc, int pos)
+{  // ORs the numeric code for cc into the code word at absolute bit position pos
+   uint8_t val;
+   // in the first group each *U code equals its base code with bit 3 set
+   switch (cc) {
+   case CC_LT:  val = 0x1; break;
+   case CC_LTU: val = 0x9; break;
+   case CC_EQ:  val = 0x2; break;
+   case CC_EQU: val = 0xa; break;
+   case CC_LE:  val = 0x3; break;
+   case CC_LEU: val = 0xb; break;
+   case CC_GT:  val = 0x4; break;
+   case CC_GTU: val = 0xc; break;
+   case CC_NE:  val = 0x5; break;
+   case CC_NEU: val = 0xd; break;
+   case CC_GE:  val = 0x6; break;
+   case CC_GEU: val = 0xe; break;
+   case CC_TR:  val = 0xf; break;
+   case CC_FL:  val = 0x0; break;
+   // the second group uses the 5-bit codes 0x10..0x17
+   case CC_A:  val = 0x14; break;
+   case CC_NA: val = 0x13; break;
+   case CC_S:  val = 0x15; break;
+   case CC_NS: val = 0x12; break;
+   case CC_C:  val = 0x16; break;
+   case CC_NC: val = 0x11; break;
+   case CC_O:  val = 0x17; break;
+   case CC_NO: val = 0x10; break;
+
+   default:
+      val = 0; // keeps val defined when asserts are compiled out
+      assert(!"invalid condition code");
+      break;
+   }
+   code[pos / 32] |= val << (pos % 32); // pos / 32 picks the word, pos % 32 the shift
+}
+
+// Writes the predicate field of code[0]: register id at bit 10, negate flag at bit 13.
+void CodeEmitterNVC0::emitPredicate(const Instruction *i)
+{
+   if (i->predSrc < 0) {
+      code[0] |= 7 << 10; // no predicate source: the field is set to 7
+      return;
+   }
+   assert(i->getPredicate()->reg.file == FILE_PREDICATE);
+   srcId(i->src[i->predSrc], 10);
+   if (i->cc == CC_NOT_P)
+      code[0] |= 1 << 13; // negate
+}
+
+// Splits the symbol's offset: bits 0..5 -> code[0] bits 26..31, bits 6..15 -> code[1] bits 0..9.
+void CodeEmitterNVC0::setAddress16(const ValueRef& src)
+{
+   Symbol *sym = src.get()->asSym();
+
+   assert(sym);
+
+   code[0] |= (sym->reg.data.offset & 0x003f) << 26;
+   code[1] |= (sym->reg.data.offset & 0xffc0) >> 6;
+}
+
+// Encodes the immediate in source s; the layout is chosen from the low nibble of code[0].
+void CodeEmitterNVC0::setImmediate(const Instruction *i, const int s)
+{
+   const ImmediateValue *imm = i->src[s].get()->asImm();
+   uint32_t u32;
+
+   assert(imm);
+   u32 = imm->reg.data.u32;
+
+   if ((code[0] & 0xf) == 0x2) {
+      // LIMM: all 32 bits are stored, low 6 at code[0] bit 26, the rest in code[1]
+      code[0] |= (u32 & 0x3f) << 26;
+      code[1] |= u32 >> 6;
+   } else
+   if ((code[0] & 0xf) == 0x3 || (code[0] & 0xf) == 4) {
+      // integer immediate: must be a sign-extended 20-bit value
+      assert((u32 & 0xfff00000) == 0 || (u32 & 0xfff00000) == 0xfff00000);
+      assert(!(code[1] & 0xc000));
+      u32 &= 0xfffff;
+      code[0] |= (u32 & 0x3f) << 26;
+      code[1] |= 0xc000 | (u32 >> 6); // sets both bits of the field asserted clear above
+   } else {
+      // float immediate: the low 12 bits must be zero, only the upper 20 are stored
+      assert(!(u32 & 0x00000fff));
+      assert(!(code[1] & 0xc000));
+      code[0] |= ((u32 >> 12) & 0x3f) << 26;
+      code[1] |= 0xc000 | (u32 >> 18);
+   }
+}
+
+// Encodes a signed 8-bit immediate: low 6 bits at code[0] bit 26, top 2 bits at bit 8.
+void CodeEmitterNVC0::setImmediateS8(const ValueRef &ref)
+{
+   const ImmediateValue *imm = ref.get()->asImm();
+   assert(imm); // ref must hold an immediate, as setImmediate() also requires
+   int8_t s8 = static_cast<int8_t>(imm->reg.data.s32);
+   assert(s8 == imm->reg.data.s32); // the value must fit into 8 bits
+
+   code[0] |= (s8 & 0x3f) << 26;
+   code[0] |= ((s8 >> 6) & 0x3) << 8; // mask: a negative s8 sign-extends and would set bits 10..31
+}
+
+// Common long encoding: opcode words, predicate, def 0 at bit 14 and up to three sources.
+void CodeEmitterNVC0::emitForm_A(const Instruction *i, uint64_t opc)
+{
+   code[0] = opc; // low 32 bits
+   code[1] = opc >> 32;
+
+   emitPredicate(i);
+
+   defId(i->def[0], 14);
+
+   int s1 = 26; // bit position for a GPR in source 1
+   if (i->srcExists(2) && i->getSrc(2)->reg.file == FILE_MEMORY_CONST)
+      s1 = 49; // source 1 moves to bit 49 when source 2 is in const space
+
+   for (int s = 0; s < 3 && i->srcExists(s); ++s) {
+      switch (i->getSrc(s)->reg.file) {
+      case FILE_MEMORY_CONST:
+         assert(!(code[1] & 0xc000)); // only one const/immediate operand fits
+         code[1] |= (s == 2) ? 0x8000 : 0x4000;
+         code[1] |= i->getSrc(s)->reg.fileIndex << 10;
+         setAddress16(i->src[s]);
+         break;
+      case FILE_IMMEDIATE:
+         assert(s == 1 ||
+                i->op == OP_MOV || i->op == OP_PRESIN || i->op == OP_PREEX2);
+         assert(!(code[1] & 0xc000));
+         setImmediate(i, s);
+         break;
+      case FILE_GPR:
+         if ((s == 2) && ((code[0] & 0x7) == 2)) // LIMM: 3rd src == dst
+            break;
+         srcId(i->src[s], s ? ((s == 2) ? 49 : s1) : 20); // src 0 -> 20, src 1 -> s1, src 2 -> 49
+         break;
+      default:
+         // ignore here, can be predicate or flags, but must not be address
+         break;
+      }
+   }
+}
+
+// Long encoding for one-source ops: opcode words, predicate, def 0, and source 0 only.
+void CodeEmitterNVC0::emitForm_B(const Instruction *i, uint64_t opc)
+{
+   code[0] = opc;
+   code[1] = opc >> 32;
+
+   emitPredicate(i);
+
+   defId(i->def[0], 14);
+
+   switch (i->src[0].getFile()) {
+   case FILE_MEMORY_CONST:
+      assert(!(code[1] & 0xc000));
+      code[1] |= 0x4000 | (i->src[0].get()->reg.fileIndex << 10);
+      setAddress16(i->src[0]);
+      break;
+   case FILE_IMMEDIATE:
+      assert(!(code[1] & 0xc000));
+      setImmediate(i, 0);
+      break;
+   case FILE_GPR:
+      srcId(i->src[0], 26); // unlike emitForm_A, source 0 is placed at bit 26
+      break;
+   default:
+      // ignore here, can be predicate or flags, but must not be address
+      break;
+   }
+}
+
+// Short encoding: only code[0] is written. pred states whether a predicate may be encoded.
+void CodeEmitterNVC0::emitForm_S(const Instruction *i, uint32_t opc, bool pred)
+{
+   code[0] = opc;
+
+   int ss2a = 0; // right-shift applied to the const-space selector bits
+   if (opc == 0x0d || opc == 0x0e)
+      ss2a = 2;
+
+   defId(i->def[0], 14);
+   srcId(i->src[0], 20);
+
+   assert(pred || (i->predSrc < 0)); // a predicated instruction needs pred == true
+   if (pred)
+      emitPredicate(i);
+
+   for (int s = 1; s < 3 && i->srcExists(s); ++s) {
+      if (i->src[s].get()->reg.file == FILE_MEMORY_CONST) {
+         assert(!(code[0] & (0x300 >> ss2a)));
+         switch (i->src[s].get()->reg.fileIndex) { // only c[] spaces 0, 1 and 16 are accepted
+         case 0:  code[0] |= 0x100 >> ss2a; break;
+         case 1:  code[0] |= 0x200 >> ss2a; break;
+         case 16: code[0] |= 0x300 >> ss2a; break;
+         default:
+            ERROR("invalid c[] space for short form\n");
+            break;
+         }
+         if (s == 1)
+            code[0] |= i->getSrc(s)->reg.data.offset << 24; // offset is not masked here
+         else
+            code[0] |= i->getSrc(s)->reg.data.offset << 6;
+      } else
+      if (i->src[s].getFile() == FILE_IMMEDIATE) {
+         assert(s == 1);
+         setImmediateS8(i->src[s]);
+      } else
+      if (i->src[s].getFile() == FILE_GPR) {
+         srcId(i->src[s], (s == 1) ? 26 : 8); // src 1 -> bit 26, src 2 -> bit 8
+      }
+   }
+}
+
+// Encodes src at code[0] bit 20: a c[] address (offset / 4) with space selector, or a GPR id.
+void CodeEmitterNVC0::emitShortSrc2(const ValueRef &src)
+{
+   if (src.getFile() == FILE_MEMORY_CONST) {
+      switch (src.get()->reg.fileIndex) { // only file indices 0, 1 and 16 are supported
+      case 0:  code[0] |= 0x100; break;
+      case 1:  code[0] |= 0x200; break;
+      case 16: code[0] |= 0x300; break;
+      default:
+         assert(!"unsupported file index for short op");
+         break;
+      }
+      srcAddr32(src, 20);
+   } else {
+      srcId(src, 20);
+      assert(src.getFile() == FILE_GPR); // anything that is not c[] must be a GPR
+   }
+}
+
+// Writes two fixed opcode words, then ORs in the predicate field.
+void CodeEmitterNVC0::emitNOP(const Instruction *i)
+{
+   code[0] = 0x000001e4;
+   code[1] = 0x40000000;
+   emitPredicate(i);
+}
+
+// Encodes FMAD in the long (encSize == 8) or the short form.
+void CodeEmitterNVC0::emitFMAD(const Instruction *i)
+{
+   bool neg1 = (i->src[0].mod ^ i->src[1].mod).neg(); // neg of the combined src 0 / src 1 modifiers
+
+   if (i->encSize == 8) {
+      if (isLIMM(i->src[1], TYPE_F32)) {
+         emitForm_A(i, HEX64(20000000, 00000002));
+      } else {
+         emitForm_A(i, HEX64(30000000, 00000000));
+
+         if (i->src[2].mod.neg())
+            code[0] |= 1 << 8; // only set in the non-LIMM form
+      }
+      roundMode_A(i);
+
+      if (neg1)
+         code[0] |= 1 << 9;
+
+      if (i->saturate)
+         code[0] |= 1 << 5;
+      if (i->ftz)
+         code[0] |= 1 << 6;
+   } else {
+      assert(!i->saturate && !i->src[2].mod.neg());
+      emitForm_S(i, (i->src[2].getFile() == FILE_MEMORY_CONST) ? 0x2e : 0x0e,
+                 false); // pred == false: emitForm_S asserts there is no predicate source
+      if (neg1)
+         code[0] |= 1 << 4;
+   }
+}
+
+// Encodes FMUL; postFactor is only encoded in the long non-LIMM form.
+void CodeEmitterNVC0::emitFMUL(const Instruction *i)
+{
+   bool neg = (i->src[0].mod ^ i->src[1].mod).neg(); // neg of the combined source modifiers
+
+   assert(i->postFactor >= -3 && i->postFactor <= 3);
+
+   if (i->encSize == 8) {
+      if (isLIMM(i->src[1], TYPE_F32)) {
+         assert(i->postFactor == 0); // constant folded, hopefully
+         emitForm_A(i, HEX64(30000000, 00000002));
+      } else {
+         emitForm_A(i, HEX64(58000000, 00000000));
+         roundMode_A(i);
+         code[1] |= ((i->postFactor > 0) ?
+                     (7 - i->postFactor) : (0 - i->postFactor)) << 17; // 1..3 -> 6..4, 0..-3 -> 0..3
+      }
+      if (neg)
+         code[1] ^= 1 << 25; // aliases with LIMM sign bit
+
+      if (i->saturate)
+         code[0] |= 1 << 5;
+
+      if (i->dnz)
+         code[0] |= 1 << 7;
+      else
+      if (i->ftz)
+         code[0] |= 1 << 6; // not set when dnz is set
+   } else {
+      assert(!neg && !i->saturate && !i->ftz && !i->postFactor);
+      emitForm_S(i, 0xa8, true);
+   }
+}
+
+// Encodes the integer multiply; the flag bit positions differ between long and short form.
+void CodeEmitterNVC0::emitUMUL(const Instruction *i)
+{
+   if (i->encSize == 8) {
+      if (i->src[1].getFile() == FILE_IMMEDIATE) {
+         emitForm_A(i, HEX64(10000000, 00000002));
+      } else {
+         emitForm_A(i, HEX64(50000000, 00000003));
+      }
+      if (i->subOp == NV50_IR_SUBOP_MUL_HIGH)
+         code[0] |= 1 << 6;
+      if (i->sType == TYPE_S32)
+         code[0] |= 1 << 5;
+      if (i->dType == TYPE_S32)
+         code[0] |= 1 << 7;
+   } else {
+      emitForm_S(i, i->src[1].getFile() == FILE_IMMEDIATE ? 0xaa : 0x2a, true);
+
+      if (i->sType == TYPE_S32)
+         code[0] |= 1 << 6; // the short form encodes only the source signedness
+   }
+}
+
+// Encodes FADD; OP_SUB is expressed by toggling the source 1 negate bit (long form only).
+void CodeEmitterNVC0::emitFADD(const Instruction *i)
+{
+   if (i->encSize == 8) {
+      if (isLIMM(i->src[1], TYPE_F32)) {
+         emitForm_A(i, HEX64(28000000, 00000002));
+
+         assert(!i->src[1].mod.neg() && !i->src[1].mod.abs() && !i->saturate);
+      } else {
+         emitForm_A(i, HEX64(50000000, 00000000));
+
+         roundMode_A(i);
+         if (i->saturate)
+            code[1] |= 1 << 17;
+      }
+      emitNegAbs12(i);
+
+      if (i->op == OP_SUB) code[0] ^= 1 << 8; // bit 8 is the src 1 negate bit from emitNegAbs12
+
+      if (i->ftz)
+         code[0] |= 1 << 5;
+   } else {
+      assert(!i->saturate && i->op != OP_SUB &&
+             !i->src[0].mod.abs() &&
+             !i->src[1].mod.neg() && !i->src[1].mod.abs());
+
+      emitForm_S(i, 0x49, true);
+
+      if (i->src[0].mod.neg())
+         code[0] |= 1 << 7; // the only modifier the short form accepts
+   }
+}
+
+// Encodes integer ADD/SUB; negations are collected in addOp (0x200 = src 0, 0x100 = src 1).
+void CodeEmitterNVC0::emitUADD(const Instruction *i)
+{
+   uint32_t addOp = 0;
+
+   assert(!i->src[0].mod.abs() && !i->src[1].mod.abs());
+   assert(!i->src[0].mod.neg() || !i->src[1].mod.neg()); // at most one source may be negated
+
+   if (i->src[0].mod.neg())
+      addOp |= 0x200;
+   if (i->src[1].mod.neg())
+      addOp |= 0x100;
+   if (i->op == OP_SUB) {
+      addOp ^= 0x100; // SUB toggles the negation of source 1
+      assert(addOp != 0x300); // would be add-plus-one
+   }
+
+   if (i->encSize == 8) {
+      if (isLIMM(i->src[1], TYPE_U32)) {
+         emitForm_A(i, HEX64(08000000, 00000002));
+         if (i->def[1].exists())
+            code[1] |= 1 << 26; // write carry
+      } else {
+         emitForm_A(i, HEX64(48000000, 00000003));
+         if (i->def[1].exists())
+            code[1] |= 1 << 16; // write carry
+      }
+      code[0] |= addOp;
+
+      if (i->saturate)
+         code[0] |= 1 << 5;
+      if (i->flagsSrc >= 0) // add carry
+         code[0] |= 1 << 6;
+   } else {
+      assert(!(addOp & 0x100)); // the short form cannot negate source 1
+      emitForm_S(i, (addOp >> 3) |
+                 ((i->src[1].getFile() == FILE_IMMEDIATE) ? 0xac : 0x2c), true);
+   }
+}
+
+// TODO: shl-add
+// Encodes integer multiply-add (long form only); negations use the 2-bit field at bit 8.
+void CodeEmitterNVC0::emitIMAD(const Instruction *i)
+{
+   // bit 0: negate the addend (src 2), bit 1: negate the product (src 0 * src 1)
+   const uint8_t addOp = (i->src[2].mod.neg() ? 1 : 0) |
+      ((i->src[0].mod.neg() ^ i->src[1].mod.neg()) ? 2 : 0);
+   assert(i->encSize == 8);
+   emitForm_A(i, HEX64(20000000, 00000003));
+
+   assert(addOp != 3); // both set is not a plain negation (emitUADD: "add-plus-one")
+   code[0] |= addOp << 8; // was 0x10/0x20, but 0x20 is also the signed-sType bit below
+
+   if (isSignedType(i->dType))
+      code[0] |= 1 << 7;
+   if (isSignedType(i->sType))
+      code[0] |= 1 << 5;
+
+   code[1] |= i->saturate << 24;
+   if (i->flagsDef >= 0) code[1] |= 1 << 16;
+   if (i->flagsSrc >= 0) code[1] |= 1 << 23;
+   if (i->subOp == NV50_IR_SUBOP_MUL_HIGH)
+      code[0] |= 1 << 6;
+}
+
+// Emits NOT as a two-source form A op: copies src 0 into src 1, so *i is modified.
+void CodeEmitterNVC0::emitNOT(Instruction *i)
+{
+   assert(i->encSize == 8);
+   i->src[1].set(i->src[0]);
+   emitForm_A(i, HEX64(68000000, 000001c3)); // emitLogicOp's register opcode plus 0x1c0
+}
+
+// Encodes a logic op; subOp selects the operation (code[0] bit 6 long, bit 5 short).
+void CodeEmitterNVC0::emitLogicOp(const Instruction *i, uint8_t subOp)
+{
+   if (i->encSize == 8) {
+      if (isLIMM(i->src[1], TYPE_U32)) {
+         emitForm_A(i, HEX64(38000000, 00000002));
+
+         if (i->src[2].exists())
+            code[1] |= 1 << 26; // LIMM form: flag at bit 26
+      } else {
+         emitForm_A(i, HEX64(68000000, 00000003));
+
+         if (i->src[2].exists())
+            code[1] |= 1 << 16; // register form: flag at bit 16
+      }
+      code[0] |= subOp << 6;
+
+      if (i->src[2].exists()) // carry
+         code[0] |= 1 << 5;
+
+      if (i->src[0].mod & Modifier(NV50_IR_MOD_NOT)) code[0] |= 1 << 9;
+      if (i->src[1].mod & Modifier(NV50_IR_MOD_NOT)) code[0] |= 1 << 8;
+   } else {
+      emitForm_S(i, (subOp << 5) |
+                 ((i->src[1].getFile() == FILE_IMMEDIATE) ? 0x1d : 0x8d), true);
+   }
+}
+
+// Form A encoding; a NOT modifier on source 0 / source 1 sets code[0] bit 9 / bit 8.
+void CodeEmitterNVC0::emitPOPC(const Instruction *i)
+{
+   const Modifier bitNot(NV50_IR_MOD_NOT);
+   emitForm_A(i, HEX64(54000000, 00000004));
+   for (int s = 0; s < 2; ++s)
+      if (i->src[s].mod & bitNot) code[0] |= 1 << (9 - s);
+}
+
+// Plain form A encoding with a fixed opcode; no extra flag bits are set.
+void CodeEmitterNVC0::emitINSBF(const Instruction *i)
+{
+   emitForm_A(i, HEX64(28000000, 30000000));
+}
+
+// Encodes OP_SHR (0x20 added for a signed dType) or, for any other op, the second opcode.
+void CodeEmitterNVC0::emitShift(const Instruction *i)
+{
+   if (i->op == OP_SHR) {
+      emitForm_A(i, HEX64(58000000, 00000003)
+                 | (isSignedType(i->dType) ? 0x20 : 0x00));
+   } else {
+      emitForm_A(i, HEX64(60000000, 00000003));
+   }
+
+   if (0) // disabled: this branch never executes
+      code[0] |= 1 << 9; // clamp shift amount
+}
+
+// Encodes the PRESIN/PREEX2 style ops; OP_PREEX2 differs by one flag or opcode bit.
+void CodeEmitterNVC0::emitPreOp(const Instruction *i)
+{
+   if (i->encSize == 8) {
+      emitForm_B(i, HEX64(60000000, 00000000));
+
+      if (i->op == OP_PREEX2)
+         code[0] |= 0x20;
+
+      if (i->src[0].mod.abs()) code[0] |= 1 << 6;
+      if (i->src[0].mod.neg()) code[0] |= 1 << 8;
+   } else {
+      emitForm_S(i, i->op == OP_PREEX2 ? 0x74000008 : 0x70000008, true); // no modifiers encoded here
+   }
+}
+
+// Encodes a one-source op selected by subOp, which is stored at code[0] bit 26.
+void CodeEmitterNVC0::emitSFnOp(const Instruction *i, uint8_t subOp)
+{
+   if (i->encSize == 8) {
+      code[0] = 0x00000000 | (subOp << 26);
+      code[1] = 0xc8000000;
+
+      emitPredicate(i);
+
+      defId(i->def[0], 14);
+      srcId(i->src[0], 20);
+
+      assert(i->src[0].getFile() == FILE_GPR); // checked after the id was already encoded
+
+      if (i->saturate) code[0] |= 1 << 5;
+
+      if (i->src[0].mod.abs()) code[0] |= 1 << 7;
+      if (i->src[0].mod.neg()) code[0] |= 1 << 9;
+   } else {
+      emitForm_S(i, 0x80000008 | (subOp << 26), true);
+
+      assert(!i->src[0].mod.neg()); // the short form encodes abs only
+      if (i->src[0].mod.abs()) code[0] |= 1 << 30;
+   }
+}
+
+// Encodes MIN/MAX (long form only); type bits are folded into the opcode before emitForm_A.
+void CodeEmitterNVC0::emitMINMAX(const Instruction *i)
+{
+   uint64_t op;
+
+   assert(i->encSize == 8);
+
+   op = (i->op == OP_MIN) ? 0x080e000000000000ULL : 0x081e000000000000ULL; // any op but OP_MIN gets the second value
+
+   if (i->ftz)
+      op |= 1 << 5;
+   else
+   if (!isFloatType(i->dType))
+      op |= isSignedType(i->dType) ? 0x23 : 0x03; // integer types; skipped whenever ftz is set
+
+   emitForm_A(i, op);
+   emitNegAbs12(i);
+}
+
+// Rounding for the long CVT form: mode at code[1] bit 17; the *I modes also set code[0] bit 7.
+void CodeEmitterNVC0::roundMode_C(const Instruction *i)
+{
+   switch (i->rnd) {
+   case ROUND_M:  code[1] |= 1 << 17; break;
+   case ROUND_P:  code[1] |= 2 << 17; break;
+   case ROUND_Z:  code[1] |= 3 << 17; break;
+   case ROUND_NI: code[0] |= 1 << 7; break;
+   case ROUND_MI: code[0] |= 1 << 7; code[1] |= 1 << 17; break;
+   case ROUND_PI: code[0] |= 1 << 7; code[1] |= 2 << 17; break;
+   case ROUND_ZI: code[0] |= 1 << 7; code[1] |= 3 << 17; break;
+   case ROUND_N: break; // nothing to encode
+   default:
+      assert(!"invalid round mode");
+      break;
+   }
+}
+
+// Short-form rounding: writes a 2-bit mode at code[0] bit 16.
+// The *I variants are encoded exactly like their base modes.
+void
+CodeEmitterNVC0::roundMode_CS(const Instruction *i)
+{
+   if (i->rnd == ROUND_M || i->rnd == ROUND_MI)
+      code[0] |= 1 << 16;
+   else if (i->rnd == ROUND_P || i->rnd == ROUND_PI)
+      code[0] |= 2 << 16;
+   else if (i->rnd == ROUND_Z || i->rnd == ROUND_ZI)
+      code[0] |= 3 << 16;
+
+   // every other mode leaves the field untouched
+}
+
+// Encodes conversions, including CEIL/FLOOR/TRUNC/SAT/ABS/NEG; may overwrite i->rnd.
+void CodeEmitterNVC0::emitCVT(Instruction *i)
+{
+   const bool f2f = isFloatType(i->dType) && isFloatType(i->sType);
+
+   switch (i->op) { // float-to-float picks the *I rounding variants
+   case OP_CEIL:  i->rnd = f2f ? ROUND_PI : ROUND_P; break;
+   case OP_FLOOR: i->rnd = f2f ? ROUND_MI : ROUND_M; break;
+   case OP_TRUNC: i->rnd = f2f ? ROUND_ZI : ROUND_Z; break;
+   default:
+      break;
+   }
+
+   const bool sat = (i->op == OP_SAT) || i->saturate;
+   const bool abs = (i->op == OP_ABS) || i->src[0].mod.abs();
+   const bool neg = (i->op == OP_NEG) || i->src[0].mod.neg();
+
+   if (i->encSize == 8) {
+      emitForm_B(i, HEX64(10000000, 00000004));
+
+      roundMode_C(i);
+
+      code[0] |= util_logbase2(i->def[0].getSize()) << 20; // log2 of the def size
+      code[0] |= util_logbase2(i->src[0].getSize()) << 23; // log2 of the source size
+
+      if (sat)
+         code[0] |= 0x20;
+      if (abs)
+         code[0] |= 1 << 6;
+      if (neg && i->op != OP_ABS) // OP_ABS drops a negation of its source
+         code[0] |= 1 << 8;
+
+      if (i->ftz)
+         code[1] |= 1 << 23;
+
+      if (isSignedIntType(i->dType))
+         code[0] |= 0x080;
+      if (isSignedIntType(i->sType))
+         code[0] |= 0x200;
+
+      if (isFloatType(i->dType)) { // code[1] bits 26..27 depend on the float/int combination
+         if (!isFloatType(i->sType))
+            code[1] |= 0x08000000;
+      } else {
+         if (isFloatType(i->sType))
+            code[1] |= 0x04000000;
+         else
+            code[1] |= 0x0c000000;
+      }
+   } else {
+      // NOTE(review): this path writes no def/src ids and no predicate - confirm it is used
+      if (i->op == OP_CEIL || i->op == OP_FLOOR || i->op == OP_TRUNC) {
+         code[0] = 0x298;
+      } else
+      if (isFloatType(i->dType)) {
+         if (isFloatType(i->sType))
+            code[0] = 0x098;
+         else
+            code[0] = 0x088 | (isSignedType(i->sType) ? (1 << 8) : 0);
+      } else {
+         assert(isFloatType(i->sType));
+
+         code[0] = 0x288 | (isSignedType(i->sType) ? (1 << 8) : 0);
+      }
+
+      if (neg) code[0] |= 1 << 16; // NOTE(review): overlaps the bits roundMode_CS writes - confirm
+      if (sat) code[0] |= 1 << 18;
+      if (abs) code[0] |= 1 << 19;
+
+      roundMode_CS(i);
+   }
+}
+
+// Encodes a comparison; the result goes to def 0 at bit 14, or to a predicate at bit 17.
+void CodeEmitterNVC0::emitSET(const CmpInstruction *i)
+{
+   uint32_t hi;
+   uint32_t lo = 0;
+
+   if (i->sType == TYPE_F64)
+      lo = 0x1;
+   else
+   if (!isFloatType(i->sType))
+      lo = 0x3;
+
+   if (isFloatType(i->dType) || isSignedIntType(i->sType))
+      lo |= 0x20;
+
+   switch (i->op) {
+   case OP_SET_AND: hi = 0x10000000; break;
+   case OP_SET_OR:  hi = 0x10200000; break;
+   case OP_SET_XOR: hi = 0x10400000; break;
+   default:
+      hi = 0x100e0000; // every other op
+      break;
+   }
+   emitForm_A(i, (static_cast<uint64_t>(hi) << 32) | lo);
+
+   if (i->def[0].getFile() == FILE_PREDICATE) {
+      if (i->sType == TYPE_F32)
+         code[1] += 0x10000000;
+      else
+         code[1] += 0x08000000;
+
+      code[0] &= ~0xfc000; // clear bits 14..19 written by emitForm_A's defId
+      defId(i->def[0], 17);
+      if (i->defExists(1))
+         defId(i->def[1], 14);
+      else
+         code[0] |= 0x1c000; // no second def: value 7 at bit 14
+   }
+
+   if (i->ftz)
+      code[1] |= 1 << 27;
+
+   emitCondCode(i->setCond, 32 + 23); // code[1] bit 23
+   emitNegAbs12(i);
+}
+
+// Encodes SLCT for S32, U32 or F32; the condition code is stored at code[1] bit 23.
+void CodeEmitterNVC0::emitSLCT(const CmpInstruction *i)
+{
+   uint64_t op;
+
+   switch (i->dType) {
+   case TYPE_S32:
+      op = HEX64(30000000, 00000023);
+      break;
+   case TYPE_U32:
+      op = HEX64(30000000, 00000003);
+      break;
+   case TYPE_F32:
+      op = HEX64(38000000, 00000000);
+      break;
+   default:
+      assert(!"invalid type for SLCT");
+      op = 0; // keeps op defined when asserts are compiled out
+      break;
+   }
+   emitForm_A(i, op);
+
+   CondCode cc = i->setCond;
+
+   if (i->src[2].mod.neg())
+      cc = reverseCondCode(cc); // a negated source 2 is folded into the condition
+
+   emitCondCode(cc, 32 + 23);
+
+   if (i->ftz)
+      code[0] |= 1 << 5;
+}
+
+void CodeEmitterNVC0::emitSELP(const Instruction *i)
+{
+   // code[1] bit 20 is set for cc == CC_NOT_P or for a NOT modifier on source 2
+   const bool inverted = i->cc == CC_NOT_P || (i->src[2].mod & Modifier(NV50_IR_MOD_NOT));
+   emitForm_A(i, HEX64(20000000, 00000004));
+   if (inverted) code[1] |= 1 << 20;
+}
+
+void CodeEmitterNVC0::emitTEXCSAA(const TexInstruction *i)
+{
+   code[0] = 0x00000086;
+   code[1] = 0xd0000000;
+   // tex.r goes into the low bits of code[1], tex.s starts at bit 8
+   code[1] |= i->tex.r;
+   code[1] |= i->tex.s << 8;
+
+   if (i->tex.liveOnly)
+      code[0] |= 1 << 9;
+   // only def 0 and source 0 are encoded; emitPredicate is not called here
+   defId(i->def[0], 14);
+   srcId(i->src[0], 20);
+}
+
+// Encodes TEX/TXB/TXL/TXF/TXG/TXD; i->op selects the initial value of code[1].
+void CodeEmitterNVC0::emitTEX(const TexInstruction *i)
+{
+   code[0] = 0x00000006;
+   if (1)
+      code[0] |= 0x80; // normal/t/p mode = t, XXX: what is this ?
+
+   if (i->tex.liveOnly)
+      code[0] |= 1 << 9;
+
+   switch (i->op) {
+   case OP_TEX: code[1] = 0x80000000; break;
+   case OP_TXB: code[1] = 0x84000000; break;
+   case OP_TXL: code[1] = 0x86000000; break;
+   case OP_TXF: code[1] = 0x92000000; break;
+   case OP_TXG: code[1] = 0xa0000000; break;
+   case OP_TXD: code[1] = 0xe0000000; break;
+   default:
+      code[1] = 0; // keep code[1] defined without asserts, like val/op in emitCondCode/emitSLCT
+      assert(!"invalid texture op");
+      break;
+   }
+   defId(i->def[0], 14);
+   srcId(i->src[0], 20);
+
+   emitPredicate(i);
+
+   if (i->op == OP_TXG) code[0] |= i->tex.gatherComp << 5;
+
+   code[1] |= i->tex.mask << 14;
+
+   code[1] |= i->tex.r;
+   code[1] |= i->tex.s << 8;
+   if (i->tex.rIndirectSrc >= 0 || i->tex.sIndirectSrc >= 0)
+      code[1] |= 1 << 18; // in 1st source (with array index)
+
+   // texture target:
+   code[1] |= (i->tex.target.getDim() - 1) << 20;
+   if (i->tex.target.isCube())
+      code[1] += 2 << 20;
+   if (i->tex.target.isArray())
+      code[1] |= 1 << 19;
+   if (i->tex.target.isShadow())
+      code[1] |= 1 << 24;
+
+   int src1 = i->tex.target.getArgCount(); // index of the source that follows the target's arguments
+
+   if (i->src[src1].getFile() == FILE_IMMEDIATE) { // lzero
+      if (i->op == OP_TXL)
+         code[1] &= ~(1 << 26);
+      else
+      if (i->op == OP_TXF)
+         code[1] &= ~(1 << 25);
+   }
+   if (i->tex.target == TEX_TARGET_2D_MS ||
+       i->tex.target == TEX_TARGET_2D_MS_ARRAY)
+      code[1] |= 1 << 23;
+
+   if (i->tex.useOffsets) // in vecSrc0.w
+      code[1] |= 1 << 22;
+
+   srcId(i->src[src1], 26);
+}
+
+// Encodes a texture query; the query kind is a value 0..5 at code[1] bit 22.
+void CodeEmitterNVC0::emitTXQ(const TexInstruction *i)
+{
+   code[0] = 0x00000086;
+   code[1] = 0xc0000000;
+
+   switch (i->tex.query) {
+   case TXQ_DIMS:            code[1] |= 0 << 22; break;
+   case TXQ_TYPE:            code[1] |= 1 << 22; break;
+   case TXQ_SAMPLE_POSITION: code[1] |= 2 << 22; break;
+   case TXQ_FILTER:          code[1] |= 3 << 22; break;
+   case TXQ_LOD:             code[1] |= 4 << 22; break;
+   case TXQ_BORDER_COLOUR:   code[1] |= 5 << 22; break;
+   default:
+      assert(!"invalid texture query");
+      break;
+   }
+
+   code[1] |= i->tex.mask << 14;
+
+   code[1] |= i->tex.r;
+   code[1] |= i->tex.s << 8;
+   if (i->tex.sIndirectSrc >= 0 || i->tex.rIndirectSrc >= 0)
+      code[1] |= 1 << 18; // same bit emitTEX sets for indirect r/s
+
+   defId(i->def[0], 14);
+   srcId(i->src[0], 20);
+   srcId(i->src[1], 26);
+
+   emitPredicate(i);
+}
+
+// Encodes a quad operation: laneMask at code[0] bit 6, qOp in the low bits of code[1].
+// When source 1 is absent, source 0 is encoded in both source fields.
+void CodeEmitterNVC0::emitQUADOP(const Instruction *i, uint8_t qOp, uint8_t laneMask)
+{
+   const int second = i->srcExists(1) ? 1 : 0;
+
+   code[1] = 0x48000000 | qOp;
+   code[0] = laneMask << 6;
+   emitPredicate(i);
+   defId(i->def[0], 14);
+   srcId(i->src[0], 20);
+   srcId(i->src[second], 26);
+}
+
+// Encodes control-flow ops; mask records whether a predicate and/or a target is encoded.
+void CodeEmitterNVC0::emitFlow(const Instruction *i)
+{
+   const FlowInstruction *f = i->asFlow();
+
+   unsigned mask; // bit 0: predicate, bit 1: target
+
+   code[0] = 0x00000007;
+
+   switch (i->op) {
+   case OP_BRA:
+      // NOTE(review): f is dereferenced here and in OP_CALL before the !f check below - confirm
+      code[1] = f->absolute ? 0x00000000 : 0x40000000;
+      if (i->src[0].getFile() == FILE_MEMORY_CONST ||
+          i->src[1].getFile() == FILE_MEMORY_CONST)
+         code[1] |= 0x4000;
+      mask = 3;
+      break;
+   case OP_CALL:
+      code[1] = f->absolute ? 0x10000000 : 0x50000000;
+      if (i->src[0].getFile() == FILE_MEMORY_CONST)
+         code[1] |= 0x4000;
+      mask = 2;
+      break;
+
+   case OP_EXIT:    code[1] = 0x80000000; mask = 1; break;
+   case OP_RET:     code[1] = 0x90000000; mask = 1; break;
+   case OP_DISCARD: code[1] = 0x98000000; mask = 1; break;
+   case OP_BREAK:   code[1] = 0xa8000000; mask = 1; break;
+   case OP_CONT:    code[1] = 0xb0000000; mask = 1; break;
+
+   case OP_JOINAT:   code[1] = 0x60000000; mask = 2; break;
+   case OP_PREBREAK: code[1] = 0x68000000; mask = 2; break;
+   case OP_PRECONT:  code[1] = 0x70000000; mask = 2; break;
+   case OP_PRERET:   code[1] = 0x78000000; mask = 2; break;
+
+   case OP_QUADON:  code[1] = 0xc0000000; mask = 0; break;
+   case OP_QUADPOP: code[1] = 0xc8000000; mask = 0; break;
+   case OP_BRKPT:   code[1] = 0xd0000000; mask = 0; break;
+   default:
+      assert(!"invalid flow operation");
+      return;
+   }
+
+   if (mask & 1) {
+      emitPredicate(i);
+      if (i->flagsSrc < 0)
+         code[0] |= 0x1e0; // no flags source: bits 5..8 are all set
+   }
+
+   if (!f)
+      return;
+
+   if (f->allWarp)
+      code[0] |= 1 << 15;
+   if (f->limit)
+      code[0] |= 1 << 16;
+
+   if (f->op == OP_CALL) {
+      if (f->builtin) {
+         assert(f->absolute);
+         uint32_t pcAbs = targ->getBuiltinOffset(f->target.builtin);
+         addReloc(RelocEntry::TYPE_BUILTIN, 0, pcAbs, 0xfc000000, 26); // split like pcRel below
+         addReloc(RelocEntry::TYPE_BUILTIN, 1, pcAbs, 0x03ffffff, -6);
+      } else {
+         assert(!f->absolute);
+         int32_t pcRel = f->target.fn->binPos - (codeSize + 8); // offset from codeSize + 8
+         code[0] |= (pcRel & 0x3f) << 26;
+         code[1] |= (pcRel >> 6) & 0x3ffff;
+      }
+   } else
+   if (mask & 2) {
+      int32_t pcRel = f->target.bb->binPos - (codeSize + 8);
+      // currently we don't want absolute branches
+      assert(!f->absolute);
+      code[0] |= (pcRel & 0x3f) << 26; // low 6 bits
+      code[1] |= (pcRel >> 6) & 0x3ffff; // next 18 bits
+   }
+}
+
+// Encodes PFETCH: source 0's u32 value is split across both words, source 1 goes to bit 20.
+void CodeEmitterNVC0::emitPFETCH(const Instruction *i)
+{
+   uint32_t prim = i->src[0].get()->reg.data.u32;
+
+   code[0] = 0x00000006 | ((prim & 0x3f) << 26); // low 6 bits of prim
+   code[1] = 0x00000000 | (prim >> 6); // remaining bits of prim
+
+   emitPredicate(i);
+
+   defId(i->def[0], 14);
+   srcId(i->src[1], 20);
+}
+
+// Encodes VFETCH: source 0's offset in code[1], its two indirect sources at bits 20 and 26.
+void CodeEmitterNVC0::emitVFETCH(const Instruction *i)
+{
+   code[0] = 0x00000006;
+   code[1] = 0x06000000 | i->src[0].get()->reg.data.offset;
+
+   if (i->perPatch)
+      code[0] |= 0x100;
+   if (i->getSrc(0)->reg.file == FILE_SHADER_OUTPUT)
+      code[0] |= 0x200; // yes, TCPs can read from *outputs* of other threads
+
+   emitPredicate(i);
+
+   code[0] |= (i->defCount(0xf) - 1) << 5; // defCount(0xf) minus one, at bit 5
+
+   defId(i->def[0], 14);
+   srcId(i->src[0].getIndirect(0), 20);
+   srcId(i->src[0].getIndirect(1), 26); // vertex address
+}
+
+// Encodes EXPORT: source 0 supplies the offset, source 1 (a GPR) goes to bit 26.
+void CodeEmitterNVC0::emitEXPORT(const Instruction *i)
+{
+   unsigned int size = typeSizeof(i->dType);
+
+   code[0] = 0x00000006 | ((size / 4 - 1) << 5); // size in 32-bit words minus one
+   code[1] = 0x0a000000 | i->src[0].get()->reg.data.offset;
+
+   assert(size != 12 && !(code[1] & (size - 1))); // the offset must be aligned to size
+
+   if (i->perPatch)
+      code[0] |= 0x100;
+
+   emitPredicate(i);
+
+   assert(i->src[1].getFile() == FILE_GPR);
+
+   srcId(i->src[0].getIndirect(0), 20);
+   srcId(i->src[0].getIndirect(1), 32 + 17); // vertex base address
+   srcId(i->src[1], 26);
+}
+
+// Encodes OP_EMIT / OP_RESTART; source 1 is the vertex stream (immediate or register).
+void CodeEmitterNVC0::emitOUT(const Instruction *i)
+{
+   code[0] = 0x00000006;
+   code[1] = 0x1c000000;
+
+   emitPredicate(i);
+
+   defId(i->def[0], 14); // new secret address
+   srcId(i->src[0], 20); // old secret address, should be 0 initially
+
+   assert(i->src[0].getFile() == FILE_GPR);
+
+   if (i->op == OP_EMIT)
+      code[0] |= 1 << 5;
+   if (i->op == OP_RESTART || i->subOp == NV50_IR_SUBOP_EMIT_RESTART)
+      code[0] |= 1 << 6; // OP_EMIT with the EMIT_RESTART subOp sets both bits
+
+   // vertex stream
+   if (i->src[1].getFile() == FILE_IMMEDIATE) {
+      code[1] |= 0xc000;
+      code[0] |= SDATA(i->src[1]).u32 << 26; // the value is not masked here
+   } else {
+      srcId(i->src[1], 26);
+   }
+}
+
+// OR the interpolation mode bits into code[0].  The short encoding only
+// knows a single flag (0x80 for NV50_IR_INTERP_SC) and is restricted to
+// PINTERP without a sample mode.
+void
+CodeEmitterNVC0::emitInterpMode(const Instruction *i)
+{
+   if (i->encSize != 8) {
+      if (i->getInterpMode() == NV50_IR_INTERP_SC)
+         code[0] |= 0x80;
+      assert(i->op == OP_PINTERP && i->getSampleMode() == 0);
+      return;
+   }
+
+   code[0] |= i->ipa << 6; // TODO: INTERP_SAMPLEID
+}
+
+// Encode LINTERP / PINTERP.
+//
+// Long (8 byte) form: the offset of source 0 goes into the low 16 bits of
+// code[1], the indirect address register of source 0 to bit 20 and, for
+// PINTERP, source 1 to bit 26 (otherwise 0x3f is written there).
+// Short (4 byte) form: PINTERP only, with the offset packed into code[0].
+//
+// The optional offset register (NV50_IR_INTERP_OFFSET sample mode) is
+// encoded at bit 17 of the second word; 0x3f is written there if absent.
+void
+CodeEmitterNVC0::emitINTERP(const Instruction *i)
+{
+   const uint32_t base = i->getSrc(0)->reg.data.offset;
+
+   if (i->encSize == 8) {
+      code[0] = 0x00000000;
+      code[1] = 0xc0000000 | (base & 0xffff);
+
+      if (i->saturate)
+         code[0] |= 1 << 5;
+
+      if (i->op == OP_PINTERP)
+         srcId(i->src[1], 26);
+      else
+         code[0] |= 0x3f << 26;
+
+      srcId(i->src[0].getIndirect(0), 20);
+   } else {
+      assert(i->op == OP_PINTERP);
+      code[0] = 0x00000009 | ((base & 0xc) << 6) | ((base >> 4) << 26);
+      srcId(i->src[1], 20);
+   }
+   emitInterpMode(i);
+
+   emitPredicate(i);
+   defId(i->def[0], 14);
+
+   // NOTE(review): for the 4 byte form the statements below still touch
+   // code[1], which is not part of the instruction - confirm before short
+   // encodings are enabled.
+   if (i->getSampleMode() == NV50_IR_INTERP_OFFSET) {
+      // The register must go to the same field of code[1] that receives the
+      // 0x3f marker in the else branch, hence position 32 + 17 (the same
+      // convention emitEXPORT uses).  Position 17 would land in code[0] and
+      // collide with the destination register emitted at bit 14.
+      srcId(i->src[i->op == OP_PINTERP ? 2 : 1], 32 + 17);
+   } else {
+      code[1] |= 0x3f << 17;
+   }
+}
+
+// OR the access type field for a load/store of type @ty into code[0].
+// Unknown types trip an assertion and fall back to the 32-bit encoding.
+void
+CodeEmitterNVC0::emitLoadStoreType(DataType ty)
+{
+   uint32_t typeBits = 0x80; // encoding shared by F32/U32/S32 and the fallback
+
+   switch (ty) {
+   case TYPE_U8:   typeBits = 0x00; break;
+   case TYPE_S8:   typeBits = 0x20; break;
+   case TYPE_F16:
+   case TYPE_U16:  typeBits = 0x40; break;
+   case TYPE_S16:  typeBits = 0x60; break;
+   case TYPE_F32:
+   case TYPE_U32:
+   case TYPE_S32:
+      break;
+   case TYPE_F64:
+   case TYPE_U64:
+   case TYPE_S64:  typeBits = 0xa0; break;
+   case TYPE_B128: typeBits = 0xc0; break;
+   default:
+      assert(!"invalid type");
+      break;
+   }
+   code[0] |= typeBits;
+}
+
+// OR the caching mode field (bits 8..9 of code[0]) for cache mode @c.
+// Unknown modes trip an assertion and encode as 0.
+void
+CodeEmitterNVC0::emitCachingMode(CacheMode c)
+{
+   uint32_t modeBits = 0x000;
+
+   switch (c) {
+   case CACHE_CA: // also what CACHE_WB would use
+      break;
+   case CACHE_CG:
+      modeBits = 0x100;
+      break;
+   case CACHE_CS:
+      modeBits = 0x200;
+      break;
+   case CACHE_CV: // also what CACHE_WT would use
+      modeBits = 0x300;
+      break;
+   default:
+      assert(!"invalid caching mode");
+      break;
+   }
+   code[0] |= modeBits;
+}
+
+// Encode a STORE to global, local or shared memory.  Source 0 is the
+// address, source 1 the value to store.
+void
+CodeEmitterNVC0::emitSTORE(const Instruction *i)
+{
+   uint32_t hiWord = 0; // stays 0 for an invalid memory file
+
+   switch (i->src[0].getFile()) {
+   case FILE_MEMORY_GLOBAL:
+      hiWord = 0x90000000;
+      break;
+   case FILE_MEMORY_LOCAL:
+      hiWord = 0xc8000000;
+      break;
+   case FILE_MEMORY_SHARED:
+      hiWord = 0xc9000000;
+      break;
+   default:
+      assert(!"invalid memory file");
+      break;
+   }
+
+   code[0] = 0x00000005;
+   code[1] = hiWord;
+
+   setAddress16(i->src[0]);
+   srcId(i->src[1], 14);
+   srcId(i->src[0].getIndirect(0), 20);
+
+   emitPredicate(i);
+
+   emitLoadStoreType(i->dType);
+   emitCachingMode(i->cache);
+}
+
+// Encode a LOAD from global, local, shared or constant memory.
+// A direct, 4 byte load from constant memory is emitted as a MOV instead.
+void
+CodeEmitterNVC0::emitLOAD(const Instruction *i)
+{
+   uint32_t hiWord = 0; // stays 0 for an invalid memory file
+
+   code[0] = 0x00000005;
+
+   if (i->src[0].getFile() == FILE_MEMORY_CONST &&
+       !i->src[0].isIndirect(0) && typeSizeof(i->dType) == 4) {
+      emitMOV(i); // not sure if this is any better
+      return;
+   }
+
+   switch (i->src[0].getFile()) {
+   case FILE_MEMORY_GLOBAL:
+      hiWord = 0x80000000;
+      break;
+   case FILE_MEMORY_LOCAL:
+      hiWord = 0xc0000000;
+      break;
+   case FILE_MEMORY_SHARED:
+      hiWord = 0xc1000000;
+      break;
+   case FILE_MEMORY_CONST:
+      // constant loads use another opcode and carry the file index and subOp
+      hiWord = 0x14000000 | (i->src[0].get()->reg.fileIndex << 10);
+      code[0] = 0x00000006 | (i->subOp << 8);
+      break;
+   default:
+      assert(!"invalid memory file");
+      break;
+   }
+   code[1] = hiWord;
+
+   defId(i->def[0], 14);
+
+   setAddress16(i->src[0]);
+   srcId(i->src[0].getIndirect(0), 20);
+
+   emitPredicate(i);
+
+   emitLoadStoreType(i->dType);
+   emitCachingMode(i->cache);
+}
+
+// Translate the system value referenced by @ref into the special register
+// number used by emitMOV().  Indexed system values add their index to a base
+// number.  System values without a register trip an assertion and yield 0.
+uint8_t
+CodeEmitterNVC0::getSRegEncoding(const ValueRef& ref)
+{
+   const int idx = SDATA(ref).sv.index;
+
+   switch (SDATA(ref).sv.sv) {
+   // not indexed
+   case SV_LANEID:        return 0x00;
+   case SV_PHYSID:        return 0x03;
+   case SV_VERTEX_COUNT:  return 0x10;
+   case SV_INVOCATION_ID: return 0x11;
+   case SV_YDIR:          return 0x12;
+   case SV_GRIDID:        return 0x2c;
+   case SV_SBASE:         return 0x30;
+   case SV_LBASE:         return 0x34;
+   // indexed
+   case SV_TID:           return 0x21 + idx;
+   case SV_CTAID:         return 0x25 + idx;
+   case SV_NTID:          return 0x29 + idx;
+   case SV_NCTAID:        return 0x2d + idx;
+   case SV_CLOCK:         return 0x50 + idx;
+   default:
+      break;
+   }
+   assert(!"no sreg for system value");
+   return 0;
+}
+
+// Encode MOV / RDSV.  Three cases are distinguished:
+//  1. the source is a system value: read of a special register
+//  2. long (8 byte) encoding: opcode chosen by source file, via emitForm_B
+//  3. short (4 byte) encoding: immediate or emitShortSrc2 operand
+void
+CodeEmitterNVC0::emitMOV(const Instruction *i)
+{
+   if (i->src[0].getFile() == FILE_SYSTEM_VALUE) {
+      const uint8_t sreg = getSRegEncoding(i->src[0]);
+
+      if (i->encSize != 8) {
+         code[0] = 0x40000008 | (sreg << 20);
+      } else {
+         code[0] = 0x00000004 | (sreg << 26);
+         code[1] = 0x2c000000;
+      }
+      defId(i->def[0], 14);
+
+      emitPredicate(i);
+      return;
+   }
+
+   if (i->encSize == 8) {
+      uint64_t opcode = HEX64(28000000, 00000004); // default opcode
+
+      if (i->src[0].getFile() == FILE_IMMEDIATE)
+         opcode = HEX64(18000000, 000001e2);
+      else if (i->src[0].getFile() == FILE_PREDICATE)
+         opcode = HEX64(080e0000, 1c000004);
+
+      opcode |= i->lanes << 5;
+
+      emitForm_B(i, opcode);
+      return;
+   }
+
+   // short encoding
+   if (i->src[0].getFile() != FILE_IMMEDIATE) {
+      code[0] = 0x0028;
+      emitShortSrc2(i->src[0]);
+   } else {
+      const uint32_t imm = SDATA(i->src[0]).u32;
+
+      if (imm & 0xfff00000) {
+         // only the upper 12 bits may be set, they are stored in place
+         assert(!(imm & 0x000fffff));
+         code[0] = 0x00000318 | imm;
+      } else {
+         // NOTE(review): with the upper 12 bits clear this assertion looks
+         // like it can never fail - confirm the intended immediate range.
+         assert(imm < 0x800 || ((int32_t)imm >= -0x800));
+         code[0] = 0x00000118 | (imm << 20);
+      }
+   }
+   defId(i->def[0], 14);
+
+   emitPredicate(i);
+}
+
+// Encode a single instruction at the current output position and advance
+// the output pointer by the instruction's encoding size.
+//
+// Returns false (without advancing) if the instruction has no encoding size,
+// if it does not fit into the remaining output buffer, or if its operation
+// is not one that can be emitted at this stage.
+bool
+CodeEmitterNVC0::emitInstruction(Instruction *insn)
+{
+   if (!insn->encSize) {
+      ERROR("skipping unencodable instruction: "); insn->print();
+      return false;
+   } else
+   if (codeSize + insn->encSize > codeSizeLimit) {
+      ERROR("code emitter output buffer too small\n");
+      return false;
+   }
+
+   // assert that instructions with multiple defs don't corrupt registers
+   for (int d = 0; insn->defExists(d); ++d)
+      assert(insn->asTex() || insn->def[d].rep()->reg.data.id >= 0);
+
+   switch (insn->op) {
+   case OP_MOV:
+   case OP_RDSV:
+      emitMOV(insn);
+      break;
+   case OP_NOP:
+      break;
+   case OP_LOAD:
+      emitLOAD(insn);
+      break;
+   case OP_STORE:
+      emitSTORE(insn);
+      break;
+   case OP_LINTERP:
+   case OP_PINTERP:
+      emitINTERP(insn);
+      break;
+   case OP_VFETCH:
+      emitVFETCH(insn);
+      break;
+   case OP_EXPORT:
+      emitEXPORT(insn);
+      break;
+   case OP_PFETCH:
+      emitPFETCH(insn);
+      break;
+   case OP_EMIT:
+   case OP_RESTART:
+      emitOUT(insn);
+      break;
+   case OP_ADD:
+   case OP_SUB:
+      if (isFloatType(insn->dType))
+         emitFADD(insn);
+      else
+         emitUADD(insn);
+      break;
+   case OP_MUL:
+      if (isFloatType(insn->dType))
+         emitFMUL(insn);
+      else
+         emitUMUL(insn);
+      break;
+   case OP_MAD:
+   case OP_FMA:
+      if (isFloatType(insn->dType))
+         emitFMAD(insn);
+      else
+         emitIMAD(insn);
+      break;
+   case OP_NOT:
+      emitNOT(insn);
+      break;
+   case OP_AND:
+      emitLogicOp(insn, 0);
+      break;
+   case OP_OR:
+      emitLogicOp(insn, 1);
+      break;
+   case OP_XOR:
+      emitLogicOp(insn, 2);
+      break;
+   case OP_SHL:
+   case OP_SHR:
+      emitShift(insn);
+      break;
+   case OP_SET:
+   case OP_SET_AND:
+   case OP_SET_OR:
+   case OP_SET_XOR:
+      emitSET(insn->asCmp());
+      break;
+   case OP_SELP:
+      emitSELP(insn);
+      break;
+   case OP_SLCT:
+      emitSLCT(insn->asCmp());
+      break;
+   case OP_MIN:
+   case OP_MAX:
+      emitMINMAX(insn);
+      break;
+   case OP_ABS:
+   case OP_NEG:
+   case OP_CEIL:
+   case OP_FLOOR:
+   case OP_TRUNC:
+   case OP_CVT:
+   case OP_SAT:
+      emitCVT(insn);
+      break;
+   // special function unit operations, selected by the second argument
+   case OP_RSQ:
+      emitSFnOp(insn, 5);
+      break;
+   case OP_RCP:
+      emitSFnOp(insn, 4);
+      break;
+   case OP_LG2:
+      emitSFnOp(insn, 3);
+      break;
+   case OP_EX2:
+      emitSFnOp(insn, 2);
+      break;
+   case OP_SIN:
+      emitSFnOp(insn, 1);
+      break;
+   case OP_COS:
+      emitSFnOp(insn, 0);
+      break;
+   case OP_PRESIN:
+   case OP_PREEX2:
+      emitPreOp(insn);
+      break;
+   case OP_TEX:
+   case OP_TXB:
+   case OP_TXL:
+   case OP_TXD:
+   case OP_TXF:
+      emitTEX(insn->asTex());
+      break;
+   case OP_TXQ:
+      emitTXQ(insn->asTex());
+      break;
+   case OP_BRA:
+   case OP_CALL:
+   case OP_PRERET:
+   case OP_RET:
+   case OP_DISCARD:
+   case OP_EXIT:
+   case OP_PRECONT:
+   case OP_CONT:
+   case OP_PREBREAK:
+   case OP_BREAK:
+   case OP_JOINAT:
+   case OP_BRKPT:
+   case OP_QUADON:
+   case OP_QUADPOP:
+      emitFlow(insn);
+      break;
+   case OP_QUADOP:
+      emitQUADOP(insn, insn->subOp, insn->lanes);
+      break;
+   case OP_DFDX:
+      emitQUADOP(insn, insn->src[0].mod.neg() ? 0x66 : 0x99, 0x4);
+      break;
+   case OP_DFDY:
+      emitQUADOP(insn, insn->src[0].mod.neg() ? 0x5a : 0xa5, 0x5);
+      break;
+   case OP_POPCNT:
+      emitPOPC(insn);
+      break;
+   case OP_JOIN:
+      // a JOIN is emitted as a NOP that carries the join flag (set below)
+      emitNOP(insn);
+      insn->join = 1;
+      break;
+   case OP_PHI:
+   case OP_UNION:
+   case OP_CONSTRAINT:
+      ERROR("operation should have been eliminated\n");
+      return false;
+   case OP_EXP:
+   case OP_LOG:
+   case OP_SQRT:
+   case OP_POW:
+      ERROR("operation should have been lowered\n");
+      return false;
+   default:
+      ERROR("unknown op\n");
+      return false;
+   }
+
+   // the join flag only exists in the long encoding
+   if (insn->join) {
+      code[0] |= 0x10;
+      assert(insn->encSize == 8);
+   }
+
+   code += insn->encSize / 4;
+   codeSize += insn->encSize;
+   return true;
+}
+
+// Return the smallest encoding size (in bytes) usable for instruction @i.
+//
+// NOTE: the first condition contains "|| 1", so this currently always
+// returns 8 and everything after it is unreachable.  The remaining code
+// lists the conditions under which the 4 byte form would be rejected.
+uint32_t
+CodeEmitterNVC0::getMinEncodingSize(const Instruction *i) const
+{
+   const Target::OpInfo &info = targ->getOpInfo(i);
+
+   // short encodings are disabled: always true because of the "|| 1"
+   if (info.minEncSize == 8 || 1)
+      return 8;
+
+   // instruction flags that force the long form
+   if (i->ftz || i->saturate || i->join)
+      return 8;
+   if (i->rnd != ROUND_N)
+      return 8;
+   if (i->predSrc >= 0 && i->op == OP_MAD)
+      return 8;
+
+   if (i->op == OP_PINTERP) {
+      if (i->getSampleMode() || 1) // XXX: grr, short op doesn't work
+         return 8;
+   } else
+   if (i->op == OP_MOV && i->lanes != 0xf) {
+      return 8;
+   }
+
+   // per-source restrictions
+   for (int s = 0; i->srcExists(s); ++s) {
+      if (i->src[s].isIndirect(0))
+         return 8;
+
+      if (i->src[s].getFile() == FILE_MEMORY_CONST) {
+         // only small offsets and constant buffers 0, 1 and 16 pass
+         if (SDATA(i->src[s]).offset >= 0x100)
+            return 8;
+         if (i->getSrc(s)->reg.fileIndex > 1 &&
+             i->getSrc(s)->reg.fileIndex != 16)
+             return 8;
+      } else
+      if (i->src[s].getFile() == FILE_IMMEDIATE) {
+         // NOTE(review): both branches test the same bound (>= 0x100 and
+         // > 0xff) - confirm whether F32 was meant to be treated differently
+         if (i->dType == TYPE_F32) {
+            if (SDATA(i->src[s]).u32 >= 0x100)
+               return 8;
+         } else {
+            if (SDATA(i->src[s]).u32 > 0xff)
+               return 8;
+         }
+      }
+
+      // source modifiers are not checked for CVT
+      if (i->op == OP_CVT)
+         continue;
+      if (i->src[s].mod != Modifier(0)) {
+         // ABS passes only for RSQ, NEG only for source 0 of ADD
+         if (i->src[s].mod == Modifier(NV50_IR_MOD_ABS))
+            if (i->op != OP_RSQ)
+               return 8;
+         if (i->src[s].mod == Modifier(NV50_IR_MOD_NEG))
+            if (i->op != OP_ADD || s != 0)
+               return 8;
+      }
+   }
+
+   return 4;
+}
+
+// Create an emitter for @target with no output buffer attached yet.
+CodeEmitterNVC0::CodeEmitterNVC0(const TargetNVC0 *target) : targ(target)
+{
+   code = NULL;
+   relocInfo = NULL;
+
+   codeSize = 0;
+   codeSizeLimit = 0;
+}
+
+// Allocate a new NVC0 code emitter set up for programs of the given type.
+// The emitter is created with new and handed to the caller.
+CodeEmitter *
+TargetNVC0::getCodeEmitter(Program::Type type)
+{
+   CodeEmitterNVC0 *const emitter = new CodeEmitterNVC0(this);
+
+   emitter->setProgramType(type);
+   return emitter;
+}
+
+} // namespace nv50_ir
diff --git a/src/gallium/drivers/nvc0/codegen/nv50_ir_lowering_nvc0.cpp b/src/gallium/drivers/nvc0/codegen/nv50_ir_lowering_nvc0.cpp
new file mode 100644
index 00000000000..de73efcc56a
--- /dev/null
+++ b/src/gallium/drivers/nvc0/codegen/nv50_ir_lowering_nvc0.cpp
@@ -0,0 +1,705 @@
+
+#include "nv50/codegen/nv50_ir.h"
+#include "nv50/codegen/nv50_ir_build_util.h"
+
+#include "nv50_ir_target_nvc0.h"
+
+namespace nv50_ir {
+
+// 2-bit operation selectors combined by QUADOP() below
+#define QOP_ADD  0
+#define QOP_SUBR 1
+#define QOP_SUB  2
+#define QOP_MOV2 3
+
+// Pack four QOP_* selectors into one byte: q occupies bits 0..1, r bits
+// 2..3, s bits 4..5 and t bits 6..7.  Arguments are given without the QOP_
+// prefix, e.g. QUADOP(MOV2, ADD, MOV2, ADD).
+#define QUADOP(q, r, s, t)                      \
+   ((QOP_##q << 0) | (QOP_##r << 2) |           \
+    (QOP_##s << 4) | (QOP_##t << 6))
+
+// Legalization pass that TargetNVC0::runLegalizePass() runs for
+// CG_STAGE_SSA.  It replaces DIV/MOD of type U32/S32 by calls to the builtin
+// library; the handler for F64 RCP/RSQ is still a stub.
+class NVC0LegalizeSSA : public Pass
+{
+private:
+   virtual bool visit(BasicBlock *);
+   virtual bool visit(Function *);
+
+   // we want to insert calls to the builtin library only after optimization
+   void handleDIV(Instruction *); // integer division, modulus
+   void handleRCPRSQ(Instruction *); // double precision float recip/rsqrt
+
+private:
+   BuildUtil bld; // used to insert the replacement instructions
+};
+
+// Replace an integer DIV or MOD by a call to the matching builtin routine.
+//
+// The operands are moved to $r0 (source 0) and $r1 (source 1); after the
+// call the result is copied from $r0 for DIV and from $r1 for MOD.  Clobber
+// markers are added for the GPRs and predicates the builtin overwrites.
+// Instructions whose type is neither U32 nor S32 are left untouched.
+void
+NVC0LegalizeSSA::handleDIV(Instruction *i)
+{
+   FlowInstruction *call;
+   int builtin;
+   Value *def[2];
+
+   // Select the builtin before emitting anything, so that unsupported types
+   // do not leave stray moves to the fixed registers $r0/$r1 behind.
+   switch (i->dType) {
+   case TYPE_U32: builtin = NVC0_BUILTIN_DIV_U32; break;
+   case TYPE_S32: builtin = NVC0_BUILTIN_DIV_S32; break;
+   default:
+      return;
+   }
+
+   bld.setPosition(i, false);
+   def[0] = bld.mkMovToReg(0, i->getSrc(0))->getDef(0);
+   def[1] = bld.mkMovToReg(1, i->getSrc(1))->getDef(0);
+
+   call = bld.mkFlow(OP_CALL, NULL, CC_ALWAYS, NULL);
+   bld.mkMov(i->getDef(0), def[(i->op == OP_DIV) ? 0 : 1]);
+   bld.mkClobber(FILE_GPR, (i->op == OP_DIV) ? 0xe : 0xd, 2);
+   bld.mkClobber(FILE_PREDICATE, (i->dType == TYPE_S32) ? 0xf : 0x3, 0);
+
+   call->fixed = 1;
+   call->absolute = call->builtin = 1;
+   call->target.builtin = builtin;
+   delete_Instruction(prog, i);
+}
+
+// Placeholder for F64 RCP/RSQ (called from visit(BasicBlock *));
+// currently does nothing.
+void
+NVC0LegalizeSSA::handleRCPRSQ(Instruction *i)
+{
+   // TODO
+}
+
+// Point the builder at the program owning @fn before its blocks are
+// visited.  Always returns true.
+bool
+NVC0LegalizeSSA::visit(Function *fn)
+{
+   Program *const owner = fn->getProgram();
+
+   bld.setProgram(owner);
+   return true;
+}
+
+// Walk all instructions of @bb and dispatch the ones that need legalizing:
+// DIV/MOD go to handleDIV(), F64 RCP/RSQ to handleRCPRSQ().  Instructions
+// of type F32 are skipped entirely.  Always returns true.
+bool
+NVC0LegalizeSSA::visit(BasicBlock *bb)
+{
+   Instruction *insn = bb->getEntry();
+
+   while (insn) {
+      // fetch the successor first, handleDIV() deletes the instruction
+      Instruction *const following = insn->next;
+
+      if (insn->dType != TYPE_F32) {
+         if (insn->op == OP_DIV || insn->op == OP_MOD) {
+            handleDIV(insn);
+         } else
+         if ((insn->op == OP_RCP || insn->op == OP_RSQ) &&
+             insn->dType == TYPE_F64) {
+            handleRCPRSQ(insn);
+         }
+      }
+      insn = following;
+   }
+   return true;
+}
+
+// Legalization pass that TargetNVC0::runLegalizePass() runs for
+// CG_STAGE_POST_RA.  It removes no-ops, replaces zero immediates by $r63,
+// splits 64 bit operations and rewrites CONT/JOIN control flow.
+class NVC0LegalizePostRA : public Pass
+{
+private:
+   virtual bool visit(Function *);
+   virtual bool visit(BasicBlock *);
+
+   void replaceZero(Instruction *);
+   void split64BitOp(Instruction *);
+   bool tryReplaceContWithBra(BasicBlock *);
+   void propagateJoin(BasicBlock *);
+
+   LValue *r63; // GPR with id 63, created per function in visit(Function *)
+};
+
+// Create the GPR value with id 63 that replaceZero() substitutes for zero
+// immediates in this function.  Always returns true.
+bool
+NVC0LegalizePostRA::visit(Function *fn)
+{
+   LValue *const zeroReg = new_LValue(fn, FILE_GPR);
+
+   zeroReg->reg.data.id = 63;
+   r63 = zeroReg;
+   return true;
+}
+
+// Replace every immediate source of @i whose 64-bit value is 0 by r63.
+void
+NVC0LegalizePostRA::replaceZero(Instruction *i)
+{
+   int s = 0;
+
+   while (i->srcExists(s)) {
+      const ImmediateValue *const imm = i->getSrc(s)->asImm();
+
+      if (imm && imm->reg.data.u64 == 0)
+         i->setSrc(s, r63);
+      ++s;
+   }
+}
+
+// Handle F64 instructions after register allocation: MAD becomes FMA, and
+// every operation other than ADD, MUL, FMA, CVT, MIN, MAX and SET is
+// retyped to U32 and followed by a deep clone of itself.  Instructions of
+// any other type are not touched.
+void
+NVC0LegalizePostRA::split64BitOp(Instruction *i)
+{
+   if (i->dType != TYPE_F64)
+      return;
+
+   if (i->op == OP_MAD)
+      i->op = OP_FMA;
+
+   switch (i->op) {
+   case OP_ADD:
+   case OP_MUL:
+   case OP_FMA:
+   case OP_CVT:
+   case OP_MIN:
+   case OP_MAX:
+   case OP_SET:
+      return; // left as F64
+   default:
+      break;
+   }
+
+   i->sType = TYPE_U32;
+   i->dType = TYPE_U32;
+
+   i->bb->insertAfter(i, i->clone(true)); // deep cloning
+}
+
+// Replace CONT by BRA if @bb starts with PRECONT, has exactly two incident
+// edges and the block at the other end of the back edge ends in a single
+// unpredicated CONT.  The PRECONT is removed in that case.
+// Returns true if the replacement was done.
+bool
+NVC0LegalizePostRA::tryReplaceContWithBra(BasicBlock *bb)
+{
+   if (bb->cfg.incidentCount() != 2)
+      return false;
+   if (bb->getEntry()->op != OP_PRECONT)
+      return false;
+
+   // find the back edge among the two incident edges
+   Graph::EdgeIterator edge = bb->cfg.incident();
+   if (edge.getType() != Graph::Edge::BACK) {
+      edge.next();
+      if (edge.getType() != Graph::Edge::BACK)
+         return false;
+   }
+
+   Instruction *const contInsn = BasicBlock::get(edge.getNode())->getExit();
+   if (!contInsn)
+      return false;
+   if (contInsn->op != OP_CONT || contInsn->getPredicate())
+      return false;
+
+   contInsn->op = OP_BRA;
+   bb->remove(bb->getEntry()); // delete PRECONT
+
+   // there must not be a second back edge
+   edge.next();
+   assert(edge.end() || edge.getType() != Graph::Edge::BACK);
+   return true;
+}
+
+// If @bb starts with a JOIN that is not marked with the limit flag, move
+// the join into the incoming blocks: a terminating BRA becomes a JOIN
+// (marked so it is not propagated again), a block without terminator gets a
+// new JOIN.  The JOIN at the start of @bb is removed afterwards.
+void
+NVC0LegalizePostRA::propagateJoin(BasicBlock *bb)
+{
+   Instruction *const head = bb->getEntry();
+
+   if (head->op != OP_JOIN)
+      return;
+   if (head->asFlow()->limit)
+      return;
+
+   Graph::EdgeIterator edge = bb->cfg.incident();
+   for (; !edge.end(); edge.next()) {
+      BasicBlock *const pred = BasicBlock::get(edge.getNode());
+      Instruction *const term = pred->getExit();
+
+      if (term) {
+         if (term->op == OP_BRA) {
+            term->op = OP_JOIN;
+            term->asFlow()->limit = 1; // must-not-propagate marker
+         }
+         continue;
+      }
+      pred->insertTail(new FlowInstruction(func, OP_JOIN, bb));
+      // there should always be a terminator instruction
+      WARN("inserted missing terminator in BB:%i\n", pred->getId());
+   }
+   bb->remove(bb->getEntry());
+}
+
+// Clean up one basic block after register allocation:
+//  - EMIT/RESTART: drop an unreferenced def, use r63 for an immediate src 0
+//  - remove instructions for which isNop() is true
+//  - replace zero immediates (except in MOV and PFETCH) and handle
+//    instructions with an 8 byte type
+// and finally try the CONT -> BRA replacement or, failing that, join
+// propagation.  Always returns true.
+bool
+NVC0LegalizePostRA::visit(BasicBlock *bb)
+{
+   Instruction *insn = bb->getFirst();
+
+   while (insn) {
+      // remember the successor, insn may get removed or followed by a clone
+      Instruction *const following = insn->next;
+      const bool isOut = (insn->op == OP_EMIT || insn->op == OP_RESTART);
+
+      if (isOut) {
+         if (!insn->getDef(0)->refCount())
+            insn->setDef(0, NULL);
+         if (insn->src[0].getFile() == FILE_IMMEDIATE)
+            insn->setSrc(0, r63); // initial value must be 0
+      } else
+      if (insn->isNop()) {
+         bb->remove(insn);
+      } else {
+         if (insn->op != OP_MOV && insn->op != OP_PFETCH)
+            replaceZero(insn);
+         if (typeSizeof(insn->dType) == 8)
+            split64BitOp(insn);
+      }
+      insn = following;
+   }
+
+   if (bb->getEntry() && !tryReplaceContWithBra(bb))
+      propagateJoin(bb);
+
+   return true;
+}
+
+// Lowering pass that TargetNVC0::runLegalizePass() runs for
+// CG_STAGE_PRE_SSA.  visit(Instruction *) dispatches to the handle*()
+// members, which rewrite operations into sequences the target can encode.
+class NVC0LoweringPass : public Pass
+{
+public:
+   NVC0LoweringPass(Program *);
+
+private:
+   virtual bool visit(Function *);
+   virtual bool visit(BasicBlock *);
+   virtual bool visit(Instruction *);
+
+   // per-operation handlers, the return value is passed on by visit()
+   bool handleRDSV(Instruction *);
+   bool handleWRSV(Instruction *);
+   bool handleEXPORT(Instruction *);
+   bool handleOUT(Instruction *);
+   bool handleDIV(Instruction *);
+   bool handleMOD(Instruction *);
+   bool handleSQRT(Instruction *);
+   bool handlePOW(Instruction *);
+   bool handleTEX(TexInstruction *);
+   bool handleTXD(TexInstruction *);
+   bool handleManualTXD(TexInstruction *);
+
+   // replaces a predicate that is not in FILE_PREDICATE by a real one
+   void checkPredicate(Instruction *);
+
+   // loads tessellation coordinate component c (0, 1 or 2) into dst
+   void readTessCoord(LValue *dst, int c);
+
+private:
+   const Target *const targ;
+
+   BuildUtil bld;
+
+   // value threaded through EMIT/RESTART and geometry program exports;
+   // only assigned by visit(Function *) for geometry programs
+   LValue *gpEmitAddress;
+};
+
+// Set up the pass for @prog.  gpEmitAddress starts out as NULL so that the
+// assertion in handleOUT() tests a defined value; it is only assigned by
+// visit(Function *) for geometry programs.
+NVC0LoweringPass::NVC0LoweringPass(Program *prog)
+   : targ(prog->getTarget()), gpEmitAddress(NULL)
+{
+   bld.setProgram(prog);
+}
+
+// For geometry programs, load the initial emit address (0) at the start of
+// the function's root block.  Nothing is done for other program types.
+// Always returns true.
+bool
+NVC0LoweringPass::visit(Function *fn)
+{
+   if (prog->getType() != Program::TYPE_GEOMETRY)
+      return true;
+
+   assert(!strncmp(fn->getName(), "MAIN", 4));
+   // TODO: when we generate actual functions pass this value along somehow
+   BasicBlock *const root = BasicBlock::get(fn->cfg.getRoot());
+   bld.setPosition(root, false);
+   gpEmitAddress = bld.loadImm(NULL, 0)->asLValue();
+
+   return true;
+}
+
+// Nothing to do per basic block; all work happens in visit(Instruction *).
+bool
+NVC0LoweringPass::visit(BasicBlock *bb)
+{
+   return true;
+}
+
+// Rearrange the sources of a texture instruction:
+//  - if the target is an array or the texture/sampler index is indirect,
+//    sources 0..dim-1 are shifted up by one slot and a new GPR combining
+//    array index and indirect indices (layout 0xttxsaaaa) is put in slot 0
+//  - if texel offsets are used, they are packed (4 bits per component,
+//    12 bits per offset vector) into an immediate appended as last source
+// Always returns true.
+bool
+NVC0LoweringPass::handleTEX(TexInstruction *i)
+{
+   const int dim = i->tex.target.getDim();
+   const int arg = i->tex.target.getDim() + i->tex.target.isArray();
+   const bool indirect =
+      i->tex.rIndirectSrc >= 0 || i->tex.sIndirectSrc >= 0;
+
+   // generate and move the tsc/tic/array source to the front
+   if (dim != arg || indirect) {
+      LValue *packed = new_LValue(func, FILE_GPR); // 0xttxsaaaa
+
+      Value *layer = NULL;
+      if (i->tex.target.isArray())
+         layer = i->getSrc(dim);
+
+      // make room in slot 0
+      for (int s = dim; s > 0; --s)
+         i->setSrc(s, i->getSrc(s - 1));
+      i->setSrc(0, layer);
+
+      Value *ticRel = i->getIndirectR();
+      Value *tscRel = i->getIndirectS();
+
+      if (!layer)
+         bld.loadImm(packed, 0);
+      else
+         bld.mkCvt(OP_CVT, TYPE_U16, packed, TYPE_F32, layer);
+
+      if (ticRel) {
+         i->setSrc(i->tex.rIndirectSrc, NULL);
+         bld.mkOp3(OP_INSBF, TYPE_U32, packed,
+                   ticRel, bld.mkImm(0x0917), packed);
+      }
+      if (tscRel) {
+         i->setSrc(i->tex.sIndirectSrc, NULL);
+         bld.mkOp3(OP_INSBF, TYPE_U32, packed,
+                   tscRel, bld.mkImm(0x0710), packed);
+      }
+
+      i->setSrc(0, packed);
+   }
+
+   // offset is last source (lod 1st, dc 2nd)
+   if (i->tex.useOffsets) {
+      const int last = i->srcCount(0xff);
+      uint32_t packedOffs = 0;
+
+      for (int n = 0; n < i->tex.useOffsets; ++n) {
+         for (int c = 0; c < 3; ++c) {
+            const int shift = n * 12 + c * 4;
+            packedOffs |= (i->tex.offset[n][c] & 0xf) << shift;
+         }
+      }
+      i->setSrc(last, bld.loadImm(NULL, packedOffs));
+   }
+
+   return true;
+}
+
+// Emulate TXD with four OP_TEX instructions, one per lane of a quad.
+// handleTXD() uses this when dim > 2 or the target is a shadow target.
+//
+// For each lane l, the coordinates of lane l are broadcast, dPdx/dPdy of
+// lane l are combined into them with the quadops from qOps[l], a clone of
+// the (now OP_TEX) instruction is issued with those coordinates, and its
+// results are saved by a MOV restricted to lane l.  After the loop the four
+// per-lane values of each def are merged with OP_UNION and the original
+// instruction is removed.  Always returns true.
+bool
+NVC0LoweringPass::handleManualTXD(TexInstruction *i)
+{
+   // quadop control bytes per lane: [l][0] is used with dPdx,
+   // [l][1] with dPdy
+   static const uint8_t qOps[4][2] =
+   {
+      { QUADOP(MOV2, ADD,  MOV2, ADD),  QUADOP(MOV2, MOV2, ADD,  ADD) }, // l0
+      { QUADOP(SUBR, MOV2, SUBR, MOV2), QUADOP(MOV2, MOV2, ADD,  ADD) }, // l1
+      { QUADOP(MOV2, ADD,  MOV2, ADD),  QUADOP(SUBR, SUBR, MOV2, MOV2) }, // l2
+      { QUADOP(SUBR, MOV2, SUBR, MOV2), QUADOP(SUBR, SUBR, MOV2, MOV2) }, // l3
+   };
+   Value *def[4][4]; // indexed [def component][lane]
+   Value *crd[3];    // scratch coordinates, one per dimension
+   Instruction *tex;
+   Value *zero = bld.loadImm(bld.getSSA(), 0);
+   int l, c;
+   const int dim = i->tex.target.getDim();
+
+   i->op = OP_TEX; // no need to clone dPdx/dPdy later
+
+   for (c = 0; c < dim; ++c)
+      crd[c] = bld.getScratch();
+
+   bld.mkOp(OP_QUADON, TYPE_NONE, NULL);
+   for (l = 0; l < 4; ++l) {
+      // mov coordinates from lane l to all lanes
+      for (c = 0; c < dim; ++c)
+         bld.mkQuadop(0x00, crd[c], l, i->getSrc(c), zero);
+      // add dPdx from lane l to lanes dx
+      for (c = 0; c < dim; ++c)
+         bld.mkQuadop(qOps[l][0], crd[c], l, i->dPdx[c].get(), crd[c]);
+      // add dPdy from lane l to lanes dy
+      for (c = 0; c < dim; ++c)
+         bld.mkQuadop(qOps[l][1], crd[c], l, i->dPdy[c].get(), crd[c]);
+      // texture
+      bld.insert(tex = i->clone(true));
+      for (c = 0; c < dim; ++c)
+         tex->setSrc(c, crd[c]);
+      // save results
+      for (c = 0; i->defExists(c); ++c) {
+         Instruction *mov;
+         def[c][l] = bld.getSSA();
+         mov = bld.mkMov(def[c][l], tex->getDef(c));
+         mov->fixed = 1;
+         mov->lanes = 1 << l; // only lane l keeps this result
+      }
+   }
+   bld.mkOp(OP_QUADPOP, TYPE_NONE, NULL);
+
+   // merge the four per-lane results of every def
+   for (c = 0; i->defExists(c); ++c) {
+      Instruction *u = bld.mkOp(OP_UNION, TYPE_U32, i->getDef(c));
+      for (l = 0; l < 4; ++l)
+         u->setSrc(l, def[c][l]);
+   }
+
+   i->bb->remove(i);
+   return true;
+}
+
+// Lower TXD.  After the common source handling of handleTEX(), targets with
+// more than 2 dimensions and shadow targets are handed to
+// handleManualTXD(); otherwise the derivatives are appended as sources,
+// interleaved as dPdx[0], dPdy[0], dPdx[1], dPdy[1].
+bool
+NVC0LoweringPass::handleTXD(TexInstruction *txd)
+{
+   const int dim = txd->tex.target.getDim();
+   int arg = txd->tex.target.getDim() + txd->tex.target.isArray();
+
+   handleTEX(txd);
+   if (txd->src[arg].exists())
+      ++arg; // skip the source following the coordinates
+
+   if (dim > 2 || txd->tex.target.isShadow())
+      return handleManualTXD(txd);
+
+   // at most s/t/array, x, y, offset
+   assert(arg <= 4 && !txd->src[arg].exists());
+
+   int slot = arg;
+   for (int c = 0; c < dim; ++c) {
+      txd->src[slot++].set(txd->dPdx[c]);
+      txd->src[slot++].set(txd->dPdy[c]);
+      txd->dPdx[c] = NULL;
+      txd->dPdy[c] = NULL;
+   }
+   return true;
+}
+
+// Replace a system value write by an EXPORT to the address the target
+// assigns to that system value.  Returns false if that address is 0x400 or
+// above, true after the instruction has been replaced.
+bool
+NVC0LoweringPass::handleWRSV(Instruction *i)
+{
+   // must replace, $sreg are not writeable
+   const uint32_t addr =
+      targ->getSVAddress(FILE_SHADER_OUTPUT, i->getSrc(0)->asSym());
+   if (addr >= 0x400)
+      return false;
+
+   Symbol *const outSym = bld.mkSymbol(FILE_SHADER_OUTPUT, 0, i->sType, addr);
+
+   Instruction *const exp = bld.mkStore(OP_EXPORT, i->dType, outSym,
+                                        i->getIndirect(0, 0), i->getSrc(1));
+   exp->perPatch = i->perPatch;
+
+   bld.getBB()->remove(i);
+   return true;
+}
+
+// Load component @c of the tessellation coordinate into @dst.
+//
+// Components 0 and 1 are fetched from offsets 0x2f0 and 0x2f4, addressed
+// with the lane id.  Component 2 is computed as 1.0 - (x + y) from the
+// other two, which are fetched into temporaries.
+void
+NVC0LoweringPass::readTessCoord(LValue *dst, int c)
+{
+   Value *laneid = bld.getSSA();
+   Value *x, *y;
+
+   bld.mkOp1(OP_RDSV, TYPE_U32, laneid, bld.mkSysVal(SV_LANEID, 0));
+
+   if (c == 0) {
+      x = dst;
+      y = NULL;
+   } else
+   if (c == 1) {
+      x = NULL;
+      y = dst;
+   } else {
+      assert(c == 2);
+      x = bld.getSSA();
+      y = bld.getSSA();
+   }
+   if (x)
+      bld.mkFetch(x, TYPE_F32, FILE_SHADER_OUTPUT, 0x2f0, NULL, laneid);
+   // The second fetch must write y: using x here would pass NULL as the
+   // destination for c == 1 and leave y without a value for c == 2.
+   if (y)
+      bld.mkFetch(y, TYPE_F32, FILE_SHADER_OUTPUT, 0x2f4, NULL, laneid);
+
+   if (c == 2) {
+      bld.mkOp2(OP_ADD, TYPE_F32, dst, x, y);
+      bld.mkOp2(OP_SUB, TYPE_F32, dst, bld.loadImm(NULL, 1.0f), dst);
+   }
+}
+
+// Lower a system value read.  If the target maps the system value to an
+// address of 0x400 or above, the instruction is kept (it will become a mov
+// from $sreg).  Otherwise it is replaced by
+//  - a linear interpolation for SV_POSITION,
+//  - readTessCoord() for SV_TESS_COORD,
+//  - a fetch from the shader inputs for everything else.
+// Always returns true.
+bool
+NVC0LoweringPass::handleRDSV(Instruction *i)
+{
+   Symbol *const svSym = i->getSrc(0)->asSym();
+   const uint32_t addr = targ->getSVAddress(FILE_SHADER_INPUT, svSym);
+
+   if (addr >= 0x400) // mov $sreg
+      return true;
+
+   if (i->getSrc(0)->reg.data.sv.sv == SV_POSITION) {
+      assert(prog->getType() == Program::TYPE_FRAGMENT);
+      Instruction *const interp = new_Instruction(func, OP_LINTERP, TYPE_F32);
+      interp->setDef(0, i->getDef(0));
+      interp->setSrc(0, bld.mkSymbol(FILE_SHADER_INPUT, 0, TYPE_F32, addr));
+      interp->setInterpolate(NV50_IR_INTERP_LINEAR);
+      bld.getBB()->insertAfter(i, interp);
+   } else
+   if (i->getSrc(0)->reg.data.sv.sv == SV_TESS_COORD) {
+      assert(prog->getType() == Program::TYPE_TESSELLATION_EVAL);
+      readTessCoord(i->getDef(0)->asLValue(), i->getSrc(0)->reg.data.sv.index);
+   } else {
+      Value *vtx = NULL;
+      // tessellation evaluation programs pass a PFETCH result to the fetch
+      if (prog->getType() == Program::TYPE_TESSELLATION_EVAL)
+         vtx = bld.mkOp1v(OP_PFETCH, TYPE_U32, bld.getSSA(), bld.mkImm(0));
+      Instruction *const fetch =
+         bld.mkFetch(i->getDef(0), i->dType,
+                     FILE_SHADER_INPUT, addr, i->getIndirect(0, 0), vtx);
+      fetch->perPatch = i->perPatch;
+   }
+
+   bld.getBB()->remove(i);
+   return true;
+}
+
+// Turn a floating point division a / b into a * RCP(b).  Non-float
+// divisions are left alone here.  Always returns true.
+bool
+NVC0LoweringPass::handleDIV(Instruction *i)
+{
+   if (isFloatType(i->dType)) {
+      Instruction *const recip =
+         bld.mkOp1(OP_RCP, i->dType, bld.getSSA(), i->getSrc(1));
+
+      i->op = OP_MUL;
+      i->setSrc(1, recip->getDef(0));
+   }
+   return true;
+}
+
+// Lower an F32 MOD a % b to a - b * TRUNC(a * RCP(b)).  Other types are
+// left alone here.  Always returns true.
+bool
+NVC0LoweringPass::handleMOD(Instruction *i)
+{
+   if (i->dType == TYPE_F32) {
+      LValue *const tmp = bld.getScratch();
+
+      bld.mkOp1(OP_RCP, TYPE_F32, tmp, i->getSrc(1));           // 1 / b
+      bld.mkOp2(OP_MUL, TYPE_F32, tmp, i->getSrc(0), tmp);      // a / b
+      bld.mkOp1(OP_TRUNC, TYPE_F32, tmp, tmp);                  // trunc(a / b)
+      bld.mkOp2(OP_MUL, TYPE_F32, tmp, i->getSrc(1), tmp);      // b * trunc
+
+      i->op = OP_SUB;
+      i->setSrc(1, tmp);
+   }
+   return true;
+}
+
+// Lower SQRT(x) to x * RSQ(x).  Always returns true.
+// NOTE(review): the result for x == 0 depends on what RSQ and MUL produce
+// for that input - confirm it is acceptable.
+bool
+NVC0LoweringPass::handleSQRT(Instruction *i)
+{
+   Instruction *const rsqrt =
+      bld.mkOp1(OP_RSQ, TYPE_F32, bld.getSSA(), i->getSrc(0));
+
+   i->op = OP_MUL;
+   i->setSrc(1, rsqrt->getDef(0));
+
+   return true;
+}
+
+// Lower POW(x, y) to EX2(PREEX2(y * LG2(x))), with the dnz flag set on the
+// multiplication.  Always returns true.
+bool
+NVC0LoweringPass::handlePOW(Instruction *i)
+{
+   LValue *const tmp = bld.getScratch();
+
+   bld.mkOp1(OP_LG2, TYPE_F32, tmp, i->getSrc(0));
+
+   Instruction *const mul = bld.mkOp2(OP_MUL, TYPE_F32, tmp, i->getSrc(1), tmp);
+   mul->dnz = 1;
+
+   bld.mkOp1(OP_PREEX2, TYPE_F32, tmp, tmp);
+
+   // reuse the original instruction as the final EX2
+   i->op = OP_EX2;
+   i->setSrc(0, tmp);
+   i->setSrc(1, NULL);
+
+   return true;
+}
+
+// Lower EXPORT depending on the program type:
+//  - fragment programs: turned into a MOV to the GPR whose id is the export
+//    offset divided by 4 (indirect exports fail and return false)
+//  - geometry programs: gpEmitAddress is attached as second indirect address
+// Other program types are not modified.
+bool
+NVC0LoweringPass::handleEXPORT(Instruction *i)
+{
+   switch (prog->getType()) {
+   case Program::TYPE_FRAGMENT:
+   {
+      const int id = i->getSrc(0)->reg.data.offset / 4;
+
+      if (i->src[0].isIndirect(0)) // TODO, ugly
+         return false;
+
+      i->op = OP_MOV;
+      i->src[0].set(i->src[1]);
+      i->setSrc(1, NULL);
+
+      LValue *const out = new_LValue(func, FILE_GPR);
+      i->setDef(0, out);
+      i->getDef(0)->reg.data.id = id;
+
+      prog->maxGPR = MAX2(prog->maxGPR, id);
+      break;
+   }
+   case Program::TYPE_GEOMETRY:
+      i->setIndirect(0, 1, gpEmitAddress);
+      break;
+   default:
+      break;
+   }
+   return true;
+}
+
+// Lower EMIT / RESTART.  A RESTART directly following an EMIT is folded
+// into that EMIT (via NV50_IR_SUBOP_EMIT_RESTART) and deleted.  Otherwise
+// gpEmitAddress becomes both def 0 and source 0, and an existing source 0
+// is moved to slot 1.  Always returns true.
+bool
+NVC0LoweringPass::handleOUT(Instruction *i)
+{
+   const bool foldIntoEmit =
+      i->op == OP_RESTART && i->prev && i->prev->op == OP_EMIT;
+
+   if (foldIntoEmit) {
+      i->prev->subOp = NV50_IR_SUBOP_EMIT_RESTART;
+      delete_Instruction(prog, i);
+      return true;
+   }
+
+   assert(gpEmitAddress);
+   i->setDef(0, gpEmitAddress);
+   if (i->srcExists(0))
+      i->setSrc(1, i->getSrc(0));
+   i->setSrc(0, gpEmitAddress);
+
+   return true;
+}
+
+// If @insn is predicated by a value that does not live in FILE_PREDICATE
+// (e.g. an f32 value), compare that value against 0 into a new predicate
+// and use the new predicate instead.
+void
+NVC0LoweringPass::checkPredicate(Instruction *insn)
+{
+   Value *const oldPred = insn->getPredicate();
+
+   if (!oldPred)
+      return;
+   if (oldPred->reg.file == FILE_PREDICATE)
+      return;
+
+   Value *const newPred = new_LValue(func, FILE_PREDICATE);
+
+   // CAUTION: don't use pdst->getInsn, the definition might not be unique,
+   //  delay turning PSET(FSET(x,y),0) into PSET(x,y) to a later pass
+
+   bld.mkCmp(OP_SET, CC_NEU, TYPE_U32, newPred, bld.mkImm(0), oldPred);
+
+   insn->setPredicate(insn->cc, newPred);
+}
+
+//
+// Per-instruction entry point of the lowering pass:
+// - add quadop dance for texturing
+// - put FP outputs in GPRs
+// - convert instruction sequences
+//
+// The builder is positioned next to the instruction first, then the
+// predicate is checked and the operation is dispatched to its handler.
+// The handler's result is returned; operations without handler yield true.
+//
+bool
+NVC0LoweringPass::visit(Instruction *i)
+{
+   // new instructions go after the predecessor if there is one, else
+   // before the successor, else at the tail of the block
+   if (i->prev) {
+      bld.setPosition(i->prev, true);
+   } else
+   if (i->next) {
+      bld.setPosition(i->next, false);
+   } else {
+      bld.setPosition(i->bb, true);
+   }
+
+   if (i->cc != CC_ALWAYS)
+      checkPredicate(i);
+
+   switch (i->op) {
+   case OP_TEX:
+   case OP_TXB:
+   case OP_TXL:
+   case OP_TXF:
+   case OP_TXQ:
+   case OP_TXG:
+      return handleTEX(i->asTex());
+   case OP_TXD:
+      return handleTXD(i->asTex());
+   case OP_POW:
+      return handlePOW(i);
+   case OP_DIV:
+      return handleDIV(i);
+   case OP_MOD:
+      return handleMOD(i);
+   case OP_SQRT:
+      return handleSQRT(i);
+   case OP_EXPORT:
+      return handleEXPORT(i);
+   case OP_EMIT:
+   case OP_RESTART:
+      return handleOUT(i);
+   case OP_RDSV:
+      return handleRDSV(i);
+   case OP_WRSV:
+      return handleWRSV(i);
+   case OP_EX2:
+   {
+      // EX2 operates on the result of PREEX2
+      Value *const dst = i->getDef(0);
+      bld.mkOp1(OP_PREEX2, TYPE_F32, dst, i->getSrc(0));
+      i->setSrc(0, dst);
+      break;
+   }
+   case OP_LOAD:
+      // loads from shader inputs become VFETCH
+      if (i->src[0].getFile() == FILE_SHADER_INPUT) {
+         assert(prog->getType() != Program::TYPE_FRAGMENT);
+         i->op = OP_VFETCH;
+      }
+      break;
+   case OP_PINTERP:
+   {
+      // inputs at offsets [0x280, 0x2c0) get the SC interpolation flag
+      const uint32_t offset = i->getSrc(0)->reg.data.offset;
+      if (offset >= 0x280 && offset < 0x2c0)
+         i->setInterpolate(i->getSampleMode() | NV50_IR_INTERP_SC);
+      break;
+   }
+   case OP_LINTERP:
+      // the input at offset 0x3fc is post-processed: (v << 31) ^ 0xbf800000
+      if (i->getSrc(0)->reg.data.offset == 0x3fc) {
+         Value *const face = i->getDef(0);
+         bld.setPosition(i, true);
+         bld.mkOp2(OP_SHL, TYPE_U32, face, face, bld.mkImm(31));
+         bld.mkOp2(OP_XOR, TYPE_U32, face, face, bld.mkImm(0xbf800000));
+      }
+      break;
+   default:
+      break;
+   }
+   return true;
+}
+
+// Run the legalization pass belonging to code generation stage @stage on
+// @prog and return its result.  Stages without a pass return false.
+bool
+TargetNVC0::runLegalizePass(Program *prog, CGStage stage) const
+{
+   switch (stage) {
+   case CG_STAGE_PRE_SSA:
+   {
+      NVC0LoweringPass lowering(prog);
+      return lowering.run(prog, false, true);
+   }
+   case CG_STAGE_POST_RA:
+   {
+      NVC0LegalizePostRA postRA;
+      return postRA.run(prog, false, true);
+   }
+   case CG_STAGE_SSA:
+   {
+      NVC0LegalizeSSA ssa;
+      return ssa.run(prog, false, true);
+   }
+   default:
+      return false;
+   }
+}
+
+} // namespace nv50_ir
diff --git a/src/gallium/drivers/nvc0/codegen/nv50_ir_target_nvc0.cpp b/src/gallium/drivers/nvc0/codegen/nv50_ir_target_nvc0.cpp
new file mode 100644
index 00000000000..60b2016878e
--- /dev/null
+++ b/src/gallium/drivers/nvc0/codegen/nv50_ir_target_nvc0.cpp
@@ -0,0 +1,568 @@
+
+#include "nv50_ir_target_nvc0.h"
+
+namespace nv50_ir {
+
+// Factory: allocates a TargetNVC0 for the given chipset with `new` and hands
+// it back through the generic Target interface.
+Target *getTargetNVC0(unsigned int chipset)
+{
+   Target *const target = new TargetNVC0(chipset);
+   return target;
+}
+
+// Records the chipset id, then fills in the per-operation info table via
+// initOpInfo().
+TargetNVC0::TargetNVC0(unsigned int card)
+{
+   this->chipset = card;
+   this->initOpInfo();
+}
+
+// BUILTINS / LIBRARY FUNCTIONS:
+
+// laziness -> will just hardcode everything for the time being
+
+// Will probably make this nicer once we support subroutines properly,
+// i.e. when we have an input IR that provides function declarations.
+
+// Pre-assembled machine code of the builtin library routines, stored as raw
+// 32-bit words. A row with two words is one 8-byte instruction, a row with a
+// single word is a 4-byte instruction.
+//
+// The routines are laid out back to back in the order DIV U32, DIV S32,
+// RCP F64, RSQ F64; nvc0_builtin_offsets holds the byte offset of each one and
+// has to be kept in sync with the sizes stated below.
+static const uint32_t nvc0_builtin_code[] =
+{
+// DIV U32: slow unsigned integer division
+//
+// UNR recurrence (q = a / b):
+// look for z such that 2^32 - b <= b * z < 2^32
+// then q - 1 <= (a * z) / 2^32 <= q
+//
+// INPUT:   $r0: dividend, $r1: divisor
+// OUTPUT:  $r0: result, $r1: modulus
+// CLOBBER: $r2 - $r3, $p0 - $p1
+// SIZE:    22 / 14 * 8 bytes
+//          (22 * 8 for the variant selected by "#if 1", 14 * 8 for the
+//          "#else" variant; nvc0_builtin_offsets assumes 22 * 8)
+//
+#if 1
+   0x04009c03, 0x78000000,
+   0x7c209cdd,
+   0x0010dd18,
+   0x08309c03, 0x60000000,
+   0x05605c18,
+   0x0810dc2a,
+   0x0c209c43, 0x20040000,
+   0x0810dc03, 0x50000000,
+   0x0c209c43, 0x20040000,
+   0x0810dc03, 0x50000000,
+   0x0c209c43, 0x20040000,
+   0x0810dc03, 0x50000000,
+   0x0c209c43, 0x20040000,
+   0x0810dc03, 0x50000000,
+   0x0c209c43, 0x20040000,
+   0x0000dde4, 0x28000000,
+   0x08001c43, 0x50000000,
+   0x05609c18,
+   0x0010430d,
+   0x0811dc03, 0x1b0e0000,
+   0x08104103, 0x48000000,
+   0x04000002, 0x08000000,
+   0x0811c003, 0x1b0e0000,
+   0x08104103, 0x48000000,
+   0x040000ac,
+   0x90001dff,
+#else
+   0x0401dc03, 0x1b0e0000,
+   0x00008003, 0x78000000,
+   0x0400c003, 0x78000000,
+   0x0c20c103, 0x48000000,
+   0x0c108003, 0x60000000,
+   0x00005c28,
+   0x00001d18,
+   0x0031c023, 0x1b0ec000,
+   0xb000a1e7, 0x40000000,
+   0x04000003, 0x6000c000,
+   0x0813dc03, 0x1b000000,
+   0x0420446c,
+   0x040004bd,
+   0x04208003, 0x5800c000,
+   0x0430c103, 0x4800c000,
+   0x0ffc5dff,
+   0x90001dff,
+#endif
+
+// DIV S32: slow signed integer division
+//
+// INPUT:   $r0: dividend, $r1: divisor
+// OUTPUT:  $r0: result, $r1: modulus
+// CLOBBER: $r2 - $r3, $p0 - $p3
+// SIZE:    18 * 8 bytes
+//
+   0xfc05dc23, 0x188e0000,
+   0xfc17dc23, 0x18c40000,
+   0x03301e18,
+   0x07305e18,
+   0x0401dc03, 0x1b0e0000,
+   0x00008003, 0x78000000,
+   0x0400c003, 0x78000000,
+   0x0c20c103, 0x48000000,
+   0x0c108003, 0x60000000,
+   0x00005c28,
+   0x00001d18,
+   0x0031c023, 0x1b0ec000,
+   0xb000a1e7, 0x40000000,
+   0x04000003, 0x6000c000,
+   0x0813dc03, 0x1b000000,
+   0x0420446c,
+   0x040004bd,
+   0x04208003, 0x5800c000,
+   0x0430c103, 0x4800c000,
+   0x0ffc5dff,
+   0x01700e18,
+   0x05704a18,
+   0x90001dff,
+
+// RCP F64: Newton Raphson reciprocal(x): r_{i+1} = r_i * (2.0 - x * r_i)
+//
+// INPUT:   $r0d (x)
+// OUTPUT:  $r0d (rcp(x))
+// CLOBBER: $r2 - $r7
+// SIZE:    9 * 8 bytes
+//
+   0x9810dc08,
+   0x00009c28,
+   0x4001df18,
+   0x00019d18,
+   0x08011e01, 0x200c0000,
+   0x10209c01, 0x50000000,
+   0x08011e01, 0x200c0000,
+   0x10209c01, 0x50000000,
+   0x08011e01, 0x200c0000,
+   0x10201c01, 0x50000000,
+   0x00001de7, 0x90000000,
+
+// RSQ F64: Newton Raphson rsqrt(x): r_{i+1} = r_i * (1.5 - 0.5 * x * r_i * r_i)
+//
+// INPUT:   $r0d (x)
+// OUTPUT:  $r0d (rsqrt(x))
+// CLOBBER: $r2 - $r7
+// SIZE:    14 * 8 bytes
+//
+   0x9c10dc08,
+   0x00009c28,
+   0x00019d18,
+   0x3fe1df18,
+   0x18001c01, 0x50000000,
+   0x0001dde2, 0x18ffe000,
+   0x08211c01, 0x50000000,
+   0x10011e01, 0x200c0000,
+   0x10209c01, 0x50000000,
+   0x08211c01, 0x50000000,
+   0x10011e01, 0x200c0000,
+   0x10209c01, 0x50000000,
+   0x08211c01, 0x50000000,
+   0x10011e01, 0x200c0000,
+   0x10201c01, 0x50000000,
+   0x00001de7, 0x90000000,
+};
+
+// Byte offset of each builtin routine inside nvc0_builtin_code, indexed by
+// its NVC0_BUILTIN_* id. Every entry is the summed size of the routines stored
+// in front of it (22, 18 and 9 units of 8 bytes respectively).
+static const uint16_t nvc0_builtin_offsets[NVC0_BUILTIN_COUNT] =
+{
+   8 * 0,                    // DIV U32: first routine
+   8 * 22,                   // DIV S32: after DIV U32
+   8 * 22 + 8 * 18,          // RCP F64: after DIV S32
+   8 * 22 + 8 * 18 + 8 * 9   // RSQ F64: after RCP F64
+};
+
+// Hands out the hardcoded builtin library.
+//
+// code: receives a pointer to the first word of nvc0_builtin_code
+// size: receives the size of the whole array in bytes
+void
+TargetNVC0::getBuiltinCode(const uint32_t **code, uint32_t *size) const
+{
+   *size = sizeof(nvc0_builtin_code);
+   *code = nvc0_builtin_code;
+}
+
+// Returns the byte offset of builtin routine `builtin` inside the code handed
+// out by getBuiltinCode().
+//
+// builtin: index into nvc0_builtin_offsets, must be in
+//          [0, NVC0_BUILTIN_COUNT)
+uint32_t
+TargetNVC0::getBuiltinOffset(int builtin) const
+{
+   // the id is signed, so negative values have to be rejected as well
+   assert(builtin >= 0 && builtin < NVC0_BUILTIN_COUNT);
+   return nvc0_builtin_offsets[builtin];
+}
+
+// One row of the _initProps table: the source modifiers, source files and
+// result saturation an operation accepts on top of the defaults.
+// In mNeg, mAbs, mNot, fConst and fImmd, bit s refers to source operand s
+// (initOpInfo evaluates bits 0..2).
+struct opProperties
+{
+   operation op;
+   unsigned int mNeg   : 4; // sources that accept NV50_IR_MOD_NEG
+   unsigned int mAbs   : 4; // sources that accept NV50_IR_MOD_ABS
+   unsigned int mNot   : 4; // sources that accept NV50_IR_MOD_NOT
+   unsigned int mSat   : 4; // 0x8: NV50_IR_MOD_SAT allowed on the destination
+   unsigned int fConst : 3; // sources that may come from FILE_MEMORY_CONST
+   unsigned int fImmd  : 4; // last bit indicates if full immediate is supported
+};
+
+// Per-operation exceptions to the defaults set up by initOpInfo().
+// Columns follow the field order of struct opProperties:
+//   neg/abs/not: bit s set -> source s accepts that modifier
+//   sat:         0x8 -> result may be saturated
+//   c[]:         bit s set -> source s may be a constant buffer operand
+//   imm:         bit s set -> source s may be an immediate;
+//                0x8 additionally marks full 32 bit immediate support
+// Operations that are not listed keep the defaults.
+static const struct opProperties _initProps[] =
+{
+   //           neg  abs  not  sat  c[]  imm
+   { OP_ADD,    0x3, 0x3, 0x0, 0x8, 0x2, 0x2 | 0x8 },
+   { OP_SUB,    0x3, 0x3, 0x0, 0x0, 0x2, 0x2 | 0x8 },
+   { OP_MUL,    0x3, 0x0, 0x0, 0x8, 0x2, 0x2 | 0x8 },
+   { OP_MAX,    0x3, 0x3, 0x0, 0x0, 0x2, 0x2 },
+   { OP_MIN,    0x3, 0x3, 0x0, 0x0, 0x2, 0x2 },
+   { OP_MAD,    0x7, 0x0, 0x0, 0x8, 0x6, 0x2 | 0x8 }, // special c[] constraint
+   { OP_ABS,    0x0, 0x0, 0x0, 0x0, 0x1, 0x0 },
+   { OP_NEG,    0x0, 0x1, 0x0, 0x0, 0x1, 0x0 },
+   { OP_CVT,    0x1, 0x1, 0x0, 0x8, 0x1, 0x0 },
+   { OP_AND,    0x0, 0x0, 0x3, 0x0, 0x2, 0x2 | 0x8 },
+   { OP_OR,     0x0, 0x0, 0x3, 0x0, 0x2, 0x2 | 0x8 },
+   { OP_XOR,    0x0, 0x0, 0x3, 0x0, 0x2, 0x2 | 0x8 },
+   { OP_SHL,    0x0, 0x0, 0x0, 0x0, 0x2, 0x2 },
+   { OP_SHR,    0x0, 0x0, 0x0, 0x0, 0x2, 0x2 },
+   { OP_SET,    0x3, 0x3, 0x0, 0x0, 0x2, 0x2 },
+   { OP_SLCT,   0x4, 0x0, 0x0, 0x0, 0x6, 0x2 }, // special c[] constraint
+   { OP_PREEX2, 0x1, 0x1, 0x0, 0x0, 0x1, 0x1 },
+   { OP_PRESIN, 0x1, 0x1, 0x0, 0x0, 0x1, 0x1 },
+   { OP_COS,    0x1, 0x1, 0x0, 0x8, 0x0, 0x0 },
+   { OP_SIN,    0x1, 0x1, 0x0, 0x8, 0x0, 0x0 },
+   { OP_EX2,    0x1, 0x1, 0x0, 0x8, 0x0, 0x0 },
+   { OP_LG2,    0x1, 0x1, 0x0, 0x8, 0x0, 0x0 },
+   { OP_RCP,    0x1, 0x1, 0x0, 0x8, 0x0, 0x0 },
+   { OP_RSQ,    0x1, 0x1, 0x0, 0x8, 0x0, 0x0 },
+   { OP_DFDX,   0x1, 0x0, 0x0, 0x0, 0x0, 0x0 },
+   { OP_DFDY,   0x1, 0x0, 0x0, 0x0, 0x0, 0x0 },
+   { OP_CALL,   0x0, 0x0, 0x0, 0x0, 0x1, 0x0 },
+   { OP_INSBF,  0x0, 0x0, 0x0, 0x0, 0x0, 0x4 },
+   { OP_SET_AND, 0x3, 0x3, 0x0, 0x0, 0x2, 0x2 },
+   { OP_SET_OR, 0x3, 0x3, 0x0, 0x0, 0x2, 0x2 },
+   { OP_SET_XOR, 0x3, 0x3, 0x0, 0x0, 0x2, 0x2 },
+   // saturate only:
+   { OP_LINTERP, 0x0, 0x0, 0x0, 0x8, 0x0, 0x0 },
+   { OP_PINTERP, 0x0, 0x0, 0x0, 0x8, 0x0, 0x0 },
+};
+
+// Fills in the target description tables:
+//  - nativeFileMap: identity mapping, except that FILE_ADDRESS maps to FILE_GPR
+//  - opInfo: NVC0 defaults for every operation (F32 types, GPR-only operands,
+//    no modifiers, has a destination), followed by the exceptions listed in
+//    noDest and _initProps
+void TargetNVC0::initOpInfo()
+{
+   unsigned int i, j;
+
+   // one bit per operation, bit (op % 32) of word (op / 32)
+   static const uint32_t commutative[(OP_LAST + 31) / 32] =
+   {
+      // ADD, MAD, MUL, AND, OR, XOR, MAX, MIN
+      0x0670ca00, 0x0000003f, 0x00000000
+   };
+
+   // NOTE(review): word 0 is identical to commutative[0] although the list of
+   // operations in the comment differs from the commutative one - confirm that
+   // the mask really matches the comment.
+   static const uint32_t shortForm[(OP_LAST + 31) / 32] =
+   {
+      // ADD, MAD, MUL, AND, OR, XOR, PRESIN, PREEX2, SFN, CVT, PINTERP, MOV
+      0x0670ca00, 0x00000000, 0x00000000
+   };
+
+   static const operation noDest[] =
+   {
+      OP_STORE, OP_WRSV, OP_EXPORT, OP_BRA, OP_CALL, OP_RET, OP_EXIT,
+      OP_DISCARD, OP_CONT, OP_BREAK, OP_PRECONT, OP_PREBREAK, OP_PRERET,
+      OP_JOIN, OP_JOINAT, OP_BRKPT, OP_MEMBAR, OP_EMIT, OP_RESTART,
+      OP_QUADON, OP_QUADPOP
+   };
+
+   joinAnterior = false;
+
+   for (i = 0; i < DATA_FILE_COUNT; ++i)
+      nativeFileMap[i] = (DataFile)i;
+   nativeFileMap[FILE_ADDRESS] = FILE_GPR;
+
+   // defaults for all operations
+   for (i = 0; i < OP_LAST; ++i) {
+      opInfo[i].variants = NULL;
+      opInfo[i].op = (operation)i;
+      opInfo[i].srcTypes = 1 << (int)TYPE_F32;
+      opInfo[i].dstTypes = 1 << (int)TYPE_F32;
+      opInfo[i].immdBits = 0;
+      opInfo[i].srcNr = operationSrcNr[i];
+
+      for (j = 0; j < opInfo[i].srcNr; ++j) {
+         opInfo[i].srcMods[j] = 0;
+         opInfo[i].srcFiles[j] = 1 << (int)FILE_GPR;
+      }
+      opInfo[i].dstMods = 0;
+      opInfo[i].dstFiles = 1 << (int)FILE_GPR;
+
+      opInfo[i].hasDest = 1;
+      opInfo[i].vector = (i >= OP_TEX && i <= OP_TEXCSAA);
+      opInfo[i].commutative = (commutative[i / 32] >> (i % 32)) & 1;
+      opInfo[i].pseudo = (i < OP_MOV);
+      opInfo[i].predicate = !opInfo[i].pseudo;
+      opInfo[i].flow = (i >= OP_BRA && i <= OP_JOIN);
+      // Test the bit by shifting the unsigned mask word down, like for
+      // commutative above; (1 << 31) on a signed int must be avoided.
+      opInfo[i].minEncSize = ((shortForm[i / 32] >> (i % 32)) & 1) ? 4 : 8;
+   }
+   for (i = 0; i < sizeof(noDest) / sizeof(noDest[0]); ++i)
+      opInfo[noDest[i]].hasDest = 0;
+
+   // per-operation exceptions
+   for (i = 0; i < sizeof(_initProps) / sizeof(_initProps[0]); ++i) {
+      const struct opProperties *prop = &_initProps[i];
+
+      for (int s = 0; s < 3; ++s) {
+         if (prop->mNeg & (1 << s))
+            opInfo[prop->op].srcMods[s] |= NV50_IR_MOD_NEG;
+         if (prop->mAbs & (1 << s))
+            opInfo[prop->op].srcMods[s] |= NV50_IR_MOD_ABS;
+         if (prop->mNot & (1 << s))
+            opInfo[prop->op].srcMods[s] |= NV50_IR_MOD_NOT;
+         if (prop->fConst & (1 << s))
+            opInfo[prop->op].srcFiles[s] |= 1 << (int)FILE_MEMORY_CONST;
+         if (prop->fImmd & (1 << s))
+            opInfo[prop->op].srcFiles[s] |= 1 << (int)FILE_IMMEDIATE;
+      }
+      // bit 3 of fImmd is a per-operation flag (full 32 bit immediates), so
+      // it only needs to be looked at once per table row
+      if (prop->fImmd & 8)
+         opInfo[prop->op].immdBits = 0xffffffff;
+      if (prop->mSat & 8)
+         opInfo[prop->op].dstMods = NV50_IR_MOD_SAT;
+   }
+}
+
+// Returns the size limit this target reports for the given register/memory
+// file. Files without a size of their own (NULL, ADDRESS, IMMEDIATE) yield 0.
+// An unknown file triggers an assertion and yields 0 as well.
+unsigned int
+TargetNVC0::getFileSize(DataFile file) const
+{
+   switch (file) {
+   case FILE_NULL:
+   case FILE_ADDRESS:
+   case FILE_IMMEDIATE:
+      return 0;
+   case FILE_GPR:
+      return 63;
+   case FILE_PREDICATE:
+      return 7;
+   case FILE_FLAGS:
+      return 1;
+   case FILE_MEMORY_CONST:
+      return 65536;
+   case FILE_SHADER_INPUT:
+   case FILE_SHADER_OUTPUT:
+      return 0x400;
+   case FILE_MEMORY_GLOBAL:
+      return 0xffffffff;
+   case FILE_MEMORY_SHARED:
+      return 16 << 10;
+   case FILE_MEMORY_LOCAL:
+      return 48 << 10;
+   case FILE_SYSTEM_VALUE:
+      return 32;
+   default:
+      break;
+   }
+   assert(!"invalid file");
+   return 0;
+}
+
+// Returns 2 for the GPR, ADDRESS and SYSTEM_VALUE files and 0 for every other
+// file.
+// NOTE(review): presumably the log2 of the allocation unit in bytes - confirm
+// against the callers.
+unsigned int
+TargetNVC0::getFileUnit(DataFile file) const
+{
+   switch (file) {
+   case FILE_GPR:
+   case FILE_ADDRESS:
+   case FILE_SYSTEM_VALUE:
+      return 2;
+   default:
+      return 0;
+   }
+}
+
+// Maps a system value symbol to its address.
+//
+// shaderFile: only consulted for SV_PRIMITIVE_ID, which has one address when
+//             accessed through FILE_SHADER_INPUT and another one otherwise
+// sym:        symbol whose reg.data.sv holds the semantic and its index;
+//             indexed semantics advance by 4 per index
+//
+// Returns 0xffffffff for semantics that have no address here.
+uint32_t
+TargetNVC0::getSVAddress(DataFile shaderFile, const Symbol *sym) const
+{
+   const SVSemantic semantic = sym->reg.data.sv.sv;
+   const int index = sym->reg.data.sv.index;
+
+   switch (semantic) {
+   case SV_TESS_FACTOR:
+      return 0x000 + index * 4;
+   case SV_PRIMITIVE_ID:
+      if (shaderFile == FILE_SHADER_INPUT)
+         return 0x060;
+      return 0x040;
+   case SV_LAYER:
+      return 0x064;
+   case SV_VIEWPORT_INDEX:
+      return 0x068;
+   case SV_POINT_SIZE:
+      return 0x06c;
+   case SV_POSITION:
+      return 0x070 + index * 4;
+   case SV_CLIP_DISTANCE:
+      return 0x2c0 + index * 4;
+   case SV_POINT_COORD:
+      return 0x2e0 + index * 4;
+   case SV_TESS_COORD:
+      return 0x2f0 + index * 4;
+   case SV_INSTANCE_ID:
+      return 0x2f8;
+   case SV_VERTEX_ID:
+      return 0x2fc;
+   case SV_FACE:
+      return 0x3fc;
+   default:
+      return 0xffffffff;
+   }
+}
+
+// Decides whether the operand read by `ld` can be used directly as source `s`
+// of instruction `i`.
+//
+// i:  instruction that would consume the operand
+// s:  index of the source of `i` in question
+// ld: instruction whose source 0 is the candidate operand
+//
+// Returns true if the operand's file is allowed for that source, `i` has no
+// other non-register source (apart from zero immediates) and, for immediates,
+// the value fits the encoding.
+bool
+TargetNVC0::insnCanLoad(const Instruction *i, int s,
+                        const Instruction *ld) const
+{
+   DataFile sf = ld->src[0].getFile();
+
+   // immediate 0 can be represented by GPR $r63
+   if (sf == FILE_IMMEDIATE && ld->getSrc(0)->reg.data.u64 == 0)
+      return (!i->asTex() && i->op != OP_EXPORT && i->op != OP_STORE);
+
+   // Valid source indices are 0 .. srcNr - 1. s == srcNr has to be rejected
+   // too: initOpInfo() never sets up srcFiles[] for that index.
+   if (s < 0 || s >= opInfo[i->op].srcNr)
+      return false;
+   if (!(opInfo[i->op].srcFiles[s] & (1 << (int)sf)))
+      return false;
+
+   // indirect loads can only be done by OP_LOAD/VFETCH/INTERP on nvc0
+   if (ld->src[0].isIndirect(0))
+      return false;
+
+   // all existing sources of i must be GPRs, predicates or zero immediates
+   for (int k = 0; i->srcExists(k); ++k) {
+      if (i->src[k].getFile() == FILE_IMMEDIATE) {
+         if (i->getSrc(k)->reg.data.u64 != 0)
+            return false;
+      } else
+      if (i->src[k].getFile() != FILE_GPR &&
+          i->src[k].getFile() != FILE_PREDICATE) {
+         return false;
+      }
+   }
+
+   // not all instructions support full 32 bit immediates
+   if (sf == FILE_IMMEDIATE) {
+      const Storage &reg = ld->getSrc(0)->asImm()->reg;
+
+      if (opInfo[i->op].immdBits != 0xffffffff) {
+         if (i->sType == TYPE_F32) {
+            // the low 12 bits of the float cannot be encoded
+            if (reg.data.u32 & 0xfff)
+               return false;
+         } else
+         if (i->sType == TYPE_S32 || i->sType == TYPE_U32) {
+            // with u32, 0xfffff counts as 0xffffffff as well
+            if (reg.data.s32 > 0x7ffff || reg.data.s32 < -0x80000)
+               return false;
+         }
+      } else
+      if (i->op == OP_MAD || i->op == OP_FMA) {
+         // requires src == dst, cannot decide before RA
+         // (except if we implement more constraints)
+         if (reg.data.u32 & 0xfff)
+            return false;
+      }
+   }
+
+   return true;
+}
+
+// Tells whether operation `op` with data type `ty` is supported:
+//  - MAD and FMA only for F32
+//  - SAD only for S32
+//  - POW, SQRT, DIV and MOD never
+//  - everything else always
+bool
+TargetNVC0::isOpSupported(operation op, DataType ty) const
+{
+   switch (op) {
+   case OP_MAD:
+   case OP_FMA:
+      return ty == TYPE_F32;
+   case OP_SAD:
+      return ty == TYPE_S32;
+   case OP_POW:
+   case OP_SQRT:
+   case OP_DIV:
+   case OP_MOD:
+      return false;
+   default:
+      return true;
+   }
+}
+
+// Tells whether modifier `mod` may be applied to source `s` of `insn`.
+//
+// insn: instruction whose source is to be modified
+// s:    source index; only sources 0..2 can carry modifiers
+// mod:  modifier(s) to check; all of them must be allowed by
+//       opInfo[insn->op].srcMods[s]
+//
+// For non-float instructions only a fixed set of operations accepts
+// modifiers at all, and integer ADD/SUB cannot negate both sources.
+bool
+TargetNVC0::isModSupported(const Instruction *insn, int s, Modifier mod) const
+{
+   // initOpInfo() only sets up srcMods[] for sources 0..2, so every other
+   // index, s == 3 included, is rejected before the table is touched.
+   if (s < 0 || s >= 3)
+      return false;
+
+   if (!isFloatType(insn->dType)) {
+      switch (insn->op) {
+      case OP_ABS:
+      case OP_NEG:
+      case OP_CVT:
+      case OP_CEIL:
+      case OP_FLOOR:
+      case OP_TRUNC:
+      case OP_AND:
+      case OP_OR:
+      case OP_XOR:
+         break;
+      case OP_ADD:
+         // refuse if the other source is negated already
+         if (insn->src[s ? 0 : 1].mod.neg())
+            return false;
+         break;
+      case OP_SUB:
+         if (s == 0)
+            return insn->src[1].mod.neg() ? false : true;
+         break;
+      default:
+         return false;
+      }
+   }
+   return (mod & Modifier(opInfo[insn->op].srcMods[s])) == mod;
+}
+
+// An instruction may receive a predicate if it does not have one yet and its
+// operation is flagged as predicable in opInfo. `pred` is not inspected.
+bool
+TargetNVC0::mayPredicate(const Instruction *insn, const Value *pred) const
+{
+   if (!insn->getPredicate())
+      return opInfo[insn->op].predicate;
+   return false;
+}
+
+// Tells whether the result of `insn` may be saturated.
+// CVT always can; any other operation needs NV50_IR_MOD_SAT in its dstMods
+// and either an F32 result, or a U32 result of an ADD or MAD.
+bool
+TargetNVC0::isSatSupported(const Instruction *insn) const
+{
+   if (insn->op == OP_CVT)
+      return true;
+
+   const bool satInTable = (opInfo[insn->op].dstMods & NV50_IR_MOD_SAT) != 0;
+   if (!satInTable)
+      return false;
+
+   switch (insn->dType) {
+   case TYPE_U32:
+      return insn->op == OP_ADD || insn->op == OP_MAD;
+   case TYPE_F32:
+      return true;
+   default:
+      return false;
+   }
+}
+
+// Rough result latency estimate: 24 for everything but loads, 48 for loads
+// and 700 for loads using the CACHE_CV cache mode.
+// TODO: better values
+int TargetNVC0::getLatency(const Instruction *i) const
+{
+   if (i->op != OP_LOAD)
+      return 24;
+   return (i->cache == CACHE_CV) ? 700 : 48;
+}
+
+// Returns the "inverse" throughput of an instruction, i.e. the number of
+// cycles it takes to issue it for a full warp (32 threads).
+//
+// With more than 1 warp in flight, a higher issue latency leads to a lower
+// result latency, because the MP will have spent more time on other warps.
+// The value also helps to determine the number of cycles between
+// instructions of a single warp.
+//
+int TargetNVC0::getThroughput(const Instruction *i) const
+{
+   // TODO: better values
+   switch (i->dType) {
+   case TYPE_F32:
+      switch (i->op) {
+      case OP_ADD:
+      case OP_MUL:
+      case OP_MAD:
+      case OP_FMA:
+         return 1;
+      case OP_CVT:
+      case OP_CEIL:
+      case OP_FLOOR:
+      case OP_TRUNC:
+      case OP_SET:
+      case OP_SLCT:
+      case OP_MIN:
+      case OP_MAX:
+         return 2;
+      default:
+         // RCP, RSQ, LG2, SIN, COS, PRESIN, PREEX2 and all remaining ops
+         return 8;
+      }
+   case TYPE_U32:
+   case TYPE_S32:
+      switch (i->op) {
+      case OP_ADD:
+      case OP_AND:
+      case OP_OR:
+      case OP_XOR:
+      case OP_NOT:
+         return 1;
+      default:
+         // MUL, MAD, CVT, SET, SLCT, SHL, SHR, NEG, ABS, MIN, MAX and all
+         // remaining ops
+         return 2;
+      }
+   case TYPE_F64:
+      return 2;
+   default:
+      return 1;
+   }
+}
+
+} // namespace nv50_ir
diff --git a/src/gallium/drivers/nvc0/codegen/nv50_ir_target_nvc0.h b/src/gallium/drivers/nvc0/codegen/nv50_ir_target_nvc0.h
new file mode 100644
index 00000000000..f96bfbeaa6a
--- /dev/null
+++ b/src/gallium/drivers/nvc0/codegen/nv50_ir_target_nvc0.h
@@ -0,0 +1,46 @@
+
+#ifndef NV50_IR_TARGET_NVC0_H
+#define NV50_IR_TARGET_NVC0_H
+
+#include "nv50/codegen/nv50_ir_target.h"
+
+namespace nv50_ir {
+
+// Ids of the builtin library routines, to be passed to
+// TargetNVC0::getBuiltinOffset().
+#define NVC0_BUILTIN_DIV_U32 0
+#define NVC0_BUILTIN_DIV_S32 1
+#define NVC0_BUILTIN_RCP_F64 2
+#define NVC0_BUILTIN_RSQ_F64 3
+
+// Number of builtin routines, i.e. one more than the highest id above.
+#define NVC0_BUILTIN_COUNT 4
+
+// Target description for NVC0 chipsets.
+class TargetNVC0 : public Target
+{
+public:
+   // Stores the chipset id and sets up the operation info table.
+   TargetNVC0(unsigned int chipset);
+
+   virtual CodeEmitter *getCodeEmitter(Program::Type);
+
+   // Runs the legalization pass for the given stage; false if there is none.
+   virtual bool runLegalizePass(Program *, CGStage stage) const;
+
+   // Returns the builtin library code and its size in bytes.
+   virtual void getBuiltinCode(const uint32_t **code, uint32_t *size) const;
+
+   // Queries about what an instruction or one of its sources can encode.
+   virtual bool insnCanLoad(const Instruction *insn, int s,
+                            const Instruction *ld) const;
+   virtual bool isOpSupported(operation, DataType) const;
+   virtual bool isModSupported(const Instruction *, int s, Modifier) const;
+   virtual bool isSatSupported(const Instruction *) const;
+   virtual bool mayPredicate(const Instruction *, const Value *) const;
+
+   // Rough per-instruction cost estimates.
+   virtual int getLatency(const Instruction *) const;
+   virtual int getThroughput(const Instruction *) const;
+
+   virtual unsigned int getFileSize(DataFile) const;
+   virtual unsigned int getFileUnit(DataFile) const;
+
+   // Address of a system value; 0xffffffff if it has none.
+   virtual uint32_t getSVAddress(DataFile shaderFile, const Symbol *sv) const;
+
+   // Byte offset of builtin routine NVC0_BUILTIN_* inside the builtin code.
+   uint32_t getBuiltinOffset(int builtin) const;
+
+private:
+   void initOpInfo();
+
+};
+
+} // namespace nv50_ir
+
+#endif // NV50_IR_TARGET_NVC0_H
author	Christoph Bumiller <[email protected]>	2011-09-14 16:18:23 +0200
committer	Christoph Bumiller <[email protected]>	2011-09-14 16:19:52 +0200
commit	57594065c30feec9376be9b2132659f7d87362ee (patch)
tree	7e6808e0c5240b513851b7925c5be6678663b5e5 /src/gallium/drivers/nvc0
parent	a42eca84c56f6860e67c0c57f4765a5530cc5f81 (diff)