nv50/ir: import nv50 target

author: Christoph Bumiller <[email protected]> 2012-04-14 21:40:35 +0200
committer: Christoph Bumiller <[email protected]> 2012-04-14 21:54:04 +0200
commit: 322bc7ed68ed92233c97168c036d0aa50c11a20e (patch)
tree: 0ecfa903b0950c71455b6420c4f3550e4abb2007 /src/gallium/drivers/nv50/codegen
parent: 15ce0f76e2e014374a292550505f58da88333fb7 (diff)
11 files changed, 2472 insertions, 219 deletions
diff --git a/src/gallium/drivers/nv50/codegen/nv50_ir.cpp b/src/gallium/drivers/nv50/codegen/nv50_ir.cpp
index 19a90806c70..048759060ad 100644
--- a/src/gallium/drivers/nv50/codegen/nv50_ir.cpp
+++ b/src/gallium/drivers/nv50/codegen/nv50_ir.cpp
@@ -559,8 +559,11 @@ void Instruction::init()
    subOp = 0;
 
    saturate = 0;
-   join = terminator = 0;
-   ftz = dnz = 0;
+   join = 0;
+   exit = 0;
+   terminator = 0;
+   ftz = 0;
+   dnz = 0;
    atomic = 0;
    perPatch = 0;
    fixed = 0;
@@ -982,6 +985,9 @@ Program::Program(Type type, Target *arch)
    calls.insert(&main->call);
 
    dbgFlags = 0;
+   optLevel = 0;
+
+   targetPriv = NULL;
 }
 
 Program::~Program()
@@ -1085,6 +1091,7 @@ nv50_ir_generate_code(struct nv50_ir_prog_info *info)
    if (!prog)
       return -1;
    prog->dbgFlags = info->dbgFlags;
+   prog->optLevel = info->optLevel;
 
    switch (info->bin.sourceRep) {
 #if 0
@@ -1105,6 +1112,7 @@ nv50_ir_generate_code(struct nv50_ir_prog_info *info)
    if (prog->dbgFlags & NV50_IR_DEBUG_VERBOSE)
       prog->print();
 
+   targ->parseDriverInfo(info);
    prog->getTarget()->runLegalizePass(prog, nv50_ir::CG_STAGE_PRE_SSA);
 
    prog->convertToSSA();
diff --git a/src/gallium/drivers/nv50/codegen/nv50_ir.h b/src/gallium/drivers/nv50/codegen/nv50_ir.h
index c0a867d9552..6ec4fc95441 100644
--- a/src/gallium/drivers/nv50/codegen/nv50_ir.h
+++ b/src/gallium/drivers/nv50/codegen/nv50_ir.h
@@ -140,6 +140,7 @@ enum operation
 #define NV50_IR_SUBOP_LDC_IS       2
 #define NV50_IR_SUBOP_LDC_ISL      3
 #define NV50_IR_SUBOP_SHIFT_WRAP   1
+#define NV50_IR_SUBOP_EMU_PRERET   1
 
 enum DataType
 {
@@ -1060,6 +1061,9 @@ public:
    MemoryPool mem_ImmediateValue;
 
    uint32_t dbgFlags;
+   uint8_t  optLevel;
+
+   void *targetPriv; // e.g. to carry information between passes
 
    void releaseInstruction(Instruction *);
    void releaseValue(Value *);
diff --git a/src/gallium/drivers/nv50/codegen/nv50_ir_build_util.h b/src/gallium/drivers/nv50/codegen/nv50_ir_build_util.h
index 022a27f1748..9ee04dbcd12 100644
--- a/src/gallium/drivers/nv50/codegen/nv50_ir_build_util.h
+++ b/src/gallium/drivers/nv50/codegen/nv50_ir_build_util.h
@@ -46,7 +46,8 @@ public:
    inline void remove(Instruction *i) { assert(i->bb == bb); bb->remove(i); }
 
    inline LValue *getScratch(int size = 4, DataFile = FILE_GPR);
-   inline LValue *getSSA(int size = 4); // scratch value for a single assignment
+   // scratch value for a single assignment:
+   inline LValue *getSSA(int size = 4, DataFile = FILE_GPR);
 
    inline Instruction *mkOp(operation, DataType, Value *);
    Instruction *mkOp1(operation, DataType, Value *, Value *);
@@ -215,18 +216,16 @@ LValue *
 BuildUtil::getScratch(int size, DataFile f)
 {
    LValue *lval = new_LValue(func, f);
-   if (size != 4)
-      lval->reg.size = size;
+   lval->reg.size = size;
    return lval;
 }
 
 LValue *
-BuildUtil::getSSA(int size)
+BuildUtil::getSSA(int size, DataFile f)
 {
-   LValue *lval = new_LValue(func, FILE_GPR);
+   LValue *lval = new_LValue(func, f);
    lval->ssa = 1;
-   if (size != 4)
-      lval->reg.size = size;
+   lval->reg.size = size;
    return lval;
 }
 
diff --git a/src/gallium/drivers/nv50/codegen/nv50_ir_driver.h b/src/gallium/drivers/nv50/codegen/nv50_ir_driver.h
index ae733a1a924..dc42b8295e9 100644
--- a/src/gallium/drivers/nv50/codegen/nv50_ir_driver.h
+++ b/src/gallium/drivers/nv50/codegen/nv50_ir_driver.h
@@ -175,6 +175,8 @@ struct nv50_ir_prog_info
 
    /* driver callback to assign input/output locations */
    int (*assignSlots)(struct nv50_ir_prog_info *);
+
+   void *driverPriv;
 };
 
 #ifdef __cplusplus
diff --git a/src/gallium/drivers/nv50/codegen/nv50_ir_emit_nv50.cpp b/src/gallium/drivers/nv50/codegen/nv50_ir_emit_nv50.cpp
index 1c09494f46d..c534d4a0c5e 100644
--- a/src/gallium/drivers/nv50/codegen/nv50_ir_emit_nv50.cpp
+++ b/src/gallium/drivers/nv50/codegen/nv50_ir_emit_nv50.cpp
@@ -21,14 +21,19 @@
  */
 
 #include "nv50_ir.h"
-#include "nv50_ir_target.h"
+#include "nv50_ir_target_nv50.h"
 
 namespace nv50_ir {
 
+#define NV50_OP_ENC_LONG     0
+#define NV50_OP_ENC_SHORT    1
+#define NV50_OP_ENC_IMM      2
+#define NV50_OP_ENC_LONG_ALT 3
+
 class CodeEmitterNV50 : public CodeEmitter
 {
 public:
-   CodeEmitterNV50(const Target *);
+   CodeEmitterNV50(const TargetNV50 *);
 
    virtual bool emitInstruction(Instruction *);
 
@@ -36,23 +41,25 @@ public:
 
    inline void setProgramType(Program::Type pType) { progType = pType; }
 
-private:
-   const Target *targ;
+   virtual void prepareEmission(Function *);
 
+private:
    Program::Type progType;
 
+   const TargetNV50 *targ;
+
 private:
    inline void defId(const ValueDef&, const int pos);
    inline void srcId(const ValueRef&, const int pos);
    inline void srcId(const ValueRef *, const int pos);
 
-   inline void srcAddr16(const ValueRef&, const int pos);
+   inline void srcAddr16(const ValueRef&, bool adj, const int pos);
    inline void srcAddr8(const ValueRef&, const int pos);
 
    void emitFlagsRd(const Instruction *);
    void emitFlagsWr(const Instruction *);
 
-   void emitCondCode(CondCode cc, int pos);
+   void emitCondCode(CondCode cc, DataType ty, int pos);
 
    inline void setARegBits(unsigned int);
 
@@ -61,16 +68,16 @@ private:
 
    void setDst(const Value *);
    void setDst(const Instruction *, int d);
-   void emitSrc0(const ValueRef&);
-   void emitSrc1(const ValueRef&);
-   void emitSrc2(const ValueRef&);
+   void setSrcFileBits(const Instruction *, int enc);
+   void setSrc(const Instruction *, unsigned int s, int slot);
 
    void emitForm_MAD(const Instruction *);
    void emitForm_ADD(const Instruction *);
    void emitForm_MUL(const Instruction *);
    void emitForm_IMM(const Instruction *);
 
-   void emitLoadStoreSize(DataType ty, int pos);
+   void emitLoadStoreSizeLG(DataType ty, int pos);
+   void emitLoadStoreSizeCS(DataType ty);
 
    void roundMode_MAD(const Instruction *);
    void roundMode_CVT(RoundMode);
@@ -88,9 +95,10 @@ private:
    void emitUADD(const Instruction *);
    void emitAADD(const Instruction *);
    void emitFADD(const Instruction *);
-   void emitUMUL(const Instruction *);
+   void emitIMUL(const Instruction *);
    void emitFMUL(const Instruction *);
    void emitFMAD(const Instruction *);
+   void emitIMAD(const Instruction *);
 
    void emitMINMAX(const Instruction *);
 
@@ -98,17 +106,20 @@ private:
    void emitSFnOp(const Instruction *, uint8_t subOp);
 
    void emitShift(const Instruction *);
-   void emitARL(const Instruction *);
+   void emitARL(const Instruction *, unsigned int shl);
    void emitLogicOp(const Instruction *);
+   void emitNOT(const Instruction *);
 
    void emitCVT(const Instruction *);
    void emitSET(const Instruction *);
 
    void emitTEX(const TexInstruction *);
+   void emitTXQ(const TexInstruction *);
 
    void emitQUADOP(const Instruction *, uint8_t lane, uint8_t quOp);
 
    void emitFlow(const Instruction *, uint8_t flowOp);
+   void emitPRERETEmu(const FlowInstruction *);
 };
 
 #define SDATA(a) ((a).rep()->reg.data)
@@ -126,13 +137,20 @@ void CodeEmitterNV50::srcId(const ValueRef *src, const int pos)
    code[pos / 32] |= SDATA(*src).id << (pos % 32);
 }
 
-void CodeEmitterNV50::srcAddr16(const ValueRef& src, const int pos)
+void CodeEmitterNV50::srcAddr16(const ValueRef& src, bool adj, const int pos)
 {
    assert(src.get());
 
-   uint32_t offset = SDATA(src).offset;
+   int32_t offset = SDATA(src).offset;
+
+   assert(!adj || src.get()->reg.size <= 4);
+   if (adj)
+      offset /= src.get()->reg.size;
 
-   assert(offset <= 0xffff && (pos % 32) <= 16);
+   assert(offset <= 0x7fff && offset >= (int32_t)-0x8000 && (pos % 32) <= 16);
+
+   if (offset < 0)
+      offset &= adj ? (0xffff >> (src.get()->reg.size >> 1)) : 0xffff;
 
    code[pos / 32] |= offset << (pos % 32);
 }
@@ -143,14 +161,15 @@ void CodeEmitterNV50::srcAddr8(const ValueRef& src, const int pos)
 
    uint32_t offset = SDATA(src).offset;
 
-   assert(offset <= 0x1fc && !(offset & 0x3));
+   assert((offset <= 0x1fc || offset == 0x3fc) && !(offset & 0x3));
 
    code[pos / 32] |= (offset >> 2) << (pos % 32);
 }
 
 void CodeEmitterNV50::defId(const ValueDef& def, const int pos)
 {
-   assert(def.get());
+   assert(def.get() && def.getFile() != FILE_SHADER_OUTPUT);
+
    code[pos / 32] |= DDATA(def).id << (pos % 32);
 }
 
@@ -170,11 +189,11 @@ CodeEmitterNV50::roundMode_MAD(const Instruction *insn)
 void
 CodeEmitterNV50::emitMNeg12(const Instruction *i)
 {
-   code[1] |= i->src[0].mod.neg() << 26;
-   code[1] |= i->src[1].mod.neg() << 27;
+   code[1] |= i->src(0).mod.neg() << 26;
+   code[1] |= i->src(1).mod.neg() << 27;
 }
 
-void CodeEmitterNV50::emitCondCode(CondCode cc, int pos)
+void CodeEmitterNV50::emitCondCode(CondCode cc, DataType ty, int pos)
 {
    uint8_t enc;
 
@@ -210,6 +229,9 @@ void CodeEmitterNV50::emitCondCode(CondCode cc, int pos)
       assert(!"invalid condition code");
       break;
    }
+   if (ty != TYPE_NONE && !isFloatType(ty))
+      enc &= ~0x8; // unordered only exists for float types
+
    code[pos / 32] |= enc << (pos % 32);
 }
 
@@ -222,8 +244,8 @@ CodeEmitterNV50::emitFlagsRd(const Instruction *i)
 
    if (s >= 0) {
       assert(i->getSrc(s)->reg.file == FILE_FLAGS);
-      emitCondCode(i->cc, 32 + 7);
-      srcId(i->src[s], 32 + 12);
+      emitCondCode(i->cc, TYPE_NONE, 32 + 7);
+      srcId(i->src(s), 32 + 12);
    } else {
       code[1] |= 0x0780;
    }
@@ -234,8 +256,22 @@ CodeEmitterNV50::emitFlagsWr(const Instruction *i)
 {
    assert(!(code[1] & 0x70));
 
-   if (i->flagsDef >= 0)
-      code[1] |= (DDATA(i->def[i->flagsDef]).id << 4) | 0x40;
+   int flagsDef = i->flagsDef;
+
+   // find flags definition and check that it is the last def
+   if (flagsDef < 0) {
+      for (int d = 0; i->defExists(d); ++d)
+         if (i->def(d).getFile() == FILE_FLAGS)
+            flagsDef = d;
+      if (flagsDef >= 0 && 0) // TODO: enforce use of flagsDef at some point
+         WARN("Instruction::flagsDef was not set properly\n");
+   }
+   if (flagsDef == 0 && i->defExists(1))
+      WARN("flags def should not be the primary definition\n");
+
+   if (flagsDef >= 0)
+      code[1] |= (DDATA(i->def(flagsDef)).id << 4) | 0x40;
+
 }
 
 void
@@ -248,20 +284,27 @@ CodeEmitterNV50::setARegBits(unsigned int u)
 void
 CodeEmitterNV50::setAReg16(const Instruction *i, int s)
 {
-   s = i->src[s].indirect[0];
-   if (s >= 0)
-      setARegBits(SDATA(i->src[s]).id + 1);
+   if (i->srcExists(s)) {
+      s = i->src(s).indirect[0];
+      if (s >= 0)
+         setARegBits(SDATA(i->src(s)).id + 1);
+   }
 }
 
 void
 CodeEmitterNV50::setImmediate(const Instruction *i, int s)
 {
-   const ImmediateValue *imm = i->src[s].get()->asImm();
+   const ImmediateValue *imm = i->src(s).get()->asImm();
    assert(imm);
 
+   uint32_t u = imm->reg.data.u32;
+
+   if (i->src(s).mod & Modifier(NV50_IR_MOD_NOT))
+      u = ~u;
+
    code[1] |= 3;
-   code[0] |= (imm->reg.data.u32 & 0x3f) << 16;
-   code[1] |= (imm->reg.data.u32 >> 6) << 2;
+   code[0] |= (u & 0x3f) << 16;
+   code[1] |= (u >> 6) << 2;
 }
 
 void
@@ -271,13 +314,18 @@ CodeEmitterNV50::setDst(const Value *dst)
 
    assert(reg->file != FILE_ADDRESS);
 
-   if (reg->data.id < 0) {
+   if (reg->data.id < 0 || reg->file == FILE_FLAGS) {
       code[0] |= (127 << 2) | 1;
       code[1] |= 8;
    } else {
-      if (reg->file == FILE_SHADER_OUTPUT)
+      int id;
+      if (reg->file == FILE_SHADER_OUTPUT) {
          code[1] |= 8;
-      code[0] |= reg->data.id << 2;
+         id = reg->data.offset / 4;
+      } else {
+         id = reg->data.id;
+      }
+      code[0] |= id << 2;
    }
 }
 
@@ -293,60 +341,135 @@ CodeEmitterNV50::setDst(const Instruction *i, int d)
    }
 }
 
+// 3 * 2 bits:
+// 0: r
+// 1: a/s
+// 2: c
+// 3: i
 void
-CodeEmitterNV50::emitSrc0(const ValueRef& ref)
+CodeEmitterNV50::setSrcFileBits(const Instruction *i, int enc)
 {
-   const Storage *reg = &ref.rep()->reg;
-
-   if (reg->file == FILE_SHADER_INPUT)
-      code[1] |= 0x00200000;
-   else
-   if (reg->file != FILE_GPR)
-      ERROR("invalid src0 register file: %d\n", reg->file);
+   uint8_t mode = 0;
 
-   assert(reg->data.id < 128);
-   code[0] |= reg->data.id << 9;
-}
+   for (unsigned int s = 0; s < Target::operationSrcNr[i->op]; ++s) {
+      switch (i->src(s).getFile()) {
+      case FILE_GPR:
+         break;
+      case FILE_MEMORY_SHARED:
+      case FILE_SHADER_INPUT:
+         mode |= 1 << (s * 2);
+         break;
+      case FILE_MEMORY_CONST:
+         mode |= 2 << (s * 2);
+         break;
+      case FILE_IMMEDIATE:
+         mode |= 3 << (s * 2);
+         break;
+      default:
+	      ERROR("invalid file on source %i: %u\n", s, i->src(s).getFile());
+         assert(0);
+         break;
+      }
+   }
+   switch (mode) {
+   case 0x00: // rrr
+      break;
+   case 0x01: // arr/grr
+      if (progType == Program::TYPE_GEOMETRY) {
+         code[0] |= 0x01800000;
+         if (enc == NV50_OP_ENC_LONG || enc == NV50_OP_ENC_LONG_ALT)
+            code[1] |= 0x00200000;
+      } else {
+         if (enc == NV50_OP_ENC_SHORT)
+            code[0] |= 0x01000000;
+         else
+            code[1] |= 0x00200000;
+      }
+      break;
+   case 0x03: // irr
+      assert(i->op == OP_MOV);
+      return;
+   case 0x0c: // rir
+      break;
+   case 0x0d: // gir
+      code[0] |= 0x01000000;
+      assert(progType == Program::TYPE_GEOMETRY ||
+             progType == Program::TYPE_COMPUTE);
+      break;
+   case 0x08: // rcr
+      code[0] |= (enc == NV50_OP_ENC_LONG_ALT) ? 0x01000000 : 0x00800000;
+      code[1] |= (i->getSrc(1)->reg.fileIndex << 22);
+      break;
+   case 0x09: // acr/gcr
+      if (progType == Program::TYPE_GEOMETRY) {
+         code[0] |= 0x01800000;
+      } else {
+         code[0] |= (enc == NV50_OP_ENC_LONG_ALT) ? 0x01000000 : 0x00800000;
+         code[1] |= 0x00200000;
+      }
+      code[1] |= (i->getSrc(1)->reg.fileIndex << 22);
+      break;
+   case 0x20: // rrc
+      code[0] |= 0x01000000;
+      code[1] |= (i->getSrc(2)->reg.fileIndex << 22);
+      break;
+   case 0x21: // arc
+      code[0] |= 0x01000000;
+      code[1] |= 0x00200000 | (i->getSrc(2)->reg.fileIndex << 22);
+      assert(progType != Program::TYPE_GEOMETRY);
+      break;
+   default:
+      ERROR("not encodable: %x\n", mode);
+      assert(0);
+      break;
+   }
+   if (progType != Program::TYPE_COMPUTE)
+      return;
 
-void
-CodeEmitterNV50::emitSrc1(const ValueRef& ref)
-{
-   const Storage *reg = &ref.rep()->reg;
+   if ((mode & 3) == 1) {
+      const int pos = i->src(1).getFile() == FILE_IMMEDIATE ? 13 : 14;
 
-   if (reg->file == FILE_MEMORY_CONST) {
-      assert(!(code[1] & 0x01800000));
-      code[0] |= 1 << 23;
-      code[1] |= reg->fileIndex << 22;
-   } else
-   if (reg->file != FILE_GPR) {
-      ERROR("invalid src1 register file: %d\n", reg->file);
+      switch (i->getSrc(0)->reg.type) {
+      case TYPE_U8:
+         break;
+      case TYPE_U16:
+         code[0] |= 1 << pos;
+         break;
+      case TYPE_S16:
+         code[0] |= 2 << pos;
+         break;
+      default:
+         code[0] |= 3 << pos;
+         assert(i->getSrc(0)->reg.size == 4);
+         break;
+      }
    }
-
-   assert(reg->data.id < 128);
-   code[0] |= reg->data.id << 16;
 }
 
 void
-CodeEmitterNV50::emitSrc2(const ValueRef& ref)
+CodeEmitterNV50::setSrc(const Instruction *i, unsigned int s, int slot)
 {
-   const Storage *reg = &ref.rep()->reg;
-
-   if (reg->file == FILE_MEMORY_CONST) {
-      assert(!(code[1] & 0x01800000));
-      code[0] |= 1 << 24;
-      code[1] |= reg->fileIndex << 22;
-   } else
-   if (reg->file != FILE_GPR) {
-      ERROR("invalid src1 register file: %d\n", reg->file);
+   if (Target::operationSrcNr[i->op] <= s)
+      return;
+   const Storage *reg = &i->src(s).rep()->reg;
+
+   unsigned int id = (reg->file == FILE_GPR) ?
+      reg->data.id :
+      reg->data.offset >> (reg->size >> 1); // no > 4 byte sources here
+
+   switch (slot) {
+   case 0: code[0] |= id << 9; break;
+   case 1: code[0] |= id << 16; break;
+   case 2: code[1] |= id << 14; break;
+   default:
+      assert(0);
+      break;
    }
-
-   assert(reg->data.id < 128);
-   code[1] |= reg->data.id << 14;
 }
 
 // the default form:
 //  - long instruction
-//  - 1 to 3 sources in slots 0, 1, 2
+//  - 1 to 3 sources in slots 0, 1, 2 (rrr, arr, rcr, acr, rrc, arc, gcr, grr)
 //  - address & flags
 void
 CodeEmitterNV50::emitForm_MAD(const Instruction *i)
@@ -359,14 +482,10 @@ CodeEmitterNV50::emitForm_MAD(const Instruction *i)
 
    setDst(i, 0);
 
-   if (i->srcExists(0))
-      emitSrc0(i->src[0]);
-
-   if (i->srcExists(1))
-      emitSrc1(i->src[1]);
-
-   if (i->srcExists(2))
-      emitSrc2(i->src[2]);
+   setSrcFileBits(i, NV50_OP_ENC_LONG);
+   setSrc(i, 0, 0);
+   setSrc(i, 1, 1);
+   setSrc(i, 2, 2);
 
    setAReg16(i, 1);
 }
@@ -383,16 +502,14 @@ CodeEmitterNV50::emitForm_ADD(const Instruction *i)
 
    setDst(i, 0);
 
-   if (i->srcExists(0))
-      emitSrc0(i->src[0]);
-
-   if (i->srcExists(1))
-      emitSrc2(i->src[1]);
+   setSrcFileBits(i, NV50_OP_ENC_LONG_ALT);
+   setSrc(i, 0, 0);
+   setSrc(i, 1, 2);
 
    setAReg16(i, 1);
 }
 
-// default short form
+// default short form (rr, ar, rc, gr)
 void
 CodeEmitterNV50::emitForm_MUL(const Instruction *i)
 {
@@ -402,15 +519,13 @@ CodeEmitterNV50::emitForm_MUL(const Instruction *i)
 
    setDst(i, 0);
 
-   if (i->srcExists(0))
-      emitSrc0(i->src[0]);
-
-   if (i->srcExists(1))
-      emitSrc1(i->src[1]);
+   setSrcFileBits(i, NV50_OP_ENC_SHORT);
+   setSrc(i, 0, 0);
+   setSrc(i, 1, 1);
 }
 
 // usual immediate form
-// - 1 to 3 sources where last is immediate
+// - 1 to 3 sources where last is immediate (rir, gir)
 // - no address or predicate possible
 void
 CodeEmitterNV50::emitForm_IMM(const Instruction *i)
@@ -422,21 +537,18 @@ CodeEmitterNV50::emitForm_IMM(const Instruction *i)
 
    setDst(i, 0);
 
-   if (i->srcExists(2)) {
-      emitSrc0(i->src[0]);
-      emitSrc1(i->src[1]);
-      setImmediate(i, 2);
-   } else
-   if (i->srcExists(1)) {
-      emitSrc0(i->src[0]);
+   setSrcFileBits(i, NV50_OP_ENC_IMM);
+   if (Target::operationSrcNr[i->op] > 1) {
+      setSrc(i, 0, 0);
       setImmediate(i, 1);
+      setSrc(i, 2, 1);
    } else {
       setImmediate(i, 0);
    }
 }
 
 void
-CodeEmitterNV50::emitLoadStoreSize(DataType ty, int pos)
+CodeEmitterNV50::emitLoadStoreSizeLG(DataType ty, int pos)
 {
    uint8_t enc;
 
@@ -445,7 +557,9 @@ CodeEmitterNV50::emitLoadStoreSize(DataType ty, int pos)
    case TYPE_S32: // fall through
    case TYPE_U32:  enc = 0x6; break;
    case TYPE_B128: enc = 0x5; break;
-   case TYPE_F64:  enc = 0x4; break;
+   case TYPE_F64: // fall through
+   case TYPE_S64: // fall through
+   case TYPE_U64:  enc = 0x4; break;
    case TYPE_S16:  enc = 0x3; break;
    case TYPE_U16:  enc = 0x2; break;
    case TYPE_S8:   enc = 0x1; break;
@@ -459,18 +573,58 @@ CodeEmitterNV50::emitLoadStoreSize(DataType ty, int pos)
 }
 
 void
+CodeEmitterNV50::emitLoadStoreSizeCS(DataType ty)
+{
+   switch (ty) {
+   case TYPE_U8: break;
+   case TYPE_U16: code[1] |= 0x4000; break;
+   case TYPE_S16: code[1] |= 0x8000; break;
+   case TYPE_F32:
+   case TYPE_S32:
+   case TYPE_U32: code[1] |= 0xc000; break;
+   default:
+      assert(0);
+      break;
+   }
+}
+
+void
 CodeEmitterNV50::emitLOAD(const Instruction *i)
 {
-   DataFile sf = i->src[0].getFile();
+   DataFile sf = i->src(0).getFile();
+   int32_t offset = i->getSrc(0)->reg.data.offset;
 
    switch (sf) {
    case FILE_SHADER_INPUT:
-      code[0] = 0x10000001;
-      code[1] = 0x04200000 | (i->lanes << 14);
+      // use 'mov' where we can
+      code[0] = i->src(0).isIndirect(0) ? 0x00000001 : 0x10000001;
+      code[1] = 0x00200000 | (i->lanes << 14);
+      if (typeSizeof(i->dType) == 4)
+         code[1] |= 0x04000000;
+      break;
+   case FILE_MEMORY_SHARED:
+      if (targ->getChipset() >= 0x84) {
+         assert(offset <= (int32_t)(0x3fff * typeSizeof(i->sType)));
+         code[0] = 0x10000001;
+         code[1] = 0x40000000;
+
+         if (typeSizeof(i->dType) == 4)
+            code[1] |= 0x04000000;
+
+         emitLoadStoreSizeCS(i->sType);
+      } else {
+         assert(offset <= (int32_t)(0x1f * typeSizeof(i->sType)));
+         code[0] = 0x10000001;
+         code[1] = 0x00200000 | (i->lanes << 14);
+         emitLoadStoreSizeCS(i->sType);
+      }
       break;
    case FILE_MEMORY_CONST:
       code[0] = 0x10000001;
-      code[1] = 0x24000000 | (i->getSrc(0)->reg.fileIndex << 22);
+      code[1] = 0x20000000 | (i->getSrc(0)->reg.fileIndex << 22);
+      if (typeSizeof(i->dType) == 4)
+         code[1] |= 0x04000000;
+      emitLoadStoreSizeCS(i->sType);
       break;
    case FILE_MEMORY_LOCAL:
       code[0] = 0xd0000001;
@@ -486,18 +640,18 @@ CodeEmitterNV50::emitLOAD(const Instruction *i)
    }
    if (sf == FILE_MEMORY_LOCAL ||
        sf == FILE_MEMORY_GLOBAL)
-      emitLoadStoreSize(i->sType, 21 + 32);
+      emitLoadStoreSizeLG(i->sType, 21 + 32);
 
    setDst(i, 0);
 
    emitFlagsRd(i);
    emitFlagsWr(i);
 
-   if (i->src[0].getFile() == FILE_MEMORY_GLOBAL) {
-      srcId(*i->src[0].getIndirect(0), 9);
+   if (i->src(0).getFile() == FILE_MEMORY_GLOBAL) {
+      srcId(*i->src(0).getIndirect(0), 9);
    } else {
       setAReg16(i, 0);
-      srcAddr16(i->src[0], 9);
+      srcAddr16(i->src(0), i->src(0).getFile() != FILE_MEMORY_LOCAL, 9);
    }
 }
 
@@ -509,19 +663,21 @@ CodeEmitterNV50::emitSTORE(const Instruction *i)
 
    switch (f) {
    case FILE_SHADER_OUTPUT:
-      code[0] = 0x00000001 | ((offset >> 2) << 2);
+      code[0] = 0x00000001 | ((offset >> 2) << 9);
       code[1] = 0x80c00000;
-      srcId(i->src[1], 32 + 15);
+      srcId(i->src(1), 32 + 14);
       break;
    case FILE_MEMORY_GLOBAL:
-      code[0] = 0xd0000000;
+      code[0] = 0xd0000001 | (i->getSrc(0)->reg.fileIndex << 16);
       code[1] = 0xa0000000;
-      emitLoadStoreSize(i->dType, 21 + 32);
+      emitLoadStoreSizeLG(i->dType, 21 + 32);
+      srcId(i->src(1), 2);
       break;
    case FILE_MEMORY_LOCAL:
       code[0] = 0xd0000001;
       code[1] = 0x60000000;
-      emitLoadStoreSize(i->dType, 21 + 32);
+      emitLoadStoreSizeLG(i->dType, 21 + 32);
+      srcId(i->src(1), 2);
       break;
    case FILE_MEMORY_SHARED:
       code[0] = 0x00000001;
@@ -536,28 +692,27 @@ CodeEmitterNV50::emitSTORE(const Instruction *i)
          break;
       case 4:
          code[0] |= (offset >> 2) << 9;
-         code[1] |= 0x04000000;
+         code[1] |= 0x04200000;
          break;
       default:
          assert(0);
          break;
       }
+      srcId(i->src(1), 32 + 14);
       break;
    default:
       assert(!"invalid store destination file");
       break;
    }
 
-   if (f != FILE_SHADER_OUTPUT) {
-      srcId(i->src[1], 2);
-      if (f == FILE_MEMORY_GLOBAL)
-         srcId(*i->src[0].getIndirect(0), 9);
-      if (f == FILE_MEMORY_LOCAL)
-         srcAddr16(i->src[0], 9);
-   }
-   if (f != FILE_MEMORY_GLOBAL)
+   if (f == FILE_MEMORY_GLOBAL)
+      srcId(*i->src(0).getIndirect(0), 9);
+   else
       setAReg16(i, 0);
 
+   if (f == FILE_MEMORY_LOCAL)
+      srcAddr16(i->src(0), false, 9);
+
    emitFlagsRd(i);
 }
 
@@ -572,21 +727,22 @@ CodeEmitterNV50::emitMOV(const Instruction *i)
    if (sf == FILE_FLAGS) {
       code[0] = 0x00000001;
       code[1] = 0x20000000;
-      defId(i->def[0], 2);
-      srcId(i->src[0], 12);
+      defId(i->def(0), 2);
+      srcId(i->src(0), 12);
       emitFlagsRd(i);
    } else
    if (sf == FILE_ADDRESS) {
       code[0] = 0x00000001;
       code[1] = 0x40000000;
-      defId(i->def[0], 2);
-      setARegBits(SDATA(i->src[0]).id + 1);
+      defId(i->def(0), 2);
+      setARegBits(SDATA(i->src(0)).id + 1);
+      emitFlagsRd(i);
    } else
    if (df == FILE_FLAGS) {
       code[0] = 0x00000001;
       code[1] = 0xa0000000;
-      defId(i->def[0], 4);
-      srcId(i->src[0], 9);
+      defId(i->def(0), 4);
+      srcId(i->src(0), 9);
       emitFlagsRd(i);
    } else
    if (sf == FILE_IMMEDIATE) {
@@ -598,10 +754,12 @@ CodeEmitterNV50::emitMOV(const Instruction *i)
          code[0] = 0x10008000;
       } else {
          code[0] = 0x10000001;
-         code[1] = 0x04000000 | (i->lanes << 14);
+         code[1] = (typeSizeof(i->dType) == 2) ? 0 : 0x04000000;
+         code[1] |= (i->lanes << 14);
+         emitFlagsRd(i);
       }
-      defId(i->def[0], 2);
-      srcId(i->src[0], 9);
+      defId(i->def(0), 2);
+      srcId(i->src(0), 9);
    }
    if (df == FILE_SHADER_OUTPUT) {
       assert(i->encSize == 8);
@@ -628,7 +786,7 @@ CodeEmitterNV50::emitQUADOP(const Instruction *i, uint8_t lane, uint8_t quOp)
    emitForm_ADD(i);
 
    if (!i->srcExists(1))
-      srcId(i->src[0], 32 + 14);
+      srcId(i->src(0), 32 + 14);
 }
 
 void
@@ -637,8 +795,8 @@ CodeEmitterNV50::emitPFETCH(const Instruction *i)
    code[0] = 0x11800001;
    code[1] = 0x04200000 | (0xf << 14);
 
-   defId(i->def[0], 2);
-   srcAddr8(i->src[0], 9);
+   defId(i->def(0), 2);
+   srcAddr8(i->src(0), 9);
    setAReg16(i, 0);
 }
 
@@ -647,27 +805,27 @@ CodeEmitterNV50::emitINTERP(const Instruction *i)
 {
    code[0] = 0x80000000;
 
-   defId(i->def[0], 2);
-   srcAddr8(i->src[0], 16);
+   defId(i->def(0), 2);
+   srcAddr8(i->src(0), 16);
 
    if (i->getInterpMode() == NV50_IR_INTERP_FLAT) {
       code[0] |= 1 << 8;
    } else {
       if (i->op == OP_PINTERP) {
          code[0] |= 1 << 25;
-         srcId(i->src[1], 9);
+         srcId(i->src(1), 9);
       }
       if (i->getSampleMode() == NV50_IR_INTERP_CENTROID)
          code[0] |= 1 << 24;
    }
 
    if (i->encSize == 8) {
-      emitFlagsRd(i);
-      code[1] |=
+      code[1] =
          (code[0] & (3 << 24)) >> (24 - 16) |
-         (code[0] & (1 <<  8)) >> (18 -  8);
+         (code[0] & (1 <<  8)) << (18 -  8);
       code[0] &= ~0x03000100;
       code[0] |= 1;
+      emitFlagsRd(i);
    }
 }
 
@@ -693,8 +851,8 @@ CodeEmitterNV50::emitMINMAX(const Instruction *i)
          assert(0);
          break;
       }
-      code[1] |= i->src[0].mod.abs() << 20;
-      code[1] |= i->src[1].mod.abs() << 19;
+      code[1] |= i->src(0).mod.abs() << 20;
+      code[1] |= i->src(1).mod.abs() << 19;
    }
    emitForm_MAD(i);
 }
@@ -702,8 +860,8 @@ CodeEmitterNV50::emitMINMAX(const Instruction *i)
 void
 CodeEmitterNV50::emitFMAD(const Instruction *i)
 {
-   const int neg_mul = i->src[0].mod.neg() ^ i->src[1].mod.neg();
-   const int neg_add = i->src[2].mod.neg();
+   const int neg_mul = i->src(0).mod.neg() ^ i->src(1).mod.neg();
+   const int neg_add = i->src(2).mod.neg();
 
    code[0] = 0xe0000000;
 
@@ -711,30 +869,32 @@ CodeEmitterNV50::emitFMAD(const Instruction *i)
       emitForm_MUL(i);
       assert(!neg_mul && !neg_add);
    } else {
-      emitForm_MAD(i);
-      code[1] |= neg_mul << 26;
+      code[1]  = neg_mul << 26;
       code[1] |= neg_add << 27;
       if (i->saturate)
          code[1] |= 1 << 29;
+      emitForm_MAD(i);
    }
 }
 
 void
 CodeEmitterNV50::emitFADD(const Instruction *i)
 {
-   const int neg0 = i->src[0].mod.neg();
-   const int neg1 = i->src[1].mod.neg() ^ ((i->op == OP_SUB) ? 1 : 0);
+   const int neg0 = i->src(0).mod.neg();
+   const int neg1 = i->src(1).mod.neg() ^ ((i->op == OP_SUB) ? 1 : 0);
 
    code[0] = 0xb0000000;
 
-   assert(!(i->src[0].mod | i->src[1].mod).abs());
+   assert(!(i->src(0).mod | i->src(1).mod).abs());
 
-   if (i->src[1].getFile() == FILE_IMMEDIATE) {
+   if (i->src(1).getFile() == FILE_IMMEDIATE) {
+      code[1] = 0;
       emitForm_IMM(i);
       code[0] |= neg0 << 15;
       code[0] |= neg1 << 22;
    } else
    if (i->encSize == 8) {
+      code[1] = 0;
       emitForm_ADD(i);
       code[1] |= neg0 << 26;
       code[1] |= neg1 << 27;
@@ -744,27 +904,40 @@ CodeEmitterNV50::emitFADD(const Instruction *i)
       emitForm_MUL(i);
       code[0] |= neg0 << 15;
       code[0] |= neg1 << 22;
+      if (i->saturate)
+         code[0] |= 1 << 8;
    }
 }
 
 void
 CodeEmitterNV50::emitUADD(const Instruction *i)
 {
+   const int neg0 = i->src(0).mod.neg();
+   const int neg1 = i->src(1).mod.neg() ^ ((i->op == OP_SUB) ? 1 : 0);
+
    code[0] = 0x20008000;
 
-   if (i->src[0].getFile() == FILE_IMMEDIATE) {
+   if (i->src(1).getFile() == FILE_IMMEDIATE) {
+      code[1] = 0;
       emitForm_IMM(i);
    } else
    if (i->encSize == 8) {
       code[0] = 0x20000000;
-      code[1] = 0x04000000;
+      code[1] = (typeSizeof(i->dType) == 2) ? 0 : 0x04000000;
       emitForm_ADD(i);
    } else {
       emitForm_MUL(i);
    }
-   assert(!(i->src[0].mod.neg() && i->src[1].mod.neg()));
-   code[0] |= i->src[0].mod.neg() << 28;
-   code[0] |= i->src[1].mod.neg() << 22;
+   assert(!(neg0 && neg1));
+   code[0] |= neg0 << 28;
+   code[0] |= neg1 << 22;
+
+   if (i->flagsSrc >= 0) {
+      // addc == sub | subr
+      assert(!(code[0] & 0x10400000) && !i->getPredicate());
+      code[0] |= 0x10400000;
+      srcId(i->src(i->flagsSrc), 32 + 12);
+   }
 }
 
 void
@@ -775,30 +948,47 @@ CodeEmitterNV50::emitAADD(const Instruction *i)
    code[0] = 0xd0000001 | (i->getSrc(s)->reg.data.u16 << 9);
    code[1] = 0x20000000;
 
-   code[0] |= (DDATA(i->def[0]).id + 1) << 2;
+   code[0] |= (DDATA(i->def(0)).id + 1) << 2;
 
    emitFlagsRd(i);
 
    if (s && i->srcExists(0))
-      setARegBits(SDATA(i->src[0]).id + 1);
+      setARegBits(SDATA(i->src(0)).id + 1);
+}
+
+void
+CodeEmitterNV50::emitIMUL(const Instruction *i)
+{
+   code[0] = 0x40000000;
+
+   if (i->encSize == 8) {
+      code[1] = (i->sType == TYPE_S16) ? (0x8000 | 0x4000) : 0x0000;
+      emitForm_MAD(i);
+   } else {
+      if (i->sType == TYPE_S16)
+         code[0] |= 0x8100;
+      emitForm_MUL(i);
+   }
 }
 
 void
 CodeEmitterNV50::emitFMUL(const Instruction *i)
 {
-   const int neg = (i->src[0].mod ^ i->src[1].mod).neg();
+   const int neg = (i->src(0).mod ^ i->src(1).mod).neg();
 
    code[0] = 0xc0000000;
 
-   if (i->src[0].getFile() == FILE_IMMEDIATE) {
+   if (i->src(1).getFile() == FILE_IMMEDIATE) {
+      code[1] = 0;
       emitForm_IMM(i);
       if (neg)
          code[0] |= 0x8000;
    } else
    if (i->encSize == 8) {
-      emitForm_MAD(i);
+      code[1] = i->rnd == ROUND_Z ? 0x0000c000 : 0;
       if (neg)
          code[1] |= 0x08000000;
+      emitForm_MAD(i);
    } else {
       emitForm_MUL(i);
       if (neg)
@@ -807,12 +997,38 @@ CodeEmitterNV50::emitFMUL(const Instruction *i)
 }
 
 void
+CodeEmitterNV50::emitIMAD(const Instruction *i)
+{
+   code[0] = 0x60000000;
+   if (isSignedType(i->sType))
+      code[1] = i->saturate ? 0x40000000 : 0x20000000;
+   else
+      code[1] = 0x00000000;
+
+   int neg1 = i->src(0).mod.neg() ^ i->src(1).mod.neg();
+   int neg2 = i->src(2).mod.neg();
+
+   assert(!(neg1 & neg2));
+   code[1] |= neg1 << 27;
+   code[1] |= neg2 << 26;
+
+   emitForm_MAD(i);
+
+   if (i->flagsSrc >= 0) {
+      // add with carry from $cX
+      assert(!(code[1] & 0x0c000000) && !i->getPredicate());
+      code[1] |= 0xc << 24;
+      srcId(i->src(i->flagsSrc), 32 + 12);
+   }
+}
+
+void
 CodeEmitterNV50::emitSET(const Instruction *i)
 {
    code[0] = 0x30000000;
    code[1] = 0x60000000;
 
-   emitCondCode(i->asCmp()->setCond, 32 + 14);
+   emitCondCode(i->asCmp()->setCond, i->sType, 32 + 14);
 
    switch (i->sType) {
    case TYPE_F32: code[0] |= 0x80000000; break;
@@ -824,6 +1040,11 @@ CodeEmitterNV50::emitSET(const Instruction *i)
       assert(0);
       break;
    }
+   if (i->src(0).mod.neg()) code[1] |= 0x04000000;
+   if (i->src(1).mod.neg()) code[1] |= 0x08000000;
+   if (i->src(0).mod.abs()) code[1] |= 0x00100000;
+   if (i->src(1).mod.abs()) code[1] |= 0x00080000;
+
    emitForm_MAD(i);
 }
 
@@ -938,6 +1159,7 @@ CodeEmitterNV50::emitCVT(const Instruction *i)
          assert(0);
          break;
       }
+      break;
    case TYPE_S16:
    case TYPE_U16:
    case TYPE_S8:
@@ -958,12 +1180,12 @@ CodeEmitterNV50::emitCVT(const Instruction *i)
    default:
       break;
    }
-   code[1] ^= i->src[0].mod.neg() << 29;
-   code[1] |= i->src[0].mod.abs() << 20;
+   code[1] ^= i->src(0).mod.neg() << 29;
+   code[1] |= i->src(0).mod.abs() << 20;
    if (i->saturate)
       code[1] |= 1 << 19;
 
-   assert(i->op != OP_ABS || !i->src[0].mod.neg());
+   assert(i->op != OP_ABS || !i->src(0).mod.neg());
 
    emitForm_MAD(i);
 }
@@ -974,8 +1196,8 @@ CodeEmitterNV50::emitPreOp(const Instruction *i)
    code[0] = 0xb0000000;
    code[1] = (i->op == OP_PREEX2) ? 0xc0004000 : 0xc0000000;
 
-   code[1] |= i->src[0].mod.abs() << 20;
-   code[1] |= i->src[0].mod.neg() << 26;
+   code[1] |= i->src(0).mod.abs() << 20;
+   code[1] |= i->src(0).mod.neg() << 26;
 
    emitForm_MAD(i);
 }
@@ -990,18 +1212,37 @@ CodeEmitterNV50::emitSFnOp(const Instruction *i, uint8_t subOp)
       emitForm_MUL(i);
    } else {
       code[1] = subOp << 29;
-      code[1] |= i->src[0].mod.abs() << 20;
-      code[1] |= i->src[0].mod.neg() << 26;
+      code[1] |= i->src(0).mod.abs() << 20;
+      code[1] |= i->src(0).mod.neg() << 26;
       emitForm_MAD(i);
    }
 }
 
 void
+CodeEmitterNV50::emitNOT(const Instruction *i)
+{
+   code[0] = 0xd0000000;
+   code[1] = 0x0002c000;
+
+   switch (i->sType) {
+   case TYPE_U32:
+   case TYPE_S32:
+      code[1] |= 0x04000000;
+      break;
+   default:
+      break;
+   }
+   emitForm_MAD(i);
+   setSrc(i, 0, 1);
+}
+
+void
 CodeEmitterNV50::emitLogicOp(const Instruction *i)
 {
    code[0] = 0xd0000000;
+   code[1] = 0;
 
-   if (i->src[1].getFile() == FILE_IMMEDIATE) {
+   if (i->src(1).getFile() == FILE_IMMEDIATE) {
       switch (i->op) {
       case OP_OR:  code[0] |= 0x0100; break;
       case OP_XOR: code[0] |= 0x8000; break;
@@ -1019,37 +1260,45 @@ CodeEmitterNV50::emitLogicOp(const Instruction *i)
          assert(0);
          break;
       }
+      if (i->src(0).mod & Modifier(NV50_IR_MOD_NOT))
+         code[1] |= 1 << 16;
+      if (i->src(1).mod & Modifier(NV50_IR_MOD_NOT))
+         code[1] |= 1 << 17;
+
       emitForm_MAD(i);
    }
 }
 
 void
-CodeEmitterNV50::emitARL(const Instruction *i)
+CodeEmitterNV50::emitARL(const Instruction *i, unsigned int shl)
 {
-   assert(i->src[1].getFile() == FILE_IMMEDIATE);
-
-   code[0] = 0x00000001 | (i->getSrc(1)->reg.data.u32 & 0x3f) << 16;
+   code[0] = 0x00000001 | (shl << 16);
    code[1] = 0xc0000000;
 
-   code[0] |= (DDATA(i->def[0]).id + 1) << 2;
-   emitSrc0(i->src[0]);
+   code[0] |= (DDATA(i->def(0)).id + 1) << 2;
+
+   setSrcFileBits(i, NV50_OP_ENC_IMM);
+   setSrc(i, 0, 0);
    emitFlagsRd(i);
 }
 
 void
 CodeEmitterNV50::emitShift(const Instruction *i)
 {
-   if (i->def[0].getFile() == FILE_ADDRESS) {
-      emitARL(i);
+   if (i->def(0).getFile() == FILE_ADDRESS) {
+      assert(i->srcExists(1) && i->src(1).getFile() == FILE_IMMEDIATE);
+      emitARL(i, i->getSrc(1)->reg.data.u32 & 0x3f);
    } else {
       code[0] = 0x30000001;
       code[1] = (i->op == OP_SHR) ? 0xe4000000 : 0xc4000000;
-      if (isSignedType(i->sType))
+      if (i->op == OP_SHR && isSignedType(i->sType))
           code[1] |= 1 << 27;
 
-      if (i->src[1].getFile() == FILE_IMMEDIATE) {
+      if (i->src(1).getFile() == FILE_IMMEDIATE) {
          code[1] |= 1 << 20;
          code[0] |= (i->getSrc(1)->reg.data.u32 & 0x7f) << 16;
+         defId(i->def(0), 2);
+         srcId(i->src(0), 9);
          emitFlagsRd(i);
       } else {
          emitForm_MAD(i);
@@ -1080,7 +1329,7 @@ CodeEmitterNV50::emitTEX(const TexInstruction *i)
       code[1] = 0x40000000;
       break;
    case OP_TXF:
-      code[0] = 0x01000000;
+      code[0] |= 0x01000000;
       break;
    case OP_TXG:
       code[0] = 0x01000000;
@@ -1096,7 +1345,7 @@ CodeEmitterNV50::emitTEX(const TexInstruction *i)
 
    int argc = i->tex.target.getArgCount();
 
-   if (i->op == OP_TXB || i->op == OP_TXL)
+   if (i->op == OP_TXB || i->op == OP_TXL || i->op == OP_TXF)
       argc += 1;
    if (i->tex.target.isShadow())
       argc += 1;
@@ -1108,9 +1357,9 @@ CodeEmitterNV50::emitTEX(const TexInstruction *i)
       code[0] |= 0x08000000;
    } else
    if (i->tex.useOffsets) {
-      code[1] |= (i->tex.offset[0][0] & 0xf) << 16;
+      code[1] |= (i->tex.offset[0][0] & 0xf) << 24;
       code[1] |= (i->tex.offset[0][1] & 0xf) << 20;
-      code[1] |= (i->tex.offset[0][2] & 0xf) << 24;
+      code[1] |= (i->tex.offset[0][2] & 0xf) << 16;
    }
 
    code[0] |= (i->tex.mask & 0x3) << 25;
@@ -1119,27 +1368,100 @@ CodeEmitterNV50::emitTEX(const TexInstruction *i)
    if (i->tex.liveOnly)
       code[1] |= 4;
 
-   defId(i->def[0], 2);
+   defId(i->def(0), 2);
+
+   emitFlagsRd(i);
+}
+
+void
+CodeEmitterNV50::emitTXQ(const TexInstruction *i)
+{
+   assert(i->tex.query == TXQ_DIMS);
+
+   code[0] = 0xf0000001;
+   code[1] = 0x60000000;
+
+   code[0] |= i->tex.r << 9;
+   code[0] |= i->tex.s << 17;
+
+   code[0] |= (i->tex.mask & 0x3) << 25;
+   code[1] |= (i->tex.mask & 0xc) << 12;
+
+   defId(i->def(0), 2);
 
    emitFlagsRd(i);
 }
 
 void
+CodeEmitterNV50::emitPRERETEmu(const FlowInstruction *i)
+{
+   uint32_t pos = i->target.bb->binPos + 8; // +8 to skip an op */
+
+   code[0] = 0x10000003; // bra
+   code[1] = 0x00000780; // always
+
+   switch (i->subOp) {
+   case NV50_IR_SUBOP_EMU_PRERET + 0: // bra to the call
+      break;
+   case NV50_IR_SUBOP_EMU_PRERET + 1: // bra to skip the call
+      pos += 8;
+      break;
+   default:
+      assert(i->subOp == (NV50_IR_SUBOP_EMU_PRERET + 2));
+      code[0] = 0x20000003; // call
+      code[1] = 0x00000000; // no predicate
+      break;
+   }
+   addReloc(RelocEntry::TYPE_CODE, 0, pos, 0x07fff800, 9);
+   addReloc(RelocEntry::TYPE_CODE, 1, pos, 0x000fc000, -4);
+}
+
+void
 CodeEmitterNV50::emitFlow(const Instruction *i, uint8_t flowOp)
 {
    const FlowInstruction *f = i->asFlow();
+   bool hasPred = false;
+   bool hasTarg = false;
 
    code[0] = 0x00000003 | (flowOp << 28);
    code[1] = 0x00000000;
 
-   emitFlagsRd(i);
+   switch (i->op) {
+   case OP_BRA:
+      hasPred = true;
+      hasTarg = true;
+      break;
+   case OP_BREAK:
+   case OP_BRKPT:
+   case OP_DISCARD:
+   case OP_RET:
+      hasPred = true;
+      break;
+   case OP_CALL:
+   case OP_PREBREAK:
+   case OP_JOINAT:
+      hasTarg = true;
+      break;
+   case OP_PRERET:
+      hasTarg = true;
+      if (i->subOp >= NV50_IR_SUBOP_EMU_PRERET) {
+         emitPRERETEmu(f);
+         return;
+      }
+      break;
+   default:
+      break;
+   }
+
+   if (hasPred)
+      emitFlagsRd(i);
 
-   if (f && f->target.bb) {
+   if (hasTarg && f) {
       uint32_t pos;
 
       if (f->op == OP_CALL) {
          if (f->builtin) {
-            pos = 0; // XXX: TODO
+            pos = targ->getBuiltinOffset(f->target.builtin);
          } else {
             pos = f->target.fn->binPos;
          }
@@ -1149,6 +1471,13 @@ CodeEmitterNV50::emitFlow(const Instruction *i, uint8_t flowOp)
 
       code[0] |= ((pos >>  2) & 0xffff) << 11;
       code[1] |= ((pos >> 18) & 0x003f) << 14;
+
+      RelocEntry::Type relocTy;
+
+      relocTy = f->builtin ? RelocEntry::TYPE_BUILTIN : RelocEntry::TYPE_CODE;
+
+      addReloc(relocTy, 0, pos, 0x07fff800, 9);
+      addReloc(relocTy, 1, pos, 0x000fc000, -4);
    }
 }
 
@@ -1164,10 +1493,15 @@ CodeEmitterNV50::emitInstruction(Instruction *insn)
       return false;
    }
 
+   if (insn->bb->getProgram()->dbgFlags & NV50_IR_DEBUG_BASIC) {
+      INFO("EMIT: "); insn->print();
+   }
+
    switch (insn->op) {
    case OP_MOV:
       emitMOV(insn);
       break;
+   case OP_EXIT:
    case OP_NOP:
    case OP_JOIN:
       emitNOP();
@@ -1191,6 +1525,8 @@ CodeEmitterNV50::emitInstruction(Instruction *insn)
    case OP_SUB:
       if (isFloatType(insn->dType))
          emitFADD(insn);
+      else if (insn->getDef(0)->reg.file == FILE_ADDRESS)
+         emitAADD(insn);
       else
          emitUADD(insn);
       break;
@@ -1198,18 +1534,30 @@ CodeEmitterNV50::emitInstruction(Instruction *insn)
       if (isFloatType(insn->dType))
          emitFMUL(insn);
       else
-         emitUMUL(insn);
+         emitIMUL(insn);
       break;
    case OP_MAD:
    case OP_FMA:
-      emitFMAD(insn);
+      if (isFloatType(insn->dType))
+         emitFMAD(insn);
+      else
+         emitIMAD(insn);
       break;
+   case OP_NOT:
+      emitNOT(insn);
       break;
    case OP_AND:
    case OP_OR:
    case OP_XOR:
       emitLogicOp(insn);
       break;
+   case OP_SHL:
+   case OP_SHR:
+      emitShift(insn);
+      break;
+   case OP_SET:
+      emitSET(insn);
+      break;
    case OP_MIN:
    case OP_MAX:
       emitMINMAX(insn);
@@ -1217,9 +1565,22 @@ CodeEmitterNV50::emitInstruction(Instruction *insn)
    case OP_CEIL:
    case OP_FLOOR:
    case OP_TRUNC:
-   case OP_CVT:
+   case OP_ABS:
+   case OP_NEG:
+   case OP_SAT:
       emitCVT(insn);
       break;
+   case OP_CVT:
+      if (insn->def(0).getFile() == FILE_ADDRESS)
+         emitARL(insn, 0);
+      else
+      if (insn->def(0).getFile() == FILE_FLAGS ||
+          insn->src(0).getFile() == FILE_FLAGS ||
+          insn->src(0).getFile() == FILE_ADDRESS)
+         emitMOV(insn);
+      else
+         emitCVT(insn);
+      break;
    case OP_RCP:
       emitSFnOp(insn, 0);
       break;
@@ -1245,8 +1606,12 @@ CodeEmitterNV50::emitInstruction(Instruction *insn)
    case OP_TEX:
    case OP_TXB:
    case OP_TXL:
+   case OP_TXF:
       emitTEX(insn->asTex());
       break;
+   case OP_TXQ:
+      emitTXQ(insn->asTex());
+      break;
    case OP_EMIT:
    case OP_RESTART:
       emitOUT(insn);
@@ -1285,15 +1650,15 @@ CodeEmitterNV50::emitInstruction(Instruction *insn)
       emitQUADOP(insn, insn->lanes, insn->subOp);
       break;
    case OP_DFDX:
-      emitQUADOP(insn, 4, insn->src[0].mod.neg() ? 0x66 : 0x99);
+      emitQUADOP(insn, 4, insn->src(0).mod.neg() ? 0x66 : 0x99);
       break;
    case OP_DFDY:
-      emitQUADOP(insn, 5, insn->src[0].mod.neg() ? 0x5a : 0xa5);
+      emitQUADOP(insn, 5, insn->src(0).mod.neg() ? 0x5a : 0xa5);
       break;
    case OP_PHI:
    case OP_UNION:
    case OP_CONSTRAINT:
-      ERROR("operation should have been eliminated");
+      ERROR("operation should have been eliminated\n");
       return false;
    case OP_EXP:
    case OP_LOG:
@@ -1310,16 +1675,16 @@ CodeEmitterNV50::emitInstruction(Instruction *insn)
       ERROR("operation should have been lowered\n");
       return false;
    default:
-      ERROR("unknow op\n");
+      ERROR("unknown op: %u\n", insn->op);
       return false;
    }
-   if (insn->join)
+   if (insn->join || insn->op == OP_JOIN)
       code[1] |= 0x2;
    else
-   if (insn->exit)
+   if (insn->exit || insn->op == OP_EXIT)
       code[1] |= 0x1;
 
-   assert((insn->encSize == 8) == (code[1] & 1));
+   assert((insn->encSize == 8) == (code[0] & 1));
 
    code += insn->encSize / 4;
    codeSize += insn->encSize;
@@ -1331,20 +1696,147 @@ CodeEmitterNV50::getMinEncodingSize(const Instruction *i) const
 {
    const Target::OpInfo &info = targ->getOpInfo(i);
 
-   if (info.minEncSize == 8)
+   if (info.minEncSize > 4)
+      return 8;
+
+   // check constraints on dst and src operands
+   for (int d = 0; i->defExists(d); ++d) {
+      if (i->def(d).rep()->reg.data.id > 63 ||
+          i->def(d).rep()->reg.file != FILE_GPR)
+         return 8;
+   }
+
+   for (int s = 0; i->srcExists(s); ++s) {
+      DataFile sf = i->src(s).getFile();
+      if (sf != FILE_GPR)
+         if (sf != FILE_SHADER_INPUT || progType != Program::TYPE_FRAGMENT)
+            return 8;
+      if (i->src(s).rep()->reg.data.id > 63)
+         return 8;
+   }
+
+   // check modifiers & rounding
+   if (i->join || i->lanes != 0xf || i->exit)
+      return 8;
+   if (i->op == OP_MUL && i->rnd != ROUND_N)
       return 8;
 
-   return 4;
+   if (i->asTex())
+      return 8; // TODO: short tex encoding
+
+   // check constraints on short MAD
+   if (info.srcNr >= 2 && i->srcExists(2)) {
+      if (i->saturate || i->src(2).mod)
+         return 8;
+      if ((i->src(0).mod ^ i->src(1).mod) ||
+          (i->src(0).mod | i->src(1).mod).abs())
+         return 8;
+      if (!i->defExists(0) ||
+          i->def(0).rep()->reg.data.id != i->src(2).rep()->reg.data.id)
+         return 8;
+   }
+
+   return info.minEncSize;
+}
+
+// Change the encoding size of an instruction after BBs have been scheduled.
+static void
+makeInstructionLong(Instruction *insn)
+{
+   if (insn->encSize == 8)
+      return;
+   Function *fn = insn->bb->getFunction();
+   int n = 0;
+   int adj = 4;
+
+   for (Instruction *i = insn->next; i && i->encSize == 4; ++n, i = i->next);
+
+   if (n & 1) {
+      adj = 8;
+      insn->next->encSize = 8;
+   } else
+   if (insn->prev && insn->prev->encSize == 4) {
+      adj = 8;
+      insn->prev->encSize = 8;
+   }
+   insn->encSize = 8;
+
+   for (int i = fn->bbCount - 1; i >= 0 && fn->bbArray[i] != insn->bb; --i) {
+      fn->bbArray[i]->binPos += 4;
+   }
+   fn->binSize += adj;
+   insn->bb->binSize += adj;
+}
+
+static bool
+trySetExitModifier(Instruction *insn)
+{
+   if (insn->op == OP_DISCARD ||
+       insn->op == OP_QUADON ||
+       insn->op == OP_QUADPOP)
+      return false;
+   for (int s = 0; insn->srcExists(s); ++s)
+      if (insn->src(s).getFile() == FILE_IMMEDIATE)
+         return false;
+   if (insn->asFlow()) {
+      if (insn->op == OP_CALL) // side effects !
+         return false;
+      if (insn->getPredicate()) // cannot do conditional exit (or can we ?)
+         return false;
+      insn->op = OP_EXIT;
+   }
+   insn->exit = 1;
+   makeInstructionLong(insn);
+   return true;
+}
+
+static void
+replaceExitWithModifier(Function *func)
+{
+   BasicBlock *epilogue = BasicBlock::get(func->cfgExit);
+
+   if (!epilogue->getExit() ||
+       epilogue->getExit()->op != OP_EXIT) // only main will use OP_EXIT
+      return;
+
+   if (epilogue->getEntry()->op != OP_EXIT) {
+      Instruction *insn = epilogue->getExit()->prev;
+      if (!insn || !trySetExitModifier(insn))
+         return;
+      insn->exit = 1;
+   } else {
+      for (Graph::EdgeIterator ei = func->cfgExit->incident();
+           !ei.end(); ei.next()) {
+         BasicBlock *bb = BasicBlock::get(ei.getNode());
+         Instruction *i = bb->getExit();
+
+         if (!i || !trySetExitModifier(i))
+            return;
+      }
+   }
+   epilogue->binSize -= 8;
+   func->binSize -= 8;
+   delete_Instruction(func->getProgram(), epilogue->getExit());
+}
+
+void
+CodeEmitterNV50::prepareEmission(Function *func)
+{
+   CodeEmitter::prepareEmission(func);
+
+   replaceExitWithModifier(func);
 }
 
-CodeEmitterNV50::CodeEmitterNV50(const Target *target) : targ(target)
+CodeEmitterNV50::CodeEmitterNV50(const TargetNV50 *target) : CodeEmitter(target)
 {
+   targ = target; // specialized
    code = NULL;
    codeSize = codeSizeLimit = 0;
+   relocInfo = NULL;
 }
 
 CodeEmitter *
-Target::getCodeEmitter(Program::Type type)
+TargetNV50::getCodeEmitter(Program::Type type)
 {
    CodeEmitterNV50 *emit = new CodeEmitterNV50(this);
    emit->setProgramType(type);
diff --git a/src/gallium/drivers/nv50/codegen/nv50_ir_lowering_nv50.cpp b/src/gallium/drivers/nv50/codegen/nv50_ir_lowering_nv50.cpp
new file mode 100644
index 00000000000..30d8acee3bc
--- /dev/null
+++ b/src/gallium/drivers/nv50/codegen/nv50_ir_lowering_nv50.cpp
@@ -0,0 +1,1118 @@
+/*
+ * Copyright 2011 Christoph Bumiller
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF
+ * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "nv50/codegen/nv50_ir.h"
+#include "nv50/codegen/nv50_ir_build_util.h"
+
+#include "nv50_ir_target_nv50.h"
+
+namespace nv50_ir {
+
+// nv50 doesn't support 32 bit integer multiplication
+//
+//       ah al * bh bl = LO32: (al * bh + ah * bl) << 16 + (al * bl)
+// -------------------
+//    al*bh 00           HI32: (al * bh + ah * bl) >> 16 + (ah * bh) +
+// ah*bh 00 00                 (           carry1) << 16 + ( carry2)
+//       al*bl
+//    ah*bl 00
+//
+// fffe0001 + fffe0001
+static bool
+expandIntegerMUL(BuildUtil *bld, Instruction *mul)
+{
+   const bool highResult = mul->subOp == NV50_IR_SUBOP_MUL_HIGH;
+
+   DataType fTy = mul->sType; // full type
+   DataType hTy;
+   switch (fTy) {
+   case TYPE_S32: hTy = TYPE_S16; break;
+   case TYPE_U32: hTy = TYPE_U16; break;
+   case TYPE_U64: hTy = TYPE_U32; break;
+   case TYPE_S64: hTy = TYPE_S32; break;
+   default:
+      return false;
+   }
+   unsigned int fullSize = typeSizeof(fTy);
+   unsigned int halfSize = typeSizeof(hTy);
+
+   Instruction *i[9];
+
+   Value *a[2] = { bld->getSSA(halfSize), bld->getSSA(halfSize) };
+   Value *b[2] = { bld->getSSA(halfSize), bld->getSSA(halfSize) };
+   Value *c[2];
+   Value *t[4];
+   for (int j = 0; j < 4; ++j)
+      t[j] = bld->getSSA(fullSize);
+
+   (i[0] = bld->mkOp1(OP_SPLIT, fTy, a[0], mul->getSrc(0)))->setDef(1, a[1]);
+   (i[1] = bld->mkOp1(OP_SPLIT, fTy, b[0], mul->getSrc(1)))->setDef(1, b[1]);
+
+   i[2] = bld->mkOp2(OP_MUL, fTy, t[0], a[0], b[1]);
+   i[3] = bld->mkOp3(OP_MAD, fTy, t[1], a[1], b[0], t[0]);
+   i[7] = bld->mkOp2(OP_SHL, fTy, t[2], t[1], bld->mkImm(halfSize * 8));
+   i[4] = bld->mkOp3(OP_MAD, fTy, t[3], a[0], b[0], t[2]);
+
+   if (highResult) {
+      Value *r[3];
+      Value *imm = bld->loadImm(NULL, 1 << (halfSize * 8));
+      c[0] = bld->getSSA(1, FILE_FLAGS);
+      c[1] = bld->getSSA(1, FILE_FLAGS);
+      for (int j = 0; j < 3; ++j)
+         r[j] = bld->getSSA(fullSize);
+
+      i[8] = bld->mkOp2(OP_SHR, fTy, r[0], t[1], bld->mkImm(halfSize * 8));
+      i[6] = bld->mkOp2(OP_ADD, fTy, r[1], r[0], imm);
+      bld->mkOp2(OP_UNION, TYPE_U32, r[2], r[1], r[0]);
+      i[5] = bld->mkOp3(OP_MAD, fTy, mul->getDef(0), a[1], b[1], r[2]);
+
+      // set carry defs / sources
+      i[3]->setFlagsDef(1, c[0]);
+      i[4]->setFlagsDef(0, c[1]); // actual result not required, just the carry
+      i[6]->setPredicate(CC_C, c[0]);
+      i[5]->setFlagsSrc(3, c[1]);
+   } else {
+      bld->mkMov(mul->getDef(0), t[3]);
+   }
+   delete_Instruction(bld->getProgram(), mul);
+
+   for (int j = 2; j <= (highResult ? 5 : 4); ++j)
+      i[j]->sType = hTy;
+
+   return true;
+}
+
+#define QOP_ADD  0
+#define QOP_SUBR 1
+#define QOP_SUB  2
+#define QOP_MOV2 3
+
+#define QUADOP(q, r, s, t)            \
+   ((QOP_##q << 0) | (QOP_##r << 2) | \
+    (QOP_##s << 4) | (QOP_##t << 6))
+
+class NV50LegalizePostRA : public Pass
+{
+private:
+   virtual bool visit(Function *);
+   virtual bool visit(BasicBlock *);
+
+   void handlePRERET(FlowInstruction *);
+   void replaceZero(Instruction *);
+   void split64BitOp(Instruction *);
+
+   LValue *r63;
+};
+
+bool
+NV50LegalizePostRA::visit(Function *fn)
+{
+   Program *prog = fn->getProgram();
+
+   r63 = new_LValue(fn, FILE_GPR);
+   r63->reg.data.id = 63;
+
+   // this is actually per-program, but we can do it all on visiting main()
+   std::list<Instruction *> *outWrites =
+      reinterpret_cast<std::list<Instruction *> *>(prog->targetPriv);
+
+   if (outWrites) {
+      for (std::list<Instruction *>::iterator it = outWrites->begin();
+           it != outWrites->end(); ++it)
+         (*it)->getSrc(1)->defs.front()->getInsn()->setDef(0, (*it)->getSrc(0));
+      // instructions will be deleted on exit
+      outWrites->clear();
+   }
+
+   return true;
+}
+
+void
+NV50LegalizePostRA::replaceZero(Instruction *i)
+{
+   for (int s = 0; i->srcExists(s); ++s) {
+      ImmediateValue *imm = i->getSrc(s)->asImm();
+      if (imm && imm->reg.data.u64 == 0)
+         i->setSrc(s, r63);
+   }
+}
+
+void
+NV50LegalizePostRA::split64BitOp(Instruction *i)
+{
+   if (i->dType == TYPE_F64) {
+      if (i->op == OP_MAD)
+         i->op = OP_FMA;
+      if (i->op == OP_ADD || i->op == OP_MUL || i->op == OP_FMA ||
+          i->op == OP_CVT || i->op == OP_MIN || i->op == OP_MAX ||
+          i->op == OP_SET)
+         return;
+      i->dType = i->sType = TYPE_U32;
+
+      i->bb->insertAfter(i, cloneForward(func, i));
+   }
+}
+
+// Emulate PRERET: jump to the target and call to the origin from there
+//
+// WARNING: atm only works if BBs are affected by at most a single PRERET
+//
+// BB:0
+// preret BB:3
+// (...)
+// BB:3
+// (...)
+//             --->
+// BB:0
+// bra BB:3 + n0 (directly to the call; move to beginning of BB and fixate)
+// (...)
+// BB:3
+// bra BB:3 + n1 (skip the call)
+// call BB:0 + n2 (skip bra at beginning of BB:0)
+// (...)
+void
+NV50LegalizePostRA::handlePRERET(FlowInstruction *pre)
+{
+   BasicBlock *bbE = pre->bb;
+   BasicBlock *bbT = pre->target.bb;
+
+   pre->subOp = NV50_IR_SUBOP_EMU_PRERET + 0;
+   bbE->remove(pre);
+   bbE->insertHead(pre);
+
+   Instruction *skip = new_FlowInstruction(func, OP_PRERET, bbT);
+   Instruction *call = new_FlowInstruction(func, OP_PRERET, bbE);
+
+   bbT->insertHead(call);
+   bbT->insertHead(skip);
+
+   // NOTE: maybe split blocks to prevent the instructions from moving ?
+
+   skip->subOp = NV50_IR_SUBOP_EMU_PRERET + 1;
+   call->subOp = NV50_IR_SUBOP_EMU_PRERET + 2;
+}
+
+bool
+NV50LegalizePostRA::visit(BasicBlock *bb)
+{
+   Instruction *i, *next;
+
+   // remove pseudo operations and non-fixed no-ops, split 64 bit operations
+   for (i = bb->getFirst(); i; i = next) {
+      next = i->next;
+      if (i->isNop()) {
+         bb->remove(i);
+      } else
+      if (i->op == OP_PRERET && prog->getTarget()->getChipset() < 0xa0) {
+         handlePRERET(i->asFlow());
+      } else {
+         if (i->op != OP_MOV && i->op != OP_PFETCH &&
+             (!i->defExists(0) || i->def(0).getFile() != FILE_ADDRESS))
+            replaceZero(i);
+         if (typeSizeof(i->dType) == 8)
+            split64BitOp(i);
+      }
+   }
+   if (!bb->getEntry())
+      return true;
+
+   return true;
+}
+
+class NV50LegalizeSSA : public Pass
+{
+public:
+   NV50LegalizeSSA(Program *);
+
+   virtual bool visit(BasicBlock *bb);
+
+private:
+   void propagateWriteToOutput(Instruction *);
+   void handleDIV(Instruction *);
+   void handleMOD(Instruction *);
+   void handleMUL(Instruction *);
+   void handleAddrDef(Instruction *);
+
+   inline bool isARL(const Instruction *) const;
+
+   BuildUtil bld;
+
+   std::list<Instruction *> *outWrites;
+};
+
+NV50LegalizeSSA::NV50LegalizeSSA(Program *prog)
+{
+   bld.setProgram(prog);
+
+   if (prog->optLevel >= 2 &&
+       (prog->getType() == Program::TYPE_GEOMETRY ||
+        prog->getType() == Program::TYPE_VERTEX))
+      outWrites =
+         reinterpret_cast<std::list<Instruction *> *>(prog->targetPriv);
+   else
+      outWrites = NULL;
+}
+
+void
+NV50LegalizeSSA::propagateWriteToOutput(Instruction *st)
+{
+   if (st->src(0).isIndirect(0) || st->getSrc(1)->refCount() != 1)
+      return;
+
+   // check def instruction can store
+   Instruction *di = st->getSrc(1)->defs.front()->getInsn();
+
+   // TODO: move exports (if beneficial) in common opt pass
+   if (di->isPseudo() || isTextureOp(di->op) || di->defCount(0xff, true) > 1)
+      return;
+   for (int s = 0; di->srcExists(s); ++s)
+      if (di->src(s).getFile() == FILE_IMMEDIATE)
+         return;
+
+   // We cannot set defs to non-lvalues before register allocation, so
+   // save & remove (to save registers) the exports and replace later.
+   outWrites->push_back(st);
+   st->bb->remove(st);
+}
+
+bool
+NV50LegalizeSSA::isARL(const Instruction *i) const
+{
+   ImmediateValue imm;
+
+   if (i->op != OP_SHL || i->src(0).getFile() != FILE_GPR)
+      return false;
+   if (!i->src(1).getImmediate(imm))
+      return false;
+   return imm.isInteger(0);
+}
+
+void
+NV50LegalizeSSA::handleAddrDef(Instruction *i)
+{
+   Instruction *arl;
+
+   i->getDef(0)->reg.size = 2; // $aX are only 16 bit
+
+   // only ADDR <- SHL(GPR, IMM) and ADDR <- ADD(ADDR, IMM) are valid
+   if (i->srcExists(1) && i->src(1).getFile() == FILE_IMMEDIATE) {
+      if (i->op == OP_SHL && i->src(0).getFile() == FILE_GPR)
+         return;
+      if (i->op == OP_ADD && i->src(0).getFile() == FILE_ADDRESS)
+         return;
+   }
+
+   // turn $a sources into $r sources (can't operate on $a)
+   for (int s = 0; i->srcExists(s); ++s) {
+      Value *a = i->getSrc(s);
+      Value *r;
+      if (a->reg.file == FILE_ADDRESS) {
+         if (a->getInsn() && isARL(a->getInsn())) {
+            i->setSrc(s, a->getInsn()->getSrc(0));
+         } else {
+            bld.setPosition(i, false);
+            r = bld.getSSA();
+            bld.mkMov(r, a);
+            i->setSrc(s, r);
+         }
+      }
+   }
+   if (i->op == OP_SHL && i->src(1).getFile() == FILE_IMMEDIATE)
+      return;
+
+   // turn result back into $a
+   bld.setPosition(i, true);
+   arl = bld.mkOp2(OP_SHL, TYPE_U32, i->getDef(0), bld.getSSA(), bld.mkImm(0));
+   i->setDef(0, arl->getSrc(0));
+}
+
+void
+NV50LegalizeSSA::handleMUL(Instruction *mul)
+{
+   if (isFloatType(mul->sType) || typeSizeof(mul->sType) <= 2)
+      return;
+   Value *def = mul->getDef(0);
+   Value *pred = mul->getPredicate();
+   CondCode cc = mul->cc;
+   if (pred)
+      mul->setPredicate(CC_ALWAYS, NULL);
+
+   if (mul->op == OP_MAD) {
+      Instruction *add = mul;
+      bld.setPosition(add, false);
+      Value *res = cloneShallow(func, mul->getDef(0));
+      mul = bld.mkOp2(OP_MUL, add->sType, res, add->getSrc(0), add->getSrc(1));
+      add->op = OP_ADD;
+      add->setSrc(0, mul->getDef(0));
+      add->setSrc(1, add->getSrc(2));
+      for (int s = 2; add->srcExists(s); ++s)
+         add->setSrc(s, NULL);
+      mul->subOp = add->subOp;
+      add->subOp = 0;
+   }
+   expandIntegerMUL(&bld, mul);
+   if (pred)
+      def->getInsn()->setPredicate(cc, pred);
+}
+
+// Use f32 division: first compute an approximate result, use it to reduce
+// the dividend, which should then be representable as f32, divide the reduced
+// dividend, and add the quotients.
+void
+NV50LegalizeSSA::handleDIV(Instruction *div)
+{
+   const DataType ty = div->sType;
+
+   if (ty != TYPE_U32 && ty != TYPE_S32)
+      return;
+
+   Value *q, *q0, *qf, *aR, *aRf, *qRf, *qR, *t, *s, *m, *cond;
+
+   bld.setPosition(div, false);
+
+   Value *a, *af = bld.getSSA();
+   Value *b, *bf = bld.getSSA();
+
+   bld.mkCvt(OP_CVT, TYPE_F32, af, ty, div->getSrc(0));
+   bld.mkCvt(OP_CVT, TYPE_F32, bf, ty, div->getSrc(1));
+
+   if (isSignedType(ty)) {
+      af->getInsn()->src(0).mod = Modifier(NV50_IR_MOD_ABS);
+      bf->getInsn()->src(0).mod = Modifier(NV50_IR_MOD_ABS);
+      a = bld.getSSA();
+      b = bld.getSSA();
+      bld.mkOp1(OP_ABS, ty, a, div->getSrc(0));
+      bld.mkOp1(OP_ABS, ty, b, div->getSrc(1));
+   } else {
+      a = div->getSrc(0);
+      b = div->getSrc(1);
+   }
+
+   bf = bld.mkOp1v(OP_RCP, TYPE_F32, bld.getSSA(), bf);
+   bf = bld.mkOp2v(OP_ADD, TYPE_U32, bld.getSSA(), bf, bld.mkImm(-2));
+
+   bld.mkOp2(OP_MUL, TYPE_F32, (qf = bld.getSSA()), af, bf)->rnd = ROUND_Z;
+   bld.mkCvt(OP_CVT, ty, (q0 = bld.getSSA()), TYPE_F32, qf)->rnd = ROUND_Z;
+
+   // get error of 1st result
+   expandIntegerMUL(&bld,
+      bld.mkOp2(OP_MUL, TYPE_U32, (t = bld.getSSA()), q0, b));
+   bld.mkOp2(OP_SUB, TYPE_U32, (aRf = bld.getSSA()), a, t);
+
+   bld.mkCvt(OP_CVT, TYPE_F32, (aR = bld.getSSA()), TYPE_U32, aRf);
+
+   bld.mkOp2(OP_MUL, TYPE_F32, (qRf = bld.getSSA()), aR, bf)->rnd = ROUND_Z;
+   bld.mkCvt(OP_CVT, TYPE_U32, (qR = bld.getSSA()), TYPE_F32, qRf)
+      ->rnd = ROUND_Z;
+   bld.mkOp2(OP_ADD, ty, (q = bld.getSSA()), q0, qR); // add quotients
+
+   // correction: if modulus >= divisor, add 1
+   expandIntegerMUL(&bld,
+      bld.mkOp2(OP_MUL, TYPE_U32, (t = bld.getSSA()), q, b));
+   bld.mkOp2(OP_SUB, TYPE_U32, (m = bld.getSSA()), a, t);
+   bld.mkCmp(OP_SET, CC_GE, TYPE_U32, (s = bld.getSSA()), m, b);
+   if (!isSignedType(ty)) {
+      div->op = OP_SUB;
+      div->setSrc(0, q);
+      div->setSrc(1, s);
+   } else {
+      t = q;
+      bld.mkOp2(OP_SUB, TYPE_U32, (q = bld.getSSA()), t, s);
+      s = bld.getSSA();
+      t = bld.getSSA();
+      // fix the sign
+      bld.mkOp2(OP_XOR, TYPE_U32, NULL, div->getSrc(0), div->getSrc(1))
+         ->setFlagsDef(0, (cond = bld.getSSA(1, FILE_FLAGS)));
+      bld.mkOp1(OP_NEG, ty, s, q)->setPredicate(CC_S, cond);
+      bld.mkOp1(OP_MOV, ty, t, q)->setPredicate(CC_NS, cond);
+
+      div->op = OP_UNION;
+      div->setSrc(0, s);
+      div->setSrc(1, t);
+   }
+}
+
+void
+NV50LegalizeSSA::handleMOD(Instruction *mod)
+{
+   if (mod->dType != TYPE_U32 && mod->dType != TYPE_S32)
+      return;
+   bld.setPosition(mod, false);
+
+   Value *q = bld.getSSA();
+   Value *m = bld.getSSA();
+
+   bld.mkOp2(OP_DIV, mod->dType, q, mod->getSrc(0), mod->getSrc(1));
+   handleDIV(q->getInsn());
+
+   bld.setPosition(mod, false);
+   expandIntegerMUL(&bld, bld.mkOp2(OP_MUL, TYPE_U32, m, q, mod->getSrc(1)));
+
+   mod->op = OP_SUB;
+   mod->setSrc(1, m);
+}
+
+bool
+NV50LegalizeSSA::visit(BasicBlock *bb)
+{
+   Instruction *insn, *next;
+   // skipping PHIs (don't pass them to handleAddrDef) !
+   for (insn = bb->getEntry(); insn; insn = next) {
+      next = insn->next;
+
+      switch (insn->op) {
+      case OP_EXPORT:
+         if (outWrites)
+            propagateWriteToOutput(insn);
+         break;
+      case OP_DIV:
+         handleDIV(insn);
+         break;
+      case OP_MOD:
+         handleMOD(insn);
+         break;
+      case OP_MAD:
+      case OP_MUL:
+         handleMUL(insn);
+         break;
+      default:
+         break;
+      }
+
+      if (insn->defExists(0) && insn->getDef(0)->reg.file == FILE_ADDRESS)
+         handleAddrDef(insn);
+   }
+   return true;
+}
+
+class NV50LoweringPreSSA : public Pass
+{
+public:
+   NV50LoweringPreSSA(Program *);
+
+private:
+   virtual bool visit(Instruction *);
+   virtual bool visit(Function *);
+
+   bool handleRDSV(Instruction *);
+   bool handleWRSV(Instruction *);
+
+   bool handleEXPORT(Instruction *);
+
+   bool handleMUL(Instruction *);
+   bool handleDIV(Instruction *);
+   bool handleSQRT(Instruction *);
+   bool handlePOW(Instruction *);
+
+   bool handleSET(Instruction *);
+   bool handleSLCT(CmpInstruction *);
+   bool handleSELP(Instruction *);
+
+   bool handleTEX(TexInstruction *);
+   bool handleTXB(TexInstruction *); // I really
+   bool handleTXL(TexInstruction *); // hate
+   bool handleTXD(TexInstruction *); // these 3
+
+   bool handleCALL(Instruction *);
+   bool handlePRECONT(Instruction *);
+   bool handleCONT(Instruction *);
+
+   void checkPredicate(Instruction *);
+
+private:
+   const Target *const targ;
+
+   BuildUtil bld;
+
+   Value *tid;
+};
+
+NV50LoweringPreSSA::NV50LoweringPreSSA(Program *prog) :
+   targ(prog->getTarget()), tid(NULL)
+{
+   bld.setProgram(prog);
+}
+
+bool
+NV50LoweringPreSSA::visit(Function *f)
+{
+   BasicBlock *root = BasicBlock::get(func->cfg.getRoot());
+
+   if (prog->getType() == Program::TYPE_COMPUTE) {
+      // Add implicit "thread id" argument in $r0 to the function
+      Value *arg = new_LValue(func, FILE_GPR);
+      arg->reg.data.id = 0;
+      f->ins.push_back(arg);
+
+      bld.setPosition(root, false);
+      tid = bld.mkMov(bld.getScratch(), arg, TYPE_U32)->getDef(0);
+   }
+
+   return true;
+}
+
+// move array source to first slot, convert to u16, add indirections
+bool
+NV50LoweringPreSSA::handleTEX(TexInstruction *i)
+{
+   const int arg = i->tex.target.getArgCount();
+   const int dref = arg;
+   const int lod = i->tex.target.isShadow() ? (arg + 1) : arg;
+
+   // dref comes before bias/lod
+   if (i->tex.target.isShadow())
+      if (i->op == OP_TXB || i->op == OP_TXL)
+         i->swapSources(dref, lod);
+
+   // array index must be converted to u32
+   if (i->tex.target.isArray()) {
+      Value *layer = i->getSrc(arg - 1);
+      LValue *src = new_LValue(func, FILE_GPR);
+      bld.mkCvt(OP_CVT, TYPE_U16, src, TYPE_F32, layer);
+      i->setSrc(arg - 1, src);
+
+      if (i->tex.target.isCube()) {
+         // Value *face = layer;
+         Value *x, *y;
+         x = new_LValue(func, FILE_GPR);
+         y = new_LValue(func, FILE_GPR);
+         layer = new_LValue(func, FILE_GPR);
+
+         i->tex.target = TEX_TARGET_2D_ARRAY;
+
+         // TODO: use TEXPREP to convert x,y,z,face -> x,y,layer
+         bld.mkMov(x, i->getSrc(0));
+         bld.mkMov(y, i->getSrc(1));
+         bld.mkMov(layer, i->getSrc(3));
+
+         i->setSrc(0, x);
+         i->setSrc(1, y);
+         i->setSrc(2, layer);
+         i->setSrc(3, i->getSrc(4));
+         i->setSrc(4, NULL);
+      }
+   }
+
+   // texel offsets are 3 immediate fields in the instruction,
+   // nv50 cannot do textureGatherOffsets
+   assert(i->tex.useOffsets <= 1);
+
+   return true;
+}
+
+// Bias must be equal for all threads of a quad or lod calculation will fail.
+//
+// The lanes of a quad are grouped by the bit in the condition register they
+// have set, which is selected by differing bias values.
+// Move the input values for TEX into a new register set for each group and
+// execute TEX only for a specific group.
+// We always need to use 4 new registers for the inputs/outputs because the
+// implicitly calculated derivatives must be correct.
+//
+// TODO: move to SSA phase so we can easily determine whether bias is constant
+bool
+NV50LoweringPreSSA::handleTXB(TexInstruction *i)
+{
+   const CondCode cc[4] = { CC_EQU, CC_S, CC_C, CC_O };
+   int l, d;
+
+   handleTEX(i);
+   Value *bias = i->getSrc(i->tex.target.getArgCount());
+   if (bias->isUniform())
+      return true;
+
+   Instruction *cond = bld.mkOp1(OP_UNION, TYPE_U32, bld.getScratch(),
+                                 bld.loadImm(NULL, 1));
+   bld.setPosition(cond, false);
+
+   for (l = 1; l < 4; ++l) {
+      const uint8_t qop = QUADOP(SUBR, SUBR, SUBR, SUBR);
+      Value *bit = bld.getSSA();
+      Value *pred = bld.getScratch(1, FILE_FLAGS);
+      Value *imm = bld.loadImm(NULL, (1 << l));
+      bld.mkQuadop(qop, pred, l, bias, bias)->flagsDef = 0;
+      bld.mkMov(bit, imm)->setPredicate(CC_EQ, pred);
+      cond->setSrc(l, bit);
+   }
+   Value *flags = bld.getScratch(1, FILE_FLAGS);
+   bld.setPosition(cond, true);
+   bld.mkCvt(OP_CVT, TYPE_U8, flags, TYPE_U32, cond->getDef(0));
+
+   Instruction *tex[4];
+   for (l = 0; l < 4; ++l) {
+      (tex[l] = cloneForward(func, i))->setPredicate(cc[l], flags);
+      bld.insert(tex[l]);
+   }
+
+   Value *res[4][4];
+   for (d = 0; i->defExists(d); ++d)
+      res[0][d] = tex[0]->getDef(d);
+   for (l = 1; l < 4; ++l) {
+      for (d = 0; tex[l]->defExists(d); ++d) {
+         res[l][d] = cloneShallow(func, res[0][d]);
+         bld.mkMov(res[l][d], tex[l]->getDef(d))->setPredicate(cc[l], flags);
+      }
+   }
+
+   for (d = 0; i->defExists(d); ++d) {
+      Instruction *dst = bld.mkOp(OP_UNION, TYPE_U32, i->getDef(d));
+      for (l = 0; l < 4; ++l)
+         dst->setSrc(l, res[l][d]);
+   }
+   delete_Instruction(prog, i);
+   return true;
+}
+
+// LOD must be equal for all threads of a quad.
+// Unlike with TXB, here we can just diverge since there's no LOD calculation
+// that would require all 4 threads' sources to be set up properly.
+bool
+NV50LoweringPreSSA::handleTXL(TexInstruction *i)
+{
+   handleTEX(i);
+   Value *lod = i->getSrc(i->tex.target.getArgCount());
+   if (lod->isUniform())
+      return true;
+
+   BasicBlock *currBB = i->bb;
+   BasicBlock *texiBB = i->bb->splitBefore(i, false);
+   BasicBlock *joinBB = i->bb->splitAfter(i);
+
+   currBB->joinAt = bld.mkFlow(OP_JOINAT, joinBB, CC_ALWAYS, NULL);
+
+   for (int l = 0; l <= 3; ++l) {
+      const uint8_t qop = QUADOP(SUBR, SUBR, SUBR, SUBR);
+      Value *pred = bld.getScratch(1, FILE_FLAGS);
+      bld.setPosition(currBB, true);
+      bld.mkQuadop(qop, pred, l, lod, lod)->flagsDef = 0;
+      bld.mkFlow(OP_BRA, texiBB, CC_EQ, pred)->fixed = 1;
+      currBB->cfg.attach(&texiBB->cfg, Graph::Edge::FORWARD);
+      if (l <= 2) {
+         BasicBlock *laneBB = new BasicBlock(func);
+         currBB->cfg.attach(&laneBB->cfg, Graph::Edge::TREE);
+         currBB = laneBB;
+      }
+   }
+   bld.setPosition(joinBB, false);
+   bld.mkOp(OP_JOIN, TYPE_NONE, NULL);
+   return true;
+}
+
+bool
+NV50LoweringPreSSA::handleTXD(TexInstruction *i)
+{
+   static const uint8_t qOps[4][2] =
+   {
+      { QUADOP(MOV2, ADD,  MOV2, ADD),  QUADOP(MOV2, MOV2, ADD,  ADD) }, // l0
+      { QUADOP(SUBR, MOV2, SUBR, MOV2), QUADOP(MOV2, MOV2, ADD,  ADD) }, // l1
+      { QUADOP(MOV2, ADD,  MOV2, ADD),  QUADOP(SUBR, SUBR, MOV2, MOV2) }, // l2
+      { QUADOP(SUBR, MOV2, SUBR, MOV2), QUADOP(SUBR, SUBR, MOV2, MOV2) }, // l3
+   };
+   Value *def[4][4];
+   Value *crd[3];
+   Instruction *tex;
+   Value *zero = bld.loadImm(bld.getSSA(), 0);
+   int l, c;
+   const int dim = i->tex.target.getDim();
+
+   handleTEX(i);
+   i->op = OP_TEX; // no need to clone dPdx/dPdy later
+
+   for (c = 0; c < dim; ++c)
+      crd[c] = bld.getScratch();
+
+   bld.mkOp(OP_QUADON, TYPE_NONE, NULL);
+   for (l = 0; l < 4; ++l) {
+      // mov coordinates from lane l to all lanes
+      for (c = 0; c < dim; ++c)
+         bld.mkQuadop(0x00, crd[c], l, i->getSrc(c), zero);
+      // add dPdx from lane l to lanes dx
+      for (c = 0; c < dim; ++c)
+         bld.mkQuadop(qOps[l][0], crd[c], l, i->dPdx[c].get(), crd[c]);
+      // add dPdy from lane l to lanes dy
+      for (c = 0; c < dim; ++c)
+         bld.mkQuadop(qOps[l][1], crd[c], l, i->dPdy[c].get(), crd[c]);
+      // texture
+      bld.insert(tex = cloneForward(func, i));
+      for (c = 0; c < dim; ++c)
+         tex->setSrc(c, crd[c]);
+      // save results
+      for (c = 0; i->defExists(c); ++c) {
+         Instruction *mov;
+         def[c][l] = bld.getSSA();
+         mov = bld.mkMov(def[c][l], tex->getDef(c));
+         mov->fixed = 1;
+         mov->lanes = 1 << l;
+      }
+   }
+   bld.mkOp(OP_QUADPOP, TYPE_NONE, NULL);
+
+   for (c = 0; i->defExists(c); ++c) {
+      Instruction *u = bld.mkOp(OP_UNION, TYPE_U32, i->getDef(c));
+      for (l = 0; l < 4; ++l)
+         u->setSrc(l, def[c][l]);
+   }
+
+   i->bb->remove(i);
+   return true;
+}
+
+bool
+NV50LoweringPreSSA::handleSET(Instruction *i)
+{
+   if (i->dType == TYPE_F32) {
+      bld.setPosition(i, true);
+      i->dType = TYPE_U32;
+      bld.mkOp1(OP_ABS, TYPE_S32, i->getDef(0), i->getDef(0));
+      bld.mkCvt(OP_CVT, TYPE_F32, i->getDef(0), TYPE_S32, i->getDef(0));
+   }
+   return true;
+}
+
+bool
+NV50LoweringPreSSA::handleSLCT(CmpInstruction *i)
+{
+   Value *src0 = bld.getSSA();
+   Value *src1 = bld.getSSA();
+   Value *pred = bld.getScratch(1, FILE_FLAGS);
+
+   Value *v0 = i->getSrc(0);
+   Value *v1 = i->getSrc(1);
+   // XXX: these probably shouldn't be immediates in the first place ...
+   if (v0->asImm())
+      v0 = bld.mkMov(bld.getSSA(), v0)->getDef(0);
+   if (v1->asImm())
+      v1 = bld.mkMov(bld.getSSA(), v1)->getDef(0);
+
+   bld.setPosition(i, true);
+   bld.mkMov(src0, v0)->setPredicate(CC_NE, pred);
+   bld.mkMov(src1, v1)->setPredicate(CC_EQ, pred);
+   bld.mkOp2(OP_UNION, i->dType, i->getDef(0), src0, src1);
+
+   bld.setPosition(i, false);
+   i->op = OP_SET;
+   i->setFlagsDef(0, pred);
+   i->dType = TYPE_U8;
+   i->setSrc(0, i->getSrc(2));
+   i->setSrc(2, NULL);
+   i->setSrc(1, bld.loadImm(NULL, 0));
+
+   return true;
+}
+
+bool
+NV50LoweringPreSSA::handleSELP(Instruction *i)
+{
+   Value *src0 = bld.getSSA();
+   Value *src1 = bld.getSSA();
+
+   Value *v0 = i->getSrc(0);
+   Value *v1 = i->getSrc(1);
+   if (v0->asImm())
+      v0 = bld.mkMov(bld.getSSA(), v0)->getDef(0);
+   if (v1->asImm())
+      v1 = bld.mkMov(bld.getSSA(), v1)->getDef(0);
+
+   bld.mkMov(src0, v0)->setPredicate(CC_NE, i->getSrc(2));
+   bld.mkMov(src1, v1)->setPredicate(CC_EQ, i->getSrc(2));
+   bld.mkOp2(OP_UNION, i->dType, i->getDef(0), src0, src1);
+   delete_Instruction(prog, i);
+   return true;
+}
+
+bool
+NV50LoweringPreSSA::handleWRSV(Instruction *i)
+{
+   Symbol *sym = i->getSrc(0)->asSym();
+
+   // these are all shader outputs, $sreg are not writeable
+   uint32_t addr = targ->getSVAddress(FILE_SHADER_OUTPUT, sym);
+   if (addr >= 0x400)
+      return false;
+   sym = bld.mkSymbol(FILE_SHADER_OUTPUT, 0, i->sType, addr);
+
+   bld.mkStore(OP_EXPORT, i->dType, sym, i->getIndirect(0, 0), i->getSrc(1));
+
+   bld.getBB()->remove(i);
+   return true;
+}
+
+bool
+NV50LoweringPreSSA::handleCALL(Instruction *i)
+{
+   if (prog->getType() == Program::TYPE_COMPUTE) {
+      // Add implicit "thread id" argument in $r0 to the function
+      i->setSrc(i->srcCount(), tid);
+   }
+   return true;
+}
+
+bool
+NV50LoweringPreSSA::handlePRECONT(Instruction *i)
+{
+   delete_Instruction(prog, i);
+   return true;
+}
+
+bool
+NV50LoweringPreSSA::handleCONT(Instruction *i)
+{
+   i->op = OP_BRA;
+   return true;
+}
+
+bool
+NV50LoweringPreSSA::handleRDSV(Instruction *i)
+{
+   Symbol *sym = i->getSrc(0)->asSym();
+   uint32_t addr = targ->getSVAddress(FILE_SHADER_INPUT, sym);
+   Value *def = i->getDef(0);
+   SVSemantic sv = sym->reg.data.sv.sv;
+   int idx = sym->reg.data.sv.index;
+
+   if (addr >= 0x400) // mov $sreg
+      return true;
+
+   switch (sv) {
+   case SV_POSITION:
+      assert(prog->getType() == Program::TYPE_FRAGMENT);
+      bld.mkInterp(NV50_IR_INTERP_LINEAR, i->getDef(0), addr, NULL);
+      break;
+   case SV_FACE:
+      bld.mkInterp(NV50_IR_INTERP_FLAT, def, addr, NULL);
+      if (i->dType == TYPE_F32) {
+         bld.mkOp2(OP_AND, TYPE_U32, def, def, bld.mkImm(0x80000000));
+         bld.mkOp2(OP_XOR, TYPE_U32, def, def, bld.mkImm(0xbf800000));
+      }
+      break;
+   case SV_NCTAID:
+   case SV_CTAID:
+   case SV_NTID:
+      if ((sv == SV_NCTAID && idx >= 2) ||
+          (sv == SV_NTID && idx >= 3)) {
+         bld.mkMov(def, bld.mkImm(1));
+      } else if (sv == SV_CTAID && idx >= 2) {
+         bld.mkMov(def, bld.mkImm(0));
+      } else {
+         Value *x = bld.getSSA(2);
+         bld.mkOp1(OP_LOAD, TYPE_U16, x,
+                   bld.mkSymbol(FILE_MEMORY_SHARED, 0, TYPE_U16, addr));
+         bld.mkCvt(OP_CVT, TYPE_U32, def, TYPE_U16, x);
+      }
+      break;
+   case SV_TID:
+      if (idx == 0) {
+         bld.mkOp2(OP_AND, TYPE_U32, def, tid, bld.mkImm(0x0000ffff));
+      } else if (idx == 1) {
+         bld.mkOp2(OP_AND, TYPE_U32, def, tid, bld.mkImm(0x03ff0000));
+         bld.mkOp2(OP_SHR, TYPE_U32, def, def, bld.mkImm(16));
+      } else if (idx == 2) {
+         bld.mkOp2(OP_SHR, TYPE_U32, def, tid, bld.mkImm(26));
+      } else {
+         bld.mkMov(def, bld.mkImm(0));
+      }
+      break;
+   default:
+      bld.mkFetch(i->getDef(0), i->dType,
+                  FILE_SHADER_INPUT, addr, i->getIndirect(0, 0), NULL);
+      break;
+   }
+   bld.getBB()->remove(i);
+   return true;
+}
+
+bool
+NV50LoweringPreSSA::handleMUL(Instruction *i)
+{
+   if (!isFloatType(i->dType) && typeSizeof(i->sType) > 2)
+      return expandIntegerMUL(&bld, i);
+   return true;
+}
+
+bool
+NV50LoweringPreSSA::handleDIV(Instruction *i)
+{
+   if (!isFloatType(i->dType))
+      return true;
+   bld.setPosition(i, false);
+   Instruction *rcp = bld.mkOp1(OP_RCP, i->dType, bld.getSSA(), i->getSrc(1));
+   i->op = OP_MUL;
+   i->setSrc(1, rcp->getDef(0));
+   return true;
+}
+
+bool
+NV50LoweringPreSSA::handleSQRT(Instruction *i)
+{
+   Instruction *rsq = bld.mkOp1(OP_RSQ, TYPE_F32,
+                                bld.getSSA(), i->getSrc(0));
+   i->op = OP_MUL;
+   i->setSrc(1, rsq->getDef(0));
+
+   return true;
+}
+
+bool
+NV50LoweringPreSSA::handlePOW(Instruction *i)
+{
+   LValue *val = bld.getScratch();
+
+   bld.mkOp1(OP_LG2, TYPE_F32, val, i->getSrc(0));
+   bld.mkOp2(OP_MUL, TYPE_F32, val, i->getSrc(1), val)->dnz = 1;
+   bld.mkOp1(OP_PREEX2, TYPE_F32, val, val);
+
+   i->op = OP_EX2;
+   i->setSrc(0, val);
+   i->setSrc(1, NULL);
+
+   return true;
+}
+
+bool
+NV50LoweringPreSSA::handleEXPORT(Instruction *i)
+{
+   if (prog->getType() == Program::TYPE_FRAGMENT) {
+      if (i->getIndirect(0, 0)) {
+         // TODO: redirect to l[] here, load to GPRs at exit
+         return false;
+      } else {
+         int id = i->getSrc(0)->reg.data.offset / 4; // in 32 bit reg units
+
+         i->op = OP_MOV;
+         i->src(0).set(i->src(1));
+         i->setSrc(1, NULL);
+         i->setDef(0, new_LValue(func, FILE_GPR));
+         i->getDef(0)->reg.data.id = id;
+
+         prog->maxGPR = MAX2(prog->maxGPR, id);
+      }
+   }
+   return true;
+}
+
+// Set flags according to predicate and make the instruction read $cX.
+void
+NV50LoweringPreSSA::checkPredicate(Instruction *insn)
+{
+   Value *pred = insn->getPredicate();
+   Value *cdst;
+
+   if (!pred || pred->reg.file == FILE_FLAGS)
+      return;
+   cdst = bld.getSSA(1, FILE_FLAGS);
+
+   bld.mkCmp(OP_SET, CC_NEU, TYPE_U32, cdst, bld.loadImm(NULL, 0), pred);
+
+   insn->setPredicate(insn->cc, cdst);
+}
+
+//
+// - add quadop dance for texturing
+// - put FP outputs in GPRs
+// - convert instruction sequences
+//
+bool
+NV50LoweringPreSSA::visit(Instruction *i)
+{
+   if (i->prev)
+      bld.setPosition(i->prev, true);
+   else
+   if (i->next)
+      bld.setPosition(i->next, false);
+   else
+      bld.setPosition(i->bb, true);
+
+   if (i->cc != CC_ALWAYS)
+      checkPredicate(i);
+
+   switch (i->op) {
+   case OP_TEX:
+   case OP_TXF:
+   case OP_TXG:
+      return handleTEX(i->asTex());
+   case OP_TXB:
+      return handleTXB(i->asTex());
+   case OP_TXL:
+      return handleTXL(i->asTex());
+   case OP_TXD:
+      return handleTXD(i->asTex());
+   case OP_EX2:
+      bld.mkOp1(OP_PREEX2, TYPE_F32, i->getDef(0), i->getSrc(0));
+      i->setSrc(0, i->getDef(0));
+      break;
+   case OP_SET:
+      return handleSET(i);
+   case OP_SLCT:
+      return handleSLCT(i->asCmp());
+   case OP_SELP:
+      return handleSELP(i);
+   case OP_POW:
+      return handlePOW(i);
+   case OP_MUL:
+      return handleMUL(i);
+   case OP_DIV:
+      return handleDIV(i);
+   case OP_SQRT:
+      return handleSQRT(i);
+   case OP_EXPORT:
+      return handleEXPORT(i);
+   case OP_RDSV:
+      return handleRDSV(i);
+   case OP_WRSV:
+      return handleWRSV(i);
+   case OP_CALL:
+      return handleCALL(i);
+   case OP_PRECONT:
+      return handlePRECONT(i);
+   case OP_CONT:
+      return handleCONT(i);
+   default:
+      break;
+   }
+   return true;
+}
+
+bool
+TargetNV50::runLegalizePass(Program *prog, CGStage stage) const
+{
+   bool ret = false;
+
+   if (stage == CG_STAGE_PRE_SSA) {
+      NV50LoweringPreSSA pass(prog);
+      ret = pass.run(prog, false, true);
+   } else
+   if (stage == CG_STAGE_SSA) {
+      if (!prog->targetPriv)
+         prog->targetPriv = new std::list<Instruction *>();
+      NV50LegalizeSSA pass(prog);
+      ret = pass.run(prog, false, true);
+   } else
+   if (stage == CG_STAGE_POST_RA) {
+      NV50LegalizePostRA pass;
+      ret = pass.run(prog, false, true);
+      if (prog->targetPriv)
+         delete reinterpret_cast<std::list<Instruction *> *>(prog->targetPriv);
+   }
+   return ret;
+}
+
+} // namespace nv50_ir
diff --git a/src/gallium/drivers/nv50/codegen/nv50_ir_print.cpp b/src/gallium/drivers/nv50/codegen/nv50_ir_print.cpp
index 61382336bc4..c6134465996 100644
--- a/src/gallium/drivers/nv50/codegen/nv50_ir_print.cpp
+++ b/src/gallium/drivers/nv50/codegen/nv50_ir_print.cpp
@@ -265,7 +265,7 @@ int Modifier::print(char *buf, size_t size) const
 
    return pos;
 }
-   
+
 int LValue::print(char *buf, size_t size, DataType ty) const
 {
    const char *postFix = "";
@@ -278,14 +278,23 @@ int LValue::print(char *buf, size_t size, DataType ty) const
    switch (reg.file) {
    case FILE_GPR:
       r = 'r'; col = TXT_GPR;
-      if (reg.size == 8)
+      if (reg.size == 2) {
+         if (p == '$') {
+            postFix = (idx & 1) ? "h" : "l";
+            idx /= 2;
+         } else {
+            postFix = "s";
+         }
+      } else
+      if (reg.size == 8) {
          postFix = "d";
-      else
-      if (reg.size == 16)
+      } else
+      if (reg.size == 16) {
          postFix = "q";
-      else
-      if (reg.size == 12)
+      } else
+      if (reg.size == 12) {
          postFix = "t";
+      }
       break;
    case FILE_PREDICATE:
       r = 'p'; col = TXT_REGISTER;
@@ -419,7 +428,7 @@ void Instruction::print() const
       } else {
          PRINT("%s", CondCodeStr[cc]);
       }
-      if (pos > pre + 1)
+      if (pos > pre)
          SPACE();
       pos += getSrc(predSrc)->print(&buf[pos], BUFSZ - pos);
       PRINT(" %s", colour[TXT_INSN]);
@@ -489,6 +498,8 @@ void Instruction::print() const
       else
          pos += getSrc(s)->print(&buf[pos], BUFSZ - pos, sType);
    }
+   if (exit)
+      PRINT("%s exit", colour[TXT_INSN]);
 
    PRINT("%s", colour[TXT_DEFAULT]);
 
diff --git a/src/gallium/drivers/nv50/codegen/nv50_ir_target.cpp b/src/gallium/drivers/nv50/codegen/nv50_ir_target.cpp
index 598e0d26384..27b9610ed52 100644
--- a/src/gallium/drivers/nv50/codegen/nv50_ir_target.cpp
+++ b/src/gallium/drivers/nv50/codegen/nv50_ir_target.cpp
@@ -54,6 +54,7 @@ const uint8_t Target::operationSrcNr[OP_LAST + 1] =
 
 
 extern Target *getTargetNVC0(unsigned int chipset);
+extern Target *getTargetNV50(unsigned int chipset);
 
 Target *Target::create(unsigned int chipset)
 {
@@ -65,6 +66,7 @@ Target *Target::create(unsigned int chipset)
    case 0x80:
    case 0x90:
    case 0xa0:
+      return getTargetNV50(chipset);
    default:
       ERROR("unsupported target: NV%x\n", chipset);
       return 0;
@@ -76,6 +78,10 @@ void Target::destroy(Target *targ)
    delete targ;
 }
 
+CodeEmitter::CodeEmitter(const Target *target) : targ(target)
+{
+}
+
 void
 CodeEmitter::setCodeLocation(void *ptr, uint32_t size)
 {
@@ -261,6 +267,10 @@ Program::emitBinary(struct nv50_ir_prog_info *info)
 
    emitSymbolTable(info);
 
+   // the nvc0 driver will print the binary iself together with the header
+   if ((dbgFlags & NV50_IR_DEBUG_BASIC) && getTarget()->getChipset() < 0xc0)
+      emit->printBinary();
+
    delete emit;
    return true;
 }
diff --git a/src/gallium/drivers/nv50/codegen/nv50_ir_target.h b/src/gallium/drivers/nv50/codegen/nv50_ir_target.h
index 6640198f090..88996ebbde3 100644
--- a/src/gallium/drivers/nv50/codegen/nv50_ir_target.h
+++ b/src/gallium/drivers/nv50/codegen/nv50_ir_target.h
@@ -61,6 +61,8 @@ struct RelocInfo
 class CodeEmitter
 {
 public:
+   CodeEmitter(const Target *);
+
    // returns whether the instruction was encodable and written
    virtual bool emitInstruction(Instruction *) = 0;
 
@@ -76,12 +78,14 @@ public:
    inline void *getRelocInfo() const { return relocInfo; }
 
    void prepareEmission(Program *);
-   void prepareEmission(Function *);
+   virtual void prepareEmission(Function *);
    virtual void prepareEmission(BasicBlock *);
 
    void printBinary() const;
 
 protected:
+   const Target *targ;
+
    uint32_t *code;
    uint32_t codeSize;
    uint32_t codeSizeLimit;
@@ -105,6 +109,8 @@ public:
    // The address chosen is supplied to the relocation routine.
    virtual void getBuiltinCode(const uint32_t **code, uint32_t *size) const = 0;
 
+   virtual void parseDriverInfo(const struct nv50_ir_prog_info *info) { }
+
    virtual bool runLegalizePass(Program *, CGStage stage) const = 0;
 
 public:
diff --git a/src/gallium/drivers/nv50/codegen/nv50_ir_target_nv50.cpp b/src/gallium/drivers/nv50/codegen/nv50_ir_target_nv50.cpp
new file mode 100644
index 00000000000..a64f7f72255
--- /dev/null
+++ b/src/gallium/drivers/nv50/codegen/nv50_ir_target_nv50.cpp
@@ -0,0 +1,531 @@
+/*
+ * Copyright 2011 Christoph Bumiller
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF
+ * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "nv50_ir_target_nv50.h"
+
+namespace nv50_ir {
+
+Target *getTargetNV50(unsigned int chipset)
+{
+   return new TargetNV50(chipset);
+}
+
+TargetNV50::TargetNV50(unsigned int card)
+{
+   chipset = card;
+
+   wposMask = 0;
+   for (unsigned int i = 0; i <= SV_LAST; ++i)
+      sysvalLocation[i] = ~0;
+
+   initOpInfo();
+}
+
+#if 0
+// BULTINS / LIBRARY FUNCTIONS:
+
+// TODO
+static const uint32_t nvc0_builtin_code[] =
+{
+};
+
+static const uint16_t nvc0_builtin_offsets[NV50_BUILTIN_COUNT] =
+{
+};
+#endif
+
+void
+TargetNV50::getBuiltinCode(const uint32_t **code, uint32_t *size) const
+{
+   *code = NULL;
+   *size = 0;
+}
+
+uint32_t
+TargetNV50::getBuiltinOffset(int builtin) const
+{
+   return 0;
+}
+
+struct opProperties
+{
+   operation op;
+   unsigned int mNeg    : 4;
+   unsigned int mAbs    : 4;
+   unsigned int mNot    : 4;
+   unsigned int mSat    : 4;
+   unsigned int fConst  : 3;
+   unsigned int fShared : 3;
+   unsigned int fAttrib : 3;
+   unsigned int fImm    : 3;
+};
+
+static const struct opProperties _initProps[] =
+{
+   //           neg  abs  not  sat  c[]  s[], a[], imm
+   { OP_ADD,    0x3, 0x0, 0x0, 0x8, 0x2, 0x1, 0x1, 0x2 },
+   { OP_SUB,    0x3, 0x0, 0x0, 0x0, 0x2, 0x1, 0x1, 0x2 },
+   { OP_MUL,    0x3, 0x0, 0x0, 0x0, 0x2, 0x1, 0x1, 0x2 },
+   { OP_MAX,    0x3, 0x3, 0x0, 0x0, 0x2, 0x1, 0x1, 0x0 },
+   { OP_MIN,    0x3, 0x3, 0x0, 0x0, 0x2, 0x1, 0x1, 0x0 },
+   { OP_MAD,    0x7, 0x0, 0x0, 0x0, 0x6, 0x1, 0x1, 0x0 }, // special constraint
+   { OP_ABS,    0x0, 0x0, 0x0, 0x0, 0x0, 0x1, 0x1, 0x0 },
+   { OP_NEG,    0x0, 0x1, 0x0, 0x0, 0x0, 0x1, 0x1, 0x0 },
+   { OP_CVT,    0x1, 0x1, 0x0, 0x8, 0x0, 0x1, 0x1, 0x0 },
+   { OP_AND,    0x0, 0x0, 0x3, 0x0, 0x0, 0x0, 0x0, 0x2 },
+   { OP_OR,     0x0, 0x0, 0x3, 0x0, 0x0, 0x0, 0x0, 0x2 },
+   { OP_XOR,    0x0, 0x0, 0x3, 0x0, 0x0, 0x0, 0x0, 0x2 },
+   { OP_SHL,    0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x2 },
+   { OP_SHR,    0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x2 },
+   { OP_SET,    0x3, 0x3, 0x0, 0x0, 0x2, 0x1, 0x1, 0x0 },
+   { OP_PREEX2, 0x1, 0x1, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0 },
+   { OP_PRESIN, 0x1, 0x1, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0 },
+   { OP_LG2,    0x1, 0x1, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0 },
+   { OP_RCP,    0x1, 0x1, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0 },
+   { OP_RSQ,    0x1, 0x1, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0 },
+   { OP_DFDX,   0x1, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0 },
+   { OP_DFDY,   0x1, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0 },
+};
+
+void TargetNV50::initOpInfo()
+{
+   unsigned int i, j;
+
+   static const uint32_t commutative[(OP_LAST + 31) / 32] =
+   {
+      // ADD,MAD,MUL,AND,OR,XOR,MAX,MIN
+      0x0670ca00, 0x0000003f, 0x00000000
+   };
+   static const uint32_t shortForm[(OP_LAST + 31) / 32] =
+   {
+      // MOV,ADD,SUB,MUL,SAD,L/PINTERP,RCP,TEX,TXF
+      0x00010e40, 0x00000040, 0x00000498
+   };
+   static const operation noDestList[] =
+   {
+      OP_STORE, OP_WRSV, OP_EXPORT, OP_BRA, OP_CALL, OP_RET, OP_EXIT,
+      OP_DISCARD, OP_CONT, OP_BREAK, OP_PRECONT, OP_PREBREAK, OP_PRERET,
+      OP_JOIN, OP_JOINAT, OP_BRKPT, OP_MEMBAR, OP_EMIT, OP_RESTART,
+      OP_QUADON, OP_QUADPOP
+   };
+   static const operation noPredList[] =
+   {
+      OP_CALL, OP_PREBREAK, OP_PRERET, OP_QUADON, OP_QUADPOP, OP_JOINAT
+   };
+
+   joinAnterior = true;
+
+   for (i = 0; i < DATA_FILE_COUNT; ++i)
+      nativeFileMap[i] = (DataFile)i;
+   nativeFileMap[FILE_PREDICATE] = FILE_FLAGS;
+
+   for (i = 0; i < OP_LAST; ++i) {
+      opInfo[i].variants = NULL;
+      opInfo[i].op = (operation)i;
+      opInfo[i].srcTypes = 1 << (int)TYPE_F32;
+      opInfo[i].dstTypes = 1 << (int)TYPE_F32;
+      opInfo[i].immdBits = 0xffffffff;
+      opInfo[i].srcNr = operationSrcNr[i];
+
+      for (j = 0; j < opInfo[i].srcNr; ++j) {
+         opInfo[i].srcMods[j] = 0;
+         opInfo[i].srcFiles[j] = 1 << (int)FILE_GPR;
+      }
+      opInfo[i].dstMods = 0;
+      opInfo[i].dstFiles = 1 << (int)FILE_GPR;
+
+      opInfo[i].hasDest = 1;
+      opInfo[i].vector = (i >= OP_TEX && i <= OP_TEXCSAA);
+      opInfo[i].commutative = (commutative[i / 32] >> (i % 32)) & 1;
+      opInfo[i].pseudo = (i < OP_MOV);
+      opInfo[i].predicate = !opInfo[i].pseudo;
+      opInfo[i].flow = (i >= OP_BRA && i <= OP_JOIN);
+      opInfo[i].minEncSize = (shortForm[i / 32] & (1 << (i % 32))) ? 4 : 8;
+   }
+   for (i = 0; i < sizeof(noDestList) / sizeof(noDestList[0]); ++i)
+      opInfo[noDestList[i]].hasDest = 0;
+   for (i = 0; i < sizeof(noPredList) / sizeof(noPredList[0]); ++i)
+      opInfo[noPredList[i]].predicate = 0;
+
+   for (i = 0; i < sizeof(_initProps) / sizeof(_initProps[0]); ++i) {
+      const struct opProperties *prop = &_initProps[i];
+
+      for (int s = 0; s < 3; ++s) {
+         if (prop->mNeg & (1 << s))
+            opInfo[prop->op].srcMods[s] |= NV50_IR_MOD_NEG;
+         if (prop->mAbs & (1 << s))
+            opInfo[prop->op].srcMods[s] |= NV50_IR_MOD_ABS;
+         if (prop->mNot & (1 << s))
+            opInfo[prop->op].srcMods[s] |= NV50_IR_MOD_NOT;
+         if (prop->fConst & (1 << s))
+            opInfo[prop->op].srcFiles[s] |= 1 << (int)FILE_MEMORY_CONST;
+         if (prop->fShared & (1 << s))
+            opInfo[prop->op].srcFiles[s] |= 1 << (int)FILE_MEMORY_SHARED;
+         if (prop->fAttrib & (1 << s))
+            opInfo[prop->op].srcFiles[s] |= 1 << (int)FILE_SHADER_INPUT;
+         if (prop->fImm & (1 << s))
+            opInfo[prop->op].srcFiles[s] |= 1 << (int)FILE_IMMEDIATE;
+      }
+      if (prop->mSat & 8)
+         opInfo[prop->op].dstMods = NV50_IR_MOD_SAT;
+   }
+}
+
+unsigned int
+TargetNV50::getFileSize(DataFile file) const
+{
+   switch (file) {
+   case FILE_NULL:          return 0;
+   case FILE_GPR:           return 256; // in 16-bit units **
+   case FILE_PREDICATE:     return 0;
+   case FILE_FLAGS:         return 4;
+   case FILE_ADDRESS:       return 4;
+   case FILE_IMMEDIATE:     return 0;
+   case FILE_MEMORY_CONST:  return 65536;
+   case FILE_SHADER_INPUT:  return 0x200;
+   case FILE_SHADER_OUTPUT: return 0x200;
+   case FILE_MEMORY_GLOBAL: return 0xffffffff;
+   case FILE_MEMORY_SHARED: return 16 << 10;
+   case FILE_MEMORY_LOCAL:  return 48 << 10;
+   case FILE_SYSTEM_VALUE:  return 16;
+   default:
+      assert(!"invalid file");
+      return 0;
+   }
+   // ** only first 128 units encodable for 16-bit regs
+}
+
+unsigned int
+TargetNV50::getFileUnit(DataFile file) const
+{
+   if (file == FILE_GPR || file == FILE_ADDRESS)
+      return 1;
+   if (file == FILE_SYSTEM_VALUE)
+      return 2;
+   return 0;
+}
+
+uint32_t
+TargetNV50::getSVAddress(DataFile shaderFile, const Symbol *sym) const
+{
+   switch (sym->reg.data.sv.sv) {
+   case SV_FACE:
+      return 0x3fc;
+   case SV_POSITION:
+   {
+      uint32_t addr = sysvalLocation[sym->reg.data.sv.sv];
+      for (int c = 0; c < sym->reg.data.sv.index; ++c)
+         if (wposMask & (1 << c))
+            addr += 4;
+      return addr;
+   }
+   case SV_NCTAID:
+      return 0x8 + 2 * sym->reg.data.sv.index;
+   case SV_CTAID:
+      return 0xc + 2 * sym->reg.data.sv.index;
+   case SV_NTID:
+      return 0x2 + 2 * sym->reg.data.sv.index;
+   case SV_TID:
+      return 0;
+   default:
+      return sysvalLocation[sym->reg.data.sv.sv];
+   }
+}
+
+// long:  rrr, arr, rcr, acr, rrc, arc, gcr, grr
+// short: rr, ar, rc, gr
+// immd:  ri, gi
+bool
+TargetNV50::insnCanLoad(const Instruction *i, int s,
+                        const Instruction *ld) const
+{
+   DataFile sf = ld->src(0).getFile();
+
+   if (sf == FILE_IMMEDIATE && (i->predSrc >= 0 || i->flagsDef >= 0))
+      return false;
+   if (s >= opInfo[i->op].srcNr)
+      return false;
+   if (!(opInfo[i->op].srcFiles[s] & (1 << (int)sf)))
+      return false;
+   if (s == 2 && i->src(1).getFile() != FILE_GPR)
+      return false;
+
+   // NOTE: don't rely on flagsDef
+   for (int d = 0; i->defExists(d); ++d)
+      if (i->def(d).getFile() == FILE_FLAGS)
+         return false;
+
+   unsigned mode = 0;
+
+   for (int z = 0; z < Target::operationSrcNr[i->op]; ++z) {
+      DataFile zf = (z == s) ? sf : i->src(z).getFile();
+      switch (zf) {
+      case FILE_GPR:
+         break;
+      case FILE_MEMORY_SHARED:
+      case FILE_SHADER_INPUT:
+         mode |= 1 << (z * 2);
+         break;
+      case FILE_MEMORY_CONST:
+         mode |= 2 << (z * 2);
+         break;
+      case FILE_IMMEDIATE:
+         mode |= 3 << (z * 2);
+      default:
+         break;
+      }
+   }
+
+   switch (mode) {
+   case 0x00:
+   case 0x01:
+   case 0x03:
+   case 0x08:
+   case 0x09:
+   case 0x0c:
+   case 0x20:
+   case 0x21:
+      break;
+   case 0x0d:
+      if (ld->bb->getProgram()->getType() != Program::TYPE_GEOMETRY)
+         return false;
+   default:
+      return false;
+   }
+
+   if (ld->getSrc(0)->reg.data.offset > (int32_t)(127 * typeSizeof(ld->dType)))
+      return false;
+
+   if (ld->src(0).isIndirect(0)) {
+      for (int z = 0; i->srcExists(z); ++z)
+         if (i->src(z).isIndirect(0))
+            return false;
+
+      // s[] access only possible in CP, $aX always applies
+      if (sf == FILE_MEMORY_SHARED)
+         return true;
+      if (!ld->bb) // can't check type ...
+         return false;
+      Program::Type pt = ld->bb->getProgram()->getType();
+
+      // $aX applies to c[] only in VP, FP, GP if p[] is not accessed
+      if (pt == Program::TYPE_COMPUTE)
+         return false;
+      if (pt == Program::TYPE_GEOMETRY) {
+         if (sf == FILE_MEMORY_CONST)
+            return i->src(s).getFile() != FILE_SHADER_INPUT;
+         return sf == FILE_SHADER_INPUT;
+      }
+      return sf == FILE_MEMORY_CONST;
+   }
+   return true;
+}
+
+bool
+TargetNV50::isAccessSupported(DataFile file, DataType ty) const
+{
+   if (ty == TYPE_B96 || ty == TYPE_NONE)
+      return false;
+   if (typeSizeof(ty) > 4)
+      return (file == FILE_MEMORY_LOCAL) || (file == FILE_MEMORY_GLOBAL);
+   return true;
+}
+
+bool
+TargetNV50::isOpSupported(operation op, DataType ty) const
+{
+   if (ty == TYPE_F64 && chipset < 0xa0)
+      return false;
+
+   switch (op) {
+   case OP_PRERET:
+      return chipset >= 0xa0;
+   case OP_TXG:
+      return chipset >= 0xa3;
+   case OP_POW:
+   case OP_SQRT:
+   case OP_DIV:
+   case OP_MOD:
+   case OP_SET_AND:
+   case OP_SET_OR:
+   case OP_SET_XOR:
+   case OP_SLCT:
+   case OP_SELP:
+   case OP_POPCNT:
+   case OP_INSBF:
+   case OP_EXTBF:
+   case OP_EXIT: // want exit modifier instead (on NOP if required)
+      return false;
+   case OP_SAD:
+      return ty == TYPE_S32;
+   default:
+      return true;
+   }
+}
+
+bool
+TargetNV50::isModSupported(const Instruction *insn, int s, Modifier mod) const
+{
+   if (!isFloatType(insn->dType)) {
+      switch (insn->op) {
+      case OP_ABS:
+      case OP_NEG:
+      case OP_CVT:
+      case OP_CEIL:
+      case OP_FLOOR:
+      case OP_TRUNC:
+      case OP_AND:
+      case OP_OR:
+      case OP_XOR:
+         break;
+      case OP_ADD:
+         if (insn->src(s ? 0 : 1).mod.neg())
+            return false;
+         break;
+      case OP_SUB:
+         if (s == 0)
+            return insn->src(1).mod.neg() ? false : true;
+         break;
+      case OP_SET:
+         if (insn->sType != TYPE_F32)
+            return false;
+         break;
+      default:
+         return false;
+      }
+   }
+   if (s > 3)
+      return false;
+   return (mod & Modifier(opInfo[insn->op].srcMods[s])) == mod;
+}
+
+bool
+TargetNV50::mayPredicate(const Instruction *insn, const Value *pred) const
+{
+   if (insn->getPredicate() || insn->flagsSrc >= 0)
+      return false;
+   for (int s = 0; insn->srcExists(s); ++s)
+      if (insn->src(s).getFile() == FILE_IMMEDIATE)
+         return false;
+   return opInfo[insn->op].predicate;
+}
+
+bool
+TargetNV50::isSatSupported(const Instruction *insn) const
+{
+   if (insn->op == OP_CVT)
+      return true;
+   if (insn->dType != TYPE_F32)
+      return false;
+   return opInfo[insn->op].dstMods & NV50_IR_MOD_SAT;
+}
+
+int TargetNV50::getLatency(const Instruction *i) const
+{
+   // TODO: tune these values
+   if (i->op == OP_LOAD) {
+      switch (i->src(0).getFile()) {
+      case FILE_MEMORY_LOCAL:
+      case FILE_MEMORY_GLOBAL:
+         return 100; // really 400 to 800
+      default:
+         return 22;
+      }
+   }
+   return 22;
+}
+
+// These are "inverse" throughput values, i.e. the number of cycles required
+// to issue a specific instruction for a full warp (32 threads).
+//
+// Assuming we have more than 1 warp in flight, a higher issue latency results
+// in a lower result latency since the MP will have spent more time with other
+// warps.
+// This also helps to determine the number of cycles between instructions in
+// a single warp.
+//
+int TargetNV50::getThroughput(const Instruction *i) const
+{
+   // TODO: tune these values
+   if (i->dType == TYPE_F32) {
+      switch (i->op) {
+      case OP_RCP:
+      case OP_RSQ:
+      case OP_LG2:
+      case OP_SIN:
+      case OP_COS:
+      case OP_PRESIN:
+      case OP_PREEX2:
+         return 16;
+      default:
+         return 4;
+      }
+   } else
+   if (i->dType == TYPE_U32 || i->dType == TYPE_S32) {
+      return 4;
+   } else
+   if (i->dType == TYPE_F64) {
+      return 32;
+   } else {
+      return 1;
+   }
+}
+
+static void
+recordLocation(uint16_t *locs, uint8_t *masks,
+               const struct nv50_ir_varying *var)
+{
+   uint16_t addr = var->slot[0] * 4;
+
+   switch (var->sn) {
+   case TGSI_SEMANTIC_POSITION: locs[SV_POSITION] = addr; break;
+   case TGSI_SEMANTIC_INSTANCEID: locs[SV_INSTANCE_ID] = addr; break;
+   case TGSI_SEMANTIC_VERTEXID: locs[SV_VERTEX_ID] = addr; break;
+   case TGSI_SEMANTIC_PRIMID: locs[SV_PRIMITIVE_ID] = addr; break;
+   case NV50_SEMANTIC_LAYER: locs[SV_LAYER] = addr; break;
+   case NV50_SEMANTIC_VIEWPORTINDEX: locs[SV_VIEWPORT_INDEX] = addr; break;
+   default:
+      break;
+   }
+   if (var->sn == TGSI_SEMANTIC_POSITION && masks)
+      masks[0] = var->mask;
+}
+
+void
+TargetNV50::parseDriverInfo(const struct nv50_ir_prog_info *info)
+{
+   unsigned int i;
+   for (i = 0; i < info->numOutputs; ++i)
+      recordLocation(sysvalLocation, NULL, &info->out[i]);
+   for (i = 0; i < info->numInputs; ++i)
+      recordLocation(sysvalLocation, &wposMask, &info->in[i]);
+   for (i = 0; i < info->numSysVals; ++i)
+      recordLocation(sysvalLocation, NULL, &info->sv[i]);
+
+   if (sysvalLocation[SV_POSITION] >= 0x200) {
+      // not assigned by driver, but we need it internally
+      wposMask = 0x8;
+      sysvalLocation[SV_POSITION] = 0;
+   }
+}
+
+} // namespace nv50_ir
diff --git a/src/gallium/drivers/nv50/codegen/nv50_ir_target_nv50.h b/src/gallium/drivers/nv50/codegen/nv50_ir_target_nv50.h
new file mode 100644
index 00000000000..99e6f565612
--- /dev/null
+++ b/src/gallium/drivers/nv50/codegen/nv50_ir_target_nv50.h
@@ -0,0 +1,72 @@
+/*
+ * Copyright 2011 Christoph Bumiller
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF
+ * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "nv50/codegen/nv50_ir_target.h"
+
+namespace nv50_ir {
+
+#define NVC0_BUILTIN_DIV_U32 0
+#define NVC0_BUILTIN_DIV_S32 1
+#define NVC0_BUILTIN_RCP_F64 2
+#define NVC0_BUILTIN_RSQ_F64 3
+
+#define NVC0_BUILTIN_COUNT 4
+
+class TargetNV50 : public Target
+{
+public:
+   TargetNV50(unsigned int chipset);
+
+   virtual CodeEmitter *getCodeEmitter(Program::Type);
+
+   virtual bool runLegalizePass(Program *, CGStage stage) const;
+
+   virtual void getBuiltinCode(const uint32_t **code, uint32_t *size) const;
+
+   virtual void parseDriverInfo(const struct nv50_ir_prog_info *);
+
+   virtual bool insnCanLoad(const Instruction *insn, int s,
+                            const Instruction *ld) const;
+   virtual bool isOpSupported(operation, DataType) const;
+   virtual bool isAccessSupported(DataFile, DataType) const;
+   virtual bool isModSupported(const Instruction *, int s, Modifier) const;
+   virtual bool isSatSupported(const Instruction *) const;
+   virtual bool mayPredicate(const Instruction *, const Value *) const;
+
+   virtual int getLatency(const Instruction *) const;
+   virtual int getThroughput(const Instruction *) const;
+
+   virtual unsigned int getFileSize(DataFile) const;
+   virtual unsigned int getFileUnit(DataFile) const;
+
+   virtual uint32_t getSVAddress(DataFile shaderFile, const Symbol *sv) const;
+
+   uint32_t getBuiltinOffset(int builtin) const;
+
+private:
+   void initOpInfo();
+
+   uint16_t sysvalLocation[SV_LAST + 1];
+   uint8_t wposMask;
+};
+
+} // namespace nv50_ir
author	Christoph Bumiller <[email protected]>	2012-04-14 21:40:35 +0200
committer	Christoph Bumiller <[email protected]>	2012-04-14 21:54:04 +0200
commit	322bc7ed68ed92233c97168c036d0aa50c11a20e (patch)
tree	0ecfa903b0950c71455b6420c4f3550e4abb2007 /src/gallium/drivers/nv50/codegen
parent	15ce0f76e2e014374a292550505f58da88333fb7 (diff)