nvc0: add initial support for nve4+ (Kepler) chipsets

Most things that work on Fermi should work on Kepler too. There are a few performance optimizations left to do, like better placement of texture barriers and adding scheduling data to the shader instructions (without scheduling data, a thread group will be masked for 32 cycles after each single instruction issue).
author: Christoph Bumiller <[email protected]> 2012-04-14 23:56:56 +0200
committer: Christoph Bumiller <[email protected]> 2012-04-15 00:08:51 +0200
commit: e44089b2f79aa2dcaacf348911433d1e21235c0c (patch)
tree: 955d621392f0068ef8e3c98dc46195ff3916525e /src/gallium/drivers/nvc0/codegen
parent: 69a921892d2303f1400576aa73980c28880f8654 (diff)
3 files changed, 110 insertions, 25 deletions
diff --git a/src/gallium/drivers/nvc0/codegen/nv50_ir_emit_nvc0.cpp b/src/gallium/drivers/nvc0/codegen/nv50_ir_emit_nvc0.cpp
index d4fd4da07e7..912540d0c40 100644
--- a/src/gallium/drivers/nvc0/codegen/nv50_ir_emit_nvc0.cpp
+++ b/src/gallium/drivers/nvc0/codegen/nv50_ir_emit_nvc0.cpp
@@ -102,6 +102,7 @@ private:
    void emitSLCT(const CmpInstruction *);
    void emitSELP(const Instruction *);
 
+   void emitTEXBAR(const Instruction *);
    void emitTEX(const TexInstruction *);
    void emitTEXCSAA(const TexInstruction *);
    void emitTXQ(const TexInstruction *);
@@ -938,6 +939,14 @@ void CodeEmitterNVC0::emitSELP(const Instruction *i)
       code[1] |= 1 << 20;
 }
 
+void CodeEmitterNVC0::emitTEXBAR(const Instruction *i) // fills code[0] and code[1] with the encoding of an OP_TEXBAR instruction
+{
+   code[0] = 0x00000006 | (i->subOp << 26); // fixed low bits, with subOp placed at bit 26 and up
+   code[1] = 0xf0000000;
+   emitPredicate(i);
+   emitCondCode(i->predSrc >= 0 ? i->cc : CC_ALWAYS, 5); // i->cc is only used when predSrc is non-negative; otherwise CC_ALWAYS
+}
+
 void CodeEmitterNVC0::emitTEXCSAA(const TexInstruction *i)
 {
    code[0] = 0x00000086;
@@ -1630,6 +1639,9 @@ CodeEmitterNVC0::emitInstruction(Instruction *insn)
    case OP_TXQ:
       emitTXQ(insn->asTex());
       break;
+   case OP_TEXBAR:
+      emitTEXBAR(insn);
+      break;
    case OP_BRA:
    case OP_CALL:
    case OP_PRERET:
diff --git a/src/gallium/drivers/nvc0/codegen/nv50_ir_lowering_nvc0.cpp b/src/gallium/drivers/nvc0/codegen/nv50_ir_lowering_nvc0.cpp
index bd33fbfac5c..318d345efdb 100644
--- a/src/gallium/drivers/nvc0/codegen/nv50_ir_lowering_nvc0.cpp
+++ b/src/gallium/drivers/nvc0/codegen/nv50_ir_lowering_nvc0.cpp
@@ -117,6 +117,9 @@ NVC0LegalizeSSA::visit(BasicBlock *bb)
 
 class NVC0LegalizePostRA : public Pass
 {
+public:
+   NVC0LegalizePostRA(const Program *);
+
 private:
    virtual bool visit(Function *);
    virtual bool visit(BasicBlock *);
@@ -127,8 +130,15 @@ private:
    void propagateJoin(BasicBlock *);
 
    LValue *r63;
+
+   const bool needTexBar;
 };
 
+NVC0LegalizePostRA::NVC0LegalizePostRA(const Program *prog)
+   : needTexBar(prog->getTarget()->getChipset() >= 0xe0) // true for chipset ids >= 0xe0; visit(BasicBlock *) then inserts an OP_TEXBAR after each texture op
+{
+}
+
 bool
 NVC0LegalizePostRA::visit(Function *fn)
 {
@@ -225,6 +235,12 @@ NVC0LegalizePostRA::visit(BasicBlock *bb)
       } else
       if (i->isNop()) {
          bb->remove(i);
+      } else
+      if (needTexBar && isTextureOp(i->op)) {
+         Instruction *bar = new_Instruction(func, OP_TEXBAR, TYPE_NONE);
+         bar->fixed = 1;
+         bar->subOp = 0;
+         bb->insertAfter(i, bar);
       } else {
          if (i->op != OP_MOV && i->op != OP_PFETCH)
             replaceZero(i);
@@ -310,7 +326,61 @@ NVC0LoweringPass::handleTEX(TexInstruction *i)
    const int dim = i->tex.target.getDim() + i->tex.target.isCube();
    const int arg = i->tex.target.getArgCount();
 
-   // generate and move the tsc/tic/array source to the front
+   if (prog->getTarget()->getChipset() >= 0xe0) {
+      if (i->tex.r == i->tex.s) {
+         i->tex.r += 8; // NOTE: offset should probably be a driver option
+         i->tex.s  = 0; // only a single cX[] value possible here
+      } else {
+         // TODO: extract handles and use register to select TIC/TSC entries
+      }
+      if (i->tex.target.isArray()) {
+         LValue *layer = new_LValue(func, FILE_GPR);
+         Value *src = i->getSrc(arg - 1);
+         const int sat = (i->op == OP_TXF) ? 1 : 0;
+         DataType sTy = (i->op == OP_TXF) ? TYPE_U32 : TYPE_F32;
+         bld.mkCvt(OP_CVT, TYPE_U16, layer, sTy, src)->saturate = sat;
+         for (int s = dim; s >= 1; --s)
+            i->setSrc(s, i->getSrc(s - 1));
+         i->setSrc(0, layer);
+      }
+      if (i->tex.rIndirectSrc >= 0 || i->tex.sIndirectSrc >= 0) {
+         Value *tmp[2];
+         Symbol *bind;
+         Value *rRel = i->getIndirectR();
+         Value *sRel = i->getIndirectS();
+         Value *shCnt = bld.loadImm(NULL, 2);
+
+         if (rRel) {
+            tmp[0] = bld.getScratch();
+            bind = bld.mkSymbol(FILE_MEMORY_CONST, 15, TYPE_U32, i->tex.r * 4);
+            bld.mkOp2(OP_SHL, TYPE_U32, tmp[0], rRel, shCnt);
+            tmp[1] = bld.mkLoad(TYPE_U32, bind, tmp[0]);
+            bld.mkOp2(OP_AND, TYPE_U32, tmp[0], tmp[1],
+                      bld.loadImm(tmp[0], 0x00ffffffu));
+            rRel = tmp[0];
+            i->setSrc(i->tex.rIndirectSrc, NULL);
+         }
+         if (sRel) {
+            tmp[0] = bld.getScratch();
+            bind = bld.mkSymbol(FILE_MEMORY_CONST, 15, TYPE_U32, i->tex.s * 4);
+            bld.mkOp2(OP_SHL, TYPE_U32, tmp[0], sRel, shCnt);
+            tmp[1] = bld.mkLoad(TYPE_U32, bind, tmp[0]);
+            bld.mkOp2(OP_AND, TYPE_U32, tmp[0], tmp[1],
+                      bld.loadImm(tmp[0], 0xff000000u));
+            sRel = tmp[0];
+            i->setSrc(i->tex.sIndirectSrc, NULL);
+         }
+         bld.mkOp2(OP_OR, TYPE_U32, rRel, rRel, sRel);
+
+         int min = i->tex.rIndirectSrc;
+         if (min < 0 || min > i->tex.sIndirectSrc)
+            min = i->tex.sIndirectSrc;
+         for (int s = min; s >= 1; --s)
+            i->setSrc(s, i->getSrc(s - 1));
+         i->setSrc(0, rRel);
+      }
+   } else
+   // (nvc0) generate and move the tsc/tic/array source to the front
    if (dim != arg || i->tex.rIndirectSrc >= 0 || i->tex.sIndirectSrc >= 0) {
       LValue *src = new_LValue(func, FILE_GPR); // 0xttxsaaaa
 
@@ -717,7 +787,7 @@ TargetNVC0::runLegalizePass(Program *prog, CGStage stage) const
       return pass.run(prog, false, true);
    } else
    if (stage == CG_STAGE_POST_RA) {
-      NVC0LegalizePostRA pass;
+      NVC0LegalizePostRA pass(prog);
       return pass.run(prog, false, true);
    } else
    if (stage == CG_STAGE_SSA) {
diff --git a/src/gallium/drivers/nvc0/codegen/nv50_ir_target_nvc0.cpp b/src/gallium/drivers/nvc0/codegen/nv50_ir_target_nvc0.cpp
index 04425623bdb..2aa20053c14 100644
--- a/src/gallium/drivers/nvc0/codegen/nv50_ir_target_nvc0.cpp
+++ b/src/gallium/drivers/nvc0/codegen/nv50_ir_target_nvc0.cpp
@@ -42,6 +42,7 @@ TargetNVC0::TargetNVC0(unsigned int card)
 // Will probably make this nicer once we support subroutines properly,
 // i.e. when we have an input IR that provides function declarations.
 
+// TODO: separate version for nve4+ which doesn't like the 4-byte insn formats
 static const uint32_t nvc0_builtin_code[] =
 {
 // DIV U32: slow unsigned integer division
@@ -57,11 +58,11 @@ static const uint32_t nvc0_builtin_code[] =
 //
 #if 1
    0x04009c03, 0x78000000,
-   0x7c209cdd,
-   0x0010dd18,
+   0x7c209c82, 0x38000000, // 0x7c209cdd,
+   0x0400dde2, 0x18000000, // 0x0010dd18,
    0x08309c03, 0x60000000,
-   0x05605c18,
-   0x0810dc2a,
+   0x05205d04, 0x1c000000, // 0x05605c18,
+   0x0810dc03, 0x50000000, // 0x0810dc2a,
    0x0c209c43, 0x20040000,
    0x0810dc03, 0x50000000,
    0x0c209c43, 0x20040000,
@@ -73,15 +74,15 @@ static const uint32_t nvc0_builtin_code[] =
    0x0c209c43, 0x20040000,
    0x0000dde4, 0x28000000,
    0x08001c43, 0x50000000,
-   0x05609c18,
-   0x0010430d,
+   0x05209d04, 0x1c000000, // 0x05609c18,
+   0x00105c03, 0x20060000, // 0x0010430d,
    0x0811dc03, 0x1b0e0000,
    0x08104103, 0x48000000,
    0x04000002, 0x08000000,
    0x0811c003, 0x1b0e0000,
    0x08104103, 0x48000000,
-   0x040000ac,
-   0x90001dff,
+   0x04000002, 0x08000000, // 0x040000ac,
+   0x00001de7, 0x90000000, // 0x90001dff,
 #else
    0x0401dc03, 0x1b0e0000,
    0x00008003, 0x78000000,
@@ -111,27 +112,27 @@ static const uint32_t nvc0_builtin_code[] =
 //
    0xfc05dc23, 0x188e0000,
    0xfc17dc23, 0x18c40000,
-   0x03301e18,
-   0x07305e18,
+   0x01201ec4, 0x1c000000, // 0x03301e18,
+   0x05205ec4, 0x1c000000, // 0x07305e18,
    0x0401dc03, 0x1b0e0000,
    0x00008003, 0x78000000,
    0x0400c003, 0x78000000,
    0x0c20c103, 0x48000000,
    0x0c108003, 0x60000000,
-   0x00005c28,
-   0x00001d18,
+   0x00005de4, 0x28000000, // 0x00005c28,
+   0x00001de2, 0x18000000, // 0x00001d18,
    0x0031c023, 0x1b0ec000,
-   0xb000a1e7, 0x40000000,
+   0xe000a1e7, 0x40000000, // 0xb000a1e7, 0x40000000,
    0x04000003, 0x6000c000,
    0x0813dc03, 0x1b000000,
-   0x0420446c,
-   0x040004bd,
+   0x04204603, 0x48000000, // 0x0420446c,
+   0x04000442, 0x38000000, // 0x040004bd,
    0x04208003, 0x5800c000,
    0x0430c103, 0x4800c000,
-   0x0ffc5dff,
-   0x01700e18,
-   0x05704a18,
-   0x90001dff,
+   0xe0001de7, 0x4003fffe, // 0x0ffc5dff,
+   0x01200f84, 0x1c000000, // 0x01700e18,
+   0x05204b84, 0x1c000000, // 0x05704a18,
+   0x00001de7, 0x90000000, // 0x90001dff,
 
 // RCP F64: Newton Raphson reciprocal(x): r_{i+1} = r_i * (2.0 - x * r_i)
 //
@@ -180,9 +181,9 @@ static const uint32_t nvc0_builtin_code[] =
 static const uint16_t nvc0_builtin_offsets[NVC0_BUILTIN_COUNT] =
 {
    0,
-   8 * (22),
-   8 * (22 + 18),
-   8 * (22 + 18 + 9)
+   8 * (26), // each term is scaled by 8; presumably the count of 8-byte instructions in the preceding builtin(s) — verify against nvc0_builtin_code
+   8 * (26 + 23),
+   8 * (26 + 23 + 9)
 };
 
 void
@@ -270,7 +271,7 @@ void TargetNVC0::initOpInfo()
       OP_STORE, OP_WRSV, OP_EXPORT, OP_BRA, OP_CALL, OP_RET, OP_EXIT,
       OP_DISCARD, OP_CONT, OP_BREAK, OP_PRECONT, OP_PREBREAK, OP_PRERET,
       OP_JOIN, OP_JOINAT, OP_BRKPT, OP_MEMBAR, OP_EMIT, OP_RESTART,
-      OP_QUADON, OP_QUADPOP
+      OP_QUADON, OP_QUADPOP, OP_TEXBAR
    };
 
    joinAnterior = false;
@@ -445,6 +446,8 @@ TargetNVC0::isAccessSupported(DataFile file, DataType ty) const
 {
    if (ty == TYPE_NONE)
       return false;
+   if (file == FILE_MEMORY_CONST && getChipset() >= 0xe0) // wrong encoding ?
+      return typeSizeof(ty) <= 4;
    if (ty == TYPE_B96)
       return (file == FILE_SHADER_INPUT) || (file == FILE_SHADER_OUTPUT);
    return true;
author	Christoph Bumiller <[email protected]>	2012-04-14 23:56:56 +0200
committer	Christoph Bumiller <[email protected]>	2012-04-15 00:08:51 +0200
commit	e44089b2f79aa2dcaacf348911433d1e21235c0c (patch)
tree	955d621392f0068ef8e3c98dc46195ff3916525e /src/gallium/drivers/nvc0/codegen
parent	69a921892d2303f1400576aa73980c28880f8654 (diff)