summaryrefslogtreecommitdiffstats
path: root/src/gallium/drivers/nvc0/codegen
diff options
context:
space:
mode:
authorChristoph Bumiller <[email protected]>2012-04-14 23:56:56 +0200
committerChristoph Bumiller <[email protected]>2012-04-15 00:08:51 +0200
commite44089b2f79aa2dcaacf348911433d1e21235c0c (patch)
tree955d621392f0068ef8e3c98dc46195ff3916525e /src/gallium/drivers/nvc0/codegen
parent69a921892d2303f1400576aa73980c28880f8654 (diff)
nvc0: add initial support for nve4+ (Kepler) chipsets
Most things that work on Fermi should work on Kepler too. There are a few performance optimizations left to do, like better placement of texture barriers and adding scheduling data to the shader instructions (without them, a thread group will be masked for 32 cycles after each single instruction issue).
Diffstat (limited to 'src/gallium/drivers/nvc0/codegen')
-rw-r--r--src/gallium/drivers/nvc0/codegen/nv50_ir_emit_nvc0.cpp12
-rw-r--r--src/gallium/drivers/nvc0/codegen/nv50_ir_lowering_nvc0.cpp74
-rw-r--r--src/gallium/drivers/nvc0/codegen/nv50_ir_target_nvc0.cpp49
3 files changed, 110 insertions, 25 deletions
diff --git a/src/gallium/drivers/nvc0/codegen/nv50_ir_emit_nvc0.cpp b/src/gallium/drivers/nvc0/codegen/nv50_ir_emit_nvc0.cpp
index d4fd4da07e7..912540d0c40 100644
--- a/src/gallium/drivers/nvc0/codegen/nv50_ir_emit_nvc0.cpp
+++ b/src/gallium/drivers/nvc0/codegen/nv50_ir_emit_nvc0.cpp
@@ -102,6 +102,7 @@ private:
void emitSLCT(const CmpInstruction *);
void emitSELP(const Instruction *);
+ void emitTEXBAR(const Instruction *);
void emitTEX(const TexInstruction *);
void emitTEXCSAA(const TexInstruction *);
void emitTXQ(const TexInstruction *);
@@ -938,6 +939,14 @@ void CodeEmitterNVC0::emitSELP(const Instruction *i)
code[1] |= 1 << 20;
}
+void CodeEmitterNVC0::emitTEXBAR(const Instruction *i) // encodes OP_TEXBAR into code[0..1]
+{
+ code[0] = 0x00000006 | (i->subOp << 26); // subOp is placed starting at bit 26 of the first word
+ code[1] = 0xf0000000;
+ emitPredicate(i); // the predicate (if any) is encoded here, independently of the condition code below
+ emitCondCode(i->flagsSrc >= 0 ? i->cc : CC_ALWAYS, 5); // i->cc is only a flags condition when a flags source exists; gating on predSrc would feed a predicate-only cc into the cond-code field
+}
+
+
void CodeEmitterNVC0::emitTEXCSAA(const TexInstruction *i)
{
code[0] = 0x00000086;
@@ -1630,6 +1639,9 @@ CodeEmitterNVC0::emitInstruction(Instruction *insn)
case OP_TXQ:
emitTXQ(insn->asTex());
break;
+ case OP_TEXBAR:
+ emitTEXBAR(insn);
+ break;
case OP_BRA:
case OP_CALL:
case OP_PRERET:
diff --git a/src/gallium/drivers/nvc0/codegen/nv50_ir_lowering_nvc0.cpp b/src/gallium/drivers/nvc0/codegen/nv50_ir_lowering_nvc0.cpp
index bd33fbfac5c..318d345efdb 100644
--- a/src/gallium/drivers/nvc0/codegen/nv50_ir_lowering_nvc0.cpp
+++ b/src/gallium/drivers/nvc0/codegen/nv50_ir_lowering_nvc0.cpp
@@ -117,6 +117,9 @@ NVC0LegalizeSSA::visit(BasicBlock *bb)
class NVC0LegalizePostRA : public Pass
{
+public:
+ NVC0LegalizePostRA(const Program *);
+
private:
virtual bool visit(Function *);
virtual bool visit(BasicBlock *);
@@ -127,8 +130,15 @@ private:
void propagateJoin(BasicBlock *);
LValue *r63;
+
+ const bool needTexBar;
};
+NVC0LegalizePostRA::NVC0LegalizePostRA(const Program *prog) // prog is only queried for its target's chipset id
+ : needTexBar(prog->getTarget()->getChipset() >= 0xe0) // const flag fixed at construction; true for chipset ids >= 0xe0 (the nve4+/Kepler parts this commit adds)
+{
+}
+
bool
NVC0LegalizePostRA::visit(Function *fn)
{
@@ -225,6 +235,12 @@ NVC0LegalizePostRA::visit(BasicBlock *bb)
} else
if (i->isNop()) {
bb->remove(i);
+ } else
+ if (needTexBar && isTextureOp(i->op)) {
+ Instruction *bar = new_Instruction(func, OP_TEXBAR, TYPE_NONE);
+ bar->fixed = 1;
+ bar->subOp = 0;
+ bb->insertAfter(i, bar);
} else {
if (i->op != OP_MOV && i->op != OP_PFETCH)
replaceZero(i);
@@ -310,7 +326,61 @@ NVC0LoweringPass::handleTEX(TexInstruction *i)
const int dim = i->tex.target.getDim() + i->tex.target.isCube();
const int arg = i->tex.target.getArgCount();
- // generate and move the tsc/tic/array source to the front
+ if (prog->getTarget()->getChipset() >= 0xe0) {
+ if (i->tex.r == i->tex.s) {
+ i->tex.r += 8; // NOTE: offset should probably be a driver option
+ i->tex.s = 0; // only a single cX[] value possible here
+ } else {
+ // TODO: extract handles and use register to select TIC/TSC entries
+ }
+ if (i->tex.target.isArray()) {
+ LValue *layer = new_LValue(func, FILE_GPR);
+ Value *src = i->getSrc(arg - 1);
+ const int sat = (i->op == OP_TXF) ? 1 : 0;
+ DataType sTy = (i->op == OP_TXF) ? TYPE_U32 : TYPE_F32;
+ bld.mkCvt(OP_CVT, TYPE_U16, layer, sTy, src)->saturate = sat;
+ for (int s = dim; s >= 1; --s)
+ i->setSrc(s, i->getSrc(s - 1));
+ i->setSrc(0, layer);
+ }
+ if (i->tex.rIndirectSrc >= 0 || i->tex.sIndirectSrc >= 0) {
+ Value *tmp[2];
+ Symbol *bind;
+ Value *rRel = i->getIndirectR();
+ Value *sRel = i->getIndirectS();
+ Value *shCnt = bld.loadImm(NULL, 2);
+
+ if (rRel) {
+ tmp[0] = bld.getScratch();
+ bind = bld.mkSymbol(FILE_MEMORY_CONST, 15, TYPE_U32, i->tex.r * 4);
+ bld.mkOp2(OP_SHL, TYPE_U32, tmp[0], rRel, shCnt);
+ tmp[1] = bld.mkLoad(TYPE_U32, bind, tmp[0]);
+ bld.mkOp2(OP_AND, TYPE_U32, tmp[0], tmp[1],
+ bld.loadImm(tmp[0], 0x00ffffffu));
+ rRel = tmp[0];
+ i->setSrc(i->tex.rIndirectSrc, NULL);
+ }
+ if (sRel) {
+ tmp[0] = bld.getScratch();
+ bind = bld.mkSymbol(FILE_MEMORY_CONST, 15, TYPE_U32, i->tex.s * 4);
+ bld.mkOp2(OP_SHL, TYPE_U32, tmp[0], sRel, shCnt);
+ tmp[1] = bld.mkLoad(TYPE_U32, bind, tmp[0]);
+ bld.mkOp2(OP_AND, TYPE_U32, tmp[0], tmp[1],
+ bld.loadImm(tmp[0], 0xff000000u));
+ sRel = tmp[0];
+ i->setSrc(i->tex.sIndirectSrc, NULL);
+ }
+ bld.mkOp2(OP_OR, TYPE_U32, rRel, rRel, sRel);
+
+ int min = i->tex.rIndirectSrc;
+ if (min < 0 || min > i->tex.sIndirectSrc)
+ min = i->tex.sIndirectSrc;
+ for (int s = min; s >= 1; --s)
+ i->setSrc(s, i->getSrc(s - 1));
+ i->setSrc(0, rRel);
+ }
+ } else
+ // (nvc0) generate and move the tsc/tic/array source to the front
if (dim != arg || i->tex.rIndirectSrc >= 0 || i->tex.sIndirectSrc >= 0) {
LValue *src = new_LValue(func, FILE_GPR); // 0xttxsaaaa
@@ -717,7 +787,7 @@ TargetNVC0::runLegalizePass(Program *prog, CGStage stage) const
return pass.run(prog, false, true);
} else
if (stage == CG_STAGE_POST_RA) {
- NVC0LegalizePostRA pass;
+ NVC0LegalizePostRA pass(prog);
return pass.run(prog, false, true);
} else
if (stage == CG_STAGE_SSA) {
diff --git a/src/gallium/drivers/nvc0/codegen/nv50_ir_target_nvc0.cpp b/src/gallium/drivers/nvc0/codegen/nv50_ir_target_nvc0.cpp
index 04425623bdb..2aa20053c14 100644
--- a/src/gallium/drivers/nvc0/codegen/nv50_ir_target_nvc0.cpp
+++ b/src/gallium/drivers/nvc0/codegen/nv50_ir_target_nvc0.cpp
@@ -42,6 +42,7 @@ TargetNVC0::TargetNVC0(unsigned int card)
// Will probably make this nicer once we support subroutines properly,
// i.e. when we have an input IR that provides function declarations.
+// TODO: separate version for nve4+ which doesn't like the 4-byte insn formats
static const uint32_t nvc0_builtin_code[] =
{
// DIV U32: slow unsigned integer division
@@ -57,11 +58,11 @@ static const uint32_t nvc0_builtin_code[] =
//
#if 1
0x04009c03, 0x78000000,
- 0x7c209cdd,
- 0x0010dd18,
+ 0x7c209c82, 0x38000000, // 0x7c209cdd,
+ 0x0400dde2, 0x18000000, // 0x0010dd18,
0x08309c03, 0x60000000,
- 0x05605c18,
- 0x0810dc2a,
+ 0x05205d04, 0x1c000000, // 0x05605c18,
+ 0x0810dc03, 0x50000000, // 0x0810dc2a,
0x0c209c43, 0x20040000,
0x0810dc03, 0x50000000,
0x0c209c43, 0x20040000,
@@ -73,15 +74,15 @@ static const uint32_t nvc0_builtin_code[] =
0x0c209c43, 0x20040000,
0x0000dde4, 0x28000000,
0x08001c43, 0x50000000,
- 0x05609c18,
- 0x0010430d,
+ 0x05209d04, 0x1c000000, // 0x05609c18,
+ 0x00105c03, 0x20060000, // 0x0010430d,
0x0811dc03, 0x1b0e0000,
0x08104103, 0x48000000,
0x04000002, 0x08000000,
0x0811c003, 0x1b0e0000,
0x08104103, 0x48000000,
- 0x040000ac,
- 0x90001dff,
+ 0x04000002, 0x08000000, // 0x040000ac,
+ 0x00001de7, 0x90000000, // 0x90001dff,
#else
0x0401dc03, 0x1b0e0000,
0x00008003, 0x78000000,
@@ -111,27 +112,27 @@ static const uint32_t nvc0_builtin_code[] =
//
0xfc05dc23, 0x188e0000,
0xfc17dc23, 0x18c40000,
- 0x03301e18,
- 0x07305e18,
+ 0x01201ec4, 0x1c000000, // 0x03301e18,
+ 0x05205ec4, 0x1c000000, // 0x07305e18,
0x0401dc03, 0x1b0e0000,
0x00008003, 0x78000000,
0x0400c003, 0x78000000,
0x0c20c103, 0x48000000,
0x0c108003, 0x60000000,
- 0x00005c28,
- 0x00001d18,
+ 0x00005de4, 0x28000000, // 0x00005c28,
+ 0x00001de2, 0x18000000, // 0x00001d18,
0x0031c023, 0x1b0ec000,
- 0xb000a1e7, 0x40000000,
+ 0xe000a1e7, 0x40000000, // 0xb000a1e7, 0x40000000,
0x04000003, 0x6000c000,
0x0813dc03, 0x1b000000,
- 0x0420446c,
- 0x040004bd,
+ 0x04204603, 0x48000000, // 0x0420446c,
+ 0x04000442, 0x38000000, // 0x040004bd,
0x04208003, 0x5800c000,
0x0430c103, 0x4800c000,
- 0x0ffc5dff,
- 0x01700e18,
- 0x05704a18,
- 0x90001dff,
+ 0xe0001de7, 0x4003fffe, // 0x0ffc5dff,
+ 0x01200f84, 0x1c000000, // 0x01700e18,
+ 0x05204b84, 0x1c000000, // 0x05704a18,
+ 0x00001de7, 0x90000000, // 0x90001dff,
// RCP F64: Newton Raphson reciprocal(x): r_{i+1} = r_i * (2.0 - x * r_i)
//
@@ -180,9 +181,9 @@ static const uint32_t nvc0_builtin_code[] =
static const uint16_t nvc0_builtin_offsets[NVC0_BUILTIN_COUNT] =
{
0,
- 8 * (22),
- 8 * (22 + 18),
- 8 * (22 + 18 + 9)
+ 8 * (26), // 22 -> 26: the first builtin's hunks replace 8 single-word entries with 8 two-word entries (+4 units of 8 bytes)
+ 8 * (26 + 23), // 18 -> 23: the hunk before the "RCP F64" comment replaces 10 single-word entries with two-word ones (+5)
+ 8 * (26 + 23 + 9) // third count (9) is unchanged; only the preceding sums moved
};
void
@@ -270,7 +271,7 @@ void TargetNVC0::initOpInfo()
OP_STORE, OP_WRSV, OP_EXPORT, OP_BRA, OP_CALL, OP_RET, OP_EXIT,
OP_DISCARD, OP_CONT, OP_BREAK, OP_PRECONT, OP_PREBREAK, OP_PRERET,
OP_JOIN, OP_JOINAT, OP_BRKPT, OP_MEMBAR, OP_EMIT, OP_RESTART,
- OP_QUADON, OP_QUADPOP
+ OP_QUADON, OP_QUADPOP, OP_TEXBAR
};
joinAnterior = false;
@@ -445,6 +446,8 @@ TargetNVC0::isAccessSupported(DataFile file, DataType ty) const
{
if (ty == TYPE_NONE)
return false;
+ if (file == FILE_MEMORY_CONST && getChipset() >= 0xe0) // wrong encoding ?
+ return typeSizeof(ty) <= 4;
if (ty == TYPE_B96)
return (file == FILE_SHADER_INPUT) || (file == FILE_SHADER_OUTPUT);
return true;