diff options
author | Christoph Bumiller <[email protected]> | 2012-04-14 23:56:56 +0200 |
---|---|---|
committer | Christoph Bumiller <[email protected]> | 2012-04-15 00:08:51 +0200 |
commit | e44089b2f79aa2dcaacf348911433d1e21235c0c (patch) | |
tree | 955d621392f0068ef8e3c98dc46195ff3916525e /src/gallium/drivers/nvc0/codegen | |
parent | 69a921892d2303f1400576aa73980c28880f8654 (diff) |
nvc0: add initial support for nve4+ (Kepler) chipsets
Most things that work on Fermi should work on Kepler too.
There are a few performance optimizations left to do, like better
placement of texture barriers and adding scheduling data to the
shader instructions (without that scheduling data, a thread group will be
masked for 32 cycles after each single instruction issue).
Diffstat (limited to 'src/gallium/drivers/nvc0/codegen')
3 files changed, 110 insertions, 25 deletions
diff --git a/src/gallium/drivers/nvc0/codegen/nv50_ir_emit_nvc0.cpp b/src/gallium/drivers/nvc0/codegen/nv50_ir_emit_nvc0.cpp index d4fd4da07e7..912540d0c40 100644 --- a/src/gallium/drivers/nvc0/codegen/nv50_ir_emit_nvc0.cpp +++ b/src/gallium/drivers/nvc0/codegen/nv50_ir_emit_nvc0.cpp @@ -102,6 +102,7 @@ private: void emitSLCT(const CmpInstruction *); void emitSELP(const Instruction *); + void emitTEXBAR(const Instruction *); void emitTEX(const TexInstruction *); void emitTEXCSAA(const TexInstruction *); void emitTXQ(const TexInstruction *); @@ -938,6 +939,14 @@ void CodeEmitterNVC0::emitSELP(const Instruction *i) code[1] |= 1 << 20; } +void CodeEmitterNVC0::emitTEXBAR(const Instruction *i) +{ + code[0] = 0x00000006 | (i->subOp << 26); + code[1] = 0xf0000000; + emitPredicate(i); + emitCondCode(i->predSrc >= 0 ? i->cc : CC_ALWAYS, 5); +} + void CodeEmitterNVC0::emitTEXCSAA(const TexInstruction *i) { code[0] = 0x00000086; @@ -1630,6 +1639,9 @@ CodeEmitterNVC0::emitInstruction(Instruction *insn) case OP_TXQ: emitTXQ(insn->asTex()); break; + case OP_TEXBAR: + emitTEXBAR(insn); + break; case OP_BRA: case OP_CALL: case OP_PRERET: diff --git a/src/gallium/drivers/nvc0/codegen/nv50_ir_lowering_nvc0.cpp b/src/gallium/drivers/nvc0/codegen/nv50_ir_lowering_nvc0.cpp index bd33fbfac5c..318d345efdb 100644 --- a/src/gallium/drivers/nvc0/codegen/nv50_ir_lowering_nvc0.cpp +++ b/src/gallium/drivers/nvc0/codegen/nv50_ir_lowering_nvc0.cpp @@ -117,6 +117,9 @@ NVC0LegalizeSSA::visit(BasicBlock *bb) class NVC0LegalizePostRA : public Pass { +public: + NVC0LegalizePostRA(const Program *); + private: virtual bool visit(Function *); virtual bool visit(BasicBlock *); @@ -127,8 +130,15 @@ private: void propagateJoin(BasicBlock *); LValue *r63; + + const bool needTexBar; }; +NVC0LegalizePostRA::NVC0LegalizePostRA(const Program *prog) + : needTexBar(prog->getTarget()->getChipset() >= 0xe0) +{ +} + bool NVC0LegalizePostRA::visit(Function *fn) { @@ -225,6 +235,12 @@ 
NVC0LegalizePostRA::visit(BasicBlock *bb) } else if (i->isNop()) { bb->remove(i); + } else + if (needTexBar && isTextureOp(i->op)) { + Instruction *bar = new_Instruction(func, OP_TEXBAR, TYPE_NONE); + bar->fixed = 1; + bar->subOp = 0; + bb->insertAfter(i, bar); } else { if (i->op != OP_MOV && i->op != OP_PFETCH) replaceZero(i); @@ -310,7 +326,61 @@ NVC0LoweringPass::handleTEX(TexInstruction *i) const int dim = i->tex.target.getDim() + i->tex.target.isCube(); const int arg = i->tex.target.getArgCount(); - // generate and move the tsc/tic/array source to the front + if (prog->getTarget()->getChipset() >= 0xe0) { + if (i->tex.r == i->tex.s) { + i->tex.r += 8; // NOTE: offset should probably be a driver option + i->tex.s = 0; // only a single cX[] value possible here + } else { + // TODO: extract handles and use register to select TIC/TSC entries + } + if (i->tex.target.isArray()) { + LValue *layer = new_LValue(func, FILE_GPR); + Value *src = i->getSrc(arg - 1); + const int sat = (i->op == OP_TXF) ? 1 : 0; + DataType sTy = (i->op == OP_TXF) ? 
TYPE_U32 : TYPE_F32; + bld.mkCvt(OP_CVT, TYPE_U16, layer, sTy, src)->saturate = sat; + for (int s = dim; s >= 1; --s) + i->setSrc(s, i->getSrc(s - 1)); + i->setSrc(0, layer); + } + if (i->tex.rIndirectSrc >= 0 || i->tex.sIndirectSrc >= 0) { + Value *tmp[2]; + Symbol *bind; + Value *rRel = i->getIndirectR(); + Value *sRel = i->getIndirectS(); + Value *shCnt = bld.loadImm(NULL, 2); + + if (rRel) { + tmp[0] = bld.getScratch(); + bind = bld.mkSymbol(FILE_MEMORY_CONST, 15, TYPE_U32, i->tex.r * 4); + bld.mkOp2(OP_SHL, TYPE_U32, tmp[0], rRel, shCnt); + tmp[1] = bld.mkLoad(TYPE_U32, bind, tmp[0]); + bld.mkOp2(OP_AND, TYPE_U32, tmp[0], tmp[1], + bld.loadImm(tmp[0], 0x00ffffffu)); + rRel = tmp[0]; + i->setSrc(i->tex.rIndirectSrc, NULL); + } + if (sRel) { + tmp[0] = bld.getScratch(); + bind = bld.mkSymbol(FILE_MEMORY_CONST, 15, TYPE_U32, i->tex.s * 4); + bld.mkOp2(OP_SHL, TYPE_U32, tmp[0], sRel, shCnt); + tmp[1] = bld.mkLoad(TYPE_U32, bind, tmp[0]); + bld.mkOp2(OP_AND, TYPE_U32, tmp[0], tmp[1], + bld.loadImm(tmp[0], 0xff000000u)); + sRel = tmp[0]; + i->setSrc(i->tex.sIndirectSrc, NULL); + } + bld.mkOp2(OP_OR, TYPE_U32, rRel, rRel, sRel); + + int min = i->tex.rIndirectSrc; + if (min < 0 || min > i->tex.sIndirectSrc) + min = i->tex.sIndirectSrc; + for (int s = min; s >= 1; --s) + i->setSrc(s, i->getSrc(s - 1)); + i->setSrc(0, rRel); + } + } else + // (nvc0) generate and move the tsc/tic/array source to the front if (dim != arg || i->tex.rIndirectSrc >= 0 || i->tex.sIndirectSrc >= 0) { LValue *src = new_LValue(func, FILE_GPR); // 0xttxsaaaa @@ -717,7 +787,7 @@ TargetNVC0::runLegalizePass(Program *prog, CGStage stage) const return pass.run(prog, false, true); } else if (stage == CG_STAGE_POST_RA) { - NVC0LegalizePostRA pass; + NVC0LegalizePostRA pass(prog); return pass.run(prog, false, true); } else if (stage == CG_STAGE_SSA) { diff --git a/src/gallium/drivers/nvc0/codegen/nv50_ir_target_nvc0.cpp b/src/gallium/drivers/nvc0/codegen/nv50_ir_target_nvc0.cpp index 
04425623bdb..2aa20053c14 100644 --- a/src/gallium/drivers/nvc0/codegen/nv50_ir_target_nvc0.cpp +++ b/src/gallium/drivers/nvc0/codegen/nv50_ir_target_nvc0.cpp @@ -42,6 +42,7 @@ TargetNVC0::TargetNVC0(unsigned int card) // Will probably make this nicer once we support subroutines properly, // i.e. when we have an input IR that provides function declarations. +// TODO: separate version for nve4+ which doesn't like the 4-byte insn formats static const uint32_t nvc0_builtin_code[] = { // DIV U32: slow unsigned integer division @@ -57,11 +58,11 @@ static const uint32_t nvc0_builtin_code[] = // #if 1 0x04009c03, 0x78000000, - 0x7c209cdd, - 0x0010dd18, + 0x7c209c82, 0x38000000, // 0x7c209cdd, + 0x0400dde2, 0x18000000, // 0x0010dd18, 0x08309c03, 0x60000000, - 0x05605c18, - 0x0810dc2a, + 0x05205d04, 0x1c000000, // 0x05605c18, + 0x0810dc03, 0x50000000, // 0x0810dc2a, 0x0c209c43, 0x20040000, 0x0810dc03, 0x50000000, 0x0c209c43, 0x20040000, @@ -73,15 +74,15 @@ static const uint32_t nvc0_builtin_code[] = 0x0c209c43, 0x20040000, 0x0000dde4, 0x28000000, 0x08001c43, 0x50000000, - 0x05609c18, - 0x0010430d, + 0x05209d04, 0x1c000000, // 0x05609c18, + 0x00105c03, 0x20060000, // 0x0010430d, 0x0811dc03, 0x1b0e0000, 0x08104103, 0x48000000, 0x04000002, 0x08000000, 0x0811c003, 0x1b0e0000, 0x08104103, 0x48000000, - 0x040000ac, - 0x90001dff, + 0x04000002, 0x08000000, // 0x040000ac, + 0x00001de7, 0x90000000, // 0x90001dff, #else 0x0401dc03, 0x1b0e0000, 0x00008003, 0x78000000, @@ -111,27 +112,27 @@ static const uint32_t nvc0_builtin_code[] = // 0xfc05dc23, 0x188e0000, 0xfc17dc23, 0x18c40000, - 0x03301e18, - 0x07305e18, + 0x01201ec4, 0x1c000000, // 0x03301e18, + 0x05205ec4, 0x1c000000, // 0x07305e18, 0x0401dc03, 0x1b0e0000, 0x00008003, 0x78000000, 0x0400c003, 0x78000000, 0x0c20c103, 0x48000000, 0x0c108003, 0x60000000, - 0x00005c28, - 0x00001d18, + 0x00005de4, 0x28000000, // 0x00005c28, + 0x00001de2, 0x18000000, // 0x00001d18, 0x0031c023, 0x1b0ec000, - 0xb000a1e7, 0x40000000, + 0xe000a1e7, 
0x40000000, // 0xb000a1e7, 0x40000000, 0x04000003, 0x6000c000, 0x0813dc03, 0x1b000000, - 0x0420446c, - 0x040004bd, + 0x04204603, 0x48000000, // 0x0420446c, + 0x04000442, 0x38000000, // 0x040004bd, 0x04208003, 0x5800c000, 0x0430c103, 0x4800c000, - 0x0ffc5dff, - 0x01700e18, - 0x05704a18, - 0x90001dff, + 0xe0001de7, 0x4003fffe, // 0x0ffc5dff, + 0x01200f84, 0x1c000000, // 0x01700e18, + 0x05204b84, 0x1c000000, // 0x05704a18, + 0x00001de7, 0x90000000, // 0x90001dff, // RCP F64: Newton Raphson reciprocal(x): r_{i+1} = r_i * (2.0 - x * r_i) // @@ -180,9 +181,9 @@ static const uint32_t nvc0_builtin_code[] = static const uint16_t nvc0_builtin_offsets[NVC0_BUILTIN_COUNT] = { 0, - 8 * (22), - 8 * (22 + 18), - 8 * (22 + 18 + 9) + 8 * (26), + 8 * (26 + 23), + 8 * (26 + 23 + 9) }; void @@ -270,7 +271,7 @@ void TargetNVC0::initOpInfo() OP_STORE, OP_WRSV, OP_EXPORT, OP_BRA, OP_CALL, OP_RET, OP_EXIT, OP_DISCARD, OP_CONT, OP_BREAK, OP_PRECONT, OP_PREBREAK, OP_PRERET, OP_JOIN, OP_JOINAT, OP_BRKPT, OP_MEMBAR, OP_EMIT, OP_RESTART, - OP_QUADON, OP_QUADPOP + OP_QUADON, OP_QUADPOP, OP_TEXBAR }; joinAnterior = false; @@ -445,6 +446,8 @@ TargetNVC0::isAccessSupported(DataFile file, DataType ty) const { if (ty == TYPE_NONE) return false; + if (file == FILE_MEMORY_CONST && getChipset() >= 0xe0) // wrong encoding ? + return typeSizeof(ty) <= 4; if (ty == TYPE_B96) return (file == FILE_SHADER_INPUT) || (file == FILE_SHADER_OUTPUT); return true; |