3 files changed, 1027 insertions, 4 deletions
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp
index 5bd0fa0a075..f3ebd2b0be8 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp
@@ -24,6 +24,8 @@
 
 #include "codegen/nv50_ir_target_gm107.h"
 
+//#define GM107_DEBUG_SCHED_DATA
+
 namespace nv50_ir {
 
 class CodeEmitterGM107 : public CodeEmitter
@@ -3374,20 +3376,781 @@ CodeEmitterGM107::getMinEncodingSize(const Instruction *i) const
 class SchedDataCalculatorGM107 : public Pass
 {
 public:
-   SchedDataCalculatorGM107(const Target *targ) : targ(targ) {}
+   SchedDataCalculatorGM107(const TargetGM107 *targ) : targ(targ) {}
+
 private:
-   const Target *targ;
-   bool visit(BasicBlock *bb);
+   struct RegScores
+   {
+      struct ScoreData {
+         int r[256];
+         int p[8];
+         int c;
+      } rd, wr;
+      int base;
+
+      void rebase(const int base)
+      {
+         const int delta = this->base - base;
+         if (!delta)
+            return;
+         this->base = 0;
+
+         for (int i = 0; i < 256; ++i) {
+            rd.r[i] += delta;
+            wr.r[i] += delta;
+         }
+         for (int i = 0; i < 8; ++i) {
+            rd.p[i] += delta;
+            wr.p[i] += delta;
+         }
+         rd.c += delta;
+         wr.c += delta;
+      }
+      void wipe()
+      {
+         memset(&rd, 0, sizeof(rd));
+         memset(&wr, 0, sizeof(wr));
+      }
+      int getLatest(const ScoreData& d) const
+      {
+         int max = 0;
+         for (int i = 0; i < 256; ++i)
+            if (d.r[i] > max)
+               max = d.r[i];
+         for (int i = 0; i < 8; ++i)
+            if (d.p[i] > max)
+               max = d.p[i];
+         if (d.c > max)
+            max = d.c;
+         return max;
+      }
+      inline int getLatestRd() const
+      {
+         return getLatest(rd);
+      }
+      inline int getLatestWr() const
+      {
+         return getLatest(wr);
+      }
+      inline int getLatest() const
+      {
+         return MAX2(getLatestRd(), getLatestWr());
+      }
+      void setMax(const RegScores *that)
+      {
+         for (int i = 0; i < 256; ++i) {
+            rd.r[i] = MAX2(rd.r[i], that->rd.r[i]);
+            wr.r[i] = MAX2(wr.r[i], that->wr.r[i]);
+         }
+         for (int i = 0; i < 8; ++i) {
+            rd.p[i] = MAX2(rd.p[i], that->rd.p[i]);
+            wr.p[i] = MAX2(wr.p[i], that->wr.p[i]);
+         }
+         rd.c = MAX2(rd.c, that->rd.c);
+         wr.c = MAX2(wr.c, that->wr.c);
+      }
+      void print(int cycle)
+      {
+         for (int i = 0; i < 256; ++i) {
+            if (rd.r[i] > cycle)
+               INFO("rd $r%i @ %i\n", i, rd.r[i]);
+            if (wr.r[i] > cycle)
+               INFO("wr $r%i @ %i\n", i, wr.r[i]);
+         }
+         for (int i = 0; i < 8; ++i) {
+            if (rd.p[i] > cycle)
+               INFO("rd $p%i @ %i\n", i, rd.p[i]);
+            if (wr.p[i] > cycle)
+               INFO("wr $p%i @ %i\n", i, wr.p[i]);
+         }
+         if (rd.c > cycle)
+            INFO("rd $c @ %i\n", rd.c);
+         if (wr.c > cycle)
+            INFO("wr $c @ %i\n", wr.c);
+      }
+   };
+
+   RegScores *score; // for current BB
+   std::vector<RegScores> scoreBoards;
+
+   const TargetGM107 *targ;
+   bool visit(Function *);
+   bool visit(BasicBlock *);
+
+   void commitInsn(const Instruction *, int);
+   int calcDelay(const Instruction *, int) const;
+   void setDelay(Instruction *, int, const Instruction *);
+   void recordWr(const Value *, int, int);
+   void checkRd(const Value *, int, int&) const;
+
+   inline void emitYield(Instruction *);
+   inline void emitStall(Instruction *, uint8_t);
+   inline void emitReuse(Instruction *, uint8_t);
+   inline void emitWrDepBar(Instruction *, uint8_t);
+   inline void emitRdDepBar(Instruction *, uint8_t);
+   inline void emitWtDepBar(Instruction *, uint8_t);
+
+   inline int getStall(const Instruction *) const;
+   inline int getWrDepBar(const Instruction *) const;
+   inline int getRdDepBar(const Instruction *) const;
+   inline int getWtDepBar(const Instruction *) const;
+
+   void setReuseFlag(Instruction *);
+
+   inline void printSchedInfo(int, const Instruction *) const;
+
+   struct LiveBarUse {
+      LiveBarUse(Instruction *insn, Instruction *usei)
+         : insn(insn), usei(usei) { }
+      Instruction *insn;
+      Instruction *usei;
+   };
+
+   struct LiveBarDef {
+      LiveBarDef(Instruction *insn, Instruction *defi)
+         : insn(insn), defi(defi) { }
+      Instruction *insn;
+      Instruction *defi;
+   };
+
+   bool insertBarriers(BasicBlock *);
+
+   Instruction *findFirstUse(const Instruction *) const;
+   Instruction *findFirstDef(const Instruction *) const;
+
+   bool needRdDepBar(const Instruction *) const;
+   bool needWrDepBar(const Instruction *) const;
 };
 
+inline void
+SchedDataCalculatorGM107::emitStall(Instruction *insn, uint8_t cnt)
+{
+   assert(cnt < 16);
+   insn->sched |= cnt;
+}
+
+inline void
+SchedDataCalculatorGM107::emitYield(Instruction *insn)
+{
+   insn->sched |= 1 << 4;
+}
+
+inline void
+SchedDataCalculatorGM107::emitWrDepBar(Instruction *insn, uint8_t id)
+{
+   assert(id < 6);
+   if ((insn->sched & 0xe0) == 0xe0)
+      insn->sched ^= 0xe0;
+   insn->sched |= id << 5;
+}
+
+inline void
+SchedDataCalculatorGM107::emitRdDepBar(Instruction *insn, uint8_t id)
+{
+   assert(id < 6);
+   if ((insn->sched & 0x700) == 0x700)
+      insn->sched ^= 0x700;
+   insn->sched |= id << 8;
+}
+
+inline void
+SchedDataCalculatorGM107::emitWtDepBar(Instruction *insn, uint8_t id)
+{
+   assert(id < 6);
+   insn->sched |= 1 << (11 + id);
+}
+
+inline void
+SchedDataCalculatorGM107::emitReuse(Instruction *insn, uint8_t id)
+{
+   assert(id < 4);
+   insn->sched |= 1 << (17 + id);
+}
+
+inline void
+SchedDataCalculatorGM107::printSchedInfo(int cycle,
+                                         const Instruction *insn) const
+{
+   uint8_t st, yl, wr, rd, wt, ru;
+
+   st = (insn->sched & 0x00000f) >> 0;
+   yl = (insn->sched & 0x000010) >> 4;
+   wr = (insn->sched & 0x0000e0) >> 5;
+   rd = (insn->sched & 0x000700) >> 8;
+   wt = (insn->sched & 0x01f800) >> 11;
+   ru = (insn->sched & 0x1e0000) >> 17;
+
+   INFO("cycle %i, (st 0x%x, yl 0x%x, wr 0x%x, rd 0x%x, wt 0x%x, ru 0x%x)\n",
+        cycle, st, yl, wr, rd, wt, ru);
+}
+
+inline int
+SchedDataCalculatorGM107::getStall(const Instruction *insn) const
+{
+   return insn->sched & 0xf;
+}
+
+inline int
+SchedDataCalculatorGM107::getWrDepBar(const Instruction *insn) const
+{
+   return (insn->sched & 0x0000e0) >> 5;
+}
+
+inline int
+SchedDataCalculatorGM107::getRdDepBar(const Instruction *insn) const
+{
+   return (insn->sched & 0x000700) >> 8;
+}
+
+inline int
+SchedDataCalculatorGM107::getWtDepBar(const Instruction *insn) const
+{
+   return (insn->sched & 0x01f800) >> 11;
+}
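The accessors above fix the layout of the 21-bit control word stored in insn->sched. As an illustration only (not part of the patch), here is a self-contained sketch that decodes a word using the same masks as printSchedInfo(); 0x7e0 is the conservative default the pass starts from, and a dep-bar field of 7 appears to mean "no barrier" (emitWrDepBar()/emitRdDepBar() clear an all-ones field before writing an id):

    #include <cstdint>
    #include <cstdio>

    // Decode a GM107 control word with the masks used by printSchedInfo().
    static void decodeSched(uint32_t sched)
    {
       printf("st %u yl %u wr %u rd %u wt 0x%02x ru 0x%x\n",
              (unsigned)(sched & 0xf),          // stall count (0..15)
              (unsigned)((sched >> 4) & 0x1),   // yield flag
              (unsigned)((sched >> 5) & 0x7),   // write dep-bar id (7 = none)
              (unsigned)((sched >> 8) & 0x7),   // read dep-bar id (7 = none)
              (unsigned)((sched >> 11) & 0x3f), // wait mask, one bit per barrier
              (unsigned)((sched >> 17) & 0xf)); // operand reuse mask
    }

    int main()
    {
       decodeSched(0x7e0); // prints: st 0 yl 0 wr 7 rd 7 wt 0x00 ru 0x0
    }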
+
+// Emit the reuse flag which makes use of the operand reuse cache, a new part
+// of the memory hierarchy introduced with Maxwell.
+//
+// The cache helps to reduce bank conflicts by caching operands. Each time an
+// instruction is issued, this flag can tell the hardware which operands will
+// be re-used by the next instruction. Note that the next instruction has to
+// use the same GPR id in the same operand slot.
+void
+SchedDataCalculatorGM107::setReuseFlag(Instruction *insn)
+{
+   Instruction *next = insn->next;
+   BitSet defs(255, 1);
+
+   if (!targ->isReuseSupported(insn))
+      return;
+
+   for (int d = 0; insn->defExists(d); ++d) {
+      const Value *def = insn->def(d).rep();
+      if (insn->def(d).getFile() != FILE_GPR)
+         continue;
+      if (typeSizeof(insn->dType) != 4 || def->reg.data.id == 255)
+         continue;
+      defs.set(def->reg.data.id);
+   }
+
+   for (int s = 0; insn->srcExists(s); s++) {
+      const Value *src = insn->src(s).rep();
+      if (insn->src(s).getFile() != FILE_GPR)
+         continue;
+      if (typeSizeof(insn->sType) != 4 || src->reg.data.id == 255)
+         continue;
+      if (defs.test(src->reg.data.id))
+         continue;
+      if (!next->srcExists(s) || next->src(s).getFile() != FILE_GPR)
+         continue;
+      if (src->reg.data.id != next->getSrc(s)->reg.data.id)
+         continue;
+      assert(s < 4);
+      emitReuse(insn, s);
+   }
+}
+
+void
+SchedDataCalculatorGM107::recordWr(const Value *v, int cycle, int ready)
+{
+   int a = v->reg.data.id, b;
+
+   switch (v->reg.file) {
+   case FILE_GPR:
+      b = a + v->reg.size / 4;
+      for (int r = a; r < b; ++r)
+         score->rd.r[r] = ready;
+      break;
+   case FILE_PREDICATE:
+      // To immediately use a predicate set by any instruction, the minimum
+      // number of stall counts is 13.
+      score->rd.p[a] = cycle + 13;
+      break;
+   case FILE_FLAGS:
+      score->rd.c = ready;
+      break;
+   default:
+      break;
+   }
+}
+
+void
+SchedDataCalculatorGM107::checkRd(const Value *v, int cycle, int &delay) const
+{
+   int a = v->reg.data.id, b;
+   int ready = cycle;
+
+   switch (v->reg.file) {
+   case FILE_GPR:
+      b = a + v->reg.size / 4;
+      for (int r = a; r < b; ++r)
+         ready = MAX2(ready, score->rd.r[r]);
+      break;
+   case FILE_PREDICATE:
+      ready = MAX2(ready, score->rd.p[a]);
+      break;
+   case FILE_FLAGS:
+      ready = MAX2(ready, score->rd.c);
+      break;
+   default:
+      break;
+   }
+   if (cycle < ready)
+      delay = MAX2(delay, ready - cycle);
+}
+
+void
+SchedDataCalculatorGM107::commitInsn(const Instruction *insn, int cycle)
+{
+   const int ready = cycle + targ->getLatency(insn);
+
+   for (int d = 0; insn->defExists(d); ++d)
+      recordWr(insn->getDef(d), cycle, ready);
+
+#ifdef GM107_DEBUG_SCHED_DATA
+   score->print(cycle);
+#endif
+}
+
+#define GM107_MIN_ISSUE_DELAY 0x1
+#define GM107_MAX_ISSUE_DELAY 0xf
+
+int
+SchedDataCalculatorGM107::calcDelay(const Instruction *insn, int cycle) const
+{
+   int delay = 0, ready = cycle;
+
+   for (int s = 0; insn->srcExists(s); ++s)
+      checkRd(insn->getSrc(s), cycle, delay);
+
+   // TODO: make use of getReadLatency()!
+
+   return MAX2(delay, ready - cycle);
+}
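To make the scoreboard logic above concrete, a minimal toy model (illustration only; toyCalcDelay is a hypothetical name): commitInsn() records when each written register becomes ready, and calcDelay()/checkRd() turn that into a stall count for the consumer.

    #include <algorithm>
    #include <cassert>

    // Toy version of checkRd()/calcDelay(): ready[] plays the role of
    // score->rd.r[], one entry per GPR.
    static int toyCalcDelay(const int ready[], const int srcs[], int nSrcs,
                            int cycle)
    {
       int delay = 0;
       for (int s = 0; s < nSrcs; ++s)
          if (cycle < ready[srcs[s]])
             delay = std::max(delay, ready[srcs[s]] - cycle);
       return delay;
    }

    int main()
    {
       int ready[256] = {};
       // A 6-cycle ALU op writing $r0, committed at cycle 0; cf. commitInsn():
       // ready = cycle + targ->getLatency(insn), with getLatency() == 6 here.
       ready[0] = 0 + 6;
       const int srcs[] = { 0 };
       // A consumer of $r0 issued right after needs a stall count of 6.
       assert(toyCalcDelay(ready, srcs, 1, 0) == 6);
    }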
+
+void
+SchedDataCalculatorGM107::setDelay(Instruction *insn, int delay,
+                                   const Instruction *next)
+{
+   const OpClass cl = targ->getOpClass(insn->op);
+   int wr, rd;
+
+   if (insn->op == OP_EXIT ||
+       insn->op == OP_BAR ||
+       insn->op == OP_MEMBAR) {
+      delay = GM107_MAX_ISSUE_DELAY;
+   } else
+   if (insn->op == OP_QUADON ||
+       insn->op == OP_QUADPOP) {
+      delay = 0xd;
+   } else
+   if (cl == OPCLASS_FLOW || insn->join) {
+      delay = 0xd;
+   }
+
+   if (!next || !targ->canDualIssue(insn, next)) {
+      delay = CLAMP(delay, GM107_MIN_ISSUE_DELAY, GM107_MAX_ISSUE_DELAY);
+   } else {
+      delay = 0x0; // dual-issue
+   }
+
+   wr = getWrDepBar(insn);
+   rd = getRdDepBar(insn);
+
+   if (delay == GM107_MIN_ISSUE_DELAY && (wr & rd) != 7) {
+      // Barriers take one additional clock cycle to become active on top of
+      // the clock consumed by the instruction producing it.
+      if (!next || insn->bb != next->bb) {
+         delay = 0x2;
+      } else {
+         int wt = getWtDepBar(next);
+         if ((wt & (1 << wr)) | (wt & (1 << rd)))
+            delay = 0x2;
+      }
+   }
+
+   emitStall(insn, delay);
+}
+
+
+// Return true when the given instruction needs to emit a read dependency
+// barrier (for WaR hazards) because it doesn't operate at a fixed latency,
+// and setting the maximum number of stall counts is not enough.
+bool
+SchedDataCalculatorGM107::needRdDepBar(const Instruction *insn) const
+{
+   BitSet srcs(255, 1), defs(255, 1);
+   int a, b;
+
+   if (!targ->isBarrierRequired(insn))
+      return false;
+
+   // Do not emit a read dependency barrier when the instruction doesn't use
+   // any GPR as input (like st s[0x4] 0x0), because it's unnecessary.
+   for (int s = 0; insn->srcExists(s); ++s) {
+      const Value *src = insn->src(s).rep();
+      if (insn->src(s).getFile() != FILE_GPR)
+         continue;
+      if (src->reg.data.id == 255)
+         continue;
+
+      a = src->reg.data.id;
+      b = a + src->reg.size / 4;
+      for (int r = a; r < b; ++r)
+         srcs.set(r);
+   }
+
+   if (!srcs.popCount())
+      return false;
+
+   // Do not emit a read dependency barrier when the output GPRs are equal to
+   // the input GPRs (like rcp $r0 $r0), because a write dependency barrier
+   // will be produced and WaR hazards are prevented.
+   for (int d = 0; insn->defExists(d); ++d) {
+      const Value *def = insn->def(d).rep();
+      if (insn->def(d).getFile() != FILE_GPR)
+         continue;
+      if (def->reg.data.id == 255)
+         continue;
+
+      a = def->reg.data.id;
+      b = a + def->reg.size / 4;
+      for (int r = a; r < b; ++r)
+         defs.set(r);
+   }
+
+   srcs.andNot(defs);
+   if (!srcs.popCount())
+      return false;
+
+   return true;
+}
+
+// Return true when the given instruction needs to emit a write dependency
+// barrier (for RaW hazards) because it doesn't operate at a fixed latency,
+// and setting the maximum number of stall counts is not enough. This only
+// makes sense if the instruction outputs something.
+bool
+SchedDataCalculatorGM107::needWrDepBar(const Instruction *insn) const
+{
+   if (!targ->isBarrierRequired(insn))
+      return false;
+
+   for (int d = 0; insn->defExists(d); ++d) {
+      if (insn->def(d).getFile() == FILE_GPR ||
+          insn->def(d).getFile() == FILE_PREDICATE)
+         return true;
+   }
+   return false;
+}
+
+// Find the next instruction inside the same basic block which uses the
+// output of the given instruction, in order to avoid RaW hazards.
+Instruction *
+SchedDataCalculatorGM107::findFirstUse(const Instruction *bari) const
+{
+   Instruction *insn, *next;
+   int minGPR, maxGPR;
+
+   if (!bari->defExists(0))
+      return NULL;
+
+   minGPR = bari->def(0).rep()->reg.data.id;
+   maxGPR = minGPR + bari->def(0).rep()->reg.size / 4 - 1;
+
+   for (insn = bari->next; insn != NULL; insn = next) {
+      next = insn->next;
+
+      for (int s = 0; insn->srcExists(s); ++s) {
+         const Value *src = insn->src(s).rep();
+         if (bari->def(0).getFile() == FILE_GPR) {
+            if (insn->src(s).getFile() != FILE_GPR ||
+                src->reg.data.id + src->reg.size / 4 - 1 < minGPR ||
+                src->reg.data.id > maxGPR)
+               continue;
+            return insn;
+         } else
+         if (bari->def(0).getFile() == FILE_PREDICATE) {
+            if (insn->src(s).getFile() != FILE_PREDICATE ||
+                src->reg.data.id != minGPR)
+               continue;
+            return insn;
+         }
+      }
+   }
+   return NULL;
+}
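findFirstUse() above and findFirstDef() below both test GPR *ranges* for overlap, because a value wider than 32 bits occupies reg.size / 4 consecutive registers. A minimal sketch of that interval test (illustration only; gprOverlap is a hypothetical name):

    #include <cassert>

    // A value starting at GPR `id` covers size / 4 consecutive registers;
    // two ranges hazard against each other iff they overlap.
    static bool gprOverlap(int idA, int sizeA, int idB, int sizeB)
    {
       const int maxA = idA + sizeA / 4 - 1;
       const int maxB = idB + sizeB / 4 - 1;
       return !(maxB < idA || idB > maxA);
    }

    int main()
    {
       assert(gprOverlap(0, 8, 1, 4));  // 64-bit $r0d covers $r0..$r1, hits $r1
       assert(!gprOverlap(0, 4, 1, 4)); // 32-bit $r0 does not touch $r1
    }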
+
+// Find the next instruction inside the same basic block which overwrites at
+// least one source of the given instruction, in order to avoid WaR hazards.
+Instruction *
+SchedDataCalculatorGM107::findFirstDef(const Instruction *bari) const
+{
+   Instruction *insn, *next;
+   int minGPR, maxGPR;
+
+   for (insn = bari->next; insn != NULL; insn = next) {
+      next = insn->next;
+
+      for (int d = 0; insn->defExists(d); ++d) {
+         const Value *def = insn->def(d).rep();
+         if (insn->def(d).getFile() != FILE_GPR)
+            continue;
+
+         minGPR = def->reg.data.id;
+         maxGPR = minGPR + def->reg.size / 4 - 1;
+
+         for (int s = 0; bari->srcExists(s); ++s) {
+            const Value *src = bari->src(s).rep();
+            if (bari->src(s).getFile() != FILE_GPR ||
+                src->reg.data.id + src->reg.size / 4 - 1 < minGPR ||
+                src->reg.data.id > maxGPR)
+               continue;
+            return insn;
+         }
+      }
+   }
+   return NULL;
+}
+
+// Dependency barriers:
+// This pass is a bit ugly and could probably be improved by performing a
+// better allocation.
+//
+// The main idea is to avoid WaR and RaW hazards by emitting read/write
+// dependency barriers using the control codes.
+bool
+SchedDataCalculatorGM107::insertBarriers(BasicBlock *bb)
+{
+   std::list<LiveBarUse> live_uses;
+   std::list<LiveBarDef> live_defs;
+   Instruction *insn, *next;
+   BitSet bars(6, 1);
+   int bar_id;
+
+   for (insn = bb->getEntry(); insn != NULL; insn = next) {
+      Instruction *usei = NULL, *defi = NULL;
+      bool need_wr_bar, need_rd_bar;
+
+      next = insn->next;
+
+      // Expire old barrier uses.
+      for (std::list<LiveBarUse>::iterator it = live_uses.begin();
+           it != live_uses.end();) {
+         if (insn->serial >= it->usei->serial) {
+            int wr = getWrDepBar(it->insn);
+            emitWtDepBar(insn, wr);
+            bars.clr(wr); // free barrier
+            it = live_uses.erase(it);
+            continue;
+         }
+         ++it;
+      }
+
+      // Expire old barrier defs.
+      for (std::list<LiveBarDef>::iterator it = live_defs.begin();
+           it != live_defs.end();) {
+         if (insn->serial >= it->defi->serial) {
+            int rd = getRdDepBar(it->insn);
+            emitWtDepBar(insn, rd);
+            bars.clr(rd); // free barrier
+            it = live_defs.erase(it);
+            continue;
+         }
+         ++it;
+      }
+
+      need_wr_bar = needWrDepBar(insn);
+      need_rd_bar = needRdDepBar(insn);
+
+      if (need_wr_bar) {
+         // When the instruction needs a write dependency barrier (anything
+         // which writes something at a variable latency), find the next
+         // instruction which reads the outputs.
+         usei = findFirstUse(insn);
+
+         // Allocate and emit a new barrier.
+         bar_id = bars.findFreeRange(1);
+         if (bar_id == -1)
+            bar_id = 5;
+         bars.set(bar_id);
+         emitWrDepBar(insn, bar_id);
+         if (usei)
+            live_uses.push_back(LiveBarUse(insn, usei));
+      }
+
+      if (need_rd_bar) {
+         // When the instruction needs a read dependency barrier (anything
+         // which reads something at a variable latency), find the next
+         // instruction which will write the inputs.
+         defi = findFirstDef(insn);
+
+         if (usei && defi && usei->serial <= defi->serial)
+            continue;
+
+         // Allocate and emit a new barrier.
+         bar_id = bars.findFreeRange(1);
+         if (bar_id == -1)
+            bar_id = 5;
+         bars.set(bar_id);
+         emitRdDepBar(insn, bar_id);
+         if (defi)
+            live_defs.push_back(LiveBarDef(insn, defi));
+      }
+   }
+
+   // Remove unnecessary barrier waits.
+   BitSet alive_bars(6, 1);
+   for (insn = bb->getEntry(); insn != NULL; insn = next) {
+      int wr, rd, wt;
+
+      next = insn->next;
+
+      wr = getWrDepBar(insn);
+      rd = getRdDepBar(insn);
+      wt = getWtDepBar(insn);
+
+      for (int idx = 0; idx < 6; ++idx) {
+         if (!(wt & (1 << idx)))
+            continue;
+         if (!alive_bars.test(idx)) {
+            insn->sched &= ~(1 << (11 + idx));
+         } else {
+            alive_bars.clr(idx);
+         }
+      }
+
+      if (wr < 6)
+         alive_bars.set(wr);
+      if (rd < 6)
+         alive_bars.set(rd);
+   }
+
+   return true;
+}
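insertBarriers() above allocates from six hardware barrier slots and falls back to barrier 5 when all are in flight. A self-contained sketch of that policy (illustration only; the real code uses nv50_ir's BitSet::findFreeRange):

    #include <cassert>

    // Toy version of the allocation in insertBarriers(): six barrier slots,
    // take the first free one, reuse slot 5 when everything is busy.
    static int allocBar(bool busy[6])
    {
       for (int id = 0; id < 6; ++id) {
          if (!busy[id]) {
             busy[id] = true;
             return id;
          }
       }
       return 5; // over-subscribed: reuse the last barrier, as the pass does
    }

    int main()
    {
       bool busy[6] = {};
       assert(allocBar(busy) == 0);
       assert(allocBar(busy) == 1);
       for (int i = 0; i < 4; ++i)
          allocBar(busy);              // take slots 2..5
       assert(allocBar(busy) == 5);    // all busy -> fall back to id 5
    }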
+
+bool
+SchedDataCalculatorGM107::visit(Function *func)
+{
+   ArrayList insns;
+
+   func->orderInstructions(insns);
+
+   scoreBoards.resize(func->cfg.getSize());
+   for (size_t i = 0; i < scoreBoards.size(); ++i)
+      scoreBoards[i].wipe();
+   return true;
+}
+
 bool
 SchedDataCalculatorGM107::visit(BasicBlock *bb)
 {
+   Instruction *insn, *next = NULL;
+   int cycle = 0;
+
    for (Instruction *insn = bb->getEntry(); insn; insn = insn->next) {
       /*XXX*/
       insn->sched = 0x7e0;
    }
 
+   if (!debug_get_bool_option("NV50_PROG_SCHED", true))
+      return true;
+
+   // Insert read/write dependency barriers for instructions which don't
+   // operate at a fixed latency.
+   insertBarriers(bb);
+
+   score = &scoreBoards.at(bb->getId());
+
+   for (Graph::EdgeIterator ei = bb->cfg.incident(); !ei.end(); ei.next()) {
+      // Back branches will wait until all target dependencies are satisfied.
+      if (ei.getType() == Graph::Edge::BACK) // sched would be uninitialized
+         continue;
+      BasicBlock *in = BasicBlock::get(ei.getNode());
+      score->setMax(&scoreBoards.at(in->getId()));
+   }
+
+#ifdef GM107_DEBUG_SCHED_DATA
+   INFO("=== BB:%i initial scores\n", bb->getId());
+   score->print(cycle);
+#endif
+
+   // Because barriers are allocated locally (intra-BB), we have to make sure
+   // that all produced barriers have been consumed before entering a new
+   // basic block. The best way would be a global allocation before RA, but
+   // that is much more difficult, especially because of the phi nodes.
+   // Anyway, waiting on a barrier which has already been consumed doesn't
+   // seem to add any additional cost, it's just not elegant!
+   Instruction *start = bb->getEntry();
+   if (start && bb->cfg.incidentCount() > 0) {
+      for (int b = 0; b < 6; b++)
+         emitWtDepBar(start, b);
+   }
+
+   for (insn = bb->getEntry(); insn && insn->next; insn = insn->next) {
+      next = insn->next;
+
+      commitInsn(insn, cycle);
+      int delay = calcDelay(next, cycle);
+      setDelay(insn, delay, next);
+      cycle += getStall(insn);
+
+      setReuseFlag(insn);
+
+      // XXX: The yield flag seems to destroy a bunch of things when it is
+      // set on every instruction, needs investigation.
+      //emitYield(insn);
+
+#ifdef GM107_DEBUG_SCHED_DATA
+      printSchedInfo(cycle, insn);
+      insn->print();
+      next->print();
+#endif
+   }
+
+   if (!insn)
+      return true;
+   commitInsn(insn, cycle);
+
+   int bbDelay = -1;
+
+#ifdef GM107_DEBUG_SCHED_DATA
+   fprintf(stderr, "last instruction is : ");
+   insn->print();
+   fprintf(stderr, "cycle=%d\n", cycle);
+#endif
+
+   for (Graph::EdgeIterator ei = bb->cfg.outgoing(); !ei.end(); ei.next()) {
+      BasicBlock *out = BasicBlock::get(ei.getNode());
+
+      if (ei.getType() != Graph::Edge::BACK) {
+         // Only test the first instruction of the outgoing block.
+         next = out->getEntry();
+         if (next) {
+            bbDelay = MAX2(bbDelay, calcDelay(next, cycle));
+         } else {
+            // When the outgoing BB is empty, make sure to set the number of
+            // stall counts needed by the instruction, because we don't know
+            // the next instruction.
+            bbDelay = MAX2(bbDelay, targ->getLatency(insn));
+         }
+      } else {
+         // Wait until all dependencies are satisfied.
+         const int regsFree = score->getLatest();
+         next = out->getFirst();
+         for (int c = cycle; next && c < regsFree; next = next->next) {
+            bbDelay = MAX2(bbDelay, calcDelay(next, c));
+            c += getStall(next);
+         }
+         next = NULL;
+      }
+   }
+   if (bb->cfg.outgoingCount() != 1)
+      next = NULL;
+   setDelay(insn, bbDelay, next);
+   cycle += getStall(insn);
+
+   score->rebase(cycle); // common base for initializing out blocks' scores
    return true;
 }
 
@@ -3398,7 +4161,7 @@ SchedDataCalculatorGM107::visit(BasicBlock *bb)
 void
 CodeEmitterGM107::prepareEmission(Function *func)
 {
-   SchedDataCalculatorGM107 sched(targ);
+   SchedDataCalculatorGM107 sched(targGM107);
    CodeEmitter::prepareEmission(func);
    sched.run(func, true, true);
 }
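calcDelay() above carries a "TODO: make use of getReadLatency()!". Since an instruction only reads its sources a few counts after issue, the ready check could presumably be shifted by that amount. A speculative sketch of the idea (hypothetical helper, not part of the patch), using the getReadLatency() defined in the next file:

    // Speculative: a producer whose result is ready at `ready` only forces
    // a consumer issued at `cycle` to stall until the consumer actually
    // reads its sources, readLat counts after issue.
    static int delayWithReadLatency(int ready, int cycle, int readLat)
    {
       const int d = ready - (cycle + readLat);
       return d > 0 ? d : 0; // e.g. ready 6, cycle 0, readLat 4 -> stall 2
    }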
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_target_gm107.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_target_gm107.cpp
index 6b8f767a3c0..04cbd402a18 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_target_gm107.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_target_gm107.cpp
@@ -68,6 +68,259 @@ TargetGM107::isOpSupported(operation op, DataType ty) const
    return true;
 }
 
+// Return true when an instruction supports the reuse flag. When supported,
+// the hardware will use the operand reuse cache introduced with Maxwell,
+// which should reduce bank conflicts by caching values for subsequent
+// instructions. Note that the next instructions have to use the same GPR id
+// in the same operand slot.
+bool
+TargetGM107::isReuseSupported(const Instruction *insn) const
+{
+   const OpClass cl = getOpClass(insn->op);
+
+   // TODO: double-check!
+   switch (cl) {
+   case OPCLASS_ARITH:
+   case OPCLASS_COMPARE:
+   case OPCLASS_LOGIC:
+   case OPCLASS_MOVE:
+   case OPCLASS_SHIFT:
+      return true;
+   case OPCLASS_BITFIELD:
+      if (insn->op == OP_INSBF || insn->op == OP_EXTBF)
+         return true;
+      break;
+   default:
+      break;
+   }
+   return false;
+}
+
+// Return true when an instruction requires a dependency barrier because it
+// doesn't operate at a fixed latency. Variable-latency instructions are
+// memory operations, double-precision operations, special function unit
+// operations and other low-throughput instructions.
+bool
+TargetGM107::isBarrierRequired(const Instruction *insn) const
+{
+   const OpClass cl = getOpClass(insn->op);
+
+   if (insn->dType == TYPE_F64 || insn->sType == TYPE_F64)
+      return true;
+
+   switch (cl) {
+   case OPCLASS_ATOMIC:
+   case OPCLASS_LOAD:
+   case OPCLASS_STORE:
+   case OPCLASS_SURFACE:
+   case OPCLASS_TEXTURE:
+      return true;
+   case OPCLASS_SFU:
+      switch (insn->op) {
+      case OP_COS:
+      case OP_EX2:
+      case OP_LG2:
+      case OP_LINTERP:
+      case OP_PINTERP:
+      case OP_RCP:
+      case OP_RSQ:
+      case OP_SIN:
+         return true;
+      default:
+         break;
+      }
+      break;
+   case OPCLASS_BITFIELD:
+      switch (insn->op) {
+      case OP_BFIND:
+      case OP_POPCNT:
+         return true;
+      default:
+         break;
+      }
+      break;
+   case OPCLASS_CONTROL:
+      switch (insn->op) {
+      case OP_EMIT:
+      case OP_RESTART:
+         return true;
+      default:
+         break;
+      }
+      break;
+   case OPCLASS_OTHER:
+      switch (insn->op) {
+      case OP_AFETCH:
+      case OP_PFETCH:
+      case OP_PIXLD:
+      case OP_RDSV:
+      case OP_SHFL:
+         return true;
+      default:
+         break;
+      }
+      break;
+   case OPCLASS_ARITH:
+      // TODO: IMUL/IMAD require barriers too; use XMAD instead!
+      if ((insn->op == OP_MUL || insn->op == OP_MAD) &&
+          !isFloatType(insn->dType))
+         return true;
+      break;
+   case OPCLASS_CONVERT:
+      if (insn->def(0).getFile() != FILE_PREDICATE &&
+          insn->src(0).getFile() != FILE_PREDICATE)
+         return true;
+      break;
+   default:
+      break;
+   }
+   return false;
+}
+
+bool
+TargetGM107::canDualIssue(const Instruction *a, const Instruction *b) const
+{
+   // TODO
+   return false;
+}
+
+// Return the number of stall counts needed to complete a single instruction.
+// On Maxwell GPUs, the pipeline depth is 6, but some instructions, like
+// memory operations, require a different number of stall counts.
+int
+TargetGM107::getLatency(const Instruction *insn) const
+{
+   // TODO: better values! This should be good enough for now though.
+   switch (insn->op) {
+   case OP_EMIT:
+   case OP_EXPORT:
+   case OP_PIXLD:
+   case OP_RESTART:
+   case OP_STORE:
+   case OP_SUSTB:
+   case OP_SUSTP:
+      return 1;
+   case OP_SHFL:
+      return 2;
+   case OP_ADD:
+   case OP_AND:
+   case OP_EXTBF:
+   case OP_FMA:
+   case OP_INSBF:
+   case OP_MAD:
+   case OP_MAX:
+   case OP_MIN:
+   case OP_MOV:
+   case OP_MUL:
+   case OP_NOT:
+   case OP_OR:
+   case OP_PREEX2:
+   case OP_PRESIN:
+   case OP_QUADOP:
+   case OP_SELP:
+   case OP_SET:
+   case OP_SET_AND:
+   case OP_SET_OR:
+   case OP_SET_XOR:
+   case OP_SHL:
+   case OP_SHLADD:
+   case OP_SHR:
+   case OP_SLCT:
+   case OP_SUB:
+   case OP_VOTE:
+   case OP_XOR:
+      if (insn->dType != TYPE_F64)
+         return 6;
+      break;
+   case OP_ABS:
+   case OP_CEIL:
+   case OP_CVT:
+   case OP_FLOOR:
+   case OP_NEG:
+   case OP_SAT:
+   case OP_TRUNC:
+      if (insn->op == OP_CVT && (insn->def(0).getFile() == FILE_PREDICATE ||
+                                 insn->src(0).getFile() == FILE_PREDICATE))
+         return 6;
+      break;
+   case OP_BFIND:
+   case OP_COS:
+   case OP_EX2:
+   case OP_LG2:
+   case OP_POPCNT:
+   case OP_QUADON:
+   case OP_QUADPOP:
+   case OP_RCP:
+   case OP_RSQ:
+   case OP_SIN:
+      return 13;
+   default:
+      break;
+   }
+   // Use the maximum number of stall counts for other instructions.
+   return 15;
+}
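getLatency() above and isBarrierRequired() earlier split instructions into two regimes: fixed-latency results are covered purely by stall counts, while variable-latency ones get a dependency barrier and only a minimal stall. A sketch of how a scheduler might consume the two callbacks (hypothetical helper, not part of the patch):

    #include <algorithm>

    // Illustration: the stall field is 4 bits wide, so fixed latencies are
    // capped at 15 (cf. GM107_MAX_ISSUE_DELAY), while barrier-protected ops
    // only need the minimal issue delay (cf. GM107_MIN_ISSUE_DELAY) because
    // the barrier does the waiting.
    static int stallFor(bool barrierRequired, int latency)
    {
       if (barrierRequired)
          return 1;
       return std::min(latency, 15);
    }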
+
+// Return the operand read latency, which is the number of stall counts
+// before an instruction can read its sources. For memory operations like
+// ATOM, LOAD and STORE, this only applies when the access is indirect.
+int
+TargetGM107::getReadLatency(const Instruction *insn) const
+{
+   switch (insn->op) {
+   case OP_ABS:
+   case OP_BFIND:
+   case OP_CEIL:
+   case OP_COS:
+   case OP_EX2:
+   case OP_FLOOR:
+   case OP_LG2:
+   case OP_NEG:
+   case OP_POPCNT:
+   case OP_RCP:
+   case OP_RSQ:
+   case OP_SAT:
+   case OP_SIN:
+   case OP_SULDB:
+   case OP_SULDP:
+   case OP_SUREDB:
+   case OP_SUREDP:
+   case OP_SUSTB:
+   case OP_SUSTP:
+   case OP_TRUNC:
+      return 4;
+   case OP_CVT:
+      if (insn->def(0).getFile() != FILE_PREDICATE &&
+          insn->src(0).getFile() != FILE_PREDICATE)
+         return 4;
+      break;
+   case OP_ATOM:
+   case OP_LOAD:
+   case OP_STORE:
+      if (insn->src(0).isIndirect(0)) {
+         switch (insn->src(0).getFile()) {
+         case FILE_MEMORY_SHARED:
+         case FILE_MEMORY_CONST:
+            return 2;
+         case FILE_MEMORY_GLOBAL:
+         case FILE_MEMORY_LOCAL:
+            return 4;
+         default:
+            break;
+         }
+      }
+      break;
+   case OP_EXPORT:
+   case OP_PFETCH:
+   case OP_SHFL:
+   case OP_VFETCH:
+      return 2;
+   default:
+      break;
+   }
+   return 0;
+}
+
 bool
 TargetGM107::runLegalizePass(Program *prog, CGStage stage) const
 {
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_target_gm107.h b/src/gallium/drivers/nouveau/codegen/nv50_ir_target_gm107.h
index 5d606378953..dd4aa6a54d0 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_target_gm107.h
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_target_gm107.h
@@ -16,6 +16,13 @@ public:
    virtual uint32_t getBuiltinOffset(int) const;
    virtual bool isOpSupported(operation, DataType) const;
 
+   virtual bool isReuseSupported(const Instruction *) const;
+
+   virtual bool isBarrierRequired(const Instruction *) const;
+
+   virtual bool canDualIssue(const Instruction *, const Instruction *) const;
+   virtual int getLatency(const Instruction *) const;
+   virtual int getReadLatency(const Instruction *) const;
 };
 
 } // namespace nv50_ir