diff options
author | Christoph Bumiller <[email protected]> | 2012-04-09 20:58:39 +0200 |
---|---|---|
committer | Christoph Bumiller <[email protected]> | 2012-04-14 21:54:03 +0200 |
commit | e43a3a66a9d8a99021d76ff4d07dec7b8cfd62ca (patch) | |
tree | 7bfd51391b8a2222c57e821cbcbe052187352939 /src/gallium/drivers | |
parent | 99319328d4c249090d3e4a387f6bdc00f711b688 (diff) |
nv50/ir: rewrite the register allocator as GCRA, with spilling
This is more flexible than the linear scan, and we don't need the
separate allocation pass for constrained values anymore.
Diffstat (limited to 'src/gallium/drivers')
-rw-r--r-- | src/gallium/drivers/nv50/codegen/nv50_ir.cpp | 94 | ||||
-rw-r--r-- | src/gallium/drivers/nv50/codegen/nv50_ir.h | 25 | ||||
-rw-r--r-- | src/gallium/drivers/nv50/codegen/nv50_ir_bb.cpp | 4 | ||||
-rw-r--r-- | src/gallium/drivers/nv50/codegen/nv50_ir_from_tgsi.cpp | 1 | ||||
-rw-r--r-- | src/gallium/drivers/nv50/codegen/nv50_ir_inlines.h | 5 | ||||
-rw-r--r-- | src/gallium/drivers/nv50/codegen/nv50_ir_peephole.cpp | 2 | ||||
-rw-r--r-- | src/gallium/drivers/nv50/codegen/nv50_ir_print.cpp | 3 | ||||
-rw-r--r-- | src/gallium/drivers/nv50/codegen/nv50_ir_ra.cpp | 1588 | ||||
-rw-r--r-- | src/gallium/drivers/nv50/codegen/nv50_ir_util.cpp | 118 | ||||
-rw-r--r-- | src/gallium/drivers/nv50/codegen/nv50_ir_util.h | 47 | ||||
-rw-r--r-- | src/gallium/drivers/nvc0/codegen/nv50_ir_emit_nvc0.cpp | 6 |
11 files changed, 1475 insertions, 418 deletions
diff --git a/src/gallium/drivers/nv50/codegen/nv50_ir.cpp b/src/gallium/drivers/nv50/codegen/nv50_ir.cpp index 314701bfb6a..cdf0a133e50 100644 --- a/src/gallium/drivers/nv50/codegen/nv50_ir.cpp +++ b/src/gallium/drivers/nv50/codegen/nv50_ir.cpp @@ -222,66 +222,17 @@ Value::Value() reg.size = 4; } -bool -Value::coalesce(Value *jval, bool force) -{ - Value *repr = this->join; // new representative - Value *jrep = jval->join; - - if (reg.file != jval->reg.file || reg.size != jval->reg.size) { - if (!force) - return false; - ERROR("forced coalescing of values of different sizes/files"); - } - - if (!force && (repr->reg.data.id != jrep->reg.data.id)) { - if (repr->reg.data.id >= 0 && - jrep->reg.data.id >= 0) - return false; - if (jrep->reg.data.id >= 0) { - repr = jval->join; - jrep = this->join; - jval = this; - } - - // need to check all fixed register values of the program for overlap - Function *func = defs.front()->getInsn()->bb->getFunction(); - - // TODO: put values in by register-id bins per function - ArrayList::Iterator iter = func->allLValues.iterator(); - for (; !iter.end(); iter.next()) { - Value *fixed = reinterpret_cast<Value *>(iter.get()); - assert(fixed); - if (fixed->reg.data.id == repr->reg.data.id) - if (fixed->livei.overlaps(jrep->livei)) - return false; - } - } - if (repr->livei.overlaps(jrep->livei)) { - if (!force) - return false; - // do we really want this ? if at all, only for constraint ops - INFO("NOTE: forced coalescing with live range overlap\n"); - } - - for (DefIterator it = jrep->defs.begin(); it != jrep->defs.end(); ++it) - (*it)->get()->join = repr; - - repr->defs.insert(repr->defs.end(), - jrep->defs.begin(), jrep->defs.end()); - repr->livei.unify(jrep->livei); - - assert(repr->join == repr && jval->join == repr); - return true; -} - LValue::LValue(Function *fn, DataFile file) { reg.file = file; reg.size = (file != FILE_PREDICATE) ? 4 : 1; reg.data.id = -1; - affinity = -1; + compMask = 0; + compound = 0; + ssa = 0; + fixedReg = 0; + noSpill = 0; fn->add(this, this->id); } @@ -294,7 +245,11 @@ LValue::LValue(Function *fn, LValue *lval) reg.size = lval->reg.size; reg.data.id = -1; - affinity = -1; + compMask = 0; + compound = 0; + ssa = 0; + fixedReg = 0; + noSpill = 0; fn->add(this, this->id); } @@ -523,8 +478,8 @@ Value::interfers(const Value *that) const idA = this->join->reg.data.offset; idB = that->join->reg.data.offset; } else { - idA = this->join->reg.data.id * this->reg.size; - idB = that->join->reg.data.id * that->reg.size; + idA = this->join->reg.data.id * MIN2(this->reg.size, 4); + idB = that->join->reg.data.id * MIN2(that->reg.size, 4); } if (idA < idB) @@ -539,8 +494,6 @@ Value::interfers(const Value *that) const bool Value::equals(const Value *that, bool strict) const { - that = that->join; - if (strict) return this == that; @@ -754,20 +707,38 @@ Instruction::clone(ClonePolicy<Function>& pol, Instruction *i) const } unsigned int -Instruction::defCount(unsigned int mask) const +Instruction::defCount(unsigned int mask, bool singleFile) const { unsigned int i, n; + if (singleFile) { + unsigned int d = ffs(mask); + if (!d) + return 0; + for (i = d--; defExists(i); ++i) + if (getDef(i)->reg.file != getDef(d)->reg.file) + mask &= ~(1 << i); + } + for (n = 0, i = 0; this->defExists(i); ++i, mask >>= 1) n += mask & 1; return n; } unsigned int -Instruction::srcCount(unsigned int mask) const +Instruction::srcCount(unsigned int mask, bool singleFile) const { unsigned int i, n; + if (singleFile) { + unsigned int s = ffs(mask); + if (!s) + return 0; + for (i = s--; srcExists(i); ++i) + if (getSrc(i)->reg.file != getSrc(s)->reg.file) + mask &= ~(1 << i); + } + for (n = 0, i = 0; this->srcExists(i); ++i, mask >>= 1) n += mask & 1; return n; @@ -1137,6 +1108,7 @@ out: info->bin.maxGPR = prog->maxGPR; info->bin.code = prog->code; info->bin.codeSize = prog->binSize; + info->bin.tlsSpace = prog->tlsSize; delete prog; nv50_ir::Target::destroy(targ); diff --git a/src/gallium/drivers/nv50/codegen/nv50_ir.h b/src/gallium/drivers/nv50/codegen/nv50_ir.h index b06d0938b62..a52cc9a212f 100644 --- a/src/gallium/drivers/nv50/codegen/nv50_ir.h +++ b/src/gallium/drivers/nv50/codegen/nv50_ir.h @@ -219,6 +219,7 @@ enum DataFile FILE_PREDICATE, // boolean predicate FILE_FLAGS, // zero/sign/carry/overflow bits FILE_ADDRESS, + LAST_REGISTER_FILE = FILE_ADDRESS, FILE_IMMEDIATE, FILE_MEMORY_CONST, FILE_SHADER_INPUT, @@ -320,7 +321,7 @@ struct Storage float f32; double f64; int32_t offset; // offset from 0 (base of address space) - int32_t id; // register id (< 0 if virtual/unassigned) + int32_t id; // register id (< 0 if virtual/unassigned, in units <= 4) struct { SVSemantic sv; int index; @@ -473,8 +474,6 @@ public: inline const Symbol *asSym() const; inline const ImmediateValue *asImm() const; - bool coalesce(Value *, bool force = false); - inline bool inFile(DataFile f) { return reg.file == f; } static inline Value *get(Iterator&); @@ -506,9 +505,11 @@ public: virtual int print(char *, size_t, DataType ty = TYPE_NONE) const; public: - unsigned ssa : 1; - - int affinity; + unsigned compMask : 8; // compound/component mask + unsigned compound : 1; // used by RA, value involved in split/merge + unsigned ssa : 1; + unsigned fixedReg : 1; // set & used by RA, earlier just use (id < 0) + unsigned noSpill : 1; // do not spill (e.g. if spill temporary already) }; class Symbol : public Value @@ -611,7 +612,7 @@ public: return s < srcs.size() && srcs[s].exists(); } - inline bool constrainedDefs() const { return defExists(1); } + inline bool constrainedDefs() const; bool setPredicate(CondCode ccode, Value *); inline Value *getPredicate() const; @@ -622,9 +623,9 @@ public: inline void setFlagsDef(int d, Value *); unsigned int defCount() const { return defs.size(); }; - unsigned int defCount(unsigned int mask) const; + unsigned int defCount(unsigned int mask, bool singleFile = false) const; unsigned int srcCount() const { return srcs.size(); }; - unsigned int srcCount(unsigned int mask) const; + unsigned int srcCount(unsigned int mask, bool singleFile = false) const; // save & remove / set indirect[0,1] and predicate source void takeExtraSources(int s, Value *[3]); @@ -965,6 +966,11 @@ public: uint32_t binPos; uint32_t binSize; + Value *stackPtr; + + uint32_t tlsBase; // base address for l[] space (if no stack pointer is used) + uint32_t tlsSize; + ArrayList allBBlocks; ArrayList allInsns; ArrayList allLValues; @@ -1036,6 +1042,7 @@ public: uint32_t *code; uint32_t binSize; + uint32_t tlsSize; // size required for FILE_MEMORY_LOCAL int maxGPR; diff --git a/src/gallium/drivers/nv50/codegen/nv50_ir_bb.cpp b/src/gallium/drivers/nv50/codegen/nv50_ir_bb.cpp index 7b211a2e0d1..ebfb07a2b3e 100644 --- a/src/gallium/drivers/nv50/codegen/nv50_ir_bb.cpp +++ b/src/gallium/drivers/nv50/codegen/nv50_ir_bb.cpp @@ -41,6 +41,10 @@ Function::Function(Program *p, const char *fnName, uint32_t label) binPos = 0; binSize = 0; + stackPtr = NULL; + tlsBase = 0; + tlsSize = 0; + prog->add(this, id); } diff --git a/src/gallium/drivers/nv50/codegen/nv50_ir_from_tgsi.cpp b/src/gallium/drivers/nv50/codegen/nv50_ir_from_tgsi.cpp index c419a7dca6e..9fab1c314fc 100644 --- a/src/gallium/drivers/nv50/codegen/nv50_ir_from_tgsi.cpp +++ b/src/gallium/drivers/nv50/codegen/nv50_ir_from_tgsi.cpp @@ -2420,6 +2420,7 @@ Program::makeFromTGSI(struct nv50_ir_prog_info *info) tgsi::Source src(info); if (!src.scanSource()) return false; + tlsSize = info->bin.tlsSpace; Converter builder(this, &src); return builder.run(); diff --git a/src/gallium/drivers/nv50/codegen/nv50_ir_inlines.h b/src/gallium/drivers/nv50/codegen/nv50_ir_inlines.h index b06f217ba30..4ce9deb131f 100644 --- a/src/gallium/drivers/nv50/codegen/nv50_ir_inlines.h +++ b/src/gallium/drivers/nv50/codegen/nv50_ir_inlines.h @@ -192,6 +192,11 @@ Instruction *Value::getUniqueInsn() const return defs.front()->getInsn(); } +inline bool Instruction::constrainedDefs() const +{ + return defExists(1) || op == OP_UNION; +} + Value *Instruction::getIndirect(int s, int dim) const { return srcs[s].isIndirect(dim) ? getSrc(srcs[s].indirect[dim]) : NULL; diff --git a/src/gallium/drivers/nv50/codegen/nv50_ir_peephole.cpp b/src/gallium/drivers/nv50/codegen/nv50_ir_peephole.cpp index 867bcb0408f..2cca44921a5 100644 --- a/src/gallium/drivers/nv50/codegen/nv50_ir_peephole.cpp +++ b/src/gallium/drivers/nv50/codegen/nv50_ir_peephole.cpp @@ -48,7 +48,7 @@ Instruction::isNop() const } if (op == OP_MOV || op == OP_UNION) { - if (!def(0).rep()->equals(getSrc(0))) + if (!getDef(0)->equals(getSrc(0))) return false; if (op == OP_UNION) if (!def(0).rep()->equals(getSrc(1))) diff --git a/src/gallium/drivers/nv50/codegen/nv50_ir_print.cpp b/src/gallium/drivers/nv50/codegen/nv50_ir_print.cpp index a831145ef5d..61382336bc4 100644 --- a/src/gallium/drivers/nv50/codegen/nv50_ir_print.cpp +++ b/src/gallium/drivers/nv50/codegen/nv50_ir_print.cpp @@ -283,6 +283,9 @@ int LValue::print(char *buf, size_t size, DataType ty) const else if (reg.size == 16) postFix = "q"; + else + if (reg.size == 12) + postFix = "t"; break; case FILE_PREDICATE: r = 'p'; col = TXT_REGISTER; diff --git a/src/gallium/drivers/nv50/codegen/nv50_ir_ra.cpp b/src/gallium/drivers/nv50/codegen/nv50_ir_ra.cpp index f13335f7d28..cd8f28955e4 100644 --- a/src/gallium/drivers/nv50/codegen/nv50_ir_ra.cpp +++ b/src/gallium/drivers/nv50/codegen/nv50_ir_ra.cpp @@ -23,7 +23,8 @@ #include "nv50_ir.h" #include "nv50_ir_target.h" -#include "nv50/nv50_debug.h" +#include <stack> +#include <limits> namespace nv50_ir { @@ -32,41 +33,74 @@ namespace nv50_ir { class RegisterSet { public: - RegisterSet(); RegisterSet(const Target *); void init(const Target *); - void reset(); // reset allocation status, but not max assigned regs + void reset(DataFile, bool resetMax = false); void periodicMask(DataFile f, uint32_t lock, uint32_t unlock); void intersect(DataFile f, const RegisterSet *); - bool assign(Value **, int nr); - void release(const Value *); + bool assign(int32_t& reg, DataFile f, unsigned int size); + void release(DataFile f, int32_t reg, unsigned int size); + bool occupy(DataFile f, int32_t reg, unsigned int size); bool occupy(const Value *); + void occupyMask(DataFile f, int32_t reg, uint8_t mask); - int getMaxAssigned(DataFile f) const { return fill[f]; } + inline int getMaxAssigned(DataFile f) const { return fill[f]; } + + inline unsigned int getFileSize(DataFile f, uint8_t regSize) const + { + if (restrictedGPR16Range && f == FILE_GPR && regSize == 2) + return (last[f] + 1) / 2; + return last[f] + 1; + } + + inline unsigned int units(DataFile f, unsigned int size) const + { + return size >> unit[f]; + } + // for regs of size >= 4, id is counted in 4-byte words (like nv50/c0 binary) + inline unsigned int idToBytes(Value *v) const + { + return v->reg.data.id * MIN2(v->reg.size, 4); + } + inline unsigned int idToUnits(Value *v) const + { + return units(v->reg.file, idToBytes(v)); + } + inline int bytesToId(Value *v, unsigned int bytes) const + { + if (v->reg.size < 4) + return units(v->reg.file, bytes); + return bytes / 4; + } + inline int unitsToId(DataFile f, int u, uint8_t size) const + { + if (u < 0) + return -1; + return (size < 4) ? u : ((u << unit[f]) / 4); + } void print() const; private: - uint32_t bits[FILE_ADDRESS + 1][(MAX_REGISTER_FILE_SIZE + 31) / 32]; + BitSet bits[LAST_REGISTER_FILE + 1]; + + int unit[LAST_REGISTER_FILE + 1]; // log2 of allocation granularity - int unit[FILE_ADDRESS + 1]; // log2 of allocation granularity + int last[LAST_REGISTER_FILE + 1]; + int fill[LAST_REGISTER_FILE + 1]; - int last[FILE_ADDRESS + 1]; - int fill[FILE_ADDRESS + 1]; + const bool restrictedGPR16Range; }; void -RegisterSet::reset() +RegisterSet::reset(DataFile f, bool resetMax) { - memset(bits, 0, sizeof(bits)); -} - -RegisterSet::RegisterSet() -{ - reset(); + bits[f].fill(0); + if (resetMax) + fill[f] = -1; } void @@ -78,118 +112,83 @@ RegisterSet::init(const Target *targ) unit[rf] = targ->getFileUnit(f); fill[rf] = -1; assert(last[rf] < MAX_REGISTER_FILE_SIZE); + bits[rf].allocate(last[rf] + 1, true); } } RegisterSet::RegisterSet(const Target *targ) + : restrictedGPR16Range(targ->getChipset() < 0xc0) { - reset(); init(targ); + for (unsigned int i = 0; i <= LAST_REGISTER_FILE; ++i) + reset(static_cast<DataFile>(i)); } void RegisterSet::periodicMask(DataFile f, uint32_t lock, uint32_t unlock) { - for (int i = 0; i < (last[f] + 31) / 32; ++i) - bits[f][i] = (bits[f][i] | lock) & ~unlock; + bits[f].periodicMask32(lock, unlock); } void RegisterSet::intersect(DataFile f, const RegisterSet *set) { - for (int i = 0; i < (last[f] + 31) / 32; ++i) - bits[f][i] |= set->bits[f][i]; + bits[f] |= set->bits[f]; } void RegisterSet::print() const { INFO("GPR:"); - for (int i = 0; i < (last[FILE_GPR] + 31) / 32; ++i) - INFO(" %08x", bits[FILE_GPR][i]); + bits[FILE_GPR].print(); INFO("\n"); } bool -RegisterSet::assign(Value **def, int nr) +RegisterSet::assign(int32_t& reg, DataFile f, unsigned int size) { - DataFile f = def[0]->reg.file; - int n = nr; - if (n == 3) - n = 4; - int s = (n * def[0]->reg.size) >> unit[f]; - uint32_t m = (1 << s) - 1; - - int id = last[f] + 1; - int i; - - for (i = 0; (i * 32) < last[f]; ++i) { - if (bits[f][i] == 0xffffffff) - continue; - - for (id = 0; id < 32; id += s) - if (!(bits[f][i] & (m << id))) - break; - if (id < 32) - break; - } - id += i * 32; - if (id > last[f]) + reg = bits[f].findFreeRange(size); + if (reg < 0) return false; - - bits[f][id / 32] |= m << (id % 32); - - if (id + (s - 1) > fill[f]) - fill[f] = id + (s - 1); - - for (i = 0; i < nr; ++i, ++id) - if (!def[i]->livei.isEmpty()) // XXX: really increased id if empty ? - def[i]->reg.data.id = id; + fill[f] = MAX2(fill[f], (int32_t)(reg + size - 1)); return true; } bool -RegisterSet::occupy(const Value *val) +RegisterSet::occupy(const Value *v) { - int id = val->reg.data.id; - unsigned int f = val->reg.file; + return occupy(v->reg.file, v->reg.data.id, v->reg.size >> unit[v->reg.file]); +} - uint32_t m = (1 << (val->reg.size >> unit[f])) - 1; +void +RegisterSet::occupyMask(DataFile f, int32_t reg, uint8_t mask) +{ + bits[f].setMask(reg & ~31, static_cast<uint32_t>(mask) << (reg % 32)); +} - if (id < 0 || bits[f][id / 32] & m << (id % 32)) +bool +RegisterSet::occupy(DataFile f, int32_t reg, unsigned int size) +{ + if (bits[f].testRange(reg, size)) return false; - INFO_DBG(0, REG_ALLOC, "reg occupy: %u[%i] %x\n", f, id, m); + bits[f].setRange(reg, size); - bits[f][id / 32] |= m << (id % 32); + INFO_DBG(0, REG_ALLOC, "reg occupy: %u[%i] %u\n", f, reg, size); - if (fill[f] < id) - fill[f] = id; + fill[f] = MAX2(fill[f], (int32_t)(reg + size - 1)); return true; } void -RegisterSet::release(const Value *val) +RegisterSet::release(DataFile f, int32_t reg, unsigned int size) { - int id = val->reg.data.id; - if (id < 0) - return; - unsigned int f = val->reg.file; - - uint32_t m = (1 << (val->reg.size >> unit[f])) - 1; + bits[f].clrRange(reg, size); - INFO_DBG(0, REG_ALLOC, "reg release: %u[%i] %x\n", f, id, m); - - bits[f][id / 32] &= ~(m << (id % 32)); + INFO_DBG(0, REG_ALLOC, "reg release: %u[%i] %u\n", f, reg, size); } -#define JOIN_MASK_PHI (1 << 0) -#define JOIN_MASK_UNION (1 << 1) -#define JOIN_MASK_MOV (1 << 2) -#define JOIN_MASK_TEX (1 << 3) -#define JOIN_MASK_CONSTRAINT (1 << 4) - class RegAlloc { public: @@ -199,11 +198,6 @@ public: bool execFunc(); private: - bool coalesceValues(unsigned int mask); - bool linearScan(); - bool allocateConstrainedValues(); - -private: class PhiMovesPass : public Pass { private: virtual bool visit(BasicBlock *); @@ -230,19 +224,25 @@ private: bool insertConstraintMoves(); + void condenseDefs(Instruction *); + void condenseSrcs(Instruction *, const int first, const int last); + void addHazard(Instruction *i, const ValueRef *src); void textureMask(TexInstruction *); void addConstraint(Instruction *, int s, int n); bool detectConflict(Instruction *, int s); - DLList constrList; + // target specific functions, TODO: put in subclass or Target + void texConstraintNV50(TexInstruction *); + void texConstraintNVC0(TexInstruction *); + void texConstraintNVE0(TexInstruction *); + + std::list<Instruction *> constrList; + + const Target *targ; }; bool buildLiveSets(BasicBlock *); - void collectLValues(DLList&, bool assignedOnly); - - void insertOrderedTail(DLList&, Value *); - inline Instruction *insnBySerial(int); private: Program *prog; @@ -254,11 +254,36 @@ private: int sequence; // for manual passes through CFG }; -Instruction * -RegAlloc::insnBySerial(int serial) +typedef std::pair<Value *, Value *> ValuePair; + +class SpillCodeInserter { - return reinterpret_cast<Instruction *>(insns.get(serial)); -} +public: + SpillCodeInserter(Function *fn) : func(fn), stackSize(0), stackBase(0) { } + + bool run(const std::list<ValuePair>&); + + Symbol *assignSlot(const Interval&, unsigned int size); + inline int32_t getStackSize() const { return stackSize; } + +private: + Function *func; + + struct SpillSlot + { + Interval occup; + std::list<Value *> residents; // needed to recalculate occup + Symbol *sym; + int32_t offset; + inline uint8_t size() const { return sym->reg.size; } + }; + std::list<SpillSlot> slots; + int32_t stackSize; + int32_t stackBase; + + LValue *unspill(Instruction *usei, LValue *, Value *slot); + void spill(Instruction *defi, Value *slot, LValue *); +}; void RegAlloc::BuildIntervalsPass::addLiveRange(Value *val, @@ -312,23 +337,26 @@ RegAlloc::PhiMovesPass::visit(BasicBlock *bb) Instruction *phi, *mov; BasicBlock *pb, *pn; + std::stack<BasicBlock *> stack; + for (Graph::EdgeIterator ei = bb->cfg.incident(); !ei.end(); ei.next()) { - pb = pn = BasicBlock::get(ei.getNode()); + pb = BasicBlock::get(ei.getNode()); assert(pb); - - if (needNewElseBlock(bb, pb)) { - pn = new BasicBlock(func); - - // deletes an edge, iterator is invalid after this: - pb->cfg.detach(&bb->cfg); - pb->cfg.attach(&pn->cfg, Graph::Edge::TREE); - pn->cfg.attach(&bb->cfg, Graph::Edge::FORWARD); // XXX: check order ! - - assert(pb->getExit()->op != OP_CALL); - if (pb->getExit()->asFlow()->target.bb == bb) - pb->getExit()->asFlow()->target.bb = pn; - break; - } + if (needNewElseBlock(bb, pb)) + stack.push(pb); + } + while (!stack.empty()) { + pb = stack.top(); + pn = new BasicBlock(func); + stack.pop(); + + pb->cfg.detach(&bb->cfg); + pb->cfg.attach(&pn->cfg, Graph::Edge::TREE); + pn->cfg.attach(&bb->cfg, Graph::Edge::FORWARD); + + assert(pb->getExit()->op != OP_CALL); + if (pb->getExit()->asFlow()->target.bb == bb) + pb->getExit()->asFlow()->target.bb = pn; } // insert MOVs (phi->src(j) should stem from j-th in-BB) @@ -567,36 +595,350 @@ RegAlloc::BuildIntervalsPass::visit(BasicBlock *bb) return true; } + +#define JOIN_MASK_PHI (1 << 0) +#define JOIN_MASK_UNION (1 << 1) +#define JOIN_MASK_MOV (1 << 2) +#define JOIN_MASK_TEX (1 << 3) + +class GCRA +{ +public: + GCRA(Function *, SpillCodeInserter&); + ~GCRA(); + + bool allocateRegisters(ArrayList& insns); + + void printNodeInfo() const; + +private: + class RIG_Node : public Graph::Node + { + public: + RIG_Node(); + + void init(const RegisterSet&, LValue *); + + void addInterference(RIG_Node *); + void addRegPreference(RIG_Node *); + + inline LValue *getValue() const + { + return reinterpret_cast<LValue *>(data); + } + inline void setValue(LValue *lval) { data = lval; } + + inline uint8_t getCompMask() const + { + return ((1 << colors) - 1) << (reg & 7); + } + + static inline RIG_Node *get(const Graph::EdgeIterator& ei) + { + return static_cast<RIG_Node *>(ei.getNode()); + } + + public: + uint32_t degree; + uint16_t degreeLimit; // if deg < degLimit, node is trivially colourable + uint16_t colors; + + DataFile f; + int32_t reg; + + float weight; + + // list pointers for simplify() phase + RIG_Node *next; + RIG_Node *prev; + + // union of the live intervals of all coalesced values (we want to retain + // the separate intervals for testing interference of compound values) + Interval livei; + + std::list<RIG_Node *> prefRegs; + }; + +private: + inline RIG_Node *getNode(const LValue *v) const { return &nodes[v->id]; } + + void buildRIG(ArrayList&); + bool coalesce(ArrayList&); + bool doCoalesce(ArrayList&, unsigned int mask); + void calculateSpillWeights(); + void simplify(); + bool selectRegisters(); + void cleanup(const bool success); + + void simplifyEdge(RIG_Node *, RIG_Node *); + void simplifyNode(RIG_Node *); + + bool coalesceValues(Value *, Value *, bool force); + void resolveSplitsAndMerges(); + void makeCompound(Instruction *, bool isSplit); + + inline void checkInterference(const RIG_Node *, Graph::EdgeIterator&); + + inline void insertOrderedTail(std::list<RIG_Node *>&, RIG_Node *); + void checkList(std::list<RIG_Node *>&); + +private: + std::stack<uint32_t> stack; + + // list headers for simplify() phase + RIG_Node lo[2]; + RIG_Node hi; + + Graph RIG; + RIG_Node *nodes; + unsigned int nodeCount; + + Function *func; + Program *prog; + + static uint8_t relDegree[17][17]; + + RegisterSet regs; + + // need to fixup register id for participants of OP_MERGE/SPLIT + std::list<Instruction *> merges; + std::list<Instruction *> splits; + + SpillCodeInserter& spill; + std::list<ValuePair> mustSpill; +}; + +uint8_t GCRA::relDegree[17][17]; + +GCRA::RIG_Node::RIG_Node() : Node(NULL), next(this), prev(this) +{ + colors = 0; +} + +void +GCRA::printNodeInfo() const +{ + for (unsigned int i = 0; i < nodeCount; ++i) { + if (!nodes[i].colors) + continue; + INFO("RIG_Node[%%%i]($[%u]%i): %u colors, weight %f, deg %u/%u\n X", + i, + nodes[i].f,nodes[i].reg,nodes[i].colors, + nodes[i].weight, + nodes[i].degree, nodes[i].degreeLimit); + + for (Graph::EdgeIterator ei = nodes[i].outgoing(); !ei.end(); ei.next()) + INFO(" %%%i", RIG_Node::get(ei)->getValue()->id); + for (Graph::EdgeIterator ei = nodes[i].incident(); !ei.end(); ei.next()) + INFO(" %%%i", RIG_Node::get(ei)->getValue()->id); + INFO("\n"); + } +} + +void +GCRA::RIG_Node::init(const RegisterSet& regs, LValue *lval) +{ + setValue(lval); + if (lval->reg.data.id >= 0) + lval->noSpill = lval->fixedReg = 1; + + colors = regs.units(lval->reg.file, lval->reg.size); + f = lval->reg.file; + reg = -1; + if (lval->reg.data.id >= 0) + reg = regs.idToUnits(lval); + + weight = std::numeric_limits<float>::infinity(); + degree = 0; + degreeLimit = regs.getFileSize(f, lval->reg.size); + + livei.insert(lval->livei); +} + +bool +GCRA::coalesceValues(Value *dst, Value *src, bool force) +{ + LValue *rep = dst->join->asLValue(); + LValue *val = src->join->asLValue(); + + if (!force && val->reg.data.id >= 0) { + rep = src->join->asLValue(); + val = dst->join->asLValue(); + } + RIG_Node *nRep = &nodes[rep->id]; + RIG_Node *nVal = &nodes[val->id]; + + if (src->reg.file != dst->reg.file) { + if (!force) + return false; + WARN("forced coalescing of values in different files !\n"); + } + if (!force && dst->reg.size != src->reg.size) + return false; + + if ((rep->reg.data.id >= 0) && (rep->reg.data.id != val->reg.data.id)) { + if (force) { + if (val->reg.data.id >= 0) + WARN("forced coalescing of values in different fixed regs !\n"); + } else { + if (val->reg.data.id >= 0) + return false; + // make sure that there is no overlap with the fixed register of rep + for (ArrayList::Iterator it = func->allLValues.iterator(); + !it.end(); it.next()) { + Value *reg = reinterpret_cast<Value *>(it.get())->asLValue(); + assert(reg); + if (reg->interfers(rep) && reg->livei.overlaps(nVal->livei)) + return false; + } + } + } + + if (!force && nRep->livei.overlaps(nVal->livei)) + return false; + + INFO_DBG(prog->dbgFlags, REG_ALLOC, "joining %%%i($%i) <- %%%i\n", + rep->id, rep->reg.data.id, val->id); + + // set join pointer of all values joined with val + for (Value::DefIterator def = val->defs.begin(); def != val->defs.end(); + ++def) + (*def)->get()->join = rep; + assert(rep->join == rep && val->join == rep); + + // add val's definitions to rep and extend the live interval of its RIG node + rep->defs.insert(rep->defs.end(), val->defs.begin(), val->defs.end()); + nRep->livei.unify(nVal->livei); + return true; +} + +bool +GCRA::coalesce(ArrayList& insns) +{ + bool ret = doCoalesce(insns, JOIN_MASK_PHI); + if (!ret) + return false; + switch (func->getProgram()->getTarget()->getChipset() & ~0xf) { + case 0x50: + case 0x80: + case 0x90: + case 0xa0: + ret = doCoalesce(insns, JOIN_MASK_UNION | JOIN_MASK_TEX); + break; + case 0xc0: + case 0xd0: + case 0xe0: + ret = doCoalesce(insns, JOIN_MASK_UNION); + break; + default: + break; + } + if (!ret) + return false; + return doCoalesce(insns, JOIN_MASK_MOV); +} + +static inline uint8_t makeCompMask(int compSize, int base, int size) +{ + uint8_t m = ((1 << size) - 1) << base; + + switch (compSize) { + case 1: + return 0xff; + case 2: + m |= (m << 2); + return (m << 4) | m; + case 3: + case 4: + return (m << 4) | m; + default: + assert(compSize <= 8); + return m; + } +} + +static inline void copyCompound(Value *dst, Value *src) +{ + LValue *ldst = dst->asLValue(); + LValue *lsrc = src->asLValue(); + + ldst->compound = lsrc->compound; + ldst->compMask = lsrc->compMask; +} + +void +GCRA::makeCompound(Instruction *insn, bool split) +{ + LValue *rep = (split ? insn->getSrc(0) : insn->getDef(0))->asLValue(); + + if (prog->dbgFlags & NV50_IR_DEBUG_REG_ALLOC) { + INFO("makeCompound(split = %i): ", split); + insn->print(); + } + + const unsigned int size = getNode(rep)->colors; + unsigned int base = 0; + + if (!rep->compound) + rep->compMask = 0xff; + rep->compound = 1; + + for (int c = 0; split ? insn->defExists(c) : insn->srcExists(c); ++c) { + LValue *val = (split ? insn->getDef(c) : insn->getSrc(c))->asLValue(); + + val->compound = 1; + if (!val->compMask) + val->compMask = 0xff; + val->compMask &= makeCompMask(size, base, getNode(val)->colors); + assert(val->compMask); + + INFO_DBG(prog->dbgFlags, REG_ALLOC, "compound: %%%i:%02x <- %%%i:%02x\n", + rep->id, rep->compMask, val->id, val->compMask); + + base += getNode(val)->colors; + } + assert(base == size); +} + bool -RegAlloc::coalesceValues(unsigned int mask) +GCRA::doCoalesce(ArrayList& insns, unsigned int mask) { int c, n; for (n = 0; n < insns.getSize(); ++n) { Instruction *i; - Instruction *insn = insnBySerial(n); + Instruction *insn = reinterpret_cast<Instruction *>(insns.get(n)); switch (insn->op) { case OP_PHI: if (!(mask & JOIN_MASK_PHI)) break; for (c = 0; insn->srcExists(c); ++c) - if (!insn->getDef(0)->coalesce(insn->getSrc(c), false)) { + if (!coalesceValues(insn->getDef(0), insn->getSrc(c), false)) { + // this is bad ERROR("failed to coalesce phi operands\n"); return false; } break; case OP_UNION: + case OP_MERGE: if (!(mask & JOIN_MASK_UNION)) break; for (c = 0; insn->srcExists(c); ++c) - insn->getDef(0)->coalesce(insn->getSrc(c), true); + coalesceValues(insn->getDef(0), insn->getSrc(c), true); + if (insn->op == OP_MERGE) { + merges.push_back(insn); + if (insn->srcExists(1)) + makeCompound(insn, false); + } break; - case OP_CONSTRAINT: - if (!(mask & JOIN_MASK_CONSTRAINT)) + case OP_SPLIT: + if (!(mask & JOIN_MASK_UNION)) break; - for (c = 0; c < 4 && insn->srcExists(c); ++c) - insn->getDef(c)->coalesce(insn->getSrc(c), true); + splits.push_back(insn); + for (c = 0; insn->defExists(c); ++c) + coalesceValues(insn->getSrc(0), insn->getDef(c), true); + makeCompound(insn, true); break; case OP_MOV: if (!(mask & JOIN_MASK_MOV)) @@ -605,11 +947,13 @@ RegAlloc::coalesceValues(unsigned int mask) if (!insn->getDef(0)->uses.empty()) i = insn->getDef(0)->uses.front()->getInsn(); // if this is a contraint-move there will only be a single use - if (i && i->op == OP_CONSTRAINT) + if (i && i->op == OP_MERGE) // do we really still need this ? break; i = insn->getSrc(0)->getUniqueInsn(); - if (i && !i->constrainedDefs()) - insn->getDef(0)->coalesce(insn->getSrc(0), false); + if (i && !i->constrainedDefs()) { + if (coalesceValues(insn->getDef(0), insn->getSrc(0), false)) + copyCompound(insn->getSrc(0), insn->getDef(0)); + } break; case OP_TEX: case OP_TXB: @@ -621,8 +965,8 @@ RegAlloc::coalesceValues(unsigned int mask) case OP_TEXCSAA: if (!(mask & JOIN_MASK_TEX)) break; - for (c = 0; c < 4 && insn->srcExists(c); ++c) - insn->getDef(c)->coalesce(insn->getSrc(c), true); + for (c = 0; insn->srcExists(c) && c != insn->predSrc; ++c) + coalesceValues(insn->getDef(c), insn->getSrc(c), true); break; default: break; @@ -632,186 +976,555 @@ RegAlloc::coalesceValues(unsigned int mask) } void -RegAlloc::insertOrderedTail(DLList &list, Value *val) +GCRA::RIG_Node::addInterference(RIG_Node *node) +{ + this->degree += relDegree[node->colors][colors]; + node->degree += relDegree[colors][node->colors]; + + this->attach(node, Graph::Edge::CROSS); +} + +void +GCRA::RIG_Node::addRegPreference(RIG_Node *node) +{ + prefRegs.push_back(node); +} + +GCRA::GCRA(Function *fn, SpillCodeInserter& spill) : + func(fn), + regs(fn->getProgram()->getTarget()), + spill(spill) { - // we insert the live intervals in order, so this should be short - DLList::Iterator iter = list.revIterator(); - const int begin = val->livei.begin(); - for (; !iter.end(); iter.next()) { - if (reinterpret_cast<Value *>(iter.get())->livei.begin() <= begin) + prog = func->getProgram(); + + // initialize relative degrees array - i takes away from j + for (int i = 1; i <= 16; ++i) + for (int j = 1; j <= 16; ++j) + relDegree[i][j] = j * ((i + j - 1) / j); +} + +GCRA::~GCRA() +{ + if (nodes) + delete[] nodes; +} + +void +GCRA::checkList(std::list<RIG_Node *>& lst) +{ + GCRA::RIG_Node *prev = NULL; + + for (std::list<RIG_Node *>::iterator it = lst.begin(); + it != lst.end(); + ++it) { + assert((*it)->getValue()->join == (*it)->getValue()); + if (prev) + assert(prev->livei.begin() <= (*it)->livei.begin()); + prev = *it; + } +} + +void +GCRA::insertOrderedTail(std::list<RIG_Node *>& list, RIG_Node *node) +{ + if (node->livei.isEmpty()) + return; + // only the intervals of joined values don't necessarily arrive in order + std::list<RIG_Node *>::iterator prev, it; + for (it = list.end(); it != list.begin(); it = prev) { + prev = it; + --prev; + if ((*prev)->livei.begin() <= node->livei.begin()) break; } - iter.insert(val); + list.insert(it, node); } -static void -checkList(DLList &list) +void +GCRA::buildRIG(ArrayList& insns) { - Value *prev = NULL; - Value *next = NULL; + std::list<RIG_Node *> values, active; + + for (std::deque<ValueDef>::iterator it = func->ins.begin(); + it != func->ins.end(); ++it) + insertOrderedTail(values, getNode(it->get()->asLValue())); + + for (int i = 0; i < insns.getSize(); ++i) { + Instruction *insn = reinterpret_cast<Instruction *>(insns.get(i)); + for (int d = 0; insn->defExists(d); ++d) + if (insn->getDef(d)->rep() == insn->getDef(d)) + insertOrderedTail(values, getNode(insn->getDef(d)->asLValue())); + } + checkList(values); - for (DLList::Iterator iter = list.iterator(); !iter.end(); iter.next()) { - next = Value::get(iter); - assert(next); - if (prev) { - assert(prev->livei.begin() <= next->livei.begin()); + while (!values.empty()) { + RIG_Node *cur = values.front(); + + for (std::list<RIG_Node *>::iterator it = active.begin(); + it != active.end(); + ++it) { + RIG_Node *node = *it; + + if (node->livei.end() <= cur->livei.begin()) { + it = active.erase(it); + --it; + } else + if (node->f == cur->f && node->livei.overlaps(cur->livei)) { + cur->addInterference(node); + } } - assert(next->join == next); - prev = next; + values.pop_front(); + active.push_back(cur); } } void -RegAlloc::collectLValues(DLList &list, bool assignedOnly) +GCRA::calculateSpillWeights() { - for (std::deque<ValueDef>::iterator it = func->ins.begin(); - it != func->ins.end(); ++it) { - if (!assignedOnly || it->get()->reg.data.id >= 0) - insertOrderedTail(list, it->get()); - } + for (unsigned int i = 0; i < nodeCount; ++i) { + RIG_Node *const n = &nodes[i]; + if (!nodes[i].colors || nodes[i].livei.isEmpty()) + continue; + if (nodes[i].reg >= 0) { + // update max reg + regs.occupy(n->f, n->reg, n->colors); + continue; + } + LValue *val = nodes[i].getValue(); - for (int n = 0; n < insns.getSize(); ++n) { - Instruction *i = insnBySerial(n); + if (!val->noSpill) { + int rc = 0; + for (Value::DefIterator it = val->defs.begin(); + it != val->defs.end(); + ++it) + rc += (*it)->get()->refCount(); - for (int d = 0; i->defExists(d); ++d) - if (!i->getDef(d)->livei.isEmpty()) - if (!assignedOnly || i->getDef(d)->reg.data.id >= 0) - insertOrderedTail(list, i->getDef(d)); + nodes[i].weight = + (float)rc * (float)rc / (float)nodes[i].livei.extent(); + } + + if (nodes[i].degree < nodes[i].degreeLimit) { + int l = 0; + if (val->reg.size > 4) + l = 1; + DLLIST_ADDHEAD(&lo[l], &nodes[i]); + } else { + DLLIST_ADDHEAD(&hi, &nodes[i]); + } } - checkList(list); + if (prog->dbgFlags & NV50_IR_DEBUG_REG_ALLOC) + printNodeInfo(); } -bool -RegAlloc::allocateConstrainedValues() +void +GCRA::simplifyEdge(RIG_Node *a, RIG_Node *b) { - Value *defs[4]; - RegisterSet regSet[4]; - DLList regVals; + bool move = b->degree >= b->degreeLimit; - INFO_DBG(prog->dbgFlags, REG_ALLOC, "RA: allocating constrained values\n"); + INFO_DBG(prog->dbgFlags, REG_ALLOC, + "edge: (%%%i, deg %u/%u) >-< (%%%i, deg %u/%u)\n", + a->getValue()->id, a->degree, a->degreeLimit, + b->getValue()->id, b->degree, b->degreeLimit); - collectLValues(regVals, true); + b->degree -= relDegree[a->colors][b->colors]; - for (int c = 0; c < 4; ++c) - regSet[c].init(prog->getTarget()); + move = move && b->degree < b->degreeLimit; + if (move && !DLLIST_EMPTY(b)) { + int l = (b->getValue()->reg.size > 4) ? 1 : 0; + DLLIST_DEL(b); + DLLIST_ADDTAIL(&lo[l], b); + } +} - for (int n = 0; n < insns.getSize(); ++n) { - Instruction *i = insnBySerial(n); +void +GCRA::simplifyNode(RIG_Node *node) +{ + for (Graph::EdgeIterator ei = node->outgoing(); !ei.end(); ei.next()) + simplifyEdge(node, RIG_Node::get(ei)); - const int vecSize = i->defCount(0xf); - if (vecSize < 2) - continue; - assert(vecSize <= 4); + for (Graph::EdgeIterator ei = node->incident(); !ei.end(); ei.next()) + simplifyEdge(node, RIG_Node::get(ei)); - for (int c = 0; c < vecSize; ++c) - defs[c] = i->def(c).rep(); + DLLIST_DEL(node); + stack.push(node->getValue()->id); - if (defs[0]->reg.data.id >= 0) { - for (int c = 1; c < vecSize; ++c) { - assert(defs[c]->reg.data.id >= 0); + INFO_DBG(prog->dbgFlags, REG_ALLOC, "SIMPLIFY: pushed %%%i%s\n", + node->getValue()->id, + (node->degree < node->degreeLimit) ? "" : "(spill)"); +} + +void +GCRA::simplify() +{ + for (;;) { + if (!DLLIST_EMPTY(&lo[0])) { + do { + simplifyNode(lo[0].next); + } while (!DLLIST_EMPTY(&lo[0])); + } else + if (!DLLIST_EMPTY(&lo[1])) { + simplifyNode(lo[1].next); + } else + if (!DLLIST_EMPTY(&hi)) { + RIG_Node *best = hi.next; + float bestScore = best->weight / (float)best->degree; + // spill candidate + for (RIG_Node *it = best->next; it != &hi; it = it->next) { + float score = it->weight / (float)it->degree; + if (score < bestScore) { + best = it; + bestScore = score; + } } - continue; + if (isinf(bestScore)) { + ERROR("no viable spill candidates left\n"); + break; + } + simplifyNode(best); + } else { + break; } + } +} + +void +GCRA::checkInterference(const RIG_Node *node, Graph::EdgeIterator& ei) +{ + const RIG_Node *intf = RIG_Node::get(ei); + + if (intf->reg < 0) + return; + const LValue *vA = node->getValue(); + const LValue *vB = intf->getValue(); - for (int c = 0; c < vecSize; ++c) { - uint32_t mask; - regSet[c].reset(); + const uint8_t intfMask = ((1 << intf->colors) - 1) << (intf->reg & 7); - for (DLList::Iterator it = regVals.iterator(); !it.end(); it.next()) { - Value *rVal = Value::get(it); - if (rVal->reg.data.id >= 0 && rVal->livei.overlaps(defs[c]->livei)) - regSet[c].occupy(rVal); + if (vA->compound | vB->compound) { + // NOTE: this only works for >aligned< register tuples ! + for (Value::DefCIterator D = vA->defs.begin(); D != vA->defs.end(); ++D) { + for (Value::DefCIterator d = vB->defs.begin(); d != vB->defs.end(); ++d) { + const LValue *vD = (*D)->get()->asLValue(); + const LValue *vd = (*d)->get()->asLValue(); + + if (!vD->livei.overlaps(vd->livei)) { + INFO_DBG(prog->dbgFlags, REG_ALLOC, "(%%%i) X (%%%i): no overlap\n", + vD->id, vd->id); + continue; } - mask = 0x11111111; - if (vecSize == 2) // granularity is 2 instead of 4 - mask |= 0x11111111 << 2; - regSet[c].periodicMask(defs[0]->reg.file, 0, ~(mask << c)); - if (!defs[c]->livei.isEmpty()) - insertOrderedTail(regVals, defs[c]); + uint8_t mask = vD->compound ? vD->compMask : ~0; + if (vd->compound) { + assert(vB->compound); + mask &= vd->compMask & vB->compMask; + } else { + mask &= intfMask; + } + + INFO_DBG(prog->dbgFlags, REG_ALLOC, + "(%%%i)%02x X (%%%i)%02x & %02x: $r%i.%02x\n", + vD->id, + vD->compound ? vD->compMask : 0xff, + vd->id, + vd->compound ? vd->compMask : intfMask, + vB->compMask, intf->reg & ~7, mask); + if (mask) + regs.occupyMask(node->f, intf->reg & ~7, mask); } - for (int c = 1; c < vecSize; ++c) - regSet[0].intersect(defs[0]->reg.file, ®Set[c]); + } + } else { + INFO_DBG(prog->dbgFlags, REG_ALLOC, + "(%%%i) X (%%%i): $r%i + %u\n", + vA->id, vB->id, intf->reg, intf->colors); + regs.occupy(node->f, intf->reg, intf->colors); + } +} - if (!regSet[0].assign(&defs[0], vecSize)) // TODO: spilling - return false; +bool +GCRA::selectRegisters() +{ + INFO_DBG(prog->dbgFlags, REG_ALLOC, "\nSELECT phase\n"); + + while (!stack.empty()) { + RIG_Node *node = &nodes[stack.top()]; + stack.pop(); + + regs.reset(node->f); + + INFO_DBG(prog->dbgFlags, REG_ALLOC, "\nNODE[%%%i, %u colors]\n", + node->getValue()->id, node->colors); + + for (Graph::EdgeIterator ei = node->outgoing(); !ei.end(); ei.next()) + checkInterference(node, ei); + for (Graph::EdgeIterator ei = node->incident(); !ei.end(); ei.next()) + checkInterference(node, ei); + + if (!node->prefRegs.empty()) { + for (std::list<RIG_Node *>::const_iterator it = node->prefRegs.begin(); + it != node->prefRegs.end(); + ++it) { + if ((*it)->reg >= 0 && + regs.occupy(node->f, (*it)->reg, node->colors)) { + node->reg = (*it)->reg; + break; + } + } + } + if (node->reg >= 0) + continue; + LValue *lval = node->getValue(); + if (prog->dbgFlags & NV50_IR_DEBUG_REG_ALLOC) + regs.print(); + bool ret = regs.assign(node->reg, node->f, node->colors); + if (ret) { + INFO_DBG(prog->dbgFlags, REG_ALLOC, "assigned reg %i\n", node->reg); + lval->compMask = node->getCompMask(); + } else { + INFO_DBG(prog->dbgFlags, REG_ALLOC, "must spill: %%%i (size %u)\n", + lval->id, lval->reg.size); + Symbol *slot = NULL; + if (lval->reg.file == FILE_GPR) + slot = spill.assignSlot(node->livei, lval->reg.size); + mustSpill.push_back(ValuePair(lval, slot)); + } + } + if (!mustSpill.empty()) + return false; + for (unsigned int i = 0; i < nodeCount; ++i) { + LValue *lval = nodes[i].getValue(); + if (nodes[i].reg >= 0 && nodes[i].colors > 0) + lval->reg.data.id = + regs.unitsToId(nodes[i].f, nodes[i].reg, lval->reg.size); } - for (int c = 0; c < 4; c += 2) - if (regSet[c].getMaxAssigned(FILE_GPR) > prog->maxGPR) - prog->maxGPR = regSet[c].getMaxAssigned(FILE_GPR); return true; } bool -RegAlloc::linearScan() +GCRA::allocateRegisters(ArrayList& insns) { - Value *cur, *val; - DLList unhandled, active, inactive; - RegisterSet f(prog->getTarget()), free(prog->getTarget()); + bool ret; + + INFO_DBG(prog->dbgFlags, REG_ALLOC, + "allocateRegisters to %u instructions\n", insns.getSize()); - INFO_DBG(prog->dbgFlags, REG_ALLOC, "RA: linear scan\n"); + nodeCount = func->allLValues.getSize(); + nodes = new RIG_Node[nodeCount]; + if (!nodes) + return false; + for (unsigned int i = 0; i < nodeCount; ++i) { + LValue *lval = reinterpret_cast<LValue *>(func->allLValues.get(i)); + if (lval) { + nodes[i].init(regs, lval); + RIG.insert(&nodes[i]); + } + } - collectLValues(unhandled, false); + // coalesce first, we use only 1 RIG node for a group of joined values + ret = coalesce(insns); + if (!ret) + goto out; - for (DLList::Iterator cI = unhandled.iterator(); !cI.end();) { - cur = Value::get(cI); - cI.erase(); + if (func->getProgram()->dbgFlags & NV50_IR_DEBUG_REG_ALLOC) + func->printLiveIntervals(); - for (DLList::Iterator aI = active.iterator(); !aI.end();) { - val = Value::get(aI); - if (val->livei.end() <= cur->livei.begin()) { - free.release(val); - aI.erase(); - } else - if (!val->livei.contains(cur->livei.begin())) { - free.release(val); - aI.moveToList(inactive); - } else { - aI.next(); - } + buildRIG(insns); + calculateSpillWeights(); + simplify(); + + ret = selectRegisters(); + if (!ret) { + INFO_DBG(prog->dbgFlags, REG_ALLOC, + "selectRegisters failed, inserting spill code ...\n"); + regs.reset(FILE_GPR, true); + spill.run(mustSpill); + if (prog->dbgFlags & NV50_IR_DEBUG_REG_ALLOC) + func->print(); + } else { + prog->maxGPR = regs.getMaxAssigned(FILE_GPR); + } + +out: + cleanup(ret); + return ret; +} + +void +GCRA::cleanup(const bool success) +{ + mustSpill.clear(); + + for (ArrayList::Iterator it = func->allLValues.iterator(); + !it.end(); it.next()) { + LValue *lval = reinterpret_cast<LValue *>(it.get()); + + lval->livei.clear(); + + lval->compound = 0; + lval->compMask = 0; + + if (lval->join == lval) + continue; + + if (success) { + lval->reg.data.id = lval->join->reg.data.id; + } else { + for (Value::DefIterator d = lval->defs.begin(); d != lval->defs.end(); + ++d) + lval->join->defs.remove(*d); + lval->join = lval; } + } - for (DLList::Iterator iI = inactive.iterator(); !iI.end();) { - val = Value::get(iI); - if (val->livei.end() <= cur->livei.begin()) { - iI.erase(); - } else - if (val->livei.contains(cur->livei.begin())) { - free.occupy(val); - iI.moveToList(active); - } else { - iI.next(); + if (success) + resolveSplitsAndMerges(); + splits.clear(); // avoid duplicate entries on next coalesce pass + merges.clear(); + + delete[] nodes; + nodes = NULL; +} + +Symbol * +SpillCodeInserter::assignSlot(const Interval &livei, unsigned int size) +{ + SpillSlot slot; + int32_t offsetBase = stackSize; + int32_t offset; + std::list<SpillSlot>::iterator pos = slots.end(), it = slots.begin(); + + if (offsetBase % size) + offsetBase += size - (offsetBase % size); + + slot.sym = NULL; + + for (offset = offsetBase; offset < stackSize; offset += size) { + while (it != slots.end() && it->offset < offset) + ++it; + if (it == slots.end()) // no slots left + break; + std::list<SpillSlot>::iterator bgn = it; + + while (it != slots.end() && it->offset < (offset + size)) { + it->occup.print(); + if (it->occup.overlaps(livei)) + break; + ++it; + } + if (it == slots.end() || it->offset >= (offset + size)) { + // fits + for (; bgn != slots.end() && bgn->offset < (offset + size); ++bgn) { + bgn->occup.insert(livei); + if (bgn->size() == size) + slot.sym = bgn->sym; } + break; } - f = free; + } + if (!slot.sym) { + stackSize = offset + size; + slot.offset = offset; + slot.sym = new_Symbol(func->getProgram(), FILE_MEMORY_LOCAL); + if (!func->stackPtr) + offset += func->tlsBase; + slot.sym->setAddress(NULL, offset); + slot.sym->reg.size = size; + slots.insert(pos, slot)->occup.insert(livei); + } + return slot.sym; +} - for (DLList::Iterator iI = inactive.iterator(); !iI.end(); iI.next()) { - val = Value::get(iI); - if (val->livei.overlaps(cur->livei)) - f.occupy(val); - } +void +SpillCodeInserter::spill(Instruction *defi, Value *slot, LValue *lval) +{ + const DataType ty = typeOfSize(slot->reg.size); + + Instruction *st; + if (slot->reg.file == FILE_MEMORY_LOCAL) { + st = new_Instruction(func, OP_STORE, ty); + st->setSrc(0, slot); + st->setSrc(1, lval); + lval->noSpill = 1; + } else { + st = new_Instruction(func, OP_CVT, ty); + st->setDef(0, slot); + st->setSrc(0, lval); + } + defi->bb->insertAfter(defi, st); +} - for (DLList::Iterator uI = unhandled.iterator(); !uI.end(); uI.next()) { - val = Value::get(uI); - if (val->reg.data.id >= 0 && val->livei.overlaps(cur->livei)) - f.occupy(val); - } +LValue * +SpillCodeInserter::unspill(Instruction *usei, LValue *lval, Value *slot) +{ + const DataType ty = typeOfSize(slot->reg.size); + + lval = cloneShallow(func, lval); - if (cur->reg.data.id < 0) { - bool spill = !f.assign(&cur, 1); - if (spill) { - ERROR("out of registers of file %u\n", cur->reg.file); - abort(); + Instruction *ld; + if (slot->reg.file == FILE_MEMORY_LOCAL) { + lval->noSpill = 1; + ld = new_Instruction(func, OP_LOAD, ty); + } else { + ld = new_Instruction(func, OP_CVT, ty); + } + ld->setDef(0, lval); + ld->setSrc(0, slot); + + usei->bb->insertBefore(usei, ld); + return lval; +} + +bool +SpillCodeInserter::run(const std::list<ValuePair>& lst) +{ + for (std::list<ValuePair>::const_iterator it = lst.begin(); it != lst.end(); + ++it) { + LValue *lval = it->first->asLValue(); + Symbol *mem = it->second ? it->second->asSym() : NULL; + + for (Value::DefIterator d = lval->defs.begin(); d != lval->defs.end(); + ++d) { + Value *slot = mem ? + static_cast<Value *>(mem) : new_LValue(func, FILE_GPR); + Value *tmp = NULL; + Instruction *last = NULL; + + LValue *dval = (*d)->get()->asLValue(); + Instruction *defi = (*d)->getInsn(); + + // handle uses first or they'll contain the spill stores + while (!dval->uses.empty()) { + ValueRef *u = dval->uses.front(); + Instruction *usei = u->getInsn(); + assert(usei); + if (usei->op == OP_PHI) { + tmp = (slot->reg.file == FILE_MEMORY_LOCAL) ? NULL : slot; + last = NULL; + } else + if (!last || usei != last->next) { // TODO: sort uses + tmp = unspill(usei, dval, slot); + last = usei; + } + u->set(tmp); + } + + assert(defi); + if (defi->op == OP_PHI) { + d = lval->defs.erase(d); + --d; + if (slot->reg.file == FILE_MEMORY_LOCAL) + delete_Instruction(func->getProgram(), defi); + else + defi->setDef(0, slot); + } else { + spill(defi, slot, dval); } } - free.occupy(cur); - active.insert(cur); + } - if (f.getMaxAssigned(FILE_GPR) > prog->maxGPR) - prog->maxGPR = f.getMaxAssigned(FILE_GPR); - if (free.getMaxAssigned(FILE_GPR) > prog->maxGPR) - prog->maxGPR = free.getMaxAssigned(FILE_GPR); + // TODO: We're not trying to reuse old slots in a potential next iteration. + // We have to update the slots' livei intervals to be able to do that. + stackBase = stackSize; + slots.clear(); return true; } @@ -821,8 +1534,11 @@ RegAlloc::exec() for (IteratorRef it = prog->calls.iteratorDFS(false); !it->end(); it->next()) { func = Function::get(reinterpret_cast<Graph::Node *>(it->get())); + + func->tlsBase = prog->tlsSize; if (!execFunc()) return false; + prog->tlsSize += func->tlsSize; } return true; } @@ -834,8 +1550,11 @@ RegAlloc::execFunc() PhiMovesPass insertPhiMoves; ArgumentMovesPass insertArgMoves; BuildIntervalsPass buildIntervals; + SpillCodeInserter insertSpills(func); + + GCRA gcra(func, insertSpills); - unsigned int i; + unsigned int i, retries; bool ret; ret = insertConstr.exec(func); @@ -846,60 +1565,76 @@ RegAlloc::execFunc() if (!ret) goto out; - ret = insertArgMoves.run(func, true); + ret = insertArgMoves.run(func); if (!ret) goto out; - for (sequence = func->cfg.nextSequence(), i = 0; - ret && i <= func->loopNestingBound; - sequence = func->cfg.nextSequence(), ++i) - ret = buildLiveSets(BasicBlock::get(func->cfg.getRoot())); - if (!ret) - goto out; - - func->orderInstructions(this->insns); - - ret = buildIntervals.run(func); - if (!ret) - goto out; - - ret = coalesceValues(JOIN_MASK_PHI); - if (!ret) - goto out; - switch (prog->getTarget()->getChipset() & 0xf0) { - case 0x50: - ret = coalesceValues(JOIN_MASK_UNION | JOIN_MASK_TEX); - break; - case 0xc0: - ret = coalesceValues(JOIN_MASK_UNION | JOIN_MASK_CONSTRAINT); - break; - default: - break; - } - if (!ret) - goto out; - ret = coalesceValues(JOIN_MASK_MOV); - if (!ret) - goto out; + // TODO: need to fix up spill slot usage ranges to support > 1 retry + for (retries = 0; retries < 3; ++retries) { + if (retries && (prog->dbgFlags & NV50_IR_DEBUG_REG_ALLOC)) + INFO("Retry: %i\n", retries); + if (prog->dbgFlags & NV50_IR_DEBUG_REG_ALLOC) + func->print(); + + // spilling to registers may add live ranges, need to rebuild everything + ret = true; + for (sequence = func->cfg.nextSequence(), i = 0; + ret && i <= func->loopNestingBound; + sequence = func->cfg.nextSequence(), ++i) + ret = buildLiveSets(BasicBlock::get(func->cfg.getRoot())); + if (!ret) + break; + func->orderInstructions(this->insns); - if (prog->dbgFlags & NV50_IR_DEBUG_REG_ALLOC) { - func->print(); - func->printLiveIntervals(); + ret = buildIntervals.run(func); + if (!ret) + break; + ret = gcra.allocateRegisters(insns); + if (ret) + break; // success } + INFO_DBG(prog->dbgFlags, REG_ALLOC, "RegAlloc done: %i\n", ret); - ret = allocateConstrainedValues() && linearScan(); - if (!ret) - goto out; - + func->tlsSize = insertSpills.getStackSize(); out: - // TODO: should probably call destructor on LValues later instead - for (ArrayList::Iterator it = func->allLValues.iterator(); - !it.end(); it.next()) - reinterpret_cast<LValue *>(it.get())->livei.clear(); - return ret; } +// TODO: check if modifying Instruction::join here breaks anything +void +GCRA::resolveSplitsAndMerges() +{ + for (std::list<Instruction *>::iterator it = splits.begin(); + it != splits.end(); + ++it) { + Instruction *split = *it; + unsigned int reg = regs.idToBytes(split->getSrc(0)); + for (int d = 0; split->defExists(d); ++d) { + Value *v = split->getDef(d); + v->reg.data.id = regs.bytesToId(v, reg); + v->join = v; + reg += v->reg.size; + } + delete_Instruction(prog, split); + } + splits.clear(); + + for (std::list<Instruction *>::iterator it = merges.begin(); + it != merges.end(); + ++it) { + Instruction *merge = *it; + unsigned int reg = regs.idToBytes(merge->getDef(0)); + for (int s = 0; merge->srcExists(s); ++s) { + Value *v = merge->getSrc(s); + v->reg.data.id = regs.bytesToId(v, reg); + v->join = v; + reg += v->reg.size; + } + delete_Instruction(prog, merge); + } + merges.clear(); +} + bool Program::registerAllocation() { RegAlloc ra(this); @@ -936,17 +1671,10 @@ RegAlloc::InsertConstraintsPass::textureMask(TexInstruction *tex) } tex->tex.mask = mask; -#if 0 // reorder or set the unused ones NULL ? - for (c = 0; c < 4; ++c) - if (!(tex->tex.mask & (1 << c))) - def[d++] = tex->getDef(c); -#endif for (c = 0; c < d; ++c) tex->setDef(c, def[c]); -#if 1 for (; c < 4; ++c) tex->setDef(c, NULL); -#endif } bool @@ -978,8 +1706,10 @@ RegAlloc::InsertConstraintsPass::addConstraint(Instruction *i, int s, int n) int d; // first, look for an existing identical constraint op - for (DLList::Iterator it = constrList.iterator(); !it.end(); it.next()) { - cst = reinterpret_cast<Instruction *>(it.get()); + for (std::list<Instruction *>::iterator it = constrList.begin(); + it != constrList.end(); + ++it) { + cst = (*it); if (!i->bb->dominatedBy(cst->bb)) break; for (d = 0; d < n; ++d) @@ -1000,7 +1730,7 @@ RegAlloc::InsertConstraintsPass::addConstraint(Instruction *i, int s, int n) } i->bb->insertBefore(i, cst); - constrList.insert(cst); + constrList.push_back(cst); } // Add a dummy use of the pointer source of >= 8 byte loads after the load @@ -1015,6 +1745,143 @@ RegAlloc::InsertConstraintsPass::addHazard(Instruction *i, const ValueRef *src) } +// b32 { %r0 %r1 %r2 %r3 } -> b128 %r0q +void +RegAlloc::InsertConstraintsPass::condenseDefs(Instruction *insn) +{ + uint8_t size = 0; + int n; + for (n = 0; insn->defExists(n) && insn->def(n).getFile() == FILE_GPR; ++n) + size += insn->getDef(n)->reg.size; + if (n < 2) + return; + LValue *lval = new_LValue(func, FILE_GPR); + lval->reg.size = size; + + Instruction *split = new_Instruction(func, OP_SPLIT, typeOfSize(size)); + split->setSrc(0, lval); + for (int d = 0; d < n; ++d) { + split->setDef(d, insn->getDef(d)); + insn->setDef(d, NULL); + } + insn->setDef(0, lval); + + for (int k = 1, d = n; insn->defExists(d); ++d, ++k) { + insn->setDef(k, insn->getDef(d)); + insn->setDef(d, NULL); + } + // carry over predicate if any (mainly for OP_UNION uses) + split->setPredicate(insn->cc, insn->getPredicate()); + + insn->bb->insertAfter(insn, split); + constrList.push_back(split); +} +void +RegAlloc::InsertConstraintsPass::condenseSrcs(Instruction *insn, + const int a, const int b) +{ + uint8_t size = 0; + if (a >= b) + return; + for (int s = a; s <= b; ++s) + size += insn->getSrc(s)->reg.size; + if (!size) + return; + LValue *lval = new_LValue(func, FILE_GPR); + lval->reg.size = size; + + Value *save[3]; + insn->takeExtraSources(0, save); + + Instruction *merge = new_Instruction(func, OP_MERGE, typeOfSize(size)); + merge->setDef(0, lval); + for (int s = a, i = 0; s <= b; ++s, ++i) { + merge->setSrc(i, insn->getSrc(s)); + insn->setSrc(s, NULL); + } + insn->setSrc(a, lval); + + for (int k = a + 1, s = b + 1; insn->srcExists(s); ++s, ++k) { + insn->setSrc(k, insn->getSrc(s)); + insn->setSrc(s, NULL); + } + insn->bb->insertBefore(insn, merge); + + insn->putExtraSources(0, save); + + constrList.push_back(merge); +} + +void +RegAlloc::InsertConstraintsPass::texConstraintNVE0(TexInstruction *tex) +{ + textureMask(tex); + condenseDefs(tex); + + int n = tex->srcCount(0xff, true); + if (n > 4) { + condenseSrcs(tex, 0, 3); + if (n > 5) + condenseSrcs(tex, 4, n - 1); + } else + if (n > 1) { + condenseSrcs(tex, 0, n - 1); + } +} + +void +RegAlloc::InsertConstraintsPass::texConstraintNVC0(TexInstruction *tex) +{ + int n, s; + + textureMask(tex); + + if (tex->op == OP_TXQ) { + s = tex->srcCount(0xff); + n = 0; + } else { + s = tex->tex.target.getArgCount(); + if (!tex->tex.target.isArray() && + (tex->tex.rIndirectSrc >= 0 || tex->tex.sIndirectSrc >= 0)) + ++s; + if (tex->op == OP_TXD && tex->tex.useOffsets) + ++s; + n = tex->srcCount(0xff) - s; + assert(n <= 4); + } + + if (s > 1) + condenseSrcs(tex, 0, s - 1); + if (n > 1) + condenseSrcs(tex, s, s + (n - 1)); + + condenseDefs(tex); +} + +void +RegAlloc::InsertConstraintsPass::texConstraintNV50(TexInstruction *tex) +{ + Value *pred = tex->getPredicate(); + if (pred) + tex->setPredicate(tex->cc, NULL); + + textureMask(tex); + + assert(tex->defExists(0) && tex->srcExists(0)); + // make src and def count match + int c; + for (c = 0; tex->srcExists(c) || tex->defExists(c); ++c) { + if (!tex->srcExists(c)) + tex->setSrc(c, new_LValue(func, tex->getSrc(0)->asLValue())); + if (!tex->defExists(c)) + tex->setDef(c, new_LValue(func, tex->getDef(0)->asLValue())); + } + if (pred) + tex->setPredicate(tex->cc, pred); + condenseDefs(tex); + condenseSrcs(tex, 0, c - 1); +} + // Insert constraint markers for instructions whose multiple sources must be // located in consecutive registers. bool @@ -1022,45 +1889,46 @@ RegAlloc::InsertConstraintsPass::visit(BasicBlock *bb) { TexInstruction *tex; Instruction *next; - int s, n, size; + int s, size; + + targ = bb->getProgram()->getTarget(); for (Instruction *i = bb->getEntry(); i; i = next) { next = i->next; if ((tex = i->asTex())) { - textureMask(tex); - - // FIXME: this is target specific - if (tex->op == OP_TXQ) { - s = tex->srcCount(0xff); - n = 0; - } else { - s = tex->tex.target.getArgCount(); - if (!tex->tex.target.isArray() && - (tex->tex.rIndirectSrc >= 0 || tex->tex.sIndirectSrc >= 0)) - ++s; - if (tex->op == OP_TXD && tex->tex.useOffsets) - ++s; - n = tex->srcCount(0xff) - s; - assert(n <= 4); + switch (targ->getChipset() & ~0xf) { + case 0x50: + case 0x80: + case 0x90: + case 0xa0: + texConstraintNV50(tex); + break; + case 0xc0: + case 0xd0: + texConstraintNVC0(tex); + break; + case 0xe0: + texConstraintNVE0(tex); + break; + default: + break; } - - if (s > 1) - addConstraint(i, 0, s); - if (n > 1) - addConstraint(i, s, n); } else if (i->op == OP_EXPORT || i->op == OP_STORE) { for (size = typeSizeof(i->dType), s = 1; size > 0; ++s) { assert(i->srcExists(s)); size -= i->getSrc(s)->reg.size; } - if ((s - 1) > 1) - addConstraint(i, 1, s - 1); + condenseSrcs(i, 1, s - 1); } else - if (i->op == OP_LOAD) { + if (i->op == OP_LOAD || i->op == OP_VFETCH) { + condenseDefs(i); if (i->src(0).isIndirect(0) && typeSizeof(i->dType) >= 8) addHazard(i, i->src(0).getIndirect(0)); + } else + if (i->op == OP_UNION) { + constrList.push_back(i); } } return true; @@ -1071,21 +1939,65 @@ RegAlloc::InsertConstraintsPass::visit(BasicBlock *bb) bool RegAlloc::InsertConstraintsPass::insertConstraintMoves() { - for (DLList::Iterator it = constrList.iterator(); !it.end(); it.next()) { - Instruction *cst = reinterpret_cast<Instruction *>(it.get()); + for (std::list<Instruction *>::iterator it = constrList.begin(); + it != constrList.end(); + ++it) { + Instruction *cst = *it; + Instruction *mov; + + if (cst->op == OP_SPLIT && 0) { + // spilling splits is annoying, just make sure they're separate + for (int d = 0; cst->defExists(d); ++d) { + if (!cst->getDef(d)->refCount()) + continue; + LValue *lval = new_LValue(func, cst->def(d).getFile()); + const uint8_t size = cst->def(d).getSize(); + lval->reg.size = size; + + mov = new_Instruction(func, OP_MOV, typeOfSize(size)); + mov->setSrc(0, lval); + mov->setDef(0, cst->getDef(d)); + cst->setDef(d, mov->getSrc(0)); + cst->bb->insertAfter(cst, mov); + + cst->getSrc(0)->asLValue()->noSpill = 1; + mov->getSrc(0)->asLValue()->noSpill = 1; + } + } else + if (cst->op == OP_MERGE || cst->op == OP_UNION) { + for (int s = 0; cst->srcExists(s); ++s) { + const uint8_t size = cst->src(s).getSize(); + + if (!cst->getSrc(s)->defs.size()) { + mov = new_Instruction(func, OP_NOP, typeOfSize(size)); + mov->setDef(0, cst->getSrc(s)); + cst->bb->insertBefore(cst, mov); + continue; + } + assert(cst->getSrc(s)->defs.size() == 1); // still SSA + + Instruction *defi = cst->getSrc(s)->defs.front()->getInsn(); + // catch some cases where don't really need MOVs + if (cst->getSrc(s)->refCount() == 1 && !defi->constrainedDefs()) + continue; + + LValue *lval = new_LValue(func, cst->src(s).getFile()); + lval->reg.size = size; - for (int s = 0; cst->srcExists(s); ++s) { - if (!detectConflict(cst, s)) - continue; - Instruction *mov = new_Instruction(func, OP_MOV, - typeOfSize(cst->src(s).getSize())); - mov->setSrc(0, cst->getSrc(s)); - mov->setDef(0, new_LValue(func, FILE_GPR)); - cst->setSrc(s, mov->getDef(0)); + mov = new_Instruction(func, OP_MOV, typeOfSize(size)); + mov->setDef(0, lval); + mov->setSrc(0, cst->getSrc(s)); + cst->setSrc(s, mov->getDef(0)); + cst->bb->insertBefore(cst, mov); - cst->bb->insertBefore(cst, mov); + cst->getDef(0)->asLValue()->noSpill = 1; // doesn't help + + if (cst->op == OP_UNION) + mov->setPredicate(defi->cc, defi->getPredicate()); + } } } + return true; } diff --git a/src/gallium/drivers/nv50/codegen/nv50_ir_util.cpp b/src/gallium/drivers/nv50/codegen/nv50_ir_util.cpp index 4eaaa9602c1..f7812a1d706 100644 --- a/src/gallium/drivers/nv50/codegen/nv50_ir_util.cpp +++ b/src/gallium/drivers/nv50/codegen/nv50_ir_util.cpp @@ -88,6 +88,11 @@ Stack::moveTo(Stack& that) this->size = 0; } +Interval::Interval(const Interval& that) : head(NULL), tail(NULL) +{ + this->insert(that); +} + Interval::~Interval() { clear(); @@ -148,7 +153,7 @@ Interval::extend(int a, int b) return true; } -bool Interval::contains(int pos) +bool Interval::contains(int pos) const { for (Range *r = head; r && r->bgn <= pos; r = r->next) if (r->end > pos) @@ -156,16 +161,37 @@ bool Interval::contains(int pos) return false; } -bool Interval::overlaps(const Interval &iv) const +bool Interval::overlaps(const Interval &that) const { +#if 1 + Range *a = this->head; + Range *b = that.head; + + while (a && b) { + if (b->bgn < a->end && + b->end > a->bgn) + return true; + if (a->end <= b->bgn) + a = a->next; + else + b = b->next; + } +#else for (Range *rA = this->head; rA; rA = rA->next) for (Range *rB = iv.head; rB; rB = rB->next) if (rB->bgn < rA->end && rB->end > rA->bgn) return true; +#endif return false; } +void Interval::insert(const Interval &that) +{ + for (Range *r = that.head; r; r = r->next) + this->extend(r->bgn, r->end); +} + void Interval::unify(Interval &that) { assert(this != &that); @@ -177,6 +203,14 @@ void Interval::unify(Interval &that) that.head = NULL; } +int Interval::length() const +{ + int len = 0; + for (Range *r = head; r; r = r->next) + len += r->bgn - r->end; + return len; +} + void Interval::print() const { if (!head) @@ -205,6 +239,27 @@ BitSet& BitSet::operator|=(const BitSet &set) return *this; } +bool BitSet::resize(unsigned int nBits) +{ + if (!data || !nBits) + return allocate(nBits, true); + const unsigned int p = (size + 31) / 32; + const unsigned int n = (nBits + 31) / 32; + if (n == p) + return true; + + data = (uint32_t *)REALLOC(data, 4 * p, 4 * n); + if (!data) { + size = 0; + return false; + } + if (n > p) + memset(&data[4 * p + 4], 0, (n - p) * 4); + + size = nBits; + return true; +} + bool BitSet::allocate(unsigned int nBits, bool zero) { if (data && size < nBits) { @@ -254,6 +309,65 @@ void BitSet::setOr(BitSet *pA, BitSet *pB) } } +int BitSet::findFreeRange(unsigned int count) const +{ + const uint32_t m = (1 << count) - 1; + int pos = size; + unsigned int i; + const unsigned int end = (size + 31) / 32; + + if (count == 1) { + for (i = 0; i < end; ++i) { + pos = ffs(~data[i]) - 1; + if (pos >= 0) + break; + } + } else + if (count == 2) { + for (i = 0; i < end; ++i) { + if (data[i] != 0xffffffff) { + uint32_t b = data[i] | (data[i] >> 1) | 0xaaaaaaaa; + pos = ffs(~b) - 1; + if (pos >= 0) + break; + } + } + } else + if (count == 4 || count == 3) { + for (i = 0; i < end; ++i) { + if (data[i] != 0xffffffff) { + uint32_t b = + (data[i] >> 0) | (data[i] >> 1) | + (data[i] >> 2) | (data[i] >> 3) | 0xeeeeeeee; + pos = ffs(~b) - 1; + if (pos >= 0) + break; + } + } + } else { + if (count <= 8) + count = 8; + else + if (count <= 16) + count = 16; + else + count = 32; + + for (i = 0; i < end; ++i) { + if (data[i] != 0xffffffff) { + for (pos = 0; pos < 32; pos += count) + if (!(data[i] & (m << pos))) + break; + if (pos < 32) + break; + } + } + } + pos += i * 32; + + return ((pos + count) <= size) ? pos : -1; +} + void BitSet::print() const { unsigned int n = 0; diff --git a/src/gallium/drivers/nv50/codegen/nv50_ir_util.h b/src/gallium/drivers/nv50/codegen/nv50_ir_util.h index 830320721d1..f348f1811b8 100644 --- a/src/gallium/drivers/nv50/codegen/nv50_ir_util.h +++ b/src/gallium/drivers/nv50/codegen/nv50_ir_util.h @@ -137,6 +137,8 @@ public: (__listA)->prev = prevB; \ } while(0) +#define DLLIST_EMPTY(__list) ((__list)->next == (__list)) + #define DLLIST_FOR_EACH(list, it) \ for (DLList::Iterator (it) = (list)->iterator(); !(it).end(); (it).next()) @@ -416,17 +418,22 @@ class Interval { public: Interval() : head(0), tail(0) { } + Interval(const Interval&); ~Interval(); bool extend(int, int); + void insert(const Interval&); void unify(Interval&); // clears source interval void clear(); - inline int begin() { return head ? head->bgn : -1; } - inline int end() { checkTail(); return tail ? tail->end : -1; } + inline int begin() const { return head ? head->bgn : -1; } + inline int end() const { checkTail(); return tail ? tail->end : -1; } inline bool isEmpty() const { return !head; } bool overlaps(const Interval&) const; - bool contains(int pos); + bool contains(int pos) const; + + inline int extent() const { return end() - begin(); } + int length() const; void print() const; @@ -477,6 +484,7 @@ public: } bool allocate(unsigned int nBits, bool zero); + bool resize(unsigned int nBits); // keep old data, zero additional bits inline unsigned int getSize() const { return size; } @@ -489,18 +497,44 @@ public: assert(i < size); data[i / 32] |= 1 << (i % 32); } + // NOTE: range may not cross 32 bit boundary (implies n <= 32) + inline void setRange(unsigned int i, unsigned int n) + { + assert((i + n) <= size && (((i % 32) + n) <= 32)); + data[i / 32] |= ((1 << n) - 1) << (i % 32); + } + inline void setMask(unsigned int i, uint32_t m) + { + assert(i < size); + data[i / 32] |= m; + } inline void clr(unsigned int i) { assert(i < size); data[i / 32] &= ~(1 << (i % 32)); } + // NOTE: range may not cross 32 bit boundary (implies n <= 32) + inline void clrRange(unsigned int i, unsigned int n) + { + assert((i + n) <= size && (((i % 32) + n) <= 32)); + data[i / 32] &= ~(((1 << n) - 1) << (i % 32)); + } inline bool test(unsigned int i) const { assert(i < size); return data[i / 32] & (1 << (i % 32)); } + // NOTE: range may not cross 32 bit boundary (implies n <= 32) + inline bool testRange(unsigned int i, unsigned int n) + { + assert((i + n) <= size && (((i % 32) + n) <= 32)); + return data[i / 32] & (((1 << n) - 1) << (i % 32)); + } + + // Find a range of size (<= 32) clear bits aligned to roundup_pow2(size). + int findFreeRange(unsigned int size) const; BitSet& operator|=(const BitSet&); @@ -514,6 +548,13 @@ public: void andNot(const BitSet&); + // bits = (bits | setMask) & ~clrMask + inline void periodicMask32(uint32_t setMask, uint32_t clrMask) + { + for (unsigned int i = 0; i < (size + 31) / 32; ++i) + data[i] = (data[i] | setMask) & ~clrMask; + } + unsigned int popCount() const; void print() const; diff --git a/src/gallium/drivers/nvc0/codegen/nv50_ir_emit_nvc0.cpp b/src/gallium/drivers/nvc0/codegen/nv50_ir_emit_nvc0.cpp index 9c4108c6666..31769b2e4e2 100644 --- a/src/gallium/drivers/nvc0/codegen/nv50_ir_emit_nvc0.cpp +++ b/src/gallium/drivers/nvc0/codegen/nv50_ir_emit_nvc0.cpp @@ -1009,9 +1009,7 @@ CodeEmitterNVC0::emitTEX(const TexInstruction *i) if (i->tex.target.isShadow()) code[1] |= 1 << 24; - int src1 = i->tex.target.getArgCount(); - if (i->op == OP_TXD && i->tex.useOffsets) - ++src1; + const int src1 = MAX2(i->predSrc + 1, 1); // if predSrc == 1, no 2nd src if (i->srcExists(src1) && i->src(src1).getFile() == FILE_IMMEDIATE) { // lzero @@ -1184,7 +1182,7 @@ CodeEmitterNVC0::emitVFETCH(const Instruction *i) emitPredicate(i); - code[0] |= (i->defCount(0xf) - 1) << 5; + code[0] |= ((i->getDef(0)->reg.size / 4) - 1) << 5; defId(i->def(0), 14); srcId(i->src(0).getIndirect(0), 20); |