nvc0/ir: implement better placement of texture barriers

Put them before first uses instead of right after the texturing instruction and cull unnecessary barriers.
author: Christoph Bumiller <[email protected]> 2012-04-29 17:56:57 +0200
committer: Christoph Bumiller <[email protected]> 2012-04-29 17:56:57 +0200
commit: 00fe442253744c4c4e7e68da44d6983da053968b (patch)
tree: a2220fceb8ffa22edd97b4d725b4e3ee231a19d0 /src/gallium
parent: 163b290f886c69a233c71799613eb74fb2668085 (diff)
8 files changed, 327 insertions, 13 deletions
diff --git a/src/gallium/drivers/nv50/codegen/nv50_ir.h b/src/gallium/drivers/nv50/codegen/nv50_ir.h
index c299cab3f52..da9042066ad 100644
--- a/src/gallium/drivers/nv50/codegen/nv50_ir.h
+++ b/src/gallium/drivers/nv50/codegen/nv50_ir.h
@@ -135,6 +135,8 @@ enum operation
    OP_LAST
 };
 
+// various instruction-specific modifier definitions Instruction::subOp
+// MOV_FINAL marks a MOV originating from an EXPORT (used for placing TEXBARs)
 #define NV50_IR_SUBOP_MUL_HIGH     1
 #define NV50_IR_SUBOP_EMIT_RESTART 1
 #define NV50_IR_SUBOP_LDC_IL       1
@@ -143,6 +145,7 @@ enum operation
 #define NV50_IR_SUBOP_SHIFT_WRAP   1
 #define NV50_IR_SUBOP_EMU_PRERET   1
 #define NV50_IR_SUBOP_TEXBAR(n)    n
+#define NV50_IR_SUBOP_MOV_FINAL    1
 
 enum DataType
 {
diff --git a/src/gallium/drivers/nv50/codegen/nv50_ir_graph.cpp b/src/gallium/drivers/nv50/codegen/nv50_ir_graph.cpp
index 90147b6bfb8..f1bff973636 100644
--- a/src/gallium/drivers/nv50/codegen/nv50_ir_graph.cpp
+++ b/src/gallium/drivers/nv50/codegen/nv50_ir_graph.cpp
@@ -21,6 +21,9 @@
  */
 
 #include "nv50_ir_graph.h"
+#include <limits>
+#include <list>
+#include "nv50_ir.h"
 
 namespace nv50_ir {
 
@@ -228,8 +231,7 @@ public:
    virtual bool end() const { return pos >= count; }
    virtual void next() { if (pos < count) ++pos; }
    virtual void *get() const { return nodes[pos]; }
-
-   void reset() { pos = 0; }
+   virtual void reset() { pos = 0; }
 
 protected:
    Graph::Node **nodes;
@@ -274,6 +276,7 @@ public:
    virtual void *get() const { return nodes[pos]; }
    virtual bool end() const { return pos >= count; }
    virtual void next() { if (pos < count) ++pos; }
+   virtual void reset() { pos = 0; }
 
 private:
    void search(Graph::Node *node, const int sequence)
@@ -389,4 +392,43 @@ void Graph::classifyDFS(Node *curr, int& seq)
    curr->tag = 0;
 }
 
+// @dist is indexed by Node::tag, returns -1 if no path found
+int
+Graph::findLightestPathWeight(Node *a, Node *b, const std::vector<int> &weight)
+{
+   std::vector<int> path(weight.size(), std::numeric_limits<int>::max());
+   std::list<Node *> nodeList;
+   const int seq = nextSequence();
+
+   path[a->tag] = 0;
+   for (Node *c = a; c && c != b;) {
+      const int p = path[c->tag] + weight[c->tag];
+      for (EdgeIterator ei = c->outgoing(); !ei.end(); ei.next()) {
+         Node *t = ei.getNode();
+         if (t->getSequence() < seq) {
+            if (path[t->tag] == std::numeric_limits<int>::max())
+               nodeList.push_front(t);
+            if (p < path[t->tag])
+               path[t->tag] = p;
+         }
+      }
+      c->visit(seq);
+      Node *next = NULL;
+      for (std::list<Node *>::iterator n = nodeList.begin();
+           n != nodeList.end(); ++n) {
+         if (!next || path[(*n)->tag] < path[next->tag])
+            next = *n;
+         if ((*n) == c) {
+            // erase visited
+            n = nodeList.erase(n);
+            --n;
+         }
+      }
+      c = next;
+   }
+   if (path[b->tag] == std::numeric_limits<int>::max())
+      return -1;
+   return path[b->tag];
+}
+
 } // namespace nv50_ir
diff --git a/src/gallium/drivers/nv50/codegen/nv50_ir_graph.h b/src/gallium/drivers/nv50/codegen/nv50_ir_graph.h
index cfa73c3dd1a..9ef317f943c 100644
--- a/src/gallium/drivers/nv50/codegen/nv50_ir_graph.h
+++ b/src/gallium/drivers/nv50/codegen/nv50_ir_graph.h
@@ -24,6 +24,7 @@
 #define __NV50_IR_GRAPH_H__
 
 #include "nv50_ir_util.h"
+#include <vector>
 
 namespace nv50_ir {
 
@@ -165,6 +166,9 @@ public:
 
    void classifyEdges();
 
+   // @weights: indexed by Node::tag
+   int findLightestPathWeight(Node *, Node *, const std::vector<int>& weights);
+
 private:
    void classifyDFS(Node *, int&);
 
diff --git a/src/gallium/drivers/nv50/codegen/nv50_ir_lowering_nv50.cpp b/src/gallium/drivers/nv50/codegen/nv50_ir_lowering_nv50.cpp
index 30d8acee3bc..27373b4cc47 100644
--- a/src/gallium/drivers/nv50/codegen/nv50_ir_lowering_nv50.cpp
+++ b/src/gallium/drivers/nv50/codegen/nv50_ir_lowering_nv50.cpp
@@ -998,6 +998,7 @@ NV50LoweringPreSSA::handleEXPORT(Instruction *i)
          int id = i->getSrc(0)->reg.data.offset / 4; // in 32 bit reg units
 
          i->op = OP_MOV;
+         i->subOp = NV50_IR_SUBOP_MOV_FINAL;
          i->src(0).set(i->src(1));
          i->setSrc(1, NULL);
          i->setDef(0, new_LValue(func, FILE_GPR));
diff --git a/src/gallium/drivers/nv50/codegen/nv50_ir_peephole.cpp b/src/gallium/drivers/nv50/codegen/nv50_ir_peephole.cpp
index 7d0ebc69710..5bc3a450779 100644
--- a/src/gallium/drivers/nv50/codegen/nv50_ir_peephole.cpp
+++ b/src/gallium/drivers/nv50/codegen/nv50_ir_peephole.cpp
@@ -33,7 +33,7 @@ namespace nv50_ir {
 bool
 Instruction::isNop() const
 {
-   if (op == OP_CONSTRAINT || op == OP_PHI)
+   if (op == OP_PHI || op == OP_SPLIT || op == OP_MERGE || op == OP_CONSTRAINT)
       return true;
    if (terminator || join) // XXX: should terminator imply flow ?
       return false;
@@ -1855,7 +1855,10 @@ FlatteningPass::visit(BasicBlock *bb)
    Instruction *insn = bb->getExit();
    if (insn && insn->op == OP_JOIN && !insn->getPredicate()) {
       insn = insn->prev;
-      if (insn && !insn->getPredicate() && !insn->asFlow() && !insn->isNop()) {
+      if (insn && !insn->getPredicate() &&
+          !insn->asFlow() &&
+          insn->op != OP_TEXBAR &&
+          !insn->isNop()) {
          insn->join = 1;
          bb->remove(bb->getExit());
          return true;
diff --git a/src/gallium/drivers/nv50/codegen/nv50_ir_ra.cpp b/src/gallium/drivers/nv50/codegen/nv50_ir_ra.cpp
index 5d2aa587a31..77edaa6067a 100644
--- a/src/gallium/drivers/nv50/codegen/nv50_ir_ra.cpp
+++ b/src/gallium/drivers/nv50/codegen/nv50_ir_ra.cpp
@@ -1615,7 +1615,6 @@ GCRA::resolveSplitsAndMerges()
          v->join = v;
          reg += v->reg.size;
       }
-      delete_Instruction(prog, split);
    }
    splits.clear();
 
@@ -1630,7 +1629,6 @@ GCRA::resolveSplitsAndMerges()
          v->join = v;
          reg += v->reg.size;
       }
-      delete_Instruction(prog, merge);
    }
    merges.clear();
 }
diff --git a/src/gallium/drivers/nv50/codegen/nv50_ir_util.h b/src/gallium/drivers/nv50/codegen/nv50_ir_util.h
index f348f1811b8..a0020789c11 100644
--- a/src/gallium/drivers/nv50/codegen/nv50_ir_util.h
+++ b/src/gallium/drivers/nv50/codegen/nv50_ir_util.h
@@ -91,6 +91,7 @@ public:
    virtual void next() = 0;
    virtual void *get() const = 0;
    virtual bool end() const = 0; // if true, get will return 0
+   virtual void reset() { assert(0); } // only for graph iterators
 };
 
 typedef std::auto_ptr<Iterator> IteratorRef;
diff --git a/src/gallium/drivers/nvc0/codegen/nv50_ir_lowering_nvc0.cpp b/src/gallium/drivers/nvc0/codegen/nv50_ir_lowering_nvc0.cpp
index 318d345efdb..02ae9fd5d0e 100644
--- a/src/gallium/drivers/nvc0/codegen/nv50_ir_lowering_nvc0.cpp
+++ b/src/gallium/drivers/nvc0/codegen/nv50_ir_lowering_nvc0.cpp
@@ -25,6 +25,8 @@
 
 #include "nv50_ir_target_nvc0.h"
 
+#include <limits>
+
 namespace nv50_ir {
 
 #define QOP_ADD  0
@@ -129,8 +131,26 @@ private:
    bool tryReplaceContWithBra(BasicBlock *);
    void propagateJoin(BasicBlock *);
 
-   LValue *r63;
+   struct TexUse
+   {
+      TexUse(Instruction *use, const Instruction *tex)
+         : insn(use), tex(tex), level(-1) { }
+      Instruction *insn;
+      const Instruction *tex; // or split / mov
+      int level;
+   };
+   struct Limits
+   {
+      Limits() { }
+      Limits(int min, int max) : min(min), max(max) { }
+      int min, max;
+   };
+   bool insertTextureBarriers(Function *);
+   inline bool insnDominatedBy(const Instruction *, const Instruction *) const;
+   void findFirstUses(const Instruction *, std::list<TexUse>&);
 
+private:
+   LValue *r63;
    const bool needTexBar;
 };
 
@@ -140,8 +160,255 @@ NVC0LegalizePostRA::NVC0LegalizePostRA(const Program *prog)
 }
 
 bool
+NVC0LegalizePostRA::insnDominatedBy(const Instruction *later,
+                                    const Instruction *early) const
+{
+   if (early->bb == later->bb)
+      return early->serial < later->serial;
+   return later->bb->dominatedBy(early->bb);
+}
+
+void
+NVC0LegalizePostRA::findFirstUses(const Instruction *insn,
+                                  std::list<TexUse> &uses)
+{
+   for (int d = 0; insn->defExists(d); ++d) {
+      Value *v = insn->getDef(d);
+      for (Value::UseIterator u = v->uses.begin(); u != v->uses.end(); ++u) {
+         Instruction *usei = (*u)->getInsn();
+         if (usei->op == OP_SPLIT ||
+             usei->op == OP_PHI ||
+             usei->op == OP_UNION) {
+            // these uses don't manifest in the machine code
+            findFirstUses(usei, uses);
+         } else
+         if (usei->op == OP_MOV && usei->getDef(0)->equals(usei->getSrc(0)) &&
+             usei->subOp != NV50_IR_SUBOP_MOV_FINAL) {
+            findFirstUses(usei, uses);
+         } else {
+            bool add = true;
+            for (std::list<TexUse>::iterator it = uses.begin();
+                 it != uses.end();) {
+               if (insnDominatedBy(usei, it->insn)) {
+                  add = false;
+                  break;
+               }
+               if (insnDominatedBy(it->insn, usei))
+                  it = uses.erase(it);
+               else
+                  ++it;
+            }
+            if (add)
+               uses.push_back(TexUse(usei, insn));
+         }
+      }
+   }
+}
+
+// Texture barriers:
+// This pass is a bit long and ugly and can probably be optimized.
+//
+// 1. obtain a list of TEXes and their outputs' first use(s)
+// 2. calculate the barrier level of each first use (minimal number of TEXes,
+//    over all paths, between the TEX and the use in question)
+// 3. for each barrier, if all paths from the source TEX to that barrier
+//    contain a barrier of lesser level, it can be culled
+bool
+NVC0LegalizePostRA::insertTextureBarriers(Function *fn)
+{
+   std::list<TexUse> *uses;
+   std::vector<Instruction *> texes;
+   std::vector<int> bbFirstTex;
+   std::vector<int> bbFirstUse;
+   std::vector<int> texCounts;
+   std::vector<TexUse> useVec;
+   ArrayList insns;
+
+   fn->orderInstructions(insns);
+
+   texCounts.resize(fn->allBBlocks.getSize(), 0);
+   bbFirstTex.resize(fn->allBBlocks.getSize(), insns.getSize());
+   bbFirstUse.resize(fn->allBBlocks.getSize(), insns.getSize());
+
+   // tag BB CFG nodes by their id for later
+   for (ArrayList::Iterator i = fn->allBBlocks.iterator(); !i.end(); i.next()) {
+      BasicBlock *bb = reinterpret_cast<BasicBlock *>(i.get());
+      if (bb)
+         bb->cfg.tag = bb->getId();
+   }
+
+   // gather the first uses for each TEX
+   for (int i = 0; i < insns.getSize(); ++i) {
+      Instruction *tex = reinterpret_cast<Instruction *>(insns.get(i));
+      if (isTextureOp(tex->op)) {
+         texes.push_back(tex);
+         if (!texCounts.at(tex->bb->getId()))
+            bbFirstTex[tex->bb->getId()] = texes.size() - 1;
+         texCounts[tex->bb->getId()]++;
+      }
+   }
+   insns.clear();
+   if (texes.empty())
+      return false;
+   uses = new std::list<TexUse>[texes.size()];
+   if (!uses)
+      return false;
+   for (size_t i = 0; i < texes.size(); ++i)
+      findFirstUses(texes[i], uses[i]);
+
+   // determine the barrier level at each use
+   for (size_t i = 0; i < texes.size(); ++i) {
+      for (std::list<TexUse>::iterator u = uses[i].begin(); u != uses[i].end();
+           ++u) {
+         BasicBlock *tb = texes[i]->bb;
+         BasicBlock *ub = u->insn->bb;
+         if (tb == ub) {
+            u->level = 0;
+            for (size_t j = i + 1; j < texes.size() &&
+                    texes[j]->bb == tb && texes[j]->serial < u->insn->serial;
+                 ++j)
+               u->level++;
+         } else {
+            u->level = fn->cfg.findLightestPathWeight(&tb->cfg,
+                                                      &ub->cfg, texCounts);
+            if (u->level < 0) {
+               WARN("Failed to find path TEX -> TEXBAR\n");
+               u->level = 0;
+               continue;
+            }
+            // this counted all TEXes in the origin block, correct that
+            u->level -= i - bbFirstTex.at(tb->getId()) + 1 /* this TEX */;
+            // and did not count the TEXes in the destination block, add those
+            for (size_t j = bbFirstTex.at(ub->getId()); j < texes.size() &&
+                    texes[j]->bb == ub && texes[j]->serial < u->insn->serial;
+                 ++j)
+               u->level++;
+         }
+         assert(u->level >= 0);
+         useVec.push_back(*u);
+      }
+   }
+   delete[] uses;
+   uses = NULL;
+
+   // insert the barriers
+   for (size_t i = 0; i < useVec.size(); ++i) {
+      Instruction *prev = useVec[i].insn->prev;
+      if (useVec[i].level < 0)
+         continue;
+      if (prev && prev->op == OP_TEXBAR) {
+         if (prev->subOp > useVec[i].level)
+            prev->subOp = useVec[i].level;
+         prev->setSrc(prev->srcCount(), useVec[i].tex->getDef(0));
+      } else {
+         Instruction *bar = new_Instruction(func, OP_TEXBAR, TYPE_NONE);
+         bar->fixed = 1;
+         bar->subOp = useVec[i].level;
+         // make use explicit to ease latency calculation
+         bar->setSrc(bar->srcCount(), useVec[i].tex->getDef(0));
+         useVec[i].insn->bb->insertBefore(useVec[i].insn, bar);
+      }
+   }
+
+   if (fn->getProgram()->optLevel < 3) {
+      if (uses)
+         delete[] uses;
+      return true;
+   }
+
+   std::vector<Limits> limitT, limitB, limitS; // entry, exit, single
+
+   limitT.resize(fn->allBBlocks.getSize(), Limits(0, 0));
+   limitB.resize(fn->allBBlocks.getSize(), Limits(0, 0));
+   limitS.resize(fn->allBBlocks.getSize());
+
+   // cull unneeded barriers (should do that earlier, but for simplicity)
+   IteratorRef bi = fn->cfg.iteratorDFS(true);
+   // first calculate min/max outstanding TEXes for each BB
+   for (bi->reset(); !bi->end(); bi->next()) {
+      Graph::Node *n = reinterpret_cast<Graph::Node *>(bi->get());
+      BasicBlock *bb = BasicBlock::get(n);
+      int min = 0;
+      int max = std::numeric_limits<int>::max();
+      for (Instruction *i = bb->getFirst(); i; i = i->next) {
+         if (isTextureOp(i->op)) {
+            min++;
+            if (max < std::numeric_limits<int>::max())
+               max++;
+         } else
+         if (i->op == OP_TEXBAR) {
+            min = MIN2(min, i->subOp);
+            max = MIN2(max, i->subOp);
+         }
+      }
+      // limits when looking at an isolated block
+      limitS[bb->getId()].min = min;
+      limitS[bb->getId()].max = max;
+   }
+   // propagate the min/max values
+   for (unsigned int l = 0; l <= fn->loopNestingBound; ++l) {
+      for (bi->reset(); !bi->end(); bi->next()) {
+         Graph::Node *n = reinterpret_cast<Graph::Node *>(bi->get());
+         BasicBlock *bb = BasicBlock::get(n);
+         const int bbId = bb->getId();
+         for (Graph::EdgeIterator ei = n->incident(); !ei.end(); ei.next()) {
+            BasicBlock *in = BasicBlock::get(ei.getNode());
+            const int inId = in->getId();
+            limitT[bbId].min = MAX2(limitT[bbId].min, limitB[inId].min);
+            limitT[bbId].max = MAX2(limitT[bbId].max, limitB[inId].max);
+         }
+         // I just hope this is correct ...
+         if (limitS[bbId].max == std::numeric_limits<int>::max()) {
+            // no barrier
+            limitB[bbId].min = limitT[bbId].min + limitS[bbId].min;
+            limitB[bbId].max = limitT[bbId].max + limitS[bbId].min;
+         } else {
+            // block contained a barrier
+            limitB[bbId].min = MIN2(limitS[bbId].max,
+                                    limitT[bbId].min + limitS[bbId].min);
+            limitB[bbId].max = MIN2(limitS[bbId].max,
+                                    limitT[bbId].max + limitS[bbId].min);
+         }
+      }
+   }
+   // finally delete unnecessary barriers
+   for (bi->reset(); !bi->end(); bi->next()) {
+      Graph::Node *n = reinterpret_cast<Graph::Node *>(bi->get());
+      BasicBlock *bb = BasicBlock::get(n);
+      Instruction *prev = NULL;
+      Instruction *next;
+      int max = limitT[bb->getId()].max;
+      for (Instruction *i = bb->getFirst(); i; i = next) {
+         next = i->next;
+         if (i->op == OP_TEXBAR) {
+            if (i->subOp >= max) {
+               delete_Instruction(prog, i);
+            } else {
+               max = i->subOp;
+               if (prev && prev->op == OP_TEXBAR && prev->subOp >= max) {
+                  delete_Instruction(prog, prev);
+                  prev = NULL;
+               }
+            }
+         } else
+         if (isTextureOp(i->op)) {
+            max++;
+         }
+         if (!i->isNop())
+            prev = i;
+      }
+   }
+   if (uses)
+      delete[] uses;
+   return true;
+}
+
+bool
 NVC0LegalizePostRA::visit(Function *fn)
 {
+   if (needTexBar)
+      insertTextureBarriers(fn);
+
    r63 = new_LValue(fn, FILE_GPR);
    r63->reg.data.id = 63;
    return true;
@@ -235,12 +502,6 @@ NVC0LegalizePostRA::visit(BasicBlock *bb)
       } else
       if (i->isNop()) {
          bb->remove(i);
-      } else
-      if (needTexBar && isTextureOp(i->op)) {
-         Instruction *bar = new_Instruction(func, OP_TEXBAR, TYPE_NONE);
-         bar->fixed = 1;
-         bar->subOp = 0;
-         bb->insertAfter(i, bar);
       } else {
          if (i->op != OP_MOV && i->op != OP_PFETCH)
             replaceZero(i);
@@ -673,6 +934,7 @@ NVC0LoweringPass::handleEXPORT(Instruction *i)
       if (i->src(0).isIndirect(0)) // TODO, ugly
          return false;
       i->op = OP_MOV;
+      i->subOp = NV50_IR_SUBOP_MOV_FINAL;
       i->src(0).set(i->src(1));
       i->setSrc(1, NULL);
       i->setDef(0, new_LValue(func, FILE_GPR));
author	Christoph Bumiller <[email protected]>	2012-04-29 17:56:57 +0200
committer	Christoph Bumiller <[email protected]>	2012-04-29 17:56:57 +0200
commit	00fe442253744c4c4e7e68da44d6983da053968b (patch)
tree	a2220fceb8ffa22edd97b4d725b4e3ee231a19d0 /src/gallium
parent	163b290f886c69a233c71799613eb74fb2668085 (diff)