nv50/ir: fix DCE to not generate 96-bit loads

When the last component of a 128-bit load gets DCE'd, the result is a 96-bit load, which no GPU can actually emit. Avoid generating such instructions by scaling the first load back to 64 bits when splitting.

Signed-off-by: Ilia Mirkin <[email protected]>
Cc: "11.0 11.1" <[email protected]>
author: Ilia Mirkin <[email protected]> 2015-12-03 14:04:06 -0500
committer: Ilia Mirkin <[email protected]> 2015-12-03 23:02:57 -0500
commit: 49692f86a1b77fac4634d2a3f0502ec7451c3435 (patch)
tree: 1c8cae041cf81d90be35347778520a400d730e20
parent: 51140f452a8623c9b912126b027f0f1819e72531 (diff)
1 files changed, 31 insertions, 1 deletions
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
index 0f3caa8f07e..bb7f4911c21 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
@@ -2962,6 +2962,16 @@ DeadCodeElim::visit(BasicBlock *bb)
    return true;
 }
 
+// Each load can go into up to 4 destinations, any of which might potentially
+// be dead (i.e. a hole). These can always be split into 2 loads, independent
+// of where the holes are. We find the first contiguous region, put it into
+// the first load, and then put the second contiguous region into the second
+// load. There can be at most 2 contiguous regions.
+//
+// Note that there are some restrictions, for example it's not possible to do
+// a 64-bit load that's not 64-bit aligned, so such a load has to be split
+// up. Also hardware doesn't support 96-bit loads, so those also have to be
+// split into a 64-bit and 32-bit load.
 void
 DeadCodeElim::checkSplitLoad(Instruction *ld1)
 {
@@ -2982,6 +2992,8 @@ DeadCodeElim::checkSplitLoad(Instruction *ld1)
    addr1 = ld1->getSrc(0)->reg.data.offset;
    n1 = n2 = 0;
    size1 = size2 = 0;
+
+   // Compute address/width for first load
    for (d = 0; ld1->defExists(d); ++d) {
       if (mask & (1 << d)) {
          if (size1 && (addr1 & 0x7))
@@ -2995,16 +3007,34 @@ DeadCodeElim::checkSplitLoad(Instruction *ld1)
          break;
       }
    }
+
+   // Scale back the size of the first load until it can be loaded. This
+   // typically happens for TYPE_B96 loads.
+   while (n1 &&
+          !prog->getTarget()->isAccessSupported(ld1->getSrc(0)->reg.file,
+                                                typeOfSize(size1))) {
+      size1 -= def1[--n1]->reg.size;
+      d--;
+   }
+
+   // Compute address/width for second load
    for (addr2 = addr1 + size1; ld1->defExists(d); ++d) {
       if (mask & (1 << d)) {
+         assert(!size2 || !(addr2 & 0x7));
          def2[n2] = ld1->getDef(d);
          size2 += def2[n2++]->reg.size;
-      } else {
+      } else if (!n2) {
          assert(!n2);
          addr2 += ld1->getDef(d)->reg.size;
+      } else {
+         break;
       }
    }
 
+   // Make sure that we've processed all the values
+   for (; ld1->defExists(d); ++d)
+      assert(!(mask & (1 << d)));
+
    updateLdStOffset(ld1, addr1, func);
    ld1->setType(typeOfSize(size1));
    for (d = 0; d < 4; ++d)
author	Ilia Mirkin <[email protected]>	2015-12-03 14:04:06 -0500
committer	Ilia Mirkin <[email protected]>	2015-12-03 23:02:57 -0500
commit	49692f86a1b77fac4634d2a3f0502ec7451c3435 (patch)
tree	1c8cae041cf81d90be35347778520a400d730e20
parent	51140f452a8623c9b912126b027f0f1819e72531 (diff)