From 7aa76d66a1f5edad9e8c1d54aafdce99ffa6c345 Mon Sep 17 00:00:00 2001
From: Francisco Jerez <currojerez@riseup.net>
Date: Fri, 27 May 2016 00:45:04 -0700
Subject: i965/fs: Skip SIMD lowering destination zipping if possible.

Skipping the temporary allocation and copy instructions is easy (just
return dst), but the conditions used to find out whether the copy can
be optimized out safely without breaking the program are rather
complex: The destination must be exactly one component of at most the
execution width of the lowered instruction, and all source regions of
the instruction must be either fully disjoint from the destination or
be aligned with it group by group.

v2: Don't handle partial source-destination overlap for simplicity
    (Jason).  No instruction count regressions with respect to v1 in
    either shader-db or the few FP64 shader_runner test-cases with
    partial overlap I've checked manually.

Cc: mesa-stable@lists.freedesktop.org
Reviewed-by: Jason Ekstrand <jason@jlekstrand.net>
---
 src/mesa/drivers/dri/i965/brw_fs.cpp | 55 ++++++++++++++++++++++++++++++++++++
 1 file changed, 55 insertions(+)

(limited to 'src/mesa/drivers')

diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp
index 9abe73acef2..f8b0059abe9 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
@@ -5109,6 +5109,52 @@ emit_unzip(const fs_builder &lbld, bblock_t *block, fs_inst *inst,
    }
 }
 
+/**
+ * Return true if splitting out the group of channels of instruction \p inst
+ * given by lbld.group() requires allocating a temporary for the destination
+ * of the lowered instruction and copying the data back to the original
+ * destination region.
+ */
+static inline bool
+needs_dst_copy(const fs_builder &lbld, const fs_inst *inst)
+{
+   /* If the instruction writes more than one component we'll have to shuffle
+    * the results of multiple lowered instructions in order to make sure that
+    * they end up arranged correctly in the original destination region.
+    */
+   if (inst->regs_written * REG_SIZE >
+       inst->dst.component_size(inst->exec_size))
+      return true;
+
+   /* If the lowered execution size is larger than the original the result of
+    * the instruction won't fit in the original destination, so we'll have to
+    * allocate a temporary in any case.
+    */
+   if (lbld.dispatch_width() > inst->exec_size)
+      return true;
+
+   for (unsigned i = 0; i < inst->sources; i++) {
+      /* If we already made a copy of the source for other reasons there won't
+       * be any overlap with the destination.
+       */
+      if (needs_src_copy(lbld, inst, i))
+         continue;
+
+      /* In order to keep the logic simple we emit a copy whenever the
+       * destination region doesn't exactly match an overlapping source, which
+       * may point at the source and destination not being aligned group by
+       * group which could cause one of the lowered instructions to overwrite
+       * the data read from the same source by other lowered instructions.
+       */
+      if (regions_overlap(inst->dst, inst->regs_written * REG_SIZE,
+                          inst->src[i], inst->regs_read(i) * REG_SIZE) &&
+          !inst->dst.equals(inst->src[i]))
+        return true;
+   }
+
+   return false;
+}
+
 /**
  * Insert data from a packed temporary into the channel group given by
  * lbld.group() of the destination region of instruction \p inst and return
@@ -5129,6 +5175,8 @@ emit_zip(const fs_builder &lbld, bblock_t *block, fs_inst *inst)
    const fs_reg dst = horiz_offset(inst->dst, lbld.group());
    const unsigned dst_size = inst->regs_written * REG_SIZE /
             inst->dst.component_size(inst->exec_size);
+
+   if (needs_dst_copy(lbld, inst)) {
    const fs_reg tmp = lbld.vgrf(inst->dst.type, dst_size);
 
    if (inst->predicate) {
@@ -5146,6 +5194,13 @@ emit_zip(const fs_builder &lbld, bblock_t *block, fs_inst *inst)
           .MOV(offset(dst, inst->exec_size, k), offset(tmp, lbld, k));
 
    return tmp;
+
+   } else {
+      /* No need to allocate a temporary for the lowered instruction, just
+       * take the right group of channels from the original region.
+       */
+      return dst;
+   }
 }
 
 bool
-- 
cgit v1.2.3