-rw-r--r--   src/intel/Makefile.sources                    |   1
-rw-r--r--   src/intel/compiler/brw_fs.cpp                 |   2
-rw-r--r--   src/intel/compiler/brw_fs.h                   |   1
-rw-r--r--   src/intel/compiler/brw_fs_bank_conflicts.cpp  | 893
-rw-r--r--   src/intel/compiler/meson.build                |   1
5 files changed, 898 insertions, 0 deletions
diff --git a/src/intel/Makefile.sources b/src/intel/Makefile.sources
index cdb10ece35e..1c62bad816e 100644
--- a/src/intel/Makefile.sources
+++ b/src/intel/Makefile.sources
@@ -48,6 +48,7 @@ COMPILER_FILES = \
 	compiler/brw_eu_util.c \
 	compiler/brw_eu_validate.c \
 	compiler/brw_fs_builder.h \
+	compiler/brw_fs_bank_conflicts.cpp \
 	compiler/brw_fs_cmod_propagation.cpp \
 	compiler/brw_fs_combine_constants.cpp \
 	compiler/brw_fs_copy_propagation.cpp \
diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp
index 93bb6b46732..c5d4f5634df 100644
--- a/src/intel/compiler/brw_fs.cpp
+++ b/src/intel/compiler/brw_fs.cpp
@@ -6065,6 +6065,8 @@ fs_visitor::allocate_registers(unsigned min_dispatch_width, bool allow_spilling)
    if (failed)
       return;
 
+   opt_bank_conflicts();
+
    schedule_instructions(SCHEDULE_POST);
 
    if (last_scratch > 0) {
diff --git a/src/intel/compiler/brw_fs.h b/src/intel/compiler/brw_fs.h
index 30557324d5a..0cec6fdcbad 100644
--- a/src/intel/compiler/brw_fs.h
+++ b/src/intel/compiler/brw_fs.h
@@ -145,6 +145,7 @@ public:
                                  exec_list *acp);
    bool opt_drop_redundant_mov_to_flags();
    bool opt_register_renaming();
+   bool opt_bank_conflicts();
    bool register_coalesce();
    bool compute_to_mrf();
    bool eliminate_find_live_channel();
diff --git a/src/intel/compiler/brw_fs_bank_conflicts.cpp b/src/intel/compiler/brw_fs_bank_conflicts.cpp
new file mode 100644
index 00000000000..b64a3d4a8a8
--- /dev/null
+++ b/src/intel/compiler/brw_fs_bank_conflicts.cpp
@@ -0,0 +1,893 @@
+/*
+ * Copyright © 2017 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+/** @file brw_fs_bank_conflicts.cpp
+ *
+ * This file contains a GRF bank conflict mitigation pass. The pass is
+ * intended to be run after register allocation and works by rearranging the
+ * layout of the GRF space (without altering the semantics of the program) in
+ * a way that minimizes the number of GRF bank conflicts incurred by ternary
+ * instructions.
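+ *
+ * For example, consider a hypothetical SIMD8 ternary instruction
+ * (regioning elided), schematically:
+ *
+ *    mad(8)   g10   g2   g4   g6
+ *
+ * Its second and third sources live in g4 and g6, which belong to the same
+ * GRF bank under the layout described in bank_of() below, so on the average
+ * the instruction pays a one-cycle penalty, while an otherwise identical
+ * instruction reading from g4 and g7 would not.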
+ *
+ * Unfortunately there is close to no information about bank conflicts in
+ * the hardware spec, but experimentally on Gen7-Gen9 ternary instructions
+ * seem to incur an average bank conflict penalty of one cycle per SIMD8 op
+ * whenever the second and third source are stored in the same GRF bank (\sa
+ * bank_of() for the exact bank layout) which cannot be fetched during the
+ * same cycle by the EU, unless the EU logic manages to optimize out the
+ * read cycle of a duplicate source register (\sa
+ * is_conflict_optimized_out()).
+ *
+ * The asymptotic run-time of the algorithm is dominated by the
+ * shader_conflict_weight_matrix() computation below, which is O(n) on the
+ * number of instructions in the program; however, for small and
+ * medium-sized programs the run-time is likely to be dominated by
+ * optimize_reg_permutation(), which is O(m^3) on the number of GRF atoms of
+ * the program (\sa partitioning), which is bounded (since the program uses
+ * a bounded number of registers post-regalloc) and of the order of 100.
+ * For that reason optimize_reg_permutation() is vectorized in order to keep
+ * the cubic term within reasonable bounds for m close to its theoretical
+ * maximum.
+ */
+
+#include "brw_fs.h"
+#include "brw_cfg.h"
+
+#ifdef __SSE2__
+
+#include <emmintrin.h>
+
+/**
+ * Thin layer around vector intrinsics so they can be easily replaced with
+ * e.g. the fall-back scalar path, an implementation with different vector
+ * width or using different SIMD architectures (AVX-512?!).
+ *
+ * This implementation operates on pairs of independent SSE2 integer vectors
+ * à la SIMD16 for somewhat improved throughput. SSE2 is supported by
+ * virtually all platforms that care about bank conflicts, so this path
+ * should almost always be available in practice.
+ */
+namespace {
+   /**
+    * SIMD integer vector data type.
+    */
+   struct vector_type {
+      __m128i v[2];
+   };
+
+   /**
+    * Scalar data type matching the representation of a single component of
+    * \p vector_type.
+    */
+   typedef int16_t scalar_type;
+
+   /**
+    * Maximum integer value representable as a \p scalar_type.
+    */
+   const scalar_type max_scalar = INT16_MAX;
+
+   /**
+    * Number of components of a \p vector_type.
+    */
+   const unsigned vector_width = 2 * sizeof(__m128i) / sizeof(scalar_type);
+
+   /**
+    * Set the i-th component of vector \p v to \p x.
+    */
+   void
+   set(vector_type &v, unsigned i, scalar_type x)
+   {
+      assert(i < vector_width);
+      memcpy((char *)v.v + i * sizeof(x), &x, sizeof(x));
+   }
+
+   /**
+    * Get the i-th component of vector \p v.
+    */
+   scalar_type
+   get(const vector_type &v, unsigned i)
+   {
+      assert(i < vector_width);
+      scalar_type x;
+      memcpy(&x, (char *)v.v + i * sizeof(x), sizeof(x));
+      return x;
+   }
+
+   /**
+    * Add two vectors with saturation.
+    */
+   vector_type
+   adds(const vector_type &v, const vector_type &w)
+   {
+      const vector_type u = {{
+         _mm_adds_epi16(v.v[0], w.v[0]),
+         _mm_adds_epi16(v.v[1], w.v[1])
+      }};
+      return u;
+   }
+
+   /**
+    * Subtract two vectors with saturation.
+    */
+   vector_type
+   subs(const vector_type &v, const vector_type &w)
+   {
+      const vector_type u = {{
+         _mm_subs_epi16(v.v[0], w.v[0]),
+         _mm_subs_epi16(v.v[1], w.v[1])
+      }};
+      return u;
+   }
+
+   /**
+    * Compute the bitwise conjunction of two vectors.
+    */
+   vector_type
+   mask(const vector_type &v, const vector_type &w)
+   {
+      const vector_type u = {{
+         _mm_and_si128(v.v[0], w.v[0]),
+         _mm_and_si128(v.v[1], w.v[1])
+      }};
+      return u;
+   }
+
+   /**
+    * Reduce the components of a vector using saturating addition.
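+    *
+    * The implementation below first adds the two halves of the SIMD16
+    * pair, then folds the result in half three times (64-bit, 32-bit and
+    * 16-bit swaps via the shuffle intrinsics), accumulating with
+    * saturation at each step, so the bottom 16-bit word ends up holding
+    * the saturated sum of all sixteen components.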
+    */
+   scalar_type
+   sums(const vector_type &v)
+   {
+      const __m128i v8 = _mm_adds_epi16(v.v[0], v.v[1]);
+      const __m128i v4 = _mm_adds_epi16(v8, _mm_shuffle_epi32(v8, 0x4e));
+      const __m128i v2 = _mm_adds_epi16(v4, _mm_shuffle_epi32(v4, 0xb1));
+      const __m128i v1 = _mm_adds_epi16(v2, _mm_shufflelo_epi16(v2, 0xb1));
+      return _mm_extract_epi16(v1, 0);
+   }
+}
+
+#else
+
+/**
+ * Thin layer around vector intrinsics so they can be easily replaced with
+ * e.g. the fall-back scalar path, an implementation with different vector
+ * width or using different SIMD architectures (AVX-512?!).
+ *
+ * This implementation operates on scalar values and doesn't rely on any
+ * vector extensions. This is mainly intended for debugging and to keep
+ * this file building on exotic platforms.
+ */
+namespace {
+   /**
+    * SIMD integer vector data type.
+    */
+   typedef int16_t vector_type;
+
+   /**
+    * Scalar data type matching the representation of a single component of
+    * \p vector_type.
+    */
+   typedef int16_t scalar_type;
+
+   /**
+    * Maximum integer value representable as a \p scalar_type.
+    */
+   const scalar_type max_scalar = INT16_MAX;
+
+   /**
+    * Number of components of a \p vector_type.
+    */
+   const unsigned vector_width = 1;
+
+   /**
+    * Set the i-th component of vector \p v to \p x.
+    */
+   void
+   set(vector_type &v, unsigned i, scalar_type x)
+   {
+      assert(i < vector_width);
+      v = x;
+   }
+
+   /**
+    * Get the i-th component of vector \p v.
+    */
+   scalar_type
+   get(const vector_type &v, unsigned i)
+   {
+      assert(i < vector_width);
+      return v;
+   }
+
+   /**
+    * Add two vectors with saturation.
+    */
+   vector_type
+   adds(vector_type v, vector_type w)
+   {
+      return MAX2(INT16_MIN, MIN2(INT16_MAX, int(v) + w));
+   }
+
+   /**
+    * Subtract two vectors with saturation.
+    */
+   vector_type
+   subs(vector_type v, vector_type w)
+   {
+      return MAX2(INT16_MIN, MIN2(INT16_MAX, int(v) - w));
+   }
+
+   /**
+    * Compute the bitwise conjunction of two vectors.
+    */
+   vector_type
+   mask(vector_type v, vector_type w)
+   {
+      return v & w;
+   }
+
+   /**
+    * Reduce the components of a vector using saturating addition.
+    */
+   scalar_type
+   sums(vector_type v)
+   {
+      return v;
+   }
+}
+
+#endif
+
+/**
+ * Swap \p x and \p y.
+ */
+#define SWAP(x, y) do {                 \
+      __typeof(y) _swap_tmp = y;        \
+      y = x;                            \
+      x = _swap_tmp;                    \
+   } while (0)
+
+namespace {
+   /**
+    * Variable-length vector type intended to represent cycle-count costs
+    * for arbitrary atom-to-bank assignments. It's indexed by a pair of
+    * integers (i, p), where i is an atom index and p in {0, 1} indicates
+    * the parity of the conflict (respectively, whether the cost is incurred
+    * whenever the atoms are assigned the same bank b or opposite-parity
+    * banks b and b^1).
+    * \sa shader_conflict_weight_matrix()
+    */
+   struct weight_vector_type {
+      weight_vector_type() : v(NULL), size(0) {}
+
+      weight_vector_type(unsigned n) :
+         v(new vector_type[DIV_ROUND_UP(n, vector_width)]()),
+         size(n) {}
+
+      weight_vector_type(const weight_vector_type &u) :
+         v(new vector_type[DIV_ROUND_UP(u.size, vector_width)]()),
+         size(u.size)
+      {
+         memcpy(v, u.v,
+                DIV_ROUND_UP(u.size, vector_width) * sizeof(vector_type));
+      }
+
+      ~weight_vector_type()
+      {
+         delete[] v;
+      }
+
+      weight_vector_type &
+      operator=(weight_vector_type u)
+      {
+         SWAP(v, u.v);
+         SWAP(size, u.size);
+         return *this;
+      }
+
+      vector_type *v;
+      unsigned size;
+   };
+
+   /**
+    * Set the (i, p)-th component of weight vector \p v to \p x.
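+    *
+    * The two parities are stored interleaved: component (i, p) lives at
+    * flat index 2 * i + p of the underlying vector_type array, which the
+    * accessors below rely on.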
+    */
+   void
+   set(weight_vector_type &v, unsigned i, unsigned p, scalar_type x)
+   {
+      set(v.v[(2 * i + p) / vector_width], (2 * i + p) % vector_width, x);
+   }
+
+   /**
+    * Get the (i, p)-th component of weight vector \p v.
+    */
+   scalar_type
+   get(const weight_vector_type &v, unsigned i, unsigned p)
+   {
+      return get(v.v[(2 * i + p) / vector_width], (2 * i + p) % vector_width);
+   }
+
+   /**
+    * Swap the (i, p)-th and (j, q)-th components of weight vector \p v.
+    */
+   void
+   swap(weight_vector_type &v,
+        unsigned i, unsigned p,
+        unsigned j, unsigned q)
+   {
+      const scalar_type tmp = get(v, i, p);
+      set(v, i, p, get(v, j, q));
+      set(v, j, q, tmp);
+   }
+}
+
+namespace {
+   /**
+    * Object that represents the partitioning of an arbitrary register space
+    * into indivisible units (referred to as atoms below) that can
+    * potentially be rearranged independently from other registers. The
+    * partitioning is inferred from a number of contiguity requirements
+    * specified using require_contiguous(). This allows efficient look-up
+    * of the atom index a given register address belongs to, or conversely
+    * the range of register addresses that belong to a given atom.
+    */
+   struct partitioning {
+      /**
+       * Create a (for the moment unrestricted) partitioning of a register
+       * file of size \p n. The units are arbitrary.
+       */
+      partitioning(unsigned n) :
+         max_reg(n),
+         offsets(new unsigned[n + num_terminator_atoms]),
+         atoms(new unsigned[n + num_terminator_atoms])
+      {
+         for (unsigned i = 0; i < n + num_terminator_atoms; i++) {
+            offsets[i] = i;
+            atoms[i] = i;
+         }
+      }
+
+      partitioning(const partitioning &p) :
+         max_reg(p.max_reg),
+         offsets(new unsigned[p.num_atoms() + num_terminator_atoms]),
+         atoms(new unsigned[p.max_reg + num_terminator_atoms])
+      {
+         memcpy(offsets, p.offsets,
+                sizeof(unsigned) * (p.num_atoms() + num_terminator_atoms));
+         memcpy(atoms, p.atoms,
+                sizeof(unsigned) * (p.max_reg + num_terminator_atoms));
+      }
+
+      ~partitioning()
+      {
+         delete[] offsets;
+         delete[] atoms;
+      }
+
+      partitioning &
+      operator=(partitioning p)
+      {
+         SWAP(max_reg, p.max_reg);
+         SWAP(offsets, p.offsets);
+         SWAP(atoms, p.atoms);
+         return *this;
+      }
+
+      /**
+       * Require register range [reg, reg + n[ to be considered part of the
+       * same atom.
+       */
+      void
+      require_contiguous(unsigned reg, unsigned n)
+      {
+         unsigned r = atoms[reg];
+
+         /* Renumber atoms[reg...] = { r... } and their offsets[r...] for
+          * the case that the specified contiguity requirement leads to the
+          * fusion (yay) of one or more existing atoms.
+          */
+         for (unsigned reg1 = reg + 1; reg1 <= max_reg; reg1++) {
+            if (offsets[atoms[reg1]] < reg + n) {
+               atoms[reg1] = r;
+            } else {
+               if (offsets[atoms[reg1 - 1]] != offsets[atoms[reg1]])
+                  r++;
+
+               offsets[r] = offsets[atoms[reg1]];
+               atoms[reg1] = r;
+            }
+         }
+      }
+
+      /**
+       * Get the atom index register address \p reg belongs to.
+       */
+      unsigned
+      atom_of_reg(unsigned reg) const
+      {
+         return atoms[reg];
+      }
+
+      /**
+       * Get the base register address that belongs to atom \p r.
+       */
+      unsigned
+      reg_of_atom(unsigned r) const
+      {
+         return offsets[r];
+      }
+
+      /**
+       * Get the size of atom \p r in register address units.
+       */
+      unsigned
+      size_of_atom(unsigned r) const
+      {
+         assert(r < num_atoms());
+         return reg_of_atom(r + 1) - reg_of_atom(r);
+      }
+
+      /**
+       * Get the number of atoms the whole register space is partitioned
+       * into.
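+       *
+       * E.g. a four-register file on which only require_contiguous(0, 2)
+       * has been called is split into the three atoms [0, 2[, [2, 3[ and
+       * [3, 4[, so this returns 3 (the trailing terminator atom is not
+       * counted).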
+       */
+      unsigned
+      num_atoms() const
+      {
+         return atoms[max_reg];
+      }
+
+   private:
+      /**
+       * Number of trailing atoms inserted for convenience so among other
+       * things we don't need to special-case the last element in
+       * size_of_atom().
+       */
+      static const unsigned num_terminator_atoms = 1;
+      unsigned max_reg;
+      unsigned *offsets;
+      unsigned *atoms;
+   };
+
+   /**
+    * Only GRF sources (whether they have been register-allocated or not)
+    * can possibly incur bank conflicts.
+    */
+   bool
+   is_grf(const fs_reg &r)
+   {
+      return r.file == VGRF || r.file == FIXED_GRF;
+   }
+
+   /**
+    * Register offset of \p r in GRF units. Useful because the
+    * representation of GRFs post-register allocation is somewhat
+    * inconsistent and depends on whether the register already had a fixed
+    * GRF offset prior to register allocation or whether it was part of a
+    * VGRF allocation.
+    */
+   unsigned
+   reg_of(const fs_reg &r)
+   {
+      assert(is_grf(r));
+      if (r.file == VGRF)
+         return r.nr + r.offset / REG_SIZE;
+      else
+         return reg_offset(r) / REG_SIZE;
+   }
+
+   /**
+    * Calculate the finest partitioning of the GRF space compatible with the
+    * register contiguity requirements derived from all instructions part of
+    * the program.
+    */
+   partitioning
+   shader_reg_partitioning(const fs_visitor *v)
+   {
+      partitioning p(BRW_MAX_GRF);
+
+      foreach_block_and_inst(block, fs_inst, inst, v->cfg) {
+         if (is_grf(inst->dst))
+            p.require_contiguous(reg_of(inst->dst), regs_written(inst));
+
+         for (int i = 0; i < inst->sources; i++) {
+            if (is_grf(inst->src[i]))
+               p.require_contiguous(reg_of(inst->src[i]), regs_read(inst, i));
+         }
+      }
+
+      return p;
+   }
+
+   /**
+    * Return the set of GRF atoms that should be left untouched at their
+    * original location to avoid violating hardware or software assumptions.
+    */
+   bool *
+   shader_reg_constraints(const fs_visitor *v, const partitioning &p)
+   {
+      bool *constrained = new bool[p.num_atoms()]();
+
+      /* These are read implicitly by some send-message instructions without
+       * any indication at the IR level. Assume they are unsafe to move
+       * around.
+       */
+      for (unsigned reg = 0; reg < 2; reg++)
+         constrained[p.atom_of_reg(reg)] = true;
+
+      /* Assume that anything referenced via fixed GRFs is baked into the
+       * hardware's fixed-function logic and may be unsafe to move around.
+       * Also take into account the source GRF restrictions of EOT
+       * send-message instructions.
+       */
+      foreach_block_and_inst(block, fs_inst, inst, v->cfg) {
+         if (inst->dst.file == FIXED_GRF)
+            constrained[p.atom_of_reg(reg_of(inst->dst))] = true;
+
+         for (int i = 0; i < inst->sources; i++) {
+            if (inst->src[i].file == FIXED_GRF ||
+                (is_grf(inst->src[i]) && inst->eot))
+               constrained[p.atom_of_reg(reg_of(inst->src[i]))] = true;
+         }
+      }
+
+      return constrained;
+   }
+
+   /**
+    * Return whether the hardware will be able to prevent a bank conflict by
+    * optimizing out the read cycle of a source register. The formula was
+    * found experimentally.
+    */
+   bool
+   is_conflict_optimized_out(const gen_device_info *devinfo,
+                             const fs_inst *inst)
+   {
+      return devinfo->gen >= 9 &&
+             ((is_grf(inst->src[0]) &&
+               (reg_of(inst->src[0]) == reg_of(inst->src[1]) ||
+                reg_of(inst->src[0]) == reg_of(inst->src[2]))) ||
+              reg_of(inst->src[1]) == reg_of(inst->src[2]));
+   }
+
+   /**
+    * Return a matrix that allows reasonably efficient computation of the
+    * cycle-count cost of bank conflicts incurred throughout the whole
+    * program for any given atom-to-bank assignment.
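+    *
+    * For instance, each SIMD16 ternary instruction with a 32-bit
+    * destination whose second and third sources lie in two distinct atoms
+    * of equal intra-atom parity contributes two cycles (one per SIMD8 op,
+    * \sa cycle_scale below) to the weight of a same-bank conflict between
+    * the two atoms, scaled by a rough estimate of the execution count of
+    * the enclosing basic block.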
+    *
+    * More precisely, if C_r_s_p is the result of this function, the total
+    * cost of all bank conflicts involving any given atom r can be readily
+    * recovered as follows:
+    *
+    *  S(B) = Sum_s_p(d_(p^B_r)_(B_s) * C_r_s_p)
+    *
+    * where d_i_j is the Kronecker delta, and B_r indicates the bank
+    * assignment of r. \sa delta_conflicts() for a vectorized
+    * implementation of the expression above.
+    *
+    * FINISHME: Teach this about the Gen10+ bank conflict rules, which are
+    *           somewhat more relaxed than on previous generations. In the
+    *           meantime optimizing based on Gen9 weights is likely to be
+    *           more helpful than not optimizing at all.
+    */
+   weight_vector_type *
+   shader_conflict_weight_matrix(const fs_visitor *v, const partitioning &p)
+   {
+      weight_vector_type *conflicts = new weight_vector_type[p.num_atoms()];
+      for (unsigned r = 0; r < p.num_atoms(); r++)
+         conflicts[r] = weight_vector_type(2 * p.num_atoms());
+
+      /* Crude approximation of the number of times the current basic block
+       * will be executed at run-time.
+       */
+      unsigned block_scale = 1;
+
+      foreach_block_and_inst(block, fs_inst, inst, v->cfg) {
+         if (inst->opcode == BRW_OPCODE_DO) {
+            block_scale *= 10;
+
+         } else if (inst->opcode == BRW_OPCODE_WHILE) {
+            block_scale /= 10;
+
+         } else if (inst->is_3src(v->devinfo) &&
+                    is_grf(inst->src[1]) && is_grf(inst->src[2])) {
+            const unsigned r = p.atom_of_reg(reg_of(inst->src[1]));
+            const unsigned s = p.atom_of_reg(reg_of(inst->src[2]));
+
+            /* Estimate of the cycle-count cost of incurring a bank conflict
+             * for this instruction. This is only true on the average, for
+             * a sequence of back-to-back ternary instructions, since the EU
+             * front-end only seems to be able to issue a new instruction at
+             * an even cycle. The cost of a bank conflict incurred by an
+             * isolated ternary instruction may be higher.
+             */
+            const unsigned exec_size = inst->dst.component_size(inst->exec_size);
+            const unsigned cycle_scale = block_scale * DIV_ROUND_UP(exec_size,
+                                                                    REG_SIZE);
+
+            /* Neglect same-atom conflicts (since they're either trivial or
+             * impossible to avoid without splitting the atom), and
+             * conflicts known to be optimized out by the hardware.
+             */
+            if (r != s && !is_conflict_optimized_out(v->devinfo, inst)) {
+               /* Calculate the parity of the sources relative to the start
+                * of their respective atoms. If their parity is the same
+                * (and none of the atoms straddle the 2KB mark), the
+                * instruction will incur a conflict iff both atoms are
+                * assigned the same bank b. If their parity is opposite,
+                * the instruction will incur a conflict iff they are
+                * assigned opposite banks (b and b^1).
+                */
+               const bool p_r = 1 & (reg_of(inst->src[1]) - p.reg_of_atom(r));
+               const bool p_s = 1 & (reg_of(inst->src[2]) - p.reg_of_atom(s));
+               const unsigned p = p_r ^ p_s;
+
+               /* Calculate the updated cost of a hypothetical conflict
+                * between atoms r and s. Note that the weight matrix is
+                * symmetric with respect to indices r and s by construction.
+                */
+               const scalar_type w = MIN2(unsigned(max_scalar),
+                                          get(conflicts[r], s, p) +
+                                          cycle_scale);
+               set(conflicts[r], s, p, w);
+               set(conflicts[s], r, p, w);
+            }
+         }
+      }
+
+      return conflicts;
+   }
+
+   /**
+    * Return the set of GRF atoms that could potentially lead to bank
+    * conflicts if laid out unfavorably in the GRF space according to the
+    * specified \p conflicts matrix (\sa shader_conflict_weight_matrix()).
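+    *
+    * An atom whose weight vector is identically zero can never be involved
+    * in a conflict, which allows optimize_reg_permutation() below to skip
+    * any candidate swap between two such conflict-free atoms.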
+    */
+   bool *
+   have_any_conflicts(const partitioning &p,
+                      const weight_vector_type *conflicts)
+   {
+      bool *any_conflicts = new bool[p.num_atoms()]();
+
+      for (unsigned r = 0; r < p.num_atoms(); r++) {
+         const unsigned m = DIV_ROUND_UP(conflicts[r].size, vector_width);
+         for (unsigned s = 0; s < m; s++)
+            any_conflicts[r] |= sums(conflicts[r].v[s]);
+      }
+
+      return any_conflicts;
+   }
+
+   /**
+    * Calculate the difference between two S(B) cost estimates as defined
+    * above (\sa shader_conflict_weight_matrix()). This represents the
+    * (partial) cycle-count benefit from moving an atom r from bank
+    * assignment Bp to Bn. The respective bank assignments are encoded as
+    * the \p bank_mask_p and \p bank_mask_n bitmasks for efficient
+    * computation, according to the formula:
+    *
+    *  bank_mask(B)_s_p = -d_(p^B_r)_(B_s)
+    *
+    * Notice the similarity with the delta function in the S(B) expression
+    * above, and how bank_mask(B) can be precomputed for every possible
+    * selection of r since bank_mask(B) depends on r only through B_r, which
+    * may only assume one of four different values, so the caller can keep
+    * every possible bank_mask(B) vector in memory without much hassle (\sa
+    * bank_characteristics()).
+    */
+   int
+   delta_conflicts(const weight_vector_type &bank_mask_p,
+                   const weight_vector_type &bank_mask_n,
+                   const weight_vector_type &conflicts)
+   {
+      const unsigned m = DIV_ROUND_UP(conflicts.size, vector_width);
+      vector_type s_p = {}, s_n = {};
+
+      for (unsigned r = 0; r < m; r++) {
+         s_p = adds(s_p, mask(bank_mask_p.v[r], conflicts.v[r]));
+         s_n = adds(s_n, mask(bank_mask_n.v[r], conflicts.v[r]));
+      }
+
+      return sums(subs(s_p, s_n));
+   }
+
+   /**
+    * Register atom permutation, represented as the start GRF offset each
+    * atom is mapped into.
+    */
+   struct permutation {
+      permutation() : v(NULL), size(0) {}
+
+      permutation(unsigned n) :
+         v(new unsigned[n]()), size(n) {}
+
+      permutation(const permutation &p) :
+         v(new unsigned[p.size]), size(p.size)
+      {
+         memcpy(v, p.v, p.size * sizeof(unsigned));
+      }
+
+      ~permutation()
+      {
+         delete[] v;
+      }
+
+      permutation &
+      operator=(permutation p)
+      {
+         SWAP(v, p.v);
+         SWAP(size, p.size);
+         return *this;
+      }
+
+      unsigned *v;
+      unsigned size;
+   };
+
+   /**
+    * Return an identity permutation of GRF atoms.
+    */
+   permutation
+   identity_reg_permutation(const partitioning &p)
+   {
+      permutation map(p.num_atoms());
+
+      for (unsigned r = 0; r < map.size; r++)
+         map.v[r] = p.reg_of_atom(r);
+
+      return map;
+   }
+
+   /**
+    * Return the bank index of GRF address \p reg, numbered according to the
+    * table:
+    *            Even    Odd
+    *    Lo       0       1
+    *    Hi       2       3
+    */
+   unsigned
+   bank_of(unsigned reg)
+   {
+      return (reg & 0x40) >> 5 | (reg & 1);
+   }
+
+   /**
+    * Return bitmasks suitable for use as bank mask arguments for the
+    * delta_conflicts() computation. Note that this is just the (negative)
+    * characteristic function of each bank, if you regard it as a set
+    * containing all atoms assigned to it according to the \p map array.
+    */
+   weight_vector_type *
+   bank_characteristics(const permutation &map)
+   {
+      weight_vector_type *banks = new weight_vector_type[4];
+
+      for (unsigned b = 0; b < 4; b++) {
+         banks[b] = weight_vector_type(2 * map.size);
+
+         for (unsigned j = 0; j < map.size; j++) {
+            for (unsigned p = 0; p < 2; p++)
+               set(banks[b], j, p,
+                   (b ^ p) == bank_of(map.v[j]) ? -1 : 0);
+         }
+      }
+
+      return banks;
+   }
+
+   /**
+    * Return an improved permutation of GRF atoms based on \p map attempting
+    * to reduce the total cycle-count cost of bank conflicts greedily.
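+    *
+    * For every unconstrained atom r this considers swapping r with each
+    * other unconstrained atom s of the same size currently assigned to a
+    * different bank, and commits the swap with the greatest cycle-count
+    * benefit as estimated by delta_conflicts().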
+    *
+    * Note that this doesn't attempt to merge multiple atoms into one,
+    * which might allow it to do a better job in some cases -- it simply
+    * reorders existing atoms in the GRF space without affecting their
+    * identity.
+    */
+   permutation
+   optimize_reg_permutation(const partitioning &p,
+                            const bool *constrained,
+                            const weight_vector_type *conflicts,
+                            permutation map)
+   {
+      const bool *any_conflicts = have_any_conflicts(p, conflicts);
+      weight_vector_type *banks = bank_characteristics(map);
+
+      for (unsigned r = 0; r < map.size; r++) {
+         const unsigned bank_r = bank_of(map.v[r]);
+
+         if (!constrained[r]) {
+            unsigned best_s = r;
+            int best_benefit = 0;
+
+            for (unsigned s = 0; s < map.size; s++) {
+               const unsigned bank_s = bank_of(map.v[s]);
+
+               if (bank_r != bank_s && !constrained[s] &&
+                   p.size_of_atom(r) == p.size_of_atom(s) &&
+                   (any_conflicts[r] || any_conflicts[s])) {
+                  const int benefit =
+                     delta_conflicts(banks[bank_r], banks[bank_s],
+                                     conflicts[r]) +
+                     delta_conflicts(banks[bank_s], banks[bank_r],
+                                     conflicts[s]);
+
+                  if (benefit > best_benefit) {
+                     best_s = s;
+                     best_benefit = benefit;
+                  }
+               }
+            }
+
+            if (best_s != r) {
+               for (unsigned b = 0; b < 4; b++) {
+                  for (unsigned p = 0; p < 2; p++)
+                     swap(banks[b], r, p, best_s, p);
+               }
+
+               SWAP(map.v[r], map.v[best_s]);
+            }
+         }
+      }
+
+      delete[] banks;
+      delete[] any_conflicts;
+      return map;
+   }
+
+   /**
+    * Apply the GRF atom permutation given by \p map to register \p r and
+    * return the result.
+    */
+   fs_reg
+   transform(const partitioning &p, const permutation &map, fs_reg r)
+   {
+      if (r.file == VGRF) {
+         const unsigned reg = reg_of(r);
+         const unsigned s = p.atom_of_reg(reg);
+         r.nr = map.v[s] + reg - p.reg_of_atom(s);
+         r.offset = r.offset % REG_SIZE;
+      }
+
+      return r;
+   }
+}
+
+bool
+fs_visitor::opt_bank_conflicts()
+{
+   assert(grf_used || !"Must be called after register allocation");
+
+   /* No ternary instructions -- No bank conflicts. */
+   if (devinfo->gen < 6)
+      return false;
+
+   const partitioning p = shader_reg_partitioning(this);
+   const bool *constrained = shader_reg_constraints(this, p);
+   const weight_vector_type *conflicts =
+      shader_conflict_weight_matrix(this, p);
+   const permutation map =
+      optimize_reg_permutation(p, constrained, conflicts,
+                               identity_reg_permutation(p));
+
+   foreach_block_and_inst(block, fs_inst, inst, cfg) {
+      inst->dst = transform(p, map, inst->dst);
+
+      for (int i = 0; i < inst->sources; i++)
+         inst->src[i] = transform(p, map, inst->src[i]);
+   }
+
+   delete[] conflicts;
+   delete[] constrained;
+   return true;
+}
diff --git a/src/intel/compiler/meson.build b/src/intel/compiler/meson.build
index fe0a1f6e8ac..91127bf35a9 100644
--- a/src/intel/compiler/meson.build
+++ b/src/intel/compiler/meson.build
@@ -43,6 +43,7 @@ libintel_compiler_files = files(
   'brw_eu.h',
   'brw_eu_util.c',
   'brw_eu_validate.c',
+  'brw_fs_bank_conflicts.cpp',
   'brw_fs_builder.h',
   'brw_fs_cmod_propagation.cpp',
   'brw_fs_combine_constants.cpp',