-rw-r--r--   src/intel/Makefile.sources                    |   1
-rw-r--r--   src/intel/compiler/brw_fs.cpp                 |   2
-rw-r--r--   src/intel/compiler/brw_fs.h                   |   1
-rw-r--r--   src/intel/compiler/brw_fs_bank_conflicts.cpp  | 893
-rw-r--r--   src/intel/compiler/meson.build                |   1
5 files changed, 898 insertions, 0 deletions
diff --git a/src/intel/Makefile.sources b/src/intel/Makefile.sources
index cdb10ece35e..1c62bad816e 100644
--- a/src/intel/Makefile.sources
+++ b/src/intel/Makefile.sources
@@ -48,6 +48,7 @@ COMPILER_FILES = \
 	compiler/brw_eu_util.c \
 	compiler/brw_eu_validate.c \
 	compiler/brw_fs_builder.h \
+	compiler/brw_fs_bank_conflicts.cpp \
 	compiler/brw_fs_cmod_propagation.cpp \
 	compiler/brw_fs_combine_constants.cpp \
 	compiler/brw_fs_copy_propagation.cpp \
diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp
index 93bb6b46732..c5d4f5634df 100644
--- a/src/intel/compiler/brw_fs.cpp
+++ b/src/intel/compiler/brw_fs.cpp
@@ -6065,6 +6065,8 @@ fs_visitor::allocate_registers(unsigned min_dispatch_width, bool allow_spilling)
    if (failed)
       return;
 
+   opt_bank_conflicts();
+
    schedule_instructions(SCHEDULE_POST);
 
    if (last_scratch > 0) {
diff --git a/src/intel/compiler/brw_fs.h b/src/intel/compiler/brw_fs.h
index 30557324d5a..0cec6fdcbad 100644
--- a/src/intel/compiler/brw_fs.h
+++ b/src/intel/compiler/brw_fs.h
@@ -145,6 +145,7 @@ public:
                                  exec_list *acp);
    bool opt_drop_redundant_mov_to_flags();
    bool opt_register_renaming();
+   bool opt_bank_conflicts();
    bool register_coalesce();
    bool compute_to_mrf();
    bool eliminate_find_live_channel();
diff --git a/src/intel/compiler/brw_fs_bank_conflicts.cpp b/src/intel/compiler/brw_fs_bank_conflicts.cpp
new file mode 100644
index 00000000000..b64a3d4a8a8
--- /dev/null
+++ b/src/intel/compiler/brw_fs_bank_conflicts.cpp
@@ -0,0 +1,893 @@
+/*
+ * Copyright © 2017 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+/** @file brw_fs_bank_conflicts.cpp
+ *
+ * This file contains a GRF bank conflict mitigation pass. The pass is
+ * intended to be run after register allocation and works by rearranging the
+ * layout of the GRF space (without altering the semantics of the program) in
+ * a way that minimizes the number of GRF bank conflicts incurred by ternary
+ * instructions.
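+ *
+ * For example, consider a hypothetical SIMD8 ternary instruction
+ * (regioning elided), schematically:
+ *
+ *    mad(8)   g10   g2   g4   g6
+ *
+ * Its second and third sources live in g4 and g6, which belong to the same
+ * GRF bank under the layout described in bank_of() below, so on the average
+ * the instruction pays a one-cycle penalty, while an otherwise identical
+ * instruction reading from g4 and g7 would not.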
+ *
+ * Unfortunately there is close to no information about bank conflicts in
+ * the hardware spec, but experimentally on Gen7-Gen9 ternary instructions
+ * seem to incur an average bank conflict penalty of one cycle per SIMD8 op
+ * whenever the second and third source are stored in the same GRF bank (\sa
+ * bank_of() for the exact bank layout) which cannot be fetched during the
+ * same cycle by the EU, unless the EU logic manages to optimize out the
+ * read cycle of a duplicate source register (\sa
+ * is_conflict_optimized_out()).
+ *
+ * The asymptotic run-time of the algorithm is dominated by the
+ * shader_conflict_weight_matrix() computation below, which is O(n) on the
+ * number of instructions in the program; however, for small and
+ * medium-sized programs the run-time is likely to be dominated by
+ * optimize_reg_permutation(), which is O(m^3) on the number of GRF atoms of
+ * the program (\sa partitioning), which is bounded (since the program uses
+ * a bounded number of registers post-regalloc) and of the order of 100.
+ * For that reason optimize_reg_permutation() is vectorized in order to keep
+ * the cubic term within reasonable bounds for m close to its theoretical
+ * maximum.
+ */
+
+#include "brw_fs.h"
+#include "brw_cfg.h"
+
+#ifdef __SSE2__
+
+#include <emmintrin.h>
+
+/**
+ * Thin layer around vector intrinsics so they can be easily replaced with
+ * e.g. the fall-back scalar path, an implementation with different vector
+ * width or using different SIMD architectures (AVX-512?!).
+ *
+ * This implementation operates on pairs of independent SSE2 integer vectors
+ * à la SIMD16 for somewhat improved throughput. SSE2 is supported by
+ * virtually all platforms that care about bank conflicts, so this path
+ * should almost always be available in practice.
+ */
+namespace {
+   /**
+    * SIMD integer vector data type.
+    */
+   struct vector_type {
+      __m128i v[2];
+   };
+
+   /**
+    * Scalar data type matching the representation of a single component of
+    * \p vector_type.
+    */
+   typedef int16_t scalar_type;
+
+   /**
+    * Maximum integer value representable as a \p scalar_type.
+    */
+   const scalar_type max_scalar = INT16_MAX;
+
+   /**
+    * Number of components of a \p vector_type.
+    */
+   const unsigned vector_width = 2 * sizeof(__m128i) / sizeof(scalar_type);
+
+   /**
+    * Set the i-th component of vector \p v to \p x.
+    */
+   void
+   set(vector_type &v, unsigned i, scalar_type x)
+   {
+      assert(i < vector_width);
+      memcpy((char *)v.v + i * sizeof(x), &x, sizeof(x));
+   }
+
+   /**
+    * Get the i-th component of vector \p v.
+    */
+   scalar_type
+   get(const vector_type &v, unsigned i)
+   {
+      assert(i < vector_width);
+      scalar_type x;
+      memcpy(&x, (char *)v.v + i * sizeof(x), sizeof(x));
+      return x;
+   }
+
+   /**
+    * Add two vectors with saturation.
+    */
+   vector_type
+   adds(const vector_type &v, const vector_type &w)
+   {
+      const vector_type u = {{
+         _mm_adds_epi16(v.v[0], w.v[0]),
+         _mm_adds_epi16(v.v[1], w.v[1])
+      }};
+      return u;
+   }
+
+   /**
+    * Subtract two vectors with saturation.
+    */
+   vector_type
+   subs(const vector_type &v, const vector_type &w)
+   {
+      const vector_type u = {{
+         _mm_subs_epi16(v.v[0], w.v[0]),
+         _mm_subs_epi16(v.v[1], w.v[1])
+      }};
+      return u;
+   }
+
+   /**
+    * Compute the bitwise conjunction of two vectors.
+    */
+   vector_type
+   mask(const vector_type &v, const vector_type &w)
+   {
+      const vector_type u = {{
+         _mm_and_si128(v.v[0], w.v[0]),
+         _mm_and_si128(v.v[1], w.v[1])
+      }};
+      return u;
+   }
+
+   /**
+    * Reduce the components of a vector using saturating addition.
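+    *
+    * The implementation below first adds the two halves of the SIMD16
+    * pair, then folds the result in half three times (64-bit, 32-bit and
+    * 16-bit swaps via the shuffle intrinsics), accumulating with
+    * saturation at each step, so the bottom 16-bit word ends up holding
+    * the saturated sum of all sixteen components.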
+    */
+   scalar_type
+   sums(const vector_type &v)
+   {
+      const __m128i v8 = _mm_adds_epi16(v.v[0], v.v[1]);
+      const __m128i v4 = _mm_adds_epi16(v8, _mm_shuffle_epi32(v8, 0x4e));
+      const __m128i v2 = _mm_adds_epi16(v4, _mm_shuffle_epi32(v4, 0xb1));
+      const __m128i v1 = _mm_adds_epi16(v2, _mm_shufflelo_epi16(v2, 0xb1));
+      return _mm_extract_epi16(v1, 0);
+   }
+}
+
+#else
+
+/**
+ * Thin layer around vector intrinsics so they can be easily replaced with
+ * e.g. the fall-back scalar path, an implementation with different vector
+ * width or using different SIMD architectures (AVX-512?!).
+ *
+ * This implementation operates on scalar values and doesn't rely on any
+ * vector extensions. This is mainly intended for debugging and to keep
+ * this file building on exotic platforms.
+ */
+namespace {
+   /**
+    * SIMD integer vector data type.
+    */
+   typedef int16_t vector_type;
+
+   /**
+    * Scalar data type matching the representation of a single component of
+    * \p vector_type.
+    */
+   typedef int16_t scalar_type;
+
+   /**
+    * Maximum integer value representable as a \p scalar_type.
+    */
+   const scalar_type max_scalar = INT16_MAX;
+
+   /**
+    * Number of components of a \p vector_type.
+    */
+   const unsigned vector_width = 1;
+
+   /**
+    * Set the i-th component of vector \p v to \p x.
+    */
+   void
+   set(vector_type &v, unsigned i, scalar_type x)
+   {
+      assert(i < vector_width);
+      v = x;
+   }
+
+   /**
+    * Get the i-th component of vector \p v.
+    */
+   scalar_type
+   get(const vector_type &v, unsigned i)
+   {
+      assert(i < vector_width);
+      return v;
+   }
+
+   /**
+    * Add two vectors with saturation.
+    */
+   vector_type
+   adds(vector_type v, vector_type w)
+   {
+      return MAX2(INT16_MIN, MIN2(INT16_MAX, int(v) + w));
+   }
+
+   /**
+    * Subtract two vectors with saturation.
+    */
+   vector_type
+   subs(vector_type v, vector_type w)
+   {
+      return MAX2(INT16_MIN, MIN2(INT16_MAX, int(v) - w));
+   }
+
+   /**
+    * Compute the bitwise conjunction of two vectors.
+    */
+   vector_type
+   mask(vector_type v, vector_type w)
+   {
+      return v & w;
+   }
+
+   /**
+    * Reduce the components of a vector using saturating addition.
+    */
+   scalar_type
+   sums(vector_type v)
+   {
+      return v;
+   }
+}
+
+#endif
+
+/**
+ * Swap \p x and \p y.
+ */
+#define SWAP(x, y) do {                 \
+      __typeof(y) _swap_tmp = y;        \
+      y = x;                            \
+      x = _swap_tmp;                    \
+   } while (0)
+
+namespace {
+   /**
+    * Variable-length vector type intended to represent cycle-count costs
+    * for arbitrary atom-to-bank assignments. It's indexed by a pair of
+    * integers (i, p), where i is an atom index and p in {0, 1} indicates
+    * the parity of the conflict (respectively, whether the cost is incurred
+    * whenever the atoms are assigned the same bank b or opposite-parity
+    * banks b and b^1).
+    * \sa shader_conflict_weight_matrix()
+    */
+   struct weight_vector_type {
+      weight_vector_type() : v(NULL), size(0) {}
+
+      weight_vector_type(unsigned n) :
+         v(new vector_type[DIV_ROUND_UP(n, vector_width)]()),
+         size(n) {}
+
+      weight_vector_type(const weight_vector_type &u) :
+         v(new vector_type[DIV_ROUND_UP(u.size, vector_width)]()),
+         size(u.size)
+      {
+         memcpy(v, u.v,
+                DIV_ROUND_UP(u.size, vector_width) * sizeof(vector_type));
+      }
+
+      ~weight_vector_type()
+      {
+         delete[] v;
+      }
+
+      weight_vector_type &
+      operator=(weight_vector_type u)
+      {
+         SWAP(v, u.v);
+         SWAP(size, u.size);
+         return *this;
+      }
+
+      vector_type *v;
+      unsigned size;
+   };
+
+   /**
+    * Set the (i, p)-th component of weight vector \p v to \p x.
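+    *
+    * The two parities are stored interleaved: component (i, p) lives at
+    * flat index 2 * i + p of the underlying vector_type array, which the
+    * accessors below rely on.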
+    */
+   void
+   set(weight_vector_type &v, unsigned i, unsigned p, scalar_type x)
+   {
+      set(v.v[(2 * i + p) / vector_width], (2 * i + p) % vector_width, x);
+   }
+
+   /**
+    * Get the (i, p)-th component of weight vector \p v.
+    */
+   scalar_type
+   get(const weight_vector_type &v, unsigned i, unsigned p)
+   {
+      return get(v.v[(2 * i + p) / vector_width], (2 * i + p) % vector_width);
+   }
+
+   /**
+    * Swap the (i, p)-th and (j, q)-th components of weight vector \p v.
+    */
+   void
+   swap(weight_vector_type &v,
+        unsigned i, unsigned p,
+        unsigned j, unsigned q)
+   {
+      const scalar_type tmp = get(v, i, p);
+      set(v, i, p, get(v, j, q));
+      set(v, j, q, tmp);
+   }
+}
+
+namespace {
+   /**
+    * Object that represents the partitioning of an arbitrary register space
+    * into indivisible units (referred to as atoms below) that can
+    * potentially be rearranged independently from other registers. The
+    * partitioning is inferred from a number of contiguity requirements
+    * specified using require_contiguous(). This allows efficient look-up
+    * of the atom index a given register address belongs to, or conversely
+    * the range of register addresses that belong to a given atom.
+    */
+   struct partitioning {
+      /**
+       * Create a (for the moment unrestricted) partitioning of a register
+       * file of size \p n. The units are arbitrary.
+       */
+      partitioning(unsigned n) :
+         max_reg(n),
+         offsets(new unsigned[n + num_terminator_atoms]),
+         atoms(new unsigned[n + num_terminator_atoms])
+      {
+         for (unsigned i = 0; i < n + num_terminator_atoms; i++) {
+            offsets[i] = i;
+            atoms[i] = i;
+         }
+      }
+
+      partitioning(const partitioning &p) :
+         max_reg(p.max_reg),
+         offsets(new unsigned[p.num_atoms() + num_terminator_atoms]),
+         atoms(new unsigned[p.max_reg + num_terminator_atoms])
+      {
+         memcpy(offsets, p.offsets,
+                sizeof(unsigned) * (p.num_atoms() + num_terminator_atoms));
+         memcpy(atoms, p.atoms,
+                sizeof(unsigned) * (p.max_reg + num_terminator_atoms));
+      }
+
+      ~partitioning()
+      {
+         delete[] offsets;
+         delete[] atoms;
+      }
+
+      partitioning &
+      operator=(partitioning p)
+      {
+         SWAP(max_reg, p.max_reg);
+         SWAP(offsets, p.offsets);
+         SWAP(atoms, p.atoms);
+         return *this;
+      }
+
+      /**
+       * Require register range [reg, reg + n[ to be considered part of the
+       * same atom.
+       */
+      void
+      require_contiguous(unsigned reg, unsigned n)
+      {
+         unsigned r = atoms[reg];
+
+         /* Renumber atoms[reg...] = { r... } and their offsets[r...] for
+          * the case that the specified contiguity requirement leads to the
+          * fusion (yay) of one or more existing atoms.
+          */
+         for (unsigned reg1 = reg + 1; reg1 <= max_reg; reg1++) {
+            if (offsets[atoms[reg1]] < reg + n) {
+               atoms[reg1] = r;
+            } else {
+               if (offsets[atoms[reg1 - 1]] != offsets[atoms[reg1]])
+                  r++;
+
+               offsets[r] = offsets[atoms[reg1]];
+               atoms[reg1] = r;
+            }
+         }
+      }
+
+      /**
+       * Get the atom index register address \p reg belongs to.
+       */
+      unsigned
+      atom_of_reg(unsigned reg) const
+      {
+         return atoms[reg];
+      }
+
+      /**
+       * Get the base register address that belongs to atom \p r.
+       */
+      unsigned
+      reg_of_atom(unsigned r) const
+      {
+         return offsets[r];
+      }
+
+      /**
+       * Get the size of atom \p r in register address units.
+       */
+      unsigned
+      size_of_atom(unsigned r) const
+      {
+         assert(r < num_atoms());
+         return reg_of_atom(r + 1) - reg_of_atom(r);
+      }
+
+      /**
+       * Get the number of atoms the whole register space is partitioned
+       * into.
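+       *
+       * E.g. a four-register file on which only require_contiguous(0, 2)
+       * has been called is split into the three atoms [0, 2[, [2, 3[ and
+       * [3, 4[, so this returns 3 (the trailing terminator atom is not
+       * counted).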
+       */
+      unsigned
+      num_atoms() const
+      {
+         return atoms[max_reg];
+      }
+
+   private:
+      /**
+       * Number of trailing atoms inserted for convenience so among other
+       * things we don't need to special-case the last element in
+       * size_of_atom().
+       */
+      static const unsigned num_terminator_atoms = 1;
+      unsigned max_reg;
+      unsigned *offsets;
+      unsigned *atoms;
+   };
+
+   /**
+    * Only GRF sources (whether they have been register-allocated or not)
+    * can possibly incur bank conflicts.
+    */
+   bool
+   is_grf(const fs_reg &r)
+   {
+      return r.file == VGRF || r.file == FIXED_GRF;
+   }
+
+   /**
+    * Register offset of \p r in GRF units. Useful because the
+    * representation of GRFs post-register allocation is somewhat
+    * inconsistent and depends on whether the register already had a fixed
+    * GRF offset prior to register allocation or whether it was part of a
+    * VGRF allocation.
+    */
+   unsigned
+   reg_of(const fs_reg &r)
+   {
+      assert(is_grf(r));
+      if (r.file == VGRF)
+         return r.nr + r.offset / REG_SIZE;
+      else
+         return reg_offset(r) / REG_SIZE;
+   }
+
+   /**
+    * Calculate the finest partitioning of the GRF space compatible with the
+    * register contiguity requirements derived from all instructions part of
+    * the program.
+    */
+   partitioning
+   shader_reg_partitioning(const fs_visitor *v)
+   {
+      partitioning p(BRW_MAX_GRF);
+
+      foreach_block_and_inst(block, fs_inst, inst, v->cfg) {
+         if (is_grf(inst->dst))
+            p.require_contiguous(reg_of(inst->dst), regs_written(inst));
+
+         for (int i = 0; i < inst->sources; i++) {
+            if (is_grf(inst->src[i]))
+               p.require_contiguous(reg_of(inst->src[i]), regs_read(inst, i));
+         }
+      }
+
+      return p;
+   }
+
+   /**
+    * Return the set of GRF atoms that should be left untouched at their
+    * original location to avoid violating hardware or software assumptions.
+    */
+   bool *
+   shader_reg_constraints(const fs_visitor *v, const partitioning &p)
+   {
+      bool *constrained = new bool[p.num_atoms()]();
+
+      /* These are read implicitly by some send-message instructions without
+       * any indication at the IR level. Assume they are unsafe to move
+       * around.
+       */
+      for (unsigned reg = 0; reg < 2; reg++)
+         constrained[p.atom_of_reg(reg)] = true;
+
+      /* Assume that anything referenced via fixed GRFs is baked into the
+       * hardware's fixed-function logic and may be unsafe to move around.
+       * Also take into account the source GRF restrictions of EOT
+       * send-message instructions.
+       */
+      foreach_block_and_inst(block, fs_inst, inst, v->cfg) {
+         if (inst->dst.file == FIXED_GRF)
+            constrained[p.atom_of_reg(reg_of(inst->dst))] = true;
+
+         for (int i = 0; i < inst->sources; i++) {
+            if (inst->src[i].file == FIXED_GRF ||
+                (is_grf(inst->src[i]) && inst->eot))
+               constrained[p.atom_of_reg(reg_of(inst->src[i]))] = true;
+         }
+      }
+
+      return constrained;
+   }
+
+   /**
+    * Return whether the hardware will be able to prevent a bank conflict by
+    * optimizing out the read cycle of a source register. The formula was
+    * found experimentally.
+    */
+   bool
+   is_conflict_optimized_out(const gen_device_info *devinfo,
+                             const fs_inst *inst)
+   {
+      return devinfo->gen >= 9 &&
+             ((is_grf(inst->src[0]) &&
+               (reg_of(inst->src[0]) == reg_of(inst->src[1]) ||
+                reg_of(inst->src[0]) == reg_of(inst->src[2]))) ||
+              reg_of(inst->src[1]) == reg_of(inst->src[2]));
+   }
+
+   /**
+    * Return a matrix that allows reasonably efficient computation of the
+    * cycle-count cost of bank conflicts incurred throughout the whole
+    * program for any given atom-to-bank assignment.
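+    *
+    * For instance, each SIMD16 ternary instruction with a 32-bit
+    * destination whose second and third sources lie in two distinct atoms
+    * of equal intra-atom parity contributes two cycles (one per SIMD8 op,
+    * \sa cycle_scale below) to the weight of a same-bank conflict between
+    * the two atoms, scaled by a rough estimate of the execution count of
+    * the enclosing basic block.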
+    *
+    * More precisely, if C_r_s_p is the result of this function, the total
+    * cost of all bank conflicts involving any given atom r can be readily
+    * recovered as follows:
+    *
+    *  S(B) = Sum_s_p(d_(p^B_r)_(B_s) * C_r_s_p)
+    *
+    * where d_i_j is the Kronecker delta, and B_r indicates the bank
+    * assignment of r. \sa delta_conflicts() for a vectorized
+    * implementation of the expression above.
+    *
+    * FINISHME: Teach this about the Gen10+ bank conflict rules, which are
+    *           somewhat more relaxed than on previous generations. In the
+    *           meantime optimizing based on Gen9 weights is likely to be
+    *           more helpful than not optimizing at all.
+    */
+   weight_vector_type *
+   shader_conflict_weight_matrix(const fs_visitor *v, const partitioning &p)
+   {
+      weight_vector_type *conflicts = new weight_vector_type[p.num_atoms()];
+      for (unsigned r = 0; r < p.num_atoms(); r++)
+         conflicts[r] = weight_vector_type(2 * p.num_atoms());
+
+      /* Crude approximation of the number of times the current basic block
+       * will be executed at run-time.
+       */
+      unsigned block_scale = 1;
+
+      foreach_block_and_inst(block, fs_inst, inst, v->cfg) {
+         if (inst->opcode == BRW_OPCODE_DO) {
+            block_scale *= 10;
+
+         } else if (inst->opcode == BRW_OPCODE_WHILE) {
+            block_scale /= 10;
+
+         } else if (inst->is_3src(v->devinfo) &&
+                    is_grf(inst->src[1]) && is_grf(inst->src[2])) {
+            const unsigned r = p.atom_of_reg(reg_of(inst->src[1]));
+            const unsigned s = p.atom_of_reg(reg_of(inst->src[2]));
+
+            /* Estimate of the cycle-count cost of incurring a bank conflict
+             * for this instruction. This is only true on the average, for
+             * a sequence of back-to-back ternary instructions, since the EU
+             * front-end only seems to be able to issue a new instruction at
+             * an even cycle. The cost of a bank conflict incurred by an
+             * isolated ternary instruction may be higher.
+             */
+            const unsigned exec_size = inst->dst.component_size(inst->exec_size);
+            const unsigned cycle_scale = block_scale * DIV_ROUND_UP(exec_size,
+                                                                    REG_SIZE);
+
+            /* Neglect same-atom conflicts (since they're either trivial or
+             * impossible to avoid without splitting the atom), and
+             * conflicts known to be optimized out by the hardware.
+             */
+            if (r != s && !is_conflict_optimized_out(v->devinfo, inst)) {
+               /* Calculate the parity of the sources relative to the start
+                * of their respective atoms. If their parity is the same
+                * (and none of the atoms straddle the 2KB mark), the
+                * instruction will incur a conflict iff both atoms are
+                * assigned the same bank b. If their parity is opposite,
+                * the instruction will incur a conflict iff they are
+                * assigned opposite banks (b and b^1).
+                */
+               const bool p_r = 1 & (reg_of(inst->src[1]) - p.reg_of_atom(r));
+               const bool p_s = 1 & (reg_of(inst->src[2]) - p.reg_of_atom(s));
+               const unsigned p = p_r ^ p_s;
+
+               /* Calculate the updated cost of a hypothetical conflict
+                * between atoms r and s. Note that the weight matrix is
+                * symmetric with respect to indices r and s by construction.
+                */
+               const scalar_type w = MIN2(unsigned(max_scalar),
+                                          get(conflicts[r], s, p) +
+                                          cycle_scale);
+               set(conflicts[r], s, p, w);
+               set(conflicts[s], r, p, w);
+            }
+         }
+      }
+
+      return conflicts;
+   }
+
+   /**
+    * Return the set of GRF atoms that could potentially lead to bank
+    * conflicts if laid out unfavorably in the GRF space according to the
+    * specified \p conflicts matrix (\sa shader_conflict_weight_matrix()).
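+    *
+    * An atom whose weight vector is identically zero can never be involved
+    * in a conflict, which allows optimize_reg_permutation() below to skip
+    * any candidate swap between two such conflict-free atoms.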
+    */
+   bool *
+   have_any_conflicts(const partitioning &p,
+                      const weight_vector_type *conflicts)
+   {
+      bool *any_conflicts = new bool[p.num_atoms()]();
+
+      for (unsigned r = 0; r < p.num_atoms(); r++) {
+         const unsigned m = DIV_ROUND_UP(conflicts[r].size, vector_width);
+         for (unsigned s = 0; s < m; s++)
+            any_conflicts[r] |= sums(conflicts[r].v[s]);
+      }
+
+      return any_conflicts;
+   }
+
+   /**
+    * Calculate the difference between two S(B) cost estimates as defined
+    * above (\sa shader_conflict_weight_matrix()). This represents the
+    * (partial) cycle-count benefit from moving an atom r from bank
+    * assignment Bp to Bn. The respective bank assignments are encoded as
+    * the \p bank_mask_p and \p bank_mask_n bitmasks for efficient
+    * computation, according to the formula:
+    *
+    *  bank_mask(B)_s_p = -d_(p^B_r)_(B_s)
+    *
+    * Notice the similarity with the delta function in the S(B) expression
+    * above, and how bank_mask(B) can be precomputed for every possible
+    * selection of r since bank_mask(B) depends on r only through B_r, which
+    * may only assume one of four different values, so the caller can keep
+    * every possible bank_mask(B) vector in memory without much hassle (\sa
+    * bank_characteristics()).
+    */
+   int
+   delta_conflicts(const weight_vector_type &bank_mask_p,
+                   const weight_vector_type &bank_mask_n,
+                   const weight_vector_type &conflicts)
+   {
+      const unsigned m = DIV_ROUND_UP(conflicts.size, vector_width);
+      vector_type s_p = {}, s_n = {};
+
+      for (unsigned r = 0; r < m; r++) {
+         s_p = adds(s_p, mask(bank_mask_p.v[r], conflicts.v[r]));
+         s_n = adds(s_n, mask(bank_mask_n.v[r], conflicts.v[r]));
+      }
+
+      return sums(subs(s_p, s_n));
+   }
+
+   /**
+    * Register atom permutation, represented as the start GRF offset each
+    * atom is mapped into.
+    */
+   struct permutation {
+      permutation() : v(NULL), size(0) {}
+
+      permutation(unsigned n) :
+         v(new unsigned[n]()), size(n) {}
+
+      permutation(const permutation &p) :
+         v(new unsigned[p.size]), size(p.size)
+      {
+         memcpy(v, p.v, p.size * sizeof(unsigned));
+      }
+
+      ~permutation()
+      {
+         delete[] v;
+      }
+
+      permutation &
+      operator=(permutation p)
+      {
+         SWAP(v, p.v);
+         SWAP(size, p.size);
+         return *this;
+      }
+
+      unsigned *v;
+      unsigned size;
+   };
+
+   /**
+    * Return an identity permutation of GRF atoms.
+    */
+   permutation
+   identity_reg_permutation(const partitioning &p)
+   {
+      permutation map(p.num_atoms());
+
+      for (unsigned r = 0; r < map.size; r++)
+         map.v[r] = p.reg_of_atom(r);
+
+      return map;
+   }
+
+   /**
+    * Return the bank index of GRF address \p reg, numbered according to the
+    * table:
+    *            Even    Odd
+    *    Lo       0       1
+    *    Hi       2       3
+    */
+   unsigned
+   bank_of(unsigned reg)
+   {
+      return (reg & 0x40) >> 5 | (reg & 1);
+   }
+
+   /**
+    * Return bitmasks suitable for use as bank mask arguments for the
+    * delta_conflicts() computation. Note that this is just the (negative)
+    * characteristic function of each bank, if you regard it as a set
+    * containing all atoms assigned to it according to the \p map array.
+    */
+   weight_vector_type *
+   bank_characteristics(const permutation &map)
+   {
+      weight_vector_type *banks = new weight_vector_type[4];
+
+      for (unsigned b = 0; b < 4; b++) {
+         banks[b] = weight_vector_type(2 * map.size);
+
+         for (unsigned j = 0; j < map.size; j++) {
+            for (unsigned p = 0; p < 2; p++)
+               set(banks[b], j, p,
+                   (b ^ p) == bank_of(map.v[j]) ? -1 : 0);
+         }
+      }
+
+      return banks;
+   }
+
+   /**
+    * Return an improved permutation of GRF atoms based on \p map attempting
+    * to reduce the total cycle-count cost of bank conflicts greedily.
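+    *
+    * For every unconstrained atom r this considers swapping r with each
+    * other unconstrained atom s of the same size currently assigned to a
+    * different bank, and commits the swap with the greatest cycle-count
+    * benefit as estimated by delta_conflicts().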
+    *
+    * Note that this doesn't attempt to merge multiple atoms into one,
+    * which might allow it to do a better job in some cases -- it simply
+    * reorders existing atoms in the GRF space without affecting their
+    * identity.
+    */
+   permutation
+   optimize_reg_permutation(const partitioning &p,
+                            const bool *constrained,
+                            const weight_vector_type *conflicts,
+                            permutation map)
+   {
+      const bool *any_conflicts = have_any_conflicts(p, conflicts);
+      weight_vector_type *banks = bank_characteristics(map);
+
+      for (unsigned r = 0; r < map.size; r++) {
+         const unsigned bank_r = bank_of(map.v[r]);
+
+         if (!constrained[r]) {
+            unsigned best_s = r;
+            int best_benefit = 0;
+
+            for (unsigned s = 0; s < map.size; s++) {
+               const unsigned bank_s = bank_of(map.v[s]);
+
+               if (bank_r != bank_s && !constrained[s] &&
+                   p.size_of_atom(r) == p.size_of_atom(s) &&
+                   (any_conflicts[r] || any_conflicts[s])) {
+                  const int benefit =
+                     delta_conflicts(banks[bank_r], banks[bank_s],
+                                     conflicts[r]) +
+                     delta_conflicts(banks[bank_s], banks[bank_r],
+                                     conflicts[s]);
+
+                  if (benefit > best_benefit) {
+                     best_s = s;
+                     best_benefit = benefit;
+                  }
+               }
+            }
+
+            if (best_s != r) {
+               for (unsigned b = 0; b < 4; b++) {
+                  for (unsigned p = 0; p < 2; p++)
+                     swap(banks[b], r, p, best_s, p);
+               }
+
+               SWAP(map.v[r], map.v[best_s]);
+            }
+         }
+      }
+
+      delete[] banks;
+      delete[] any_conflicts;
+      return map;
+   }
+
+   /**
+    * Apply the GRF atom permutation given by \p map to register \p r and
+    * return the result.
+    */
+   fs_reg
+   transform(const partitioning &p, const permutation &map, fs_reg r)
+   {
+      if (r.file == VGRF) {
+         const unsigned reg = reg_of(r);
+         const unsigned s = p.atom_of_reg(reg);
+         r.nr = map.v[s] + reg - p.reg_of_atom(s);
+         r.offset = r.offset % REG_SIZE;
+      }
+
+      return r;
+   }
+}
+
+bool
+fs_visitor::opt_bank_conflicts()
+{
+   assert(grf_used || !"Must be called after register allocation");
+
+   /* No ternary instructions -- No bank conflicts. */
+   if (devinfo->gen < 6)
+      return false;
+
+   const partitioning p = shader_reg_partitioning(this);
+   const bool *constrained = shader_reg_constraints(this, p);
+   const weight_vector_type *conflicts =
+      shader_conflict_weight_matrix(this, p);
+   const permutation map =
+      optimize_reg_permutation(p, constrained, conflicts,
+                               identity_reg_permutation(p));
+
+   foreach_block_and_inst(block, fs_inst, inst, cfg) {
+      inst->dst = transform(p, map, inst->dst);
+
+      for (int i = 0; i < inst->sources; i++)
+         inst->src[i] = transform(p, map, inst->src[i]);
+   }
+
+   delete[] conflicts;
+   delete[] constrained;
+   return true;
+}
diff --git a/src/intel/compiler/meson.build b/src/intel/compiler/meson.build
index fe0a1f6e8ac..91127bf35a9 100644
--- a/src/intel/compiler/meson.build
+++ b/src/intel/compiler/meson.build
@@ -43,6 +43,7 @@ libintel_compiler_files = files(
   'brw_eu.h',
   'brw_eu_util.c',
   'brw_eu_validate.c',
+  'brw_fs_bank_conflicts.cpp',
   'brw_fs_builder.h',
   'brw_fs_cmod_propagation.cpp',
   'brw_fs_combine_constants.cpp',