diff options
Diffstat (limited to 'src/intel/compiler')
-rw-r--r-- | src/intel/compiler/brw_fs.cpp | 5 | ||||
-rw-r--r-- | src/intel/compiler/brw_fs.h | 21 | ||||
-rw-r--r-- | src/intel/compiler/brw_fs_lower_regioning.cpp | 399 | ||||
-rw-r--r-- | src/intel/compiler/brw_ir_fs.h | 10 | ||||
-rw-r--r-- | src/intel/compiler/meson.build | 1 |
5 files changed, 417 insertions, 19 deletions
diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp index 8981a94b882..edcb0ce79f4 100644 --- a/src/intel/compiler/brw_fs.cpp +++ b/src/intel/compiler/brw_fs.cpp @@ -6478,7 +6478,10 @@ fs_visitor::optimize() OPT(dead_code_eliminate); } - if (OPT(lower_conversions)) { + progress = false; + OPT(lower_conversions); + OPT(lower_regioning); + if (progress) { OPT(opt_copy_propagation); OPT(dead_code_eliminate); OPT(lower_simd_width); diff --git a/src/intel/compiler/brw_fs.h b/src/intel/compiler/brw_fs.h index dc36ecc21ac..36825754931 100644 --- a/src/intel/compiler/brw_fs.h +++ b/src/intel/compiler/brw_fs.h @@ -164,6 +164,7 @@ public: void lower_uniform_pull_constant_loads(); bool lower_load_payload(); bool lower_pack(); + bool lower_regioning(); bool lower_conversions(); bool lower_logical_sends(); bool lower_integer_multiplication(); @@ -536,24 +537,8 @@ namespace brw { } } - /** - * Remove any modifiers from the \p i-th source region of the instruction, - * including negate, abs and any implicit type conversion to the execution - * type. Instead any source modifiers will be implemented as a separate - * MOV instruction prior to the original instruction. - */ - inline bool - lower_src_modifiers(fs_visitor *v, bblock_t *block, fs_inst *inst, unsigned i) - { - assert(inst->components_read(i) == 1); - const fs_builder ibld(v, block, inst); - const fs_reg tmp = ibld.vgrf(get_exec_type(inst)); - - ibld.MOV(tmp, inst->src[i]); - inst->src[i] = tmp; - - return true; - } + bool + lower_src_modifiers(fs_visitor *v, bblock_t *block, fs_inst *inst, unsigned i); } void shuffle_from_32bit_read(const brw::fs_builder &bld, diff --git a/src/intel/compiler/brw_fs_lower_regioning.cpp b/src/intel/compiler/brw_fs_lower_regioning.cpp new file mode 100644 index 00000000000..cc4163b4c2c --- /dev/null +++ b/src/intel/compiler/brw_fs_lower_regioning.cpp @@ -0,0 +1,399 @@ +/* + * Copyright © 2018 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include "brw_fs.h" +#include "brw_cfg.h" +#include "brw_fs_builder.h" + +using namespace brw; + +namespace { + /* From the SKL PRM Vol 2a, "Move": + * + * "A mov with the same source and destination type, no source modifier, + * and no saturation is a raw move. A packed byte destination region (B + * or UB type with HorzStride == 1 and ExecSize > 1) can only be written + * using raw move." + */ + bool + is_byte_raw_mov(const fs_inst *inst) + { + return type_sz(inst->dst.type) == 1 && + inst->opcode == BRW_OPCODE_MOV && + inst->src[0].type == inst->dst.type && + !inst->saturate && + !inst->src[0].negate && + !inst->src[0].abs; + } + + /* + * Return an acceptable byte stride for the destination of an instruction + * that requires it to have some particular alignment. + */ + unsigned + required_dst_byte_stride(const fs_inst *inst) + { + if (type_sz(inst->dst.type) < get_exec_type_size(inst) && + !is_byte_raw_mov(inst)) { + return get_exec_type_size(inst); + } else { + unsigned stride = inst->dst.stride * type_sz(inst->dst.type); + + for (unsigned i = 0; i < inst->sources; i++) { + if (!is_uniform(inst->src[i])) + stride = MAX2(stride, inst->src[i].stride * + type_sz(inst->src[i].type)); + } + + return stride; + } + } + + /* + * Return an acceptable byte sub-register offset for the destination of an + * instruction that requires it to be aligned to the sub-register offset of + * the sources. + */ + unsigned + required_dst_byte_offset(const fs_inst *inst) + { + for (unsigned i = 0; i < inst->sources; i++) { + if (!is_uniform(inst->src[i])) + if (reg_offset(inst->src[i]) % REG_SIZE != + reg_offset(inst->dst) % REG_SIZE) + return 0; + } + + return reg_offset(inst->dst) % REG_SIZE; + } + + /* + * Return whether the instruction has an unsupported channel bit layout + * specified for the i-th source region. + */ + bool + has_invalid_src_region(const gen_device_info *devinfo, const fs_inst *inst, + unsigned i) + { + if (is_unordered(inst)) { + return false; + } else { + const unsigned dst_byte_stride = inst->dst.stride * type_sz(inst->dst.type); + const unsigned src_byte_stride = inst->src[i].stride * + type_sz(inst->src[i].type); + const unsigned dst_byte_offset = reg_offset(inst->dst) % REG_SIZE; + const unsigned src_byte_offset = reg_offset(inst->src[i]) % REG_SIZE; + + return has_dst_aligned_region_restriction(devinfo, inst) && + !is_uniform(inst->src[i]) && + (src_byte_stride != dst_byte_stride || + src_byte_offset != dst_byte_offset); + } + } + + /* + * Return whether the instruction has an unsupported channel bit layout + * specified for the destination region. + */ + bool + has_invalid_dst_region(const gen_device_info *devinfo, + const fs_inst *inst) + { + if (is_unordered(inst)) { + return false; + } else { + const brw_reg_type exec_type = get_exec_type(inst); + const unsigned dst_byte_offset = reg_offset(inst->dst) % REG_SIZE; + const unsigned dst_byte_stride = inst->dst.stride * type_sz(inst->dst.type); + const bool is_narrowing_conversion = !is_byte_raw_mov(inst) && + type_sz(inst->dst.type) < type_sz(exec_type); + + return (has_dst_aligned_region_restriction(devinfo, inst) && + (required_dst_byte_stride(inst) != dst_byte_stride || + required_dst_byte_offset(inst) != dst_byte_offset)) || + (is_narrowing_conversion && + required_dst_byte_stride(inst) != dst_byte_stride); + } + } + + /* + * Return whether the instruction has unsupported source modifiers + * specified for the i-th source region. + */ + bool + has_invalid_src_modifiers(const gen_device_info *devinfo, const fs_inst *inst, + unsigned i) + { + return !inst->can_do_source_mods(devinfo) && + (inst->src[i].negate || inst->src[i].abs); + } + + /* + * Return whether the instruction has an unsupported type conversion + * specified for the destination. + */ + bool + has_invalid_conversion(const gen_device_info *devinfo, const fs_inst *inst) + { + switch (inst->opcode) { + case BRW_OPCODE_MOV: + return false; + case BRW_OPCODE_SEL: + return inst->dst.type != get_exec_type(inst); + case SHADER_OPCODE_BROADCAST: + case SHADER_OPCODE_MOV_INDIRECT: + /* The source and destination types of these may be hard-coded to + * integer at codegen time due to hardware limitations of 64-bit + * types. + */ + return ((devinfo->gen == 7 && !devinfo->is_haswell) || + devinfo->is_cherryview || gen_device_info_is_9lp(devinfo)) && + type_sz(inst->src[0].type) > 4 && + inst->dst.type != inst->src[0].type; + default: + /* FIXME: We assume the opcodes don't explicitly mentioned before + * just work fine with arbitrary conversions. + */ + return false; + } + } + + /** + * Return whether the instruction has non-standard semantics for the + * conditional mod which don't cause the flag register to be updated with + * the comparison result. + */ + bool + has_inconsistent_cmod(const fs_inst *inst) + { + return inst->opcode == BRW_OPCODE_SEL || + inst->opcode == BRW_OPCODE_CSEL || + inst->opcode == BRW_OPCODE_IF || + inst->opcode == BRW_OPCODE_WHILE; + } + + bool + lower_instruction(fs_visitor *v, bblock_t *block, fs_inst *inst); +} + +namespace brw { + /** + * Remove any modifiers from the \p i-th source region of the instruction, + * including negate, abs and any implicit type conversion to the execution + * type. Instead any source modifiers will be implemented as a separate + * MOV instruction prior to the original instruction. + */ + bool + lower_src_modifiers(fs_visitor *v, bblock_t *block, fs_inst *inst, unsigned i) + { + assert(inst->components_read(i) == 1); + const fs_builder ibld(v, block, inst); + const fs_reg tmp = ibld.vgrf(get_exec_type(inst)); + + lower_instruction(v, block, ibld.MOV(tmp, inst->src[i])); + inst->src[i] = tmp; + + return true; + } +} + +namespace { + /** + * Remove any modifiers from the destination region of the instruction, + * including saturate, conditional mod and any implicit type conversion + * from the execution type. Instead any destination modifiers will be + * implemented as a separate MOV instruction after the original + * instruction. + */ + bool + lower_dst_modifiers(fs_visitor *v, bblock_t *block, fs_inst *inst) + { + const fs_builder ibld(v, block, inst); + const brw_reg_type type = get_exec_type(inst); + /* Not strictly necessary, but if possible use a temporary with the same + * channel alignment as the current destination in order to avoid + * violating the restrictions enforced later on by lower_src_region() + * and lower_dst_region(), which would introduce additional copy + * instructions into the program unnecessarily. + */ + const unsigned stride = + type_sz(inst->dst.type) * inst->dst.stride <= type_sz(type) ? 1 : + type_sz(inst->dst.type) * inst->dst.stride / type_sz(type); + const fs_reg tmp = horiz_stride(ibld.vgrf(type, stride), stride); + + /* Emit a MOV taking care of all the destination modifiers. */ + fs_inst *mov = ibld.at(block, inst->next).MOV(inst->dst, tmp); + mov->saturate = inst->saturate; + if (!has_inconsistent_cmod(inst)) + mov->conditional_mod = inst->conditional_mod; + if (inst->opcode != BRW_OPCODE_SEL) { + mov->predicate = inst->predicate; + mov->predicate_inverse = inst->predicate_inverse; + } + mov->flag_subreg = inst->flag_subreg; + lower_instruction(v, block, mov); + + /* Point the original instruction at the temporary, and clean up any + * destination modifiers. + */ + assert(inst->size_written == inst->dst.component_size(inst->exec_size)); + inst->dst = tmp; + inst->size_written = inst->dst.component_size(inst->exec_size); + inst->saturate = false; + if (!has_inconsistent_cmod(inst)) + inst->conditional_mod = BRW_CONDITIONAL_NONE; + + assert(!inst->flags_written() || !mov->predicate); + return true; + } + + /** + * Remove any non-trivial shuffling of data from the \p i-th source region + * of the instruction. Instead implement the region as a series of integer + * copies into a temporary with the same channel layout as the destination. + */ + bool + lower_src_region(fs_visitor *v, bblock_t *block, fs_inst *inst, unsigned i) + { + assert(inst->components_read(i) == 1); + const fs_builder ibld(v, block, inst); + const unsigned stride = type_sz(inst->dst.type) * inst->dst.stride / + type_sz(inst->src[i].type); + assert(stride > 0); + const fs_reg tmp = horiz_stride(ibld.vgrf(inst->src[i].type, stride), + stride); + + /* Emit a series of 32-bit integer copies with any source modifiers + * cleaned up (because their semantics are dependent on the type). + */ + const brw_reg_type raw_type = brw_int_type(MIN2(type_sz(tmp.type), 4), + false); + const unsigned n = type_sz(tmp.type) / type_sz(raw_type); + fs_reg raw_src = inst->src[i]; + raw_src.negate = false; + raw_src.abs = false; + + for (unsigned j = 0; j < n; j++) + ibld.MOV(subscript(tmp, raw_type, j), subscript(raw_src, raw_type, j)); + + /* Point the original instruction at the temporary, making sure to keep + * any source modifiers in the instruction. + */ + fs_reg lower_src = tmp; + lower_src.negate = inst->src[i].negate; + lower_src.abs = inst->src[i].abs; + inst->src[i] = lower_src; + + return true; + } + + /** + * Remove any non-trivial shuffling of data from the destination region of + * the instruction. Instead implement the region as a series of integer + * copies from a temporary with a channel layout compatible with the + * sources. + */ + bool + lower_dst_region(fs_visitor *v, bblock_t *block, fs_inst *inst) + { + const fs_builder ibld(v, block, inst); + const unsigned stride = required_dst_byte_stride(inst) / + type_sz(inst->dst.type); + assert(stride > 0); + const fs_reg tmp = horiz_stride(ibld.vgrf(inst->dst.type, stride), + stride); + + /* Emit a series of 32-bit integer copies from the temporary into the + * original destination. + */ + const brw_reg_type raw_type = brw_int_type(MIN2(type_sz(tmp.type), 4), + false); + const unsigned n = type_sz(tmp.type) / type_sz(raw_type); + + if (inst->predicate && inst->opcode != BRW_OPCODE_SEL) { + /* Note that in general we cannot simply predicate the copies on the + * same flag register as the original instruction, since it may have + * been overwritten by the instruction itself. Instead initialize + * the temporary with the previous contents of the destination + * register. + */ + for (unsigned j = 0; j < n; j++) + ibld.MOV(subscript(tmp, raw_type, j), + subscript(inst->dst, raw_type, j)); + } + + for (unsigned j = 0; j < n; j++) + ibld.at(block, inst->next).MOV(subscript(inst->dst, raw_type, j), + subscript(tmp, raw_type, j)); + + /* Point the original instruction at the temporary, making sure to keep + * any destination modifiers in the instruction. + */ + assert(inst->size_written == inst->dst.component_size(inst->exec_size)); + inst->dst = tmp; + inst->size_written = inst->dst.component_size(inst->exec_size); + + return true; + } + + /** + * Legalize the source and destination regioning controls of the specified + * instruction. + */ + bool + lower_instruction(fs_visitor *v, bblock_t *block, fs_inst *inst) + { + const gen_device_info *devinfo = v->devinfo; + bool progress = false; + + if (has_invalid_conversion(devinfo, inst)) + progress |= lower_dst_modifiers(v, block, inst); + + if (has_invalid_dst_region(devinfo, inst)) + progress |= lower_dst_region(v, block, inst); + + for (unsigned i = 0; i < inst->sources; i++) { + if (has_invalid_src_modifiers(devinfo, inst, i)) + progress |= lower_src_modifiers(v, block, inst, i); + + if (has_invalid_src_region(devinfo, inst, i)) + progress |= lower_src_region(v, block, inst, i); + } + + return progress; + } +} + +bool +fs_visitor::lower_regioning() +{ + bool progress = false; + + foreach_block_and_inst_safe(block, fs_inst, inst, cfg) + progress |= lower_instruction(this, block, inst); + + if (progress) + invalidate_live_intervals(); + + return progress; +} diff --git a/src/intel/compiler/brw_ir_fs.h b/src/intel/compiler/brw_ir_fs.h index 5bb92e4cc86..3c23fb375e4 100644 --- a/src/intel/compiler/brw_ir_fs.h +++ b/src/intel/compiler/brw_ir_fs.h @@ -487,6 +487,16 @@ get_exec_type_size(const fs_inst *inst) } /** + * Return whether the instruction isn't an ALU instruction and cannot be + * assumed to complete in-order. + */ +static inline bool +is_unordered(const fs_inst *inst) +{ + return inst->mlen || inst->is_send_from_grf() || inst->is_math(); +} + +/** * Return whether the following regioning restriction applies to the specified * instruction. From the Cherryview PRM Vol 7. "Register Region * Restrictions": diff --git a/src/intel/compiler/meson.build b/src/intel/compiler/meson.build index 69ce2eab4cf..4af134b418e 100644 --- a/src/intel/compiler/meson.build +++ b/src/intel/compiler/meson.build @@ -57,6 +57,7 @@ libintel_compiler_files = files( 'brw_fs_live_variables.h', 'brw_fs_lower_conversions.cpp', 'brw_fs_lower_pack.cpp', + 'brw_fs_lower_regioning.cpp', 'brw_fs_nir.cpp', 'brw_fs_reg_allocate.cpp', 'brw_fs_register_coalesce.cpp', |