diff options
Diffstat (limited to 'src')
-rw-r--r-- | src/intel/compiler/brw_fs_reg_allocate.cpp | 422 |
1 files changed, 205 insertions, 217 deletions
diff --git a/src/intel/compiler/brw_fs_reg_allocate.cpp b/src/intel/compiler/brw_fs_reg_allocate.cpp index 66a92095ab8..7173772331f 100644 --- a/src/intel/compiler/brw_fs_reg_allocate.cpp +++ b/src/intel/compiler/brw_fs_reg_allocate.cpp @@ -399,8 +399,19 @@ public: fs(fs), devinfo(fs->devinfo), compiler(fs->compiler), g(NULL) { mem_ctx = ralloc_context(NULL); + + /* Most of this allocation was written for a reg_width of 1 + * (dispatch_width == 8). In extending to SIMD16, the code was + * left in place and it was converted to have the hardware + * registers it's allocating be contiguous physical pairs of regs + * for reg_width == 2. + */ int reg_width = fs->dispatch_width / 8; rsi = _mesa_logbase2(reg_width); + payload_node_count = ALIGN(fs->first_non_payload_grf, reg_width); + + /* Get payload IP information */ + payload_last_use_ip = ralloc_array(mem_ctx, int, payload_node_count); } ~fs_reg_alloc() @@ -411,10 +422,10 @@ public: bool assign_regs(bool allow_spilling, bool spill_all); private: - void setup_payload_interference(int payload_node_count, - int first_payload_node); - void setup_mrf_hack_interference(int first_mrf_node, - int *first_used_mrf); + void setup_live_interference(unsigned node, + int node_start_ip, int node_end_ip); + void setup_inst_interference(fs_inst *inst); + void build_interference_graph(bool allow_spilling); int choose_spill_reg(); @@ -429,70 +440,15 @@ private: int rsi; ra_graph *g; -}; - - -/** - * Sets up interference between thread payload registers and the virtual GRFs - * to be allocated for program temporaries. - * - * We want to be able to reallocate the payload for our virtual GRFs, notably - * because the setup coefficients for a full set of 16 FS inputs takes up 8 of - * our 128 registers. - * - * The layout of the payload registers is: - * - * 0..payload.num_regs-1: fixed function setup (including bary coordinates). 
- * payload.num_regs..payload.num_regs+curb_read_lengh-1: uniform data - * payload.num_regs+curb_read_lengh..first_non_payload_grf-1: setup coefficients. - * - * And we have payload_node_count nodes covering these registers in order - * (note that in SIMD16, a node is two registers). - */ -void -fs_reg_alloc::setup_payload_interference(int payload_node_count, - int first_payload_node) -{ - int payload_last_use_ip[payload_node_count]; - fs->calculate_payload_ranges(payload_node_count, payload_last_use_ip); - for (int i = 0; i < payload_node_count; i++) { - if (payload_last_use_ip[i] == -1) - continue; + int payload_node_count; + int *payload_last_use_ip; - /* Mark the payload node as interfering with any virtual grf that is - * live between the start of the program and our last use of the payload - * node. - */ - for (unsigned j = 0; j < fs->alloc.count; j++) { - /* Note that we use a <= comparison, unlike virtual_grf_interferes(), - * in order to not have to worry about the uniform issue described in - * calculate_live_intervals(). - */ - if (fs->virtual_grf_start[j] <= payload_last_use_ip[i]) { - ra_add_node_interference(g, first_payload_node + i, j); - } - } - } - - for (int i = 0; i < payload_node_count; i++) { - /* Mark each payload node as being allocated to its physical register. - * - * The alternative would be to have per-physical-register classes, which - * would just be silly. - */ - if (devinfo->gen <= 5 && fs->dispatch_width >= 16) { - /* We have to divide by 2 here because we only have even numbered - * registers. Some of the payload registers will be odd, but - * that's ok because their physical register numbers have already - * been assigned. The only thing this is used for is interference. 
- */ - ra_set_node_reg(g, first_payload_node + i, i / 2); - } else { - ra_set_node_reg(g, first_payload_node + i, i); - } - } -} + int node_count; + int first_payload_node; + int first_mrf_hack_node; + int grf127_send_hack_node; +}; /** * Sets the mrf_used array to indicate which MRFs are used by the shader IR @@ -571,135 +527,54 @@ namespace { } } -/** - * Sets interference between virtual GRFs and usage of the high GRFs for SEND - * messages (treated as MRFs in code generation). - */ -void -fs_reg_alloc::setup_mrf_hack_interference(int first_mrf_node, - int *first_used_mrf) -{ - *first_used_mrf = spill_base_mrf(fs); - for (int i = spill_base_mrf(fs); i < BRW_MAX_MRF(devinfo->gen); i++) { - /* Mark each MRF reg node as being allocated to its physical register. - * - * The alternative would be to have per-physical-register classes, which - * would just be silly. - */ - ra_set_node_reg(g, first_mrf_node + i, GEN7_MRF_HACK_START + i); - - for (unsigned j = 0; j < fs->alloc.count; j++) - ra_add_node_interference(g, first_mrf_node + i, j); - } -} - void -fs_reg_alloc::build_interference_graph(bool allow_spilling) +fs_reg_alloc::setup_live_interference(unsigned node, + int node_start_ip, int node_end_ip) { - const gen_device_info *devinfo = fs->devinfo; - const brw_compiler *compiler = fs->compiler; - - /* Most of this allocation was written for a reg_width of 1 - * (dispatch_width == 8). In extending to SIMD16, the code was - * left in place and it was converted to have the hardware - * registers it's allocating be contiguous physical pairs of regs - * for reg_width == 2. + /* Mark any virtual grf that is live between the start of the program and + * the last use of a payload node interfering with that payload node. 
*/ - int reg_width = fs->dispatch_width / 8; - int payload_node_count = ALIGN(fs->first_non_payload_grf, reg_width); - - fs->calculate_live_intervals(); - - int node_count = fs->alloc.count; - int first_payload_node = node_count; - node_count += payload_node_count; - int first_mrf_hack_node = node_count; - if (devinfo->gen >= 7) - node_count += BRW_MAX_GRF - GEN7_MRF_HACK_START; - int grf127_send_hack_node = node_count; - if (devinfo->gen >= 8) - node_count ++; - - assert(g == NULL); - g = ra_alloc_interference_graph(compiler->fs_reg_sets[rsi].regs, node_count); - ralloc_steal(mem_ctx, g); - - for (unsigned i = 0; i < fs->alloc.count; i++) { - unsigned size = fs->alloc.sizes[i]; - int c; - - assert(size <= ARRAY_SIZE(compiler->fs_reg_sets[rsi].classes) && - "Register allocation relies on split_virtual_grfs()"); - c = compiler->fs_reg_sets[rsi].classes[size - 1]; + for (int i = 0; i < payload_node_count; i++) { + if (payload_last_use_ip[i] == -1) + continue; - /* Special case: on pre-GEN6 hardware that supports PLN, the - * second operand of a PLN instruction needs to be an - * even-numbered register, so we have a special register class - * wm_aligned_pairs_class to handle this case. pre-GEN6 always - * uses fs->delta_xy[BRW_BARYCENTRIC_PERSPECTIVE_PIXEL] as the - * second operand of a PLN instruction (since it doesn't support - * any other interpolation modes). So all we need to do is find - * that register and set it to the appropriate class. + /* Note that we use a <= comparison, unlike virtual_grf_interferes(), + * in order to not have to worry about the uniform issue described in + * calculate_live_intervals(). 
*/ - if (compiler->fs_reg_sets[rsi].aligned_pairs_class >= 0 && - fs->delta_xy[BRW_BARYCENTRIC_PERSPECTIVE_PIXEL].file == VGRF && - fs->delta_xy[BRW_BARYCENTRIC_PERSPECTIVE_PIXEL].nr == i) { - c = compiler->fs_reg_sets[rsi].aligned_pairs_class; - } + if (node_start_ip <= payload_last_use_ip[i]) + ra_add_node_interference(g, node, first_payload_node + i); + } - ra_set_node_class(g, i, c); + /* If we have the MRF hack enabled, mark this node as interfering with all + * MRF registers. + */ + if (first_mrf_hack_node >= 0) { + for (int i = spill_base_mrf(fs); i < BRW_MAX_MRF(devinfo->gen); i++) + ra_add_node_interference(g, node, first_mrf_hack_node + i); + } - for (unsigned j = 0; j < i; j++) { - if (fs->virtual_grf_interferes(i, j)) { - ra_add_node_interference(g, i, j); - } - } + /* Add interference with every vgrf whose live range intersects this + * node's. We only need to look at nodes below this one as the symmetry + * of interference will take care of the rest. + */ + for (unsigned i = 0; i < node; i++) { + if (!(node_end_ip <= fs->virtual_grf_start[i] || + fs->virtual_grf_end[i] <= node_start_ip)) + ra_add_node_interference(g, node, i); } +} +void +fs_reg_alloc::setup_inst_interference(fs_inst *inst) +{ /* Certain instructions can't safely use the same register for their * sources and destination. Add interference. 
*/ - foreach_block_and_inst(block, fs_inst, inst, fs->cfg) { - if (inst->dst.file == VGRF && inst->has_source_and_destination_hazard()) { - for (unsigned i = 0; i < inst->sources; i++) { - if (inst->src[i].file == VGRF) { - ra_add_node_interference(g, inst->dst.nr, inst->src[i].nr); - } - } - } - } - - setup_payload_interference(payload_node_count, first_payload_node); - if (devinfo->gen >= 7) { - int first_used_mrf = BRW_MAX_MRF(devinfo->gen); - if (allow_spilling) - setup_mrf_hack_interference(first_mrf_hack_node, &first_used_mrf); - - foreach_block_and_inst(block, fs_inst, inst, fs->cfg) { - /* When we do send-from-GRF for FB writes, we need to ensure that - * the last write instruction sends from a high register. This is - * because the vertex fetcher wants to start filling the low - * payload registers while the pixel data port is still working on - * writing out the memory. If we don't do this, we get rendering - * artifacts. - * - * We could just do "something high". Instead, we just pick the - * highest register that works. - */ - if (inst->eot) { - const int vgrf = inst->opcode == SHADER_OPCODE_SEND ? - inst->src[2].nr : inst->src[0].nr; - int size = fs->alloc.sizes[vgrf]; - int reg = compiler->fs_reg_sets[rsi].class_to_ra_reg_range[size] - 1; - - /* If something happened to spill, we want to push the EOT send - * register early enough in the register file that we don't - * conflict with any used MRF hack registers. - */ - reg -= BRW_MAX_MRF(devinfo->gen) - first_used_mrf; - - ra_set_node_reg(g, vgrf, reg); - break; + if (inst->dst.file == VGRF && inst->has_source_and_destination_hazard()) { + for (unsigned i = 0; i < inst->sources; i++) { + if (inst->src[i].file == VGRF) { + ra_add_node_interference(g, inst->dst.nr, inst->src[i].nr); } } } @@ -715,10 +590,7 @@ fs_reg_alloc::build_interference_graph(bool allow_spilling) * about this level of granularity, we simply make the source and * destination interfere. 
*/ - foreach_block_and_inst(block, fs_inst, inst, fs->cfg) { - if (inst->exec_size < 16 || inst->dst.file != VGRF) - continue; - + if (inst->exec_size >= 16 && inst->dst.file == VGRF) { for (int i = 0; i < inst->sources; ++i) { if (inst->src[i].file == VGRF) { ra_add_node_interference(g, inst->dst.nr, inst->src[i].nr); @@ -726,7 +598,7 @@ fs_reg_alloc::build_interference_graph(bool allow_spilling) } } - if (devinfo->gen >= 8) { + if (grf127_send_hack_node >= 0) { /* At Intel Broadwell PRM, vol 07, section "Instruction Set Reference", * subsection "EUISA Instructions", Send Message (page 990): * @@ -740,28 +612,20 @@ fs_reg_alloc::build_interference_graph(bool allow_spilling) * We don't apply it to SIMD16 instructions because previous code avoids * any register overlap between sources and destination. */ - ra_set_node_reg(g, grf127_send_hack_node, 127); - foreach_block_and_inst(block, fs_inst, inst, fs->cfg) { - if (inst->exec_size < 16 && inst->is_send_from_grf() && - inst->dst.file == VGRF) - ra_add_node_interference(g, inst->dst.nr, grf127_send_hack_node); - } - - if (fs->spilled_any_registers) { - foreach_block_and_inst(block, fs_inst, inst, fs->cfg) { - /* Spilling instruction are genereated as SEND messages from MRF - * but as Gen7+ supports sending from GRF the driver will maps - * assingn these MRF registers to a GRF. Implementations reuses - * the dest of the send message as source. So as we will have an - * overlap for sure, we create an interference between destination - * and grf127. 
- */ - if ((inst->opcode == SHADER_OPCODE_GEN7_SCRATCH_READ || - inst->opcode == SHADER_OPCODE_GEN4_SCRATCH_READ) && - inst->dst.file == VGRF) - ra_add_node_interference(g, inst->dst.nr, grf127_send_hack_node); - } - } + if (inst->exec_size < 16 && inst->is_send_from_grf() && + inst->dst.file == VGRF) + ra_add_node_interference(g, inst->dst.nr, grf127_send_hack_node); + + /* Spilling instructions are generated as SEND messages from MRF, but as + * Gen7+ supports sending from GRF the driver will assign these MRF + * registers to a GRF. Implementations reuse the dest of the send + * message as source. So as we will have an overlap for sure, we create + * an interference between destination and grf127. + */ + if ((inst->opcode == SHADER_OPCODE_GEN7_SCRATCH_READ || + inst->opcode == SHADER_OPCODE_GEN4_SCRATCH_READ) && + inst->dst.file == VGRF) + ra_add_node_interference(g, inst->dst.nr, grf127_send_hack_node); } /* From the Skylake PRM Vol. 2a docs for sends: @@ -776,15 +640,139 @@ fs_reg_alloc::build_interference_graph(bool allow_spilling) * interference here. */ if (devinfo->gen >= 9) { - foreach_block_and_inst(block, fs_inst, inst, fs->cfg) { - if (inst->opcode == SHADER_OPCODE_SEND && inst->ex_mlen > 0 && - inst->src[2].file == VGRF && - inst->src[3].file == VGRF && - inst->src[2].nr != inst->src[3].nr) - ra_add_node_interference(g, inst->src[2].nr, - inst->src[3].nr); + if (inst->opcode == SHADER_OPCODE_SEND && inst->ex_mlen > 0 && + inst->src[2].file == VGRF && inst->src[3].file == VGRF && + inst->src[2].nr != inst->src[3].nr) + ra_add_node_interference(g, inst->src[2].nr, + inst->src[3].nr); + } + + /* When we do send-from-GRF for FB writes, we need to ensure that the last + * write instruction sends from a high register. This is because the + * vertex fetcher wants to start filling the low payload registers while + * the pixel data port is still working on writing out the memory. If we + * don't do this, we get rendering artifacts. 
+ * + * We could just do "something high". Instead, we just pick the highest + * register that works. + */ + if (inst->eot) { + const int vgrf = inst->opcode == SHADER_OPCODE_SEND ? + inst->src[2].nr : inst->src[0].nr; + int size = fs->alloc.sizes[vgrf]; + int reg = compiler->fs_reg_sets[rsi].class_to_ra_reg_range[size] - 1; + + /* If something happened to spill, we want to push the EOT send + * register early enough in the register file that we don't + * conflict with any used MRF hack registers. + */ + if (first_mrf_hack_node >= 0) + reg -= BRW_MAX_MRF(devinfo->gen) - spill_base_mrf(fs); + + ra_set_node_reg(g, vgrf, reg); + } +} + +void +fs_reg_alloc::build_interference_graph(bool allow_spilling) +{ + const gen_device_info *devinfo = fs->devinfo; + const brw_compiler *compiler = fs->compiler; + + /* Compute the RA node layout */ + node_count = fs->alloc.count; + first_payload_node = node_count; + node_count += payload_node_count; + if (devinfo->gen >= 7 && allow_spilling) { + first_mrf_hack_node = node_count; + node_count += BRW_MAX_GRF - GEN7_MRF_HACK_START; + } else { + first_mrf_hack_node = -1; + } + if (devinfo->gen >= 8) { + grf127_send_hack_node = node_count; + node_count ++; + } else { + grf127_send_hack_node = -1; + } + + fs->calculate_live_intervals(); + fs->calculate_payload_ranges(payload_node_count, + payload_last_use_ip); + + assert(g == NULL); + g = ra_alloc_interference_graph(compiler->fs_reg_sets[rsi].regs, node_count); + ralloc_steal(mem_ctx, g); + + /* Set up the payload nodes */ + for (int i = 0; i < payload_node_count; i++) { + /* Mark each payload node as being allocated to its physical register. + * + * The alternative would be to have per-physical-register classes, which + * would just be silly. + */ + if (devinfo->gen <= 5 && fs->dispatch_width >= 16) { + /* We have to divide by 2 here because we only have even numbered + * registers. 
Some of the payload registers will be odd, but + * that's ok because their physical register numbers have already + * been assigned. The only thing this is used for is interference. + */ + ra_set_node_reg(g, first_payload_node + i, i / 2); + } else { + ra_set_node_reg(g, first_payload_node + i, i); + } + } + + if (first_mrf_hack_node >= 0) { + /* Mark each MRF reg node as being allocated to its physical + * register. + * + * The alternative would be to have per-physical-register classes, + * which would just be silly. + */ + for (int i = 0; i < BRW_MAX_MRF(devinfo->gen); i++) { + ra_set_node_reg(g, first_mrf_hack_node + i, + GEN7_MRF_HACK_START + i); + } + } + + if (grf127_send_hack_node >= 0) + ra_set_node_reg(g, grf127_send_hack_node, 127); + + for (unsigned i = 0; i < fs->alloc.count; i++) { + unsigned size = fs->alloc.sizes[i]; + int c; + + assert(size <= ARRAY_SIZE(compiler->fs_reg_sets[rsi].classes) && + "Register allocation relies on split_virtual_grfs()"); + c = compiler->fs_reg_sets[rsi].classes[size - 1]; + + /* Special case: on pre-GEN6 hardware that supports PLN, the + * second operand of a PLN instruction needs to be an + * even-numbered register, so we have a special register class + * wm_aligned_pairs_class to handle this case. pre-GEN6 always + * uses fs->delta_xy[BRW_BARYCENTRIC_PERSPECTIVE_PIXEL] as the + * second operand of a PLN instruction (since it doesn't support + * any other interpolation modes). So all we need to do is find + * that register and set it to the appropriate class. 
+ */ + if (compiler->fs_reg_sets[rsi].aligned_pairs_class >= 0 && + fs->delta_xy[BRW_BARYCENTRIC_PERSPECTIVE_PIXEL].file == VGRF && + fs->delta_xy[BRW_BARYCENTRIC_PERSPECTIVE_PIXEL].nr == i) { + c = compiler->fs_reg_sets[rsi].aligned_pairs_class; } + + ra_set_node_class(g, i, c); + + /* Add interference based on the live range of the register */ + setup_live_interference(i, fs->virtual_grf_start[i], + fs->virtual_grf_end[i]); } + + /* Add interference based on the instructions in which a register is used. + */ + foreach_block_and_inst(block, fs_inst, inst, fs->cfg) + setup_inst_interference(inst); } static void |