/*
 * Copyright (C) 2019 Google, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * Authors:
 *    Rob Clark
 */

#include "ir3.h"

/*
 * Helpers to figure out the necessary delay slots between instructions.  Used
 * both in scheduling pass(es) and the final pass to insert any required nop's
 * so that the shader program is valid.
 *
 * Note that this needs to work both pre- and post-RA, so we can't assume SSA
 * src iterators work.
 */

/* In general we don't count false dependencies, since these can just be
 * something like a barrier or SSBO store ordering.  The exception is array
 * dependencies: if the assigner is an array write and the consumer reads
 * the same array, the dependency is real and needs delay slots.
 */
static bool
ignore_dep(struct ir3_instruction *assigner,
		struct ir3_instruction *consumer, unsigned n)
{
	if (!__is_false_dep(consumer, n))
		return false;

	if (assigner->barrier_class & IR3_BARRIER_ARRAY_W) {
		struct ir3_register *dst = assigner->regs[0];

		debug_assert(dst->flags & IR3_REG_ARRAY);

		foreach_src (src, consumer) {
			if ((src->flags & IR3_REG_ARRAY) &&
					(dst->array.id == src->array.id)) {
				return false;
			}
		}
	}

	return true;
}

/* calculate required # of delay slots between the instruction that
 * assigns a value and the one that consumes it:
 */
int
ir3_delayslots(struct ir3_instruction *assigner,
		struct ir3_instruction *consumer, unsigned n, bool soft)
{
	if (ignore_dep(assigner, consumer, n))
		return 0;

	/* worst case is cat1-3 (alu) -> cat4/5 needing 6 cycles; normal
	 * alu -> alu needs 3 cycles; cat4 -> alu and texture fetch are
	 * handled with sync bits
	 */

	if (is_meta(assigner) || is_meta(consumer))
		return 0;

	if (writes_addr0(assigner) || writes_addr1(assigner))
		return 6;

	/* On a6xx, the number of delay slots needed to get an SFU result
	 * back (ie. using nop's instead of (ss)) is:
	 *
	 *     8 - single warp
	 *     9 - two warps
	 *    10 - four warps
	 *
	 * and so on.  Not quite sure where it tapers out (ie. how many
	 * warps share an SFU unit).  But 10 seems like a reasonable #
	 * to choose:
	 */
	if (soft && is_sfu(assigner))
		return 10;

	/* handled via sync flags: */
	if (is_sfu(assigner) || is_tex(assigner) || is_mem(assigner))
		return 0;

	/* assigner must be alu: */
	if (is_flow(consumer) || is_sfu(consumer) || is_tex(consumer) ||
			is_mem(consumer)) {
		return 6;
	} else if ((is_mad(consumer->opc) || is_madsh(consumer->opc)) &&
			(n == 3)) {
		/* special case, 3rd src to cat3 not required on first cycle */
		return 1;
	} else {
		return 3;
	}
}
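/* Illustrative example of the delay slot counts above (not part of the
 * pass itself): an alu instruction feeding the src of another alu
 * instruction needs 3 delay slots, so with no other useful work to
 * schedule in between, the final program would look like:
 *
 *    add.f r0.x, r0.y, r0.z
 *    nop
 *    nop
 *    nop
 *    mul.f r1.x, r0.x, r0.w
 *
 * whereas the same producer feeding the 3rd src of a cat3 (mad)
 * instruction would need only a single nop, since that src is not
 * read on the first cycle.
 */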
static bool
count_instruction(struct ir3_instruction *n)
{
	/* NOTE: don't count branch/jump since we don't know yet if they will
	 * be eliminated later in resolve_jumps().. really should do that
	 * earlier so we don't have this constraint.
	 */
	return is_alu(n) ||
		(is_flow(n) && (n->opc != OPC_JUMP) && (n->opc != OPC_B));
}

/**
 * @block: the block to search in, starting from end; in first pass,
 *    this will be the block the instruction would be inserted into
 *    (but has not yet, ie. it only contains already scheduled
 *    instructions).  For intra-block scheduling (second pass), this
 *    would be one of the predecessor blocks.
 * @instr: the instruction to search for
 * @maxd:  max distance, bail after searching this # of instruction
 *    slots, since it means the instruction we are looking for is
 *    far enough away
 * @pred:  if true, recursively search into predecessor blocks to
 *    find the worst case (shortest) distance (only possible after
 *    individual blocks are all scheduled)
 */
static unsigned
distance(struct ir3_block *block, struct ir3_instruction *instr,
		unsigned maxd, bool pred)
{
	unsigned d = 0;

	/* Note that this relies on incrementally building up the block's
	 * instruction list.. but this is how scheduling and nopsched
	 * work.
	 */
	foreach_instr_rev (n, &block->instr_list) {
		if ((n == instr) || (d >= maxd))
			return MIN2(maxd, d + n->nop);
		if (count_instruction(n))
			d = MIN2(maxd, d + 1 + n->repeat + n->nop);
	}

	/* if coming from a predecessor block, assume it is assigned far
	 * enough away.. we'll fix up later.
	 */
	if (!pred)
		return maxd;

	if (block->data != block) {
		/* Search into predecessor blocks, finding the one with the
		 * shortest distance, since that will be the worst case
		 */
		unsigned min = maxd - d;

		/* (ab)use block->data to prevent recursion: */
		block->data = block;

		set_foreach (block->predecessors, entry) {
			struct ir3_block *pred_block = (struct ir3_block *)entry->key;
			unsigned n;

			n = distance(pred_block, instr, min, pred);

			min = MIN2(min, n);
		}

		block->data = NULL;
		d += min;
	}

	return d;
}
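/* Example of the distance() calculation above (illustrative only): if
 * the block built up so far contains
 *
 *    add.f r0.x, r0.y, r0.z
 *    (rpt2)mov.f32f32 r1.x, (r)r0.x
 *
 * then distance(block, <the add.f>, 6, false) returns 3: the mov
 * counts as 1 + repeat (2) slots, and then the add.f itself is found.
 * If the assigner is not found within maxd counted slots (and we are
 * not recursing into predecessors), maxd is returned, since any
 * required delay is already satisfied.
 */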
/* calculate delay for specified src: */
static unsigned
delay_calc_srcn(struct ir3_block *block,
		struct ir3_instruction *assigner,
		struct ir3_instruction *consumer,
		unsigned srcn, bool soft, bool pred)
{
	unsigned delay = 0;

	if (is_meta(assigner)) {
		foreach_src_n (src, n, assigner) {
			unsigned d;

			if (!src->instr)
				continue;

			d = delay_calc_srcn(block, src->instr, consumer, srcn, soft, pred);

			/* A (rptN) instruction executes in consecutive cycles,
			 * so its outputs are written in successive cycles.  And
			 * likewise for its (r)'d (incremented) inputs, which are
			 * read on successive cycles.
			 *
			 * So we need to adjust the delay for (rptN)'s assigners
			 * and consumers accordingly.
			 *
			 * Note that the dst of a (rptN) instruction is implicitly
			 * (r) (the assigner case), although that is not the case
			 * for src registers.  There is exactly one case, bary.f,
			 * which has a vecN (collect) src that is not (r)'d.
			 */
			if ((assigner->opc == OPC_META_SPLIT) && src->instr->repeat) {
				/* (rptN) assigner case: */
				d -= MIN2(d, src->instr->repeat - assigner->split.off);
			} else if ((assigner->opc == OPC_META_COLLECT) && consumer->repeat &&
					(consumer->regs[srcn]->flags & IR3_REG_R)) {
				d -= MIN2(d, n);
			}

			delay = MAX2(delay, d);
		}
	} else {
		delay = ir3_delayslots(assigner, consumer, srcn, soft);
		delay -= distance(block, assigner, delay, pred);
	}

	return delay;
}

static struct ir3_instruction *
find_array_write(struct ir3_block *block, unsigned array_id, unsigned maxd)
{
	unsigned d = 0;

	/* Note that this relies on incrementally building up the block's
	 * instruction list.. but this is how scheduling and nopsched
	 * work.
	 */
	foreach_instr_rev (n, &block->instr_list) {
		if (d >= maxd)
			return NULL;
		if (count_instruction(n))
			d++;
		if (dest_regs(n) == 0)
			continue;

		/* note that a dest reg will never be an immediate */
		if (n->regs[0]->array.id == array_id)
			return n;
	}

	return NULL;
}

/* like list_length() but only counts instructions which count in the
 * delay determination:
 */
static unsigned
count_block_delay(struct ir3_block *block)
{
	unsigned delay = 0;

	foreach_instr (n, &block->instr_list) {
		if (!count_instruction(n))
			continue;
		delay++;
	}

	return delay;
}

static unsigned
delay_calc_array(struct ir3_block *block, unsigned array_id,
		struct ir3_instruction *consumer, unsigned srcn,
		bool soft, bool pred, unsigned maxd)
{
	struct ir3_instruction *assigner;

	assigner = find_array_write(block, array_id, maxd);
	if (assigner)
		return delay_calc_srcn(block, assigner, consumer, srcn, soft, pred);

	if (!pred)
		return 0;

	unsigned len = count_block_delay(block);
	if (maxd <= len)
		return 0;

	maxd -= len;

	if (block->data == block) {
		/* we have a loop, return worst case: */
		return maxd;
	}

	/* If we need to search into predecessors, find the one with the
	 * max delay.. the resulting delay is that minus the number of
	 * counted instructions in this block:
	 */
	unsigned max = 0;

	/* (ab)use block->data to prevent recursion: */
	block->data = block;

	set_foreach (block->predecessors, entry) {
		struct ir3_block *pred_block = (struct ir3_block *)entry->key;
		unsigned delay =
			delay_calc_array(pred_block, array_id, consumer, srcn, soft, pred, maxd);
		max = MAX2(max, delay);
	}

	block->data = NULL;

	if (max < len)
		return 0;

	return max - len;
}

/**
 * Calculate delay for instruction (maximum of delay for all srcs):
 *
 * @soft:  If true, add additional delay for situations where a delay
 *    would not be strictly required because a sync flag would be used
 *    instead (but the scheduler would prefer to schedule some other
 *    instructions first to avoid stalling on the sync flag)
 * @pred:  If true, recurse into predecessor blocks
 */
unsigned
ir3_delay_calc(struct ir3_block *block, struct ir3_instruction *instr,
		bool soft, bool pred)
{
	unsigned delay = 0;

	foreach_src_n (src, i, instr) {
		unsigned d = 0;

		if ((src->flags & IR3_REG_RELATIV) && !(src->flags & IR3_REG_CONST)) {
			d = delay_calc_array(block, src->array.id, instr, i+1, soft, pred, 6);
		} else if (src->instr) {
			d = delay_calc_srcn(block, src->instr, instr, i+1, soft, pred);
		}

		delay = MAX2(delay, d);
	}

	if (instr->address) {
		unsigned d = delay_calc_srcn(block, instr->address, instr, 0, soft, pred);
		delay = MAX2(delay, d);
	}

	return delay;
}
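/* A minimal usage sketch for ir3_delay_calc() (hypothetical caller, not
 * part of this file): a legalize-style pass can pad each instruction
 * with exactly the nop's implied by the calculated delay.  Since nop's
 * carry a (rptN) repeat field, a single nop can cover multiple slots:
 *
 *    unsigned delay = ir3_delay_calc(block, n, false, true);
 *    if (delay > 0)
 *       ir3_NOP(block)->repeat = delay - 1;
 */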
The legalize pass can insert * some nop's if needed to hold (for example) sync flags. This final * remaining nops are inserted by legalize after this. */ void ir3_remove_nops(struct ir3 *ir) { foreach_block (block, &ir->block_list) { foreach_instr_safe (instr, &block->instr_list) { if (instr->opc == OPC_NOP) { list_del(&instr->node); } } } }