author     Rob Clark <[email protected]>    2018-02-05 08:45:29 -0500
committer  Rob Clark <[email protected]>    2018-02-10 14:54:58 -0500
commit     c57ed8e01cb40ef9a422346c3d304c1a3cc1f418 (patch)
tree       7cc5f80c53b0e7a3b27a343804b2d5669890dbab /src/gallium/drivers/freedreno/ir3
parent     2a2099a875568f2d979f0030aa6291ed88366046 (diff)
freedreno/ir3: intra-block scheduling
Because of loops, we can't schedule all of a block's predecessors first.
Instead just assume that the result consumed in a block was written far
enough away in all paths into a block.  And do an intra-block scheduling
pass to figure out if there are any cases where we need to insert extra
nop's.  This works out better than always assuming the worst case (ie.
that a value live into a block was written in the last instruction in
the predecessor block).

Signed-off-by: Rob Clark <[email protected]>
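Roughly, the fixup pass described above boils down to: for each instruction near the top of a block, measure how far back its producer is along every path into the block, take the shortest such distance (the worst case), and pad with nops for whatever remains of the required delay. A minimal standalone sketch of that calculation, using toy types and hypothetical names (toy_block, nops_needed) rather than the actual ir3 structures:

/*
 * Illustrative sketch only (toy types, not the ir3 API): model the
 * nop-padding decision of the second pass.  Each entry of
 * dist_from_producer[] is the number of instruction slots issued since
 * the value was written, measured along one predecessor path.
 */
#include <stdio.h>

struct toy_block {
	unsigned num_preds;
	unsigned dist_from_producer[4];
};

static unsigned
nops_needed(const struct toy_block *b, unsigned delay_needed)
{
	unsigned worst = delay_needed;   /* assume fully covered until proven otherwise */

	for (unsigned i = 0; i < b->num_preds; i++) {
		if (b->dist_from_producer[i] < worst)
			worst = b->dist_from_producer[i];   /* shortest path is the worst case */
	}

	return delay_needed - worst;     /* extra nops required to make every path legal */
}

int main(void)
{
	/* value needs 3 delay slots; one predecessor path only covers 1 */
	struct toy_block b = { .num_preds = 2, .dist_from_producer = { 5, 1 } };

	printf("insert %u nop(s)\n", nops_needed(&b, 3));   /* prints "insert 2 nop(s)" */
	return 0;
}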
Diffstat (limited to 'src/gallium/drivers/freedreno/ir3')
-rw-r--r--  src/gallium/drivers/freedreno/ir3/ir3_sched.c  126
1 file changed, 104 insertions(+), 22 deletions(-)
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_sched.c b/src/gallium/drivers/freedreno/ir3/ir3_sched.c
index 72bd0b2ce07..9269f471c56 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_sched.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_sched.c
@@ -136,29 +136,73 @@ deepest(struct ir3_instruction **srcs, unsigned nsrcs)
return d;
}
+/**
+ * @block: the block to search in, starting from end; in first pass,
+ * this will be the block the instruction would be inserted into
+ * (but has not yet, ie. it only contains already scheduled
+ * instructions). For intra-block scheduling (second pass), this
+ * would be one of the predecessor blocks.
+ * @instr: the instruction to search for
+ * @maxd: max distance, bail after searching this # of instruction
+ * slots, since it means the instruction we are looking for is
+ * far enough away
+ * @pred: if true, recursively search into predecessor blocks to
+ * find the worst case (shortest) distance (only possible after
+ * individual blocks are all scheduled)
+ */
static unsigned
-distance(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr,
- unsigned maxd)
+distance(struct ir3_block *block, struct ir3_instruction *instr,
+ unsigned maxd, bool pred)
{
- struct list_head *instr_list = &ctx->block->instr_list;
unsigned d = 0;
- list_for_each_entry_rev (struct ir3_instruction, n, instr_list, node) {
+ list_for_each_entry_rev (struct ir3_instruction, n, &block->instr_list, node) {
if ((n == instr) || (d >= maxd))
- break;
- if (is_alu(n) || is_flow(n))
+ return d;
+ /* NOTE: don't count branch/jump since we don't know yet if they will
+ * be eliminated later in resolve_jumps().. really should do that
+ * earlier so we don't have this constraint.
+ */
+ if (is_alu(n) || (is_flow(n) && (n->opc != OPC_JUMP) && (n->opc != OPC_BR)))
d++;
}
+ /* if coming from a predecessor block, assume it is assigned far
+ * enough away.. we'll fix up later.
+ */
+ if (!pred)
+ return maxd;
+
+ if (pred && (block->data != block)) {
+ /* Search into predecessor blocks, finding the one with the
+ * shortest distance, since that will be the worst case
+ */
+ unsigned min = maxd - d;
+
+ /* (ab)use block->data to prevent recursion: */
+ block->data = block;
+
+ for (unsigned i = 0; i < block->predecessors_count; i++) {
+ unsigned n;
+
+ n = distance(block->predecessors[i], instr, min, pred);
+
+ min = MIN2(min, n);
+ }
+
+ block->data = NULL;
+ d += min;
+ }
+
return d;
}
/* calculate delay for specified src: */
static unsigned
-delay_calc_srcn(struct ir3_sched_ctx *ctx,
+delay_calc_srcn(struct ir3_block *block,
struct ir3_instruction *assigner,
struct ir3_instruction *consumer,
- unsigned srcn, bool soft)
+ unsigned srcn, bool soft, bool pred)
{
unsigned delay = 0;
@@ -166,9 +210,7 @@ delay_calc_srcn(struct ir3_sched_ctx *ctx,
struct ir3_instruction *src;
foreach_ssa_src(src, assigner) {
unsigned d;
- if (src->block != assigner->block)
- break;
- d = delay_calc_srcn(ctx, src, consumer, srcn, soft);
+ d = delay_calc_srcn(block, src, consumer, srcn, soft, pred);
delay = MAX2(delay, d);
}
} else {
@@ -181,7 +223,7 @@ delay_calc_srcn(struct ir3_sched_ctx *ctx,
} else {
delay = ir3_delayslots(assigner, consumer, srcn);
}
- delay -= distance(ctx, assigner, delay);
+ delay -= distance(block, assigner, delay, pred);
}
return delay;
@@ -189,19 +231,15 @@ delay_calc_srcn(struct ir3_sched_ctx *ctx,
/* calculate delay for instruction (maximum of delay for all srcs): */
static unsigned
-delay_calc(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr, bool soft)
+delay_calc(struct ir3_block *block, struct ir3_instruction *instr,
+ bool soft, bool pred)
{
unsigned delay = 0;
struct ir3_instruction *src;
foreach_ssa_src_n(src, i, instr) {
unsigned d;
- /* for array writes, no need to delay on previous write: */
- if (i == 0)
- continue;
- if (src->block != instr->block)
- continue;
- d = delay_calc_srcn(ctx, src, instr, i, soft);
+ d = delay_calc_srcn(block, src, instr, i, soft, pred);
delay = MAX2(delay, d);
}
@@ -396,7 +434,7 @@ find_eligible_instr(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
if (!candidate)
continue;
- delay = delay_calc(ctx, candidate, soft);
+ delay = delay_calc(ctx->block, candidate, soft, false);
if (delay < min_delay) {
best_instr = candidate;
min_delay = delay;
@@ -537,7 +575,7 @@ sched_block(struct ir3_sched_ctx *ctx, struct ir3_block *block)
instr = find_eligible_instr(ctx, &notes, false);
if (instr) {
- unsigned delay = delay_calc(ctx, instr, false);
+ unsigned delay = delay_calc(ctx->block, instr, false, false);
/* and if we run out of instructions that can be scheduled,
* then it is time for nop's:
@@ -594,7 +632,7 @@ sched_block(struct ir3_sched_ctx *ctx, struct ir3_block *block)
debug_assert(ctx->pred);
debug_assert(block->condition);
- delay -= distance(ctx, ctx->pred, delay);
+ delay -= distance(ctx->block, ctx->pred, delay, false);
while (delay > 0) {
ir3_NOP(block);
@@ -633,14 +671,58 @@ sched_block(struct ir3_sched_ctx *ctx, struct ir3_block *block)
*/
}
+/* After scheduling individual blocks, we still could have cases where,
+ * along one (or more) paths into a block, a value produced by a previous
+ * block has too few delay slots to be legal.  We can't deal with this in the
+ * first pass, because of loops (ie. we can't ensure all predecessor blocks
+ * are already scheduled in the first pass). All we can really do at
+ * this point is stuff in extra nop's until things are legal.
+ */
+static void
+sched_intra_block(struct ir3_sched_ctx *ctx, struct ir3_block *block)
+{
+ unsigned n = 0;
+
+ ctx->block = block;
+
+ list_for_each_entry_safe (struct ir3_instruction, instr, &block->instr_list, node) {
+ unsigned delay = 0;
+
+ for (unsigned i = 0; i < block->predecessors_count; i++) {
+ unsigned d = delay_calc(block->predecessors[i], instr, false, true);
+ delay = MAX2(d, delay);
+ }
+
+ while (delay > n) {
+ struct ir3_instruction *nop = ir3_NOP(block);
+
+ /* move to before instr: */
+ list_delinit(&nop->node);
+ list_addtail(&nop->node, &instr->node);
+
+ n++;
+ }
+
+ /* we can bail once we hit worst case delay: */
+ if (++n > 6)
+ break;
+ }
+}
+
int ir3_sched(struct ir3 *ir)
{
struct ir3_sched_ctx ctx = {0};
ir3_clear_mark(ir);
+
list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
sched_block(&ctx, block);
}
+
+ list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
+ sched_intra_block(&ctx, block);
+ }
+
if (ctx.error)
return -1;
return 0;