-rw-r--r--   src/freedreno/ir3/ir3.h         5
-rw-r--r--   src/freedreno/ir3/ir3_sched.c   123
2 files changed, 115 insertions, 13 deletions
diff --git a/src/freedreno/ir3/ir3.h b/src/freedreno/ir3/ir3.h
index f3c25ea2792..ccd102b8e44 100644
--- a/src/freedreno/ir3/ir3.h
+++ b/src/freedreno/ir3/ir3.h
@@ -292,6 +292,11 @@ struct ir3_instruction {
 	};
 
 	/* used for per-pass extra instruction data.
+	 *
+	 * TODO we should remove the per-pass data like this and 'use_count'
+	 * and do something similar to what RA does w/ ir3_ra_instr_data..
+	 * ie. use the ir3_count_instructions pass, and then use instr->ip
+	 * to index into a table of pass-private data.
 	 */
 	void *data;
 
diff --git a/src/freedreno/ir3/ir3_sched.c b/src/freedreno/ir3/ir3_sched.c
index 16199ca3fb9..1b07bf8c1dd 100644
--- a/src/freedreno/ir3/ir3_sched.c
+++ b/src/freedreno/ir3/ir3_sched.c
@@ -216,7 +216,7 @@ deepest(struct ir3_instruction **srcs, unsigned nsrcs)
 		return NULL;
 
 	for (; i < nsrcs; i++)
-		if (srcs[i] && (srcs[i]->sun > d->sun))
+		if (srcs[i] && (srcs[i]->depth > d->depth))
 			d = srcs[id = i];
 
 	srcs[id] = NULL;
@@ -500,13 +500,63 @@ find_instr_recursive(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
 	return NULL;
 }
 
+/* find net change to live values if instruction were scheduled: */
+static int
+live_effect(struct ir3_instruction *instr)
+{
+	struct ir3_instruction *src;
+	int new_live = dest_regs(instr);
+	int old_live = 0;
+
+	foreach_ssa_src_n(src, n, instr) {
+		if (__is_false_dep(instr, n))
+			continue;
+
+		if (instr->block != src->block)
+			continue;
+
+		/* for fanout/split, just pass things along to the real src: */
+		if (src->opc == OPC_META_FO)
+			src = ssa(src->regs[1]);
+
+		/* for fanin/collect, if this is the last use of *each* src,
+		 * then it will decrease the live values, since RA treats
+		 * them as a whole:
+		 */
+		if (src->opc == OPC_META_FI) {
+			struct ir3_instruction *src2;
+			bool last_use = true;
+
+			foreach_ssa_src(src2, src) {
+				if (src2->use_count > 1) {
+					last_use = false;
+					break;
+				}
+			}
+
+			if (last_use)
+				old_live += dest_regs(src);
+
+		} else {
+			debug_assert(src->use_count > 0);
+
+			if (src->use_count == 1) {
+				old_live += dest_regs(src);
+			}
+		}
+	}
+
+	return new_live - old_live;
+}
+
 /* find instruction to schedule: */
 static struct ir3_instruction *
 find_eligible_instr(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
 		bool soft)
 {
 	struct ir3_instruction *best_instr = NULL;
-	unsigned min_delay = ~0;
+	int best_rank = INT_MAX; /* lower is better */
+	unsigned deepest = 0;
 
 	/* TODO we'd really rather use the list/array of block outputs.  But we
 	 * don't have such a thing.  Recursing *every* instruction in the list
@@ -516,23 +566,70 @@ find_eligible_instr(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
 	 */
 	list_for_each_entry_rev (struct ir3_instruction, instr, &ctx->depth_list, node) {
 		struct ir3_instruction *candidate;
-		unsigned delay;
 
 		candidate = find_instr_recursive(ctx, notes, instr);
 		if (!candidate)
 			continue;
 
-		if (ctx->live_values > 16*4) {
-			/* under register pressure, only care about reducing live values: */
-			if (!best_instr || (candidate->sun > best_instr->sun))
-				best_instr = candidate;
-		} else {
-			delay = delay_calc(ctx->block, candidate, soft, false);
-			if ((delay < min_delay) ||
-					((delay <= (min_delay + 2)) && (candidate->sun > best_instr->sun))) {
-				best_instr = candidate;
-				min_delay = delay;
+		deepest = MAX2(deepest, candidate->depth);
+	}
+
+	/* traverse the list a second time.. but since we cache the result of
+	 * find_instr_recursive() it isn't as bad as it looks.
+	 */
+	list_for_each_entry_rev (struct ir3_instruction, instr, &ctx->depth_list, node) {
+		struct ir3_instruction *candidate;
+
+		candidate = find_instr_recursive(ctx, notes, instr);
+		if (!candidate)
+			continue;
+
+		/* determine net change to # of live values: */
+		int le = live_effect(candidate);
+
+		/* if there is a net increase in # of live values, then apply some
+		 * threshold to avoid instructions getting scheduled *too* early
+		 * and increasing register pressure.
+		 */
+		if (le >= 1) {
+			unsigned threshold;
+
+			if (ctx->live_values > 4*4) {
+				threshold = 4;
+			} else {
+				threshold = 6;
 			}
+
+			/* Filter out any "shallow" instructions which would otherwise
+			 * tend to get scheduled too early to fill delay slots even
+			 * when they are not needed for a while.  There will probably
+			 * be later delay slots that they could just as easily fill.
+			 *
+			 * A classic case where this comes up is frag shaders that
+			 * write a constant value (like 1.0f) to one of the channels
+			 * of the output color(s).  Since the mov from immed has no
+			 * dependencies, it would otherwise get scheduled early to
+			 * fill delay slots, occupying a register until the end of
+			 * the program.
+			 */
+			if ((deepest - candidate->depth) > threshold)
+				continue;
+		}
+
+		int rank = delay_calc(ctx->block, candidate, soft, false);
+
+		/* if too many live values, prioritize instructions that reduce the
+		 * number of live values:
+		 */
+		if (ctx->live_values > 16*4) {
+			rank = le;
+		} else if (ctx->live_values > 4*4) {
+			rank += le;
+		}
+
+		if (rank < best_rank) {
+			best_instr = candidate;
+			best_rank = rank;
 		}
 	}
 
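Note on the ir3.h TODO: the pattern it points at (what RA does with ir3_ra_instr_data) is to number every instruction once and then index a pass-private array by instr->ip, instead of hanging per-pass data off each instruction. A minimal sketch of that pattern, assuming ir3_count_instructions() assigns instr->ip sequentially and returns the total; the sched_instr_data struct and helper names here are hypothetical, not part of this patch:

   #include <stdlib.h>
   #include "ir3.h"

   /* hypothetical pass-private data, indexed by instr->ip: */
   struct sched_instr_data {
      unsigned use_count;
      unsigned depth;
   };

   static struct sched_instr_data *sched_data;

   static void
   sched_data_init(struct ir3 *ir)
   {
      /* numbers each instr->ip and returns the instruction count: */
      unsigned n = ir3_count_instructions(ir);
      sched_data = calloc(n, sizeof(sched_data[0]));
   }

   /* lookup replaces the instr->data / instr->use_count fields: */
   static inline struct sched_instr_data *
   instr_data(struct ir3_instruction *instr)
   {
      return &sched_data[instr->ip];
   }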
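To make live_effect() concrete, a few worked cases under the patch's own definitions (new_live is dest_regs() of the candidate; old_live sums dest_regs() for each source that dies at this use). The instruction mix and register counts below are illustrative assumptions, not taken from the patch:

   /* illustrative only; actual values depend on dest_regs():
    *
    *   mov from immed (no SSA sources):              1 - 0 = +1
    *   add a, b (both srcs have use_count == 1):     1 - 2 = -1
    *   vec4 sam consuming a 4-component collect
    *   (OPC_META_FI) whose srcs all die here:        4 - 4 =  0
    *     (RA treats the collect as a whole, so the
    *      full dest_regs() of the collect is freed)
    */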
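The ranking in the second loop folds the live-value delta into the delay score in three pressure tiers; from the comparisons in the code, live_values appears to count scalar components, so 16*4 and 4*4 correspond to 16 and 4 full vec4 registers. Pulled out as a standalone function purely for illustration (the framing and name are mine, not part of the patch):

   /* sketch of the patch's ranking rule; lower rank wins: */
   static int
   candidate_rank(unsigned live_values, int delay, int live_effect)
   {
      if (live_values > 16 * 4)
         return live_effect;           /* critical pressure: only live values matter */
      else if (live_values > 4 * 4)
         return delay + live_effect;   /* moderate pressure: blend delay and pressure */
      else
         return delay;                 /* low pressure: pure delay, as before */
   }

Compared to the old code, which switched abruptly between a delay-only and a pressure-only policy at the single 16*4 boundary, the intermediate tier lets register pressure bias the choice before it becomes critical.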