summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorRob Clark <[email protected]>2015-04-30 13:57:15 -0400
committerRob Clark <[email protected]>2015-06-21 07:53:17 -0400
commit7273cb4e933f8be65fc73b9d8c69c76d1078cb14 (patch)
treec37dc582a6892349a72543da34df2c47fa7a73cb
parentadf1659ff5f07d907eca552be3b566e408c8601e (diff)
freedreno/ir3/sched: convert to priority queue
Use a more standard priority-queue based scheduling algo. It is simpler and will make things easier once we have multiple basic blocks and flow control. Signed-off-by: Rob Clark <[email protected]>
-rw-r--r--src/gallium/drivers/freedreno/ir3/ir3.c1
-rw-r--r--src/gallium/drivers/freedreno/ir3/ir3.h3
-rw-r--r--src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c1
-rw-r--r--src/gallium/drivers/freedreno/ir3/ir3_sched.c466
4 files changed, 242 insertions, 229 deletions
diff --git a/src/gallium/drivers/freedreno/ir3/ir3.c b/src/gallium/drivers/freedreno/ir3/ir3.c
index 84564a9eef7..aea1b967b07 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3.c
@@ -82,6 +82,7 @@ void ir3_destroy(struct ir3 *shader)
free(chunk);
}
free(shader->indirects);
+ free(shader->predicates);
free(shader->baryfs);
free(shader);
}
diff --git a/src/gallium/drivers/freedreno/ir3/ir3.h b/src/gallium/drivers/freedreno/ir3/ir3.h
index edb5b49e23c..030a74fe21a 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3.h
+++ b/src/gallium/drivers/freedreno/ir3/ir3.h
@@ -346,6 +346,9 @@ struct ir3 {
*/
unsigned indirects_count, indirects_sz;
struct ir3_instruction **indirects;
+ /* and same for instructions that consume predicate register: */
+ unsigned predicates_count, predicates_sz;
+ struct ir3_instruction **predicates;
struct ir3_block *block;
unsigned heap_idx;
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c b/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c
index 8d382e5cf3e..caea34c7fd4 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c
@@ -1250,6 +1250,7 @@ emit_intrinisic(struct ir3_compile *ctx, nir_intrinsic_instr *intr)
cond->regs[0]->num = regid(REG_P0, 0);
kill = ir3_KILL(b, cond, 0);
+ array_insert(ctx->ir->predicates, kill);
ctx->kill[ctx->kill_count++] = kill;
ctx->so->has_kill = true;
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_sched.c b/src/gallium/drivers/freedreno/ir3/ir3_sched.c
index fc41f93b884..1d166d879df 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_sched.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_sched.c
@@ -31,23 +31,14 @@
#include "ir3.h"
-enum {
- SCHEDULED = -1,
- DELAYED = -2,
-};
-
/*
* Instruction Scheduling:
*
- * Using the depth sorted list from depth pass, attempt to recursively
- * schedule deepest unscheduled path. The first instruction that cannot
- * be scheduled, returns the required delay slots it needs, at which
- * point we return back up to the top and attempt to schedule by next
- * highest depth. After a sufficient number of instructions have been
- * scheduled, return back to beginning of list and start again. If you
- * reach the end of depth sorted list without being able to insert any
- * instruction, insert nop's. Repeat until no more unscheduled
- * instructions.
+ * A priority-queue based scheduling algo. Add eligible instructions,
+ * ie. ones with all their dependencies scheduled, to the priority
+ * (depth) sorted queue (list). Pop highest priority instruction off
+ * the queue and schedule it, add newly eligible instructions to the
+ * priority queue, rinse, repeat.
*
* There are a few special cases that need to be handled, since sched
* is currently independent of register allocation. Usages of address
@@ -60,67 +51,29 @@ enum {
*/
struct ir3_sched_ctx {
- struct ir3_instruction *scheduled; /* last scheduled instr */
+ struct ir3_block *block; /* the current block */
+ struct ir3_instruction *scheduled; /* last scheduled instr XXX remove*/
struct ir3_instruction *addr; /* current a0.x user, if any */
struct ir3_instruction *pred; /* current p0.x user, if any */
- unsigned cnt;
bool error;
};
-static struct ir3_instruction *
-deepest(struct ir3_instruction **srcs, unsigned nsrcs)
-{
- struct ir3_instruction *d = NULL;
- unsigned i = 0, id = 0;
-
- while ((i < nsrcs) && !(d = srcs[id = i]))
- i++;
-
- if (!d)
- return NULL;
-
- for (; i < nsrcs; i++)
- if (srcs[i] && (srcs[i]->depth > d->depth))
- d = srcs[id = i];
-
- srcs[id] = NULL;
-
- return d;
-}
-
-static unsigned
-distance(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr,
- unsigned maxd)
-{
- struct list_head *instr_list = &instr->block->instr_list;
- unsigned d = 0;
-
- list_for_each_entry_rev (struct ir3_instruction, n, instr_list, node) {
- if ((n == instr) || (d >= maxd))
- break;
- if (is_alu(n) || is_flow(n))
- d++;
- }
-
- return d;
-}
-
static bool is_sfu_or_mem(struct ir3_instruction *instr)
{
return is_sfu(instr) || is_mem(instr);
}
-static void schedule(struct ir3_sched_ctx *ctx,
- struct ir3_instruction *instr, bool remove)
+static void
+schedule(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr)
{
- struct ir3_block *block = instr->block;
+ debug_assert(ctx->block == instr->block);
/* maybe there is a better way to handle this than just stuffing
* a nop.. ideally we'd know about this constraint in the
* scheduling and depth calculation..
*/
if (ctx->scheduled && is_sfu_or_mem(ctx->scheduled) && is_sfu_or_mem(instr))
- ir3_NOP(block);
+ ir3_NOP(ctx->block);
/* remove from depth list:
*/
@@ -140,16 +93,28 @@ static void schedule(struct ir3_sched_ctx *ctx,
list_addtail(&instr->node, &instr->block->instr_list);
ctx->scheduled = instr;
-
- ctx->cnt++;
}
-/*
- * Delay-slot calculation. Follows fanin/fanout.
- */
+static unsigned
+distance(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr,
+ unsigned maxd)
+{
+ struct list_head *instr_list = &ctx->block->instr_list;
+ unsigned d = 0;
+
+ list_for_each_entry_rev (struct ir3_instruction, n, instr_list, node) {
+ if ((n == instr) || (d >= maxd))
+ break;
+ if (is_alu(n) || is_flow(n))
+ d++;
+ }
+
+ return d;
+}
/* calculate delay for specified src: */
-static unsigned delay_calc_srcn(struct ir3_sched_ctx *ctx,
+static unsigned
+delay_calc_srcn(struct ir3_sched_ctx *ctx,
struct ir3_instruction *assigner,
struct ir3_instruction *consumer, unsigned srcn)
{
@@ -158,7 +123,10 @@ static unsigned delay_calc_srcn(struct ir3_sched_ctx *ctx,
if (is_meta(assigner)) {
struct ir3_instruction *src;
foreach_ssa_src(src, assigner) {
- unsigned d = delay_calc_srcn(ctx, src, consumer, srcn);
+ unsigned d;
+ if (src->block != assigner->block)
+ break;
+ d = delay_calc_srcn(ctx, src, consumer, srcn);
delay = MAX2(delay, d);
}
} else {
@@ -170,48 +138,77 @@ static unsigned delay_calc_srcn(struct ir3_sched_ctx *ctx,
}
/* calculate delay for instruction (maximum of delay for all srcs): */
-static unsigned delay_calc(struct ir3_sched_ctx *ctx,
- struct ir3_instruction *instr)
+static unsigned
+delay_calc(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr)
{
unsigned delay = 0;
struct ir3_instruction *src;
foreach_ssa_src_n(src, i, instr) {
- unsigned d = delay_calc_srcn(ctx, src, instr, i);
+ unsigned d;
+ if (src->block != instr->block)
+ continue;
+ d = delay_calc_srcn(ctx, src, instr, i);
delay = MAX2(delay, d);
}
return delay;
}
-/* A negative return value signals that an instruction has been newly
- * SCHEDULED (or DELAYED due to address or predicate register already
- * in use), return back up to the top of the stack (to block_sched())
+struct ir3_sched_notes {
+ /* there is at least one kill which could be scheduled, except
+ * for unscheduled bary.f's:
+ */
+ bool blocked_kill;
+ /* there is at least one instruction that could be scheduled,
+ * except for conflicting address/predicate register usage:
+ */
+ bool addr_conflict, pred_conflict;
+};
+
+static bool is_scheduled(struct ir3_instruction *instr)
+{
+ return !!(instr->flags & IR3_INSTR_MARK);
+}
+
+static bool
+check_conflict(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
+ struct ir3_instruction *instr)
+{
+ /* if this is a write to address/predicate register, and that
+ * register is currently in use, we need to defer until it is
+ * free:
+ */
+ if (writes_addr(instr) && ctx->addr) {
+ assert(ctx->addr != instr);
+ notes->addr_conflict = true;
+ return true;
+ }
+
+ if (writes_pred(instr) && ctx->pred) {
+ assert(ctx->pred != instr);
+ notes->pred_conflict = true;
+ return true;
+ }
+
+ return false;
+}
+
+/* is this instruction ready to be scheduled? Return negative for not
+ * ready (updating notes if needed), or >= 0 to indicate number of
+ * delay slots needed.
*/
-static int trysched(struct ir3_sched_ctx *ctx,
+static int
+instr_eligibility(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
struct ir3_instruction *instr)
{
- struct ir3_instruction *srcs[64];
struct ir3_instruction *src;
- unsigned delay, nsrcs = 0;
-
- /* if already scheduled: */
- if (instr->flags & IR3_INSTR_MARK)
- return 0;
+ unsigned delay = 0;
- /* figure out our src's, copy 'em out into an array for sorting: */
foreach_ssa_src(src, instr) {
- debug_assert(nsrcs < ARRAY_SIZE(srcs));
- srcs[nsrcs++] = src;
- }
-
- /* for each src register in sorted order:
- */
- delay = 0;
- while ((src = deepest(srcs, nsrcs))) {
- delay = trysched(ctx, src);
- if (delay)
- return delay;
+ /* if dependency not scheduled, we aren't ready yet: */
+ if (!is_scheduled(src))
+ return -1;
}
/* all our dependents are scheduled, figure out if
@@ -236,183 +233,194 @@ static int trysched(struct ir3_sched_ctx *ctx,
*/
if (is_kill(instr)) {
struct ir3 *ir = instr->block->shader;
- unsigned i;
- for (i = 0; i < ir->baryfs_count; i++) {
+ for (unsigned i = 0; i < ir->baryfs_count; i++) {
struct ir3_instruction *baryf = ir->baryfs[i];
if (baryf->depth == DEPTH_UNUSED)
continue;
- delay = trysched(ctx, baryf);
- if (delay)
- return delay;
+ if (!is_scheduled(baryf)) {
+ notes->blocked_kill = true;
+ return -1;
+ }
}
}
- /* if this is a write to address/predicate register, and that
- * register is currently in use, we need to defer until it is
- * free:
- */
- if (writes_addr(instr) && ctx->addr) {
- assert(ctx->addr != instr);
- return DELAYED;
- }
- if (writes_pred(instr) && ctx->pred) {
- assert(ctx->pred != instr);
- return DELAYED;
- }
+ if (check_conflict(ctx, notes, instr))
+ return -1;
- schedule(ctx, instr, true);
- return SCHEDULED;
+ return 0;
}
-static bool uses_current_addr(struct ir3_sched_ctx *ctx,
- struct ir3_instruction *instr)
+/* move eligible instructions to the priority list: */
+static unsigned
+add_eligible_instrs(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
+ struct list_head *prio_queue, struct list_head *unscheduled_list)
{
- return instr->address && (ctx->addr == instr->address);
-}
+ unsigned min_delay = ~0;
+
+ list_for_each_entry_safe (struct ir3_instruction, instr, unscheduled_list, node) {
+ int e = instr_eligibility(ctx, notes, instr);
+ if (e < 0)
+ continue;
+ min_delay = MIN2(min_delay, e);
+ if (e == 0) {
+ /* remove from unscheduled list and into priority queue: */
+ list_delinit(&instr->node);
+ ir3_insert_by_depth(instr, prio_queue);
+ }
+ }
-static bool uses_current_pred(struct ir3_sched_ctx *ctx,
- struct ir3_instruction *instr)
-{
- struct ir3_instruction *src;
- foreach_ssa_src(src, instr)
- if (ctx->pred == src)
- return true;
- return false;
+ return min_delay;
}
-/* when we encounter an instruction that writes to the address register
- * when it is in use, we delay that instruction and try to schedule all
- * other instructions using the current address register:
+/* "spill" the address register by remapping any unscheduled
+ * instructions which depend on the current address register
+ * to a clone of the instruction which wrote the address reg.
*/
-static int block_sched_undelayed(struct ir3_sched_ctx *ctx,
- struct list_head *unscheduled_list)
+static void
+split_addr(struct ir3_sched_ctx *ctx)
{
- bool addr_in_use = false;
- bool pred_in_use = false;
- bool all_delayed = true;
- unsigned cnt = ~0, attempted = 0;
-
- list_for_each_entry_safe(struct ir3_instruction, instr, unscheduled_list, node) {
- bool addr = uses_current_addr(ctx, instr);
- bool pred = uses_current_pred(ctx, instr);
-
- if (addr || pred) {
- int ret = trysched(ctx, instr);
-
- if (ret != DELAYED)
- all_delayed = false;
-
- if (ret == SCHEDULED)
- cnt = 0;
- else if (ret > 0)
- cnt = MIN2(cnt, ret);
- if (addr)
- addr_in_use = true;
- if (pred)
- pred_in_use = true;
-
- attempted++;
+ struct ir3 *ir = ctx->addr->block->shader;
+ struct ir3_instruction *new_addr = NULL;
+ unsigned i;
+
+ debug_assert(ctx->addr);
+
+ for (i = 0; i < ir->indirects_count; i++) {
+ struct ir3_instruction *indirect = ir->indirects[i];
+
+ /* skip instructions already scheduled: */
+ if (indirect->flags & IR3_INSTR_MARK)
+ continue;
+
+ /* remap remaining instructions using current addr
+ * to new addr:
+ */
+ if (indirect->address == ctx->addr) {
+ if (!new_addr) {
+ new_addr = ir3_instr_clone(ctx->addr);
+ /* original addr is scheduled, but new one isn't: */
+ new_addr->flags &= ~IR3_INSTR_MARK;
+ }
+ indirect->address = new_addr;
}
}
- if (!addr_in_use)
- ctx->addr = NULL;
+ /* all remaining indirects remapped to new addr: */
+ ctx->addr = NULL;
+}
- if (!pred_in_use)
- ctx->pred = NULL;
+/* "spill" the predicate register by remapping any unscheduled
+ * instructions which depend on the current predicate register
+ * to a clone of the instruction which wrote the address reg.
+ */
+static void
+split_pred(struct ir3_sched_ctx *ctx)
+{
+ struct ir3 *ir = ctx->pred->block->shader;
+ struct ir3_instruction *new_pred = NULL;
+ unsigned i;
- /* detect if we've gotten ourselves into an impossible situation
- * and bail if needed
- */
- if (all_delayed && (attempted > 0)) {
- if (pred_in_use) {
- /* TODO we probably need to keep a list of instructions
- * that reference predicate, similar to indirects
- */
- ctx->error = true;
- return DELAYED;
- }
- if (addr_in_use) {
- struct ir3 *ir = ctx->addr->block->shader;
- struct ir3_instruction *new_addr =
- ir3_instr_clone(ctx->addr);
- unsigned i;
-
- /* original addr is scheduled, but new one isn't: */
- new_addr->flags &= ~IR3_INSTR_MARK;
-
- for (i = 0; i < ir->indirects_count; i++) {
- struct ir3_instruction *indirect = ir->indirects[i];
-
- /* skip instructions already scheduled: */
- if (indirect->flags & IR3_INSTR_MARK)
- continue;
-
- /* remap remaining instructions using current addr
- * to new addr:
- */
- if (indirect->address == ctx->addr)
- indirect->address = new_addr;
- }
+ debug_assert(ctx->pred);
- /* all remaining indirects remapped to new addr: */
- ctx->addr = NULL;
+ for (i = 0; i < ir->predicates_count; i++) {
+ struct ir3_instruction *predicated = ir->predicates[i];
- /* not really, but this will trigger us to go back to
- * main trysched() loop now that we've resolved the
- * conflict by duplicating the instr that writes to
- * the address register.
- */
- return SCHEDULED;
+ /* skip instructions already scheduled: */
+ if (predicated->flags & IR3_INSTR_MARK)
+ continue;
+
+ /* remap remaining instructions using current pred
+ * to new pred:
+ *
+ * TODO is there ever a case when pred isn't first
+ * (and only) src?
+ */
+ if (ssa(predicated->regs[1]) == ctx->pred) {
+ if (!new_pred) {
+ new_pred = ir3_instr_clone(ctx->pred);
+ /* original pred is scheduled, but new one isn't: */
+ new_pred->flags &= ~IR3_INSTR_MARK;
+ }
+ predicated->regs[1]->instr = new_pred;
}
}
- return cnt;
+ /* all remaining predicated remapped to new pred: */
+ ctx->pred = NULL;
}
-static void block_sched(struct ir3_sched_ctx *ctx, struct ir3_block *block)
+static void
+sched_block(struct ir3_sched_ctx *ctx, struct ir3_block *block)
{
- struct list_head unscheduled_list;
+ struct list_head unscheduled_list, prio_queue;
+ ctx->block = block;
+
+ /* move all instructions to the unscheduled list, and
+ * empty the block's instruction list (to which we will
+ * be inserting.
+ */
list_replace(&block->instr_list, &unscheduled_list);
list_inithead(&block->instr_list);
+ list_inithead(&prio_queue);
- /* schedule all the shader input's (meta-instr) first so that
- * the RA step sees that the input registers contain a value
- * from the start of the shader:
+ /* first a pre-pass to schedule all meta:input/phi instructions
+ * (which need to appear first so that RA knows the register is
+ * occupied:
*/
- if (!block->parent) {
- unsigned i;
- for (i = 0; i < block->ninputs; i++) {
- struct ir3_instruction *in = block->inputs[i];
- if (in)
- schedule(ctx, in, true);
- }
- }
-
list_for_each_entry_safe (struct ir3_instruction, instr, &unscheduled_list, node) {
- int cnt = trysched(ctx, instr);
+ if (is_meta(instr) && ((instr->opc == OPC_META_INPUT) ||
+ (instr->opc == OPC_META_PHI)))
+ schedule(ctx, instr);
+ }
- if (cnt == DELAYED)
- cnt = block_sched_undelayed(ctx, &unscheduled_list);
+ while (!(list_empty(&unscheduled_list) &&
+ list_empty(&prio_queue))) {
+ struct ir3_sched_notes notes = {0};
+ unsigned delay;
- /* -1 is signal to return up stack, but to us means same as 0: */
- cnt = MAX2(0, cnt);
- cnt += ctx->cnt;
+ delay = add_eligible_instrs(ctx, &notes, &prio_queue, &unscheduled_list);
- /* if deepest remaining instruction cannot be scheduled, try
- * the increasingly more shallow instructions until needed
- * number of delay slots is filled:
- */
- list_for_each_entry_safe (struct ir3_instruction, instr, &instr->node, node)
- trysched(ctx, instr);
+ if (!list_empty(&prio_queue)) {
+ struct ir3_instruction *instr = list_last_entry(&prio_queue,
+ struct ir3_instruction, node);
+ /* ugg, this is a bit ugly, but between the time when
+ * the instruction became eligible and now, a new
+ * conflict may have arose..
+ */
+ if (check_conflict(ctx, &notes, instr)) {
+ list_del(&instr->node);
+ list_addtail(&instr->node, &unscheduled_list);
+ continue;
+ }
- /* and if we run out of instructions that can be scheduled,
- * then it is time for nop's:
- */
- while (cnt > ctx->cnt)
- schedule(ctx, ir3_NOP(block), false);
+ schedule(ctx, instr);
+ } else if (delay == ~0) {
+ /* nothing available to schedule.. if we are blocked on
+ * address/predicate register conflict, then break the
+ * deadlock by cloning the instruction that wrote that
+ * reg:
+ */
+ if (notes.addr_conflict) {
+ split_addr(ctx);
+ } else if (notes.pred_conflict) {
+ split_pred(ctx);
+ } else {
+ debug_assert(0);
+ ctx->error = true;
+ return;
+ }
+ } else {
+ /* and if we run out of instructions that can be scheduled,
+ * then it is time for nop's:
+ */
+ debug_assert(delay <= 6);
+ while (delay > 0) {
+ ir3_NOP(block);
+ delay--;
+ }
+ }
}
}
@@ -420,7 +428,7 @@ int ir3_block_sched(struct ir3_block *block)
{
struct ir3_sched_ctx ctx = {0};
ir3_clear_mark(block->shader);
- block_sched(&ctx, block);
+ sched_block(&ctx, block);
if (ctx.error)
return -1;
return 0;