-rw-r--r--   src/gallium/drivers/freedreno/ir3/ir3.h       |   4
-rw-r--r--   src/gallium/drivers/freedreno/ir3/ir3_sched.c | 327
2 files changed, 204 insertions, 127 deletions
diff --git a/src/gallium/drivers/freedreno/ir3/ir3.h b/src/gallium/drivers/freedreno/ir3/ir3.h
index 89b93105cbc..62d14a0ae37 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3.h
+++ b/src/gallium/drivers/freedreno/ir3/ir3.h
@@ -255,6 +255,10 @@ struct ir3_instruction {
 		};
 	};
 
+	/* used for per-pass extra instruction data.
+	 */
+	void *data;
+
 	/* Used during CP and RA stages.  For fanin and shader inputs/
 	 * outputs where we need a sequence of consecutive registers,
 	 * keep track of each src instructions left (ie 'n-1') and right
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_sched.c b/src/gallium/drivers/freedreno/ir3/ir3_sched.c
index 08f5cac0cf4..6aaa16edbfe 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_sched.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_sched.c
@@ -34,11 +34,12 @@
 /*
  * Instruction Scheduling:
  *
- * A priority-queue based scheduling algo.  Add eligible instructions,
- * ie. ones with all their dependencies scheduled, to the priority
- * (depth) sorted queue (list).  Pop highest priority instruction off
- * the queue and schedule it, add newly eligible instructions to the
- * priority queue, rinse, repeat.
+ * A recursive depth based scheduling algo.  Recursively find an eligible
+ * instruction to schedule from the deepest instruction (recursing through
+ * it's unscheduled src instructions).  Normally this would result in a
+ * lot of re-traversal of the same instructions, so we cache results in
+ * instr->data (and clear cached results that would be no longer valid
+ * after scheduling an instruction).
  *
  * There are a few special cases that need to be handled, since sched
  * is currently independent of register allocation.  Usages of address
@@ -52,6 +53,7 @@
 
 struct ir3_sched_ctx {
 	struct ir3_block *block;           /* the current block */
+	struct list_head depth_list;       /* depth sorted unscheduled instrs */
 	struct ir3_instruction *scheduled; /* last scheduled instr XXX remove*/
 	struct ir3_instruction *addr;      /* current a0.x user, if any */
 	struct ir3_instruction *pred;      /* current p0.x user, if any */
@@ -63,6 +65,17 @@ static bool is_sfu_or_mem(struct ir3_instruction *instr)
 	return is_sfu(instr) || is_mem(instr);
 }
 
+#define NULL_INSTR ((void *)~0)
+
+static void
+clear_cache(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr)
+{
+	list_for_each_entry (struct ir3_instruction, instr2, &ctx->depth_list, node) {
+		if ((instr2->data == instr) || (instr2->data == NULL_INSTR) || !instr)
+			instr2->data = NULL;
+	}
+}
+
 static void
 schedule(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr)
 {
@@ -93,6 +106,34 @@ schedule(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr)
 
 	list_addtail(&instr->node, &instr->block->instr_list);
 	ctx->scheduled = instr;
+
+	if (writes_addr(instr) || writes_pred(instr) || is_input(instr)) {
+		clear_cache(ctx, NULL);
+	} else {
+		/* invalidate only the necessary entries.. */
+		clear_cache(ctx, instr);
+	}
+}
+
+static struct ir3_instruction *
+deepest(struct ir3_instruction **srcs, unsigned nsrcs)
+{
+	struct ir3_instruction *d = NULL;
+	unsigned i = 0, id = 0;
+
+	while ((i < nsrcs) && !(d = srcs[id = i]))
+		i++;
+
+	if (!d)
+		return NULL;
+
+	for (; i < nsrcs; i++)
+		if (srcs[i] && (srcs[i]->depth > d->depth))
+			d = srcs[id = i];
+
+	srcs[id] = NULL;
+
+	return d;
 }
 
 static unsigned
@@ -171,10 +212,51 @@ static bool is_scheduled(struct ir3_instruction *instr)
 	return !!(instr->flags & IR3_INSTR_MARK);
 }
 
+/* could an instruction be scheduled if specified ssa src was scheduled? */
+static bool
+could_sched(struct ir3_instruction *instr, struct ir3_instruction *src)
+{
+	struct ir3_instruction *other_src;
+	foreach_ssa_src(other_src, instr) {
+		/* if dependency not scheduled, we aren't ready yet: */
+		if ((src != other_src) && !is_scheduled(other_src)) {
+			return false;
+		}
+	}
+	return true;
+}
+
+/* Check if instruction is ok to schedule.  Make sure it is not blocked
+ * by use of addr/predicate register, etc.
+ */
 static bool
-check_conflict(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
+check_instr(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
 		struct ir3_instruction *instr)
 {
+	/* For instructions that write address register we need to
+	 * make sure there is at least one instruction that uses the
+	 * addr value which is otherwise ready.
+	 *
+	 * TODO if any instructions use pred register and have other
+	 * src args, we would need to do the same for writes_pred()..
+	 */
+	if (writes_addr(instr)) {
+		struct ir3 *ir = instr->block->shader;
+		bool ready = false;
+		for (unsigned i = 0; (i < ir->indirects_count) && !ready; i++) {
+			struct ir3_instruction *indirect = ir->indirects[i];
+			if (!indirect)
+				continue;
+			if (indirect->address != instr)
+				continue;
+			ready = could_sched(indirect, instr);
+		}
+
+		/* nothing could be scheduled, so keep looking: */
+		if (!ready)
+			return false;
+	}
+
 	/* if this is a write to address/predicate register, and that
 	 * register is currently in use, we need to defer until it is
 	 * free:
@@ -182,52 +264,15 @@ check_conflict(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
 	if (writes_addr(instr) && ctx->addr) {
 		debug_assert(ctx->addr != instr);
 		notes->addr_conflict = true;
-		return true;
+		return false;
 	}
 
 	if (writes_pred(instr) && ctx->pred) {
 		debug_assert(ctx->pred != instr);
 		notes->pred_conflict = true;
-		return true;
+		return false;
 	}
 
-	return false;
-}
-
-/* is this instruction ready to be scheduled?  Return negative for not
- * ready (updating notes if needed), or >= 0 to indicate number of
- * delay slots needed.
- */
-static int
-instr_eligibility(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
-		struct ir3_instruction *instr)
-{
-	struct ir3_instruction *src;
-	unsigned delay = 0;
-
-	/* Phi instructions can have a dependency on something not
-	 * scheduled yet (for ex, loops).  But OTOH we don't really
-	 * care.  By definition phi's should appear at the top of
-	 * the block, and it's sources should be values from the
-	 * previously executing block, so they are always ready to
-	 * be scheduled:
-	 */
-	if (is_meta(instr) && (instr->opc == OPC_META_PHI))
-		return 0;
-
-	foreach_ssa_src(src, instr) {
-		/* if dependency not scheduled, we aren't ready yet: */
-		if (!is_scheduled(src))
-			return -1;
-	}
-
-	/* all our dependents are scheduled, figure out if
-	 * we have enough delay slots to schedule ourself:
-	 */
-	delay = delay_calc(ctx, instr);
-	if (delay)
-		return delay;
-
 	/* if the instruction is a kill, we need to ensure *every*
 	 * bary.f is scheduled.  The hw seems unhappy if the thread
 	 * gets killed before the end-input (ei) flag is hit.
@@ -250,76 +295,105 @@ instr_eligibility(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
 				continue;
 			if (!is_scheduled(baryf)) {
 				notes->blocked_kill = true;
-				return -1;
+				return false;
 			}
 		}
 	}
 
-	if (check_conflict(ctx, notes, instr))
-		return -1;
-
-	return 0;
+	return true;
 }
 
-/* could an instruction be scheduled if specified ssa src was scheduled? */
-static bool
-could_sched(struct ir3_instruction *instr, struct ir3_instruction *src)
+/* Find the best instruction to schedule from specified instruction or
+ * recursively it's ssa sources.
+ */
+static struct ir3_instruction *
+find_instr_recursive(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
+		struct ir3_instruction *instr)
 {
-	struct ir3_instruction *other_src;
-	foreach_ssa_src(other_src, instr) {
-		/* if dependency not scheduled, we aren't ready yet: */
-		if ((src != other_src) && !is_scheduled(other_src)) {
-			return false;
+	struct ir3_instruction *srcs[__ssa_src_cnt(instr)];
+	struct ir3_instruction *src;
+	unsigned nsrcs = 0;
+
+	if (is_scheduled(instr))
+		return NULL;
+
+	/* use instr->data to cache the results of recursing up the
+	 * instr src's.  Otherwise the recursive algo can scale quite
+	 * badly w/ shader size.  But this takes some care to clear
+	 * the cache appropriately when instructions are scheduled.
+	 */
+	if (instr->data) {
+		if (instr->data == NULL_INSTR)
+			return NULL;
+		return instr->data;
+	}
+
+	/* find unscheduled srcs: */
+	foreach_ssa_src(src, instr) {
+		if (!is_scheduled(src)) {
+			debug_assert(nsrcs < ARRAY_SIZE(srcs));
+			srcs[nsrcs++] = src;
 		}
 	}
-	return true;
+
+	/* if all our src's are already scheduled: */
+	if (nsrcs == 0) {
+		if (check_instr(ctx, notes, instr)) {
+			instr->data = instr;
+			return instr;
+		}
+		return NULL;
+	}
+
+	while ((src = deepest(srcs, nsrcs))) {
+		struct ir3_instruction *candidate;
+
+		candidate = find_instr_recursive(ctx, notes, src);
+		if (!candidate)
+			continue;
+
+		if (check_instr(ctx, notes, candidate)) {
+			instr->data = candidate;
+			return candidate;
+		}
+	}
+
+	instr->data = NULL_INSTR;
+	return NULL;
 }
 
-/* move eligible instructions to the priority list: */
-static unsigned
-add_eligible_instrs(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
-		struct list_head *prio_queue, struct list_head *unscheduled_list)
+/* find instruction to schedule: */
+static struct ir3_instruction *
+find_eligible_instr(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes)
 {
+	struct ir3_instruction *best_instr = NULL;
 	unsigned min_delay = ~0;
 
-	list_for_each_entry_safe (struct ir3_instruction, instr, unscheduled_list, node) {
-		int e = instr_eligibility(ctx, notes, instr);
-		if (e < 0)
-			continue;
+	/* TODO we'd really rather use the list/array of block outputs.  But we
+	 * don't have such a thing.  Recursing *every* instruction in the list
+	 * will result in a lot of repeated traversal, since instructions will
+	 * get traversed both when they appear as ssa src to a later instruction
+	 * as well as where they appear in the depth_list.
+	 */
+	list_for_each_entry_rev (struct ir3_instruction, instr, &ctx->depth_list, node) {
		struct ir3_instruction *candidate;
		unsigned delay;
 
-		/* For instructions that write address register we need to
-		 * make sure there is at least one instruction that uses the
-		 * addr value which is otherwise ready.
-		 *
-		 * TODO if any instructions use pred register and have other
-		 * src args, we would need to do the same for writes_pred()..
-		 */
-		if (unlikely(writes_addr(instr))) {
-			struct ir3 *ir = instr->block->shader;
-			bool ready = false;
-			for (unsigned i = 0; (i < ir->indirects_count) && !ready; i++) {
-				struct ir3_instruction *indirect = ir->indirects[i];
-				if (!indirect)
-					continue;
-				if (indirect->address != instr)
-					continue;
-				ready = could_sched(indirect, instr);
-			}
+		candidate = find_instr_recursive(ctx, notes, instr);
+		if (!candidate)
+			continue;
 
-			/* nothing could be scheduled, so keep looking: */
-			if (!ready)
-				continue;
+		delay = delay_calc(ctx, candidate);
+		if (delay < min_delay) {
+			best_instr = candidate;
+			min_delay = delay;
 		}
 
-		min_delay = MIN2(min_delay, e);
-		if (e == 0) {
-			/* remove from unscheduled list and into priority queue: */
-			list_delinit(&instr->node);
-			ir3_insert_by_depth(instr, prio_queue);
-		}
+		if (min_delay == 0)
+			break;
 	}
 
-	return min_delay;
+	return best_instr;
 }
 
 /* "spill" the address register by remapping any unscheduled
@@ -413,50 +487,55 @@ split_pred(struct ir3_sched_ctx *ctx)
 static void
 sched_block(struct ir3_sched_ctx *ctx, struct ir3_block *block)
 {
-	struct list_head unscheduled_list, prio_queue;
+	struct list_head unscheduled_list;
 
 	ctx->block = block;
 
+	/* addr/pred writes are per-block: */
+	ctx->addr = NULL;
+	ctx->pred = NULL;
+
 	/* move all instructions to the unscheduled list, and
 	 * empty the block's instruction list (to which we will
-	 * be inserting.
+	 * be inserting).
 	 */
 	list_replace(&block->instr_list, &unscheduled_list);
 	list_inithead(&block->instr_list);
-	list_inithead(&prio_queue);
+	list_inithead(&ctx->depth_list);
 
 	/* first a pre-pass to schedule all meta:input/phi instructions
 	 * (which need to appear first so that RA knows the register is
-	 * occupied:
+	 * occupied), and move remaining to depth sorted list:
 	 */
 	list_for_each_entry_safe (struct ir3_instruction, instr, &unscheduled_list, node) {
 		if (is_meta(instr) && ((instr->opc == OPC_META_INPUT) ||
-				(instr->opc == OPC_META_PHI)))
+				(instr->opc == OPC_META_PHI))) {
 			schedule(ctx, instr);
+		} else {
+			ir3_insert_by_depth(instr, &ctx->depth_list);
+		}
 	}
 
-	while (!(list_empty(&unscheduled_list) &&
-			list_empty(&prio_queue))) {
+	while (!list_empty(&ctx->depth_list)) {
 		struct ir3_sched_notes notes = {0};
-		unsigned delay;
+		struct ir3_instruction *instr;
+
+		instr = find_eligible_instr(ctx, &notes);
 
-		delay = add_eligible_instrs(ctx, &notes, &prio_queue, &unscheduled_list);
+		if (instr) {
+			unsigned delay = delay_calc(ctx, instr);
 
-		if (!list_empty(&prio_queue)) {
-			struct ir3_instruction *instr = list_last_entry(&prio_queue,
-					struct ir3_instruction, node);
-			/* ugg, this is a bit ugly, but between the time when
-			 * the instruction became eligible and now, a new
-			 * conflict may have arose..
+			/* and if we run out of instructions that can be scheduled,
+			 * then it is time for nop's:
 			 */
-			if (check_conflict(ctx, &notes, instr)) {
-				list_del(&instr->node);
-				list_addtail(&instr->node, &unscheduled_list);
-				continue;
+			debug_assert(delay <= 6);
+			while (delay > 0) {
+				ir3_NOP(block);
+				delay--;
 			}
 
 			schedule(ctx, instr);
-		} else if (delay == ~0) {
+		} else {
 			struct ir3_instruction *new_instr = NULL;
 
 			/* nothing available to schedule.. if we are blocked on
@@ -475,23 +554,17 @@ sched_block(struct ir3_sched_ctx *ctx, struct ir3_block *block)
 			}
 
 			if (new_instr) {
-				list_del(&new_instr->node);
-				list_addtail(&new_instr->node, &unscheduled_list);
+				/* clearing current addr/pred can change what is
+				 * available to schedule, so clear cache..
+				 */
+				clear_cache(ctx, NULL);
+
+				ir3_insert_by_depth(new_instr, &ctx->depth_list);
 
 				/* the original instr that wrote addr/pred may have
 				 * originated from a different block:
 				 */
 				new_instr->block = block;
 			}
-
-		} else {
-			/* and if we run out of instructions that can be scheduled,
-			 * then it is time for nop's:
-			 */
-			debug_assert(delay <= 6);
-			while (delay > 0) {
-				ir3_NOP(block);
-				delay--;
-			}
 		}
 	}
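
A minimal, self-contained sketch of the core idea in the patch above: walk from an unscheduled instruction down through its deepest unscheduled sources, and memoize the answer in a per-node data pointer so repeated walks stay cheap. This is illustration only, not the Mesa code: toy_node, pick_deepest and find_candidate are invented names, and the sketch omits the addr/pred legality checks (check_instr) and the cache invalidation (clear_cache) that the real pass performs after each scheduled instruction.

/* Toy version of the "cache the recursion result in instr->data" scheme;
 * all names here are hypothetical, for illustration only.
 */
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

#define MAX_SRCS   4
#define NULL_INSTR ((void *)~(size_t)0)   /* sentinel: nothing schedulable below */

struct toy_node {
	const char      *name;
	unsigned         depth;              /* distance to end of block */
	bool             scheduled;
	void            *data;               /* per-pass cache, like ir3_instruction::data */
	unsigned         nsrcs;
	struct toy_node *srcs[MAX_SRCS];
};

/* return the deepest not-yet-consumed unscheduled src, and drop it from
 * the working array so the caller can try the next-deepest one: */
static struct toy_node *
pick_deepest(struct toy_node **srcs, unsigned nsrcs)
{
	struct toy_node *d = NULL;
	unsigned id = 0;

	for (unsigned i = 0; i < nsrcs; i++)
		if (srcs[i] && (!d || srcs[i]->depth > d->depth))
			d = srcs[id = i];

	if (d)
		srcs[id] = NULL;

	return d;
}

/* recurse towards the deepest unscheduled source, memoizing the result: */
static struct toy_node *
find_candidate(struct toy_node *n)
{
	if (n->scheduled)
		return NULL;
	if (n->data)                          /* cached result from an earlier walk */
		return (n->data == NULL_INSTR) ? NULL : n->data;

	struct toy_node *srcs[MAX_SRCS];
	unsigned nsrcs = 0;

	for (unsigned i = 0; i < n->nsrcs; i++)
		if (!n->srcs[i]->scheduled)
			srcs[nsrcs++] = n->srcs[i];

	if (nsrcs == 0) {                     /* all deps scheduled: n itself is the candidate */
		n->data = n;
		return n;
	}

	struct toy_node *src;
	while ((src = pick_deepest(srcs, nsrcs))) {
		struct toy_node *c = find_candidate(src);
		if (c) {
			n->data = c;
			return c;
		}
	}

	n->data = NULL_INSTR;
	return NULL;
}

int main(void)
{
	struct toy_node a = { .name = "a", .depth = 3 };
	struct toy_node b = { .name = "b", .depth = 2, .nsrcs = 1, .srcs = { &a } };
	struct toy_node c = { .name = "c", .depth = 1, .nsrcs = 1, .srcs = { &b } };

	/* starting from the shallowest node, the walk bottoms out at 'a': */
	printf("next: %s\n", find_candidate(&c)->name);
	return 0;
}

As in the patch, the NULL_INSTR sentinel distinguishes "nothing schedulable below this node" from "not cached yet"; that is why the patch's clear_cache() also wipes NULL_INSTR entries whenever scheduling an instruction could make something newly eligible.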