-rw-r--r--   src/freedreno/ir3/ir3.h         5
-rw-r--r--   src/freedreno/ir3/ir3_sched.c   123
2 files changed, 115 insertions, 13 deletions
diff --git a/src/freedreno/ir3/ir3.h b/src/freedreno/ir3/ir3.h
index f3c25ea2792..ccd102b8e44 100644
--- a/src/freedreno/ir3/ir3.h
+++ b/src/freedreno/ir3/ir3.h
@@ -292,6 +292,11 @@ struct ir3_instruction {
 	};
 
 	/* used for per-pass extra instruction data.
+	 *
+	 * TODO we should remove the per-pass data like this and 'use_count'
+	 * and do something similar to what RA does w/ ir3_ra_instr_data..
+	 * ie. use the ir3_count_instructions pass, and then use instr->ip
+	 * to index into a table of pass-private data.
 	 */
 	void *data;
 
diff --git a/src/freedreno/ir3/ir3_sched.c b/src/freedreno/ir3/ir3_sched.c
index 16199ca3fb9..1b07bf8c1dd 100644
--- a/src/freedreno/ir3/ir3_sched.c
+++ b/src/freedreno/ir3/ir3_sched.c
@@ -216,7 +216,7 @@ deepest(struct ir3_instruction **srcs, unsigned nsrcs)
 		return NULL;
 
 	for (; i < nsrcs; i++)
-		if (srcs[i] && (srcs[i]->sun > d->sun))
+		if (srcs[i] && (srcs[i]->depth > d->depth))
 			d = srcs[id = i];
 
 	srcs[id] = NULL;
@@ -500,13 +500,63 @@ find_instr_recursive(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
 	return NULL;
 }
 
+/* find net change to live values if instruction were scheduled: */
+static int
+live_effect(struct ir3_instruction *instr)
+{
+	struct ir3_instruction *src;
+	int new_live = dest_regs(instr);
+	int old_live = 0;
+
+	foreach_ssa_src_n(src, n, instr) {
+		if (__is_false_dep(instr, n))
+			continue;
+
+		if (instr->block != src->block)
+			continue;
+
+		/* for fanout/split, just pass things along to the real src: */
+		if (src->opc == OPC_META_FO)
+			src = ssa(src->regs[1]);
+
+		/* for fanin/collect, if this is the last use of *each* src,
+		 * then it will decrease the live values, since RA treats
+		 * them as a whole:
+		 */
+		if (src->opc == OPC_META_FI) {
+			struct ir3_instruction *src2;
+			bool last_use = true;
+
+			foreach_ssa_src(src2, src) {
+				if (src2->use_count > 1) {
+					last_use = false;
+					break;
+				}
+			}
+
+			if (last_use)
+				old_live += dest_regs(src);
+
+		} else {
+			debug_assert(src->use_count > 0);
+
+			if (src->use_count == 1) {
+				old_live += dest_regs(src);
+			}
+		}
+	}
+
+	return new_live - old_live;
+}
+
 /* find instruction to schedule: */
 static struct ir3_instruction *
 find_eligible_instr(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
 		bool soft)
 {
 	struct ir3_instruction *best_instr = NULL;
-	unsigned min_delay = ~0;
+	int best_rank = INT_MAX; /* lower is better */
+	unsigned deepest = 0;
 
 	/* TODO we'd really rather use the list/array of block outputs.  But we
 	 * don't have such a thing.  Recursing *every* instruction in the list
@@ -516,23 +566,70 @@ find_eligible_instr(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
 	 */
 	list_for_each_entry_rev (struct ir3_instruction, instr, &ctx->depth_list, node) {
 		struct ir3_instruction *candidate;
-		unsigned delay;
 
 		candidate = find_instr_recursive(ctx, notes, instr);
 		if (!candidate)
 			continue;
 
-		if (ctx->live_values > 16*4) {
-			/* under register pressure, only care about reducing live values: */
-			if (!best_instr || (candidate->sun > best_instr->sun))
-				best_instr = candidate;
-		} else {
-			delay = delay_calc(ctx->block, candidate, soft, false);
-			if ((delay < min_delay) ||
-					((delay <= (min_delay + 2)) && (candidate->sun > best_instr->sun))) {
-				best_instr = candidate;
-				min_delay = delay;
+		deepest = MAX2(deepest, candidate->depth);
+	}
+
+	/* traverse the list a second time.. but since we cache the result of
+	 * find_instr_recursive() it isn't as bad as it looks.
+	 */
+	list_for_each_entry_rev (struct ir3_instruction, instr, &ctx->depth_list, node) {
+		struct ir3_instruction *candidate;
+
+		candidate = find_instr_recursive(ctx, notes, instr);
+		if (!candidate)
+			continue;
+
+		/* determine net change to # of live values: */
+		int le = live_effect(candidate);
+
+		/* if there is a net increase in # of live values, then apply some
+		 * threshold to avoid instructions getting scheduled *too* early
+		 * and increasing register pressure.
+		 */
+		if (le >= 1) {
+			unsigned threshold;
+
+			if (ctx->live_values > 4*4) {
+				threshold = 4;
+			} else {
+				threshold = 6;
 			}
+
+			/* Filter out any "shallow" instructions which would otherwise
+			 * tend to get scheduled too early to fill delay slots even
+			 * when they are not needed for a while.  There will probably
+			 * be later delay slots that they could just as easily fill.
+			 *
+			 * A classic case where this comes up is frag shaders that
+			 * write a constant value (like 1.0f) to one of the channels
+			 * of the output color(s).  Since the mov from immed has no
+			 * dependencies, it would otherwise get scheduled early to
+			 * fill delay slots, occupying a register until the end of
+			 * the program.
+			 */
+			if ((deepest - candidate->depth) > threshold)
+				continue;
+		}
+
+		int rank = delay_calc(ctx->block, candidate, soft, false);
+
+		/* if too many live values, prioritize instructions that reduce the
+		 * number of live values:
+		 */
+		if (ctx->live_values > 16*4) {
+			rank = le;
+		} else if (ctx->live_values > 4*4) {
+			rank += le;
+		}
+
+		if (rank < best_rank) {
+			best_instr = candidate;
+			best_rank = rank;
 		}
 	}
 
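Note on the ir3.h TODO: the pattern it points at (what RA does with ir3_ra_instr_data) is to number every instruction once and then index a pass-private array by instr->ip, instead of hanging per-pass data off each instruction. A minimal sketch of that pattern, assuming ir3_count_instructions() assigns instr->ip sequentially and returns the total; the sched_instr_data struct and helper names here are hypothetical, not part of this patch:

   #include <stdlib.h>
   #include "ir3.h"

   /* hypothetical pass-private data, indexed by instr->ip: */
   struct sched_instr_data {
      unsigned use_count;
      unsigned depth;
   };

   static struct sched_instr_data *sched_data;

   static void
   sched_data_init(struct ir3 *ir)
   {
      /* numbers each instr->ip and returns the instruction count: */
      unsigned n = ir3_count_instructions(ir);
      sched_data = calloc(n, sizeof(sched_data[0]));
   }

   /* lookup replaces the instr->data / instr->use_count fields: */
   static inline struct sched_instr_data *
   instr_data(struct ir3_instruction *instr)
   {
      return &sched_data[instr->ip];
   }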
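To make live_effect() concrete, a few worked cases under the patch's own definitions (new_live is dest_regs() of the candidate; old_live sums dest_regs() for each source that dies at this use). The instruction mix and register counts below are illustrative assumptions, not taken from the patch:

   /* illustrative only; actual values depend on dest_regs():
    *
    *   mov from immed (no SSA sources):              1 - 0 = +1
    *   add a, b (both srcs have use_count == 1):     1 - 2 = -1
    *   vec4 sam consuming a 4-component collect
    *   (OPC_META_FI) whose srcs all die here:        4 - 4 =  0
    *     (RA treats the collect as a whole, so the
    *      full dest_regs() of the collect is freed)
    */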
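The ranking in the second loop folds the live-value delta into the delay score in three pressure tiers; from the comparisons in the code, live_values appears to count scalar components, so 16*4 and 4*4 correspond to 16 and 4 full vec4 registers. Pulled out as a standalone function purely for illustration (the framing and name are mine, not part of the patch):

   /* sketch of the patch's ranking rule; lower rank wins: */
   static int
   candidate_rank(unsigned live_values, int delay, int live_effect)
   {
      if (live_values > 16 * 4)
         return live_effect;           /* critical pressure: only live values matter */
      else if (live_values > 4 * 4)
         return delay + live_effect;   /* moderate pressure: blend delay and pressure */
      else
         return delay;                 /* low pressure: pure delay, as before */
   }

Compared to the old code, which switched abruptly between a delay-only and a pressure-only policy at the single 16*4 boundary, the intermediate tier lets register pressure bias the choice before it becomes critical.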