freedreno/ir3: simplify RA

Group inputs/outputs, in addition to fanin/fanout, as they must also exist in sequential scalar registers. This lets us simplify RA by working in terms of neighbor groups. NOTE: this has the slight problem that it can't optimize out mov's for things like `MOV OUT[n], IN[m]`. To avoid this, instead of trying to figure out which mov's we can eliminate, we first remove all mov's prior to grouping, and then re-insert mov's as needed while grouping inputs/outputs/fanins. Eventually we'd prefer the frontend to not insert extra mov's in the first place (so we don't have to bother removing them). This is the plan for an eventual NIR-based frontend, so separate out the instr grouping (which will still be needed for the NIR frontend) from the mov elimination (which won't). Signed-off-by: Rob Clark <[email protected]>
author: Rob Clark <[email protected]> 2014-10-25 15:11:59 -0400
committer: Rob Clark <[email protected]> 2015-01-07 19:37:28 -0500
commit: 9a9f2a893b5e29a77d66671191653f0b4261f546 (patch)
tree: cb18b6fe28568d5c1f728632b93f60e50bd2b203 /src
parent: dddfe6c21ee92f015b78060545f08239c331ceba (diff)
8 files changed, 622 insertions, 777 deletions
diff --git a/src/gallium/drivers/freedreno/Makefile.sources b/src/gallium/drivers/freedreno/Makefile.sources
index 1cae52905ef..592f4b4a3fa 100644
--- a/src/gallium/drivers/freedreno/Makefile.sources
+++ b/src/gallium/drivers/freedreno/Makefile.sources
@@ -127,10 +127,10 @@ ir3_SOURCES := \
 	ir3/ir3_depth.c \
 	ir3/ir3_dump.c \
 	ir3/ir3_flatten.c \
+	ir3/ir3_group.c \
 	ir3/ir3.h \
 	ir3/ir3_legalize.c \
 	ir3/ir3_ra.c \
 	ir3/ir3_sched.c \
 	ir3/ir3_shader.c \
-	ir3/ir3_shader.h \
-	ir3/ir3_visitor.h
+	ir3/ir3_shader.h
diff --git a/src/gallium/drivers/freedreno/ir3/ir3.h b/src/gallium/drivers/freedreno/ir3/ir3.h
index bd0c0a5b693..aaa0ff6efa8 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3.h
+++ b/src/gallium/drivers/freedreno/ir3/ir3.h
@@ -228,26 +228,62 @@ struct ir3_instruction {
 		 */
 #define DEPTH_UNUSED  ~0
 		unsigned depth;
-
-		/* Used just during cp stage, which comes before depth pass.
-		 * For fanin, where we need a sequence of consecutive registers,
-		 * keep track of each src instructions left (ie 'n-1') and right
-		 * (ie 'n+1') neighbor.  The front-end must insert enough mov's
-		 * to ensure that each instruction has at most one left and at
-		 * most one right neighbor.  During the copy-propagation pass,
-		 * we only remove mov's when we can preserve this constraint.
-		 */
-		struct {
-			struct ir3_instruction *left, *right;
-			uint16_t left_cnt, right_cnt;
-		} cp;
 	};
+
+	/* Used during CP and RA stages.  For fanin and shader inputs/
+	 * outputs where we need a sequence of consecutive registers,
+	 * keep track of each src instruction's left (ie 'n-1') and right
+	 * (ie 'n+1') neighbor.  The front-end must insert enough mov's
+	 * to ensure that each instruction has at most one left and at
+	 * most one right neighbor.  During the copy-propagation pass,
+	 * we only remove mov's when we can preserve this constraint.
+	 * And during the RA stage, we use the neighbor information to
+	 * allocate a block of registers in one shot.
+	 *
+	 * TODO: maybe just add something like:
+	 *   struct ir3_instruction_ref {
+	 *       struct ir3_instruction *instr;
+	 *       unsigned cnt;
+	 *   }
+	 *
+	 * Or can we get away without the refcnt stuff?  It seems like
+	 * it should be overkill..  the problem arises if, potentially
+	 * after already eliminating some mov's, you have a single mov
+	 * that needs to be grouped with its neighbors in two different
+	 * places (ex. shader output and a fanin).
+	 */
+	struct {
+		struct ir3_instruction *left, *right;
+		uint16_t left_cnt, right_cnt;
+	} cp;
 	struct ir3_instruction *next;
 #ifdef DEBUG
 	uint32_t serialno;
 #endif
 };
 
+static inline struct ir3_instruction *
+ir3_neighbor_first(struct ir3_instruction *instr)
+{
+	while (instr->cp.left)
+		instr = instr->cp.left;
+	return instr;
+}
+
+static inline int ir3_neighbor_count(struct ir3_instruction *instr)
+{
+	int num = 1;
+
+	debug_assert(!instr->cp.left);
+
+	while (instr->cp.right) {
+		num++;
+		instr = instr->cp.right;
+	}
+
+	return num;
+}
+
 struct ir3_heap_chunk;
 
 struct ir3 {
@@ -415,6 +451,15 @@ static inline bool writes_pred(struct ir3_instruction *instr)
 	return false;
 }
 
+/* returns defining instruction for reg */
+/* TODO better name */
+static inline struct ir3_instruction *ssa(struct ir3_register *reg)
+{
+	if (reg->flags & IR3_REG_SSA)
+		return reg->instr;
+	return NULL;
+}
+
 static inline bool reg_gpr(struct ir3_register *r)
 {
 	if (r->flags & (IR3_REG_CONST | IR3_REG_IMMED | IR3_REG_RELATIV | IR3_REG_ADDR))
@@ -443,12 +488,15 @@ void ir3_block_depth(struct ir3_block *block);
 /* copy-propagate: */
 void ir3_block_cp(struct ir3_block *block);
 
+/* group neighbors and insert mov's to resolve conflicts: */
+void ir3_block_group(struct ir3_block *block);
+
 /* scheduling: */
 int ir3_block_sched(struct ir3_block *block);
 
 /* register assignment: */
 int ir3_block_ra(struct ir3_block *block, enum shader_t type,
-		bool half_precision, bool frag_coord, bool frag_face);
+		bool frag_coord, bool frag_face);
 
 /* legalize: */
 void ir3_block_legalize(struct ir3_block *block,
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_cmdline.c b/src/gallium/drivers/freedreno/ir3/ir3_cmdline.c
index 081143d5d6e..6c334d200a3 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_cmdline.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_cmdline.c
@@ -61,8 +61,10 @@ static void dump_info(struct ir3_shader_variant *so, const char *str)
 
 if (block) {
 		for (i = 0; i < block->ninputs; i++) {
-			if (!block->inputs[i])
+			if (!block->inputs[i]) {
+				debug_printf("; in%d unused\n", i);
 				continue;
+			}
 			reg = block->inputs[i]->regs[0];
 			regid = reg->num;
 			debug_printf("@in(%sr%d.%c)\tin%d\n",
@@ -71,8 +73,10 @@ if (block) {
 		}
 
 		for (i = 0; i < block->noutputs; i++) {
-			if (!block->outputs[i])
+			if (!block->outputs[i]) {
+				debug_printf("; out%d unused\n", i);
 				continue;
+			}
 			/* kill shows up as a virtual output.. skip it! */
 			if (is_kill(block->outputs[i]))
 				continue;
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_compiler.c b/src/gallium/drivers/freedreno/ir3/ir3_compiler.c
index b47aa1d14d8..209621bd013 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_compiler.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_compiler.c
@@ -571,23 +571,40 @@ add_dst_reg_wrmask(struct ir3_compile_context *ctx,
 	} else if ((dst->File == TGSI_FILE_TEMPORARY) ||
 			(dst->File == TGSI_FILE_OUTPUT) ||
 			(dst->File == TGSI_FILE_ADDRESS)) {
+		struct ir3_instruction *prev = NULL;
 		unsigned i;
 
 		/* if instruction writes multiple, we need to create
 		 * some place-holder collect the registers:
 		 */
 		for (i = 0; i < 4; i++) {
-			if (wrmask & (1 << i)) {
-				struct ir3_instruction *collect =
-						ir3_instr_create(ctx->block, -1, OPC_META_FO);
-				collect->fo.off = i;
-				/* unused dst reg: */
-				ir3_reg_create(collect, 0, 0);
-				/* and src reg used to hold original instr */
-				ir3_reg_create(collect, 0, IR3_REG_SSA)->instr = instr;
-				if (!ctx->atomic)
-					ssa_dst(ctx, collect, dst, chan+i);
+			/* NOTE: slightly ugly that we setup neighbor ptrs
+			 * for FO here, but handle FI in CP pass.. we should
+			 * probably just always setup neighbor ptrs in the
+			 * frontend?
+			 */
+			struct ir3_instruction *split =
+					ir3_instr_create(ctx->block, -1, OPC_META_FO);
+			split->fo.off = i;
+			/* unused dst reg: */
+			/* NOTE: set SSA flag on dst here, because unused FO's
+			 * which don't get scheduled will end up not in the
+			 * instruction list when RA sets SSA flag on each dst.
+			 * Slight hack.  We really should set SSA flag on
+			 * every dst register in the frontend.
+			 */
+			ir3_reg_create(split, 0, IR3_REG_SSA);
+			/* and src reg used to hold original instr */
+			ir3_reg_create(split, 0, IR3_REG_SSA)->instr = instr;
+			if (prev) {
+				split->cp.left = prev;
+				split->cp.left_cnt++;
+				prev->cp.right = split;
+				prev->cp.right_cnt++;
 			}
+			if ((wrmask & (1 << i)) && !ctx->atomic)
+				ssa_dst(ctx, split, dst, chan+i);
+			prev = split;
 		}
 	}
 
@@ -3120,6 +3137,17 @@ ir3_compile_shader(struct ir3_shader_variant *so,
 		}
 	}
 
+	/* if we want half-precision outputs, mark the output registers
+	 * as half:
+	 */
+	if (key.half_precision) {
+		for (i = 0; i < block->noutputs; i++) {
+			if (!block->outputs[i])
+				continue;
+			block->outputs[i]->regs[0]->flags |= IR3_REG_HALF;
+		}
+	}
+
 	/* at this point, we want the kill's in the outputs array too,
 	 * so that they get scheduled (since they have no dst).. we've
 	 * already ensured that the array is big enough in push_block():
@@ -3145,9 +3173,26 @@ ir3_compile_shader(struct ir3_shader_variant *so,
 		ir3_dump_instr_list(block->head);
 	}
 
+	ir3_block_depth(block);
+
+	/* First remove all the extra mov's (which we could skip if the
+	 * front-end was clever enough not to insert them in the first
+	 * place).  Then figure out left/right neighbors, re-inserting
+	 * extra mov's when needed to avoid conflicts.
+	 */
 	if (cp && !(fd_mesa_debug & FD_DBG_NOCP))
 		ir3_block_cp(block);
 
+	if (fd_mesa_debug & FD_DBG_OPTMSGS) {
+		printf("BEFORE GROUPING:\n");
+		ir3_dump_instr_list(block->head);
+	}
+
+	/* Group left/right neighbors, inserting mov's where needed to
+	 * solve conflicts:
+	 */
+	ir3_block_group(block);
+
 	if (fd_mesa_debug & FD_DBG_OPTDUMP)
 		compile_dump(&ctx);
 
@@ -3169,20 +3214,19 @@ ir3_compile_shader(struct ir3_shader_variant *so,
 		ir3_dump_instr_list(block->head);
 	}
 
-	ret = ir3_block_ra(block, so->type, key.half_precision,
-			so->frag_coord, so->frag_face);
+	ret = ir3_block_ra(block, so->type, so->frag_coord, so->frag_face);
 	if (ret) {
 		DBG("RA failed!");
 		goto out;
 	}
 
-	ir3_block_legalize(block, &so->has_samp, &max_bary);
-
 	if (fd_mesa_debug & FD_DBG_OPTMSGS) {
 		printf("AFTER RA:\n");
 		ir3_dump_instr_list(block->head);
 	}
 
+	ir3_block_legalize(block, &so->has_samp, &max_bary);
+
 	/* fixup input/outputs: */
 	for (i = 0; i < so->outputs_count; i++) {
 		so->outputs[i].regid = block->outputs[i*4]->regs[0]->num;
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_cp.c b/src/gallium/drivers/freedreno/ir3/ir3_cp.c
index 2076b62acb8..c55425d68d4 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_cp.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_cp.c
@@ -33,69 +33,14 @@
 /*
  * Copy Propagate:
  *
+ * We could eventually drop this, if the front-end did not insert any
+ * mov's..  For now keeping it as a separate pass since that is less
+ * painful than updating the existing frontend.  It is expected that
+ * with an eventual new NIR based frontend that we won't need this.
  */
 
 static void block_cp(struct ir3_block *block);
-static struct ir3_instruction * instr_cp(struct ir3_instruction *instr, bool keep);
-
-/* XXX move this somewhere useful (and rename?) */
-static struct ir3_instruction *ssa(struct ir3_register *reg)
-{
-	if (reg->flags & IR3_REG_SSA)
-		return reg->instr;
-	return NULL;
-}
-
-static bool conflicts(struct ir3_instruction *a, struct ir3_instruction *b)
-{
-	return (a && b) && (a != b);
-}
-
-static void set_neighbors(struct ir3_instruction *instr,
-		struct ir3_instruction *left, struct ir3_instruction *right)
-{
-	debug_assert(!conflicts(instr->cp.left, left));
-	if (left) {
-		instr->cp.left_cnt++;
-		instr->cp.left = left;
-	}
-	debug_assert(!conflicts(instr->cp.right, right));
-	if (right) {
-		instr->cp.right_cnt++;
-		instr->cp.right = right;
-	}
-}
-
-/* remove neighbor reference, clearing left/right neighbor ptrs when
- * there are no more references:
- */
-static void remove_neighbors(struct ir3_instruction *instr)
-{
-	if (instr->cp.left) {
-		if (--instr->cp.left_cnt == 0)
-			instr->cp.left = NULL;
-	}
-	if (instr->cp.right) {
-		if (--instr->cp.right_cnt == 0)
-			instr->cp.right = NULL;
-	}
-}
-
-/* stop condition for iteration: */
-static bool check_stop(struct ir3_instruction *instr)
-{
-	if (ir3_instr_check_mark(instr))
-		return true;
-
-	/* stay within the block.. don't try to operate across
-	 * basic block boundaries or we'll have problems when
-	 * dealing with multiple basic blocks:
-	 */
-	if (is_meta(instr) && (instr->opc == OPC_META_INPUT))
-		return true;
-
-	return false;
-}
+static struct ir3_instruction * instr_cp(struct ir3_instruction *instr);
 
 static bool is_eligible_mov(struct ir3_instruction *instr)
 {
@@ -109,23 +54,17 @@ static bool is_eligible_mov(struct ir3_instruction *instr)
 		/* TODO: propagate abs/neg modifiers if possible */
 		if (src->flags & (IR3_REG_ABS | IR3_REG_NEGATE | IR3_REG_RELATIV))
 			return false;
-		if (src_instr) {
-			/* check that eliminating the move won't result in
-			 * a neighbor conflict, ie. if an instruction feeds
-			 * into multiple fanins it can still only have at
-			 * most one left and one right neighbor:
-			 */
-			if (conflicts(instr->cp.left, src_instr->cp.left))
-				return false;
-			if (conflicts(instr->cp.right, src_instr->cp.right))
-				return false;
-			return true;
-		}
+		if (!src_instr)
+			return false;
+		/* TODO: remove this hack: */
+		if (is_meta(src_instr) && (src_instr->opc == OPC_META_FO))
+			return false;
+		return true;
 	}
 	return false;
 }
 
-static void walk_children(struct ir3_instruction *instr, bool keep)
+static void walk_children(struct ir3_instruction *instr)
 {
 	unsigned i;
 
@@ -133,188 +72,56 @@ static void walk_children(struct ir3_instruction *instr, bool keep)
 	for (i = 1; i < instr->regs_count; i++) {
 		struct ir3_register *src = instr->regs[i];
 		if (src->flags & IR3_REG_SSA)
-			src->instr = instr_cp(src->instr, keep);
+			src->instr = instr_cp(src->instr);
 	}
 }
 
-static struct ir3_instruction *
-instr_cp_fanin(struct ir3_instruction *instr)
-{
-	unsigned i, j;
-
-	/* we need to handle fanin specially, to detect cases
-	 * when we need to keep a mov
-	 */
-
-	for (i = 1; i < instr->regs_count; i++) {
-		struct ir3_register *src = instr->regs[i];
-		if (src->flags & IR3_REG_SSA) {
-			struct ir3_instruction *cand =
-					instr_cp(src->instr, false);
-
-			/* if the candidate is a fanout, then keep
-			 * the move.
-			 *
-			 * This is a bit, um, fragile, but it should
-			 * catch the extra mov's that the front-end
-			 * puts in for us already in these cases.
-			 */
-			if (is_meta(cand) && (cand->opc == OPC_META_FO))
-				cand = instr_cp(src->instr, true);
-
-			/* we can't have 2 registers referring to the same instruction, so
-			 * go through and check if any already refer to the candidate
-			 * instruction. if so, don't do the propagation.
-			 *
-			 * NOTE: we need to keep this, despite the neighbor
-			 * conflict checks, to avoid A<->B<->A..
-			 */
-			for (j = 1; j < instr->regs_count; j++)
-				if (instr->regs[j]->instr == cand)
-					break;
-			if (j == instr->regs_count)
-				src->instr = cand;
-		}
-	}
-
-	walk_children(instr, false);
-
-	return instr;
-}
 
 static struct ir3_instruction *
-instr_cp(struct ir3_instruction *instr, bool keep)
+instr_cp(struct ir3_instruction *instr)
 {
-	/* if we've already visited this instruction, bail now: */
-	if (check_stop(instr))
+	/* stay within the block.. don't try to operate across
+	 * basic block boundaries or we'll have problems when
+	 * dealing with multiple basic blocks:
+	 */
+	if (is_meta(instr) && (instr->opc == OPC_META_INPUT))
 		return instr;
 
-	if (is_meta(instr) && (instr->opc == OPC_META_FI))
-		return instr_cp_fanin(instr);
-
-	if (!keep && is_eligible_mov(instr)) {
+	if (is_eligible_mov(instr)) {
 		struct ir3_instruction *src_instr = ssa(instr->regs[1]);
-		set_neighbors(src_instr, instr->cp.left, instr->cp.right);
-		remove_neighbors(instr);
-		return instr_cp(src_instr, false);
+		return instr_cp(src_instr);
 	}
 
-	walk_children(instr, false);
+	/* Check termination condition before walking children (rather
+	 * than before checking eligible-mov).  A mov instruction may
+	 * appear as ssa-src for multiple other instructions, and we
+	 * want to consider it for removal for each, rather than just
+	 * the first one.  (But regardless of how many places it shows
+	 * up as a src, we only need to recursively walk the children
+	 * once.)
+	 */
+	if (!ir3_instr_check_mark(instr))
+		walk_children(instr);
 
 	return instr;
 }
 
 static void block_cp(struct ir3_block *block)
 {
-	unsigned i, j;
+	unsigned i;
 
 	for (i = 0; i < block->noutputs; i++) {
 		if (block->outputs[i]) {
 			struct ir3_instruction *out =
-					instr_cp(block->outputs[i], false);
-
-			/* To deal with things like this:
-			 *
-			 *   43: MOV OUT[2], TEMP[5]
-			 *   44: MOV OUT[0], TEMP[5]
-			 *
-			 * we need to ensure that no two outputs point to
-			 * the same instruction
-			 */
-			for (j = 0; j < i; j++) {
-				if (block->outputs[j] == out) {
-					out = instr_cp(block->outputs[i], true);
-					break;
-				}
-			}
+					instr_cp(block->outputs[i]);
 
 			block->outputs[i] = out;
 		}
 	}
 }
 
-/*
- * Find instruction neighbors:
- */
-
-static void instr_find_neighbors(struct ir3_instruction *instr)
-{
-	unsigned i;
-
-	if (check_stop(instr))
-		return;
-
-	if (is_meta(instr) && (instr->opc == OPC_META_FI)) {
-		unsigned n = instr->regs_count;
-		for (i = 1; i < n; i++) {
-			struct ir3_instruction *src_instr = ssa(instr->regs[i]);
-			if (src_instr) {
-				struct ir3_instruction *left = (i > 1) ?
-						ssa(instr->regs[i-1]) : NULL;
-				struct ir3_instruction *right = (i < (n - 1)) ?
-						ssa(instr->regs[i+1]) : NULL;
-				set_neighbors(src_instr, left, right);
-				instr_find_neighbors(src_instr);
-			}
-		}
-	} else {
-		for (i = 1; i < instr->regs_count; i++) {
-			struct ir3_instruction *src_instr = ssa(instr->regs[i]);
-			if (src_instr)
-				instr_find_neighbors(src_instr);
-		}
-	}
-}
-
-static void block_find_neighbors(struct ir3_block *block)
-{
-	unsigned i;
-
-	for (i = 0; i < block->noutputs; i++) {
-		if (block->outputs[i]) {
-			struct ir3_instruction *instr = block->outputs[i];
-			instr_find_neighbors(instr);
-		}
-	}
-}
-
-static void instr_clear_neighbors(struct ir3_instruction *instr)
-{
-	unsigned i;
-
-	if (check_stop(instr))
-		return;
-
-	instr->cp.left_cnt = 0;
-	instr->cp.left = NULL;
-	instr->cp.right_cnt = 0;
-	instr->cp.right = NULL;
-
-	for (i = 1; i < instr->regs_count; i++) {
-		struct ir3_instruction *src_instr = ssa(instr->regs[i]);
-		if (src_instr)
-			instr_clear_neighbors(src_instr);
-	}
-}
-
-static void block_clear_neighbors(struct ir3_block *block)
-{
-	unsigned i;
-
-	for (i = 0; i < block->noutputs; i++) {
-		if (block->outputs[i]) {
-			struct ir3_instruction *instr = block->outputs[i];
-			instr_clear_neighbors(instr);
-		}
-	}
-}
-
 void ir3_block_cp(struct ir3_block *block)
 {
 	ir3_clear_mark(block->shader);
-	block_clear_neighbors(block);
-	ir3_clear_mark(block->shader);
-	block_find_neighbors(block);
-	ir3_clear_mark(block->shader);
 	block_cp(block);
 }
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_group.c b/src/gallium/drivers/freedreno/ir3/ir3_group.c
new file mode 100644
index 00000000000..f215c1c15d2
--- /dev/null
+++ b/src/gallium/drivers/freedreno/ir3/ir3_group.c
@@ -0,0 +1,228 @@
+/* -*- mode: C; c-file-style: "k&r"; tab-width: 4; indent-tabs-mode: t; -*- */
+
+/*
+ * Copyright (C) 2014 Rob Clark <[email protected]>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Authors:
+ *    Rob Clark <[email protected]>
+ */
+
+#include "freedreno_util.h"
+
+#include "ir3.h"
+
+/*
+ * Find/group instruction neighbors:
+ */
+
+/* stop condition for iteration: */
+static bool check_stop(struct ir3_instruction *instr)
+{
+	if (ir3_instr_check_mark(instr))
+		return true;
+
+	/* stay within the block.. don't try to operate across
+	 * basic block boundaries or we'll have problems when
+	 * dealing with multiple basic blocks:
+	 */
+	if (is_meta(instr) && (instr->opc == OPC_META_INPUT))
+		return true;
+
+	return false;
+}
+
+/* bleh.. we need to do the same group_n() thing for both inputs/outputs
+ * (where we have a simple instr[] array), and fanin nodes (where we have
+ * an extra indirection via reg->instr).
+ */
+struct group_ops {
+	struct ir3_instruction *(*get)(void *arr, int idx);
+	void (*set)(void *arr, int idx, struct ir3_instruction *instr);
+};
+
+static struct ir3_instruction *arr_get(void *arr, int idx)
+{
+	return ((struct ir3_instruction **)arr)[idx];
+}
+static void arr_set_out(void *arr, int idx, struct ir3_instruction *instr)
+{
+	((struct ir3_instruction **)arr)[idx] = instr;
+}
+static void arr_set_in(void *arr, int idx, struct ir3_instruction *instr)
+{
+	debug_printf("cannot insert mov before input!\n");
+	debug_assert(0);
+}
+static struct group_ops arr_ops_out = { arr_get, arr_set_out };
+static struct group_ops arr_ops_in = { arr_get, arr_set_in };
+
+static struct ir3_instruction *instr_get(void *arr, int idx)
+{
+	return ssa(((struct ir3_instruction *)arr)->regs[idx+1]);
+}
+static void instr_set(void *arr, int idx, struct ir3_instruction *instr)
+{
+	((struct ir3_instruction *)arr)->regs[idx+1]->instr = instr;
+}
+static struct group_ops instr_ops = { instr_get, instr_set };
+
+
+
+static bool conflicts(struct ir3_instruction *a, struct ir3_instruction *b)
+{
+	return (a && b) && (a != b);
+}
+
+static struct ir3_instruction *
+create_mov(struct ir3_instruction *instr)
+{
+	struct ir3_instruction *mov;
+
+	mov = ir3_instr_create(instr->block, 1, 0);
+	mov->cat1.src_type = TYPE_F32;
+	mov->cat1.dst_type = TYPE_F32;
+	ir3_reg_create(mov, 0, 0);    /* dst */
+	ir3_reg_create(mov, 0, IR3_REG_SSA)->instr = instr;
+
+	return mov;
+}
+
+static void group_n(struct group_ops *ops, void *arr, unsigned n)
+{
+	unsigned i, j;
+
+	/* first pass, figure out what has conflicts and needs a mov
+	 * inserted.  Do this up front, before starting to setup
+	 * left/right neighbor pointers.  Trying to do it in a single
+	 * pass could result in a situation where we can't even setup
+	 * the mov's right neighbor ptr if the next instr also needs
+	 * a mov.
+	 */
+restart:
+	for (i = 0; i < n; i++) {
+		struct ir3_instruction *instr = ops->get(arr, i);
+		if (instr) {
+			struct ir3_instruction *left = (i > 0) ? ops->get(arr, i - 1) : NULL;
+			struct ir3_instruction *right = (i < (n-1)) ? ops->get(arr, i + 1) : NULL;
+			bool conflict;
+
+			/* check for left/right neighbor conflicts: */
+			conflict = conflicts(instr->cp.left, left) ||
+				conflicts(instr->cp.right, right);
+
+			/* we also can't have an instr twice in the group: */
+			for (j = i + 1; (j < n) && !conflict; j++)
+				if (ops->get(arr, j) == instr)
+					conflict = true;
+
+			if (conflict) {
+				instr = create_mov(instr);
+				ops->set(arr, i, instr);
+				/* inserting the mov may have caused a conflict
+				 * against the previous:
+				 */
+				goto restart;
+			}
+		}
+	}
+
+	/* second pass, now that we've inserted mov's, fixup left/right
+	 * neighbors.  This is guaranteed to succeed, since by definition
+	 * the newly inserted mov's cannot conflict with anything.
+	 */
+	for (i = 0; i < n; i++) {
+		struct ir3_instruction *instr = ops->get(arr, i);
+		if (instr) {
+			struct ir3_instruction *left = (i > 0) ? ops->get(arr, i - 1) : NULL;
+			struct ir3_instruction *right = (i < (n-1)) ? ops->get(arr, i + 1) : NULL;
+
+			debug_assert(!conflicts(instr->cp.left, left));
+			if (left) {
+				instr->cp.left_cnt++;
+				instr->cp.left = left;
+			}
+
+			debug_assert(!conflicts(instr->cp.right, right));
+			if (right) {
+				instr->cp.right_cnt++;
+				instr->cp.right = right;
+			}
+		}
+	}
+}
+
+static void instr_find_neighbors(struct ir3_instruction *instr)
+{
+	unsigned i;
+
+	if (check_stop(instr))
+		return;
+
+	if (is_meta(instr) && (instr->opc == OPC_META_FI))
+		group_n(&instr_ops, instr, instr->regs_count - 1);
+
+	for (i = 1; i < instr->regs_count; i++) {
+		struct ir3_instruction *src_instr = ssa(instr->regs[i]);
+		if (src_instr)
+			instr_find_neighbors(src_instr);
+	}
+}
+
+static void block_find_neighbors(struct ir3_block *block)
+{
+	unsigned i;
+
+	for (i = 0; i < block->noutputs; i++) {
+		if (block->outputs[i]) {
+			struct ir3_instruction *instr = block->outputs[i];
+			instr_find_neighbors(instr);
+		}
+	}
+
+	/* shader inputs/outputs themselves must be contiguous as well:
+	 */
+	if (!block->parent) {
+		/* NOTE: group inputs first, since we only insert mov's
+		 * *before* the conflicted instr (and that would go badly
+		 * for inputs).  By doing inputs first, we should never
+		 * have a conflict on inputs.. pushing any conflict to
+		 * resolve to the outputs, for stuff like:
+		 *
+		 *     MOV OUT[n], IN[m].wzyx
+		 *
+		 * NOTE: we assume here inputs/outputs are grouped in vec4.
+		 * This logic won't quite cut it if smaller inputs/outputs
+		 * are not aligned on vec4 boundaries.
+		 */
+		for (i = 0; i < block->ninputs; i += 4)
+			group_n(&arr_ops_in, &block->inputs[i], 4);
+		for (i = 0; i < block->noutputs; i += 4)
+			group_n(&arr_ops_out, &block->outputs[i], 4);
+
+	}
+}
+
+void ir3_block_group(struct ir3_block *block)
+{
+	ir3_clear_mark(block->shader);
+	block_find_neighbors(block);
+}
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_ra.c b/src/gallium/drivers/freedreno/ir3/ir3_ra.c
index 611b5425466..08540466bb0 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_ra.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_ra.c
@@ -30,7 +30,6 @@
 #include "util/u_math.h"
 
 #include "ir3.h"
-#include "ir3_visitor.h"
 
 /*
  * Register Assignment:
@@ -53,7 +52,6 @@
 struct ir3_ra_ctx {
 	struct ir3_block *block;
 	enum shader_t type;
-	bool half_precision;
 	bool frag_coord;
 	bool frag_face;
 	int cnt;
@@ -81,6 +79,15 @@ struct ir3_ra_ctx {
 		} \
 	} while (0)
 
+#define ra_assert(ctx, x) do { \
+		debug_assert(x); \
+		if (!(x)) { \
+			debug_printf("RA: failed assert: %s\n", #x); \
+			(ctx)->error = true; \
+		}; \
+	} while (0)
+
+
 /* sorta ugly way to retrofit half-precision support.. rather than
  * passing extra param around, just OR in a high bit.  All the low
  * value arithmetic (ie. +/- offset within a contiguous vec4, etc)
@@ -89,19 +96,6 @@ struct ir3_ra_ctx {
  */
 #define REG_HALF  0x8000
 
-struct ir3_ra_assignment {
-	int8_t  off;        /* offset of instruction dst within range */
-	uint8_t num;        /* number of components for the range */
-};
-
-static void ra_assign(struct ir3_ra_ctx *ctx,
-		struct ir3_instruction *assigner, int num);
-static struct ir3_ra_assignment ra_calc(struct ir3_instruction *instr);
-
-/*
- * Register Allocation:
- */
-
 #define REG(n, wm, f) (struct ir3_register){ \
 		.flags  = (f), \
 		.num    = (n), \
@@ -117,19 +111,34 @@ static struct ir3_register * reg_check(struct ir3_instruction *instr, unsigned n
 	return NULL;
 }
 
-static int output_base(struct ir3_ra_ctx *ctx)
+/* figure out if an unassigned src register points back to the instr we
+ * are assigning:
+ */
+static bool instr_used_by(struct ir3_instruction *instr,
+		struct ir3_register *src)
 {
-	/* ugg, for fragment shader we need to have input at r0.x
-	 * (or at least if there is a way to configure it, I can't
-	 * see how because the blob driver always uses r0.x (ie.
-	 * all zeros)
-	 */
-	if (ctx->type == SHADER_FRAGMENT) {
-		if (ctx->half_precision)
-			return ctx->frag_face ? 4 : 3;
-		return ctx->frag_coord ? 8 : 4;
-	}
-	return 0;
+	struct ir3_instruction *src_instr = ssa(src);
+	unsigned i;
+	if (instr == src_instr)
+		return true;
+	if (src_instr && is_meta(src_instr))
+		for (i = 1; i < src_instr->regs_count; i++)
+			if (instr_used_by(instr, src_instr->regs[i]))
+				return true;
+
+	return false;
+}
+
+static bool instr_is_output(struct ir3_instruction *instr)
+{
+	struct ir3_block *block = instr->block;
+	unsigned i;
+
+	for (i = 0; i < block->noutputs; i++)
+		if (instr == block->outputs[i])
+			return true;
+
+	return false;
 }
 
 /* live means read before written */
@@ -137,100 +146,59 @@ static void compute_liveregs(struct ir3_ra_ctx *ctx,
 		struct ir3_instruction *instr, regmask_t *liveregs)
 {
 	struct ir3_block *block = instr->block;
+	struct ir3_instruction *n;
 	regmask_t written;
-	unsigned i, j;
+	unsigned i;
 
-	regmask_init(liveregs);
 	regmask_init(&written);
 
-	for (instr = instr->next; instr; instr = instr->next) {
+	for (n = instr->next; n; n = n->next) {
 		struct ir3_register *r;
 
-		if (is_meta(instr))
+		if (is_meta(n))
 			continue;
 
 		/* check first src's read: */
-		for (j = 1; j < instr->regs_count; j++) {
-			r = reg_check(instr, j);
+		for (i = 1; i < n->regs_count; i++) {
+			r = reg_check(n, i);
 			if (r)
 				regmask_set_if_not(liveregs, r, &written);
+
+			/* if any src points back to the instruction(s) in
+			 * the block of neighbors that we are assigning then
+			 * mark any written (clobbered) registers as live:
+			 */
+			if (instr_used_by(instr, n->regs[i]))
+				regmask_or(liveregs, liveregs, &written);
 		}
 
+		/* meta-instructions don't actually get scheduled,
+		 * so don't let it's write confuse us.. what we
+		 * really care about is when the src to the meta
+		 * instr was written:
+		 */
+		if (is_meta(n))
+			continue;
+
 		/* then dst written (if assigned already): */
-		if (instr->flags & IR3_INSTR_MARK) {
-			r = reg_check(instr, 0);
-			if (r)
+		r = reg_check(n, 0);
+		if (r) {
+			/* if an instruction *is* an output, then it is live */
+			if (!instr_is_output(n))
 				regmask_set(&written, r);
 		}
+
 	}
 
 	/* be sure to account for output registers too: */
 	for (i = 0; i < block->noutputs; i++) {
-		struct ir3_register reg = REG(output_base(ctx) + i, X, 0);
-		regmask_set_if_not(liveregs, &reg, &written);
-	}
-}
-
-/* calculate registers that are clobbered before last use of 'assigner'.
- * This needs to be done backwards, although it could possibly be
- * combined into compute_liveregs().  (Ie. compute_liveregs() could
- * reverse the list, then do this part backwards reversing the list
- * again back to original order.)  Otoh, probably I should try to
- * construct a proper interference graph instead.
- *
- * XXX this need to follow the same recursion path that is used for
- * to rename/assign registers (ie. ra_assign_src()).. this is a bit
- * ugly right now, maybe refactor into node iterator sort of things
- * that iterates nodes in the correct order?
- */
-static bool compute_clobbers(struct ir3_ra_ctx *ctx,
-		struct ir3_instruction *instr, struct ir3_instruction *assigner,
-		regmask_t *liveregs)
-{
-	unsigned i;
-	bool live = false, was_live = false;
-
-	if (instr == NULL) {
-		struct ir3_block *block = ctx->block;
-
-		/* if at the end, check outputs: */
-		for (i = 0; i < block->noutputs; i++)
-			if (block->outputs[i] == assigner)
-				return true;
-		return false;
-	}
-
-	for (i = 1; i < instr->regs_count; i++) {
-		struct ir3_register *reg = instr->regs[i];
-		if ((reg->flags & IR3_REG_SSA) && (reg->instr == assigner)) {
-			if (is_meta(instr)) {
-				switch (instr->opc) {
-				case OPC_META_INPUT:
-					// TODO
-					assert(0);
-					break;
-				case OPC_META_FO:
-				case OPC_META_FI:
-					was_live |= compute_clobbers(ctx, instr->next,
-							instr, liveregs);
-					break;
-				default:
-					break;
-				}
-			}
-			live = true;
-			break;
-		}
+		struct ir3_register *r;
+		if (!block->outputs[i])
+			continue;
+		r = reg_check(block->outputs[i], 0);
+		if (r)
+			regmask_set_if_not(liveregs, r, &written);
 	}
-
-	was_live |= compute_clobbers(ctx, instr->next, assigner, liveregs);
-
-	if (was_live && (instr->regs_count > 0) &&
-			(instr->flags & IR3_INSTR_MARK) &&
-			!is_meta(instr))
-		regmask_set(liveregs, instr->regs[0]);
-
-	return live || was_live;
 }
 
 static int find_available(regmask_t *liveregs, int size, bool half)
@@ -254,141 +222,39 @@ static int find_available(regmask_t *liveregs, int size, bool half)
 static int alloc_block(struct ir3_ra_ctx *ctx,
 		struct ir3_instruction *instr, int size)
 {
-	if (!instr) {
-		/* special case, allocating shader outputs.  At this
-		 * point, nothing is allocated, just start the shader
-		 * outputs at r0.x and let compute_liveregs() take
-		 * care of the rest from here:
-		 */
-		return 0;
-	} else {
-		struct ir3_register *dst = instr->regs[0];
-		regmask_t liveregs;
-
-		compute_liveregs(ctx, instr, &liveregs);
-
-		// XXX XXX XXX XXX XXX XXX XXX XXX XXX
-		// XXX hack.. maybe ra_calc should give us a list of
-		// instrs to compute_clobbers() on?
-		if (is_meta(instr) && (instr->opc == OPC_META_INPUT) &&
-				(instr->regs_count == 1)) {
-			unsigned i, base = instr->regs[0]->num & ~0x3;
-			for (i = 0; i < 4; i++) {
-				struct ir3_instruction *in = NULL;
-				if ((base + i) < ctx->block->ninputs)
-					in = ctx->block->inputs[base + i];
-				if (in)
-					compute_clobbers(ctx, in->next, in, &liveregs);
-			}
-		} else
-		// XXX XXX XXX XXX XXX XXX XXX XXX XXX
-		compute_clobbers(ctx, instr->next, instr, &liveregs);
-
-		return find_available(&liveregs, size,
-				!!(dst->flags & IR3_REG_HALF));
-	}
-}
-
-/*
- * Constraint Calculation:
- */
-
-struct ra_calc_visitor {
-	struct ir3_visitor base;
-	struct ir3_ra_assignment a;
-};
-
-static inline struct ra_calc_visitor *ra_calc_visitor(struct ir3_visitor *v)
-{
-	return (struct ra_calc_visitor *)v;
-}
-
-/* calculate register assignment for the instruction.  If the register
- * written by this instruction is required to be part of a range, to
- * handle other (input/output/sam/bary.f/etc) contiguous register range
- * constraints, that is calculated handled here.
- */
-static void ra_calc_dst(struct ir3_visitor *v,
-		struct ir3_instruction *instr, struct ir3_register *reg)
-{
-	struct ra_calc_visitor *c = ra_calc_visitor(v);
-	if (is_tex(instr)) {
-		c->a.off = 0;
-		c->a.num = 4;
-	} else {
-		c->a.off = 0;
-		c->a.num = 1;
-	}
-}
-
-static void
-ra_calc_dst_shader_input(struct ir3_visitor *v,
-		struct ir3_instruction *instr, struct ir3_register *reg)
-{
-	struct ra_calc_visitor *c = ra_calc_visitor(v);
-	struct ir3_block *block = instr->block;
 	struct ir3_register *dst = instr->regs[0];
-	unsigned base = dst->num & ~0x3;
-	unsigned i, num = 0;
-
-	assert(!(dst->flags & IR3_REG_IA));
-
-	/* check what input components we need: */
-	for (i = 0; i < 4; i++) {
-		unsigned idx = base + i;
-		if ((idx < block->ninputs) && block->inputs[idx])
-			num = i + 1;
-	}
-
-	c->a.off = dst->num - base;
-	c->a.num = num;
-}
-
-static void ra_calc_src_fanin(struct ir3_visitor *v,
-		struct ir3_instruction *instr, struct ir3_register *reg)
-{
-	struct ra_calc_visitor *c = ra_calc_visitor(v);
-	unsigned srcn = ir3_instr_regno(instr, reg) - 1;
-	c->a.off += srcn;
-	c->a.num += srcn;
-	c->a.num = MAX2(c->a.num, instr->regs_count - 1);
-}
-
-static const struct ir3_visitor_funcs calc_visitor_funcs = {
-		.instr = ir3_visit_instr,
-		.dst_shader_input = ra_calc_dst_shader_input,
-		.dst_fanout = ra_calc_dst,
-		.dst_fanin = ra_calc_dst,
-		.dst = ra_calc_dst,
-		.src_fanout = ir3_visit_reg,
-		.src_fanin = ra_calc_src_fanin,
-		.src = ir3_visit_reg,
-};
-
-static struct ir3_ra_assignment ra_calc(struct ir3_instruction *assigner)
-{
-	struct ra_calc_visitor v = {
-			.base.funcs = &calc_visitor_funcs,
-	};
+	struct ir3_instruction *n;
+	regmask_t liveregs;
+	unsigned name;
+
+	/* should only ever be called w/ head of neighbor list: */
+	debug_assert(!instr->cp.left);
+
+	regmask_init(&liveregs);
+
+	for (n = instr; n; n = n->cp.right)
+		compute_liveregs(ctx, n, &liveregs);
+
+	/* because we do assignment on fanout nodes for wrmask!=0x1, we
+	 * need to handle this special case, where the fanout nodes all
+	 * appear after one or more of the consumers of the src node:
+	 *
+	 *   0098:009: sam _, r2.x
+	 *   0028:010: mul.f r3.z, r4.x, c13.x
+	 *   ; we start assigning here for '0098:009: sam'.. but
+	 *   ; would miss the usage at '0028:010: mul.f'
+	 *   0101:009: _meta:fo _, _[0098:009: sam], off=2
+	 */
+	if (is_meta(instr) && (instr->opc == OPC_META_FO))
+		compute_liveregs(ctx, instr->regs[1]->instr, &liveregs);
 
-	ir3_visit_instr(&v.base, assigner);
+	name = find_available(&liveregs, size,
+			!!(dst->flags & IR3_REG_HALF));
 
-	return v.a;
-}
+	if (dst->flags & IR3_REG_HALF)
+		name |= REG_HALF;
 
-/*
- * Register Assignment:
- */
-
-struct ra_assign_visitor {
-	struct ir3_visitor base;
-	struct ir3_ra_ctx *ctx;
-	int num;
-};
-
-static inline struct ra_assign_visitor *ra_assign_visitor(struct ir3_visitor *v)
-{
-	return (struct ra_assign_visitor *)v;
+	return name;
 }
 
 static type_t half_type(type_t type)
@@ -459,17 +325,15 @@ static void fixup_half_instr_src(struct ir3_instruction *instr)
 	}
 }
 
-static void ra_assign_reg(struct ir3_visitor *v,
-		struct ir3_instruction *instr, struct ir3_register *reg)
+static void reg_assign(struct ir3_instruction *instr,
+		unsigned r, unsigned name)
 {
-	struct ra_assign_visitor *a = ra_assign_visitor(v);
+	struct ir3_register *reg = instr->regs[r];
 
 	reg->flags &= ~IR3_REG_SSA;
-	reg->num = a->num & ~REG_HALF;
+	reg->num = name & ~REG_HALF;
 
-	assert(reg->num >= 0);
-
-	if (a->num & REG_HALF) {
+	if (name & REG_HALF) {
 		reg->flags |= IR3_REG_HALF;
 		/* if dst reg being assigned, patch up the instr: */
 		if (reg == instr->regs[0])
@@ -479,192 +343,194 @@ static void ra_assign_reg(struct ir3_visitor *v,
 	}
 }
 
-static void ra_assign_dst_shader_input(struct ir3_visitor *v,
-		struct ir3_instruction *instr, struct ir3_register *reg)
+static void instr_assign(struct ir3_ra_ctx *ctx,
+		struct ir3_instruction *instr, unsigned name);
+
+static void instr_assign_src(struct ir3_ra_ctx *ctx,
+		struct ir3_instruction *instr, unsigned r, unsigned name)
 {
-	struct ra_assign_visitor *a = ra_assign_visitor(v);
-	unsigned i, base = reg->num & ~0x3;
-	int off = base - reg->num;
-
-	ra_assign_reg(v, instr, reg);
-	reg->flags |= IR3_REG_IA;
-
-	/* trigger assignment of all our companion input components: */
-	for (i = 0; i < 4; i++) {
-		struct ir3_instruction *in = NULL;
-		if ((base + i) < instr->block->ninputs)
-			in = instr->block->inputs[base + i];
-		if (in && is_meta(in) && (in->opc == OPC_META_INPUT))
-			ra_assign(a->ctx, in, a->num + off + i);
+	reg_assign(instr, r, name);
+
+	if (is_meta(instr)) {
+		switch (instr->opc) {
+		case OPC_META_INPUT:
+			/* shader-input does not have a src, only block input: */
+			debug_assert(instr->regs_count == 2);
+			instr_assign(ctx, instr, name);
+			return;
+		case OPC_META_FO:
+			instr_assign(ctx, instr, name + instr->fo.off);
+			return;
+		case OPC_META_FI:
+			instr_assign(ctx, instr, name - (r - 1));
+			return;
+		default:
+			break;
+		}
 	}
 }
 
-static void ra_assign_dst_fanout(struct ir3_visitor *v,
-		struct ir3_instruction *instr, struct ir3_register *reg)
+static void instr_assign(struct ir3_ra_ctx *ctx,
+		struct ir3_instruction *instr, unsigned name)
 {
-	struct ra_assign_visitor *a = ra_assign_visitor(v);
-	struct ir3_register *src = instr->regs[1];
-	ra_assign_reg(v, instr, reg);
-	if (src->flags & IR3_REG_SSA)
-		ra_assign(a->ctx, src->instr, a->num - instr->fo.off);
-}
+	struct ir3_instruction *n;
+	struct ir3_register *reg = instr->regs[0];
 
-static void ra_assign_src_fanout(struct ir3_visitor *v,
-		struct ir3_instruction *instr, struct ir3_register *reg)
-{
-	struct ra_assign_visitor *a = ra_assign_visitor(v);
-	ra_assign_reg(v, instr, reg);
-	ra_assign(a->ctx, instr, a->num + instr->fo.off);
-}
+	/* check if already assigned: */
+	if (!(reg->flags & IR3_REG_SSA)) {
+		/* ... and if so, sanity check: */
+		ra_assert(ctx, reg->num == (name & ~REG_HALF));
+		return;
+	}
 
+	/* rename this instruction's dst register: */
+	reg_assign(instr, 0, name);
 
-static void ra_assign_src_fanin(struct ir3_visitor *v,
-		struct ir3_instruction *instr, struct ir3_register *reg)
-{
-	struct ra_assign_visitor *a = ra_assign_visitor(v);
-	unsigned j, srcn = ir3_instr_regno(instr, reg) - 1;
-	ra_assign_reg(v, instr, reg);
-	ra_assign(a->ctx, instr, a->num - srcn);
-	for (j = 1; j < instr->regs_count; j++) {
-		struct ir3_register *reg = instr->regs[j];
-		if (reg->flags & IR3_REG_SSA)  /* could be renamed already */
-			ra_assign(a->ctx, reg->instr, a->num - srcn + j - 1);
+	/* and rename any subsequent use of result of this instr: */
+	for (n = instr->next; n && !ctx->error; n = n->next) {
+		unsigned i;
+
+		for (i = 1; i < n->regs_count; i++) {
+			reg = n->regs[i];
+			if ((reg->flags & IR3_REG_SSA) && (reg->instr == instr))
+				instr_assign_src(ctx, n, i, name);
+		}
 	}
-}
 
-static const struct ir3_visitor_funcs assign_visitor_funcs = {
-		.instr = ir3_visit_instr,
-		.dst_shader_input = ra_assign_dst_shader_input,
-		.dst_fanout = ra_assign_dst_fanout,
-		.dst_fanin = ra_assign_reg,
-		.dst = ra_assign_reg,
-		.src_fanout = ra_assign_src_fanout,
-		.src_fanin = ra_assign_src_fanin,
-		.src = ra_assign_reg,
-};
+	/* To simplify the neighbor logic, and to "avoid" dealing with
+	 * instructions which write more than one output, we actually
+	 * do register assignment for instructions that produce multiple
+	 * outputs on the fanout nodes and propagate up the assignment
+	 * to the actual instruction:
+	 */
+	if (is_meta(instr) && (instr->opc == OPC_META_FO)) {
+		struct ir3_instruction *src = ssa(instr->regs[1]);
+		debug_assert(name >= instr->fo.off);
+		if (src)
+			instr_assign(ctx, src, name - instr->fo.off);
+	}
+}
 
-static void ra_assign(struct ir3_ra_ctx *ctx,
-		struct ir3_instruction *assigner, int num)
+/* check neighbor list to see if it is already partially (or completely)
+ * assigned, in which case register block is already allocated and we
+ * just need to complete the assignment:
+ */
+static int check_partial_assignment(struct ir3_ra_ctx *ctx,
+		struct ir3_instruction *instr)
 {
-	struct ra_assign_visitor v = {
-			.base.funcs = &assign_visitor_funcs,
-			.ctx = ctx,
-			.num = num,
-	};
+	struct ir3_instruction *n;
+	int off = 0;
 
-	/* if we've already visited this instruction, bail now: */
-	if (ir3_instr_check_mark(assigner)) {
-		debug_assert(assigner->regs[0]->num == (num & ~REG_HALF));
-		if (assigner->regs[0]->num != (num & ~REG_HALF)) {
-			/* impossible situation, should have been resolved
-			 * at an earlier stage by inserting extra mov's:
-			 */
-			ctx->error = true;
+	debug_assert(!instr->cp.left);
+
+	for (n = instr; n; n = n->cp.right) {
+		struct ir3_register *dst = n->regs[0];
+		if (!(dst->flags & IR3_REG_SSA)) {
+			int name = dst->num - off;
+			debug_assert(name >= 0);
+			return name;
 		}
-		return;
+		off++;
 	}
 
-	ir3_visit_instr(&v.base, assigner);
+	return -1;
 }
 
-/*
- *
+/* allocate register name(s) for a list of neighboring instructions;
+ * instr should point to leftmost neighbor (head of list)
  */
-
-static void ir3_instr_ra(struct ir3_ra_ctx *ctx,
+static void instr_alloc_and_assign(struct ir3_ra_ctx *ctx,
 		struct ir3_instruction *instr)
 {
+	struct ir3_instruction *n;
 	struct ir3_register *dst;
-	unsigned num;
+	int name;
+
+	debug_assert(!instr->cp.left);
 
-	/* skip over nop's */
 	if (instr->regs_count == 0)
 		return;
 
 	dst = instr->regs[0];
 
-	/* if we've already visited this instruction, bail now: */
-	if (instr->flags & IR3_INSTR_MARK)
+	/* for instructions w/ fanouts, do the actual register assignment
+	 * on the group of fanout neighbor nodes and propagate the reg
+	 * name back up to the texture instruction.
+	 */
+	if (dst->wrmask != 0x1)
 		return;
 
+	name = check_partial_assignment(ctx, instr);
+
 	/* allocate register(s): */
-	if (is_addr(instr)) {
-		num = instr->regs[2]->num;
+	if (name >= 0) {
+		/* already partially assigned, just finish the job */
+	} else if (is_addr(instr)) {
+		debug_assert(!instr->cp.right);
+		name = instr->regs[2]->num;
 	} else if (reg_gpr(dst)) {
-		struct ir3_ra_assignment a;
-		a = ra_calc(instr);
-		num = alloc_block(ctx, instr, a.num) + a.off;
+		int size;
+		/* number of consecutive registers to assign: */
+		size = ir3_neighbor_count(instr);
+		if (dst->wrmask != 0x1)
+			size = MAX2(size, ffs(~dst->wrmask) - 1);
+		name = alloc_block(ctx, instr, size);
 	} else if (dst->flags & IR3_REG_ADDR) {
+		debug_assert(!instr->cp.right);
 		dst->flags &= ~IR3_REG_ADDR;
-		num = regid(REG_A0, 0) | REG_HALF;
+		name = regid(REG_A0, 0) | REG_HALF;
 	} else {
+		debug_assert(!instr->cp.right);
 		/* predicate register (p0).. etc */
-		num = regid(REG_P0, 0);
-		debug_assert(dst->num == num);
+		name = regid(REG_P0, 0);
+		debug_assert(dst->num == name);
 	}
 
-	ra_assign(ctx, instr, num);
+	ra_assert(ctx, name >= 0);
+
+	for (n = instr; n && !ctx->error; n = n->cp.right) {
+		instr_assign(ctx, n, name);
+		name++;
+	}
 }
 
 static int block_ra(struct ir3_ra_ctx *ctx, struct ir3_block *block)
 {
 	struct ir3_instruction *n;
 
-	ra_dump_list("before:\n", block->head);
-
-	if (!block->parent) {
-		unsigned i, j;
-		int base, off = output_base(ctx);
-
-		base = alloc_block(ctx, NULL, block->noutputs + off);
-
-		if (ctx->half_precision)
-			base |= REG_HALF;
-
-		for (i = 0; i < block->noutputs; i++)
-			if (block->outputs[i] && !is_kill(block->outputs[i]))
-				ra_assign(ctx, block->outputs[i], base + i + off);
-
-		if (ctx->type == SHADER_FRAGMENT) {
-			i = 0;
-			if (ctx->frag_face) {
-				/* if we have frag_face, it gets hr0.x */
-				ra_assign(ctx, block->inputs[i], REG_HALF | 0);
-				i += 4;
-			}
-			for (j = 0; i < block->ninputs; i++, j++)
-				if (block->inputs[i])
-					ra_assign(ctx, block->inputs[i], (base & ~REG_HALF) + j);
-		} else {
-			for (i = 0; i < block->ninputs; i++)
-				if (block->inputs[i])
-					ir3_instr_ra(ctx, block->inputs[i]);
+	/* frag shader inputs get pre-assigned, since we have some
+	 * constraints/unknowns about setup for some of these regs:
+	 */
+	if ((ctx->type == SHADER_FRAGMENT) && !block->parent) {
+		unsigned i = 0, j;
+		if (ctx->frag_face) {
+			/* if we have frag_face, it gets hr0.x */
+			instr_assign(ctx, block->inputs[i], REG_HALF | 0);
+			i += 4;
 		}
+		for (j = 0; i < block->ninputs; i++, j++)
+			if (block->inputs[i])
+				instr_assign(ctx, block->inputs[i], j);
 	}
 
-	ra_dump_list("after:\n", block->head);
+	ra_dump_list("-------\n", block->head);
 
-	/* then loop over instruction list and assign registers:
-	 */
-	for (n = block->head; n; n = n->next) {
+	for (n = block->head; n && !ctx->error; n = n->next) {
 		ra_dump_instr("ASSIGN: ", n);
-		ir3_instr_ra(ctx, n);
-		if (ctx->error)
-			return -1;
-		ra_dump_list("-------", block->head);
+		instr_alloc_and_assign(ctx, ir3_neighbor_first(n));
+		ra_dump_list("-------\n", block->head);
 	}
 
-	return 0;
+	return ctx->error ? -1 : 0;
 }
 
 int ir3_block_ra(struct ir3_block *block, enum shader_t type,
-		bool half_precision, bool frag_coord, bool frag_face)
+		bool frag_coord, bool frag_face)
 {
 	struct ir3_instruction *n;
 	struct ir3_ra_ctx ctx = {
 			.block = block,
 			.type = type,
-			.half_precision = half_precision,
 			.frag_coord = frag_coord,
 			.frag_face = frag_face,
 	};
@@ -672,6 +538,8 @@ int ir3_block_ra(struct ir3_block *block, enum shader_t type,
 
 	/* mark dst registers w/ SSA flag so we can see which
 	 * have been assigned so far:
+	 * NOTE: we really should set SSA flag consistently on
+	 * every dst register in the frontend.
 	 */
 	for (n = block->head; n; n = n->next)
 		if (n->regs_count > 0)
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_visitor.h b/src/gallium/drivers/freedreno/ir3/ir3_visitor.h
deleted file mode 100644
index 1c60d1620ca..00000000000
--- a/src/gallium/drivers/freedreno/ir3/ir3_visitor.h
+++ /dev/null
@@ -1,154 +0,0 @@
-/* -*- mode: C; c-file-style: "k&r"; tab-width 4; indent-tabs-mode: t; -*- */
-
-/*
- * Copyright (C) 2014 Rob Clark <[email protected]>
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- *
- * Authors:
- *    Rob Clark <[email protected]>
- */
-
-#ifndef IR3_VISITOR_H_
-#define IR3_VISITOR_H_
-
-/**
- * Visitor which follows dst to src relationships between instructions,
- * first visiting the dst (writer) instruction, followed by src (reader)
- * instruction(s).
- *
- * TODO maybe we want multiple different visitors to walk the
- * graph in different ways?
- */
-
-struct ir3_visitor;
-
-typedef void (*ir3_visit_instr_func)(struct ir3_visitor *v,
-		struct ir3_instruction *instr);
-
-typedef void (*ir3_visit_reg_func)(struct ir3_visitor *v,
-		struct ir3_instruction *instr, struct ir3_register *reg);
-
-struct ir3_visitor_funcs {
-	ir3_visit_instr_func instr;  // TODO do we need??
-
-	ir3_visit_reg_func dst_shader_input;
-	ir3_visit_reg_func dst_block_input;
-	ir3_visit_reg_func dst_fanout;
-	ir3_visit_reg_func dst_fanin;
-	ir3_visit_reg_func dst;
-
-	ir3_visit_reg_func src_block_input;
-	ir3_visit_reg_func src_fanout;
-	ir3_visit_reg_func src_fanin;
-	ir3_visit_reg_func src;
-};
-
-struct ir3_visitor {
-	const struct ir3_visitor_funcs *funcs;
-	bool error;
-};
-
-#include "util/u_debug.h"
-
-static void visit_instr_dst(struct ir3_visitor *v,
-		struct ir3_instruction *instr)
-{
-	struct ir3_register *reg = instr->regs[0];
-
-	if (is_meta(instr)) {
-		switch (instr->opc) {
-		case OPC_META_INPUT:
-			if (instr->regs_count == 1)
-				v->funcs->dst_shader_input(v, instr, reg);
-			else
-				v->funcs->dst_block_input(v, instr, reg);
-			return;
-		case OPC_META_FO:
-			v->funcs->dst_fanout(v, instr, reg);
-			return;
-		case OPC_META_FI:
-			v->funcs->dst_fanin(v, instr, reg);
-			return;
-		default:
-			break;
-
-		}
-	}
-
-	v->funcs->dst(v, instr, reg);
-}
-
-static void visit_instr_src(struct ir3_visitor *v,
-		struct ir3_instruction *instr, struct ir3_register *reg)
-{
-	if (is_meta(instr)) {
-		switch (instr->opc) {
-		case OPC_META_INPUT:
-			/* shader-input does not have a src, only block input: */
-			debug_assert(instr->regs_count == 2);
-			v->funcs->src_block_input(v, instr, reg);
-			return;
-		case OPC_META_FO:
-			v->funcs->src_fanout(v, instr, reg);
-			return;
-		case OPC_META_FI:
-			v->funcs->src_fanin(v, instr, reg);
-			return;
-		default:
-			break;
-
-		}
-	}
-
-	v->funcs->src(v, instr, reg);
-}
-
-static void ir3_visit_instr(struct ir3_visitor *v,
-		struct ir3_instruction *instr)
-{
-	struct ir3_instruction *n;
-
-	/* visit instruction that assigns value: */
-	if (instr->regs_count > 0)
-		visit_instr_dst(v, instr);
-
-	/* and of any following instructions which read that value: */
-	n = instr->next;
-	while (n && !v->error) {
-		unsigned i;
-
-		for (i = 1; i < n->regs_count; i++) {
-			struct ir3_register *reg = n->regs[i];
-			if ((reg->flags & IR3_REG_SSA) && (reg->instr == instr))
-				visit_instr_src(v, n, reg);
-		}
-
-		n = n->next;
-	}
-}
-
-static void ir3_visit_reg(struct ir3_visitor *v,
-		struct ir3_instruction *instr, struct ir3_register *reg)
-{
-	/* no-op */
-}
-
-#endif /* IR3_VISITOR_H_ */
author	Rob Clark <[email protected]>	2014-10-25 15:11:59 -0400
committer	Rob Clark <[email protected]>	2015-01-07 19:37:28 -0500
commit	9a9f2a893b5e29a77d66671191653f0b4261f546 (patch)
tree	cb18b6fe28568d5c1f728632b93f60e50bd2b203 /src
parent	dddfe6c21ee92f015b78060545f08239c331ceba (diff)