8 files changed, 622 insertions, 777 deletions
diff --git a/src/gallium/drivers/freedreno/Makefile.sources b/src/gallium/drivers/freedreno/Makefile.sources
index 1cae52905ef..592f4b4a3fa 100644
--- a/src/gallium/drivers/freedreno/Makefile.sources
+++ b/src/gallium/drivers/freedreno/Makefile.sources
@@ -127,10 +127,10 @@ ir3_SOURCES := \
 	ir3/ir3_depth.c \
 	ir3/ir3_dump.c \
 	ir3/ir3_flatten.c \
+	ir3/ir3_group.c \
 	ir3/ir3.h \
 	ir3/ir3_legalize.c \
 	ir3/ir3_ra.c \
 	ir3/ir3_sched.c \
 	ir3/ir3_shader.c \
-	ir3/ir3_shader.h \
-	ir3/ir3_visitor.h
+	ir3/ir3_shader.h
diff --git a/src/gallium/drivers/freedreno/ir3/ir3.h b/src/gallium/drivers/freedreno/ir3/ir3.h
index bd0c0a5b693..aaa0ff6efa8 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3.h
+++ b/src/gallium/drivers/freedreno/ir3/ir3.h
@@ -228,26 +228,62 @@ struct ir3_instruction {
 		 */
 #define DEPTH_UNUSED  ~0
 		unsigned depth;
-
-		/* Used just during cp stage, which comes before depth pass.
-		 * For fanin, where we need a sequence of consecutive registers,
-		 * keep track of each src instructions left (ie 'n-1') and right
-		 * (ie 'n+1') neighbor.  The front-end must insert enough mov's
-		 * to ensure that each instruction has at most one left and at
-		 * most one right neighbor.  During the copy-propagation pass,
-		 * we only remove mov's when we can preserve this constraint.
-		 */
-		struct {
-			struct ir3_instruction *left, *right;
-			uint16_t left_cnt, right_cnt;
-		} cp;
 	};
+
+	/* Used during CP and RA stages.  For fanin and shader inputs/
+	 * outputs where we need a sequence of consecutive registers,
+	 * keep track of each src instruction's left (ie 'n-1') and right
+	 * (ie 'n+1') neighbor.  The front-end must insert enough mov's
+	 * to ensure that each instruction has at most one left and at
+	 * most one right neighbor.  During the copy-propagation pass,
+	 * we only remove mov's when we can preserve this constraint.
+	 * And during the RA stage, we use the neighbor information to
+	 * allocate a block of registers in one shot.
+	 *
+	 * TODO: maybe just add something like:
+	 *   struct ir3_instruction_ref {
+	 *       struct ir3_instruction *instr;
+	 *       unsigned cnt;
+	 *   }
+	 *
+	 * Or can we get away without the refcnt stuff?  It seems like
+	 * it should be overkill..  the problem is if, potentially after
+	 * already eliminating some mov's, if you have a single mov that
+	 * needs to be grouped with its neighbors in two different
+	 * places (ex. shader output and a fanin).
+	 */
+	struct {
+		struct ir3_instruction *left, *right;
+		uint16_t left_cnt, right_cnt;
+	} cp;
 	struct ir3_instruction *next;
 #ifdef DEBUG
 	uint32_t serialno;
 #endif
 };
 
+static inline struct ir3_instruction *
+ir3_neighbor_first(struct ir3_instruction *instr)
+{
+	while (instr->cp.left)
+		instr = instr->cp.left;
+	return instr;
+}
+
+static inline int ir3_neighbor_count(struct ir3_instruction *instr)
+{
+	int num = 1;
+
+	debug_assert(!instr->cp.left);
+
+	while (instr->cp.right) {
+		num++;
+		instr = instr->cp.right;
+	}
+
+	return num;
+}
+
 struct ir3_heap_chunk;
 
 struct ir3 {
@@ -415,6 +451,15 @@ static inline bool writes_pred(struct ir3_instruction *instr)
 	return false;
 }
 
+/* returns defining instruction for reg */
+/* TODO better name */
+static inline struct ir3_instruction *ssa(struct ir3_register *reg)
+{
+	if (reg->flags & IR3_REG_SSA)
+		return reg->instr;
+	return NULL;
+}
+
 static inline bool reg_gpr(struct ir3_register *r)
 {
 	if (r->flags & (IR3_REG_CONST | IR3_REG_IMMED | IR3_REG_RELATIV | IR3_REG_ADDR))
@@ -443,12 +488,15 @@ void ir3_block_depth(struct ir3_block *block);
 /* copy-propagate: */
 void ir3_block_cp(struct ir3_block *block);
 
+/* group neighbors and insert mov's to resolve conflicts: */
+void ir3_block_group(struct ir3_block *block);
+
 /* scheduling: */
 int ir3_block_sched(struct ir3_block *block);
 
 /* register assignment: */
 int ir3_block_ra(struct ir3_block *block, enum shader_t type,
-		bool half_precision, bool frag_coord, bool frag_face);
+		bool frag_coord, bool frag_face);
 
 /* legalize: */
 void ir3_block_legalize(struct ir3_block *block,
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_cmdline.c b/src/gallium/drivers/freedreno/ir3/ir3_cmdline.c
index 081143d5d6e..6c334d200a3 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_cmdline.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_cmdline.c
@@ -61,8 +61,10 @@ static void dump_info(struct ir3_shader_variant *so, const char *str)
 
 if (block) {
 		for (i = 0; i < block->ninputs; i++) {
-			if (!block->inputs[i])
+			if (!block->inputs[i]) {
+				debug_printf("; in%d unused\n", i);
 				continue;
+			}
 			reg = block->inputs[i]->regs[0];
 			regid = reg->num;
 			debug_printf("@in(%sr%d.%c)\tin%d\n",
@@ -71,8 +73,10 @@ if (block) {
 		}
 
 		for (i = 0; i < block->noutputs; i++) {
-			if (!block->outputs[i])
+			if (!block->outputs[i]) {
+				debug_printf("; out%d unused\n", i);
 				continue;
+			}
 			/* kill shows up as a virtual output.. skip it! */
 			if (is_kill(block->outputs[i]))
 				continue;
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_compiler.c b/src/gallium/drivers/freedreno/ir3/ir3_compiler.c
index b47aa1d14d8..209621bd013 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_compiler.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_compiler.c
@@ -571,23 +571,40 @@ add_dst_reg_wrmask(struct ir3_compile_context *ctx,
 	} else if ((dst->File == TGSI_FILE_TEMPORARY) ||
 			(dst->File == TGSI_FILE_OUTPUT) ||
 			(dst->File == TGSI_FILE_ADDRESS)) {
+		struct ir3_instruction *prev = NULL;
 		unsigned i;
 
 		/* if instruction writes multiple, we need to create
 		 * some place-holder collect the registers:
 		 */
 		for (i = 0; i < 4; i++) {
-			if (wrmask & (1 << i)) {
-				struct ir3_instruction *collect =
-						ir3_instr_create(ctx->block, -1, OPC_META_FO);
-				collect->fo.off = i;
-				/* unused dst reg: */
-				ir3_reg_create(collect, 0, 0);
-				/* and src reg used to hold original instr */
-				ir3_reg_create(collect, 0, IR3_REG_SSA)->instr = instr;
-				if (!ctx->atomic)
-					ssa_dst(ctx, collect, dst, chan+i);
+			/* NOTE: slightly ugly that we setup neighbor ptrs
+			 * for FO here, but handle FI in CP pass.. we should
+			 * probably just always setup neighbor ptrs in the
+			 * frontend?
+			 */
+			struct ir3_instruction *split =
+					ir3_instr_create(ctx->block, -1, OPC_META_FO);
+			split->fo.off = i;
+			/* unused dst reg: */
+			/* NOTE: set SSA flag on dst here, because unused FO's
+			 * which don't get scheduled will end up not in the
+			 * instruction list when RA sets SSA flag on each dst.
+			 * Slight hack.  We really should set SSA flag on
+			 * every dst register in the frontend.
+			 */
+			ir3_reg_create(split, 0, IR3_REG_SSA);
+			/* and src reg used to hold original instr */
+			ir3_reg_create(split, 0, IR3_REG_SSA)->instr = instr;
+			if (prev) {
+				split->cp.left = prev;
+				split->cp.left_cnt++;
+				prev->cp.right = split;
+				prev->cp.right_cnt++;
 			}
+			if ((wrmask & (1 << i)) && !ctx->atomic)
+				ssa_dst(ctx, split, dst, chan+i);
+			prev = split;
 		}
 	}
 
@@ -3120,6 +3137,17 @@ ir3_compile_shader(struct ir3_shader_variant *so,
 		}
 	}
 
+	/* if we want half-precision outputs, mark the output registers
+	 * as half:
+	 */
+	if (key.half_precision) {
+		for (i = 0; i < block->noutputs; i++) {
+			if (!block->outputs[i])
+				continue;
+			block->outputs[i]->regs[0]->flags |= IR3_REG_HALF;
+		}
+	}
+
 	/* at this point, we want the kill's in the outputs array too,
 	 * so that they get scheduled (since they have no dst).. we've
 	 * already ensured that the array is big enough in push_block():
@@ -3145,9 +3173,26 @@ ir3_compile_shader(struct ir3_shader_variant *so,
 		ir3_dump_instr_list(block->head);
 	}
 
+	ir3_block_depth(block);
+
+	/* First remove all the extra mov's (which we could skip if the
+	 * front-end was clever enough not to insert them in the first
+	 * place).  Then figure out left/right neighbors, re-inserting
+	 * extra mov's when needed to avoid conflicts.
+	 */
 	if (cp && !(fd_mesa_debug & FD_DBG_NOCP))
 		ir3_block_cp(block);
 
+	if (fd_mesa_debug & FD_DBG_OPTMSGS) {
+		printf("BEFORE GROUPING:\n");
+		ir3_dump_instr_list(block->head);
+	}
+
+	/* Group left/right neighbors, inserting mov's where needed to
+	 * solve conflicts:
+	 */
+	ir3_block_group(block);
+
 	if (fd_mesa_debug & FD_DBG_OPTDUMP)
 		compile_dump(&ctx);
 
@@ -3169,20 +3214,19 @@ ir3_compile_shader(struct ir3_shader_variant *so,
 		ir3_dump_instr_list(block->head);
 	}
 
-	ret = ir3_block_ra(block, so->type, key.half_precision,
-			so->frag_coord, so->frag_face);
+	ret = ir3_block_ra(block, so->type, so->frag_coord, so->frag_face);
 	if (ret) {
 		DBG("RA failed!");
 		goto out;
 	}
 
-	ir3_block_legalize(block, &so->has_samp, &max_bary);
-
 	if (fd_mesa_debug & FD_DBG_OPTMSGS) {
 		printf("AFTER RA:\n");
 		ir3_dump_instr_list(block->head);
 	}
 
+	ir3_block_legalize(block, &so->has_samp, &max_bary);
+
 	/* fixup input/outputs: */
 	for (i = 0; i < so->outputs_count; i++) {
 		so->outputs[i].regid = block->outputs[i*4]->regs[0]->num;
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_cp.c b/src/gallium/drivers/freedreno/ir3/ir3_cp.c
index 2076b62acb8..c55425d68d4 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_cp.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_cp.c
@@ -33,69 +33,14 @@
 /*
  * Copy Propagate:
  *
+ * We could eventually drop this, if the front-end did not insert any
+ * mov's..  For now keeping it as a separate pass since that is less
+ * painful than updating the existing frontend.  It is expected that
+ * with an eventual new NIR based frontend that we won't need this.
  */
 
 static void block_cp(struct ir3_block *block);
-static struct ir3_instruction * instr_cp(struct ir3_instruction *instr, bool keep);
-
-/* XXX move this somewhere useful (and rename?) */
-static struct ir3_instruction *ssa(struct ir3_register *reg)
-{
-	if (reg->flags & IR3_REG_SSA)
-		return reg->instr;
-	return NULL;
-}
-
-static bool conflicts(struct ir3_instruction *a, struct ir3_instruction *b)
-{
-	return (a && b) && (a != b);
-}
-
-static void set_neighbors(struct ir3_instruction *instr,
-		struct ir3_instruction *left, struct ir3_instruction *right)
-{
-	debug_assert(!conflicts(instr->cp.left, left));
-	if (left) {
-		instr->cp.left_cnt++;
-		instr->cp.left = left;
-	}
-	debug_assert(!conflicts(instr->cp.right, right));
-	if (right) {
-		instr->cp.right_cnt++;
-		instr->cp.right = right;
-	}
-}
-
-/* remove neighbor reference, clearing left/right neighbor ptrs when
- * there are no more references:
- */
-static void remove_neighbors(struct ir3_instruction *instr)
-{
-	if (instr->cp.left) {
-		if (--instr->cp.left_cnt == 0)
-			instr->cp.left = NULL;
-	}
-	if (instr->cp.right) {
-		if (--instr->cp.right_cnt == 0)
-			instr->cp.right = NULL;
-	}
-}
-
-/* stop condition for iteration: */
-static bool check_stop(struct ir3_instruction *instr)
-{
-	if (ir3_instr_check_mark(instr))
-		return true;
-
-	/* stay within the block.. don't try to operate across
-	 * basic block boundaries or we'll have problems when
-	 * dealing with multiple basic blocks:
-	 */
-	if (is_meta(instr) && (instr->opc == OPC_META_INPUT))
-		return true;
-
-	return false;
-}
+static struct ir3_instruction * instr_cp(struct ir3_instruction *instr);
 
 static bool is_eligible_mov(struct ir3_instruction *instr)
 {
@@ -109,23 +54,17 @@ static bool is_eligible_mov(struct ir3_instruction *instr)
 		/* TODO: propagate abs/neg modifiers if possible */
 		if (src->flags & (IR3_REG_ABS | IR3_REG_NEGATE | IR3_REG_RELATIV))
 			return false;
-		if (src_instr) {
-			/* check that eliminating the move won't result in
-			 * a neighbor conflict, ie. if an instruction feeds
-			 * into multiple fanins it can still only have at
-			 * most one left and one right neighbor:
-			 */
-			if (conflicts(instr->cp.left, src_instr->cp.left))
-				return false;
-			if (conflicts(instr->cp.right, src_instr->cp.right))
-				return false;
-			return true;
-		}
+		if (!src_instr)
+			return false;
+		/* TODO: remove this hack: */
+		if (is_meta(src_instr) && (src_instr->opc == OPC_META_FO))
+			return false;
+		return true;
 	}
 	return false;
 }
 
-static void walk_children(struct ir3_instruction *instr, bool keep)
+static void walk_children(struct ir3_instruction *instr)
 {
 	unsigned i;
 
@@ -133,188 +72,56 @@ static void walk_children(struct ir3_instruction *instr, bool keep)
 	for (i = 1; i < instr->regs_count; i++) {
 		struct ir3_register *src = instr->regs[i];
 		if (src->flags & IR3_REG_SSA)
-			src->instr = instr_cp(src->instr, keep);
+			src->instr = instr_cp(src->instr);
 	}
 }
 
-static struct ir3_instruction *
-instr_cp_fanin(struct ir3_instruction *instr)
-{
-	unsigned i, j;
-
-	/* we need to handle fanin specially, to detect cases
-	 * when we need to keep a mov
-	 */
-
-	for (i = 1; i < instr->regs_count; i++) {
-		struct ir3_register *src = instr->regs[i];
-		if (src->flags & IR3_REG_SSA) {
-			struct ir3_instruction *cand =
-					instr_cp(src->instr, false);
-
-			/* if the candidate is a fanout, then keep
-			 * the move.
-			 *
-			 * This is a bit, um, fragile, but it should
-			 * catch the extra mov's that the front-end
-			 * puts in for us already in these cases.
-			 */
-			if (is_meta(cand) && (cand->opc == OPC_META_FO))
-				cand = instr_cp(src->instr, true);
-
-			/* we can't have 2 registers referring to the same instruction, so
-			 * go through and check if any already refer to the candidate
-			 * instruction. if so, don't do the propagation.
-			 *
-			 * NOTE: we need to keep this, despite the neighbor
-			 * conflict checks, to avoid A<->B<->A..
-			 */
-			for (j = 1; j < instr->regs_count; j++)
-				if (instr->regs[j]->instr == cand)
-					break;
-			if (j == instr->regs_count)
-				src->instr = cand;
-		}
-	}
-
-	walk_children(instr, false);
-
-	return instr;
-}
 
 static struct ir3_instruction *
-instr_cp(struct ir3_instruction *instr, bool keep)
+instr_cp(struct ir3_instruction *instr)
 {
-	/* if we've already visited this instruction, bail now: */
-	if (check_stop(instr))
+	/* stay within the block.. don't try to operate across
+	 * basic block boundaries or we'll have problems when
+	 * dealing with multiple basic blocks:
+	 */
+	if (is_meta(instr) && (instr->opc == OPC_META_INPUT))
 		return instr;
 
-	if (is_meta(instr) && (instr->opc == OPC_META_FI))
-		return instr_cp_fanin(instr);
-
-	if (!keep && is_eligible_mov(instr)) {
+	if (is_eligible_mov(instr)) {
 		struct ir3_instruction *src_instr = ssa(instr->regs[1]);
-		set_neighbors(src_instr, instr->cp.left, instr->cp.right);
-		remove_neighbors(instr);
-		return instr_cp(src_instr, false);
+		return instr_cp(src_instr);
 	}
 
-	walk_children(instr, false);
+	/* Check termination condition before walking children (rather
+	 * than before checking eligible-mov).  A mov instruction may
+	 * appear as ssa-src for multiple other instructions, and we
+	 * want to consider it for removal for each, rather than just
+	 * the first one.  (But regardless of how many places it shows
+	 * up as a src, we only need to recursively walk the children
+	 * once.)
+	 */
+	if (!ir3_instr_check_mark(instr))
+		walk_children(instr);
 
 	return instr;
 }
 
 static void block_cp(struct ir3_block *block)
 {
-	unsigned i, j;
+	unsigned i;
 
 	for (i = 0; i < block->noutputs; i++) {
 		if (block->outputs[i]) {
 			struct ir3_instruction *out =
-					instr_cp(block->outputs[i], false);
-
-			/* To deal with things like this:
-			 *
-			 *   43: MOV OUT[2], TEMP[5]
-			 *   44: MOV OUT[0], TEMP[5]
-			 *
-			 * we need to ensure that no two outputs point to
-			 * the same instruction
-			 */
-			for (j = 0; j < i; j++) {
-				if (block->outputs[j] == out) {
-					out = instr_cp(block->outputs[i], true);
-					break;
-				}
-			}
+					instr_cp(block->outputs[i]);
 
 			block->outputs[i] = out;
 		}
 	}
 }
 
-/*
- * Find instruction neighbors:
- */
-
-static void instr_find_neighbors(struct ir3_instruction *instr)
-{
-	unsigned i;
-
-	if (check_stop(instr))
-		return;
-
-	if (is_meta(instr) && (instr->opc == OPC_META_FI)) {
-		unsigned n = instr->regs_count;
-		for (i = 1; i < n; i++) {
-			struct ir3_instruction *src_instr = ssa(instr->regs[i]);
-			if (src_instr) {
-				struct ir3_instruction *left = (i > 1) ?
-						ssa(instr->regs[i-1]) : NULL;
-				struct ir3_instruction *right = (i < (n - 1)) ?
-						ssa(instr->regs[i+1]) : NULL;
-				set_neighbors(src_instr, left, right);
-				instr_find_neighbors(src_instr);
-			}
-		}
-	} else {
-		for (i = 1; i < instr->regs_count; i++) {
-			struct ir3_instruction *src_instr = ssa(instr->regs[i]);
-			if (src_instr)
-				instr_find_neighbors(src_instr);
-		}
-	}
-}
-
-static void block_find_neighbors(struct ir3_block *block)
-{
-	unsigned i;
-
-	for (i = 0; i < block->noutputs; i++) {
-		if (block->outputs[i]) {
-			struct ir3_instruction *instr = block->outputs[i];
-			instr_find_neighbors(instr);
-		}
-	}
-}
-
-static void instr_clear_neighbors(struct ir3_instruction *instr)
-{
-	unsigned i;
-
-	if (check_stop(instr))
-		return;
-
-	instr->cp.left_cnt = 0;
-	instr->cp.left = NULL;
-	instr->cp.right_cnt = 0;
-	instr->cp.right = NULL;
-
-	for (i = 1; i < instr->regs_count; i++) {
-		struct ir3_instruction *src_instr = ssa(instr->regs[i]);
-		if (src_instr)
-			instr_clear_neighbors(src_instr);
-	}
-}
-
-static void block_clear_neighbors(struct ir3_block *block)
-{
-	unsigned i;
-
-	for (i = 0; i < block->noutputs; i++) {
-		if (block->outputs[i]) {
-			struct ir3_instruction *instr = block->outputs[i];
-			instr_clear_neighbors(instr);
-		}
-	}
-}
-
 void ir3_block_cp(struct ir3_block *block)
 {
 	ir3_clear_mark(block->shader);
-	block_clear_neighbors(block);
-	ir3_clear_mark(block->shader);
-	block_find_neighbors(block);
-	ir3_clear_mark(block->shader);
 	block_cp(block);
 }
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_group.c b/src/gallium/drivers/freedreno/ir3/ir3_group.c
new file mode 100644
index 00000000000..f215c1c15d2
--- /dev/null
+++ b/src/gallium/drivers/freedreno/ir3/ir3_group.c
@@ -0,0 +1,228 @@
+/* -*- mode: C; c-file-style: "k&r"; tab-width: 4; indent-tabs-mode: t; -*- */
+
+/*
+ * Copyright (C) 2014 Rob Clark <[email protected]>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Authors:
+ *    Rob Clark <[email protected]>
+ */
+
+#include "freedreno_util.h"
+
+#include "ir3.h"
+
+/*
+ * Find/group instruction neighbors:
+ */
+
+/* stop condition for iteration: */
+static bool check_stop(struct ir3_instruction *instr)
+{
+	if (ir3_instr_check_mark(instr))
+		return true;
+
+	/* stay within the block.. don't try to operate across
+	 * basic block boundaries or we'll have problems when
+	 * dealing with multiple basic blocks:
+	 */
+	if (is_meta(instr) && (instr->opc == OPC_META_INPUT))
+		return true;
+
+	return false;
+}
+
+/* bleh.. we need to do the same group_n() thing for both inputs/outputs
+ * (where we have a simple instr[] array), and fanin nodes (where we have
+ * an extra indirection via reg->instr).
+ */
+struct group_ops {
+	struct ir3_instruction *(*get)(void *arr, int idx);
+	void (*set)(void *arr, int idx, struct ir3_instruction *instr);
+};
+
+static struct ir3_instruction *arr_get(void *arr, int idx)
+{
+	return ((struct ir3_instruction **)arr)[idx];
+}
+static void arr_set_out(void *arr, int idx, struct ir3_instruction *instr)
+{
+	((struct ir3_instruction **)arr)[idx] = instr;
+}
+static void arr_set_in(void *arr, int idx, struct ir3_instruction *instr)
+{
+	debug_printf("cannot insert mov before input!\n");
+	debug_assert(0);
+}
+static struct group_ops arr_ops_out = { arr_get, arr_set_out };
+static struct group_ops arr_ops_in = { arr_get, arr_set_in };
+
+static struct ir3_instruction *instr_get(void *arr, int idx)
+{
+	return ssa(((struct ir3_instruction *)arr)->regs[idx+1]);
+}
+static void instr_set(void *arr, int idx, struct ir3_instruction *instr)
+{
+	((struct ir3_instruction *)arr)->regs[idx+1]->instr = instr;
+}
+static struct group_ops instr_ops = { instr_get, instr_set };
+
+
+
+static bool conflicts(struct ir3_instruction *a, struct ir3_instruction *b)
+{
+	return (a && b) && (a != b);
+}
+
+static struct ir3_instruction *
+create_mov(struct ir3_instruction *instr)
+{
+	struct ir3_instruction *mov;
+
+	mov = ir3_instr_create(instr->block, 1, 0);
+	mov->cat1.src_type = TYPE_F32;
+	mov->cat1.dst_type = TYPE_F32;
+	ir3_reg_create(mov, 0, 0);    /* dst */
+	ir3_reg_create(mov, 0, IR3_REG_SSA)->instr = instr;
+
+	return mov;
+}
+
+/* Make the n entries of arr (accessed through ops) a run of left/right
+ * neighbors, replacing any entry that conflicts with a mov of itself.
+ */
+static void group_n(struct group_ops *ops, void *arr, unsigned n)
+{
+	unsigned i, j;
+
+	/* first pass, figure out what has conflicts and needs a mov
+	 * inserted.  Do this up front, before starting to setup
+	 * left/right neighbor pointers.  Trying to do it in a single
+	 * pass could result in a situation where we can't even setup
+	 * the mov's right neighbor ptr if the next instr also needs a mov.
+	 */
+restart:
+	for (i = 0; i < n; i++) {
+		struct ir3_instruction *instr = ops->get(arr, i);
+		if (instr) {
+			struct ir3_instruction *left = (i > 0) ? ops->get(arr, i - 1) : NULL;
+			struct ir3_instruction *right = (i < (n-1)) ? ops->get(arr, i + 1) : NULL;
+			bool conflict;
+
+			/* check for left/right neighbor conflicts: */
+			conflict = conflicts(instr->cp.left, left) ||
+				conflicts(instr->cp.right, right);
+			/* we also can't have an instr twice in the group: */
+			for (j = i + 1; (j < n) && !conflict; j++)
+				if (ops->get(arr, j) == instr)
+					conflict = true;
+			if (conflict) {
+				instr = create_mov(instr);
+				ops->set(arr, i, instr);
+				/* inserting the mov may have caused a conflict against
+				 * the previous, so rescan.. but only if set() actually
+				 * stored it (arr_set_in does not) or we'd loop forever:
+				 */
+				if (ops->get(arr, i) == instr)
+					goto restart;
+			}
+		}
+	}
+
+	/* second pass, now that we've inserted mov's, fixup left/right
+	 * neighbors.  This is guaranteed to succeed, since by definition
+	 * the newly inserted mov's cannot conflict with anything.
+	 */
+	for (i = 0; i < n; i++) {
+		struct ir3_instruction *instr = ops->get(arr, i);
+		if (instr) {
+			struct ir3_instruction *left = (i > 0) ? ops->get(arr, i - 1) : NULL;
+			struct ir3_instruction *right = (i < (n-1)) ? ops->get(arr, i + 1) : NULL;
+			debug_assert(!conflicts(instr->cp.left, left));
+			if (left) {
+				instr->cp.left_cnt++;
+				instr->cp.left = left;
+			}
+			debug_assert(!conflicts(instr->cp.right, right));
+			if (right) {
+				instr->cp.right_cnt++;
+				instr->cp.right = right;
+			}
+		}
+	}
+}
+
+static void instr_find_neighbors(struct ir3_instruction *instr)
+{
+	unsigned i;
+
+	if (check_stop(instr))
+		return;
+
+	if (is_meta(instr) && (instr->opc == OPC_META_FI))
+		group_n(&instr_ops, instr, instr->regs_count - 1);
+
+	for (i = 1; i < instr->regs_count; i++) {
+		struct ir3_instruction *src_instr = ssa(instr->regs[i]);
+		if (src_instr)
+			instr_find_neighbors(src_instr);
+	}
+}
+
+/* Group fanin srcs reachable from the block outputs, then (only when
+ * the block has no parent) the shader inputs/outputs themselves.
+ */
+static void block_find_neighbors(struct ir3_block *block)
+{
+	unsigned i;
+
+	for (i = 0; i < block->noutputs; i++)
+		if (block->outputs[i])
+			instr_find_neighbors(block->outputs[i]);
+
+	/* shader inputs/outputs themselves must be contiguous as well: */
+	if (!block->parent) {
+		/* NOTE: group inputs first, since we only insert mov's
+		 * *before* the conflicted instr (and that would go badly
+		 * for inputs).  By doing inputs first, we should never
+		 * have a conflict on inputs.. pushing any conflict to
+		 * resolve to the outputs, for stuff like:
+		 *
+		 *     MOV OUT[n], IN[m].wzyx
+		 *
+		 * NOTE: inputs/outputs are grouped in vec4.  If the count
+		 * is not a multiple of 4, the last group is clamped so we
+		 * never look past the end of the array.
+		 */
+		for (i = 0; i < block->ninputs; i += 4)
+			group_n(&arr_ops_in, &block->inputs[i],
+					(block->ninputs - i < 4) ? block->ninputs - i : 4);
+		for (i = 0; i < block->noutputs; i += 4)
+			group_n(&arr_ops_out, &block->outputs[i],
+					(block->noutputs - i < 4) ? block->noutputs - i : 4);
+	}
+}
+
+void ir3_block_group(struct ir3_block *block)
+{
+	ir3_clear_mark(block->shader);
+	block_find_neighbors(block);
+}
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_ra.c b/src/gallium/drivers/freedreno/ir3/ir3_ra.c
index 611b5425466..08540466bb0 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_ra.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_ra.c
@@ -30,7 +30,6 @@
 #include "util/u_math.h"
 
 #include "ir3.h"
-#include "ir3_visitor.h"
 
 /*
  * Register Assignment:
@@ -53,7 +52,6 @@
 struct ir3_ra_ctx {
 	struct ir3_block *block;
 	enum shader_t type;
-	bool half_precision;
 	bool frag_coord;
 	bool frag_face;
 	int cnt;
@@ -81,6 +79,15 @@ struct ir3_ra_ctx {
 		} \
 	} while (0)
 
+/* debug_assert() that additionally records the failure in ctx->error: */
+#define ra_assert(ctx, x) do { \
+		debug_assert(x); \
+		if (!(x)) { \
+			debug_printf("RA: failed assert: %s\n", #x); \
+			(ctx)->error = true; \
+		} \
+	} while (0)
+
 /* sorta ugly way to retrofit half-precision support.. rather than
  * passing extra param around, just OR in a high bit.  All the low
  * value arithmetic (ie. +/- offset within a contiguous vec4, etc)
@@ -89,19 +96,6 @@ struct ir3_ra_ctx {
  */
 #define REG_HALF  0x8000
 
-struct ir3_ra_assignment {
-	int8_t  off;        /* offset of instruction dst within range */
-	uint8_t num;        /* number of components for the range */
-};
-
-static void ra_assign(struct ir3_ra_ctx *ctx,
-		struct ir3_instruction *assigner, int num);
-static struct ir3_ra_assignment ra_calc(struct ir3_instruction *instr);
-
-/*
- * Register Allocation:
- */
-
 #define REG(n, wm, f) (struct ir3_register){ \
 		.flags  = (f), \
 		.num    = (n), \
@@ -117,19 +111,34 @@ static struct ir3_register * reg_check(struct ir3_instruction *instr, unsigned n
 	return NULL;
 }
 
-static int output_base(struct ir3_ra_ctx *ctx)
+/* figure out if an unassigned src register points back to the instr we
+ * are assigning:
+ */
+static bool instr_used_by(struct ir3_instruction *instr,
+		struct ir3_register *src)
 {
-	/* ugg, for fragment shader we need to have input at r0.x
-	 * (or at least if there is a way to configure it, I can't
-	 * see how because the blob driver always uses r0.x (ie.
-	 * all zeros)
-	 */
-	if (ctx->type == SHADER_FRAGMENT) {
-		if (ctx->half_precision)
-			return ctx->frag_face ? 4 : 3;
-		return ctx->frag_coord ? 8 : 4;
-	}
-	return 0;
+	struct ir3_instruction *src_instr = ssa(src);
+	unsigned i;
+	if (instr == src_instr)
+		return true;
+	if (src_instr && is_meta(src_instr))
+		for (i = 1; i < src_instr->regs_count; i++)
+			if (instr_used_by(instr, src_instr->regs[i]))
+				return true;
+
+	return false;
+}
+
+static bool instr_is_output(struct ir3_instruction *instr)
+{
+	struct ir3_block *block = instr->block;
+	unsigned i;
+
+	for (i = 0; i < block->noutputs; i++)
+		if (instr == block->outputs[i])
+			return true;
+
+	return false;
 }
 
 /* live means read before written */
@@ -137,100 +146,59 @@ static void compute_liveregs(struct ir3_ra_ctx *ctx,
 		struct ir3_instruction *instr, regmask_t *liveregs)
 {
 	struct ir3_block *block = instr->block;
+	struct ir3_instruction *n;
 	regmask_t written;
-	unsigned i, j;
+	unsigned i;
 
-	regmask_init(liveregs);
 	regmask_init(&written);
 
-	for (instr = instr->next; instr; instr = instr->next) {
+	for (n = instr->next; n; n = n->next) {
 		struct ir3_register *r;
 
-		if (is_meta(instr))
+		if (is_meta(n))
 			continue;
 
 		/* check first src's read: */
-		for (j = 1; j < instr->regs_count; j++) {
-			r = reg_check(instr, j);
+		for (i = 1; i < n->regs_count; i++) {
+			r = reg_check(n, i);
 			if (r)
 				regmask_set_if_not(liveregs, r, &written);
+
+			/* if any src points back to the instruction(s) in
+			 * the block of neighbors that we are assigning then
+			 * mark any written (clobbered) registers as live:
+			 */
+			if (instr_used_by(instr, n->regs[i]))
+				regmask_or(liveregs, liveregs, &written);
 		}
 
+		/* meta-instructions don't actually get scheduled,
+		 * so don't let it's write confuse us.. what we
+		 * really care about is when the src to the meta
+		 * instr was written:
+		 */
+		if (is_meta(n))
+			continue;
+
 		/* then dst written (if assigned already): */
-		if (instr->flags & IR3_INSTR_MARK) {
-			r = reg_check(instr, 0);
-			if (r)
+		r = reg_check(n, 0);
+		if (r) {
+			/* if an instruction *is* an output, then it is live */
+			if (!instr_is_output(n))
 				regmask_set(&written, r);
 		}
+
 	}
 
 	/* be sure to account for output registers too: */
 	for (i = 0; i < block->noutputs; i++) {
-		struct ir3_register reg = REG(output_base(ctx) + i, X, 0);
-		regmask_set_if_not(liveregs, &reg, &written);
-	}
-}
-
-/* calculate registers that are clobbered before last use of 'assigner'.
- * This needs to be done backwards, although it could possibly be
- * combined into compute_liveregs().  (Ie. compute_liveregs() could
- * reverse the list, then do this part backwards reversing the list
- * again back to original order.)  Otoh, probably I should try to
- * construct a proper interference graph instead.
- *
- * XXX this need to follow the same recursion path that is used for
- * to rename/assign registers (ie. ra_assign_src()).. this is a bit
- * ugly right now, maybe refactor into node iterator sort of things
- * that iterates nodes in the correct order?
- */
-static bool compute_clobbers(struct ir3_ra_ctx *ctx,
-		struct ir3_instruction *instr, struct ir3_instruction *assigner,
-		regmask_t *liveregs)
-{
-	unsigned i;
-	bool live = false, was_live = false;
-
-	if (instr == NULL) {
-		struct ir3_block *block = ctx->block;
-
-		/* if at the end, check outputs: */
-		for (i = 0; i < block->noutputs; i++)
-			if (block->outputs[i] == assigner)
-				return true;
-		return false;
-	}
-
-	for (i = 1; i < instr->regs_count; i++) {
-		struct ir3_register *reg = instr->regs[i];
-		if ((reg->flags & IR3_REG_SSA) && (reg->instr == assigner)) {
-			if (is_meta(instr)) {
-				switch (instr->opc) {
-				case OPC_META_INPUT:
-					// TODO
-					assert(0);
-					break;
-				case OPC_META_FO:
-				case OPC_META_FI:
-					was_live |= compute_clobbers(ctx, instr->next,
-							instr, liveregs);
-					break;
-				default:
-					break;
-				}
-			}
-			live = true;
-			break;
-		}
+		struct ir3_register *r;
+		if (!block->outputs[i])
+			continue;
+		r = reg_check(block->outputs[i], 0);
+		if (r)
+			regmask_set_if_not(liveregs, r, &written);
 	}
-
-	was_live |= compute_clobbers(ctx, instr->next, assigner, liveregs);
-
-	if (was_live && (instr->regs_count > 0) &&
-			(instr->flags & IR3_INSTR_MARK) &&
-			!is_meta(instr))
-		regmask_set(liveregs, instr->regs[0]);
-
-	return live || was_live;
 }
 
 static int find_available(regmask_t *liveregs, int size, bool half)
@@ -254,141 +222,39 @@ static int find_available(regmask_t *liveregs, int size, bool half)
 static int alloc_block(struct ir3_ra_ctx *ctx,
 		struct ir3_instruction *instr, int size)
 {
-	if (!instr) {
-		/* special case, allocating shader outputs.  At this
-		 * point, nothing is allocated, just start the shader
-		 * outputs at r0.x and let compute_liveregs() take
-		 * care of the rest from here:
-		 */
-		return 0;
-	} else {
-		struct ir3_register *dst = instr->regs[0];
-		regmask_t liveregs;
-
-		compute_liveregs(ctx, instr, &liveregs);
-
-		// XXX XXX XXX XXX XXX XXX XXX XXX XXX
-		// XXX hack.. maybe ra_calc should give us a list of
-		// instrs to compute_clobbers() on?
-		if (is_meta(instr) && (instr->opc == OPC_META_INPUT) &&
-				(instr->regs_count == 1)) {
-			unsigned i, base = instr->regs[0]->num & ~0x3;
-			for (i = 0; i < 4; i++) {
-				struct ir3_instruction *in = NULL;
-				if ((base + i) < ctx->block->ninputs)
-					in = ctx->block->inputs[base + i];
-				if (in)
-					compute_clobbers(ctx, in->next, in, &liveregs);
-			}
-		} else
-		// XXX XXX XXX XXX XXX XXX XXX XXX XXX
-		compute_clobbers(ctx, instr->next, instr, &liveregs);
-
-		return find_available(&liveregs, size,
-				!!(dst->flags & IR3_REG_HALF));
-	}
-}
-
-/*
- * Constraint Calculation:
- */
-
-struct ra_calc_visitor {
-	struct ir3_visitor base;
-	struct ir3_ra_assignment a;
-};
-
-static inline struct ra_calc_visitor *ra_calc_visitor(struct ir3_visitor *v)
-{
-	return (struct ra_calc_visitor *)v;
-}
-
-/* calculate register assignment for the instruction.  If the register
- * written by this instruction is required to be part of a range, to
- * handle other (input/output/sam/bary.f/etc) contiguous register range
- * constraints, that is calculated handled here.
- */
-static void ra_calc_dst(struct ir3_visitor *v,
-		struct ir3_instruction *instr, struct ir3_register *reg)
-{
-	struct ra_calc_visitor *c = ra_calc_visitor(v);
-	if (is_tex(instr)) {
-		c->a.off = 0;
-		c->a.num = 4;
-	} else {
-		c->a.off = 0;
-		c->a.num = 1;
-	}
-}
-
-static void
-ra_calc_dst_shader_input(struct ir3_visitor *v,
-		struct ir3_instruction *instr, struct ir3_register *reg)
-{
-	struct ra_calc_visitor *c = ra_calc_visitor(v);
-	struct ir3_block *block = instr->block;
 	struct ir3_register *dst = instr->regs[0];
-	unsigned base = dst->num & ~0x3;
-	unsigned i, num = 0;
-
-	assert(!(dst->flags & IR3_REG_IA));
-
-	/* check what input components we need: */
-	for (i = 0; i < 4; i++) {
-		unsigned idx = base + i;
-		if ((idx < block->ninputs) && block->inputs[idx])
-			num = i + 1;
-	}
-
-	c->a.off = dst->num - base;
-	c->a.num = num;
-}
-
-static void ra_calc_src_fanin(struct ir3_visitor *v,
-		struct ir3_instruction *instr, struct ir3_register *reg)
-{
-	struct ra_calc_visitor *c = ra_calc_visitor(v);
-	unsigned srcn = ir3_instr_regno(instr, reg) - 1;
-	c->a.off += srcn;
-	c->a.num += srcn;
-	c->a.num = MAX2(c->a.num, instr->regs_count - 1);
-}
-
-static const struct ir3_visitor_funcs calc_visitor_funcs = {
-		.instr = ir3_visit_instr,
-		.dst_shader_input = ra_calc_dst_shader_input,
-		.dst_fanout = ra_calc_dst,
-		.dst_fanin = ra_calc_dst,
-		.dst = ra_calc_dst,
-		.src_fanout = ir3_visit_reg,
-		.src_fanin = ra_calc_src_fanin,
-		.src = ir3_visit_reg,
-};
-
-static struct ir3_ra_assignment ra_calc(struct ir3_instruction *assigner)
-{
-	struct ra_calc_visitor v = {
-			.base.funcs = &calc_visitor_funcs,
-	};
+	struct ir3_instruction *n;
+	regmask_t liveregs;
+	unsigned name;
+
+	/* should only ever be called w/ head of neighbor list: */
+	debug_assert(!instr->cp.left);
+
+	regmask_init(&liveregs);
+
+	for (n = instr; n; n = n->cp.right)
+		compute_liveregs(ctx, n, &liveregs);
+
+	/* because we do assignment on fanout nodes for wrmask!=0x1, we
+	 * need to handle this special case, where the fanout nodes all
+	 * appear after one or more of the consumers of the src node:
+	 *
+	 *   0098:009: sam _, r2.x
+	 *   0028:010: mul.f r3.z, r4.x, c13.x
+	 *   ; we start assigning here for '0098:009: sam'.. but
+	 *   ; would miss the usage at '0028:010: mul.f'
+	 *   0101:009: _meta:fo _, _[0098:009: sam], off=2
+	 */
+	if (is_meta(instr) && (instr->opc == OPC_META_FO))
+		compute_liveregs(ctx, instr->regs[1]->instr, &liveregs);
 
-	ir3_visit_instr(&v.base, assigner);
+	name = find_available(&liveregs, size,
+			!!(dst->flags & IR3_REG_HALF));
 
-	return v.a;
-}
+	if (dst->flags & IR3_REG_HALF)
+		name |= REG_HALF;
 
-/*
- * Register Assignment:
- */
-
-struct ra_assign_visitor {
-	struct ir3_visitor base;
-	struct ir3_ra_ctx *ctx;
-	int num;
-};
-
-static inline struct ra_assign_visitor *ra_assign_visitor(struct ir3_visitor *v)
-{
-	return (struct ra_assign_visitor *)v;
+	return name;
 }
 
 static type_t half_type(type_t type)
@@ -459,17 +325,15 @@ static void fixup_half_instr_src(struct ir3_instruction *instr)
 	}
 }
 
-static void ra_assign_reg(struct ir3_visitor *v,
-		struct ir3_instruction *instr, struct ir3_register *reg)
+static void reg_assign(struct ir3_instruction *instr,
+		unsigned r, unsigned name)
 {
-	struct ra_assign_visitor *a = ra_assign_visitor(v);
+	struct ir3_register *reg = instr->regs[r];
 
 	reg->flags &= ~IR3_REG_SSA;
-	reg->num = a->num & ~REG_HALF;
+	reg->num = name & ~REG_HALF;
 
-	assert(reg->num >= 0);
-
-	if (a->num & REG_HALF) {
+	if (name & REG_HALF) {
 		reg->flags |= IR3_REG_HALF;
 		/* if dst reg being assigned, patch up the instr: */
 		if (reg == instr->regs[0])
@@ -479,192 +343,194 @@ static void ra_assign_reg(struct ir3_visitor *v,
 	}
 }
 
-static void ra_assign_dst_shader_input(struct ir3_visitor *v,
-		struct ir3_instruction *instr, struct ir3_register *reg)
+static void instr_assign(struct ir3_ra_ctx *ctx,
+		struct ir3_instruction *instr, unsigned name);
+
+static void instr_assign_src(struct ir3_ra_ctx *ctx,
+		struct ir3_instruction *instr, unsigned r, unsigned name)
 {
-	struct ra_assign_visitor *a = ra_assign_visitor(v);
-	unsigned i, base = reg->num & ~0x3;
-	int off = base - reg->num;
-
-	ra_assign_reg(v, instr, reg);
-	reg->flags |= IR3_REG_IA;
-
-	/* trigger assignment of all our companion input components: */
-	for (i = 0; i < 4; i++) {
-		struct ir3_instruction *in = NULL;
-		if ((base + i) < instr->block->ninputs)
-			in = instr->block->inputs[base + i];
-		if (in && is_meta(in) && (in->opc == OPC_META_INPUT))
-			ra_assign(a->ctx, in, a->num + off + i);
+	reg_assign(instr, r, name);
+
+	if (is_meta(instr)) {
+		switch (instr->opc) {
+		case OPC_META_INPUT:
+			/* shader-input does not have a src, only block input: */
+			debug_assert(instr->regs_count == 2);
+			instr_assign(ctx, instr, name);
+			return;
+		case OPC_META_FO:
+			instr_assign(ctx, instr, name + instr->fo.off);
+			return;
+		case OPC_META_FI:
+			instr_assign(ctx, instr, name - (r - 1));
+			return;
+		default:
+			break;
+		}
 	}
 }
 
-static void ra_assign_dst_fanout(struct ir3_visitor *v,
-		struct ir3_instruction *instr, struct ir3_register *reg)
+static void instr_assign(struct ir3_ra_ctx *ctx,
+		struct ir3_instruction *instr, unsigned name)
 {
-	struct ra_assign_visitor *a = ra_assign_visitor(v);
-	struct ir3_register *src = instr->regs[1];
-	ra_assign_reg(v, instr, reg);
-	if (src->flags & IR3_REG_SSA)
-		ra_assign(a->ctx, src->instr, a->num - instr->fo.off);
-}
+	struct ir3_instruction *n;
+	struct ir3_register *reg = instr->regs[0];
 
-static void ra_assign_src_fanout(struct ir3_visitor *v,
-		struct ir3_instruction *instr, struct ir3_register *reg)
-{
-	struct ra_assign_visitor *a = ra_assign_visitor(v);
-	ra_assign_reg(v, instr, reg);
-	ra_assign(a->ctx, instr, a->num + instr->fo.off);
-}
+	/* check if already assigned: */
+	if (!(reg->flags & IR3_REG_SSA)) {
+		/* ... and if so, sanity check: */
+		ra_assert(ctx, reg->num == (name & ~REG_HALF));
+		return;
+	}
 
+	/* rename this instruction's dst register: */
+	reg_assign(instr, 0, name);
 
-static void ra_assign_src_fanin(struct ir3_visitor *v,
-		struct ir3_instruction *instr, struct ir3_register *reg)
-{
-	struct ra_assign_visitor *a = ra_assign_visitor(v);
-	unsigned j, srcn = ir3_instr_regno(instr, reg) - 1;
-	ra_assign_reg(v, instr, reg);
-	ra_assign(a->ctx, instr, a->num - srcn);
-	for (j = 1; j < instr->regs_count; j++) {
-		struct ir3_register *reg = instr->regs[j];
-		if (reg->flags & IR3_REG_SSA)  /* could be renamed already */
-			ra_assign(a->ctx, reg->instr, a->num - srcn + j - 1);
+	/* and rename any subsequent use of result of this instr: */
+	for (n = instr->next; n && !ctx->error; n = n->next) {
+		unsigned i;
+
+		for (i = 1; i < n->regs_count; i++) {
+			reg = n->regs[i];
+			if ((reg->flags & IR3_REG_SSA) && (reg->instr == instr))
+				instr_assign_src(ctx, n, i, name);
+		}
 	}
-}
 
-static const struct ir3_visitor_funcs assign_visitor_funcs = {
-		.instr = ir3_visit_instr,
-		.dst_shader_input = ra_assign_dst_shader_input,
-		.dst_fanout = ra_assign_dst_fanout,
-		.dst_fanin = ra_assign_reg,
-		.dst = ra_assign_reg,
-		.src_fanout = ra_assign_src_fanout,
-		.src_fanin = ra_assign_src_fanin,
-		.src = ra_assign_reg,
-};
+	/* To simplify the neighbor logic, and to "avoid" dealing with
+	 * instructions which write more than one output, we actually
+	 * do register assignment for instructions that produce multiple
+	 * outputs on the fanout nodes and propagate up the assignment
+	 * to the actual instruction:
+	 */
+	if (is_meta(instr) && (instr->opc == OPC_META_FO)) {
+		struct ir3_instruction *src = ssa(instr->regs[1]);
+		debug_assert(name >= instr->fo.off);
+		if (src)
+			instr_assign(ctx, src, name - instr->fo.off);
+	}
+}
 
-static void ra_assign(struct ir3_ra_ctx *ctx,
-		struct ir3_instruction *assigner, int num)
+/* check neighbor list to see if it is already partially (or completely)
+ * assigned, in which case register block is already allocated and we
+ * just need to complete the assignment:
+ */
+static int check_partial_assignment(struct ir3_ra_ctx *ctx,
+		struct ir3_instruction *instr)
 {
-	struct ra_assign_visitor v = {
-			.base.funcs = &assign_visitor_funcs,
-			.ctx = ctx,
-			.num = num,
-	};
+	struct ir3_instruction *n;
+	int off = 0;
 
-	/* if we've already visited this instruction, bail now: */
-	if (ir3_instr_check_mark(assigner)) {
-		debug_assert(assigner->regs[0]->num == (num & ~REG_HALF));
-		if (assigner->regs[0]->num != (num & ~REG_HALF)) {
-			/* impossible situation, should have been resolved
-			 * at an earlier stage by inserting extra mov's:
-			 */
-			ctx->error = true;
+	debug_assert(!instr->cp.left);
+
+	for (n = instr; n; n = n->cp.right) {
+		struct ir3_register *dst = n->regs[0];
+		if (!(dst->flags & IR3_REG_SSA)) {
+			int name = dst->num - off;
+			debug_assert(name >= 0);
+			return name;
 		}
-		return;
+		off++;
 	}
 
-	ir3_visit_instr(&v.base, assigner);
+	return -1;
 }
 
-/*
- *
+/* allocate register name(s) for a list of neighboring instructions;
+ * instr should point to leftmost neighbor (head of list)
  */
-
-static void ir3_instr_ra(struct ir3_ra_ctx *ctx,
+static void instr_alloc_and_assign(struct ir3_ra_ctx *ctx,
 		struct ir3_instruction *instr)
 {
+	struct ir3_instruction *n;
 	struct ir3_register *dst;
-	unsigned num;
+	int name;
+
+	debug_assert(!instr->cp.left);
 
-	/* skip over nop's */
 	if (instr->regs_count == 0)
 		return;
 
 	dst = instr->regs[0];
 
-	/* if we've already visited this instruction, bail now: */
-	if (instr->flags & IR3_INSTR_MARK)
+	/* for instructions w/ fanouts, do the actual register assignment
+	 * on the group of fanout neighbor nodes and propagate the reg
+	 * name back up to the texture instruction.
+	 */
+	if (dst->wrmask != 0x1)
 		return;
 
+	name = check_partial_assignment(ctx, instr);
+
 	/* allocate register(s): */
-	if (is_addr(instr)) {
-		num = instr->regs[2]->num;
+	if (name >= 0) {
+		/* already partially assigned, just finish the job */
+	} else if (is_addr(instr)) {
+		debug_assert(!instr->cp.right);
+		name = instr->regs[2]->num;
 	} else if (reg_gpr(dst)) {
-		struct ir3_ra_assignment a;
-		a = ra_calc(instr);
-		num = alloc_block(ctx, instr, a.num) + a.off;
+		int size;
+		/* number of consecutive registers to assign: */
+		size = ir3_neighbor_count(instr);
+		if (dst->wrmask != 0x1)
+			size = MAX2(size, ffs(~dst->wrmask) - 1);
+		name = alloc_block(ctx, instr, size);
 	} else if (dst->flags & IR3_REG_ADDR) {
+		debug_assert(!instr->cp.right);
 		dst->flags &= ~IR3_REG_ADDR;
-		num = regid(REG_A0, 0) | REG_HALF;
+		name = regid(REG_A0, 0) | REG_HALF;
 	} else {
+		debug_assert(!instr->cp.right);
 		/* predicate register (p0).. etc */
-		num = regid(REG_P0, 0);
-		debug_assert(dst->num == num);
+		name = regid(REG_P0, 0);
+		debug_assert(dst->num == name);
 	}
 
-	ra_assign(ctx, instr, num);
+	ra_assert(ctx, name >= 0);
+
+	for (n = instr; n && !ctx->error; n = n->cp.right) {
+		instr_assign(ctx, n, name);
+		name++;
+	}
 }
 
 static int block_ra(struct ir3_ra_ctx *ctx, struct ir3_block *block)
 {
 	struct ir3_instruction *n;
 
-	ra_dump_list("before:\n", block->head);
-
-	if (!block->parent) {
-		unsigned i, j;
-		int base, off = output_base(ctx);
-
-		base = alloc_block(ctx, NULL, block->noutputs + off);
-
-		if (ctx->half_precision)
-			base |= REG_HALF;
-
-		for (i = 0; i < block->noutputs; i++)
-			if (block->outputs[i] && !is_kill(block->outputs[i]))
-				ra_assign(ctx, block->outputs[i], base + i + off);
-
-		if (ctx->type == SHADER_FRAGMENT) {
-			i = 0;
-			if (ctx->frag_face) {
-				/* if we have frag_face, it gets hr0.x */
-				ra_assign(ctx, block->inputs[i], REG_HALF | 0);
-				i += 4;
-			}
-			for (j = 0; i < block->ninputs; i++, j++)
-				if (block->inputs[i])
-					ra_assign(ctx, block->inputs[i], (base & ~REG_HALF) + j);
-		} else {
-			for (i = 0; i < block->ninputs; i++)
-				if (block->inputs[i])
-					ir3_instr_ra(ctx, block->inputs[i]);
+	/* frag shader inputs get pre-assigned, since we have some
+	 * constraints/unknowns about setup for some of these regs:
+	 */
+	if ((ctx->type == SHADER_FRAGMENT) && !block->parent) {
+		unsigned i = 0, j;
+		if (ctx->frag_face) {
+			/* if we have frag_face, it gets hr0.x */
+			instr_assign(ctx, block->inputs[i], REG_HALF | 0);
+			i += 4;
 		}
+		for (j = 0; i < block->ninputs; i++, j++)
+			if (block->inputs[i])
+				instr_assign(ctx, block->inputs[i], j);
 	}
 
-	ra_dump_list("after:\n", block->head);
+	ra_dump_list("-------\n", block->head);
 
-	/* then loop over instruction list and assign registers:
-	 */
-	for (n = block->head; n; n = n->next) {
+	for (n = block->head; n && !ctx->error; n = n->next) {
 		ra_dump_instr("ASSIGN: ", n);
-		ir3_instr_ra(ctx, n);
-		if (ctx->error)
-			return -1;
-		ra_dump_list("-------", block->head);
+		instr_alloc_and_assign(ctx, ir3_neighbor_first(n));
+		ra_dump_list("-------\n", block->head);
 	}
 
-	return 0;
+	return ctx->error ? -1 : 0;
 }
 
 int ir3_block_ra(struct ir3_block *block, enum shader_t type,
-		bool half_precision, bool frag_coord, bool frag_face)
+		bool frag_coord, bool frag_face)
 {
 	struct ir3_instruction *n;
 	struct ir3_ra_ctx ctx = {
 			.block = block,
 			.type = type,
-			.half_precision = half_precision,
 			.frag_coord = frag_coord,
 			.frag_face = frag_face,
 	};
@@ -672,6 +538,8 @@ int ir3_block_ra(struct ir3_block *block, enum shader_t type,
 
 	/* mark dst registers w/ SSA flag so we can see which
 	 * have been assigned so far:
+	 * NOTE: we really should set SSA flag consistently on
+	 * every dst register in the frontend.
 	 */
 	for (n = block->head; n; n = n->next)
 		if (n->regs_count > 0)
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_visitor.h b/src/gallium/drivers/freedreno/ir3/ir3_visitor.h
deleted file mode 100644
index 1c60d1620ca..00000000000
--- a/src/gallium/drivers/freedreno/ir3/ir3_visitor.h
+++ /dev/null
@@ -1,154 +0,0 @@
-/* -*- mode: C; c-file-style: "k&r"; tab-width 4; indent-tabs-mode: t; -*- */
-
-/*
- * Copyright (C) 2014 Rob Clark <[email protected]>
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- *
- * Authors:
- *    Rob Clark <[email protected]>
- */
-
-#ifndef IR3_VISITOR_H_
-#define IR3_VISITOR_H_
-
-/**
- * Visitor which follows dst to src relationships between instructions,
- * first visiting the dst (writer) instruction, followed by src (reader)
- * instruction(s).
- *
- * TODO maybe we want multiple different visitors to walk the
- * graph in different ways?
- */
-
-struct ir3_visitor;
-
-typedef void (*ir3_visit_instr_func)(struct ir3_visitor *v,
-		struct ir3_instruction *instr);
-
-typedef void (*ir3_visit_reg_func)(struct ir3_visitor *v,
-		struct ir3_instruction *instr, struct ir3_register *reg);
-
-struct ir3_visitor_funcs {
-	ir3_visit_instr_func instr;  // TODO do we need??
-
-	ir3_visit_reg_func dst_shader_input;
-	ir3_visit_reg_func dst_block_input;
-	ir3_visit_reg_func dst_fanout;
-	ir3_visit_reg_func dst_fanin;
-	ir3_visit_reg_func dst;
-
-	ir3_visit_reg_func src_block_input;
-	ir3_visit_reg_func src_fanout;
-	ir3_visit_reg_func src_fanin;
-	ir3_visit_reg_func src;
-};
-
-struct ir3_visitor {
-	const struct ir3_visitor_funcs *funcs;
-	bool error;
-};
-
-#include "util/u_debug.h"
-
-static void visit_instr_dst(struct ir3_visitor *v,
-		struct ir3_instruction *instr)
-{
-	struct ir3_register *reg = instr->regs[0];
-
-	if (is_meta(instr)) {
-		switch (instr->opc) {
-		case OPC_META_INPUT:
-			if (instr->regs_count == 1)
-				v->funcs->dst_shader_input(v, instr, reg);
-			else
-				v->funcs->dst_block_input(v, instr, reg);
-			return;
-		case OPC_META_FO:
-			v->funcs->dst_fanout(v, instr, reg);
-			return;
-		case OPC_META_FI:
-			v->funcs->dst_fanin(v, instr, reg);
-			return;
-		default:
-			break;
-
-		}
-	}
-
-	v->funcs->dst(v, instr, reg);
-}
-
-static void visit_instr_src(struct ir3_visitor *v,
-		struct ir3_instruction *instr, struct ir3_register *reg)
-{
-	if (is_meta(instr)) {
-		switch (instr->opc) {
-		case OPC_META_INPUT:
-			/* shader-input does not have a src, only block input: */
-			debug_assert(instr->regs_count == 2);
-			v->funcs->src_block_input(v, instr, reg);
-			return;
-		case OPC_META_FO:
-			v->funcs->src_fanout(v, instr, reg);
-			return;
-		case OPC_META_FI:
-			v->funcs->src_fanin(v, instr, reg);
-			return;
-		default:
-			break;
-
-		}
-	}
-
-	v->funcs->src(v, instr, reg);
-}
-
-static void ir3_visit_instr(struct ir3_visitor *v,
-		struct ir3_instruction *instr)
-{
-	struct ir3_instruction *n;
-
-	/* visit instruction that assigns value: */
-	if (instr->regs_count > 0)
-		visit_instr_dst(v, instr);
-
-	/* and of any following instructions which read that value: */
-	n = instr->next;
-	while (n && !v->error) {
-		unsigned i;
-
-		for (i = 1; i < n->regs_count; i++) {
-			struct ir3_register *reg = n->regs[i];
-			if ((reg->flags & IR3_REG_SSA) && (reg->instr == instr))
-				visit_instr_src(v, n, reg);
-		}
-
-		n = n->next;
-	}
-}
-
-static void ir3_visit_reg(struct ir3_visitor *v,
-		struct ir3_instruction *instr, struct ir3_register *reg)
-{
-	/* no-op */
-}
-
-#endif /* IR3_VISITOR_H_ */