Diffstat (limited to 'src')
-rw-r--r--   src/gallium/drivers/freedreno/ir3/ir3.c              |  60
-rw-r--r--   src/gallium/drivers/freedreno/ir3/ir3.h              |  92
-rw-r--r--   src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c | 428
-rw-r--r--   src/gallium/drivers/freedreno/ir3/ir3_cp.c           |  14
-rw-r--r--   src/gallium/drivers/freedreno/ir3/ir3_depth.c        |  14
-rw-r--r--   src/gallium/drivers/freedreno/ir3/ir3_group.c        |   7
-rw-r--r--   src/gallium/drivers/freedreno/ir3/ir3_legalize.c     | 184
-rw-r--r--   src/gallium/drivers/freedreno/ir3/ir3_print.c        |  38
-rw-r--r--   src/gallium/drivers/freedreno/ir3/ir3_ra.c           | 226
-rw-r--r--   src/gallium/drivers/freedreno/ir3/ir3_sched.c        |  88
10 files changed, 1025 insertions, 126 deletions
diff --git a/src/gallium/drivers/freedreno/ir3/ir3.c b/src/gallium/drivers/freedreno/ir3/ir3.c index ba5851c6c82..a166b67d7cf 100644 --- a/src/gallium/drivers/freedreno/ir3/ir3.c +++ b/src/gallium/drivers/freedreno/ir3/ir3.c @@ -80,6 +80,8 @@ struct ir3 * ir3_create(struct ir3_compiler *compiler, shader->noutputs = nout; shader->outputs = ir3_alloc(shader, sizeof(shader->outputs[0]) * nout); + list_inithead(&shader->block_list); + return shader; } @@ -548,7 +550,6 @@ static int (*emit[])(struct ir3_instruction *instr, void *ptr, void * ir3_assemble(struct ir3 *shader, struct ir3_info *info, uint32_t gpu_id) { - struct ir3_block *block = shader->block; uint32_t *ptr, *dwords; info->gpu_id = gpu_id; @@ -558,8 +559,10 @@ void * ir3_assemble(struct ir3 *shader, struct ir3_info *info, info->instrs_count = 0; info->sizedwords = 0; - list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) { - info->sizedwords += 2; + list_for_each_entry (struct ir3_block, block, &shader->block_list, node) { + list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) { + info->sizedwords += 2; + } } /* need a integer number of instruction "groups" (sets of 16 @@ -574,12 +577,14 @@ void * ir3_assemble(struct ir3 *shader, struct ir3_info *info, ptr = dwords = calloc(4, info->sizedwords); - list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) { - int ret = emit[instr->category](instr, dwords, info); - if (ret) - goto fail; - info->instrs_count += 1 + instr->repeat; - dwords += 2; + list_for_each_entry (struct ir3_block, block, &shader->block_list, node) { + list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) { + int ret = emit[instr->category](instr, dwords, info); + if (ret) + goto fail; + info->instrs_count += 1 + instr->repeat; + dwords += 2; + } } return ptr; @@ -617,7 +622,12 @@ static void insert_instr(struct ir3_block *block, struct ir3_block * ir3_block_create(struct ir3 *shader) { struct ir3_block *block = ir3_alloc(shader, sizeof(*block)); +#ifdef DEBUG + static uint32_t serialno = 0; + block->serialno = ++serialno; +#endif block->shader = shader; + list_inithead(&block->node); list_inithead(&block->instr_list); return block; } @@ -688,10 +698,40 @@ struct ir3_instruction * ir3_instr_clone(struct ir3_instruction *instr) struct ir3_register * ir3_reg_create(struct ir3_instruction *instr, int num, int flags) { - struct ir3_register *reg = reg_create(instr->block->shader, num, flags); + struct ir3 *shader = instr->block->shader; + struct ir3_register *reg = reg_create(shader, num, flags); #ifdef DEBUG debug_assert(instr->regs_count < instr->regs_max); #endif instr->regs[instr->regs_count++] = reg; return reg; } + +void +ir3_block_clear_mark(struct ir3_block *block) +{ + list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) + instr->flags &= ~IR3_INSTR_MARK; +} + +void +ir3_clear_mark(struct ir3 *ir) +{ + list_for_each_entry (struct ir3_block, block, &ir->block_list, node) { + ir3_block_clear_mark(block); + } +} + +/* note: this will destroy instr->depth, don't do it until after sched! 
*/ +void +ir3_count_instructions(struct ir3 *ir) +{ + unsigned ip = 0; + list_for_each_entry (struct ir3_block, block, &ir->block_list, node) { + list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) { + instr->ip = ip++; + } + block->start_ip = list_first_entry(&block->instr_list, struct ir3_instruction, node)->ip; + block->end_ip = list_last_entry(&block->instr_list, struct ir3_instruction, node)->ip; + } +} diff --git a/src/gallium/drivers/freedreno/ir3/ir3.h b/src/gallium/drivers/freedreno/ir3/ir3.h index 95b866988b8..9c35a763d58 100644 --- a/src/gallium/drivers/freedreno/ir3/ir3.h +++ b/src/gallium/drivers/freedreno/ir3/ir3.h @@ -83,7 +83,8 @@ struct ir3_register { * before register assignment is done: */ IR3_REG_SSA = 0x2000, /* 'instr' is ptr to assigning instr */ - IR3_REG_IA = 0x4000, /* meta-input dst is "assigned" */ + IR3_REG_PHI_SRC= 0x4000, /* phi src, regs[0]->instr points to phi */ + } flags; union { /* normal registers: @@ -187,6 +188,7 @@ struct ir3_instruction { char inv; char comp; int immed; + struct ir3_block *target; } cat0; struct { type_t src_type, dst_type; @@ -220,14 +222,14 @@ struct ir3_instruction { int aid; } fi; struct { - struct ir3_block *if_block, *else_block; - } flow; + /* used to temporarily hold reference to nir_phi_instr + * until we resolve the phi srcs + */ + void *nphi; + } phi; struct { struct ir3_block *block; } inout; - - /* XXX keep this as big as all other union members! */ - uint32_t info[3]; }; /* transient values used during various algorithms: */ @@ -363,16 +365,40 @@ struct ir3 { unsigned predicates_count, predicates_sz; struct ir3_instruction **predicates; - struct ir3_block *block; + /* List of blocks: */ + struct list_head block_list; + unsigned heap_idx; struct ir3_heap_chunk *chunk; }; +typedef struct nir_block nir_block; + struct ir3_block { + struct list_head node; struct ir3 *shader; - /* only a single address register: */ - struct ir3_instruction *address; - struct list_head instr_list; + + nir_block *nblock; + + struct list_head instr_list; /* list of ir3_instruction */ + + /* each block has either one or two successors.. in case of + * two successors, 'condition' decides which one to follow. + * A block preceding an if/else has two successors. + */ + struct ir3_instruction *condition; + struct ir3_block *successors[2]; + + uint16_t start_ip, end_ip; + + /* used for per-pass extra block data. Mainly used right + * now in RA step to track livein/liveout. + */ + void *bd; + +#ifdef DEBUG + uint32_t serialno; +#endif }; struct ir3 * ir3_create(struct ir3_compiler *compiler, @@ -394,7 +420,6 @@ const char *ir3_instr_name(struct ir3_instruction *instr); struct ir3_register * ir3_reg_create(struct ir3_instruction *instr, int num, int flags); - static inline bool ir3_instr_check_mark(struct ir3_instruction *instr) { if (instr->flags & IR3_INSTR_MARK) @@ -403,19 +428,10 @@ static inline bool ir3_instr_check_mark(struct ir3_instruction *instr) return false; } -static inline void ir3_clear_mark(struct ir3 *shader) -{ - /* TODO would be nice to drop the instruction array.. for - * new compiler, _clear_mark() is all we use it for, and - * we could probably manage a linked list instead.. - * - * Also, we'll probably want to mark instructions within - * a block, so tracking the list of instrs globally is - * unlikely to be what we want. 
- */ - list_for_each_entry (struct ir3_instruction, instr, &shader->block->instr_list, node) - instr->flags &= ~IR3_INSTR_MARK; -} +void ir3_block_clear_mark(struct ir3_block *block); +void ir3_clear_mark(struct ir3 *shader); + +void ir3_count_instructions(struct ir3 *ir); static inline int ir3_instr_regno(struct ir3_instruction *instr, struct ir3_register *reg) @@ -593,6 +609,22 @@ static inline bool reg_gpr(struct ir3_register *r) return true; } +static inline type_t half_type(type_t type) +{ + switch (type) { + case TYPE_F32: return TYPE_F16; + case TYPE_U32: return TYPE_U16; + case TYPE_S32: return TYPE_S16; + case TYPE_F16: + case TYPE_U16: + case TYPE_S16: + return type; + default: + assert(0); + return ~0; + } +} + /* some cat2 instructions (ie. those which are not float) can embed an * immediate: */ @@ -837,6 +869,15 @@ ir3_NOP(struct ir3_block *block) return ir3_instr_create(block, 0, OPC_NOP); } +#define INSTR0(CAT, name) \ +static inline struct ir3_instruction * \ +ir3_##name(struct ir3_block *block) \ +{ \ + struct ir3_instruction *instr = \ + ir3_instr_create(block, CAT, OPC_##name); \ + return instr; \ +} + #define INSTR1(CAT, name) \ static inline struct ir3_instruction * \ ir3_##name(struct ir3_block *block, \ @@ -880,7 +921,10 @@ ir3_##name(struct ir3_block *block, \ } /* cat0 instructions: */ +INSTR0(0, BR); +INSTR0(0, JUMP); INSTR1(0, KILL); +INSTR0(0, END); /* cat2 instructions, most 2 src but some 1 src: */ INSTR2(2, ADD_F) diff --git a/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c b/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c index f62a5ec2b26..4165e2d6aa7 100644 --- a/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c +++ b/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c @@ -48,8 +48,6 @@ #include "ir3.h" -static struct ir3_instruction * create_immed(struct ir3_block *block, uint32_t val); - struct ir3_compile { struct ir3_compiler *compiler; @@ -62,7 +60,10 @@ struct ir3_compile { /* bitmask of which samplers are integer: */ uint16_t integer_s; - struct ir3_block *block; + struct ir3_block *block; /* the current block */ + struct ir3_block *in_block; /* block created for shader inputs */ + + nir_function_impl *impl; /* For fragment shaders, from the hw perspective the only * actual input is r0.xy position register passed to bary.f. 
@@ -94,6 +95,11 @@ struct ir3_compile { */ struct hash_table *addr_ht; + /* maps nir_block to ir3_block, mostly for the purposes of + * figuring out the blocks successors + */ + struct hash_table *block_ht; + /* for calculating input/output positions/linkages: */ unsigned next_inloc; @@ -120,6 +126,9 @@ struct ir3_compile { }; +static struct ir3_instruction * create_immed(struct ir3_block *block, uint32_t val); +static struct ir3_block * get_block(struct ir3_compile *ctx, nir_block *nblock); + static struct nir_shader *to_nir(const struct tgsi_token *tokens) { struct nir_shader_compiler_options options = { @@ -148,6 +157,7 @@ static struct nir_shader *to_nir(const struct tgsi_token *tokens) nir_lower_vars_to_ssa(s); nir_lower_alu_to_scalar(s); + nir_lower_phis_to_scalar(s); progress |= nir_copy_prop(s); progress |= nir_opt_dce(s); @@ -244,6 +254,8 @@ compile_init(struct ir3_compiler *compiler, _mesa_hash_pointer, _mesa_key_pointer_equal); ctx->addr_ht = _mesa_hash_table_create(ctx, _mesa_hash_pointer, _mesa_key_pointer_equal); + ctx->block_ht = _mesa_hash_table_create(ctx, + _mesa_hash_pointer, _mesa_key_pointer_equal); lowered_tokens = lower_tgsi(ctx, tokens, so); if (!lowered_tokens) @@ -287,33 +299,206 @@ compile_free(struct ir3_compile *ctx) ralloc_free(ctx); } - +/* global per-array information: */ struct ir3_array { unsigned length, aid; +}; + +/* per-block array state: */ +struct ir3_array_value { + /* TODO drop length/aid, and just have ptr back to ir3_array */ + unsigned length, aid; + /* initial array element values are phi's, other than for the + * entry block. The phi src's get added later in a resolve step + * after we have visited all the blocks, to account for back + * edges in the cfg. + */ + struct ir3_instruction **phis; + /* current array element values (as block is processed). When + * the array phi's are resolved, it will contain the array state + * at exit of block, so successor blocks can use it to add their + * phi srcs. + */ struct ir3_instruction *arr[]; }; +/* track array assignments per basic block. When an array is read + * outside of the same basic block, we can use NIR's dominance-frontier + * information to figure out where phi nodes are needed. + */ +struct ir3_nir_block_data { + unsigned foo; + /* indexed by array-id (aid): */ + struct ir3_array_value *arrs[]; +}; + +static struct ir3_nir_block_data * +get_block_data(struct ir3_compile *ctx, struct ir3_block *block) +{ + if (!block->bd) { + struct ir3_nir_block_data *bd = ralloc_size(ctx, sizeof(*bd) + + ((ctx->num_arrays + 1) * sizeof(bd->arrs[0]))); + block->bd = bd; + } + return block->bd; +} + static void declare_var(struct ir3_compile *ctx, nir_variable *var) { unsigned length = glsl_get_length(var->type) * 4; /* always vec4, at least with ttn */ - struct ir3_array *arr = ralloc_size(ctx, sizeof(*arr) + - (length * sizeof(arr->arr[0]))); + struct ir3_array *arr = ralloc(ctx, struct ir3_array); arr->length = length; arr->aid = ++ctx->num_arrays; - /* Some shaders end up reading array elements without first writing.. 
- * so initialize things to prevent null instr ptrs later: - */ - for (unsigned i = 0; i < length; i++) - arr->arr[i] = create_immed(ctx->block, 0); _mesa_hash_table_insert(ctx->var_ht, var, arr); } -static struct ir3_array * +static nir_block * +nir_block_pred(nir_block *block) +{ + assert(block->predecessors->entries < 2); + if (block->predecessors->entries == 0) + return NULL; + return (nir_block *)_mesa_set_next_entry(block->predecessors, NULL)->key; +} + +static struct ir3_array_value * get_var(struct ir3_compile *ctx, nir_variable *var) { struct hash_entry *entry = _mesa_hash_table_search(ctx->var_ht, var); - return entry->data; + struct ir3_block *block = ctx->block; + struct ir3_nir_block_data *bd = get_block_data(ctx, block); + struct ir3_array *arr = entry->data; + + if (!bd->arrs[arr->aid]) { + struct ir3_array_value *av = ralloc_size(bd, sizeof(*av) + + (arr->length * sizeof(av->arr[0]))); + struct ir3_array_value *defn = NULL; + nir_block *pred_block; + + av->length = arr->length; + av->aid = arr->aid; + + /* For loops, we have to consider that we have not visited some + * of the blocks who should feed into the phi (ie. back-edges in + * the cfg).. for example: + * + * loop { + * block { load_var; ... } + * if then block {} else block {} + * block { store_var; ... } + * if then block {} else block {} + * block {...} + * } + * + * We can skip the phi if we can chase the block predecessors + * until finding the block previously defining the array without + * crossing a block that has more than one predecessor. + * + * Otherwise create phi's and resolve them as a post-pass after + * all the blocks have been visited (to handle back-edges). + */ + + for (pred_block = block->nblock; + pred_block && (pred_block->predecessors->entries < 2) && !defn; + pred_block = nir_block_pred(pred_block)) { + struct ir3_block *pblock = get_block(ctx, pred_block); + struct ir3_nir_block_data *pbd = pblock->bd; + if (!pbd) + continue; + defn = pbd->arrs[arr->aid]; + } + + if (defn) { + /* only one possible definer: */ + for (unsigned i = 0; i < arr->length; i++) + av->arr[i] = defn->arr[i]; + } else if (pred_block) { + /* not the first block, and multiple potential definers: */ + av->phis = ralloc_size(av, arr->length * sizeof(av->phis[0])); + + for (unsigned i = 0; i < arr->length; i++) { + struct ir3_instruction *phi; + + phi = ir3_instr_create2(block, -1, OPC_META_PHI, + 1 + ctx->impl->num_blocks); + ir3_reg_create(phi, 0, 0); /* dst */ + + /* phi's should go at head of block: */ + list_delinit(&phi->node); + list_add(&phi->node, &block->instr_list); + + av->phis[i] = av->arr[i] = phi; + } + } else { + /* Some shaders end up reading array elements without + * first writing.. 
so initialize things to prevent null + * instr ptrs later: + */ + for (unsigned i = 0; i < arr->length; i++) + av->arr[i] = create_immed(block, 0); + } + + bd->arrs[arr->aid] = av; + } + + return bd->arrs[arr->aid]; +} + +static void +add_array_phi_srcs(struct ir3_compile *ctx, nir_block *nblock, + struct ir3_array_value *av, BITSET_WORD *visited) +{ + struct ir3_block *block; + struct ir3_nir_block_data *bd; + + if (BITSET_TEST(visited, nblock->index)) + return; + + BITSET_SET(visited, nblock->index); + + block = get_block(ctx, nblock); + bd = block->bd; + + if (bd && bd->arrs[av->aid]) { + struct ir3_array_value *dav = bd->arrs[av->aid]; + for (unsigned i = 0; i < av->length; i++) { + ir3_reg_create(av->phis[i], 0, IR3_REG_SSA)->instr = + dav->arr[i]; + } + } else { + /* didn't find defn, recurse predecessors: */ + struct set_entry *entry; + set_foreach(nblock->predecessors, entry) { + add_array_phi_srcs(ctx, (nir_block *)entry->key, av, visited); + } + } +} + +static void +resolve_array_phis(struct ir3_compile *ctx, struct ir3_block *block) +{ + struct ir3_nir_block_data *bd = block->bd; + unsigned bitset_words = BITSET_WORDS(ctx->impl->num_blocks); + + if (!bd) + return; + + /* TODO use nir dom_frontier to help us with this? */ + + for (unsigned i = 1; i <= ctx->num_arrays; i++) { + struct ir3_array_value *av = bd->arrs[i]; + BITSET_WORD visited[bitset_words]; + struct set_entry *entry; + + if (!(av && av->phis)) + continue; + + memset(visited, 0, sizeof(visited)); + set_foreach(block->nblock->predecessors, entry) { + add_array_phi_srcs(ctx, (nir_block *)entry->key, av, visited); + } + } } /* allocate a n element value array (to be populated by caller) and @@ -417,6 +602,22 @@ get_addr(struct ir3_compile *ctx, struct ir3_instruction *src) } static struct ir3_instruction * +get_predicate(struct ir3_compile *ctx, struct ir3_instruction *src) +{ + struct ir3_block *b = ctx->block; + struct ir3_instruction *cond; + + /* NOTE: only cmps.*.* can write p0.x: */ + cond = ir3_CMPS_S(b, src, 0, create_immed(b, 0), 0); + cond->cat2.condition = IR3_COND_NE; + + /* condition always goes in predicate register: */ + cond->regs[0]->num = regid(REG_P0, 0); + + return cond; +} + +static struct ir3_instruction * create_uniform(struct ir3_compile *ctx, unsigned n) { struct ir3_instruction *mov; @@ -1029,7 +1230,7 @@ emit_intrinisic_load_var(struct ir3_compile *ctx, nir_intrinsic_instr *intr, { nir_deref_var *dvar = intr->variables[0]; nir_deref_array *darr = nir_deref_as_array(dvar->deref.child); - struct ir3_array *arr = get_var(ctx, dvar->var); + struct ir3_array_value *arr = get_var(ctx, dvar->var); compile_assert(ctx, dvar->deref.child && (dvar->deref.child->deref_type == nir_deref_type_array)); @@ -1069,7 +1270,7 @@ emit_intrinisic_store_var(struct ir3_compile *ctx, nir_intrinsic_instr *intr) { nir_deref_var *dvar = intr->variables[0]; nir_deref_array *darr = nir_deref_as_array(dvar->deref.child); - struct ir3_array *arr = get_var(ctx, dvar->var); + struct ir3_array_value *arr = get_var(ctx, dvar->var); struct ir3_instruction **src; compile_assert(ctx, dvar->deref.child && @@ -1245,6 +1446,7 @@ emit_intrinisic(struct ir3_compile *ctx, nir_intrinsic_instr *intr) cond = create_immed(b, 1); } + /* NOTE: only cmps.*.* can write p0.x: */ cond = ir3_CMPS_S(b, cond, 0, create_immed(b, 0), 0); cond->cat2.condition = IR3_COND_NE; @@ -1558,6 +1760,71 @@ emit_tex_txs(struct ir3_compile *ctx, nir_tex_instr *tex) } static void +emit_phi(struct ir3_compile *ctx, nir_phi_instr *nphi) +{ + struct ir3_instruction 
*phi, **dst; + + /* NOTE: phi's should be lowered to scalar at this point */ + compile_assert(ctx, nphi->dest.ssa.num_components == 1); + + dst = get_dst(ctx, &nphi->dest, 1); + + phi = ir3_instr_create2(ctx->block, -1, OPC_META_PHI, + 1 + exec_list_length(&nphi->srcs)); + ir3_reg_create(phi, 0, 0); /* dst */ + phi->phi.nphi = nphi; + + dst[0] = phi; +} + +/* phi instructions are left partially constructed. We don't resolve + * their srcs until the end of the block, since (eg. loops) one of + * the phi's srcs might be defined after the phi due to back edges in + * the CFG. + */ +static void +resolve_phis(struct ir3_compile *ctx, struct ir3_block *block) +{ + list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) { + nir_phi_instr *nphi; + + /* phi's only come at start of block: */ + if (!(is_meta(instr) && (instr->opc == OPC_META_PHI))) + break; + + if (!instr->phi.nphi) + break; + + nphi = instr->phi.nphi; + instr->phi.nphi = NULL; + + foreach_list_typed(nir_phi_src, nsrc, node, &nphi->srcs) { + struct ir3_instruction *src = get_src(ctx, &nsrc->src)[0]; + ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = src; + } + } + + resolve_array_phis(ctx, block); +} + +static void +emit_jump(struct ir3_compile *ctx, nir_jump_instr *jump) +{ + switch (jump->type) { + case nir_jump_break: + case nir_jump_continue: + /* I *think* we can simply just ignore this, and use the + * successor block link to figure out where we need to + * jump to for break/continue + */ + break; + default: + compile_error(ctx, "Unhandled NIR jump type: %d\n", jump->type); + break; + } +} + +static void emit_instr(struct ir3_compile *ctx, nir_instr *instr) { switch (instr->type) { @@ -1590,45 +1857,112 @@ emit_instr(struct ir3_compile *ctx, nir_instr *instr) } break; } - case nir_instr_type_call: - case nir_instr_type_jump: case nir_instr_type_phi: + emit_phi(ctx, nir_instr_as_phi(instr)); + break; + case nir_instr_type_jump: + emit_jump(ctx, nir_instr_as_jump(instr)); + break; + case nir_instr_type_call: case nir_instr_type_parallel_copy: compile_error(ctx, "Unhandled NIR instruction type: %d\n", instr->type); break; } } +static struct ir3_block * +get_block(struct ir3_compile *ctx, nir_block *nblock) +{ + struct ir3_block *block; + struct hash_entry *entry; + entry = _mesa_hash_table_search(ctx->block_ht, nblock); + if (entry) + return entry->data; + + block = ir3_block_create(ctx->ir); + block->nblock = nblock; + _mesa_hash_table_insert(ctx->block_ht, nblock, block); + + return block; +} + static void -emit_block(struct ir3_compile *ctx, nir_block *block) +emit_block(struct ir3_compile *ctx, nir_block *nblock) { - nir_foreach_instr(block, instr) { + struct ir3_block *block = get_block(ctx, nblock); + + for (int i = 0; i < ARRAY_SIZE(block->successors); i++) { + if (nblock->successors[i]) { + block->successors[i] = + get_block(ctx, nblock->successors[i]); + } + } + + ctx->block = block; + list_addtail(&block->node, &ctx->ir->block_list); + + nir_foreach_instr(nblock, instr) { emit_instr(ctx, instr); if (ctx->error) return; } } +static void emit_cf_list(struct ir3_compile *ctx, struct exec_list *list); + static void -emit_function(struct ir3_compile *ctx, nir_function_impl *impl) +emit_if(struct ir3_compile *ctx, nir_if *nif) +{ + struct ir3_instruction *condition = get_src(ctx, &nif->condition)[0]; + + ctx->block->condition = + get_predicate(ctx, ir3_b2n(condition->block, condition)); + + emit_cf_list(ctx, &nif->then_list); + emit_cf_list(ctx, &nif->else_list); +} + +static void +emit_loop(struct 
ir3_compile *ctx, nir_loop *nloop) +{ + emit_cf_list(ctx, &nloop->body); +} + +static void +emit_cf_list(struct ir3_compile *ctx, struct exec_list *list) { - foreach_list_typed(nir_cf_node, node, node, &impl->body) { + foreach_list_typed(nir_cf_node, node, node, list) { switch (node->type) { case nir_cf_node_block: emit_block(ctx, nir_cf_node_as_block(node)); break; case nir_cf_node_if: + emit_if(ctx, nir_cf_node_as_if(node)); + break; case nir_cf_node_loop: + emit_loop(ctx, nir_cf_node_as_loop(node)); + break; case nir_cf_node_function: compile_error(ctx, "TODO\n"); break; } - if (ctx->error) - return; } } static void +emit_function(struct ir3_compile *ctx, nir_function_impl *impl) +{ + emit_cf_list(ctx, &impl->body); + emit_block(ctx, impl->end_block); + + /* at this point, we should have a single empty block, + * into which we emit the 'end' instruction. + */ + compile_assert(ctx, list_empty(&ctx->block->instr_list)); + ir3_END(ctx->block); +} + +static void setup_input(struct ir3_compile *ctx, nir_variable *in) { struct ir3_shader_variant *so = ctx->so; @@ -1787,8 +2121,19 @@ setup_output(struct ir3_compile *ctx, nir_variable *out) static void emit_instructions(struct ir3_compile *ctx) { - unsigned ninputs = exec_list_length(&ctx->s->inputs) * 4; - unsigned noutputs = exec_list_length(&ctx->s->outputs) * 4; + unsigned ninputs, noutputs; + nir_function_impl *fxn = NULL; + + /* Find the main function: */ + nir_foreach_overload(ctx->s, overload) { + compile_assert(ctx, strcmp(overload->function->name, "main") == 0); + compile_assert(ctx, overload->impl); + fxn = overload->impl; + break; + } + + ninputs = exec_list_length(&ctx->s->inputs) * 4; + noutputs = exec_list_length(&ctx->s->outputs) * 4; /* we need to allocate big enough outputs array so that * we can stuff the kill's at the end. 
Likewise for vtx @@ -1801,8 +2146,11 @@ emit_instructions(struct ir3_compile *ctx) } ctx->ir = ir3_create(ctx->compiler, ninputs, noutputs); - ctx->block = ir3_block_create(ctx->ir); - ctx->ir->block = ctx->block; + + /* Create inputs in first block: */ + ctx->block = get_block(ctx, fxn->start_block); + ctx->in_block = ctx->block; + list_addtail(&ctx->block->node, &ctx->ir->block_list); if (ctx->so->type == SHADER_FRAGMENT) { ctx->ir->noutputs -= ARRAY_SIZE(ctx->kill); @@ -1838,13 +2186,12 @@ emit_instructions(struct ir3_compile *ctx) declare_var(ctx, var); } - /* Find the main function and emit the body: */ - nir_foreach_overload(ctx->s, overload) { - compile_assert(ctx, strcmp(overload->function->name, "main") == 0); - compile_assert(ctx, overload->impl); - emit_function(ctx, overload->impl); - if (ctx->error) - return; + /* And emit the body: */ + ctx->impl = fxn; + emit_function(ctx, fxn); + + list_for_each_entry (struct ir3_block, block, &ctx->ir->block_list, node) { + resolve_phis(ctx, block); } } @@ -1906,13 +2253,13 @@ fixup_frag_inputs(struct ir3_compile *ctx) so->pos_regid = regid; /* r0.x */ - instr = create_input(ctx->block, NULL, ir->ninputs); + instr = create_input(ctx->in_block, NULL, ir->ninputs); instr->regs[0]->num = regid++; inputs[ir->ninputs++] = instr; ctx->frag_pos->regs[1]->instr = instr; /* r0.y */ - instr = create_input(ctx->block, NULL, ir->ninputs); + instr = create_input(ctx->in_block, NULL, ir->ninputs); instr->regs[0]->num = regid++; inputs[ir->ninputs++] = instr; ctx->frag_pos->regs[2]->instr = instr; @@ -1998,6 +2345,10 @@ ir3_compile_shader_nir(struct ir3_compiler *compiler, out = out->regs[1]->instr; out->regs[0]->flags |= IR3_REG_HALF; } + + if (out->category == 1) { + out->cat1.dst_type = half_type(out->cat1.dst_type); + } } } @@ -2058,6 +2409,11 @@ ir3_compile_shader_nir(struct ir3_compiler *compiler, ir3_legalize(ir, &so->has_samp, &max_bary); + if (fd_mesa_debug & FD_DBG_OPTMSGS) { + printf("AFTER LEGALIZE:\n"); + ir3_print(ir); + } + /* fixup input/outputs: */ for (i = 0; i < so->outputs_count; i++) { so->outputs[i].regid = ir->outputs[i*4]->regs[0]->num; diff --git a/src/gallium/drivers/freedreno/ir3/ir3_cp.c b/src/gallium/drivers/freedreno/ir3/ir3_cp.c index a477bd4b237..8c7c80f7aae 100644 --- a/src/gallium/drivers/freedreno/ir3/ir3_cp.c +++ b/src/gallium/drivers/freedreno/ir3/ir3_cp.c @@ -54,6 +54,13 @@ static bool is_eligible_mov(struct ir3_instruction *instr, bool allow_flags) /* TODO: remove this hack: */ if (is_meta(src_instr) && (src_instr->opc == OPC_META_FO)) return false; + /* TODO: we currently don't handle left/right neighbors + * very well when inserting parallel-copies into phi.. + * to avoid problems don't eliminate a mov coming out + * of phi.. 
+ */ + if (is_meta(src_instr) && (src_instr->opc == OPC_META_PHI)) + return false; return true; } return false; @@ -390,7 +397,7 @@ instr_cp(struct ir3_instruction *instr, unsigned *flags) void ir3_cp(struct ir3 *ir) { - ir3_clear_mark(ir->block->shader); + ir3_clear_mark(ir); for (unsigned i = 0; i < ir->noutputs; i++) { if (ir->outputs[i]) { @@ -400,4 +407,9 @@ ir3_cp(struct ir3 *ir) ir->outputs[i] = out; } } + + list_for_each_entry (struct ir3_block, block, &ir->block_list, node) { + if (block->condition) + block->condition = instr_cp(block->condition, NULL); + } } diff --git a/src/gallium/drivers/freedreno/ir3/ir3_depth.c b/src/gallium/drivers/freedreno/ir3/ir3_depth.c index 6fc8b1762ff..3a108243479 100644 --- a/src/gallium/drivers/freedreno/ir3/ir3_depth.c +++ b/src/gallium/drivers/freedreno/ir3/ir3_depth.c @@ -134,6 +134,8 @@ remove_unused_by_block(struct ir3_block *block) { list_for_each_entry_safe (struct ir3_instruction, instr, &block->instr_list, node) { if (!ir3_instr_check_mark(instr)) { + if (is_flow(instr) && (instr->opc == OPC_END)) + continue; /* mark it, in case it is input, so we can * remove unused inputs: */ @@ -149,13 +151,21 @@ ir3_depth(struct ir3 *ir) { unsigned i; - ir3_clear_mark(ir->block->shader); + ir3_clear_mark(ir); for (i = 0; i < ir->noutputs; i++) if (ir->outputs[i]) ir3_instr_depth(ir->outputs[i]); + /* We also need to account for if-condition: */ + list_for_each_entry (struct ir3_block, block, &ir->block_list, node) { + if (block->condition) + ir3_instr_depth(block->condition); + } + /* mark un-used instructions: */ - remove_unused_by_block(ir->block); + list_for_each_entry (struct ir3_block, block, &ir->block_list, node) { + remove_unused_by_block(block); + } /* cleanup unused inputs: */ for (i = 0; i < ir->ninputs; i++) { diff --git a/src/gallium/drivers/freedreno/ir3/ir3_group.c b/src/gallium/drivers/freedreno/ir3/ir3_group.c index 1fe09cc11e5..70d9b08e019 100644 --- a/src/gallium/drivers/freedreno/ir3/ir3_group.c +++ b/src/gallium/drivers/freedreno/ir3/ir3_group.c @@ -116,6 +116,10 @@ restart: conflict = conflicts(instr->cp.left, left) || conflicts(instr->cp.right, right); + /* RA can't yet deal very well w/ group'd phi's: */ + if (is_meta(instr) && (instr->opc == OPC_META_PHI)) + conflict = true; + /* we also can't have an instr twice in the group: */ for (j = i + 1; (j < n) && !conflict; j++) if (ops->get(arr, j) == instr) @@ -226,7 +230,6 @@ find_neighbors(struct ir3 *ir) for (i = 0; i < ir->noutputs; i += 4) group_n(&arr_ops_out, &ir->outputs[i], 4); - for (i = 0; i < ir->noutputs; i++) { if (ir->outputs[i]) { struct ir3_instruction *instr = ir->outputs[i]; @@ -238,6 +241,6 @@ find_neighbors(struct ir3 *ir) void ir3_group(struct ir3 *ir) { - ir3_clear_mark(ir->block->shader); + ir3_clear_mark(ir); find_neighbors(ir); } diff --git a/src/gallium/drivers/freedreno/ir3/ir3_legalize.c b/src/gallium/drivers/freedreno/ir3/ir3_legalize.c index 34055f4c612..f4a4223ae17 100644 --- a/src/gallium/drivers/freedreno/ir3/ir3_legalize.c +++ b/src/gallium/drivers/freedreno/ir3/ir3_legalize.c @@ -42,15 +42,28 @@ */ struct ir3_legalize_ctx { - struct ir3_block *block; bool has_samp; int max_bary; }; +/* We want to evaluate each block from the position of any other + * predecessor block, in order that the flags set are the union + * of all possible program paths. For stopping condition, we + * want to stop when the pair of <pred-block, current-block> has + * been visited already. + * + * XXX is that completely true? 
We could have different needs_xyz + * flags set depending on path leading to pred-block.. we could + * do *most* of this based on chasing src instructions ptrs (and + * following all phi srcs).. except the write-after-read hazzard. + * + * For now we just set ss/sy flag on first instruction on block, + * and handle everything within the block as before. + */ + static void -legalize(struct ir3_legalize_ctx *ctx) +legalize_block(struct ir3_legalize_ctx *ctx, struct ir3_block *block) { - struct ir3_block *block = ctx->block; struct ir3_instruction *last_input = NULL; struct ir3_instruction *last_rel = NULL; struct list_head instr_list; @@ -203,6 +216,7 @@ legalize(struct ir3_legalize_ctx *ctx) ir3_reg_create(baryf, regid(0, 0), 0); /* insert the dummy bary.f after last_input: */ + list_delinit(&baryf->node); list_add(&baryf->node, &last_input->node); last_input = baryf; @@ -213,23 +227,177 @@ legalize(struct ir3_legalize_ctx *ctx) if (last_rel) last_rel->flags |= IR3_INSTR_UL; - /* create/add 'end' instruction: */ - ir3_instr_create(block, 0, OPC_END); - list_first_entry(&block->instr_list, struct ir3_instruction, node) ->flags |= IR3_INSTR_SS | IR3_INSTR_SY; } +/* NOTE: branch instructions are always the last instruction(s) + * in the block. We take advantage of this as we resolve the + * branches, since "if (foo) break;" constructs turn into + * something like: + * + * block3 { + * ... + * 0029:021: mov.s32s32 r62.x, r1.y + * 0082:022: br !p0.x, target=block5 + * 0083:023: br p0.x, target=block4 + * // succs: if _[0029:021: mov.s32s32] block4; else block5; + * } + * block4 { + * 0084:024: jump, target=block6 + * // succs: block6; + * } + * block5 { + * 0085:025: jump, target=block7 + * // succs: block7; + * } + * + * ie. only instruction in block4/block5 is a jump, so when + * resolving branches we can easily detect this by checking + * that the first instruction in the target block is itself + * a jump, and setup the br directly to the jump's target + * (and strip back out the now unreached jump) + * + * TODO sometimes we end up with things like: + * + * br !p0.x, #2 + * br p0.x, #12 + * add.u r0.y, r0.y, 1 + * + * If we swapped the order of the branches, we could drop one. 
+ */ +static struct ir3_block * +resolve_dest_block(struct ir3_block *block) +{ + /* special case for last block: */ + if (!block->successors[0]) + return block; + + /* NOTE that we may or may not have inserted the jump + * in the target block yet, so conditions to resolve + * the dest to the dest block's successor are: + * + * (1) successor[1] == NULL && + * (2) (block-is-empty || only-instr-is-jump) + */ + if (block->successors[1] == NULL) { + if (list_empty(&block->instr_list)) { + return block->successors[0]; + } else if (list_length(&block->instr_list) == 1) { + struct ir3_instruction *instr = list_first_entry( + &block->instr_list, struct ir3_instruction, node); + if (is_flow(instr) && (instr->opc == OPC_JUMP)) + return block->successors[0]; + } + } + return block; +} + +static bool +resolve_jump(struct ir3_instruction *instr) +{ + struct ir3_block *tblock = + resolve_dest_block(instr->cat0.target); + struct ir3_instruction *target; + + if (tblock != instr->cat0.target) { + list_delinit(&instr->cat0.target->node); + instr->cat0.target = tblock; + return true; + } + + target = list_first_entry(&tblock->instr_list, + struct ir3_instruction, node); + + if ((!target) || (target->ip == (instr->ip + 1))) { + list_delinit(&instr->node); + return true; + } else { + instr->cat0.immed = + (int)target->ip - (int)instr->ip; + } + return false; +} + +/* resolve jumps, removing jumps/branches to immediately following + * instruction which we end up with from earlier stages. Since + * removing an instruction can invalidate earlier instruction's + * branch offsets, we need to do this iteratively until no more + * branches are removed. + */ +static bool +resolve_jumps(struct ir3 *ir) +{ + list_for_each_entry (struct ir3_block, block, &ir->block_list, node) + list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) + if (is_flow(instr) && instr->cat0.target) + if (resolve_jump(instr)) + return true; + + return false; +} + +/* we want to mark points where divergent flow control re-converges + * with (jp) flags. For now, since we don't do any optimization for + * things that start out as a 'do {} while()', re-convergence points + * will always be a branch or jump target. Note that this is overly + * conservative, since unconditional jump targets are not convergence + * points, we are just assuming that the other path to reach the jump + * target was divergent. If we were clever enough to optimize the + * jump at end of a loop back to a conditional branch into a single + * conditional branch, ie. like: + * + * add.f r1.w, r0.x, (neg)(r)c2.x <= loop start + * mul.f r1.z, r1.z, r0.x + * mul.f r1.y, r1.y, r0.x + * mul.f r0.z, r1.x, r0.x + * mul.f r0.w, r0.y, r0.x + * cmps.f.ge r0.x, (r)c2.y, (r)r1.w + * add.s r0.x, (r)r0.x, (r)-1 + * sel.f32 r0.x, (r)c3.y, (r)r0.x, c3.x + * cmps.f.eq p0.x, r0.x, c3.y + * mov.f32f32 r0.x, r1.w + * mov.f32f32 r0.y, r0.w + * mov.f32f32 r1.x, r0.z + * (rpt2)nop + * br !p0.x, #-13 + * (jp)mul.f r0.x, c263.y, r1.y + * + * Then we'd have to be more clever, as the convergence point is no + * longer a branch or jump target. 
+ */ +static void +mark_convergence_points(struct ir3 *ir) +{ + list_for_each_entry (struct ir3_block, block, &ir->block_list, node) { + list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) { + if (is_flow(instr) && instr->cat0.target) { + struct ir3_instruction *target = + list_first_entry(&instr->cat0.target->instr_list, + struct ir3_instruction, node); + target->flags |= IR3_INSTR_JP; + } + } + } +} + void ir3_legalize(struct ir3 *ir, bool *has_samp, int *max_bary) { struct ir3_legalize_ctx ctx = { - .block = ir->block, .max_bary = -1, }; - legalize(&ctx); + list_for_each_entry (struct ir3_block, block, &ir->block_list, node) { + legalize_block(&ctx, block); + } *has_samp = ctx.has_samp; *max_bary = ctx.max_bary; + + do { + ir3_count_instructions(ir); + } while(resolve_jumps(ir)); + + mark_convergence_points(ir); } diff --git a/src/gallium/drivers/freedreno/ir3/ir3_print.c b/src/gallium/drivers/freedreno/ir3/ir3_print.c index 965c834b8aa..f377982dd5e 100644 --- a/src/gallium/drivers/freedreno/ir3/ir3_print.c +++ b/src/gallium/drivers/freedreno/ir3/ir3_print.c @@ -137,6 +137,16 @@ tab(int lvl) printf("\t"); } +static uint32_t +block_id(struct ir3_block *block) +{ +#ifdef DEBUG + return block->serialno; +#else + return (uint32_t)(uint64_t)block; +#endif +} + static void print_instr(struct ir3_instruction *instr, int lvl) { @@ -173,6 +183,14 @@ print_instr(struct ir3_instruction *instr, int lvl) } } + if (is_flow(instr) && instr->cat0.target) { + /* the predicate register src is implied: */ + if (instr->opc == OPC_BR) { + printf(" %sp0.x", instr->cat0.inv ? "!" : ""); + } + printf(", target=block%u", block_id(instr->cat0.target)); + } + printf("\n"); } @@ -184,19 +202,31 @@ void ir3_print_instr(struct ir3_instruction *instr) static void print_block(struct ir3_block *block, int lvl) { - tab(lvl); printf("block {\n"); + tab(lvl); printf("block%u {\n", block_id(block)); list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) { print_instr(instr, lvl+1); } + if (block->successors[1]) { + /* leading into if/else: */ + tab(lvl+1); + printf("/* succs: if _["); + print_instr_name(block->condition); + printf("] block%u; else block%u; */\n", + block_id(block->successors[0]), + block_id(block->successors[1])); + } else if (block->successors[0]) { + tab(lvl+1); + printf("/* succs: block%u; */\n", + block_id(block->successors[0])); + } tab(lvl); printf("}\n"); } void ir3_print(struct ir3 *ir) { - struct ir3_block *block = ir->block; - - print_block(block, 0); + list_for_each_entry (struct ir3_block, block, &ir->block_list, node) + print_block(block, 0); for (unsigned i = 0; i < ir->noutputs; i++) { if (!ir->outputs[i]) diff --git a/src/gallium/drivers/freedreno/ir3/ir3_ra.c b/src/gallium/drivers/freedreno/ir3/ir3_ra.c index 394c63f646d..359cd9a0d5d 100644 --- a/src/gallium/drivers/freedreno/ir3/ir3_ra.c +++ b/src/gallium/drivers/freedreno/ir3/ir3_ra.c @@ -29,6 +29,7 @@ #include "util/u_math.h" #include "util/register_allocate.h" #include "util/ralloc.h" +#include "util/bitset.h" #include "ir3.h" #include "ir3_compiler.h" @@ -255,6 +256,14 @@ struct ir3_ra_ctx { unsigned *def, *use; /* def/use table */ }; +/* additional block-data (per-block) */ +struct ir3_ra_block_data { + BITSET_WORD *def; /* variables defined before used in block */ + BITSET_WORD *use; /* variables used before defined in block */ + BITSET_WORD *livein; /* which defs reach entry point of block */ + BITSET_WORD *liveout; /* which defs reach exit point of block */ +}; + static bool 
is_half(struct ir3_instruction *instr) { @@ -369,7 +378,39 @@ get_definer(struct ir3_instruction *instr, int *sz, int *off) *sz = util_last_bit(instr->regs[0]->wrmask); } *off = 0; - return instr; + d = instr; + } + + if (d->regs[0]->flags & IR3_REG_PHI_SRC) { + struct ir3_instruction *phi = d->regs[0]->instr; + struct ir3_instruction *dd; + int dsz, doff; + + dd = get_definer(phi, &dsz, &doff); + + *sz = MAX2(*sz, dsz); + *off = doff; + + if (dd->ip < d->ip) { + d = dd; + } + } + + if (is_meta(d) && (d->opc == OPC_META_PHI)) { + /* we have already inserted parallel-copies into + * the phi, so we don't need to chase definers + */ + struct ir3_register *src; + + /* note: don't use foreach_ssa_src as this gets called once + * while assigning regs (which clears SSA flag) + */ + foreach_src(src, d) { + if (!src->instr) + continue; + if (src->instr->ip < d->ip) + d = src->instr; + } } if (is_meta(d) && (d->opc == OPC_META_FO)) { @@ -396,13 +437,11 @@ static void ra_block_name_instructions(struct ir3_ra_ctx *ctx, struct ir3_block *block) { list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) { - instr->ip = ctx->instr_cnt++; - } - - list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) { struct ir3_instruction *defn; int cls, sz, off; + ctx->instr_cnt++; + if (instr->regs_count == 0) continue; @@ -431,8 +470,11 @@ static void ra_init(struct ir3_ra_ctx *ctx) { ir3_clear_mark(ctx->ir); + ir3_count_instructions(ctx->ir); - ra_block_name_instructions(ctx, ctx->ir->block); + list_for_each_entry (struct ir3_block, block, &ctx->ir->block_list, node) { + ra_block_name_instructions(ctx, block); + } /* figure out the base register name for each class. The * actual ra name is class_base[cls] + instr->name; @@ -448,6 +490,16 @@ ra_init(struct ir3_ra_ctx *ctx) ctx->use = rzalloc_array(ctx->g, unsigned, ctx->alloc_count); } +static unsigned +ra_name(struct ir3_ra_ctx *ctx, int cls, struct ir3_instruction *defn) +{ + unsigned name; + debug_assert(cls >= 0); + name = ctx->class_base[cls] + defn->name; + debug_assert(name < ctx->alloc_count); + return name; +} + static void ra_destroy(struct ir3_ra_ctx *ctx) { @@ -457,6 +509,18 @@ ra_destroy(struct ir3_ra_ctx *ctx) static void ra_block_compute_live_ranges(struct ir3_ra_ctx *ctx, struct ir3_block *block) { + struct ir3_ra_block_data *bd; + unsigned bitset_words = BITSET_WORDS(ctx->alloc_count); + + bd = rzalloc(ctx->g, struct ir3_ra_block_data); + + bd->def = rzalloc_array(bd, BITSET_WORD, bitset_words); + bd->use = rzalloc_array(bd, BITSET_WORD, bitset_words); + bd->livein = rzalloc_array(bd, BITSET_WORD, bitset_words); + bd->liveout = rzalloc_array(bd, BITSET_WORD, bitset_words); + + block->bd = bd; + list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) { struct ir3_instruction *src; @@ -474,7 +538,15 @@ ra_block_compute_live_ranges(struct ir3_ra_ctx *ctx, struct ir3_block *block) * fanin: used to collect values from lower class and assemble * them together into a higher class, for example arguments * to texture sample instructions; We consider these to be - * defined at the fanin node. + * defined at the earliest fanin source. + * + * phi: used to merge values from different flow control paths + * to the same reg. Consider defined at earliest phi src, + * and update all the other phi src's (which may come later + * in the program) as users to extend the var's live range. + * + * Most of this, other than phi, is completely handled in the + * get_definer() helper. 
* * In either case, we trace the instruction back to the original * definer and consider that as the def/use ip. @@ -491,11 +563,15 @@ ra_block_compute_live_ranges(struct ir3_ra_ctx *ctx, struct ir3_block *block) */ cls = size_to_class(sz, is_half(defn)); if (cls >= 0) { - unsigned name = ctx->class_base[cls] + defn->name; + unsigned name = ra_name(ctx, cls, defn); + ctx->def[name] = defn->ip; ctx->use[name] = defn->ip; - debug_assert(name < ctx->alloc_count); + /* since we are in SSA at this point: */ + debug_assert(!BITSET_TEST(bd->use, name)); + + BITSET_SET(bd->def, name); if (is_half(defn)) { ra_set_node_class(ctx->g, name, @@ -504,6 +580,24 @@ ra_block_compute_live_ranges(struct ir3_ra_ctx *ctx, struct ir3_block *block) ra_set_node_class(ctx->g, name, ctx->set->classes[cls]); } + + /* extend the live range for phi srcs, which may come + * from the bottom of the loop + */ + if (defn->regs[0]->flags & IR3_REG_PHI_SRC) { + struct ir3_instruction *phi = defn->regs[0]->instr; + foreach_ssa_src(src, phi) { + /* if src is after phi, then we need to extend + * the liverange to the end of src's block: + */ + if (src->ip > phi->ip) { + struct ir3_instruction *last = + list_last_entry(&src->block->instr_list, + struct ir3_instruction, node); + ctx->use[name] = MAX2(ctx->use[name], last->ip); + } + } + } } } } @@ -516,12 +610,59 @@ ra_block_compute_live_ranges(struct ir3_ra_ctx *ctx, struct ir3_block *block) srcdefn = get_definer(src, &sz, &off); cls = size_to_class(sz, is_half(srcdefn)); if (cls >= 0) { - unsigned name = ctx->class_base[cls] + srcdefn->name; - ctx->use[name] = instr->ip; + unsigned name = ra_name(ctx, cls, srcdefn); + ctx->use[name] = MAX2(ctx->use[name], instr->ip); + if (!BITSET_TEST(bd->def, name)) + BITSET_SET(bd->use, name); + } + } + } + } +} + +static bool +ra_compute_livein_liveout(struct ir3_ra_ctx *ctx) +{ + unsigned bitset_words = BITSET_WORDS(ctx->alloc_count); + bool progress = false; + + list_for_each_entry (struct ir3_block, block, &ctx->ir->block_list, node) { + struct ir3_ra_block_data *bd = block->bd; + + /* update livein: */ + for (unsigned i = 0; i < bitset_words; i++) { + BITSET_WORD new_livein = + (bd->use[i] | (bd->liveout[i] & ~bd->def[i])); + + if (new_livein & ~bd->livein[i]) { + bd->livein[i] |= new_livein; + progress = true; + } + } + + /* update liveout: */ + for (unsigned j = 0; j < ARRAY_SIZE(block->successors); j++) { + struct ir3_block *succ = block->successors[j]; + struct ir3_ra_block_data *succ_bd; + + if (!succ) + continue; + + succ_bd = succ->bd; + + for (unsigned i = 0; i < bitset_words; i++) { + BITSET_WORD new_liveout = + (succ_bd->livein[i] & ~bd->liveout[i]); + + if (new_liveout) { + bd->liveout[i] |= new_liveout; + progress = true; } } } } + + return progress; } static void @@ -529,7 +670,34 @@ ra_add_interference(struct ir3_ra_ctx *ctx) { struct ir3 *ir = ctx->ir; - ra_block_compute_live_ranges(ctx, ctx->ir->block); + /* compute live ranges (use/def) on a block level, also updating + * block's def/use bitmasks (used below to calculate per-block + * livein/liveout): + */ + list_for_each_entry (struct ir3_block, block, &ir->block_list, node) { + ra_block_compute_live_ranges(ctx, block); + } + + /* update per-block livein/liveout: */ + while (ra_compute_livein_liveout(ctx)) {} + + /* extend start/end ranges based on livein/liveout info from cfg: */ + unsigned bitset_words = BITSET_WORDS(ctx->alloc_count); + list_for_each_entry (struct ir3_block, block, &ir->block_list, node) { + struct ir3_ra_block_data *bd = block->bd; + + for 
(unsigned i = 0; i < bitset_words; i++) { + if (BITSET_TEST(bd->livein, i)) { + ctx->def[i] = MIN2(ctx->def[i], block->start_ip); + ctx->use[i] = MAX2(ctx->use[i], block->start_ip); + } + + if (BITSET_TEST(bd->liveout, i)) { + ctx->def[i] = MIN2(ctx->def[i], block->end_ip); + ctx->use[i] = MAX2(ctx->use[i], block->end_ip); + } + } + } /* need to fix things up to keep outputs live: */ for (unsigned i = 0; i < ir->noutputs; i++) { @@ -540,7 +708,7 @@ ra_add_interference(struct ir3_ra_ctx *ctx) defn = get_definer(instr, &sz, &off); cls = size_to_class(sz, is_half(defn)); if (cls >= 0) { - unsigned name = ctx->class_base[cls] + defn->name; + unsigned name = ra_name(ctx, cls, defn); ctx->use[name] = ctx->instr_cnt; } } @@ -555,23 +723,6 @@ ra_add_interference(struct ir3_ra_ctx *ctx) } } -static type_t half_type(type_t type) -{ - switch (type) { - case TYPE_F32: return TYPE_F16; - case TYPE_U32: return TYPE_U16; - case TYPE_S32: return TYPE_S16; - /* instructions may already be fixed up: */ - case TYPE_F16: - case TYPE_U16: - case TYPE_S16: - return type; - default: - assert(0); - return ~0; - } -} - /* some instructions need fix-up if dst register is half precision: */ static void fixup_half_instr_dst(struct ir3_instruction *instr) { @@ -633,7 +784,7 @@ reg_assign(struct ir3_ra_ctx *ctx, struct ir3_register *reg, defn = get_definer(instr, &sz, &off); cls = size_to_class(sz, is_half(defn)); if (cls >= 0) { - unsigned name = ctx->class_base[cls] + defn->name; + unsigned name = ra_name(ctx, cls, defn); unsigned r = ra_get_node_reg(ctx->g, name); unsigned num = ctx->set->ra_reg_to_gpr[r] + off; @@ -641,7 +792,7 @@ reg_assign(struct ir3_ra_ctx *ctx, struct ir3_register *reg, num += reg->offset; reg->num = num; - reg->flags &= ~IR3_REG_SSA; + reg->flags &= ~(IR3_REG_SSA | IR3_REG_PHI_SRC); if (is_half(defn)) reg->flags |= IR3_REG_HALF; @@ -686,8 +837,8 @@ ra_alloc(struct ir3_ra_ctx *ctx) unsigned i = 0, j; if (ctx->frag_face && (i < ir->ninputs) && ir->inputs[i]) { struct ir3_instruction *instr = ir->inputs[i]; - unsigned cls = size_to_class(1, true); - unsigned name = ctx->class_base[cls] + instr->name; + int cls = size_to_class(1, true); + unsigned name = ra_name(ctx, cls, instr); unsigned reg = ctx->set->gpr_to_ra_reg[cls][0]; /* if we have frag_face, it gets hr0.x */ @@ -706,8 +857,7 @@ ra_alloc(struct ir3_ra_ctx *ctx) unsigned name, reg; cls = size_to_class(sz, is_half(defn)); - debug_assert(cls >= 0); - name = ctx->class_base[cls] + defn->name; + name = ra_name(ctx, cls, defn); reg = ctx->set->gpr_to_ra_reg[cls][j]; ra_set_node_reg(ctx->g, name, reg); @@ -720,7 +870,9 @@ ra_alloc(struct ir3_ra_ctx *ctx) if (!ra_allocate(ctx->g)) return -1; - ra_block_alloc(ctx, ctx->ir->block); + list_for_each_entry (struct ir3_block, block, &ctx->ir->block_list, node) { + ra_block_alloc(ctx, block); + } return 0; } diff --git a/src/gallium/drivers/freedreno/ir3/ir3_sched.c b/src/gallium/drivers/freedreno/ir3/ir3_sched.c index 0d404a83583..49a4426d163 100644 --- a/src/gallium/drivers/freedreno/ir3/ir3_sched.c +++ b/src/gallium/drivers/freedreno/ir3/ir3_sched.c @@ -205,6 +205,16 @@ instr_eligibility(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes, struct ir3_instruction *src; unsigned delay = 0; + /* Phi instructions can have a dependency on something not + * scheduled yet (for ex, loops). But OTOH we don't really + * care. 
By definition phi's should appear at the top of + * the block, and it's sources should be values from the + * previously executing block, so they are always ready to + * be scheduled: + */ + if (is_meta(instr) && (instr->opc == OPC_META_PHI)) + return 0; + foreach_ssa_src(src, instr) { /* if dependency not scheduled, we aren't ready yet: */ if (!is_scheduled(src)) @@ -422,13 +432,87 @@ sched_block(struct ir3_sched_ctx *ctx, struct ir3_block *block) } } } + + /* And lastly, insert branch/jump instructions to take us to + * the next block. Later we'll strip back out the branches + * that simply jump to next instruction. + */ + if (block->successors[1]) { + /* if/else, conditional branches to "then" or "else": */ + struct ir3_instruction *br; + unsigned delay = 6; + + debug_assert(ctx->pred); + debug_assert(block->condition); + + delay -= distance(ctx, ctx->pred, delay); + + while (delay > 0) { + ir3_NOP(block); + delay--; + } + + /* create "else" branch first (since "then" block should + * frequently/always end up being a fall-thru): + */ + br = ir3_BR(block); + br->cat0.inv = true; + br->cat0.target = block->successors[1]; + + /* NOTE: we have to hard code delay of 6 above, since + * we want to insert the nop's before constructing the + * branch. Throw in an assert so we notice if this + * ever breaks on future generation: + */ + debug_assert(ir3_delayslots(ctx->pred, br, 0) == 6); + + br = ir3_BR(block); + br->cat0.target = block->successors[0]; + + } else if (block->successors[0]) { + /* otherwise unconditional jump to next block: */ + struct ir3_instruction *jmp; + + jmp = ir3_JUMP(block); + jmp->cat0.target = block->successors[0]; + } + + /* NOTE: if we kept track of the predecessors, we could do a better + * job w/ (jp) flags.. every node w/ > predecessor is a join point. + * Note that as we eliminate blocks which contain only an unconditional + * jump we probably need to propagate (jp) flag.. + */ +} + +/* this is needed to ensure later RA stage succeeds: */ +static void +sched_insert_parallel_copies(struct ir3_block *block) +{ + list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) { + if (is_meta(instr) && (instr->opc == OPC_META_PHI)) { + struct ir3_register *reg; + foreach_src(reg, instr) { + struct ir3_instruction *src = reg->instr; + struct ir3_instruction *mov = + ir3_MOV(src->block, src, TYPE_U32); + mov->regs[0]->flags |= IR3_REG_PHI_SRC; + mov->regs[0]->instr = instr; + reg->instr = mov; + } + } + } } int ir3_sched(struct ir3 *ir) { struct ir3_sched_ctx ctx = {0}; - ir3_clear_mark(ir->block->shader); - sched_block(&ctx, ir->block); + list_for_each_entry (struct ir3_block, block, &ir->block_list, node) { + sched_insert_parallel_copies(block); + } + ir3_clear_mark(ir); + list_for_each_entry (struct ir3_block, block, &ir->block_list, node) { + sched_block(&ctx, block); + } if (ctx.error) return -1; return 0; |
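
Notes on the new algorithms in this patch, with simplified sketches. All code below is illustrative only: toy types and names (toy_block, toy_instr, and friends) are invented for the sketches and are not part of ir3 or of the patch.

ir3_count_instructions() (new in ir3.c) linearizes the block list, giving every instruction a global ip and recording each block's start_ip/end_ip; later passes only compare these positions (liveness in ir3_ra.c, branch offsets in ir3_legalize.c). A minimal sketch of the numbering, assuming non-empty blocks:

#include <stdio.h>

/* toy stand-in for ir3_block; "ninstrs" replaces the instr_list: */
struct toy_block {
	unsigned ninstrs;
	unsigned start_ip, end_ip;
};

/* one monotonically increasing ip across all blocks, with first/last
 * ip recorded per block (assumes every block has at least one instr):
 */
static void count_instructions(struct toy_block *blocks, unsigned nblocks)
{
	unsigned ip = 0;
	for (unsigned b = 0; b < nblocks; b++) {
		blocks[b].start_ip = ip;
		ip += blocks[b].ninstrs;
		blocks[b].end_ip = ip - 1;
	}
}

int main(void)
{
	struct toy_block blocks[3] = {
		{ .ninstrs = 4 }, { .ninstrs = 2 }, { .ninstrs = 3 },
	};
	count_instructions(blocks, 3);
	for (unsigned b = 0; b < 3; b++)
		printf("block%u: ip %u..%u\n", b, blocks[b].start_ip, blocks[b].end_ip);
	return 0;
}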
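
half_type() moves from ir3_ra.c into ir3.h because the NIR frontend now needs it too: when an output mov is demoted to half precision (IR3_REG_HALF on its dst), its cat1.dst_type has to be narrowed to match, which is the small new hunk in ir3_compile_shader_nir(). Restated standalone, with a sanity check:

#include <assert.h>
#include <stdio.h>

typedef enum { TYPE_F16, TYPE_F32, TYPE_U16, TYPE_U32, TYPE_S16, TYPE_S32 } type_t;

/* map a 32-bit type to its 16-bit counterpart; idempotent for types
 * that are already half precision:
 */
static type_t half_type(type_t type)
{
	switch (type) {
	case TYPE_F32: return TYPE_F16;
	case TYPE_U32: return TYPE_U16;
	case TYPE_S32: return TYPE_S16;
	case TYPE_F16:
	case TYPE_U16:
	case TYPE_S16: return type;
	}
	assert(0);
	return TYPE_F32;
}

int main(void)
{
	assert(half_type(TYPE_F32) == TYPE_F16);
	assert(half_type(TYPE_F16) == TYPE_F16);
	printf("half_type ok\n");
	return 0;
}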
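
In ir3_compiler_nir.c, get_var() now builds a per-block view of each array (ir3_array_value). To avoid a phi per element for every cross-block read, it first chases predecessors while the chain is unambiguous: stop at the first block that already holds values for the array, give up and create phis at the first join point (a block with more than one predecessor), and fall back to zero-initialization if the walk runs off the top of the CFG (a read before any write). A sketch of that three-way decision; toy_block and the enum are invented:

#include <stdio.h>

struct toy_block {
	struct toy_block *preds[2];
	unsigned npreds;
	int has_def;          /* block already holds values for the array? */
};

enum resolve { DEF_FOUND, NEED_PHIS, UNDEFINED };

/* chase up the cfg while the chain is unambiguous; stop at the first
 * defining block, at a join point (>1 preds means phis are required),
 * or at the top of the cfg (read before any write):
 */
static enum resolve
chase_preds(struct toy_block *block, struct toy_block **def_out)
{
	struct toy_block *b = block;
	while (b && b->npreds < 2) {
		if (b != block && b->has_def) {
			*def_out = b;
			return DEF_FOUND;
		}
		b = (b->npreds == 1) ? b->preds[0] : NULL;
	}
	return b ? NEED_PHIS : UNDEFINED;
}

int main(void)
{
	struct toy_block entry = { .has_def = 1 };
	struct toy_block body  = { .preds = { &entry }, .npreds = 1 };
	struct toy_block *def = NULL;

	/* a straight-line read in 'body' finds the def in 'entry': */
	printf("result: %d\n", chase_preds(&body, &def));   /* DEF_FOUND */
	return 0;
}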
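
The phi srcs themselves (both for arrays and for NIR's own phis, via resolve_phis()) are filled in only after every block has been emitted, because with loops a src can be produced by a block that is visited after the phi. add_array_phi_srcs() walks each predecessor path until it finds a block defining the array, and the visited bitset is what makes the recursion terminate on a cyclic CFG. A toy version over a two-block loop:

#include <stdbool.h>
#include <stdio.h>

#define MAX_BLOCKS 8

struct toy_block {
	int id;
	struct toy_block *preds[2];
	unsigned npreds;
	bool has_def;
};

/* the first def found on each path up from the phi's predecessors
 * becomes a phi src; 'visited' keeps back-edges from looping forever:
 */
static void add_phi_srcs(struct toy_block *b, bool *visited)
{
	if (visited[b->id])
		return;
	visited[b->id] = true;

	if (b->has_def) {
		printf("  phi src from block%d\n", b->id);
		return;
	}
	for (unsigned i = 0; i < b->npreds; i++)
		add_phi_srcs(b->preds[i], visited);
}

int main(void)
{
	/* block0 (entry, defines) -> block1 (loop head) <-> block2 (body, defines) */
	struct toy_block b0 = { .id = 0, .has_def = true };
	struct toy_block b1 = { .id = 1 };
	struct toy_block b2 = { .id = 2, .has_def = true };
	bool visited[MAX_BLOCKS] = { false };

	b1.preds[0] = &b0; b1.preds[1] = &b2; b1.npreds = 2;
	b2.preds[0] = &b1; b2.npreds = 1;

	printf("resolving phi in block1:\n");
	for (unsigned i = 0; i < b1.npreds; i++)
		add_phi_srcs(b1.preds[i], visited);
	return 0;
}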
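
emit_function() is rebuilt around emit_cf_list(), a recursion over NIR's control-flow tree: blocks are emitted in program order, and if/loop nodes just recurse into their bodies. Notably, no branch instructions are emitted at this stage; only the block order and the successors[]/condition links are established, and the scheduler materializes br/jump from those later. The shape of the walk on a toy CF tree (types invented; NIR's exec lists simplified to next pointers):

#include <stdio.h>

enum cf_type { CF_BLOCK, CF_IF, CF_LOOP };

struct cf_node {
	enum cf_type type;
	const char *name;             /* CF_BLOCK only */
	struct cf_node *body[2];      /* CF_IF: then/else lists; CF_LOOP: body[0] */
	struct cf_node *next;         /* next node in the same list */
};

/* mirrors emit_cf_list()/emit_if()/emit_loop(): emit blocks in program
 * order, recursing into if/loop bodies:
 */
static void emit_cf_list(struct cf_node *n)
{
	for (; n; n = n->next) {
		switch (n->type) {
		case CF_BLOCK:
			printf("emit %s\n", n->name);
			break;
		case CF_IF:
			emit_cf_list(n->body[0]);   /* then_list */
			emit_cf_list(n->body[1]);   /* else_list */
			break;
		case CF_LOOP:
			emit_cf_list(n->body[0]);
			break;
		}
	}
}

int main(void)
{
	struct cf_node then_b = { CF_BLOCK, "then-block" };
	struct cf_node else_b = { CF_BLOCK, "else-block" };
	struct cf_node merge  = { CF_BLOCK, "merge-block" };
	struct cf_node ifn    = { CF_IF, 0, { &then_b, &else_b }, &merge };
	struct cf_node start  = { CF_BLOCK, "start-block", { 0 }, &ifn };

	emit_cf_list(&start);
	return 0;
}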
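
In ir3_legalize.c, resolve_jumps() has a subtle fixed-point structure: removing a branch that targets the very next instruction renumbers every later instruction, which can in turn make other branches become fall-throughs, so the pass re-runs ir3_count_instructions() and retries until nothing is removed. It terminates because each round removes at least one instruction. A toy model using absolute instruction indices instead of blocks and relative offsets; it also assumes no jump targets a removed jump itself (the real pass redirects those first, via resolve_dest_block()):

#include <stdbool.h>
#include <stdio.h>

struct toy_instr {
	bool is_jump;
	int target;          /* absolute index (ip) of the destination */
	const char *name;
};

/* remove one jump that lands on the immediately following instruction,
 * compacting the array and renumbering targets; returns true if
 * something was removed so the caller can retry:
 */
static bool resolve_jumps(struct toy_instr *instrs, int *count)
{
	for (int ip = 0; ip < *count; ip++) {
		if (!instrs[ip].is_jump || instrs[ip].target != ip + 1)
			continue;
		for (int j = ip; j < *count - 1; j++)
			instrs[j] = instrs[j + 1];
		(*count)--;
		/* fix up targets that pointed past the removed slot: */
		for (int j = 0; j < *count; j++)
			if (instrs[j].is_jump && instrs[j].target > ip)
				instrs[j].target--;
		return true;
	}
	return false;
}

int main(void)
{
	struct toy_instr instrs[] = {
		{ false, 0, "add"  },
		{ true,  2, "jump" },    /* targets next instr: removable */
		{ false, 0, "mul"  },
		{ true,  0, "jump" },    /* backwards: stays */
	};
	int count = 4;

	while (resolve_jumps(instrs, &count)) {}

	for (int ip = 0; ip < count; ip++) {
		printf("%d: %s", ip, instrs[ip].name);
		if (instrs[ip].is_jump)
			printf(" -> %d", instrs[ip].target);
		printf("\n");
	}
	return 0;
}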
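
The ir3_ra.c side is a textbook backward liveness problem layered over the existing linear def/use table: per block, 'use' collects names read before being written in that block and 'def' names written before being read; then livein = use | (liveout & ~def) and liveout = union of the successors' livein are iterated to a fixed point (ra_compute_livein_liveout() reports progress, and ra_add_interference() spins on it). Afterwards, any name live into a block has its range widened to the block's start_ip, and any name live out of it to end_ip. A compact, runnable version of the fixed point, using one plain bitmask where the patch uses BITSET_WORD arrays:

#include <stdbool.h>
#include <stdio.h>

#define NBLOCKS 3

struct toy_block {
	unsigned def, use;       /* one bit per variable: bit0 = a, bit1 = b */
	int succs[2];            /* successor indices, -1 = none */
	unsigned livein, liveout;
};

/* one pass of:  livein  = use | (liveout & ~def)
 *               liveout |= livein of each successor
 * returns whether anything changed; the caller loops to a fixed point:
 */
static bool compute_live(struct toy_block *blocks)
{
	bool progress = false;
	for (int b = 0; b < NBLOCKS; b++) {
		unsigned new_livein =
			blocks[b].use | (blocks[b].liveout & ~blocks[b].def);
		if (new_livein & ~blocks[b].livein) {
			blocks[b].livein |= new_livein;
			progress = true;
		}
		for (int s = 0; s < 2; s++) {
			int succ = blocks[b].succs[s];
			if (succ < 0)
				continue;
			unsigned new_liveout =
				blocks[succ].livein & ~blocks[b].liveout;
			if (new_liveout) {
				blocks[b].liveout |= new_liveout;
				progress = true;
			}
		}
	}
	return progress;
}

int main(void)
{
	/* block0: a=..   block1 (loop): use a, b=.., loops to itself or
	 * falls through   block2: use b
	 */
	struct toy_block blocks[NBLOCKS] = {
		{ .def = 1, .use = 0, .succs = {  1, -1 } },
		{ .def = 2, .use = 1, .succs = {  1,  2 } },
		{ .def = 0, .use = 2, .succs = { -1, -1 } },
	};

	while (compute_live(blocks)) {}

	for (int b = 0; b < NBLOCKS; b++)
		printf("block%d: livein=0x%x liveout=0x%x\n",
				b, blocks[b].livein, blocks[b].liveout);
	return 0;
}

Note that liveout of block1 ends up as 0x3: 'a' stays live around the back-edge, which is the same situation the phi-src range extension in ra_block_compute_live_ranges() has to handle explicitly.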
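
One scheduler detail worth calling out: the hard-coded delay of 6 in the branch-emission hunk is the write-to-read latency between the cmps.* producing p0.x and the br consuming it, and the debug_assert(ir3_delayslots(...) == 6) is there to catch a future generation where that assumption breaks. The nop padding then reduces to the following, where the distance argument means slots already elapsed since the predicate write, as with distance() in the patch:

/* nops needed before the conditional branch, given how many slots ago
 * p0.x was written (6 is the value the patch asserts via
 * ir3_delayslots()):
 */
static unsigned branch_nops(unsigned distance_to_pred_write)
{
	const unsigned delay = 6;
	return (distance_to_pred_write >= delay) ?
			0 : delay - distance_to_pred_write;
}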
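
Finally, sched_insert_parallel_copies() exists to keep RA manageable: every phi src gets a dedicated mov in the block that produces the value, with the mov's dst flagged IR3_REG_PHI_SRC pointing back at the phi. get_definer() can then treat the phi plus all of its copies as a single name, so the whole web coalesces into one register, and the matching ir3_cp.c hunk stops copy-propagation from eliminating those movs again. Roughly, in the dump style ir3_print now produces (SSA names and block numbers invented):

before:
	block1 { ssa_8 = phi(ssa_5, ssa_7); ... }        ; srcs defined in block0/block2

after:
	block0 { ...; mov.u32u32 ssa_9, ssa_5 }          ; dst flagged PHI_SRC -> phi
	block2 { ...; mov.u32u32 ssa_10, ssa_7 }         ; dst flagged PHI_SRC -> phi
	block1 { ssa_8 = phi(ssa_9, ssa_10); ... }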