5 files changed, 244 insertions, 42 deletions
diff --git a/src/gallium/drivers/freedreno/ir3/ir3.h b/src/gallium/drivers/freedreno/ir3/ir3.h
index 30932854884..430bcf22d6f 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3.h
+++ b/src/gallium/drivers/freedreno/ir3/ir3.h
@@ -205,6 +205,9 @@ struct ir3_instruction {
 			int off;              /* component/offset */
 		} fo;
 		struct {
+			int aid;
+		} fi;
+		struct {
 			struct ir3_block *if_block, *else_block;
 		} flow;
 		struct {
@@ -264,6 +267,19 @@ struct ir3_instruction {
 	 */
 	struct ir3_instruction *address;
 
+	/* in case of a instruction with relative dst instruction, we need to
+	 * capture the dependency on the fanin for the previous values of
+	 * the array elements.  Since we don't know at compile time actually
+	 * which array elements are written, this serves to preserve the
+	 * unconditional write to array elements prior to the conditional
+	 * write.
+	 *
+	 * TODO only cat1 can do indirect write.. we could maybe move this
+	 * into instr->cat1.fanin (but would require the frontend to insert
+	 * the extra mov)
+	 */
+	struct ir3_instruction *fanin;
+
 	struct ir3_instruction *next;
 #ifdef DEBUG
 	uint32_t serialno;
@@ -373,6 +389,8 @@ static inline int ir3_instr_regno(struct ir3_instruction *instr,
 }
 
 
+#define MAX_ARRAYS 16
+
 /* comp:
  *   0 - x
  *   1 - y
@@ -498,6 +516,8 @@ static inline bool reg_gpr(struct ir3_register *r)
 
 static inline unsigned __ssa_src_cnt(struct ir3_instruction *instr)
 {
+	if (instr->fanin)
+		return instr->regs_count + 2;
 	if (instr->address)
 		return instr->regs_count + 1;
 	return instr->regs_count;
@@ -505,6 +525,8 @@ static inline unsigned __ssa_src_cnt(struct ir3_instruction *instr)
 
 static inline struct ir3_instruction * __ssa_src_n(struct ir3_instruction *instr, unsigned n)
 {
+	if (n == (instr->regs_count + 1))
+		return instr->fanin;
 	if (n == (instr->regs_count + 0))
 		return instr->address;
 	return ssa(instr->regs[n]);
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_compiler.c b/src/gallium/drivers/freedreno/ir3/ir3_compiler.c
index d755babf31b..df428ebed7e 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_compiler.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_compiler.c
@@ -64,7 +64,7 @@ struct ir3_compile_context {
 	 */
 	struct {
 		struct ir3_instruction *instr, **instrp;
-	} output_updates[16];
+	} output_updates[64];
 	unsigned num_output_updates;
 
 	/* are we in a sequence of "atomic" instructions?
@@ -97,7 +97,7 @@ struct ir3_compile_context {
 	struct {
 		unsigned first, last;
 		struct ir3_instruction *fanin;
-	} array[16];
+	} array[MAX_ARRAYS];
 	uint32_t array_dirty;
 	/* offset into array[], per file, of first array info */
 	uint8_t array_offsets[TGSI_FILE_COUNT];
@@ -247,10 +247,6 @@ compile_init(struct ir3_compile_context *ctx, struct ir3_shader_variant *so,
 	memset(ctx->array_offsets, 0, sizeof(ctx->array_offsets));
 
 #define FM(x) (1 << TGSI_FILE_##x)
-	/* optimize can't deal with relative addressing: */
-	if (info->indirect_files_written & (FM(TEMPORARY) | FM(INPUT) | FM(OUTPUT)))
-		return TGSI_PARSE_ERROR;
-
 	/* NOTE: if relative addressing is used, we set constlen in
 	 * the compiler (to worst-case value) since we don't know in
 	 * the assembler what the max addr reg value can be:
@@ -595,6 +591,16 @@ ssa_instr_get(struct ir3_compile_context *ctx, unsigned file, unsigned n)
 	return instr;
 }
 
+static int dst_array_id(struct ir3_compile_context *ctx,
+		const struct tgsi_dst_register *dst)
+{
+	// XXX complete hack to recover tgsi_full_dst_register...
+	// nothing that isn't wrapped in a tgsi_full_dst_register
+	// should be indirect
+	const struct tgsi_full_dst_register *fdst = (const void *)dst;
+	return fdst->Indirect.ArrayID + ctx->array_offsets[dst->File];
+}
+
 static int src_array_id(struct ir3_compile_context *ctx,
 		const struct tgsi_src_register *src)
 {
@@ -639,7 +645,56 @@ static void
 ssa_dst(struct ir3_compile_context *ctx, struct ir3_instruction *instr,
 		const struct tgsi_dst_register *dst, unsigned chan)
 {
-	ssa_instr_set(ctx, dst->File, regid(dst->Index, chan), instr);
+	if (dst->Indirect) {
+		struct ir3_register *reg = instr->regs[0];
+		unsigned i, aid = dst_array_id(ctx, dst);
+		unsigned first = ctx->array[aid].first;
+		unsigned last  = ctx->array[aid].last;
+		unsigned off   = dst->Index - first; /* vec4 offset */
+
+		reg->size = 4 * (1 + last - first);
+		reg->offset = regid(off, chan);
+
+		instr->fanin = array_fanin(ctx, aid, dst->File);
+
+		/* annotate with the array-id, to help out the register-
+		 * assignment stage.  At least for the case of indirect
+		 * writes, we should capture enough dependencies to
+		 * preserve the order of reads/writes of the array, so
+		 * the multiple "names" for the array should end up all
+		 * assigned to the same registers.
+		 */
+		instr->fanin->fi.aid = aid;
+
+		/* Since we are scalarizing vec4 tgsi instructions/regs, we
+		 * run into a slight complication here.  To do the naive thing
+		 * and setup a fanout for each scalar array element would end
+		 * up with the result that the instructions generated for each
+		 * component of the vec4 would end up clobbering each other.
+		 * So we take advantage here of knowing that the array index
+		 * (after the shl.b) will be a multiple of four, and only set
+		 * every fourth scalar component in the array.  See also
+		 * fixup_ssa_dst_array()
+		 */
+		for (i = first; i <= last; i++) {
+			struct ir3_instruction *split;
+			unsigned n = regid(i, chan);
+			int off = (4 * (i - first)) + chan;
+
+			if (is_meta(instr) && (instr->opc == OPC_META_FO))
+				off -= instr->fo.off;
+
+			split = ir3_instr_create(ctx->block, -1, OPC_META_FO);
+			split->fo.off = off;
+			ir3_reg_create(split, 0, 0);
+			ir3_reg_create(split, 0, IR3_REG_SSA)->instr = instr;
+
+			ssa_instr_set(ctx, dst->File, n, split);
+		}
+	} else {
+		/* normal case (not relative addressed GPR) */
+		ssa_instr_set(ctx, dst->File, regid(dst->Index, chan), instr);
+	}
 }
 
 static void
@@ -705,12 +760,22 @@ add_dst_reg_wrmask(struct ir3_compile_context *ctx,
 		break;
 	}
 
-	if (dst->Indirect)
+	if (dst->Indirect) {
 		flags |= IR3_REG_RELATIV;
 
-	reg = ir3_reg_create(instr, regid(num, chan), flags);
+		/* shouldn't happen, and we can't cope with it below: */
+		compile_assert(ctx, wrmask == 0x1);
+
+		compile_assert(ctx, ctx->block->address);
+		if (instr->address)
+			compile_assert(ctx, ctx->block->address == instr->address);
+
+		instr->address = ctx->block->address;
+	}
 
+	reg = ir3_reg_create(instr, regid(num, chan), flags);
 	reg->wrmask = wrmask;
+
 	if (wrmask == 0x1) {
 		/* normal case */
 		ssa_dst(ctx, instr, dst, chan);
@@ -720,6 +785,8 @@ add_dst_reg_wrmask(struct ir3_compile_context *ctx,
 		struct ir3_instruction *prev = NULL;
 		unsigned i;
 
+		compile_assert(ctx, !dst->Indirect);
+
 		/* if instruction writes multiple, we need to create
 		 * some place-holder collect the registers:
 		 */
@@ -2539,10 +2606,16 @@ instr_cat1(const struct instr_translater *t,
 		struct ir3_compile_context *ctx,
 		struct tgsi_full_instruction *inst)
 {
-	struct tgsi_dst_register *dst = get_dst(ctx, inst);
+	struct tgsi_dst_register *dst = &inst->Dst[0].Register;
 	struct tgsi_src_register *src = &inst->Src[0].Register;
+
+	/* NOTE: atomic start/end, rather than in create_mov() since
+	 * create_mov() is used already w/in atomic sequences (and
+	 * we aren't clever enough to deal with the nesting)
+	 */
+	instr_atomic_start(ctx);
 	create_mov(ctx, dst, src);
-	put_dst(ctx, inst, dst);
+	instr_atomic_end(ctx);
 }
 
 static void
@@ -3322,6 +3395,10 @@ ir3_compile_shader(struct ir3_shader_variant *so,
 		goto out;
 	}
 
+	/* for now, until the edge cases are worked out: */
+	if (ctx.info.indirect_files_written & (FM(TEMPORARY) | FM(INPUT) | FM(OUTPUT)))
+		cp = false;
+
 	compile_instructions(&ctx);
 
 	block = ctx.block;
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_cp.c b/src/gallium/drivers/freedreno/ir3/ir3_cp.c
index 898ed70abeb..8ad1894a324 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_cp.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_cp.c
@@ -49,7 +49,7 @@ static bool is_eligible_mov(struct ir3_instruction *instr)
 		struct ir3_register *dst = instr->regs[0];
 		struct ir3_register *src = instr->regs[1];
 		struct ir3_instruction *src_instr = ssa(src);
-		if (dst->flags & IR3_REG_ADDR)
+		if (dst->flags & (IR3_REG_ADDR | IR3_REG_RELATIV))
 			return false;
 		/* TODO: propagate abs/neg modifiers if possible */
 		if (src->flags & (IR3_REG_ABS | IR3_REG_NEGATE | IR3_REG_RELATIV))
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_dump.c b/src/gallium/drivers/freedreno/ir3/ir3_dump.c
index a846777b879..dc251653fa5 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_dump.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_dump.c
@@ -414,8 +414,20 @@ ir3_dump_instr_single(struct ir3_instruction *instr)
 		fprintf(ctx.f, "]");
 	}
 
-	if (is_meta(instr) && (instr->opc == OPC_META_FO))
-		printf(", off=%d", instr->fo.off);
+	if (instr->fanin) {
+		fprintf(ctx.f, ", fanin=_");
+		fprintf(ctx.f, "[");
+		dump_instr_name(&ctx, instr->fanin);
+		fprintf(ctx.f, "]");
+	}
+
+	if (is_meta(instr)) {
+		if (instr->opc == OPC_META_FO) {
+			printf(", off=%d", instr->fo.off);
+		} else if ((instr->opc == OPC_META_FI) && instr->fi.aid) {
+			printf(", aid=%d", instr->fi.aid);
+		}
+	}
 
 	printf("\n");
 }
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_ra.c b/src/gallium/drivers/freedreno/ir3/ir3_ra.c
index 0f6d40f5a7c..a4235a77a15 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_ra.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_ra.c
@@ -47,6 +47,12 @@
  * I'm not really sure a sane way for the CP stage to realize when it
  * cannot remove a mov due to multi-register constraints..
  *
+ * NOTE: http://scopesconf.org/scopes-01/paper/session1_2.ps.gz has
+ * some ideas to handle array allocation with a more conventional
+ * graph coloring algorithm for register assignment, which might be
+ * a good alternative to the current algo.  However afaict it cannot
+ * handle overlapping arrays, which is a scenario that we have to
+ * deal with
  */
 
 struct ir3_ra_ctx {
@@ -56,6 +62,10 @@ struct ir3_ra_ctx {
 	bool frag_face;
 	int cnt;
 	bool error;
+	struct {
+		unsigned base;
+		unsigned size;
+	} arrays[MAX_ARRAYS];
 };
 
 #ifdef DEBUG
@@ -141,6 +151,26 @@ static bool instr_is_output(struct ir3_instruction *instr)
 	return false;
 }
 
+static void mark_sources(struct ir3_instruction *instr,
+		struct ir3_instruction *n, regmask_t *liveregs, regmask_t *written)
+{
+	unsigned i;
+
+	for (i = 1; i < n->regs_count; i++) {
+		struct ir3_register *r = reg_check(n, i);
+		if (r)
+			regmask_set_if_not(liveregs, r, written);
+
+		/* if any src points back to the instruction(s) in
+		 * the block of neighbors that we are assigning then
+		 * mark any written (clobbered) registers as live:
+		 */
+		if (instr_used_by(instr, n->regs[i]))
+			regmask_or(liveregs, liveregs, written);
+	}
+
+}
+
 /* live means read before written */
 static void compute_liveregs(struct ir3_ra_ctx *ctx,
 		struct ir3_instruction *instr, regmask_t *liveregs)
@@ -159,18 +189,13 @@ static void compute_liveregs(struct ir3_ra_ctx *ctx,
 			continue;
 
 		/* check first src's read: */
-		for (i = 1; i < n->regs_count; i++) {
-			r = reg_check(n, i);
-			if (r)
-				regmask_set_if_not(liveregs, r, &written);
-
-			/* if any src points back to the instruction(s) in
-			 * the block of neighbors that we are assigning then
-			 * mark any written (clobbered) registers as live:
-			 */
-			if (instr_used_by(instr, n->regs[i]))
-				regmask_or(liveregs, liveregs, &written);
-		}
+		mark_sources(instr, n, liveregs, &written);
+
+		/* for instructions that write to an array, we need to
+		 * capture the dependency on the array elements:
+		 */
+		if (n->fanin)
+			mark_sources(instr, n->fanin, liveregs, &written);
 
 		/* meta-instructions don't actually get scheduled,
 		 * so don't let it's write confuse us.. what we
@@ -383,14 +408,32 @@ static void instr_assign_src(struct ir3_ra_ctx *ctx,
 	}
 }
 
-static void instr_assign(struct ir3_ra_ctx *ctx,
+static void instr_assign_srcs(struct ir3_ra_ctx *ctx,
 		struct ir3_instruction *instr, unsigned name)
 {
 	struct ir3_instruction *n, *src;
+
+	for (n = instr->next; n && !ctx->error; n = n->next) {
+		foreach_ssa_src_n(src, i, n) {
+			unsigned r = i + 1;
+
+			/* skip address / etc (non real sources): */
+			if (r >= n->regs_count)
+				continue;
+
+			if (src == instr)
+				instr_assign_src(ctx, n, r, name);
+		}
+	}
+}
+
+static void instr_assign(struct ir3_ra_ctx *ctx,
+		struct ir3_instruction *instr, unsigned name)
+{
 	struct ir3_register *reg = instr->regs[0];
 
-	if ((reg->flags & IR3_REG_RELATIV))
-		name += reg->offset;
+	if (reg->flags & IR3_REG_RELATIV)
+		return;
 
 	/* check if already assigned: */
 	if (!(reg->flags & IR3_REG_SSA)) {
@@ -403,18 +446,7 @@ static void instr_assign(struct ir3_ra_ctx *ctx,
 	reg_assign(instr, 0, name);
 
 	/* and rename any subsequent use of result of this instr: */
-	for (n = instr->next; n && !ctx->error; n = n->next) {
-		foreach_ssa_src_n(src, i, n) {
-			unsigned r = i + 1;
-
-			/* skip address / etc (non real sources): */
-			if (r >= n->regs_count)
-				continue;
-
-			if (src == instr)
-				instr_assign_src(ctx, n, r, name);
-		}
-	}
+	instr_assign_srcs(ctx, instr, name);
 
 	/* To simplify the neighbor logic, and to "avoid" dealing with
 	 * instructions which write more than one output, we actually
@@ -423,6 +455,8 @@ static void instr_assign(struct ir3_ra_ctx *ctx,
 	 * to the actual instruction:
 	 */
 	if (is_meta(instr) && (instr->opc == OPC_META_FO)) {
+		struct ir3_instruction *src;
+
 		debug_assert(name >= instr->fo.off);
 
 		foreach_ssa_src(src, instr)
@@ -444,7 +478,8 @@ static int check_partial_assignment(struct ir3_ra_ctx *ctx,
 
 	for (n = instr; n; n = n->cp.right) {
 		struct ir3_register *dst = n->regs[0];
-		if (!(dst->flags & IR3_REG_SSA)) {
+		if ((n->depth != DEPTH_UNUSED) &&
+				!(dst->flags & IR3_REG_SSA)) {
 			int name = dst->num - off;
 			debug_assert(name >= 0);
 			return name;
@@ -472,6 +507,23 @@ static void instr_alloc_and_assign(struct ir3_ra_ctx *ctx,
 
 	dst = instr->regs[0];
 
+	/* For indirect dst, take the register assignment from the
+	 * fanin and propagate it forward.
+	 */
+	if (dst->flags & IR3_REG_RELATIV) {
+		/* NOTE can be grouped, if for example outputs:
+		 * for now disable cp if indirect writes
+		 */
+		instr_alloc_and_assign(ctx, instr->fanin);
+
+		dst->num += instr->fanin->regs[0]->num;
+		dst->flags &= ~IR3_REG_SSA;
+
+		instr_assign_srcs(ctx, instr, instr->fanin->regs[0]->num);
+
+		return;
+	}
+
 	/* for instructions w/ fanouts, do the actual register assignment
 	 * on the group of fanout neighbor nodes and propagate the reg
 	 * name back up to the texture instruction.
@@ -510,6 +562,33 @@ static void instr_alloc_and_assign(struct ir3_ra_ctx *ctx,
 	}
 }
 
+static void instr_assign_array(struct ir3_ra_ctx *ctx,
+		struct ir3_instruction *instr)
+{
+	struct ir3_instruction *src;
+	int name, aid = instr->fi.aid;
+
+	if (ctx->arrays[aid].base == ~0) {
+		int size = instr->regs_count - 1;
+		ctx->arrays[aid].base = alloc_block(ctx, instr, size);
+		ctx->arrays[aid].size = size;
+	}
+
+	name = ctx->arrays[aid].base;
+
+	foreach_ssa_src_n(src, i, instr) {
+		unsigned r = i + 1;
+
+		/* skip address / etc (non real sources): */
+		if (r >= instr->regs_count)
+			break;
+
+		instr_assign(ctx, src, name);
+		name++;
+	}
+
+}
+
 static int block_ra(struct ir3_ra_ctx *ctx, struct ir3_block *block)
 {
 	struct ir3_instruction *n;
@@ -531,6 +610,16 @@ static int block_ra(struct ir3_ra_ctx *ctx, struct ir3_block *block)
 
 	ra_dump_list("-------\n", block->head);
 
+	/* first pass, assign arrays: */
+	for (n = block->head; n && !ctx->error; n = n->next) {
+		if (is_meta(n) && (n->opc == OPC_META_FI) && n->fi.aid) {
+			debug_assert(!n->cp.left);  /* don't think this should happen */
+			ra_dump_instr("ASSIGN ARRAY: ", n);
+			instr_assign_array(ctx, n);
+			ra_dump_list("-------\n", block->head);
+		}
+	}
+
 	for (n = block->head; n && !ctx->error; n = n->next) {
 		ra_dump_instr("ASSIGN: ", n);
 		instr_alloc_and_assign(ctx, ir3_neighbor_first(n));
@@ -552,6 +641,8 @@ int ir3_block_ra(struct ir3_block *block, enum shader_t type,
 	};
 	int ret;
 
+	memset(&ctx.arrays, ~0, sizeof(ctx.arrays));
+
 	/* mark dst registers w/ SSA flag so we can see which
 	 * have been assigned so far:
 	 * NOTE: we really should set SSA flag consistently on