author | Rob Clark <[email protected]> | 2014-07-25 11:15:59 -0400
---|---|---
committer | Rob Clark <[email protected]> | 2014-07-25 13:29:28 -0400
commit | db193e5ad06e7a2fbcffb3bb5df85d212eb12291 (patch) |
tree | 58d1ec24c0af7b1acb1477eeaababe3d7eda6019 /src/gallium/drivers/freedreno/ir3 |
parent | 7d7e6ae9c3544ce1889aa9b8a34545c6f42017e7 (diff) |
freedreno/ir3: split out shader compiler from a3xx
Move the bits we want to share between generations from fd3_program to
ir3_shader. So the overall structure is:

    fdN_shader_stateobj -> ir3_shader -> ir3_shader_variant -> ir3
                                      |- ...
                                      \- ir3_shader_variant -> ir3

So ir3_shader becomes the topmost generation-neutral object, which
manages the set of variants, each of which generates, compiles, and
assembles its own ir.
There is a bit of additional renaming to s/fd3_compiler/ir3_compiler/,
etc.
Keep the split between the gallium-level stateobj and the shader helper
object, because it might be a good idea to pre-compute some
generation-specific register values (ie. anything that is independent
of linking).
Signed-off-by: Rob Clark <[email protected]>
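
To make the variant ownership concrete, the sketch below illustrates the post-split hierarchy. The struct names (fd3_shader_stateobj, ir3_shader, ir3_shader_variant, ir3) are the ones this commit introduces or renames; the fields shown are assumptions for illustration, not the exact definitions from ir3_shader.h.

```c
/* Illustrative sketch of the post-split object hierarchy; the fields
 * are assumptions, not the exact ir3_shader.h definitions.
 */
struct ir3_shader_variant {
	struct ir3 *ir;                       /* this variant's own compiled/assembled ir */
	struct ir3_shader_variant *next;      /* sibling in the shader's variant set */
	struct ir3_shader *shader;            /* back-pointer to generation-neutral object */
};

struct ir3_shader {
	enum shader_t type;                   /* vertex/fragment, generation neutral */
	struct ir3_shader_variant *variants;  /* the managed set of variants */
};

/* each generation keeps its own gallium-level stateobj wrapping the
 * shared object, leaving room to pre-compute generation-specific
 * register values that are independent of linking:
 */
struct fd3_shader_stateobj {
	struct ir3_shader *shader;
};
```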
Diffstat (limited to 'src/gallium/drivers/freedreno/ir3')
-rw-r--r-- | src/gallium/drivers/freedreno/ir3/disasm-a3xx.c | 805
-rw-r--r-- | src/gallium/drivers/freedreno/ir3/instr-a3xx.h | 691
-rw-r--r-- | src/gallium/drivers/freedreno/ir3/ir3.c | 675
-rw-r--r-- | src/gallium/drivers/freedreno/ir3/ir3.h | 480
-rw-r--r-- | src/gallium/drivers/freedreno/ir3/ir3_compiler.c | 2639
-rw-r--r-- | src/gallium/drivers/freedreno/ir3/ir3_compiler.h | 42
-rw-r--r-- | src/gallium/drivers/freedreno/ir3/ir3_compiler_old.c | 1524
-rw-r--r-- | src/gallium/drivers/freedreno/ir3/ir3_cp.c | 158
-rw-r--r-- | src/gallium/drivers/freedreno/ir3/ir3_depth.c | 159
-rw-r--r-- | src/gallium/drivers/freedreno/ir3/ir3_dump.c | 425
-rw-r--r-- | src/gallium/drivers/freedreno/ir3/ir3_flatten.c | 155
-rw-r--r-- | src/gallium/drivers/freedreno/ir3/ir3_ra.c | 790
-rw-r--r-- | src/gallium/drivers/freedreno/ir3/ir3_sched.c | 401
-rw-r--r-- | src/gallium/drivers/freedreno/ir3/ir3_shader.c | 211
-rw-r--r-- | src/gallium/drivers/freedreno/ir3/ir3_shader.h | 163
-rw-r--r-- | src/gallium/drivers/freedreno/ir3/ir3_visitor.h | 154
16 files changed, 9472 insertions, 0 deletions
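
Before the per-file diffs, a quick orientation on how the builder and assembler entry points added in ir3.c below compose. This is a hedged usage sketch, not driver code: the opcode, type, and flag choices are illustrative, and the register numbering follows the rN.x = (N << 2) | x encoding documented in ir3.h.

```c
/* Hedged sketch: build "mov.f32f32 r0.x, c0.x" plus "end" with the
 * constructors from ir3.c, then assemble to a malloc'd dword array.
 */
#include "ir3.h"

static void *build_trivial_shader(struct ir3_info *info)
{
	struct ir3 *ir = ir3_create();
	struct ir3_block *block = ir3_block_create(ir, 0, 0, 0);

	/* category 1 has no opc -- all cat1 instructions are mov variants,
	 * selected by src/dst type:
	 */
	struct ir3_instruction *mov = ir3_instr_create(block, 1, 0);
	mov->cat1.src_type = TYPE_F32;
	mov->cat1.dst_type = TYPE_F32;
	ir3_reg_create(mov, (0 << 2) | 0, 0);              /* dst: r0.x */
	ir3_reg_create(mov, (0 << 2) | 0, IR3_REG_CONST);  /* src: c0.x */

	/* category 0 flow instruction terminating the shader: */
	ir3_instr_create(block, 0, OPC_END);

	/* ir3_assemble() pads to a whole number of 4-instruction groups
	 * with nops and fills in max_reg/max_const/etc in *info; the
	 * caller owns (and eventually free()s) the returned buffer:
	 */
	void *bin = ir3_assemble(ir, info);
	ir3_destroy(ir);
	return bin;
}
```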
diff --git a/src/gallium/drivers/freedreno/ir3/disasm-a3xx.c b/src/gallium/drivers/freedreno/ir3/disasm-a3xx.c new file mode 100644 index 00000000000..8c3704bf658 --- /dev/null +++ b/src/gallium/drivers/freedreno/ir3/disasm-a3xx.c @@ -0,0 +1,805 @@ +/* + * Copyright (c) 2013 Rob Clark <[email protected]> + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include <stdio.h> +#include <stdlib.h> +#include <stdint.h> +#include <string.h> +#include <assert.h> + +#include <util/u_debug.h> + +#include "disasm.h" +#include "instr-a3xx.h" + +static enum debug_t debug; + +#define printf debug_printf + +static const char *levels[] = { + "", + "\t", + "\t\t", + "\t\t\t", + "\t\t\t\t", + "\t\t\t\t\t", + "\t\t\t\t\t\t", + "\t\t\t\t\t\t\t", + "\t\t\t\t\t\t\t\t", + "\t\t\t\t\t\t\t\t\t", + "x", + "x", + "x", + "x", + "x", + "x", +}; + +static const char *component = "xyzw"; + +static const char *type[] = { + [TYPE_F16] = "f16", + [TYPE_F32] = "f32", + [TYPE_U16] = "u16", + [TYPE_U32] = "u32", + [TYPE_S16] = "s16", + [TYPE_S32] = "s32", + [TYPE_U8] = "u8", + [TYPE_S8] = "s8", +}; + +static void print_reg(reg_t reg, bool full, bool r, bool c, bool im, + bool neg, bool abs, bool addr_rel) +{ + const char type = c ? 'c' : 'r'; + + // XXX I prefer - and || for neg/abs, but preserving format used + // by libllvm-a3xx for easy diffing.. + + if (abs && neg) + printf("(absneg)"); + else if (neg) + printf("(neg)"); + else if (abs) + printf("(abs)"); + + if (r) + printf("(r)"); + + if (im) { + printf("%d", reg.iim_val); + } else if (addr_rel) { + /* I would just use %+d but trying to make it diff'able with + * libllvm-a3xx... + */ + if (reg.iim_val < 0) + printf("%s%c<a0.x - %d>", full ? "" : "h", type, -reg.iim_val); + else if (reg.iim_val > 0) + printf("%s%c<a0.x + %d>", full ? "" : "h", type, reg.iim_val); + else + printf("%s%c<a0.x>", full ? "" : "h", type); + } else if ((reg.num == REG_A0) && !c) { + printf("a0.%c", component[reg.comp]); + } else if ((reg.num == REG_P0) && !c) { + printf("p0.%c", component[reg.comp]); + } else { + printf("%s%c%d.%c", full ? 
"" : "h", type, reg.num, component[reg.comp]); + } +} + + +/* current instruction repeat flag: */ +static unsigned repeat; + +static void print_reg_dst(reg_t reg, bool full, bool addr_rel) +{ + print_reg(reg, full, false, false, false, false, false, addr_rel); +} + +static void print_reg_src(reg_t reg, bool full, bool r, bool c, bool im, + bool neg, bool abs, bool addr_rel) +{ + print_reg(reg, full, r, c, im, neg, abs, addr_rel); +} + +static void print_instr_cat0(instr_t *instr) +{ + instr_cat0_t *cat0 = &instr->cat0; + + switch (cat0->opc) { + case OPC_KILL: + printf(" %sp0.%c", cat0->inv ? "!" : "", + component[cat0->comp]); + break; + case OPC_BR: + printf(" %sp0.%c, #%d", cat0->inv ? "!" : "", + component[cat0->comp], cat0->immed); + break; + case OPC_JUMP: + case OPC_CALL: + printf(" #%d", cat0->immed); + break; + } + + if ((debug & PRINT_VERBOSE) && (cat0->dummy1|cat0->dummy2|cat0->dummy3|cat0->dummy4)) + printf("\t{0: %x,%x,%x,%x}", cat0->dummy1, cat0->dummy2, cat0->dummy3, cat0->dummy4); +} + +static void print_instr_cat1(instr_t *instr) +{ + instr_cat1_t *cat1 = &instr->cat1; + + if (cat1->ul) + printf("(ul)"); + + if (cat1->src_type == cat1->dst_type) { + if ((cat1->src_type == TYPE_S16) && (((reg_t)cat1->dst).num == REG_A0)) { + /* special case (nmemonic?): */ + printf("mova"); + } else { + printf("mov.%s%s", type[cat1->src_type], type[cat1->dst_type]); + } + } else { + printf("cov.%s%s", type[cat1->src_type], type[cat1->dst_type]); + } + + printf(" "); + + if (cat1->even) + printf("(even)"); + + if (cat1->pos_inf) + printf("(pos_infinity)"); + + print_reg_dst((reg_t)(cat1->dst), type_size(cat1->dst_type) == 32, + cat1->dst_rel); + + printf(", "); + + /* ugg, have to special case this.. vs print_reg().. */ + if (cat1->src_im) { + if (type_float(cat1->src_type)) + printf("(%f)", cat1->fim_val); + else + printf("%d", cat1->iim_val); + } else if (cat1->src_rel && !cat1->src_c) { + /* I would just use %+d but trying to make it diff'able with + * libllvm-a3xx... + */ + char type = cat1->src_rel_c ? 
'c' : 'r'; + if (cat1->off < 0) + printf("%c<a0.x - %d>", type, -cat1->off); + else if (cat1->off > 0) + printf("%c<a0.x + %d>", type, cat1->off); + else + printf("c<a0.x>"); + } else { + print_reg_src((reg_t)(cat1->src), type_size(cat1->src_type) == 32, + cat1->src_r, cat1->src_c, cat1->src_im, false, false, false); + } + + if ((debug & PRINT_VERBOSE) && (cat1->must_be_0)) + printf("\t{1: %x}", cat1->must_be_0); +} + +static void print_instr_cat2(instr_t *instr) +{ + instr_cat2_t *cat2 = &instr->cat2; + static const char *cond[] = { + "lt", + "le", + "gt", + "ge", + "eq", + "ne", + "?6?", + }; + + switch (cat2->opc) { + case OPC_CMPS_F: + case OPC_CMPS_U: + case OPC_CMPS_S: + case OPC_CMPV_F: + case OPC_CMPV_U: + case OPC_CMPV_S: + printf(".%s", cond[cat2->cond]); + break; + } + + printf(" "); + if (cat2->ei) + printf("(ei)"); + print_reg_dst((reg_t)(cat2->dst), cat2->full ^ cat2->dst_half, false); + printf(", "); + + if (cat2->c1.src1_c) { + print_reg_src((reg_t)(cat2->c1.src1), cat2->full, cat2->src1_r, + cat2->c1.src1_c, cat2->src1_im, cat2->src1_neg, + cat2->src1_abs, false); + } else if (cat2->rel1.src1_rel) { + print_reg_src((reg_t)(cat2->rel1.src1), cat2->full, cat2->src1_r, + cat2->rel1.src1_c, cat2->src1_im, cat2->src1_neg, + cat2->src1_abs, cat2->rel1.src1_rel); + } else { + print_reg_src((reg_t)(cat2->src1), cat2->full, cat2->src1_r, + false, cat2->src1_im, cat2->src1_neg, + cat2->src1_abs, false); + } + + switch (cat2->opc) { + case OPC_ABSNEG_F: + case OPC_ABSNEG_S: + case OPC_CLZ_B: + case OPC_CLZ_S: + case OPC_SIGN_F: + case OPC_FLOOR_F: + case OPC_CEIL_F: + case OPC_RNDNE_F: + case OPC_RNDAZ_F: + case OPC_TRUNC_F: + case OPC_NOT_B: + case OPC_BFREV_B: + case OPC_SETRM: + case OPC_CBITS_B: + /* these only have one src reg */ + break; + default: + printf(", "); + if (cat2->c2.src2_c) { + print_reg_src((reg_t)(cat2->c2.src2), cat2->full, cat2->src2_r, + cat2->c2.src2_c, cat2->src2_im, cat2->src2_neg, + cat2->src2_abs, false); + } else if (cat2->rel2.src2_rel) { + print_reg_src((reg_t)(cat2->rel2.src2), cat2->full, cat2->src2_r, + cat2->rel2.src2_c, cat2->src2_im, cat2->src2_neg, + cat2->src2_abs, cat2->rel2.src2_rel); + } else { + print_reg_src((reg_t)(cat2->src2), cat2->full, cat2->src2_r, + false, cat2->src2_im, cat2->src2_neg, + cat2->src2_abs, false); + } + break; + } +} + +static void print_instr_cat3(instr_t *instr) +{ + instr_cat3_t *cat3 = &instr->cat3; + bool full = instr_cat3_full(cat3); + + printf(" "); + print_reg_dst((reg_t)(cat3->dst), full ^ cat3->dst_half, false); + printf(", "); + if (cat3->c1.src1_c) { + print_reg_src((reg_t)(cat3->c1.src1), full, + cat3->src1_r, cat3->c1.src1_c, false, cat3->src1_neg, + false, false); + } else if (cat3->rel1.src1_rel) { + print_reg_src((reg_t)(cat3->rel1.src1), full, + cat3->src1_r, cat3->rel1.src1_c, false, cat3->src1_neg, + false, cat3->rel1.src1_rel); + } else { + print_reg_src((reg_t)(cat3->src1), full, + cat3->src1_r, false, false, cat3->src1_neg, + false, false); + } + printf(", "); + print_reg_src((reg_t)cat3->src2, full, + cat3->src2_r, cat3->src2_c, false, cat3->src2_neg, + false, false); + printf(", "); + if (cat3->c2.src3_c) { + print_reg_src((reg_t)(cat3->c2.src3), full, + cat3->src3_r, cat3->c2.src3_c, false, cat3->src3_neg, + false, false); + } else if (cat3->rel2.src3_rel) { + print_reg_src((reg_t)(cat3->rel2.src3), full, + cat3->src3_r, cat3->rel2.src3_c, false, cat3->src3_neg, + false, cat3->rel2.src3_rel); + } else { + print_reg_src((reg_t)(cat3->src3), full, + cat3->src3_r, false, false, cat3->src3_neg, + 
false, false); + } +} + +static void print_instr_cat4(instr_t *instr) +{ + instr_cat4_t *cat4 = &instr->cat4; + + printf(" "); + print_reg_dst((reg_t)(cat4->dst), cat4->full ^ cat4->dst_half, false); + printf(", "); + + if (cat4->c.src_c) { + print_reg_src((reg_t)(cat4->c.src), cat4->full, + cat4->src_r, cat4->c.src_c, cat4->src_im, + cat4->src_neg, cat4->src_abs, false); + } else if (cat4->rel.src_rel) { + print_reg_src((reg_t)(cat4->rel.src), cat4->full, + cat4->src_r, cat4->rel.src_c, cat4->src_im, + cat4->src_neg, cat4->src_abs, cat4->rel.src_rel); + } else { + print_reg_src((reg_t)(cat4->src), cat4->full, + cat4->src_r, false, cat4->src_im, + cat4->src_neg, cat4->src_abs, false); + } + + if ((debug & PRINT_VERBOSE) && (cat4->dummy1|cat4->dummy2)) + printf("\t{4: %x,%x}", cat4->dummy1, cat4->dummy2); +} + +static void print_instr_cat5(instr_t *instr) +{ + static const struct { + bool src1, src2, samp, tex; + } info[0x1f] = { + [OPC_ISAM] = { true, false, true, true, }, + [OPC_ISAML] = { true, true, true, true, }, + [OPC_ISAMM] = { true, false, true, true, }, + [OPC_SAM] = { true, false, true, true, }, + [OPC_SAMB] = { true, true, true, true, }, + [OPC_SAML] = { true, true, true, true, }, + [OPC_SAMGQ] = { true, false, true, true, }, + [OPC_GETLOD] = { true, false, true, true, }, + [OPC_CONV] = { true, true, true, true, }, + [OPC_CONVM] = { true, true, true, true, }, + [OPC_GETSIZE] = { true, false, false, true, }, + [OPC_GETBUF] = { false, false, false, true, }, + [OPC_GETPOS] = { true, false, false, true, }, + [OPC_GETINFO] = { false, false, false, true, }, + [OPC_DSX] = { true, false, false, false, }, + [OPC_DSY] = { true, false, false, false, }, + [OPC_GATHER4R] = { true, false, true, true, }, + [OPC_GATHER4G] = { true, false, true, true, }, + [OPC_GATHER4B] = { true, false, true, true, }, + [OPC_GATHER4A] = { true, false, true, true, }, + [OPC_SAMGP0] = { true, false, true, true, }, + [OPC_SAMGP1] = { true, false, true, true, }, + [OPC_SAMGP2] = { true, false, true, true, }, + [OPC_SAMGP3] = { true, false, true, true, }, + [OPC_DSXPP_1] = { true, false, false, false, }, + [OPC_DSYPP_1] = { true, false, false, false, }, + [OPC_RGETPOS] = { false, false, false, false, }, + [OPC_RGETINFO] = { false, false, false, false, }, + }; + instr_cat5_t *cat5 = &instr->cat5; + int i; + + if (cat5->is_3d) printf(".3d"); + if (cat5->is_a) printf(".a"); + if (cat5->is_o) printf(".o"); + if (cat5->is_p) printf(".p"); + if (cat5->is_s) printf(".s"); + if (cat5->is_s2en) printf(".s2en"); + + printf(" "); + + switch (cat5->opc) { + case OPC_DSXPP_1: + case OPC_DSYPP_1: + break; + default: + printf("(%s)", type[cat5->type]); + break; + } + + printf("("); + for (i = 0; i < 4; i++) + if (cat5->wrmask & (1 << i)) + printf("%c", "xyzw"[i]); + printf(")"); + + print_reg_dst((reg_t)(cat5->dst), type_size(cat5->type) == 32, false); + + if (info[cat5->opc].src1) { + printf(", "); + print_reg_src((reg_t)(cat5->src1), cat5->full, false, false, false, + false, false, false); + } + + if (cat5->is_s2en) { + printf(", "); + print_reg_src((reg_t)(cat5->s2en.src2), cat5->full, false, false, false, + false, false, false); + printf(", "); + print_reg_src((reg_t)(cat5->s2en.src3), false, false, false, false, + false, false, false); + } else { + if (cat5->is_o || info[cat5->opc].src2) { + printf(", "); + print_reg_src((reg_t)(cat5->norm.src2), cat5->full, + false, false, false, false, false, false); + } + if (info[cat5->opc].samp) + printf(", s#%d", cat5->norm.samp); + if (info[cat5->opc].tex) + printf(", t#%d", 
cat5->norm.tex); + } + + if (debug & PRINT_VERBOSE) { + if (cat5->is_s2en) { + if ((debug & PRINT_VERBOSE) && (cat5->s2en.dummy1|cat5->s2en.dummy2|cat5->dummy2)) + printf("\t{5: %x,%x,%x}", cat5->s2en.dummy1, cat5->s2en.dummy2, cat5->dummy2); + } else { + if ((debug & PRINT_VERBOSE) && (cat5->norm.dummy1|cat5->dummy2)) + printf("\t{5: %x,%x}", cat5->norm.dummy1, cat5->dummy2); + } + } +} + +static int32_t u2i(uint32_t val, int nbits) +{ + return ((val >> (nbits-1)) * ~((1 << nbits) - 1)) | val; +} + +static void print_instr_cat6(instr_t *instr) +{ + instr_cat6_t *cat6 = &instr->cat6; + + printf(".%s ", type[cat6->type]); + + switch (cat6->opc) { + case OPC_LDG: + case OPC_LDP: + case OPC_LDL: + case OPC_LDLW: + case OPC_LDLV: + /* load instructions: */ + print_reg_dst((reg_t)(cat6->a.dst), type_size(cat6->type) == 32, false); + printf(","); + switch (cat6->opc) { + case OPC_LDG: + printf("g"); + break; + case OPC_LDP: + printf("p"); + break; + case OPC_LDL: + case OPC_LDLW: + case OPC_LDLV: + printf("l"); + break; + } + printf("["); + print_reg_src((reg_t)(cat6->a.src), true, + false, false, false, false, false, false); + if (cat6->a.off) + printf("%+d", cat6->a.off); + printf("]"); + break; + case OPC_PREFETCH: + /* similar to load instructions: */ + printf("g["); + print_reg_src((reg_t)(cat6->a.src), true, + false, false, false, false, false, false); + if (cat6->a.off) + printf("%+d", cat6->a.off); + printf("]"); + break; + case OPC_STG: + case OPC_STP: + case OPC_STL: + case OPC_STLW: + /* store instructions: */ + switch (cat6->opc) { + case OPC_STG: + printf("g"); + break; + case OPC_STP: + printf("p"); + break; + case OPC_STL: + case OPC_STLW: + printf("l"); + break; + } + printf("["); + print_reg_dst((reg_t)(cat6->b.dst), true, false); + if (cat6->b.off || cat6->b.off_hi) + printf("%+d", u2i((cat6->b.off_hi << 8) | cat6->b.off, 13)); + printf("]"); + printf(","); + print_reg_src((reg_t)(cat6->b.src), type_size(cat6->type) == 32, + false, false, false, false, false, false); + + break; + case OPC_STI: + /* sti has same encoding as other store instructions, but + * slightly different syntax: + */ + print_reg_dst((reg_t)(cat6->b.dst), false /* XXX is it always half? */, false); + if (cat6->b.off || cat6->b.off_hi) + printf("%+d", u2i((cat6->b.off_hi << 8) | cat6->b.off, 13)); + printf(","); + print_reg_src((reg_t)(cat6->b.src), type_size(cat6->type) == 32, + false, false, false, false, false, false); + break; + } + + printf(", %d", cat6->iim_val); + + if (debug & PRINT_VERBOSE) { + switch (cat6->opc) { + case OPC_LDG: + case OPC_LDP: + /* load instructions: */ + if (cat6->a.dummy1|cat6->a.dummy2|cat6->a.dummy3) + printf("\t{6: %x,%x,%x}", cat6->a.dummy1, cat6->a.dummy2, cat6->a.dummy3); + if ((cat6->a.must_be_one1 != 1) || (cat6->a.must_be_one2 != 1)) + printf("{?? %d,%d ??}", cat6->a.must_be_one1, cat6->a.must_be_one2); + break; + case OPC_STG: + case OPC_STP: + case OPC_STI: + /* store instructions: */ + if (cat6->b.dummy1|cat6->b.dummy2) + printf("\t{6: %x,%x}", cat6->b.dummy1, cat6->b.dummy2); + if ((cat6->b.must_be_one1 != 1) || (cat6->b.must_be_one2 != 1) || + (cat6->b.must_be_zero1 != 0)) + printf("{?? 
%d,%d,%d ??}", cat6->b.must_be_one1, cat6->b.must_be_one2, + cat6->b.must_be_zero1); + break; + } + } +} + +/* size of largest OPC field of all the instruction categories: */ +#define NOPC_BITS 6 + +struct opc_info { + uint16_t cat; + uint16_t opc; + const char *name; + void (*print)(instr_t *instr); +} opcs[1 << (3+NOPC_BITS)] = { +#define OPC(cat, opc, name) [((cat) << NOPC_BITS) | (opc)] = { (cat), (opc), #name, print_instr_cat##cat } + /* category 0: */ + OPC(0, OPC_NOP, nop), + OPC(0, OPC_BR, br), + OPC(0, OPC_JUMP, jump), + OPC(0, OPC_CALL, call), + OPC(0, OPC_RET, ret), + OPC(0, OPC_KILL, kill), + OPC(0, OPC_END, end), + OPC(0, OPC_EMIT, emit), + OPC(0, OPC_CUT, cut), + OPC(0, OPC_CHMASK, chmask), + OPC(0, OPC_CHSH, chsh), + OPC(0, OPC_FLOW_REV, flow_rev), + + /* category 1: */ + OPC(1, 0, ), + + /* category 2: */ + OPC(2, OPC_ADD_F, add.f), + OPC(2, OPC_MIN_F, min.f), + OPC(2, OPC_MAX_F, max.f), + OPC(2, OPC_MUL_F, mul.f), + OPC(2, OPC_SIGN_F, sign.f), + OPC(2, OPC_CMPS_F, cmps.f), + OPC(2, OPC_ABSNEG_F, absneg.f), + OPC(2, OPC_CMPV_F, cmpv.f), + OPC(2, OPC_FLOOR_F, floor.f), + OPC(2, OPC_CEIL_F, ceil.f), + OPC(2, OPC_RNDNE_F, rndne.f), + OPC(2, OPC_RNDAZ_F, rndaz.f), + OPC(2, OPC_TRUNC_F, trunc.f), + OPC(2, OPC_ADD_U, add.u), + OPC(2, OPC_ADD_S, add.s), + OPC(2, OPC_SUB_U, sub.u), + OPC(2, OPC_SUB_S, sub.s), + OPC(2, OPC_CMPS_U, cmps.u), + OPC(2, OPC_CMPS_S, cmps.s), + OPC(2, OPC_MIN_U, min.u), + OPC(2, OPC_MIN_S, min.s), + OPC(2, OPC_MAX_U, max.u), + OPC(2, OPC_MAX_S, max.s), + OPC(2, OPC_ABSNEG_S, absneg.s), + OPC(2, OPC_AND_B, and.b), + OPC(2, OPC_OR_B, or.b), + OPC(2, OPC_NOT_B, not.b), + OPC(2, OPC_XOR_B, xor.b), + OPC(2, OPC_CMPV_U, cmpv.u), + OPC(2, OPC_CMPV_S, cmpv.s), + OPC(2, OPC_MUL_U, mul.u), + OPC(2, OPC_MUL_S, mul.s), + OPC(2, OPC_MULL_U, mull.u), + OPC(2, OPC_BFREV_B, bfrev.b), + OPC(2, OPC_CLZ_S, clz.s), + OPC(2, OPC_CLZ_B, clz.b), + OPC(2, OPC_SHL_B, shl.b), + OPC(2, OPC_SHR_B, shr.b), + OPC(2, OPC_ASHR_B, ashr.b), + OPC(2, OPC_BARY_F, bary.f), + OPC(2, OPC_MGEN_B, mgen.b), + OPC(2, OPC_GETBIT_B, getbit.b), + OPC(2, OPC_SETRM, setrm), + OPC(2, OPC_CBITS_B, cbits.b), + OPC(2, OPC_SHB, shb), + OPC(2, OPC_MSAD, msad), + + /* category 3: */ + OPC(3, OPC_MAD_U16, mad.u16), + OPC(3, OPC_MADSH_U16, madsh.u16), + OPC(3, OPC_MAD_S16, mad.s16), + OPC(3, OPC_MADSH_M16, madsh.m16), + OPC(3, OPC_MAD_U24, mad.u24), + OPC(3, OPC_MAD_S24, mad.s24), + OPC(3, OPC_MAD_F16, mad.f16), + OPC(3, OPC_MAD_F32, mad.f32), + OPC(3, OPC_SEL_B16, sel.b16), + OPC(3, OPC_SEL_B32, sel.b32), + OPC(3, OPC_SEL_S16, sel.s16), + OPC(3, OPC_SEL_S32, sel.s32), + OPC(3, OPC_SEL_F16, sel.f16), + OPC(3, OPC_SEL_F32, sel.f32), + OPC(3, OPC_SAD_S16, sad.s16), + OPC(3, OPC_SAD_S32, sad.s32), + + /* category 4: */ + OPC(4, OPC_RCP, rcp), + OPC(4, OPC_RSQ, rsq), + OPC(4, OPC_LOG2, log2), + OPC(4, OPC_EXP2, exp2), + OPC(4, OPC_SIN, sin), + OPC(4, OPC_COS, cos), + OPC(4, OPC_SQRT, sqrt), + + /* category 5: */ + OPC(5, OPC_ISAM, isam), + OPC(5, OPC_ISAML, isaml), + OPC(5, OPC_ISAMM, isamm), + OPC(5, OPC_SAM, sam), + OPC(5, OPC_SAMB, samb), + OPC(5, OPC_SAML, saml), + OPC(5, OPC_SAMGQ, samgq), + OPC(5, OPC_GETLOD, getlod), + OPC(5, OPC_CONV, conv), + OPC(5, OPC_CONVM, convm), + OPC(5, OPC_GETSIZE, getsize), + OPC(5, OPC_GETBUF, getbuf), + OPC(5, OPC_GETPOS, getpos), + OPC(5, OPC_GETINFO, getinfo), + OPC(5, OPC_DSX, dsx), + OPC(5, OPC_DSY, dsy), + OPC(5, OPC_GATHER4R, gather4r), + OPC(5, OPC_GATHER4G, gather4g), + OPC(5, OPC_GATHER4B, gather4b), + OPC(5, OPC_GATHER4A, gather4a), + OPC(5, OPC_SAMGP0, samgp0), + 
OPC(5, OPC_SAMGP1, samgp1), + OPC(5, OPC_SAMGP2, samgp2), + OPC(5, OPC_SAMGP3, samgp3), + OPC(5, OPC_DSXPP_1, dsxpp.1), + OPC(5, OPC_DSYPP_1, dsypp.1), + OPC(5, OPC_RGETPOS, rgetpos), + OPC(5, OPC_RGETINFO, rgetinfo), + + + /* category 6: */ + OPC(6, OPC_LDG, ldg), + OPC(6, OPC_LDL, ldl), + OPC(6, OPC_LDP, ldp), + OPC(6, OPC_STG, stg), + OPC(6, OPC_STL, stl), + OPC(6, OPC_STP, stp), + OPC(6, OPC_STI, sti), + OPC(6, OPC_G2L, g2l), + OPC(6, OPC_L2G, l2g), + OPC(6, OPC_PREFETCH, prefetch), + OPC(6, OPC_LDLW, ldlw), + OPC(6, OPC_STLW, stlw), + OPC(6, OPC_RESFMT, resfmt), + OPC(6, OPC_RESINFO, resinf), + OPC(6, OPC_ATOMIC_ADD_L, atomic.add.l), + OPC(6, OPC_ATOMIC_SUB_L, atomic.sub.l), + OPC(6, OPC_ATOMIC_XCHG_L, atomic.xchg.l), + OPC(6, OPC_ATOMIC_INC_L, atomic.inc.l), + OPC(6, OPC_ATOMIC_DEC_L, atomic.dec.l), + OPC(6, OPC_ATOMIC_CMPXCHG_L, atomic.cmpxchg.l), + OPC(6, OPC_ATOMIC_MIN_L, atomic.min.l), + OPC(6, OPC_ATOMIC_MAX_L, atomic.max.l), + OPC(6, OPC_ATOMIC_AND_L, atomic.and.l), + OPC(6, OPC_ATOMIC_OR_L, atomic.or.l), + OPC(6, OPC_ATOMIC_XOR_L, atomic.xor.l), + OPC(6, OPC_LDGB_TYPED_4D, ldgb.typed.4d), + OPC(6, OPC_STGB_4D_4, stgb.4d.4), + OPC(6, OPC_STIB, stib), + OPC(6, OPC_LDC_4, ldc.4), + OPC(6, OPC_LDLV, ldlv), + + +#undef OPC +}; + +#define GETINFO(instr) (&(opcs[((instr)->opc_cat << NOPC_BITS) | instr_opc(instr)])) + +// XXX hack.. probably should move this table somewhere common: +#include "ir3.h" +const char *ir3_instr_name(struct ir3_instruction *instr) +{ + if (instr->category == -1) return "??meta??"; + return opcs[(instr->category << NOPC_BITS) | instr->opc].name; +} + +static void print_instr(uint32_t *dwords, int level, int n) +{ + instr_t *instr = (instr_t *)dwords; + uint32_t opc = instr_opc(instr); + const char *name; + + printf("%s%04d[%08xx_%08xx] ", levels[level], n, dwords[1], dwords[0]); + +#if 0 + /* print unknown bits: */ + if (debug & PRINT_RAW) + printf("[%08xx_%08xx] ", dwords[1] & 0x001ff800, dwords[0] & 0x00000000); + + if (debug & PRINT_VERBOSE) + printf("%d,%02d ", instr->opc_cat, opc); +#endif + + /* NOTE: order flags are printed is a bit fugly.. but for now I + * try to match the order in llvm-a3xx disassembler for easy + * diff'ing.. 
+ */ + + if (instr->sync) + printf("(sy)"); + if (instr->ss && (instr->opc_cat <= 4)) + printf("(ss)"); + if (instr->jmp_tgt) + printf("(jp)"); + if (instr->repeat && (instr->opc_cat <= 4)) { + printf("(rpt%d)", instr->repeat); + repeat = instr->repeat; + } else { + repeat = 0; + } + if (instr->ul && ((2 <= instr->opc_cat) && (instr->opc_cat <= 4))) + printf("(ul)"); + + name = GETINFO(instr)->name; + + if (name) { + printf("%s", name); + GETINFO(instr)->print(instr); + } else { + printf("unknown(%d,%d)", instr->opc_cat, opc); + } + + printf("\n"); +} + +int disasm_a3xx(uint32_t *dwords, int sizedwords, int level, enum shader_t type) +{ + int i; + + assert((sizedwords % 2) == 0); + + for (i = 0; i < sizedwords; i += 2) + print_instr(&dwords[i], level, i/2); + + return 0; +} diff --git a/src/gallium/drivers/freedreno/ir3/instr-a3xx.h b/src/gallium/drivers/freedreno/ir3/instr-a3xx.h new file mode 100644 index 00000000000..c67f1037ced --- /dev/null +++ b/src/gallium/drivers/freedreno/ir3/instr-a3xx.h @@ -0,0 +1,691 @@ +/* + * Copyright (c) 2013 Rob Clark <[email protected]> + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef INSTR_A3XX_H_ +#define INSTR_A3XX_H_ + +#define PACKED __attribute__((__packed__)) + +#include <stdint.h> +#include <assert.h> + +typedef enum { + /* category 0: */ + OPC_NOP = 0, + OPC_BR = 1, + OPC_JUMP = 2, + OPC_CALL = 3, + OPC_RET = 4, + OPC_KILL = 5, + OPC_END = 6, + OPC_EMIT = 7, + OPC_CUT = 8, + OPC_CHMASK = 9, + OPC_CHSH = 10, + OPC_FLOW_REV = 11, + + /* category 1: */ + /* no opc.. 
all category 1 are variants of mov */ + + /* category 2: */ + OPC_ADD_F = 0, + OPC_MIN_F = 1, + OPC_MAX_F = 2, + OPC_MUL_F = 3, + OPC_SIGN_F = 4, + OPC_CMPS_F = 5, + OPC_ABSNEG_F = 6, + OPC_CMPV_F = 7, + /* 8 - invalid */ + OPC_FLOOR_F = 9, + OPC_CEIL_F = 10, + OPC_RNDNE_F = 11, + OPC_RNDAZ_F = 12, + OPC_TRUNC_F = 13, + /* 14-15 - invalid */ + OPC_ADD_U = 16, + OPC_ADD_S = 17, + OPC_SUB_U = 18, + OPC_SUB_S = 19, + OPC_CMPS_U = 20, + OPC_CMPS_S = 21, + OPC_MIN_U = 22, + OPC_MIN_S = 23, + OPC_MAX_U = 24, + OPC_MAX_S = 25, + OPC_ABSNEG_S = 26, + /* 27 - invalid */ + OPC_AND_B = 28, + OPC_OR_B = 29, + OPC_NOT_B = 30, + OPC_XOR_B = 31, + /* 32 - invalid */ + OPC_CMPV_U = 33, + OPC_CMPV_S = 34, + /* 35-47 - invalid */ + OPC_MUL_U = 48, + OPC_MUL_S = 49, + OPC_MULL_U = 50, + OPC_BFREV_B = 51, + OPC_CLZ_S = 52, + OPC_CLZ_B = 53, + OPC_SHL_B = 54, + OPC_SHR_B = 55, + OPC_ASHR_B = 56, + OPC_BARY_F = 57, + OPC_MGEN_B = 58, + OPC_GETBIT_B = 59, + OPC_SETRM = 60, + OPC_CBITS_B = 61, + OPC_SHB = 62, + OPC_MSAD = 63, + + /* category 3: */ + OPC_MAD_U16 = 0, + OPC_MADSH_U16 = 1, + OPC_MAD_S16 = 2, + OPC_MADSH_M16 = 3, /* should this be .s16? */ + OPC_MAD_U24 = 4, + OPC_MAD_S24 = 5, + OPC_MAD_F16 = 6, + OPC_MAD_F32 = 7, + OPC_SEL_B16 = 8, + OPC_SEL_B32 = 9, + OPC_SEL_S16 = 10, + OPC_SEL_S32 = 11, + OPC_SEL_F16 = 12, + OPC_SEL_F32 = 13, + OPC_SAD_S16 = 14, + OPC_SAD_S32 = 15, + + /* category 4: */ + OPC_RCP = 0, + OPC_RSQ = 1, + OPC_LOG2 = 2, + OPC_EXP2 = 3, + OPC_SIN = 4, + OPC_COS = 5, + OPC_SQRT = 6, + // 7-63 - invalid + + /* category 5: */ + OPC_ISAM = 0, + OPC_ISAML = 1, + OPC_ISAMM = 2, + OPC_SAM = 3, + OPC_SAMB = 4, + OPC_SAML = 5, + OPC_SAMGQ = 6, + OPC_GETLOD = 7, + OPC_CONV = 8, + OPC_CONVM = 9, + OPC_GETSIZE = 10, + OPC_GETBUF = 11, + OPC_GETPOS = 12, + OPC_GETINFO = 13, + OPC_DSX = 14, + OPC_DSY = 15, + OPC_GATHER4R = 16, + OPC_GATHER4G = 17, + OPC_GATHER4B = 18, + OPC_GATHER4A = 19, + OPC_SAMGP0 = 20, + OPC_SAMGP1 = 21, + OPC_SAMGP2 = 22, + OPC_SAMGP3 = 23, + OPC_DSXPP_1 = 24, + OPC_DSYPP_1 = 25, + OPC_RGETPOS = 26, + OPC_RGETINFO = 27, + + /* category 6: */ + OPC_LDG = 0, /* load-global */ + OPC_LDL = 1, + OPC_LDP = 2, + OPC_STG = 3, /* store-global */ + OPC_STL = 4, + OPC_STP = 5, + OPC_STI = 6, + OPC_G2L = 7, + OPC_L2G = 8, + OPC_PREFETCH = 9, + OPC_LDLW = 10, + OPC_STLW = 11, + OPC_RESFMT = 14, + OPC_RESINFO = 15, + OPC_ATOMIC_ADD_L = 16, + OPC_ATOMIC_SUB_L = 17, + OPC_ATOMIC_XCHG_L = 18, + OPC_ATOMIC_INC_L = 19, + OPC_ATOMIC_DEC_L = 20, + OPC_ATOMIC_CMPXCHG_L = 21, + OPC_ATOMIC_MIN_L = 22, + OPC_ATOMIC_MAX_L = 23, + OPC_ATOMIC_AND_L = 24, + OPC_ATOMIC_OR_L = 25, + OPC_ATOMIC_XOR_L = 26, + OPC_LDGB_TYPED_4D = 27, + OPC_STGB_4D_4 = 28, + OPC_STIB = 29, + OPC_LDC_4 = 30, + OPC_LDLV = 31, + + /* meta instructions (category -1): */ + /* placeholder instr to mark inputs/outputs: */ + OPC_META_INPUT = 0, + OPC_META_OUTPUT = 1, + /* The "fan-in" and "fan-out" instructions are used for keeping + * track of instructions that write to multiple dst registers + * (fan-out) like texture sample instructions, or read multiple + * consecutive scalar registers (fan-in) (bary.f, texture samp) + */ + OPC_META_FO = 2, + OPC_META_FI = 3, + /* branches/flow control */ + OPC_META_FLOW = 4, + OPC_META_PHI = 5, + /* relative addressing */ + OPC_META_DEREF = 6, + + +} opc_t; + +typedef enum { + TYPE_F16 = 0, + TYPE_F32 = 1, + TYPE_U16 = 2, + TYPE_U32 = 3, + TYPE_S16 = 4, + TYPE_S32 = 5, + TYPE_U8 = 6, + TYPE_S8 = 7, // XXX I assume? 
+} type_t; + +static inline uint32_t type_size(type_t type) +{ + switch (type) { + case TYPE_F32: + case TYPE_U32: + case TYPE_S32: + return 32; + case TYPE_F16: + case TYPE_U16: + case TYPE_S16: + return 16; + case TYPE_U8: + case TYPE_S8: + return 8; + default: + assert(0); /* invalid type */ + return 0; + } +} + +static inline int type_float(type_t type) +{ + return (type == TYPE_F32) || (type == TYPE_F16); +} + +static inline int type_uint(type_t type) +{ + return (type == TYPE_U32) || (type == TYPE_U16) || (type == TYPE_U8); +} + +static inline int type_sint(type_t type) +{ + return (type == TYPE_S32) || (type == TYPE_S16) || (type == TYPE_S8); +} + +typedef union PACKED { + /* normal gpr or const src register: */ + struct PACKED { + uint32_t comp : 2; + uint32_t num : 10; + }; + /* for immediate val: */ + int32_t iim_val : 11; + /* to make compiler happy: */ + uint32_t dummy32; + uint32_t dummy10 : 10; + uint32_t dummy11 : 11; + uint32_t dummy12 : 12; + uint32_t dummy13 : 13; + uint32_t dummy8 : 8; +} reg_t; + +/* special registers: */ +#define REG_A0 61 /* address register */ +#define REG_P0 62 /* predicate register */ + +static inline int reg_special(reg_t reg) +{ + return (reg.num == REG_A0) || (reg.num == REG_P0); +} + +typedef struct PACKED { + /* dword0: */ + int16_t immed : 16; + uint32_t dummy1 : 16; + + /* dword1: */ + uint32_t dummy2 : 8; + uint32_t repeat : 3; + uint32_t dummy3 : 1; + uint32_t ss : 1; + uint32_t dummy4 : 7; + uint32_t inv : 1; + uint32_t comp : 2; + uint32_t opc : 4; + uint32_t jmp_tgt : 1; + uint32_t sync : 1; + uint32_t opc_cat : 3; +} instr_cat0_t; + +typedef struct PACKED { + /* dword0: */ + union PACKED { + /* for normal src register: */ + struct PACKED { + uint32_t src : 11; + /* at least low bit of pad must be zero or it will + * look like a address relative src + */ + uint32_t pad : 21; + }; + /* for address relative: */ + struct PACKED { + int32_t off : 10; + uint32_t src_rel_c : 1; + uint32_t src_rel : 1; + uint32_t unknown : 20; + }; + /* for immediate: */ + int32_t iim_val; + float fim_val; + }; + + /* dword1: */ + uint32_t dst : 8; + uint32_t repeat : 3; + uint32_t src_r : 1; + uint32_t ss : 1; + uint32_t ul : 1; + uint32_t dst_type : 3; + uint32_t dst_rel : 1; + uint32_t src_type : 3; + uint32_t src_c : 1; + uint32_t src_im : 1; + uint32_t even : 1; + uint32_t pos_inf : 1; + uint32_t must_be_0 : 2; + uint32_t jmp_tgt : 1; + uint32_t sync : 1; + uint32_t opc_cat : 3; +} instr_cat1_t; + +typedef struct PACKED { + /* dword0: */ + union PACKED { + struct PACKED { + uint32_t src1 : 11; + uint32_t must_be_zero1: 2; + uint32_t src1_im : 1; /* immediate */ + uint32_t src1_neg : 1; /* negate */ + uint32_t src1_abs : 1; /* absolute value */ + }; + struct PACKED { + uint32_t src1 : 10; + uint32_t src1_c : 1; /* relative-const */ + uint32_t src1_rel : 1; /* relative address */ + uint32_t must_be_zero : 1; + uint32_t dummy : 3; + } rel1; + struct PACKED { + uint32_t src1 : 12; + uint32_t src1_c : 1; /* const */ + uint32_t dummy : 3; + } c1; + }; + + union PACKED { + struct PACKED { + uint32_t src2 : 11; + uint32_t must_be_zero2: 2; + uint32_t src2_im : 1; /* immediate */ + uint32_t src2_neg : 1; /* negate */ + uint32_t src2_abs : 1; /* absolute value */ + }; + struct PACKED { + uint32_t src2 : 10; + uint32_t src2_c : 1; /* relative-const */ + uint32_t src2_rel : 1; /* relative address */ + uint32_t must_be_zero : 1; + uint32_t dummy : 3; + } rel2; + struct PACKED { + uint32_t src2 : 12; + uint32_t src2_c : 1; /* const */ + uint32_t dummy : 3; + } c2; + }; 
+ + /* dword1: */ + uint32_t dst : 8; + uint32_t repeat : 3; + uint32_t src1_r : 1; + uint32_t ss : 1; + uint32_t ul : 1; /* dunno */ + uint32_t dst_half : 1; /* or widen/narrow.. ie. dst hrN <-> rN */ + uint32_t ei : 1; + uint32_t cond : 3; + uint32_t src2_r : 1; + uint32_t full : 1; /* not half */ + uint32_t opc : 6; + uint32_t jmp_tgt : 1; + uint32_t sync : 1; + uint32_t opc_cat : 3; +} instr_cat2_t; + +typedef struct PACKED { + /* dword0: */ + union PACKED { + struct PACKED { + uint32_t src1 : 11; + uint32_t must_be_zero1: 2; + uint32_t src2_c : 1; + uint32_t src1_neg : 1; + uint32_t src2_r : 1; + }; + struct PACKED { + uint32_t src1 : 10; + uint32_t src1_c : 1; + uint32_t src1_rel : 1; + uint32_t must_be_zero : 1; + uint32_t dummy : 3; + } rel1; + struct PACKED { + uint32_t src1 : 12; + uint32_t src1_c : 1; + uint32_t dummy : 3; + } c1; + }; + + union PACKED { + struct PACKED { + uint32_t src3 : 11; + uint32_t must_be_zero2: 2; + uint32_t src3_r : 1; + uint32_t src2_neg : 1; + uint32_t src3_neg : 1; + }; + struct PACKED { + uint32_t src3 : 10; + uint32_t src3_c : 1; + uint32_t src3_rel : 1; + uint32_t must_be_zero : 1; + uint32_t dummy : 3; + } rel2; + struct PACKED { + uint32_t src3 : 12; + uint32_t src3_c : 1; + uint32_t dummy : 3; + } c2; + }; + + /* dword1: */ + uint32_t dst : 8; + uint32_t repeat : 3; + uint32_t src1_r : 1; + uint32_t ss : 1; + uint32_t ul : 1; + uint32_t dst_half : 1; /* or widen/narrow.. ie. dst hrN <-> rN */ + uint32_t src2 : 8; + uint32_t opc : 4; + uint32_t jmp_tgt : 1; + uint32_t sync : 1; + uint32_t opc_cat : 3; +} instr_cat3_t; + +static inline bool instr_cat3_full(instr_cat3_t *cat3) +{ + switch (cat3->opc) { + case OPC_MAD_F16: + case OPC_MAD_U16: + case OPC_MAD_S16: + case OPC_SEL_B16: + case OPC_SEL_S16: + case OPC_SEL_F16: + case OPC_SAD_S16: + case OPC_SAD_S32: // really?? + return false; + default: + return true; + } +} + +typedef struct PACKED { + /* dword0: */ + union PACKED { + struct PACKED { + uint32_t src : 11; + uint32_t must_be_zero1: 2; + uint32_t src_im : 1; /* immediate */ + uint32_t src_neg : 1; /* negate */ + uint32_t src_abs : 1; /* absolute value */ + }; + struct PACKED { + uint32_t src : 10; + uint32_t src_c : 1; /* relative-const */ + uint32_t src_rel : 1; /* relative address */ + uint32_t must_be_zero : 1; + uint32_t dummy : 3; + } rel; + struct PACKED { + uint32_t src : 12; + uint32_t src_c : 1; /* const */ + uint32_t dummy : 3; + } c; + }; + uint32_t dummy1 : 16; /* seem to be ignored */ + + /* dword1: */ + uint32_t dst : 8; + uint32_t repeat : 3; + uint32_t src_r : 1; + uint32_t ss : 1; + uint32_t ul : 1; + uint32_t dst_half : 1; /* or widen/narrow.. ie. 
dst hrN <-> rN */ + uint32_t dummy2 : 5; /* seem to be ignored */ + uint32_t full : 1; /* not half */ + uint32_t opc : 6; + uint32_t jmp_tgt : 1; + uint32_t sync : 1; + uint32_t opc_cat : 3; +} instr_cat4_t; + +typedef struct PACKED { + /* dword0: */ + union PACKED { + /* normal case: */ + struct PACKED { + uint32_t full : 1; /* not half */ + uint32_t src1 : 8; + uint32_t src2 : 8; + uint32_t dummy1 : 4; /* seem to be ignored */ + uint32_t samp : 4; + uint32_t tex : 7; + } norm; + /* s2en case: */ + struct PACKED { + uint32_t full : 1; /* not half */ + uint32_t src1 : 8; + uint32_t src2 : 11; + uint32_t dummy1 : 1; + uint32_t src3 : 8; + uint32_t dummy2 : 3; + } s2en; + /* same in either case: */ + // XXX I think, confirm this + struct PACKED { + uint32_t full : 1; /* not half */ + uint32_t src1 : 8; + uint32_t pad : 23; + }; + }; + + /* dword1: */ + uint32_t dst : 8; + uint32_t wrmask : 4; /* write-mask */ + uint32_t type : 3; + uint32_t dummy2 : 1; /* seems to be ignored */ + uint32_t is_3d : 1; + + uint32_t is_a : 1; + uint32_t is_s : 1; + uint32_t is_s2en : 1; + uint32_t is_o : 1; + uint32_t is_p : 1; + + uint32_t opc : 5; + uint32_t jmp_tgt : 1; + uint32_t sync : 1; + uint32_t opc_cat : 3; +} instr_cat5_t; + +/* used for load instructions: */ +typedef struct PACKED { + /* dword0: */ + uint32_t must_be_one1 : 1; + int16_t off : 13; + uint32_t src : 8; + uint32_t dummy1 : 1; + uint32_t must_be_one2 : 1; + int32_t iim_val : 8; + + /* dword1: */ + uint32_t dst : 8; + uint32_t dummy2 : 9; + uint32_t type : 3; + uint32_t dummy3 : 2; + uint32_t opc : 5; + uint32_t jmp_tgt : 1; + uint32_t sync : 1; + uint32_t opc_cat : 3; +} instr_cat6a_t; + +/* used for store instructions: */ +typedef struct PACKED { + /* dword0: */ + uint32_t must_be_zero1 : 1; + uint32_t src : 8; + uint32_t off_hi : 5; /* high bits of 'off'... ugly! */ + uint32_t dummy1 : 9; + uint32_t must_be_one1 : 1; + int32_t iim_val : 8; + + /* dword1: */ + uint16_t off : 8; + uint32_t must_be_one2 : 1; + uint32_t dst : 8; + uint32_t type : 3; + uint32_t dummy2 : 2; + uint32_t opc : 5; + uint32_t jmp_tgt : 1; + uint32_t sync : 1; + uint32_t opc_cat : 3; +} instr_cat6b_t; + +typedef union PACKED { + instr_cat6a_t a; + instr_cat6b_t b; + struct PACKED { + /* dword0: */ + uint32_t pad1 : 24; + int32_t iim_val : 8; + + /* dword1: */ + uint32_t pad2 : 17; + uint32_t type : 3; + uint32_t pad3 : 2; + uint32_t opc : 5; + uint32_t jmp_tgt : 1; + uint32_t sync : 1; + uint32_t opc_cat : 3; + }; +} instr_cat6_t; + +typedef union PACKED { + instr_cat0_t cat0; + instr_cat1_t cat1; + instr_cat2_t cat2; + instr_cat3_t cat3; + instr_cat4_t cat4; + instr_cat5_t cat5; + instr_cat6_t cat6; + struct PACKED { + /* dword0: */ + uint64_t pad1 : 40; + uint32_t repeat : 3; /* cat0-cat4 */ + uint32_t pad2 : 1; + uint32_t ss : 1; /* cat1-cat4 (cat0??) */ + uint32_t ul : 1; /* cat2-cat4 (and cat1 in blob.. which may be bug??) 
*/ + uint32_t pad3 : 13; + uint32_t jmp_tgt : 1; + uint32_t sync : 1; + uint32_t opc_cat : 3; + + }; +} instr_t; + +static inline uint32_t instr_opc(instr_t *instr) +{ + switch (instr->opc_cat) { + case 0: return instr->cat0.opc; + case 1: return 0; + case 2: return instr->cat2.opc; + case 3: return instr->cat3.opc; + case 4: return instr->cat4.opc; + case 5: return instr->cat5.opc; + case 6: return instr->cat6.opc; + default: return 0; + } +} + +static inline bool is_mad(opc_t opc) +{ + switch (opc) { + case OPC_MAD_U16: + case OPC_MADSH_U16: + case OPC_MAD_S16: + case OPC_MADSH_M16: + case OPC_MAD_U24: + case OPC_MAD_S24: + case OPC_MAD_F16: + case OPC_MAD_F32: + return true; + default: + return false; + } +} + +#endif /* INSTR_A3XX_H_ */ diff --git a/src/gallium/drivers/freedreno/ir3/ir3.c b/src/gallium/drivers/freedreno/ir3/ir3.c new file mode 100644 index 00000000000..ea2a9251b28 --- /dev/null +++ b/src/gallium/drivers/freedreno/ir3/ir3.c @@ -0,0 +1,675 @@ +/* + * Copyright (c) 2012 Rob Clark <[email protected]> + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "ir3.h" + +#include <stdlib.h> +#include <stdio.h> +#include <string.h> +#include <assert.h> +#include <stdbool.h> +#include <errno.h> + +#include "freedreno_util.h" +#include "instr-a3xx.h" + +#define CHUNK_SZ 1020 + +struct ir3_heap_chunk { + struct ir3_heap_chunk *next; + uint32_t heap[CHUNK_SZ]; +}; + +static void grow_heap(struct ir3 *shader) +{ + struct ir3_heap_chunk *chunk = calloc(1, sizeof(*chunk)); + chunk->next = shader->chunk; + shader->chunk = chunk; + shader->heap_idx = 0; +} + +/* simple allocator to carve allocations out of an up-front allocated heap, + * so that we can free everything easily in one shot. 
+ */ +void * ir3_alloc(struct ir3 *shader, int sz) +{ + void *ptr; + + sz = align(sz, 4) / 4; + + if ((shader->heap_idx + sz) > CHUNK_SZ) + grow_heap(shader); + + ptr = &shader->chunk->heap[shader->heap_idx]; + shader->heap_idx += sz; + + return ptr; +} + +struct ir3 * ir3_create(void) +{ + struct ir3 *shader = + calloc(1, sizeof(struct ir3)); + grow_heap(shader); + return shader; +} + +void ir3_destroy(struct ir3 *shader) +{ + while (shader->chunk) { + struct ir3_heap_chunk *chunk = shader->chunk; + shader->chunk = chunk->next; + free(chunk); + } + free(shader); +} + +#define iassert(cond) do { \ + if (!(cond)) { \ + assert(cond); \ + return -1; \ + } } while (0) + +static uint32_t reg(struct ir3_register *reg, struct ir3_info *info, + uint32_t repeat, uint32_t valid_flags) +{ + reg_t val = { .dummy32 = 0 }; + + assert(!(reg->flags & ~valid_flags)); + + if (!(reg->flags & IR3_REG_R)) + repeat = 0; + + if (reg->flags & IR3_REG_IMMED) { + val.iim_val = reg->iim_val; + } else { + int8_t components = util_last_bit(reg->wrmask); + int8_t max = (reg->num + repeat + components - 1) >> 2; + + val.comp = reg->num & 0x3; + val.num = reg->num >> 2; + + if (reg->flags & IR3_REG_CONST) { + info->max_const = MAX2(info->max_const, max); + } else if ((max != REG_A0) && (max != REG_P0)) { + if (reg->flags & IR3_REG_HALF) { + info->max_half_reg = MAX2(info->max_half_reg, max); + } else { + info->max_reg = MAX2(info->max_reg, max); + } + } + } + + return val.dummy32; +} + +static int emit_cat0(struct ir3_instruction *instr, void *ptr, + struct ir3_info *info) +{ + instr_cat0_t *cat0 = ptr; + + cat0->immed = instr->cat0.immed; + cat0->repeat = instr->repeat; + cat0->ss = !!(instr->flags & IR3_INSTR_SS); + cat0->inv = instr->cat0.inv; + cat0->comp = instr->cat0.comp; + cat0->opc = instr->opc; + cat0->jmp_tgt = !!(instr->flags & IR3_INSTR_JP); + cat0->sync = !!(instr->flags & IR3_INSTR_SY); + cat0->opc_cat = 0; + + return 0; +} + +static uint32_t type_flags(type_t type) +{ + return (type_size(type) == 32) ? 
0 : IR3_REG_HALF; +} + +static int emit_cat1(struct ir3_instruction *instr, void *ptr, + struct ir3_info *info) +{ + struct ir3_register *dst = instr->regs[0]; + struct ir3_register *src = instr->regs[1]; + instr_cat1_t *cat1 = ptr; + + iassert(instr->regs_count == 2); + iassert(!((dst->flags ^ type_flags(instr->cat1.dst_type)) & IR3_REG_HALF)); + iassert((src->flags & IR3_REG_IMMED) || + !((src->flags ^ type_flags(instr->cat1.src_type)) & IR3_REG_HALF)); + + if (src->flags & IR3_REG_IMMED) { + cat1->iim_val = src->iim_val; + cat1->src_im = 1; + } else if (src->flags & IR3_REG_RELATIV) { + cat1->off = src->offset; + cat1->src_rel = 1; + cat1->src_rel_c = !!(src->flags & IR3_REG_CONST); + } else { + cat1->src = reg(src, info, instr->repeat, + IR3_REG_IMMED | IR3_REG_R | + IR3_REG_CONST | IR3_REG_HALF); + cat1->src_c = !!(src->flags & IR3_REG_CONST); + } + + cat1->dst = reg(dst, info, instr->repeat, + IR3_REG_RELATIV | IR3_REG_EVEN | + IR3_REG_R | IR3_REG_POS_INF | IR3_REG_HALF); + cat1->repeat = instr->repeat; + cat1->src_r = !!(src->flags & IR3_REG_R); + cat1->ss = !!(instr->flags & IR3_INSTR_SS); + cat1->ul = !!(instr->flags & IR3_INSTR_UL); + cat1->dst_type = instr->cat1.dst_type; + cat1->dst_rel = !!(dst->flags & IR3_REG_RELATIV); + cat1->src_type = instr->cat1.src_type; + cat1->even = !!(dst->flags & IR3_REG_EVEN); + cat1->pos_inf = !!(dst->flags & IR3_REG_POS_INF); + cat1->jmp_tgt = !!(instr->flags & IR3_INSTR_JP); + cat1->sync = !!(instr->flags & IR3_INSTR_SY); + cat1->opc_cat = 1; + + return 0; +} + +static int emit_cat2(struct ir3_instruction *instr, void *ptr, + struct ir3_info *info) +{ + struct ir3_register *dst = instr->regs[0]; + struct ir3_register *src1 = instr->regs[1]; + struct ir3_register *src2 = instr->regs[2]; + instr_cat2_t *cat2 = ptr; + + iassert((instr->regs_count == 2) || (instr->regs_count == 3)); + + if (src1->flags & IR3_REG_RELATIV) { + iassert(src1->num < (1 << 10)); + cat2->rel1.src1 = reg(src1, info, instr->repeat, + IR3_REG_RELATIV | IR3_REG_CONST | IR3_REG_NEGATE | + IR3_REG_ABS | IR3_REG_R | IR3_REG_HALF); + cat2->rel1.src1_c = !!(src1->flags & IR3_REG_CONST); + cat2->rel1.src1_rel = 1; + } else if (src1->flags & IR3_REG_CONST) { + iassert(src1->num < (1 << 12)); + cat2->c1.src1 = reg(src1, info, instr->repeat, + IR3_REG_CONST | IR3_REG_NEGATE | IR3_REG_ABS | + IR3_REG_R | IR3_REG_HALF); + cat2->c1.src1_c = 1; + } else { + iassert(src1->num < (1 << 11)); + cat2->src1 = reg(src1, info, instr->repeat, + IR3_REG_IMMED | IR3_REG_NEGATE | IR3_REG_ABS | + IR3_REG_R | IR3_REG_HALF); + } + cat2->src1_im = !!(src1->flags & IR3_REG_IMMED); + cat2->src1_neg = !!(src1->flags & IR3_REG_NEGATE); + cat2->src1_abs = !!(src1->flags & IR3_REG_ABS); + cat2->src1_r = !!(src1->flags & IR3_REG_R); + + if (src2) { + iassert((src2->flags & IR3_REG_IMMED) || + !((src1->flags ^ src2->flags) & IR3_REG_HALF)); + + if (src2->flags & IR3_REG_RELATIV) { + iassert(src2->num < (1 << 10)); + cat2->rel2.src2 = reg(src2, info, instr->repeat, + IR3_REG_RELATIV | IR3_REG_CONST | IR3_REG_NEGATE | + IR3_REG_ABS | IR3_REG_R | IR3_REG_HALF); + cat2->rel2.src2_c = !!(src2->flags & IR3_REG_CONST); + cat2->rel2.src2_rel = 1; + } else if (src2->flags & IR3_REG_CONST) { + iassert(src2->num < (1 << 12)); + cat2->c2.src2 = reg(src2, info, instr->repeat, + IR3_REG_CONST | IR3_REG_NEGATE | IR3_REG_ABS | + IR3_REG_R | IR3_REG_HALF); + cat2->c2.src2_c = 1; + } else { + iassert(src2->num < (1 << 11)); + cat2->src2 = reg(src2, info, instr->repeat, + IR3_REG_IMMED | IR3_REG_NEGATE | IR3_REG_ABS | + IR3_REG_R 
| IR3_REG_HALF); + } + + cat2->src2_im = !!(src2->flags & IR3_REG_IMMED); + cat2->src2_neg = !!(src2->flags & IR3_REG_NEGATE); + cat2->src2_abs = !!(src2->flags & IR3_REG_ABS); + cat2->src2_r = !!(src2->flags & IR3_REG_R); + } + + cat2->dst = reg(dst, info, instr->repeat, + IR3_REG_R | IR3_REG_EI | IR3_REG_HALF); + cat2->repeat = instr->repeat; + cat2->ss = !!(instr->flags & IR3_INSTR_SS); + cat2->ul = !!(instr->flags & IR3_INSTR_UL); + cat2->dst_half = !!((src1->flags ^ dst->flags) & IR3_REG_HALF); + cat2->ei = !!(dst->flags & IR3_REG_EI); + cat2->cond = instr->cat2.condition; + cat2->full = ! (src1->flags & IR3_REG_HALF); + cat2->opc = instr->opc; + cat2->jmp_tgt = !!(instr->flags & IR3_INSTR_JP); + cat2->sync = !!(instr->flags & IR3_INSTR_SY); + cat2->opc_cat = 2; + + return 0; +} + +static int emit_cat3(struct ir3_instruction *instr, void *ptr, + struct ir3_info *info) +{ + struct ir3_register *dst = instr->regs[0]; + struct ir3_register *src1 = instr->regs[1]; + struct ir3_register *src2 = instr->regs[2]; + struct ir3_register *src3 = instr->regs[3]; + instr_cat3_t *cat3 = ptr; + uint32_t src_flags = 0; + + switch (instr->opc) { + case OPC_MAD_F16: + case OPC_MAD_U16: + case OPC_MAD_S16: + case OPC_SEL_B16: + case OPC_SEL_S16: + case OPC_SEL_F16: + case OPC_SAD_S16: + case OPC_SAD_S32: // really?? + src_flags |= IR3_REG_HALF; + break; + default: + break; + } + + iassert(instr->regs_count == 4); + iassert(!((src1->flags ^ src_flags) & IR3_REG_HALF)); + iassert(!((src2->flags ^ src_flags) & IR3_REG_HALF)); + iassert(!((src3->flags ^ src_flags) & IR3_REG_HALF)); + + if (src1->flags & IR3_REG_RELATIV) { + iassert(src1->num < (1 << 10)); + cat3->rel1.src1 = reg(src1, info, instr->repeat, + IR3_REG_RELATIV | IR3_REG_CONST | IR3_REG_NEGATE | + IR3_REG_R | IR3_REG_HALF); + cat3->rel1.src1_c = !!(src1->flags & IR3_REG_CONST); + cat3->rel1.src1_rel = 1; + } else if (src1->flags & IR3_REG_CONST) { + iassert(src1->num < (1 << 12)); + cat3->c1.src1 = reg(src1, info, instr->repeat, + IR3_REG_CONST | IR3_REG_NEGATE | IR3_REG_R | + IR3_REG_HALF); + cat3->c1.src1_c = 1; + } else { + iassert(src1->num < (1 << 11)); + cat3->src1 = reg(src1, info, instr->repeat, + IR3_REG_NEGATE | IR3_REG_R | IR3_REG_HALF); + } + + cat3->src1_neg = !!(src1->flags & IR3_REG_NEGATE); + cat3->src1_r = !!(src1->flags & IR3_REG_R); + + cat3->src2 = reg(src2, info, instr->repeat, + IR3_REG_CONST | IR3_REG_NEGATE | + IR3_REG_R | IR3_REG_HALF); + cat3->src2_c = !!(src2->flags & IR3_REG_CONST); + cat3->src2_neg = !!(src2->flags & IR3_REG_NEGATE); + cat3->src2_r = !!(src2->flags & IR3_REG_R); + + + if (src3->flags & IR3_REG_RELATIV) { + iassert(src3->num < (1 << 10)); + cat3->rel2.src3 = reg(src3, info, instr->repeat, + IR3_REG_RELATIV | IR3_REG_CONST | IR3_REG_NEGATE | + IR3_REG_R | IR3_REG_HALF); + cat3->rel2.src3_c = !!(src3->flags & IR3_REG_CONST); + cat3->rel2.src3_rel = 1; + } else if (src3->flags & IR3_REG_CONST) { + iassert(src3->num < (1 << 12)); + cat3->c2.src3 = reg(src3, info, instr->repeat, + IR3_REG_CONST | IR3_REG_NEGATE | IR3_REG_R | + IR3_REG_HALF); + cat3->c2.src3_c = 1; + } else { + iassert(src3->num < (1 << 11)); + cat3->src3 = reg(src3, info, instr->repeat, + IR3_REG_NEGATE | IR3_REG_R | IR3_REG_HALF); + } + + cat3->src3_neg = !!(src3->flags & IR3_REG_NEGATE); + cat3->src3_r = !!(src3->flags & IR3_REG_R); + + cat3->dst = reg(dst, info, instr->repeat, IR3_REG_R | IR3_REG_HALF); + cat3->repeat = instr->repeat; + cat3->ss = !!(instr->flags & IR3_INSTR_SS); + cat3->ul = !!(instr->flags & IR3_INSTR_UL); + 
cat3->dst_half = !!((src_flags ^ dst->flags) & IR3_REG_HALF); + cat3->opc = instr->opc; + cat3->jmp_tgt = !!(instr->flags & IR3_INSTR_JP); + cat3->sync = !!(instr->flags & IR3_INSTR_SY); + cat3->opc_cat = 3; + + return 0; +} + +static int emit_cat4(struct ir3_instruction *instr, void *ptr, + struct ir3_info *info) +{ + struct ir3_register *dst = instr->regs[0]; + struct ir3_register *src = instr->regs[1]; + instr_cat4_t *cat4 = ptr; + + iassert(instr->regs_count == 2); + + if (src->flags & IR3_REG_RELATIV) { + iassert(src->num < (1 << 10)); + cat4->rel.src = reg(src, info, instr->repeat, + IR3_REG_RELATIV | IR3_REG_CONST | IR3_REG_NEGATE | + IR3_REG_ABS | IR3_REG_R | IR3_REG_HALF); + cat4->rel.src_c = !!(src->flags & IR3_REG_CONST); + cat4->rel.src_rel = 1; + } else if (src->flags & IR3_REG_CONST) { + iassert(src->num < (1 << 12)); + cat4->c.src = reg(src, info, instr->repeat, + IR3_REG_CONST | IR3_REG_NEGATE | IR3_REG_ABS | + IR3_REG_R | IR3_REG_HALF); + cat4->c.src_c = 1; + } else { + iassert(src->num < (1 << 11)); + cat4->src = reg(src, info, instr->repeat, + IR3_REG_IMMED | IR3_REG_NEGATE | IR3_REG_ABS | + IR3_REG_R | IR3_REG_HALF); + } + + cat4->src_im = !!(src->flags & IR3_REG_IMMED); + cat4->src_neg = !!(src->flags & IR3_REG_NEGATE); + cat4->src_abs = !!(src->flags & IR3_REG_ABS); + cat4->src_r = !!(src->flags & IR3_REG_R); + + cat4->dst = reg(dst, info, instr->repeat, IR3_REG_R | IR3_REG_HALF); + cat4->repeat = instr->repeat; + cat4->ss = !!(instr->flags & IR3_INSTR_SS); + cat4->ul = !!(instr->flags & IR3_INSTR_UL); + cat4->dst_half = !!((src->flags ^ dst->flags) & IR3_REG_HALF); + cat4->full = ! (src->flags & IR3_REG_HALF); + cat4->opc = instr->opc; + cat4->jmp_tgt = !!(instr->flags & IR3_INSTR_JP); + cat4->sync = !!(instr->flags & IR3_INSTR_SY); + cat4->opc_cat = 4; + + return 0; +} + +static int emit_cat5(struct ir3_instruction *instr, void *ptr, + struct ir3_info *info) +{ + struct ir3_register *dst = instr->regs[0]; + struct ir3_register *src1 = instr->regs[1]; + struct ir3_register *src2 = instr->regs[2]; + struct ir3_register *src3 = instr->regs[3]; + instr_cat5_t *cat5 = ptr; + + iassert(!((dst->flags ^ type_flags(instr->cat5.type)) & IR3_REG_HALF)); + + if (src1) { + cat5->full = ! 
(src1->flags & IR3_REG_HALF); + cat5->src1 = reg(src1, info, instr->repeat, IR3_REG_HALF); + } + + + if (instr->flags & IR3_INSTR_S2EN) { + if (src2) { + iassert(!((src1->flags ^ src2->flags) & IR3_REG_HALF)); + cat5->s2en.src2 = reg(src2, info, instr->repeat, IR3_REG_HALF); + } + if (src3) { + iassert(src3->flags & IR3_REG_HALF); + cat5->s2en.src3 = reg(src3, info, instr->repeat, IR3_REG_HALF); + } + iassert(!(instr->cat5.samp | instr->cat5.tex)); + } else { + iassert(!src3); + if (src2) { + iassert(!((src1->flags ^ src2->flags) & IR3_REG_HALF)); + cat5->norm.src2 = reg(src2, info, instr->repeat, IR3_REG_HALF); + } + cat5->norm.samp = instr->cat5.samp; + cat5->norm.tex = instr->cat5.tex; + } + + cat5->dst = reg(dst, info, instr->repeat, IR3_REG_R | IR3_REG_HALF); + cat5->wrmask = dst->wrmask; + cat5->type = instr->cat5.type; + cat5->is_3d = !!(instr->flags & IR3_INSTR_3D); + cat5->is_a = !!(instr->flags & IR3_INSTR_A); + cat5->is_s = !!(instr->flags & IR3_INSTR_S); + cat5->is_s2en = !!(instr->flags & IR3_INSTR_S2EN); + cat5->is_o = !!(instr->flags & IR3_INSTR_O); + cat5->is_p = !!(instr->flags & IR3_INSTR_P); + cat5->opc = instr->opc; + cat5->jmp_tgt = !!(instr->flags & IR3_INSTR_JP); + cat5->sync = !!(instr->flags & IR3_INSTR_SY); + cat5->opc_cat = 5; + + return 0; +} + +static int emit_cat6(struct ir3_instruction *instr, void *ptr, + struct ir3_info *info) +{ + struct ir3_register *dst = instr->regs[0]; + struct ir3_register *src = instr->regs[1]; + instr_cat6_t *cat6 = ptr; + + iassert(instr->regs_count == 2); + + switch (instr->opc) { + /* load instructions: */ + case OPC_LDG: + case OPC_LDP: + case OPC_LDL: + case OPC_LDLW: + case OPC_LDLV: + case OPC_PREFETCH: { + instr_cat6a_t *cat6a = ptr; + + iassert(!((dst->flags ^ type_flags(instr->cat6.type)) & IR3_REG_HALF)); + + cat6a->must_be_one1 = 1; + cat6a->must_be_one2 = 1; + cat6a->off = instr->cat6.offset; + cat6a->src = reg(src, info, instr->repeat, 0); + cat6a->dst = reg(dst, info, instr->repeat, IR3_REG_R | IR3_REG_HALF); + break; + } + /* store instructions: */ + case OPC_STG: + case OPC_STP: + case OPC_STL: + case OPC_STLW: + case OPC_STI: { + instr_cat6b_t *cat6b = ptr; + uint32_t src_flags = type_flags(instr->cat6.type); + uint32_t dst_flags = (instr->opc == OPC_STI) ? 
IR3_REG_HALF : 0; + + iassert(!((src->flags ^ src_flags) & IR3_REG_HALF)); + + cat6b->must_be_one1 = 1; + cat6b->must_be_one2 = 1; + cat6b->src = reg(src, info, instr->repeat, src_flags); + cat6b->off_hi = instr->cat6.offset >> 8; + cat6b->off = instr->cat6.offset; + cat6b->dst = reg(dst, info, instr->repeat, IR3_REG_R | dst_flags); + + break; + } + default: + // TODO + break; + } + + cat6->iim_val = instr->cat6.iim_val; + cat6->type = instr->cat6.type; + cat6->opc = instr->opc; + cat6->jmp_tgt = !!(instr->flags & IR3_INSTR_JP); + cat6->sync = !!(instr->flags & IR3_INSTR_SY); + cat6->opc_cat = 6; + + return 0; +} + +static int (*emit[])(struct ir3_instruction *instr, void *ptr, + struct ir3_info *info) = { + emit_cat0, emit_cat1, emit_cat2, emit_cat3, emit_cat4, emit_cat5, emit_cat6, +}; + +void * ir3_assemble(struct ir3 *shader, struct ir3_info *info) +{ + uint32_t *ptr, *dwords; + uint32_t i; + + info->max_reg = -1; + info->max_half_reg = -1; + info->max_const = -1; + info->instrs_count = 0; + + /* need a integer number of instruction "groups" (sets of four + * instructions), so pad out w/ NOPs if needed: + * (each instruction is 64bits) + */ + info->sizedwords = 2 * align(shader->instrs_count, 4); + + ptr = dwords = calloc(1, 4 * info->sizedwords); + + for (i = 0; i < shader->instrs_count; i++) { + struct ir3_instruction *instr = shader->instrs[i]; + int ret = emit[instr->category](instr, dwords, info); + if (ret) + goto fail; + info->instrs_count += 1 + instr->repeat; + dwords += 2; + } + + return ptr; + +fail: + free(ptr); + return NULL; +} + +static struct ir3_register * reg_create(struct ir3 *shader, + int num, int flags) +{ + struct ir3_register *reg = + ir3_alloc(shader, sizeof(struct ir3_register)); + reg->wrmask = 1; + reg->flags = flags; + reg->num = num; + return reg; +} + +static void insert_instr(struct ir3 *shader, + struct ir3_instruction *instr) +{ +#ifdef DEBUG + static uint32_t serialno = 0; + instr->serialno = ++serialno; +#endif + if (shader->instrs_count == shader->instrs_sz) { + shader->instrs_sz = MAX2(2 * shader->instrs_sz, 16); + shader->instrs = realloc(shader->instrs, + shader->instrs_sz * sizeof(shader->instrs[0])); + } + shader->instrs[shader->instrs_count++] = instr; +} + +struct ir3_block * ir3_block_create(struct ir3 *shader, + unsigned ntmp, unsigned nin, unsigned nout) +{ + struct ir3_block *block; + unsigned size; + char *ptr; + + size = sizeof(*block); + size += sizeof(block->temporaries[0]) * ntmp; + size += sizeof(block->inputs[0]) * nin; + size += sizeof(block->outputs[0]) * nout; + + ptr = ir3_alloc(shader, size); + + block = (void *)ptr; + ptr += sizeof(*block); + + block->temporaries = (void *)ptr; + block->ntemporaries = ntmp; + ptr += sizeof(block->temporaries[0]) * ntmp; + + block->inputs = (void *)ptr; + block->ninputs = nin; + ptr += sizeof(block->inputs[0]) * nin; + + block->outputs = (void *)ptr; + block->noutputs = nout; + ptr += sizeof(block->outputs[0]) * nout; + + block->shader = shader; + + return block; +} + +struct ir3_instruction * ir3_instr_create(struct ir3_block *block, + int category, opc_t opc) +{ + struct ir3_instruction *instr = + ir3_alloc(block->shader, sizeof(struct ir3_instruction)); + instr->block = block; + instr->category = category; + instr->opc = opc; + insert_instr(block->shader, instr); + return instr; +} + +struct ir3_instruction * ir3_instr_clone(struct ir3_instruction *instr) +{ + struct ir3_instruction *new_instr = + ir3_alloc(instr->block->shader, sizeof(struct ir3_instruction)); + unsigned i; + + *new_instr 
= *instr; + insert_instr(instr->block->shader, new_instr); + + /* clone registers: */ + new_instr->regs_count = 0; + for (i = 0; i < instr->regs_count; i++) { + struct ir3_register *reg = instr->regs[i]; + struct ir3_register *new_reg = + ir3_reg_create(new_instr, reg->num, reg->flags); + *new_reg = *reg; + } + + return new_instr; +} + +struct ir3_register * ir3_reg_create(struct ir3_instruction *instr, + int num, int flags) +{ + struct ir3_register *reg = reg_create(instr->block->shader, num, flags); + assert(instr->regs_count < ARRAY_SIZE(instr->regs)); + instr->regs[instr->regs_count++] = reg; + return reg; +} diff --git a/src/gallium/drivers/freedreno/ir3/ir3.h b/src/gallium/drivers/freedreno/ir3/ir3.h new file mode 100644 index 00000000000..9ed914ba2e4 --- /dev/null +++ b/src/gallium/drivers/freedreno/ir3/ir3.h @@ -0,0 +1,480 @@ +/* + * Copyright (c) 2013 Rob Clark <[email protected]> + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef IR3_H_ +#define IR3_H_ + +#include <stdint.h> +#include <stdbool.h> + +#include "instr-a3xx.h" +#include "disasm.h" /* TODO move 'enum shader_t' somewhere else.. */ + +/* low level intermediate representation of an adreno shader program */ + +struct ir3; +struct ir3_instruction; +struct ir3_block; + +struct ir3 * fd_asm_parse(const char *src); + +struct ir3_info { + uint16_t sizedwords; + uint16_t instrs_count; /* expanded to account for rpt's */ + /* NOTE: max_reg, etc, does not include registers not touched + * by the shader (ie. vertex fetched via VFD_DECODE but not + * touched by shader) + */ + int8_t max_reg; /* highest GPR # used by shader */ + int8_t max_half_reg; + int8_t max_const; +}; + +struct ir3_register { + enum { + IR3_REG_CONST = 0x001, + IR3_REG_IMMED = 0x002, + IR3_REG_HALF = 0x004, + IR3_REG_RELATIV= 0x008, + IR3_REG_R = 0x010, + IR3_REG_NEGATE = 0x020, + IR3_REG_ABS = 0x040, + IR3_REG_EVEN = 0x080, + IR3_REG_POS_INF= 0x100, + /* (ei) flag, end-input? Set on last bary, presumably to signal + * that the shader needs no more input: + */ + IR3_REG_EI = 0x200, + /* meta-flags, for intermediate stages of IR, ie. 
+ * before register assignment is done: + */ + IR3_REG_SSA = 0x1000, /* 'instr' is ptr to assigning instr */ + IR3_REG_IA = 0x2000, /* meta-input dst is "assigned" */ + IR3_REG_ADDR = 0x4000, /* register is a0.x */ + } flags; + union { + /* normal registers: + * the component is in the low two bits of the reg #, so + * rN.x becomes: (N << 2) | x + */ + int num; + /* immediate: */ + int iim_val; + float fim_val; + /* relative: */ + int offset; + /* for IR3_REG_SSA, src registers contain ptr back to + * assigning instruction. + */ + struct ir3_instruction *instr; + }; + + /* used for cat5 instructions, but also for internal/IR level + * tracking of what registers are read/written by an instruction. + * wrmask may be a bad name since it is used to represent both + * src and dst that touch multiple adjacent registers. + */ + int wrmask; +}; + +struct ir3_instruction { + struct ir3_block *block; + int category; + opc_t opc; + enum { + /* (sy) flag is set on first instruction, and after sample + * instructions (probably just on RAW hazard). + */ + IR3_INSTR_SY = 0x001, + /* (ss) flag is set on first instruction, and first instruction + * to depend on the result of "long" instructions (RAW hazard): + * + * rcp, rsq, log2, exp2, sin, cos, sqrt + * + * It seems to synchronize until all in-flight instructions are + * completed, for example: + * + * rsq hr1.w, hr1.w + * add.f hr2.z, (neg)hr2.z, hc0.y + * mul.f hr2.w, (neg)hr2.y, (neg)hr2.y + * rsq hr2.x, hr2.x + * (rpt1)nop + * mad.f16 hr2.w, hr2.z, hr2.z, hr2.w + * nop + * mad.f16 hr2.w, (neg)hr0.w, (neg)hr0.w, hr2.w + * (ss)(rpt2)mul.f hr1.x, (r)hr1.x, hr1.w + * (rpt2)mul.f hr0.x, (neg)(r)hr0.x, hr2.x + * + * The last mul.f does not have (ss) set, presumably because the + * (ss) on the previous instruction does the job. + * + * The blob driver also seems to set it on WAR hazards, although + * not really clear if this is needed or just blob compiler being + * sloppy. So far I haven't found a case where removing the (ss) + * causes problems for WAR hazard, but I could just be getting + * lucky: + * + * rcp r1.y, r3.y + * (ss)(rpt2)mad.f32 r3.y, (r)c9.x, r1.x, (r)r3.z + * + */ + IR3_INSTR_SS = 0x002, + /* (jp) flag is set on jump targets: + */ + IR3_INSTR_JP = 0x004, + IR3_INSTR_UL = 0x008, + IR3_INSTR_3D = 0x010, + IR3_INSTR_A = 0x020, + IR3_INSTR_O = 0x040, + IR3_INSTR_P = 0x080, + IR3_INSTR_S = 0x100, + IR3_INSTR_S2EN = 0x200, + /* meta-flags, for intermediate stages of IR, ie. + * before register assignment is done: + */ + IR3_INSTR_MARK = 0x1000, + } flags; + int repeat; + unsigned regs_count; + struct ir3_register *regs[5]; + union { + struct { + char inv; + char comp; + int immed; + } cat0; + struct { + type_t src_type, dst_type; + } cat1; + struct { + enum { + IR3_COND_LT = 0, + IR3_COND_LE = 1, + IR3_COND_GT = 2, + IR3_COND_GE = 3, + IR3_COND_EQ = 4, + IR3_COND_NE = 5, + } condition; + } cat2; + struct { + unsigned samp, tex; + type_t type; + } cat5; + struct { + type_t type; + int offset; + int iim_val; + } cat6; + /* for meta-instructions, just used to hold extra data + * before instruction scheduling, etc + */ + struct { + int off; /* component/offset */ + } fo; + struct { + struct ir3_block *if_block, *else_block; + } flow; + struct { + struct ir3_block *block; + } inout; + }; + + /* transient values used during various algorithms: */ + union { + /* The instruction depth is the max dependency distance to output. + * + * You can also think of it as the "cost", if we did any sort of + * optimization for register footprint. Ie. 
a value that is just + * result of moving a const to a reg would have a low cost, so to + * it could make sense to duplicate the instruction at various + * points where the result is needed to reduce register footprint. + */ + unsigned depth; + }; + struct ir3_instruction *next; +#ifdef DEBUG + uint32_t serialno; +#endif +}; + +struct ir3_heap_chunk; + +struct ir3 { + unsigned instrs_count, instrs_sz; + struct ir3_instruction **instrs; + unsigned heap_idx; + struct ir3_heap_chunk *chunk; +}; + +struct ir3_block { + struct ir3 *shader; + unsigned ntemporaries, ninputs, noutputs; + /* maps TGSI_FILE_TEMPORARY index back to the assigning instruction: */ + struct ir3_instruction **temporaries; + struct ir3_instruction **inputs; + struct ir3_instruction **outputs; + /* only a single address register: */ + struct ir3_instruction *address; + struct ir3_block *parent; + struct ir3_instruction *head; +}; + +struct ir3 * ir3_create(void); +void ir3_destroy(struct ir3 *shader); +void * ir3_assemble(struct ir3 *shader, + struct ir3_info *info); +void * ir3_alloc(struct ir3 *shader, int sz); + +struct ir3_block * ir3_block_create(struct ir3 *shader, + unsigned ntmp, unsigned nin, unsigned nout); + +struct ir3_instruction * ir3_instr_create(struct ir3_block *block, + int category, opc_t opc); +struct ir3_instruction * ir3_instr_clone(struct ir3_instruction *instr); +const char *ir3_instr_name(struct ir3_instruction *instr); + +struct ir3_register * ir3_reg_create(struct ir3_instruction *instr, + int num, int flags); + + +static inline bool ir3_instr_check_mark(struct ir3_instruction *instr) +{ + if (instr->flags & IR3_INSTR_MARK) + return true; /* already visited */ + instr->flags ^= IR3_INSTR_MARK; + return false; +} + +static inline void ir3_clear_mark(struct ir3 *shader) +{ + /* TODO would be nice to drop the instruction array.. for + * new compiler, _clear_mark() is all we use it for, and + * we could probably manage a linked list instead.. 
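
The MARK flag is what keeps walks like this cheap: a pass sets the bit the first time it visits an instruction, then wipes the whole shader when it finishes. A minimal sketch of such a walk, using a hypothetical visit_depth() helper rather than the actual ir3_depth.c pass, might look like:

	/* sketch only: compute instruction depth by recursing over SSA
	 * srcs (regs[0] is the dst, so srcs start at index 1), using the
	 * MARK bit to visit each instruction at most once:
	 */
	static unsigned
	visit_depth(struct ir3_instruction *instr)
	{
		unsigned i, d = 0;
		if (ir3_instr_check_mark(instr))
			return instr->depth;   /* already computed */
		for (i = 1; i < instr->regs_count; i++) {
			struct ir3_register *src = instr->regs[i];
			if (src->flags & IR3_REG_SSA)
				d = MAX2(d, visit_depth(src->instr) + 1);
		}
		instr->depth = d;
		return d;
	}

The caller would run ir3_clear_mark() afterwards, so the next pass starts from a clean slate.
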
+ */ + unsigned i; + for (i = 0; i < shader->instrs_count; i++) { + struct ir3_instruction *instr = shader->instrs[i]; + instr->flags &= ~IR3_INSTR_MARK; + } +} + +static inline int ir3_instr_regno(struct ir3_instruction *instr, + struct ir3_register *reg) +{ + unsigned i; + for (i = 0; i < instr->regs_count; i++) + if (reg == instr->regs[i]) + return i; + return -1; +} + + +/* comp: + * 0 - x + * 1 - y + * 2 - z + * 3 - w + */ +static inline uint32_t regid(int num, int comp) +{ + return (num << 2) | (comp & 0x3); +} + +static inline uint32_t reg_num(struct ir3_register *reg) +{ + return reg->num >> 2; +} + +static inline uint32_t reg_comp(struct ir3_register *reg) +{ + return reg->num & 0x3; +} + +static inline bool is_flow(struct ir3_instruction *instr) +{ + return (instr->category == 0); +} + +static inline bool is_kill(struct ir3_instruction *instr) +{ + return is_flow(instr) && (instr->opc == OPC_KILL); +} + +static inline bool is_nop(struct ir3_instruction *instr) +{ + return is_flow(instr) && (instr->opc == OPC_NOP); +} + +static inline bool is_alu(struct ir3_instruction *instr) +{ + return (1 <= instr->category) && (instr->category <= 3); +} + +static inline bool is_sfu(struct ir3_instruction *instr) +{ + return (instr->category == 4); +} + +static inline bool is_tex(struct ir3_instruction *instr) +{ + return (instr->category == 5); +} + +static inline bool is_input(struct ir3_instruction *instr) +{ + return (instr->category == 2) && (instr->opc == OPC_BARY_F); +} + +static inline bool is_meta(struct ir3_instruction *instr) +{ + /* TODO how should we count PHI (and maybe fan-in/out) which + * might actually contribute some instructions to the final + * result? + */ + return (instr->category == -1); +} + +static inline bool is_addr(struct ir3_instruction *instr) +{ + return is_meta(instr) && (instr->opc == OPC_META_DEREF); +} + +static inline bool writes_addr(struct ir3_instruction *instr) +{ + if (instr->regs_count > 0) { + struct ir3_register *dst = instr->regs[0]; + return !!(dst->flags & IR3_REG_ADDR); + } + return false; +} + +static inline bool writes_pred(struct ir3_instruction *instr) +{ + if (instr->regs_count > 0) { + struct ir3_register *dst = instr->regs[0]; + return reg_num(dst) == REG_P0; + } + return false; +} + +static inline bool reg_gpr(struct ir3_register *r) +{ + if (r->flags & (IR3_REG_CONST | IR3_REG_IMMED | IR3_REG_RELATIV | IR3_REG_SSA | IR3_REG_ADDR)) + return false; + if ((reg_num(r) == REG_A0) || (reg_num(r) == REG_P0)) + return false; + return true; +} + +/* dump: */ +#include <stdio.h> +void ir3_dump(struct ir3 *shader, const char *name, + struct ir3_block *block /* XXX maybe 'block' ptr should move to ir3? 
*/, + FILE *f); +void ir3_dump_instr_single(struct ir3_instruction *instr); +void ir3_dump_instr_list(struct ir3_instruction *instr); + +/* flatten if/else: */ +int ir3_block_flatten(struct ir3_block *block); + +/* depth calculation: */ +int ir3_delayslots(struct ir3_instruction *assigner, + struct ir3_instruction *consumer, unsigned n); +void ir3_block_depth(struct ir3_block *block); + +/* copy-propagate: */ +void ir3_block_cp(struct ir3_block *block); + +/* scheduling: */ +void ir3_block_sched(struct ir3_block *block); + +/* register assignment: */ +int ir3_block_ra(struct ir3_block *block, enum shader_t type, + bool half_precision, bool frag_coord, bool frag_face, + bool *has_samp); + +#ifndef ARRAY_SIZE +# define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0])) +#endif + +/* ************************************************************************* */ +/* split this out or find some helper to use.. like main/bitset.h.. */ + +#include <string.h> + +#define MAX_REG 256 + +typedef uint8_t regmask_t[2 * MAX_REG / 8]; + +static inline unsigned regmask_idx(struct ir3_register *reg) +{ + unsigned num = reg->num; + assert(num < MAX_REG); + if (reg->flags & IR3_REG_HALF) + num += MAX_REG; + return num; +} + +static inline void regmask_init(regmask_t *regmask) +{ + memset(regmask, 0, sizeof(*regmask)); +} + +static inline void regmask_set(regmask_t *regmask, struct ir3_register *reg) +{ + unsigned idx = regmask_idx(reg); + unsigned i; + for (i = 0; i < 4; i++, idx++) + if (reg->wrmask & (1 << i)) + (*regmask)[idx / 8] |= 1 << (idx % 8); +} + +/* set bits in a if not set in b, conceptually: + * a |= (reg & ~b) + */ +static inline void regmask_set_if_not(regmask_t *a, + struct ir3_register *reg, regmask_t *b) +{ + unsigned idx = regmask_idx(reg); + unsigned i; + for (i = 0; i < 4; i++, idx++) + if (reg->wrmask & (1 << i)) + if (!((*b)[idx / 8] & (1 << (idx % 8)))) + (*a)[idx / 8] |= 1 << (idx % 8); +} + +static inline unsigned regmask_get(regmask_t *regmask, + struct ir3_register *reg) +{ + unsigned idx = regmask_idx(reg); + unsigned i; + for (i = 0; i < 4; i++, idx++) + if (reg->wrmask & (1 << i)) + if ((*regmask)[idx / 8] & (1 << (idx % 8))) + return true; + return false; +} + +/* ************************************************************************* */ + +#endif /* IR3_H_ */ diff --git a/src/gallium/drivers/freedreno/ir3/ir3_compiler.c b/src/gallium/drivers/freedreno/ir3/ir3_compiler.c new file mode 100644 index 00000000000..1fa2fd4e389 --- /dev/null +++ b/src/gallium/drivers/freedreno/ir3/ir3_compiler.c @@ -0,0 +1,2639 @@ +/* -*- mode: C; c-file-style: "k&r"; tab-width 4; indent-tabs-mode: t; -*- */ + +/* + * Copyright (C) 2013 Rob Clark <[email protected]> + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. 
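
The regmask helpers at the end of ir3.h above are a flat bitset over both register namespaces, with half registers offset by MAX_REG. As a rough illustration of the intended usage (a sketch, not a pass from this patch; it assumes instrs[] is in emit order and that regs[0] is the dst with srcs following, as elsewhere in the IR):

	/* collect GPRs that are read before any instruction writes them: */
	static void
	scan_reads_before_writes(struct ir3 *shader, regmask_t *needed)
	{
		regmask_t written;
		unsigned i, n;

		regmask_init(&written);
		regmask_init(needed);

		for (i = 0; i < shader->instrs_count; i++) {
			struct ir3_instruction *instr = shader->instrs[i];
			for (n = 1; n < instr->regs_count; n++)
				if (reg_gpr(instr->regs[n]))
					regmask_set_if_not(needed, instr->regs[n], &written);
			if ((instr->regs_count > 0) && reg_gpr(instr->regs[0]))
				regmask_set(&written, instr->regs[0]);
		}
	}

Because regmask_set()/regmask_get() iterate over wrmask, multi-component reads and writes (e.g. tex fetches) are tracked per scalar component, which is what register assignment needs.
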
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Authors: + * Rob Clark <[email protected]> + */ + +#include <stdarg.h> + +#include "pipe/p_state.h" +#include "util/u_string.h" +#include "util/u_memory.h" +#include "util/u_inlines.h" +#include "tgsi/tgsi_parse.h" +#include "tgsi/tgsi_ureg.h" +#include "tgsi/tgsi_info.h" +#include "tgsi/tgsi_strings.h" +#include "tgsi/tgsi_dump.h" +#include "tgsi/tgsi_scan.h" + +#include "freedreno_lowering.h" +#include "freedreno_util.h" + +#include "ir3_compiler.h" +#include "ir3_shader.h" + +#include "instr-a3xx.h" +#include "ir3.h" + +struct ir3_compile_context { + const struct tgsi_token *tokens; + bool free_tokens; + struct ir3 *ir; + struct ir3_shader_variant *so; + + struct ir3_block *block; + struct ir3_instruction *current_instr; + + /* we need to defer updates to block->outputs[] until the end + * of an instruction (so we don't see new value until *after* + * the src registers are processed) + */ + struct { + struct ir3_instruction *instr, **instrp; + } output_updates[16]; + unsigned num_output_updates; + + /* are we in a sequence of "atomic" instructions? + */ + bool atomic; + + /* For fragment shaders, from the hw perspective the only + * actual input is r0.xy position register passed to bary.f. + * But TGSI doesn't know that, it still declares things as + * IN[] registers. So we do all the input tracking normally + * and fix things up after compile_instructions() + * + * NOTE that frag_pos is the hardware position (possibly it + * is actually an index or tag or some such.. it is *not* + * values that can be directly used for gl_FragCoord..) 
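
To make the deferred-update scheme concrete: the translaters below call add_dst_reg() before add_src_reg(), so if a TGSI op reads the register it is also writing, resolving the src immediately would point it at the very instruction being built. Queuing the write in output_updates[] and only applying it in instr_finish() keeps the previous producer visible. A sketch of the ordering (register indices illustrative):

	/* hypothetical translation of "ADD TEMP[0].x, TEMP[0].x, CONST[0].x": */
	instr = instr_create(ctx, 2, OPC_ADD_F);
	add_dst_reg(ctx, instr, dst, 0);  /* queues TEMP[0].x := instr        */
	add_src_reg(ctx, instr, src, 0);  /* still resolves to the old writer */
	/* the next instr_create() runs instr_finish(), which finally points
	 * block->temporaries[] at instr
	 */
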
+ */ + struct ir3_instruction *frag_pos, *frag_face, *frag_coord[4]; + + struct tgsi_parse_context parser; + unsigned type; + + struct tgsi_shader_info info; + + /* for calculating input/output positions/linkages: */ + unsigned next_inloc; + + unsigned num_internal_temps; + struct tgsi_src_register internal_temps[6]; + + /* idx/slot for last compiler generated immediate */ + unsigned immediate_idx; + + /* stack of branch instructions that mark (potentially nested) + * branch if/else/loop/etc + */ + struct { + struct ir3_instruction *instr, *cond; + bool inv; /* true iff in else leg of branch */ + } branch[16]; + unsigned int branch_count; + + /* list of kill instructions: */ + struct ir3_instruction *kill[16]; + unsigned int kill_count; + + /* used when dst is same as one of the src, to avoid overwriting a + * src element before the remaining scalar instructions that make + * up the vector operation + */ + struct tgsi_dst_register tmp_dst; + struct tgsi_src_register *tmp_src; +}; + + +static void vectorize(struct ir3_compile_context *ctx, + struct ir3_instruction *instr, struct tgsi_dst_register *dst, + int nsrcs, ...); +static void create_mov(struct ir3_compile_context *ctx, + struct tgsi_dst_register *dst, struct tgsi_src_register *src); +static type_t get_ftype(struct ir3_compile_context *ctx); + +static unsigned +compile_init(struct ir3_compile_context *ctx, struct ir3_shader_variant *so, + const struct tgsi_token *tokens) +{ + unsigned ret; + struct tgsi_shader_info *info = &ctx->info; + const struct fd_lowering_config lconfig = { + .color_two_side = so->key.color_two_side, + .lower_DST = true, + .lower_XPD = true, + .lower_SCS = true, + .lower_LRP = true, + .lower_FRC = true, + .lower_POW = true, + .lower_LIT = true, + .lower_EXP = true, + .lower_LOG = true, + .lower_DP4 = true, + .lower_DP3 = true, + .lower_DPH = true, + .lower_DP2 = true, + .lower_DP2A = true, + }; + + ctx->tokens = fd_transform_lowering(&lconfig, tokens, &ctx->info); + ctx->free_tokens = !!ctx->tokens; + if (!ctx->tokens) { + /* no lowering */ + ctx->tokens = tokens; + } + ctx->ir = so->ir; + ctx->so = so; + ctx->next_inloc = 8; + ctx->num_internal_temps = 0; + ctx->branch_count = 0; + ctx->kill_count = 0; + ctx->block = NULL; + ctx->current_instr = NULL; + ctx->num_output_updates = 0; + ctx->atomic = false; + ctx->frag_pos = NULL; + ctx->frag_face = NULL; + + memset(ctx->frag_coord, 0, sizeof(ctx->frag_coord)); + +#define FM(x) (1 << TGSI_FILE_##x) + /* optimize can't deal with relative addressing: */ + if (info->indirect_files & (FM(TEMPORARY) | FM(INPUT) | FM(OUTPUT))) + return TGSI_PARSE_ERROR; + + /* Immediates go after constants: */ + so->first_immediate = info->file_max[TGSI_FILE_CONSTANT] + 1; + ctx->immediate_idx = 4 * (ctx->info.file_max[TGSI_FILE_IMMEDIATE] + 1); + + ret = tgsi_parse_init(&ctx->parser, ctx->tokens); + if (ret != TGSI_PARSE_OK) + return ret; + + ctx->type = ctx->parser.FullHeader.Processor.Processor; + + return ret; +} + +static void +compile_error(struct ir3_compile_context *ctx, const char *format, ...) 
+{ + va_list ap; + va_start(ap, format); + _debug_vprintf(format, ap); + va_end(ap); + tgsi_dump(ctx->tokens, 0); + debug_assert(0); +} + +#define compile_assert(ctx, cond) do { \ + if (!(cond)) compile_error((ctx), "failed assert: "#cond"\n"); \ + } while (0) + +static void +compile_free(struct ir3_compile_context *ctx) +{ + if (ctx->free_tokens) + free((void *)ctx->tokens); + tgsi_parse_free(&ctx->parser); +} + +struct instr_translater { + void (*fxn)(const struct instr_translater *t, + struct ir3_compile_context *ctx, + struct tgsi_full_instruction *inst); + unsigned tgsi_opc; + opc_t opc; + opc_t hopc; /* opc to use for half_precision mode, if different */ + unsigned arg; +}; + +static void +instr_finish(struct ir3_compile_context *ctx) +{ + unsigned i; + + if (ctx->atomic) + return; + + for (i = 0; i < ctx->num_output_updates; i++) + *(ctx->output_updates[i].instrp) = ctx->output_updates[i].instr; + + ctx->num_output_updates = 0; +} + +/* For "atomic" groups of instructions, for example the four scalar + * instructions to perform a vec4 operation. Basically this just + * blocks out handling of output_updates so the next scalar instruction + * still sees the result from before the start of the atomic group. + * + * NOTE: when used properly, this could probably replace get/put_dst() + * stuff. + */ +static void +instr_atomic_start(struct ir3_compile_context *ctx) +{ + ctx->atomic = true; +} + +static void +instr_atomic_end(struct ir3_compile_context *ctx) +{ + ctx->atomic = false; + instr_finish(ctx); +} + +static struct ir3_instruction * +instr_create(struct ir3_compile_context *ctx, int category, opc_t opc) +{ + instr_finish(ctx); + return (ctx->current_instr = ir3_instr_create(ctx->block, category, opc)); +} + +static struct ir3_instruction * +instr_clone(struct ir3_compile_context *ctx, struct ir3_instruction *instr) +{ + instr_finish(ctx); + return (ctx->current_instr = ir3_instr_clone(instr)); +} + +static struct ir3_block * +push_block(struct ir3_compile_context *ctx) +{ + struct ir3_block *block; + unsigned ntmp, nin, nout; + +#define SCALAR_REGS(file) (4 * (ctx->info.file_max[TGSI_FILE_ ## file] + 1)) + + /* hmm, give ourselves room to create 4 extra temporaries (vec4): + */ + ntmp = SCALAR_REGS(TEMPORARY); + ntmp += 4 * 4; + + nout = SCALAR_REGS(OUTPUT); + nin = SCALAR_REGS(INPUT); + + /* for outermost block, 'inputs' are the actual shader INPUT + * register file. Reads from INPUT registers always go back to + * top block. For nested blocks, 'inputs' is used to track any + * TEMPORARY file register from one of the enclosing blocks that + * is ready in this block. 
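
The atomic flag composes with those deferred updates: between instr_atomic_start() and instr_atomic_end(), instr_finish() is a no-op, so every scalar instruction in the group resolves its srcs against pre-group values, and all the queued dst updates land at once when the group closes. Roughly:

	/* sketch: scalarizing one vector op as an atomic group */
	instr_atomic_start(ctx);
	/* ... build the .x instance with instr_create(), then one
	 * instr_clone() per remaining WriteMask channel, calling
	 * ssa_dst() for each channel's dst ... */
	instr_atomic_end(ctx);   /* flushes all queued output_updates */

This is exactly the bracket that vectorize() (further down) puts around its clones.
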
+ */ + if (!ctx->block) { + /* NOTE: fragment shaders actually have two inputs (r0.xy, the + * position) + */ + if (ctx->type == TGSI_PROCESSOR_FRAGMENT) { + int n = 2; + if (ctx->info.reads_position) + n += 4; + if (ctx->info.uses_frontface) + n += 4; + nin = MAX2(n, nin); + nout += ARRAY_SIZE(ctx->kill); + } + } else { + nin = ntmp; + } + + block = ir3_block_create(ctx->ir, ntmp, nin, nout); + + if ((ctx->type == TGSI_PROCESSOR_FRAGMENT) && !ctx->block) + block->noutputs -= ARRAY_SIZE(ctx->kill); + + block->parent = ctx->block; + ctx->block = block; + + return block; +} + +static void +pop_block(struct ir3_compile_context *ctx) +{ + ctx->block = ctx->block->parent; + compile_assert(ctx, ctx->block); +} + +static struct ir3_instruction * +create_output(struct ir3_block *block, struct ir3_instruction *instr, + unsigned n) +{ + struct ir3_instruction *out; + + out = ir3_instr_create(block, -1, OPC_META_OUTPUT); + out->inout.block = block; + ir3_reg_create(out, n, 0); + if (instr) + ir3_reg_create(out, 0, IR3_REG_SSA)->instr = instr; + + return out; +} + +static struct ir3_instruction * +create_input(struct ir3_block *block, struct ir3_instruction *instr, + unsigned n) +{ + struct ir3_instruction *in; + + in = ir3_instr_create(block, -1, OPC_META_INPUT); + in->inout.block = block; + ir3_reg_create(in, n, 0); + if (instr) + ir3_reg_create(in, 0, IR3_REG_SSA)->instr = instr; + + return in; +} + +static struct ir3_instruction * +block_input(struct ir3_block *block, unsigned n) +{ + /* references to INPUT register file always go back up to + * top level: + */ + if (block->parent) + return block_input(block->parent, n); + return block->inputs[n]; +} + +/* return temporary in scope, creating if needed meta-input node + * to track block inputs + */ +static struct ir3_instruction * +block_temporary(struct ir3_block *block, unsigned n) +{ + /* references to TEMPORARY register file, find the nearest + * enclosing block which has already assigned this temporary, + * creating meta-input instructions along the way to keep + * track of block inputs + */ + if (block->parent && !block->temporaries[n]) { + /* if already have input for this block, reuse: */ + if (!block->inputs[n]) + block->inputs[n] = block_temporary(block->parent, n); + + /* and create new input to return: */ + return create_input(block, block->inputs[n], n); + } + return block->temporaries[n]; +} + +static struct ir3_instruction * +create_immed(struct ir3_compile_context *ctx, float val) +{ + /* NOTE: *don't* use instr_create() here! + */ + struct ir3_instruction *instr; + instr = ir3_instr_create(ctx->block, 1, 0); + instr->cat1.src_type = get_ftype(ctx); + instr->cat1.dst_type = get_ftype(ctx); + ir3_reg_create(instr, 0, 0); + ir3_reg_create(instr, 0, IR3_REG_IMMED)->fim_val = val; + return instr; +} + +static void +ssa_dst(struct ir3_compile_context *ctx, struct ir3_instruction *instr, + const struct tgsi_dst_register *dst, unsigned chan) +{ + unsigned n = regid(dst->Index, chan); + unsigned idx = ctx->num_output_updates; + + compile_assert(ctx, idx < ARRAY_SIZE(ctx->output_updates)); + + /* NOTE: defer update of temporaries[idx] or output[idx] + * until instr_finish(), so that if the current instruction + * reads the same TEMP/OUT[] it gets the old value: + * + * bleh.. this might be a bit easier to just figure out + * in instr_finish(). But at that point we've already + * lost information about OUTPUT vs TEMPORARY register + * file.. 
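
block_temporary() above is easiest to see with a concrete nesting: if TEMP[1].x is written in the top-level block and read two if-levels down, the lookup recurses through both parents, creating one OPC_META_INPUT per intervening block and recording it in that block's inputs[], which is what the PHI construction at ENDIF later consults. Sketch (block names hypothetical):

	/* resolve TEMP[1].x, i.e. n = regid(1, 0), from a nested block: */
	struct ir3_instruction *val = block_temporary(inner_block, regid(1, 0));
	/* outer:  temporaries[n] = <producer>
	 * middle: inputs[n] -> meta-input chained to <producer>
	 * inner:  returns a fresh meta-input chained to middle's
	 */
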
+ */ + + switch (dst->File) { + case TGSI_FILE_OUTPUT: + compile_assert(ctx, n < ctx->block->noutputs); + ctx->output_updates[idx].instrp = &ctx->block->outputs[n]; + ctx->output_updates[idx].instr = instr; + ctx->num_output_updates++; + break; + case TGSI_FILE_TEMPORARY: + compile_assert(ctx, n < ctx->block->ntemporaries); + ctx->output_updates[idx].instrp = &ctx->block->temporaries[n]; + ctx->output_updates[idx].instr = instr; + ctx->num_output_updates++; + break; + case TGSI_FILE_ADDRESS: + compile_assert(ctx, n < 1); + ctx->output_updates[idx].instrp = &ctx->block->address; + ctx->output_updates[idx].instr = instr; + ctx->num_output_updates++; + break; + } +} + +static void +ssa_src(struct ir3_compile_context *ctx, struct ir3_register *reg, + const struct tgsi_src_register *src, unsigned chan) +{ + struct ir3_block *block = ctx->block; + unsigned n = regid(src->Index, chan); + + switch (src->File) { + case TGSI_FILE_INPUT: + reg->flags |= IR3_REG_SSA; + reg->instr = block_input(ctx->block, n); + break; + case TGSI_FILE_OUTPUT: + /* really this should just happen in case of 'MOV_SAT OUT[n], ..', + * for the following clamp instructions: + */ + reg->flags |= IR3_REG_SSA; + reg->instr = block->outputs[n]; + /* we don't have to worry about read from an OUTPUT that was + * assigned outside of the current block, because the _SAT + * clamp instructions will always be in the same block as + * the original instruction which wrote the OUTPUT + */ + compile_assert(ctx, reg->instr); + break; + case TGSI_FILE_TEMPORARY: + reg->flags |= IR3_REG_SSA; + reg->instr = block_temporary(ctx->block, n); + break; + } + + if ((reg->flags & IR3_REG_SSA) && !reg->instr) { + /* this can happen when registers (or components of a TGSI + * register) are used as src before they have been assigned + * (undefined contents). To avoid confusing the rest of the + * compiler, and to generally keep things peachy, substitute + * an instruction that sets the src to 0.0. Or to keep + * things undefined, I could plug in a random number? :-P + * + * NOTE: *don't* use instr_create() here! + */ + reg->instr = create_immed(ctx, 0.0); + } +} + +static struct ir3_register * +add_dst_reg_wrmask(struct ir3_compile_context *ctx, + struct ir3_instruction *instr, const struct tgsi_dst_register *dst, + unsigned chan, unsigned wrmask) +{ + unsigned flags = 0, num = 0; + struct ir3_register *reg; + + switch (dst->File) { + case TGSI_FILE_OUTPUT: + case TGSI_FILE_TEMPORARY: + /* uses SSA */ + break; + case TGSI_FILE_ADDRESS: + flags |= IR3_REG_ADDR; + /* uses SSA */ + break; + default: + compile_error(ctx, "unsupported dst register file: %s\n", + tgsi_file_name(dst->File)); + break; + } + + if (dst->Indirect) + flags |= IR3_REG_RELATIV; + + reg = ir3_reg_create(instr, regid(num, chan), flags); + + /* NOTE: do not call ssa_dst() if atomic.. vectorize() + * itself will call ssa_dst(). This is to filter out + * the (initially bogus) .x component dst which is + * created (but not necessarily used, ie. 
if the net
+	 * result of the vector operation does not write to
+	 * the .x component)
+	 */
+
+	reg->wrmask = wrmask;
+	if (wrmask == 0x1) {
+		/* normal case */
+		if (!ctx->atomic)
+			ssa_dst(ctx, instr, dst, chan);
+	} else if ((dst->File == TGSI_FILE_TEMPORARY) ||
+			(dst->File == TGSI_FILE_OUTPUT) ||
+			(dst->File == TGSI_FILE_ADDRESS)) {
+		unsigned i;
+
+		/* if instruction writes multiple, we need to create
+		 * some place-holder to collect the registers:
+		 */
+		for (i = 0; i < 4; i++) {
+			if (wrmask & (1 << i)) {
+				struct ir3_instruction *collect =
+						ir3_instr_create(ctx->block, -1, OPC_META_FO);
+				collect->fo.off = i;
+				/* unused dst reg: */
+				ir3_reg_create(collect, 0, 0);
+				/* and src reg used to hold original instr */
+				ir3_reg_create(collect, 0, IR3_REG_SSA)->instr = instr;
+				if (!ctx->atomic)
+					ssa_dst(ctx, collect, dst, chan+i);
+			}
+		}
+	}
+
+	return reg;
+}
+
+static struct ir3_register *
+add_dst_reg(struct ir3_compile_context *ctx, struct ir3_instruction *instr,
+		const struct tgsi_dst_register *dst, unsigned chan)
+{
+	return add_dst_reg_wrmask(ctx, instr, dst, chan, 0x1);
+}
+
+static struct ir3_register *
+add_src_reg_wrmask(struct ir3_compile_context *ctx,
+		struct ir3_instruction *instr, const struct tgsi_src_register *src,
+		unsigned chan, unsigned wrmask)
+{
+	unsigned flags = 0, num = 0;
+	struct ir3_register *reg;
+	struct ir3_instruction *orig = NULL;
+
+	/* TODO we need to use a mov to temp for const >= 64.. or maybe
+	 * we could use relative addressing..
+	 */
+	compile_assert(ctx, src->Index < 64);
+
+	switch (src->File) {
+	case TGSI_FILE_IMMEDIATE:
+		/* TODO if possible, use actual immediate instead of const.. but
+		 * TGSI has vec4 immediates, we can only embed scalar (of limited
+		 * size, depending on instruction..)
+		 */
+		flags |= IR3_REG_CONST;
+		num = src->Index + ctx->so->first_immediate;
+		break;
+	case TGSI_FILE_CONSTANT:
+		flags |= IR3_REG_CONST;
+		num = src->Index;
+		break;
+	case TGSI_FILE_OUTPUT:
+		/* NOTE: we should only end up w/ OUTPUT file for things like
+		 * clamp()'ing saturated dst instructions
+		 */
+	case TGSI_FILE_INPUT:
+	case TGSI_FILE_TEMPORARY:
+		/* uses SSA */
+		break;
+	default:
+		compile_error(ctx, "unsupported src register file: %s\n",
+			tgsi_file_name(src->File));
+		break;
+	}
+
+	if (src->Absolute)
+		flags |= IR3_REG_ABS;
+	if (src->Negate)
+		flags |= IR3_REG_NEGATE;
+
+	if (src->Indirect) {
+		flags |= IR3_REG_RELATIV;
+
+		/* shouldn't happen, and we can't cope with it below: */
+		compile_assert(ctx, wrmask == 0x1);
+
+		/* wrap in a meta-deref to track both the src and address: */
+		orig = instr;
+
+		instr = ir3_instr_create(ctx->block, -1, OPC_META_DEREF);
+		ir3_reg_create(instr, 0, 0);
+		ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = ctx->block->address;
+	}
+
+	reg = ir3_reg_create(instr, regid(num, chan), flags);
+
+	reg->wrmask = wrmask;
+	if (wrmask == 0x1) {
+		/* normal case */
+		ssa_src(ctx, reg, src, chan);
+	} else if ((src->File == TGSI_FILE_TEMPORARY) ||
+			(src->File == TGSI_FILE_OUTPUT) ||
+			(src->File == TGSI_FILE_INPUT)) {
+		struct ir3_instruction *collect;
+		unsigned i;
+
+		compile_assert(ctx, !src->Indirect);
+
+		/* if instruction reads multiple, we need to create
+		 * some place-holder to collect the registers:
+		 */
+		collect = ir3_instr_create(ctx->block, -1, OPC_META_FI);
+		ir3_reg_create(collect, 0, 0); /* unused dst reg */
+
+		for (i = 0; i < 4; i++) {
+			if (wrmask & (1 << i)) {
+				/* and src reg used to point to the original instr */
+				ssa_src(ctx, ir3_reg_create(collect, 0, IR3_REG_SSA),
+						src, chan + i);
+			} else if (wrmask & ~((1 << i) - 1)) {
+				/* if any remaining components, then dummy
+				 * placeholder src reg to fill in the blanks:
+				 */
+				ir3_reg_create(collect, 0, 0);
+			}
+		}
+
+		reg->flags |= IR3_REG_SSA;
+		reg->instr = collect;
+	}
+
+	if (src->Indirect) {
+		reg = ir3_reg_create(orig, 0, flags | IR3_REG_SSA);
+		reg->instr = instr;
+	}
+	return reg;
+}
+
+static struct ir3_register *
+add_src_reg(struct ir3_compile_context *ctx, struct ir3_instruction *instr,
+		const struct tgsi_src_register *src, unsigned chan)
+{
+	return add_src_reg_wrmask(ctx, instr, src, chan, 0x1);
+}
+
+static void
+src_from_dst(struct tgsi_src_register *src, struct tgsi_dst_register *dst)
+{
+	src->File = dst->File;
+	src->Indirect = dst->Indirect;
+	src->Dimension = dst->Dimension;
+	src->Index = dst->Index;
+	src->Absolute = 0;
+	src->Negate = 0;
+	src->SwizzleX = TGSI_SWIZZLE_X;
+	src->SwizzleY = TGSI_SWIZZLE_Y;
+	src->SwizzleZ = TGSI_SWIZZLE_Z;
+	src->SwizzleW = TGSI_SWIZZLE_W;
+}
+
+/* Get internal-temp src/dst to use for a sequence of instructions
+ * generated by a single TGSI op.
+ */
+static struct tgsi_src_register *
+get_internal_temp(struct ir3_compile_context *ctx,
+		struct tgsi_dst_register *tmp_dst)
+{
+	struct tgsi_src_register *tmp_src;
+	int n;
+
+	tmp_dst->File = TGSI_FILE_TEMPORARY;
+	tmp_dst->WriteMask = TGSI_WRITEMASK_XYZW;
+	tmp_dst->Indirect = 0;
+	tmp_dst->Dimension = 0;
+
+	/* assign next temporary: */
+	n = ctx->num_internal_temps++;
+	compile_assert(ctx, n < ARRAY_SIZE(ctx->internal_temps));
+	tmp_src = &ctx->internal_temps[n];
+
+	tmp_dst->Index = ctx->info.file_max[TGSI_FILE_TEMPORARY] + n + 1;
+
+	src_from_dst(tmp_src, tmp_dst);
+
+	return tmp_src;
+}
+
+static inline bool
+is_const(struct tgsi_src_register *src)
+{
+	return (src->File == TGSI_FILE_CONSTANT) ||
+			(src->File == TGSI_FILE_IMMEDIATE);
+}
+
+static inline bool
+is_relative(struct tgsi_src_register *src)
+{
+	return src->Indirect;
+}
+
+static inline bool
+is_rel_or_const(struct tgsi_src_register *src)
+{
+	return is_relative(src) || is_const(src);
+}
+
+static type_t
+get_ftype(struct ir3_compile_context *ctx)
+{
+	return TYPE_F32;
+}
+
+static type_t
+get_utype(struct ir3_compile_context *ctx)
+{
+	return TYPE_U32;
+}
+
+static unsigned
+src_swiz(struct tgsi_src_register *src, int chan)
+{
+	switch (chan) {
+	case 0: return src->SwizzleX;
+	case 1: return src->SwizzleY;
+	case 2: return src->SwizzleZ;
+	case 3: return src->SwizzleW;
+	}
+	assert(0);
+	return 0;
+}
+
+/* for instructions that cannot take a const register as src, if needed
+ * generate a move to temporary gpr:
+ */
+static struct tgsi_src_register *
+get_unconst(struct ir3_compile_context *ctx, struct tgsi_src_register *src)
+{
+	struct tgsi_dst_register tmp_dst;
+	struct tgsi_src_register *tmp_src;
+
+	compile_assert(ctx, is_rel_or_const(src));
+
+	tmp_src = get_internal_temp(ctx, &tmp_dst);
+
+	create_mov(ctx, &tmp_dst, src);
+
+	return tmp_src;
+}
+
+static void
+get_immediate(struct ir3_compile_context *ctx,
+		struct tgsi_src_register *reg, uint32_t val)
+{
+	unsigned neg, swiz, idx, i;
+	/* actually maps 1:1 currently..
not sure if that is safe to rely on: */ + static const unsigned swiz2tgsi[] = { + TGSI_SWIZZLE_X, TGSI_SWIZZLE_Y, TGSI_SWIZZLE_Z, TGSI_SWIZZLE_W, + }; + + for (i = 0; i < ctx->immediate_idx; i++) { + swiz = i % 4; + idx = i / 4; + + if (ctx->so->immediates[idx].val[swiz] == val) { + neg = 0; + break; + } + + if (ctx->so->immediates[idx].val[swiz] == -val) { + neg = 1; + break; + } + } + + if (i == ctx->immediate_idx) { + /* need to generate a new immediate: */ + swiz = i % 4; + idx = i / 4; + neg = 0; + ctx->so->immediates[idx].val[swiz] = val; + ctx->so->immediates_count = idx + 1; + ctx->immediate_idx++; + } + + reg->File = TGSI_FILE_IMMEDIATE; + reg->Indirect = 0; + reg->Dimension = 0; + reg->Index = idx; + reg->Absolute = 0; + reg->Negate = neg; + reg->SwizzleX = swiz2tgsi[swiz]; + reg->SwizzleY = swiz2tgsi[swiz]; + reg->SwizzleZ = swiz2tgsi[swiz]; + reg->SwizzleW = swiz2tgsi[swiz]; +} + +static void +create_mov(struct ir3_compile_context *ctx, struct tgsi_dst_register *dst, + struct tgsi_src_register *src) +{ + type_t type_mov = get_ftype(ctx); + unsigned i; + + for (i = 0; i < 4; i++) { + /* move to destination: */ + if (dst->WriteMask & (1 << i)) { + struct ir3_instruction *instr; + + if (src->Absolute || src->Negate) { + /* can't have abs or neg on a mov instr, so use + * absneg.f instead to handle these cases: + */ + instr = instr_create(ctx, 2, OPC_ABSNEG_F); + } else { + instr = instr_create(ctx, 1, 0); + instr->cat1.src_type = type_mov; + instr->cat1.dst_type = type_mov; + } + + add_dst_reg(ctx, instr, dst, i); + add_src_reg(ctx, instr, src, src_swiz(src, i)); + } + } +} + +static void +create_clamp(struct ir3_compile_context *ctx, + struct tgsi_dst_register *dst, struct tgsi_src_register *val, + struct tgsi_src_register *minval, struct tgsi_src_register *maxval) +{ + struct ir3_instruction *instr; + + instr = instr_create(ctx, 2, OPC_MAX_F); + vectorize(ctx, instr, dst, 2, val, 0, minval, 0); + + instr = instr_create(ctx, 2, OPC_MIN_F); + vectorize(ctx, instr, dst, 2, val, 0, maxval, 0); +} + +static void +create_clamp_imm(struct ir3_compile_context *ctx, + struct tgsi_dst_register *dst, + uint32_t minval, uint32_t maxval) +{ + struct tgsi_src_register minconst, maxconst; + struct tgsi_src_register src; + + src_from_dst(&src, dst); + + get_immediate(ctx, &minconst, minval); + get_immediate(ctx, &maxconst, maxval); + + create_clamp(ctx, dst, &src, &minconst, &maxconst); +} + +static struct tgsi_dst_register * +get_dst(struct ir3_compile_context *ctx, struct tgsi_full_instruction *inst) +{ + struct tgsi_dst_register *dst = &inst->Dst[0].Register; + unsigned i; + for (i = 0; i < inst->Instruction.NumSrcRegs; i++) { + struct tgsi_src_register *src = &inst->Src[i].Register; + if ((src->File == dst->File) && (src->Index == dst->Index)) { + if ((dst->WriteMask == TGSI_WRITEMASK_XYZW) && + (src->SwizzleX == TGSI_SWIZZLE_X) && + (src->SwizzleY == TGSI_SWIZZLE_Y) && + (src->SwizzleZ == TGSI_SWIZZLE_Z) && + (src->SwizzleW == TGSI_SWIZZLE_W)) + continue; + ctx->tmp_src = get_internal_temp(ctx, &ctx->tmp_dst); + ctx->tmp_dst.WriteMask = dst->WriteMask; + dst = &ctx->tmp_dst; + break; + } + } + return dst; +} + +static void +put_dst(struct ir3_compile_context *ctx, struct tgsi_full_instruction *inst, + struct tgsi_dst_register *dst) +{ + /* if necessary, add mov back into original dst: */ + if (dst != &inst->Dst[0].Register) { + create_mov(ctx, &inst->Dst[0].Register, ctx->tmp_src); + } +} + +/* helper to generate the necessary repeat and/or additional instructions + * to turn a scalar 
instruction into a vector operation: + */ +static void +vectorize(struct ir3_compile_context *ctx, struct ir3_instruction *instr, + struct tgsi_dst_register *dst, int nsrcs, ...) +{ + va_list ap; + int i, j, n = 0; + + instr_atomic_start(ctx); + + add_dst_reg(ctx, instr, dst, TGSI_SWIZZLE_X); + + va_start(ap, nsrcs); + for (j = 0; j < nsrcs; j++) { + struct tgsi_src_register *src = + va_arg(ap, struct tgsi_src_register *); + unsigned flags = va_arg(ap, unsigned); + struct ir3_register *reg; + if (flags & IR3_REG_IMMED) { + reg = ir3_reg_create(instr, 0, IR3_REG_IMMED); + /* this is an ugly cast.. should have put flags first! */ + reg->iim_val = *(int *)&src; + } else { + reg = add_src_reg(ctx, instr, src, TGSI_SWIZZLE_X); + } + reg->flags |= flags & ~IR3_REG_NEGATE; + if (flags & IR3_REG_NEGATE) + reg->flags ^= IR3_REG_NEGATE; + } + va_end(ap); + + for (i = 0; i < 4; i++) { + if (dst->WriteMask & (1 << i)) { + struct ir3_instruction *cur; + + if (n++ == 0) { + cur = instr; + } else { + cur = instr_clone(ctx, instr); + } + + ssa_dst(ctx, cur, dst, i); + + /* fix-up dst register component: */ + cur->regs[0]->num = regid(cur->regs[0]->num >> 2, i); + + /* fix-up src register component: */ + va_start(ap, nsrcs); + for (j = 0; j < nsrcs; j++) { + struct ir3_register *reg = cur->regs[j+1]; + struct tgsi_src_register *src = + va_arg(ap, struct tgsi_src_register *); + unsigned flags = va_arg(ap, unsigned); + if (reg->flags & IR3_REG_SSA) { + ssa_src(ctx, reg, src, src_swiz(src, i)); + } else if (!(flags & IR3_REG_IMMED)) { + reg->num = regid(reg->num >> 2, src_swiz(src, i)); + } + } + va_end(ap); + } + } + + instr_atomic_end(ctx); +} + +/* + * Handlers for TGSI instructions which do not have a 1:1 mapping to + * native instructions: + */ + +static void +trans_clamp(const struct instr_translater *t, + struct ir3_compile_context *ctx, + struct tgsi_full_instruction *inst) +{ + struct tgsi_dst_register *dst = get_dst(ctx, inst); + struct tgsi_src_register *src0 = &inst->Src[0].Register; + struct tgsi_src_register *src1 = &inst->Src[1].Register; + struct tgsi_src_register *src2 = &inst->Src[2].Register; + + create_clamp(ctx, dst, src0, src1, src2); + + put_dst(ctx, inst, dst); +} + +/* ARL(x) = x, but mova from hrN.x to a0.. */ +static void +trans_arl(const struct instr_translater *t, + struct ir3_compile_context *ctx, + struct tgsi_full_instruction *inst) +{ + struct ir3_instruction *instr; + struct tgsi_dst_register tmp_dst; + struct tgsi_src_register *tmp_src; + struct tgsi_dst_register *dst = &inst->Dst[0].Register; + struct tgsi_src_register *src = &inst->Src[0].Register; + unsigned chan = src->SwizzleX; + + compile_assert(ctx, dst->File == TGSI_FILE_ADDRESS); + + /* NOTE: we allocate a temporary from a flat register + * namespace (ignoring half vs full). It turns out + * not to really matter since registers get reassigned + * later in ir3_ra which (hopefully!) can deal a bit + * better with mixed half and full precision. 
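
vectorize() above is the workhorse for the ALU translaters: it builds the .x instance, then emits one instr_clone() per additional enabled WriteMask channel, patching the dst component and re-resolving each src swizzle. For instance, the first create_clamp() call, with a dst WriteMask of xz, conceptually expands as follows (a sketch, not literal disassembly):

	instr = instr_create(ctx, 2, OPC_MAX_F);
	vectorize(ctx, instr, dst, 2, val, 0, minval, 0);
	/* emits, for WriteMask == X|Z:
	 *   max.f dst.x, val.<swiz x>, minval.<swiz x>
	 *   max.f dst.z, val.<swiz z>, minval.<swiz z>
	 * with the second instruction produced by instr_clone()
	 */
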
+ */ + tmp_src = get_internal_temp(ctx, &tmp_dst); + + /* cov.f{32,16}s16 Rtmp, Rsrc */ + instr = instr_create(ctx, 1, 0); + instr->cat1.src_type = get_ftype(ctx); + instr->cat1.dst_type = TYPE_S16; + add_dst_reg(ctx, instr, &tmp_dst, chan)->flags |= IR3_REG_HALF; + add_src_reg(ctx, instr, src, chan); + + /* shl.b Rtmp, Rtmp, 2 */ + instr = instr_create(ctx, 2, OPC_SHL_B); + add_dst_reg(ctx, instr, &tmp_dst, chan)->flags |= IR3_REG_HALF; + add_src_reg(ctx, instr, tmp_src, chan)->flags |= IR3_REG_HALF; + ir3_reg_create(instr, 0, IR3_REG_IMMED)->iim_val = 2; + + /* mova a0, Rtmp */ + instr = instr_create(ctx, 1, 0); + instr->cat1.src_type = TYPE_S16; + instr->cat1.dst_type = TYPE_S16; + add_dst_reg(ctx, instr, dst, 0)->flags |= IR3_REG_HALF; + add_src_reg(ctx, instr, tmp_src, chan)->flags |= IR3_REG_HALF; +} + +/* + * texture fetch/sample instructions: + */ + +struct tex_info { + int8_t order[4]; + unsigned src_wrmask, flags; +}; + +static const struct tex_info * +get_tex_info(struct ir3_compile_context *ctx, + struct tgsi_full_instruction *inst) +{ + static const struct tex_info tex1d = { + .order = { 0, -1, -1, -1 }, /* coord.x */ + .src_wrmask = TGSI_WRITEMASK_XY, + .flags = 0, + }; + static const struct tex_info tex1ds = { + .order = { 0, -1, 2, -1 }, /* coord.xz */ + .src_wrmask = TGSI_WRITEMASK_XYZ, + .flags = IR3_INSTR_S, + }; + static const struct tex_info tex2d = { + .order = { 0, 1, -1, -1 }, /* coord.xy */ + .src_wrmask = TGSI_WRITEMASK_XY, + .flags = 0, + }; + static const struct tex_info tex2ds = { + .order = { 0, 1, 2, -1 }, /* coord.xyz */ + .src_wrmask = TGSI_WRITEMASK_XYZ, + .flags = IR3_INSTR_S, + }; + static const struct tex_info tex3d = { + .order = { 0, 1, 2, -1 }, /* coord.xyz */ + .src_wrmask = TGSI_WRITEMASK_XYZ, + .flags = IR3_INSTR_3D, + }; + static const struct tex_info tex3ds = { + .order = { 0, 1, 2, 3 }, /* coord.xyzw */ + .src_wrmask = TGSI_WRITEMASK_XYZW, + .flags = IR3_INSTR_S | IR3_INSTR_3D, + }; + static const struct tex_info txp1d = { + .order = { 0, -1, 3, -1 }, /* coord.xw */ + .src_wrmask = TGSI_WRITEMASK_XYZ, + .flags = IR3_INSTR_P, + }; + static const struct tex_info txp1ds = { + .order = { 0, -1, 2, 3 }, /* coord.xzw */ + .src_wrmask = TGSI_WRITEMASK_XYZW, + .flags = IR3_INSTR_P | IR3_INSTR_S, + }; + static const struct tex_info txp2d = { + .order = { 0, 1, 3, -1 }, /* coord.xyw */ + .src_wrmask = TGSI_WRITEMASK_XYZ, + .flags = IR3_INSTR_P, + }; + static const struct tex_info txp2ds = { + .order = { 0, 1, 2, 3 }, /* coord.xyzw */ + .src_wrmask = TGSI_WRITEMASK_XYZW, + .flags = IR3_INSTR_P | IR3_INSTR_S, + }; + static const struct tex_info txp3d = { + .order = { 0, 1, 2, 3 }, /* coord.xyzw */ + .src_wrmask = TGSI_WRITEMASK_XYZW, + .flags = IR3_INSTR_P | IR3_INSTR_3D, + }; + + unsigned tex = inst->Texture.Texture; + + switch (inst->Instruction.Opcode) { + case TGSI_OPCODE_TEX: + switch (tex) { + case TGSI_TEXTURE_1D: + return &tex1d; + case TGSI_TEXTURE_SHADOW1D: + return &tex1ds; + case TGSI_TEXTURE_2D: + case TGSI_TEXTURE_RECT: + return &tex2d; + case TGSI_TEXTURE_SHADOW2D: + case TGSI_TEXTURE_SHADOWRECT: + return &tex2ds; + case TGSI_TEXTURE_3D: + case TGSI_TEXTURE_CUBE: + return &tex3d; + case TGSI_TEXTURE_SHADOWCUBE: + return &tex3ds; + default: + compile_error(ctx, "unknown texture type: %s\n", + tgsi_texture_names[tex]); + return NULL; + } + break; + case TGSI_OPCODE_TXP: + switch (tex) { + case TGSI_TEXTURE_1D: + return &txp1d; + case TGSI_TEXTURE_SHADOW1D: + return &txp1ds; + case TGSI_TEXTURE_2D: + case TGSI_TEXTURE_RECT: + return &txp2d; + 
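
The order[] tables read as "which incoming coordinate channel feeds sam src component i", with -1 marking unused slots. So txp2d ({ 0, 1, 3, -1 }) routes coord.w, the projector, into the third component, and get_tex_coord() below emits movs whenever the incoming swizzle is not already laid out that way. An illustrative case:

	coord = get_tex_coord(ctx, inst, tinf);
	/* for TXP on 2D with coord = IN[0].xyzw and tinf = &txp2d:
	 *   mov tmp.x, coord.x
	 *   mov tmp.y, coord.y
	 *   mov tmp.z, coord.w      (projector moved into slot 2)
	 * then the sam is emitted with src_wrmask = TGSI_WRITEMASK_XYZ
	 * and IR3_INSTR_P set
	 */
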
case TGSI_TEXTURE_SHADOW2D: + case TGSI_TEXTURE_SHADOWRECT: + return &txp2ds; + case TGSI_TEXTURE_3D: + case TGSI_TEXTURE_CUBE: + return &txp3d; + default: + compile_error(ctx, "unknown texture type: %s\n", + tgsi_texture_names[tex]); + break; + } + break; + } + compile_assert(ctx, 0); + return NULL; +} + +static struct tgsi_src_register * +get_tex_coord(struct ir3_compile_context *ctx, + struct tgsi_full_instruction *inst, + const struct tex_info *tinf) +{ + struct tgsi_src_register *coord = &inst->Src[0].Register; + struct ir3_instruction *instr; + unsigned tex = inst->Texture.Texture; + bool needs_mov = false; + unsigned i; + + /* cat5 instruction cannot seem to handle const or relative: */ + if (is_rel_or_const(coord)) + needs_mov = true; + + /* 1D textures we fix up w/ 0.0 as 2nd coord: */ + if ((tex == TGSI_TEXTURE_1D) || (tex == TGSI_TEXTURE_SHADOW1D)) + needs_mov = true; + + /* The texture sample instructions need to coord in successive + * registers/components (ie. src.xy but not src.yx). And TXP + * needs the .w component in .z for 2D.. so in some cases we + * might need to emit some mov instructions to shuffle things + * around: + */ + for (i = 1; (i < 4) && (tinf->order[i] >= 0) && !needs_mov; i++) + if (src_swiz(coord, i) != (src_swiz(coord, 0) + tinf->order[i])) + needs_mov = true; + + if (needs_mov) { + struct tgsi_dst_register tmp_dst; + struct tgsi_src_register *tmp_src; + unsigned j; + + type_t type_mov = get_ftype(ctx); + + /* need to move things around: */ + tmp_src = get_internal_temp(ctx, &tmp_dst); + + for (j = 0; j < 4; j++) { + if (tinf->order[j] < 0) + continue; + instr = instr_create(ctx, 1, 0); /* mov */ + instr->cat1.src_type = type_mov; + instr->cat1.dst_type = type_mov; + add_dst_reg(ctx, instr, &tmp_dst, j); + add_src_reg(ctx, instr, coord, + src_swiz(coord, tinf->order[j])); + } + + /* fix up .y coord: */ + if ((tex == TGSI_TEXTURE_1D) || + (tex == TGSI_TEXTURE_SHADOW1D)) { + instr = instr_create(ctx, 1, 0); /* mov */ + instr->cat1.src_type = type_mov; + instr->cat1.dst_type = type_mov; + add_dst_reg(ctx, instr, &tmp_dst, 1); /* .y */ + ir3_reg_create(instr, 0, IR3_REG_IMMED)->fim_val = 0.5; + } + + coord = tmp_src; + } + + return coord; +} + +static void +trans_samp(const struct instr_translater *t, + struct ir3_compile_context *ctx, + struct tgsi_full_instruction *inst) +{ + struct ir3_instruction *instr; + struct tgsi_dst_register *dst = &inst->Dst[0].Register; + struct tgsi_src_register *coord; + struct tgsi_src_register *samp = &inst->Src[1].Register; + const struct tex_info *tinf; + + tinf = get_tex_info(ctx, inst); + coord = get_tex_coord(ctx, inst, tinf); + + instr = instr_create(ctx, 5, t->opc); + instr->cat5.type = get_ftype(ctx); + instr->cat5.samp = samp->Index; + instr->cat5.tex = samp->Index; + instr->flags |= tinf->flags; + + add_dst_reg_wrmask(ctx, instr, dst, 0, dst->WriteMask); + add_src_reg_wrmask(ctx, instr, coord, coord->SwizzleX, tinf->src_wrmask); +} + +/* + * SEQ(a,b) = (a == b) ? 1.0 : 0.0 + * cmps.f.eq tmp0, a, b + * cov.u16f16 dst, tmp0 + * + * SNE(a,b) = (a != b) ? 1.0 : 0.0 + * cmps.f.ne tmp0, a, b + * cov.u16f16 dst, tmp0 + * + * SGE(a,b) = (a >= b) ? 1.0 : 0.0 + * cmps.f.ge tmp0, a, b + * cov.u16f16 dst, tmp0 + * + * SLE(a,b) = (a <= b) ? 1.0 : 0.0 + * cmps.f.le tmp0, a, b + * cov.u16f16 dst, tmp0 + * + * SGT(a,b) = (a > b) ? 1.0 : 0.0 + * cmps.f.gt tmp0, a, b + * cov.u16f16 dst, tmp0 + * + * SLT(a,b) = (a < b) ? 1.0 : 0.0 + * cmps.f.lt tmp0, a, b + * cov.u16f16 dst, tmp0 + * + * CMP(a,b,c) = (a < 0.0) ? 
b : c
+ * cmps.f.lt tmp0, a, {0.0}
+ * sel.b16 dst, b, tmp0, c
+ */
+static void
+trans_cmp(const struct instr_translater *t,
+		struct ir3_compile_context *ctx,
+		struct tgsi_full_instruction *inst)
+{
+	struct ir3_instruction *instr;
+	struct tgsi_dst_register tmp_dst;
+	struct tgsi_src_register *tmp_src;
+	struct tgsi_src_register constval0;
+	/* final instruction for CMP() uses orig src1 and src2: */
+	struct tgsi_dst_register *dst = get_dst(ctx, inst);
+	struct tgsi_src_register *a0, *a1, *a2;
+	unsigned condition;
+
+	tmp_src = get_internal_temp(ctx, &tmp_dst);
+
+	a0 = &inst->Src[0].Register; /* a */
+	a1 = &inst->Src[1].Register; /* b */
+
+	switch (t->tgsi_opc) {
+	case TGSI_OPCODE_SEQ:
+	case TGSI_OPCODE_FSEQ:
+		condition = IR3_COND_EQ;
+		break;
+	case TGSI_OPCODE_SNE:
+	case TGSI_OPCODE_FSNE:
+		condition = IR3_COND_NE;
+		break;
+	case TGSI_OPCODE_SGE:
+	case TGSI_OPCODE_FSGE:
+		condition = IR3_COND_GE;
+		break;
+	case TGSI_OPCODE_SLT:
+	case TGSI_OPCODE_FSLT:
+		condition = IR3_COND_LT;
+		break;
+	case TGSI_OPCODE_SLE:
+		condition = IR3_COND_LE;
+		break;
+	case TGSI_OPCODE_SGT:
+		condition = IR3_COND_GT;
+		break;
+	case TGSI_OPCODE_CMP:
+		get_immediate(ctx, &constval0, fui(0.0));
+		a0 = &inst->Src[0].Register; /* a */
+		a1 = &constval0;             /* {0.0} */
+		condition = IR3_COND_LT;
+		break;
+	default:
+		compile_assert(ctx, 0);
+		return;
+	}
+
+	if (is_const(a0) && is_const(a1))
+		a0 = get_unconst(ctx, a0);
+
+	/* cmps.f.<cond> tmp, a0, a1 */
+	instr = instr_create(ctx, 2, OPC_CMPS_F);
+	instr->cat2.condition = condition;
+	vectorize(ctx, instr, &tmp_dst, 2, a0, 0, a1, 0);
+
+	switch (t->tgsi_opc) {
+	case TGSI_OPCODE_SEQ:
+	case TGSI_OPCODE_FSEQ:
+	case TGSI_OPCODE_SGE:
+	case TGSI_OPCODE_FSGE:
+	case TGSI_OPCODE_SLE:
+	case TGSI_OPCODE_SNE:
+	case TGSI_OPCODE_FSNE:
+	case TGSI_OPCODE_SGT:
+	case TGSI_OPCODE_SLT:
+	case TGSI_OPCODE_FSLT:
+		/* cov.u16f16 dst, tmp0 */
+		instr = instr_create(ctx, 1, 0);
+		instr->cat1.src_type = get_utype(ctx);
+		instr->cat1.dst_type = get_ftype(ctx);
+		vectorize(ctx, instr, dst, 1, tmp_src, 0);
+		break;
+	case TGSI_OPCODE_CMP:
+		a1 = &inst->Src[1].Register;
+		a2 = &inst->Src[2].Register;
+		/* sel.{b32,b16} dst, src2, tmp, src1 */
+		instr = instr_create(ctx, 3, OPC_SEL_B32);
+		vectorize(ctx, instr, dst, 3, a1, 0, tmp_src, 0, a2, 0);
+
+		break;
+	}
+
+	put_dst(ctx, inst, dst);
+}
+
+/*
+ * USNE(a,b) = (a != b) ? 1 : 0
+ * cmps.u32.ne dst, a, b
+ *
+ * USEQ(a,b) = (a == b) ? 1 : 0
+ * cmps.u32.eq dst, a, b
+ *
+ * ISGE(a,b) = (a >= b) ? 1 : 0
+ * cmps.s32.ge dst, a, b
+ *
+ * USGE(a,b) = (a >= b) ? 1 : 0
+ * cmps.u32.ge dst, a, b
+ *
+ * ISLT(a,b) = (a < b) ? 1 : 0
+ * cmps.s32.lt dst, a, b
+ *
+ * USLT(a,b) = (a < b) ? 1 : 0
+ * cmps.u32.lt dst, a, b
+ *
+ * UCMP(a,b,c) = (a < 0) ?
b : c + * cmps.u32.lt tmp0, a, {0} + * sel.b16 dst, b, tmp0, c + */ +static void +trans_icmp(const struct instr_translater *t, + struct ir3_compile_context *ctx, + struct tgsi_full_instruction *inst) +{ + struct ir3_instruction *instr; + struct tgsi_dst_register *dst = get_dst(ctx, inst); + struct tgsi_src_register constval0; + struct tgsi_src_register *a0, *a1, *a2; + unsigned condition; + + a0 = &inst->Src[0].Register; /* a */ + a1 = &inst->Src[1].Register; /* b */ + + switch (t->tgsi_opc) { + case TGSI_OPCODE_USNE: + condition = IR3_COND_NE; + break; + case TGSI_OPCODE_USEQ: + condition = IR3_COND_EQ; + break; + case TGSI_OPCODE_ISGE: + case TGSI_OPCODE_USGE: + condition = IR3_COND_GE; + break; + case TGSI_OPCODE_ISLT: + case TGSI_OPCODE_USLT: + condition = IR3_COND_LT; + break; + case TGSI_OPCODE_UCMP: + get_immediate(ctx, &constval0, 0); + a0 = &inst->Src[0].Register; /* a */ + a1 = &constval0; /* {0} */ + condition = IR3_COND_LT; + break; + + default: + compile_assert(ctx, 0); + return; + } + + if (is_const(a0) && is_const(a1)) + a0 = get_unconst(ctx, a0); + + if (t->tgsi_opc == TGSI_OPCODE_UCMP) { + struct tgsi_dst_register tmp_dst; + struct tgsi_src_register *tmp_src; + tmp_src = get_internal_temp(ctx, &tmp_dst); + /* cmps.u32.lt tmp, a0, a1 */ + instr = instr_create(ctx, 2, t->opc); + instr->cat2.condition = condition; + vectorize(ctx, instr, &tmp_dst, 2, a0, 0, a1, 0); + + a1 = &inst->Src[1].Register; + a2 = &inst->Src[2].Register; + /* sel.{b32,b16} dst, src2, tmp, src1 */ + instr = instr_create(ctx, 3, OPC_SEL_B32); + vectorize(ctx, instr, dst, 3, a1, 0, tmp_src, 0, a2, 0); + } else { + /* cmps.{u32,s32}.<cond> dst, a0, a1 */ + instr = instr_create(ctx, 2, t->opc); + instr->cat2.condition = condition; + vectorize(ctx, instr, dst, 2, a0, 0, a1, 0); + } + put_dst(ctx, inst, dst); +} + +/* + * Conditional / Flow control + */ + +static void +push_branch(struct ir3_compile_context *ctx, bool inv, + struct ir3_instruction *instr, struct ir3_instruction *cond) +{ + unsigned int idx = ctx->branch_count++; + compile_assert(ctx, idx < ARRAY_SIZE(ctx->branch)); + ctx->branch[idx].instr = instr; + ctx->branch[idx].inv = inv; + /* else side of branch has same condition: */ + if (!inv) + ctx->branch[idx].cond = cond; +} + +static struct ir3_instruction * +pop_branch(struct ir3_compile_context *ctx) +{ + unsigned int idx = --ctx->branch_count; + return ctx->branch[idx].instr; +} + +static void +trans_if(const struct instr_translater *t, + struct ir3_compile_context *ctx, + struct tgsi_full_instruction *inst) +{ + struct ir3_instruction *instr, *cond; + struct tgsi_src_register *src = &inst->Src[0].Register; + struct tgsi_dst_register tmp_dst; + struct tgsi_src_register *tmp_src; + struct tgsi_src_register constval; + + get_immediate(ctx, &constval, fui(0.0)); + tmp_src = get_internal_temp(ctx, &tmp_dst); + + if (is_const(src)) + src = get_unconst(ctx, src); + + /* cmps.f.ne tmp0, b, {0.0} */ + instr = instr_create(ctx, 2, OPC_CMPS_F); + add_dst_reg(ctx, instr, &tmp_dst, 0); + add_src_reg(ctx, instr, src, src->SwizzleX); + add_src_reg(ctx, instr, &constval, constval.SwizzleX); + instr->cat2.condition = IR3_COND_NE; + + compile_assert(ctx, instr->regs[1]->flags & IR3_REG_SSA); /* because get_unconst() */ + cond = instr->regs[1]->instr; + + /* meta:flow tmp0 */ + instr = instr_create(ctx, -1, OPC_META_FLOW); + ir3_reg_create(instr, 0, 0); /* dummy dst */ + add_src_reg(ctx, instr, tmp_src, TGSI_SWIZZLE_X); + + push_branch(ctx, false, instr, cond); + instr->flow.if_block = push_block(ctx); +} + 
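
Tying trans_if() to the PHI machinery below: a TGSI if/else becomes a cmps.f.ne feeding a meta:flow instruction, a pair of nested blocks, and, at ENDIF, one meta:phi per temporary or output written on either side. As an illustrative trace, not verbatim compiler output:

	/* TGSI:                      conceptually becomes:
	 *   IF TEMP[0].xxxx            cmps.f.ne tmp, TEMP[0].x, {0.0}
	 *     MOV TEMP[1], IMM[0]      meta:flow tmp   (pushed on ctx->branch[])
	 *   ELSE                       if_block:   producer A for TEMP[1]
	 *     MOV TEMP[1], IMM[1]      else_block: producer B for TEMP[1]
	 *   ENDIF                      meta:phi(flow, A, B) -> temporaries[TEMP[1]]
	 */

Since flow and phi are category -1 meta instructions they carry no cost themselves; flattening (ir3_block_flatten()) and scheduling decide what, if anything, they ultimately lower to.
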
+static void +trans_else(const struct instr_translater *t, + struct ir3_compile_context *ctx, + struct tgsi_full_instruction *inst) +{ + struct ir3_instruction *instr; + + pop_block(ctx); + + instr = pop_branch(ctx); + + compile_assert(ctx, (instr->category == -1) && + (instr->opc == OPC_META_FLOW)); + + push_branch(ctx, true, instr, NULL); + instr->flow.else_block = push_block(ctx); +} + +static struct ir3_instruction * +find_temporary(struct ir3_block *block, unsigned n) +{ + if (block->parent && !block->temporaries[n]) + return find_temporary(block->parent, n); + return block->temporaries[n]; +} + +static struct ir3_instruction * +find_output(struct ir3_block *block, unsigned n) +{ + if (block->parent && !block->outputs[n]) + return find_output(block->parent, n); + return block->outputs[n]; +} + +static struct ir3_instruction * +create_phi(struct ir3_compile_context *ctx, struct ir3_instruction *cond, + struct ir3_instruction *a, struct ir3_instruction *b) +{ + struct ir3_instruction *phi; + + compile_assert(ctx, cond); + + /* Either side of the condition could be null.. which + * indicates a variable written on only one side of the + * branch. Normally this should only be variables not + * used outside of that side of the branch. So we could + * just 'return a ? a : b;' in that case. But for better + * defined undefined behavior we just stick in imm{0.0}. + * In the common case of a value only used within the + * one side of the branch, the PHI instruction will not + * get scheduled + */ + if (!a) + a = create_immed(ctx, 0.0); + if (!b) + b = create_immed(ctx, 0.0); + + phi = instr_create(ctx, -1, OPC_META_PHI); + ir3_reg_create(phi, 0, 0); /* dummy dst */ + ir3_reg_create(phi, 0, IR3_REG_SSA)->instr = cond; + ir3_reg_create(phi, 0, IR3_REG_SSA)->instr = a; + ir3_reg_create(phi, 0, IR3_REG_SSA)->instr = b; + + return phi; +} + +static void +trans_endif(const struct instr_translater *t, + struct ir3_compile_context *ctx, + struct tgsi_full_instruction *inst) +{ + struct ir3_instruction *instr; + struct ir3_block *ifb, *elseb; + struct ir3_instruction **ifout, **elseout; + unsigned i, ifnout = 0, elsenout = 0; + + pop_block(ctx); + + instr = pop_branch(ctx); + + compile_assert(ctx, (instr->category == -1) && + (instr->opc == OPC_META_FLOW)); + + ifb = instr->flow.if_block; + elseb = instr->flow.else_block; + /* if there is no else block, the parent block is used for the + * branch-not-taken src of the PHI instructions: + */ + if (!elseb) + elseb = ifb->parent; + + /* worst case sizes: */ + ifnout = ifb->ntemporaries + ifb->noutputs; + elsenout = elseb->ntemporaries + elseb->noutputs; + + ifout = ir3_alloc(ctx->ir, sizeof(ifb->outputs[0]) * ifnout); + if (elseb != ifb->parent) + elseout = ir3_alloc(ctx->ir, sizeof(ifb->outputs[0]) * elsenout); + + ifnout = 0; + elsenout = 0; + + /* generate PHI instructions for any temporaries written: */ + for (i = 0; i < ifb->ntemporaries; i++) { + struct ir3_instruction *a = ifb->temporaries[i]; + struct ir3_instruction *b = elseb->temporaries[i]; + + /* if temporary written in if-block, or if else block + * is present and temporary written in else-block: + */ + if (a || ((elseb != ifb->parent) && b)) { + struct ir3_instruction *phi; + + /* if only written on one side, find the closest + * enclosing update on other side: + */ + if (!a) + a = find_temporary(ifb, i); + if (!b) + b = find_temporary(elseb, i); + + ifout[ifnout] = a; + a = create_output(ifb, a, ifnout++); + + if (elseb != ifb->parent) { + elseout[elsenout] = b; + b = create_output(elseb, b, 
elsenout++); + } + + phi = create_phi(ctx, instr, a, b); + ctx->block->temporaries[i] = phi; + } + } + + compile_assert(ctx, ifb->noutputs == elseb->noutputs); + + /* .. and any outputs written: */ + for (i = 0; i < ifb->noutputs; i++) { + struct ir3_instruction *a = ifb->outputs[i]; + struct ir3_instruction *b = elseb->outputs[i]; + + /* if output written in if-block, or if else block + * is present and output written in else-block: + */ + if (a || ((elseb != ifb->parent) && b)) { + struct ir3_instruction *phi; + + /* if only written on one side, find the closest + * enclosing update on other side: + */ + if (!a) + a = find_output(ifb, i); + if (!b) + b = find_output(elseb, i); + + ifout[ifnout] = a; + a = create_output(ifb, a, ifnout++); + + if (elseb != ifb->parent) { + elseout[elsenout] = b; + b = create_output(elseb, b, elsenout++); + } + + phi = create_phi(ctx, instr, a, b); + ctx->block->outputs[i] = phi; + } + } + + ifb->noutputs = ifnout; + ifb->outputs = ifout; + + if (elseb != ifb->parent) { + elseb->noutputs = elsenout; + elseb->outputs = elseout; + } + + // TODO maybe we want to compact block->inputs? +} + +/* + * Kill + */ + +static void +trans_kill(const struct instr_translater *t, + struct ir3_compile_context *ctx, + struct tgsi_full_instruction *inst) +{ + struct ir3_instruction *instr, *immed, *cond = NULL; + bool inv = false; + + switch (t->tgsi_opc) { + case TGSI_OPCODE_KILL: + /* unconditional kill, use enclosing if condition: */ + if (ctx->branch_count > 0) { + unsigned int idx = ctx->branch_count - 1; + cond = ctx->branch[idx].cond; + inv = ctx->branch[idx].inv; + } else { + cond = create_immed(ctx, 1.0); + } + + break; + } + + compile_assert(ctx, cond); + + immed = create_immed(ctx, 0.0); + + /* cmps.f.ne p0.x, cond, {0.0} */ + instr = instr_create(ctx, 2, OPC_CMPS_F); + instr->cat2.condition = IR3_COND_NE; + ir3_reg_create(instr, regid(REG_P0, 0), 0); + ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = cond; + ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = immed; + cond = instr; + + /* kill p0.x */ + instr = instr_create(ctx, 0, OPC_KILL); + instr->cat0.inv = inv; + ir3_reg_create(instr, 0, 0); /* dummy dst */ + ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = cond; + + ctx->kill[ctx->kill_count++] = instr; +} + +/* + * Kill-If + */ + +static void +trans_killif(const struct instr_translater *t, + struct ir3_compile_context *ctx, + struct tgsi_full_instruction *inst) +{ + struct tgsi_src_register *src = &inst->Src[0].Register; + struct ir3_instruction *instr, *immed, *cond = NULL; + bool inv = false; + + immed = create_immed(ctx, 0.0); + + /* cmps.f.ne p0.x, cond, {0.0} */ + instr = instr_create(ctx, 2, OPC_CMPS_F); + instr->cat2.condition = IR3_COND_NE; + ir3_reg_create(instr, regid(REG_P0, 0), 0); + ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = immed; + add_src_reg(ctx, instr, src, src->SwizzleX); + + cond = instr; + + /* kill p0.x */ + instr = instr_create(ctx, 0, OPC_KILL); + instr->cat0.inv = inv; + ir3_reg_create(instr, 0, 0); /* dummy dst */ + ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = cond; + + ctx->kill[ctx->kill_count++] = instr; + +} +/* + * I2F / U2F / F2I / F2U + */ + +static void +trans_cov(const struct instr_translater *t, + struct ir3_compile_context *ctx, + struct tgsi_full_instruction *inst) +{ + struct ir3_instruction *instr; + struct tgsi_dst_register *dst = get_dst(ctx, inst); + struct tgsi_src_register *src = &inst->Src[0].Register; + + // cov.f32s32 dst, tmp0 / + instr = instr_create(ctx, 1, 0); + switch (t->tgsi_opc) { + case TGSI_OPCODE_U2F: 
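+		/* a sketch of the conversions this switch selects, in the
+		 * mnemonic style the disassembler uses (registers hypothetical):
+		 *
+		 *    U2F:  cov.u32f32 r0.x, r1.x
+		 *    I2F:  cov.s32f32 r0.x, r1.x
+		 *    F2U:  cov.f32u32 r0.x, r1.x
+		 *    F2I:  cov.f32s32 r0.x, r1.x
+		 */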
+ instr->cat1.src_type = TYPE_U32; + instr->cat1.dst_type = TYPE_F32; + break; + case TGSI_OPCODE_I2F: + instr->cat1.src_type = TYPE_S32; + instr->cat1.dst_type = TYPE_F32; + break; + case TGSI_OPCODE_F2U: + instr->cat1.src_type = TYPE_F32; + instr->cat1.dst_type = TYPE_U32; + break; + case TGSI_OPCODE_F2I: + instr->cat1.src_type = TYPE_F32; + instr->cat1.dst_type = TYPE_S32; + break; + + } + vectorize(ctx, instr, dst, 1, src, 0); +} + +/* + * Handlers for TGSI instructions which do have 1:1 mapping to native + * instructions: + */ + +static void +instr_cat0(const struct instr_translater *t, + struct ir3_compile_context *ctx, + struct tgsi_full_instruction *inst) +{ + instr_create(ctx, 0, t->opc); +} + +static void +instr_cat1(const struct instr_translater *t, + struct ir3_compile_context *ctx, + struct tgsi_full_instruction *inst) +{ + struct tgsi_dst_register *dst = get_dst(ctx, inst); + struct tgsi_src_register *src = &inst->Src[0].Register; + create_mov(ctx, dst, src); + put_dst(ctx, inst, dst); +} + +static void +instr_cat2(const struct instr_translater *t, + struct ir3_compile_context *ctx, + struct tgsi_full_instruction *inst) +{ + struct tgsi_dst_register *dst = get_dst(ctx, inst); + struct tgsi_src_register *src0 = &inst->Src[0].Register; + struct tgsi_src_register *src1 = &inst->Src[1].Register; + struct ir3_instruction *instr; + unsigned src0_flags = 0, src1_flags = 0; + + switch (t->tgsi_opc) { + case TGSI_OPCODE_ABS: + case TGSI_OPCODE_IABS: + src0_flags = IR3_REG_ABS; + break; + case TGSI_OPCODE_SUB: + case TGSI_OPCODE_INEG: + src1_flags = IR3_REG_NEGATE; + break; + } + + switch (t->opc) { + case OPC_ABSNEG_F: + case OPC_ABSNEG_S: + case OPC_CLZ_B: + case OPC_CLZ_S: + case OPC_SIGN_F: + case OPC_FLOOR_F: + case OPC_CEIL_F: + case OPC_RNDNE_F: + case OPC_RNDAZ_F: + case OPC_TRUNC_F: + case OPC_NOT_B: + case OPC_BFREV_B: + case OPC_SETRM: + case OPC_CBITS_B: + /* these only have one src reg */ + instr = instr_create(ctx, 2, t->opc); + vectorize(ctx, instr, dst, 1, src0, src0_flags); + break; + default: + if (is_const(src0) && is_const(src1)) + src0 = get_unconst(ctx, src0); + + instr = instr_create(ctx, 2, t->opc); + vectorize(ctx, instr, dst, 2, src0, src0_flags, + src1, src1_flags); + break; + } + + put_dst(ctx, inst, dst); +} + +static void +instr_cat3(const struct instr_translater *t, + struct ir3_compile_context *ctx, + struct tgsi_full_instruction *inst) +{ + struct tgsi_dst_register *dst = get_dst(ctx, inst); + struct tgsi_src_register *src0 = &inst->Src[0].Register; + struct tgsi_src_register *src1 = &inst->Src[1].Register; + struct ir3_instruction *instr; + + /* in particular, can't handle const for src1 for cat3.. + * for mad, we can swap first two src's if needed: + */ + if (is_rel_or_const(src1)) { + if (is_mad(t->opc) && !is_rel_or_const(src0)) { + struct tgsi_src_register *tmp; + tmp = src0; + src0 = src1; + src1 = tmp; + } else { + src1 = get_unconst(ctx, src1); + } + } + + instr = instr_create(ctx, 3, t->opc); + vectorize(ctx, instr, dst, 3, src0, 0, src1, 0, + &inst->Src[2].Register, 0); + put_dst(ctx, inst, dst); +} + +static void +instr_cat4(const struct instr_translater *t, + struct ir3_compile_context *ctx, + struct tgsi_full_instruction *inst) +{ + struct tgsi_dst_register *dst = get_dst(ctx, inst); + struct tgsi_src_register *src = &inst->Src[0].Register; + struct ir3_instruction *instr; + unsigned i; + + /* seems like blob compiler avoids const as src.. 
*/ + if (is_const(src)) + src = get_unconst(ctx, src); + + /* we need to replicate into each component: */ + for (i = 0; i < 4; i++) { + if (dst->WriteMask & (1 << i)) { + instr = instr_create(ctx, 4, t->opc); + add_dst_reg(ctx, instr, dst, i); + add_src_reg(ctx, instr, src, src->SwizzleX); + } + } + + put_dst(ctx, inst, dst); +} + +static const struct instr_translater translaters[TGSI_OPCODE_LAST] = { +#define INSTR(n, f, ...) \ + [TGSI_OPCODE_ ## n] = { .fxn = (f), .tgsi_opc = TGSI_OPCODE_ ## n, ##__VA_ARGS__ } + + INSTR(MOV, instr_cat1), + INSTR(RCP, instr_cat4, .opc = OPC_RCP), + INSTR(RSQ, instr_cat4, .opc = OPC_RSQ), + INSTR(SQRT, instr_cat4, .opc = OPC_SQRT), + INSTR(MUL, instr_cat2, .opc = OPC_MUL_F), + INSTR(ADD, instr_cat2, .opc = OPC_ADD_F), + INSTR(SUB, instr_cat2, .opc = OPC_ADD_F), + INSTR(MIN, instr_cat2, .opc = OPC_MIN_F), + INSTR(MAX, instr_cat2, .opc = OPC_MAX_F), + INSTR(UADD, instr_cat2, .opc = OPC_ADD_U), + INSTR(IMIN, instr_cat2, .opc = OPC_MIN_S), + INSTR(UMIN, instr_cat2, .opc = OPC_MIN_U), + INSTR(IMAX, instr_cat2, .opc = OPC_MAX_S), + INSTR(UMAX, instr_cat2, .opc = OPC_MAX_U), + INSTR(AND, instr_cat2, .opc = OPC_AND_B), + INSTR(OR, instr_cat2, .opc = OPC_OR_B), + INSTR(NOT, instr_cat2, .opc = OPC_NOT_B), + INSTR(XOR, instr_cat2, .opc = OPC_XOR_B), + INSTR(UMUL, instr_cat2, .opc = OPC_MUL_U), + INSTR(SHL, instr_cat2, .opc = OPC_SHL_B), + INSTR(USHR, instr_cat2, .opc = OPC_SHR_B), + INSTR(ISHR, instr_cat2, .opc = OPC_ASHR_B), + INSTR(IABS, instr_cat2, .opc = OPC_ABSNEG_S), + INSTR(INEG, instr_cat2, .opc = OPC_ABSNEG_S), + INSTR(AND, instr_cat2, .opc = OPC_AND_B), + INSTR(MAD, instr_cat3, .opc = OPC_MAD_F32, .hopc = OPC_MAD_F16), + INSTR(TRUNC, instr_cat2, .opc = OPC_TRUNC_F), + INSTR(CLAMP, trans_clamp), + INSTR(FLR, instr_cat2, .opc = OPC_FLOOR_F), + INSTR(ROUND, instr_cat2, .opc = OPC_RNDNE_F), + INSTR(SSG, instr_cat2, .opc = OPC_SIGN_F), + INSTR(CEIL, instr_cat2, .opc = OPC_CEIL_F), + INSTR(ARL, trans_arl), + INSTR(EX2, instr_cat4, .opc = OPC_EXP2), + INSTR(LG2, instr_cat4, .opc = OPC_LOG2), + INSTR(ABS, instr_cat2, .opc = OPC_ABSNEG_F), + INSTR(COS, instr_cat4, .opc = OPC_COS), + INSTR(SIN, instr_cat4, .opc = OPC_SIN), + INSTR(TEX, trans_samp, .opc = OPC_SAM, .arg = TGSI_OPCODE_TEX), + INSTR(TXP, trans_samp, .opc = OPC_SAM, .arg = TGSI_OPCODE_TXP), + INSTR(SGT, trans_cmp), + INSTR(SLT, trans_cmp), + INSTR(FSLT, trans_cmp), + INSTR(SGE, trans_cmp), + INSTR(FSGE, trans_cmp), + INSTR(SLE, trans_cmp), + INSTR(SNE, trans_cmp), + INSTR(FSNE, trans_cmp), + INSTR(SEQ, trans_cmp), + INSTR(FSEQ, trans_cmp), + INSTR(CMP, trans_cmp), + INSTR(USNE, trans_icmp, .opc = OPC_CMPS_U), + INSTR(USEQ, trans_icmp, .opc = OPC_CMPS_U), + INSTR(ISGE, trans_icmp, .opc = OPC_CMPS_S), + INSTR(USGE, trans_icmp, .opc = OPC_CMPS_U), + INSTR(ISLT, trans_icmp, .opc = OPC_CMPS_S), + INSTR(USLT, trans_icmp, .opc = OPC_CMPS_U), + INSTR(UCMP, trans_icmp, .opc = OPC_CMPS_U), + INSTR(IF, trans_if), + INSTR(UIF, trans_if), + INSTR(ELSE, trans_else), + INSTR(ENDIF, trans_endif), + INSTR(END, instr_cat0, .opc = OPC_END), + INSTR(KILL, trans_kill, .opc = OPC_KILL), + INSTR(KILL_IF, trans_killif, .opc = OPC_KILL), + INSTR(I2F, trans_cov), + INSTR(U2F, trans_cov), + INSTR(F2I, trans_cov), + INSTR(F2U, trans_cov), +}; + +static ir3_semantic +decl_semantic(const struct tgsi_declaration_semantic *sem) +{ + return ir3_semantic_name(sem->Name, sem->Index); +} + +static struct ir3_instruction * +decl_in_frag_bary(struct ir3_compile_context *ctx, unsigned regid, + unsigned j, unsigned inloc) +{ + struct 
ir3_instruction *instr;
+	struct ir3_register *src;
+
+	/* bary.f dst, #inloc, r0.x */
+	instr = instr_create(ctx, 2, OPC_BARY_F);
+	ir3_reg_create(instr, regid, 0);   /* dummy dst */
+	ir3_reg_create(instr, 0, IR3_REG_IMMED)->iim_val = inloc;
+	src = ir3_reg_create(instr, 0, IR3_REG_SSA);
+	src->wrmask = 0x3;
+	src->instr = ctx->frag_pos;
+
+	return instr;
+}
+
+/* TGSI_SEMANTIC_POSITION
+ * """"""""""""""""""""""
+ *
+ * For fragment shaders, TGSI_SEMANTIC_POSITION is used to indicate that
+ * fragment shader input contains the fragment's window position.  The X
+ * component starts at zero and always increases from left to right.
+ * The Y component starts at zero and always increases but Y=0 may either
+ * indicate the top of the window or the bottom depending on the fragment
+ * coordinate origin convention (see TGSI_PROPERTY_FS_COORD_ORIGIN).
+ * The Z coordinate ranges from 0 to 1 to represent depth from the front
+ * to the back of the Z buffer.  The W component contains the reciprocal
+ * of the interpolated vertex position W component.
+ */
+static struct ir3_instruction *
+decl_in_frag_coord(struct ir3_compile_context *ctx, unsigned regid,
+		unsigned j)
+{
+	struct ir3_instruction *instr, *src;
+
+	compile_assert(ctx, !ctx->frag_coord[j]);
+
+	ctx->frag_coord[j] = create_input(ctx->block, NULL, 0);
+
+	switch (j) {
+	case 0: /* .x */
+	case 1: /* .y */
+		/* for frag_coord, we get unsigned values.. we need
+		 * to subtract (integer) 8 and divide by 16 (right-
+		 * shift by 4) then convert to float:
+		 */
+
+		/* add.s tmp, src, -8 */
+		instr = instr_create(ctx, 2, OPC_ADD_S);
+		ir3_reg_create(instr, regid, 0);    /* dummy dst */
+		ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = ctx->frag_coord[j];
+		ir3_reg_create(instr, 0, IR3_REG_IMMED)->iim_val = -8;
+		src = instr;
+
+		/* shr.b tmp, tmp, 4 */
+		instr = instr_create(ctx, 2, OPC_SHR_B);
+		ir3_reg_create(instr, regid, 0);    /* dummy dst */
+		ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = src;
+		ir3_reg_create(instr, 0, IR3_REG_IMMED)->iim_val = 4;
+		src = instr;
+
+		/* mov.u32f32 dst, tmp */
+		instr = instr_create(ctx, 1, 0);
+		instr->cat1.src_type = TYPE_U32;
+		instr->cat1.dst_type = TYPE_F32;
+		ir3_reg_create(instr, regid, 0);    /* dummy dst */
+		ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = src;
+
+		break;
+	case 2: /* .z */
+	case 3: /* .w */
+		/* seems that we can use these as-is: */
+		instr = ctx->frag_coord[j];
+		break;
+	default:
+		compile_error(ctx, "invalid channel\n");
+		instr = create_immed(ctx, 0.0);
+		break;
+	}
+
+	return instr;
+}
+
+/* TGSI_SEMANTIC_FACE
+ * """"""""""""""""""
+ *
+ * This label applies to fragment shader inputs only and indicates that
+ * the register contains front/back-face information of the form (F, 0,
+ * 0, 1).  The first component will be positive when the fragment belongs
+ * to a front-facing polygon, and negative when the fragment belongs to a
+ * back-facing polygon.
+ */
+static struct ir3_instruction *
+decl_in_frag_face(struct ir3_compile_context *ctx, unsigned regid,
+		unsigned j)
+{
+	struct ir3_instruction *instr, *src;
+
+	switch (j) {
+	case 0: /* .x */
+		compile_assert(ctx, !ctx->frag_face);
+
+		ctx->frag_face = create_input(ctx->block, NULL, 0);
+
+		/* for faceness, we always get -1 or 0 (int).. but TGSI expects
+		 * positive vs negative float..
and piglit further seems to + * expect -1.0 or 1.0: + * + * mul.s tmp, hr0.x, 2 + * add.s tmp, tmp, 1 + * mov.s16f32, dst, tmp + * + */ + + instr = instr_create(ctx, 2, OPC_MUL_S); + ir3_reg_create(instr, regid, 0); /* dummy dst */ + ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = ctx->frag_face; + ir3_reg_create(instr, 0, IR3_REG_IMMED)->iim_val = 2; + src = instr; + + instr = instr_create(ctx, 2, OPC_ADD_S); + ir3_reg_create(instr, regid, 0); /* dummy dst */ + ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = src; + ir3_reg_create(instr, 0, IR3_REG_IMMED)->iim_val = 1; + src = instr; + + instr = instr_create(ctx, 1, 0); /* mov */ + instr->cat1.src_type = TYPE_S32; + instr->cat1.dst_type = TYPE_F32; + ir3_reg_create(instr, regid, 0); /* dummy dst */ + ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = src; + + break; + case 1: /* .y */ + case 2: /* .z */ + instr = create_immed(ctx, 0.0); + break; + case 3: /* .w */ + instr = create_immed(ctx, 1.0); + break; + default: + compile_error(ctx, "invalid channel\n"); + instr = create_immed(ctx, 0.0); + break; + } + + return instr; +} + +static void +decl_in(struct ir3_compile_context *ctx, struct tgsi_full_declaration *decl) +{ + struct ir3_shader_variant *so = ctx->so; + unsigned name = decl->Semantic.Name; + unsigned i; + + /* I don't think we should get frag shader input without + * semantic info? Otherwise how do inputs get linked to + * vert outputs? + */ + compile_assert(ctx, (ctx->type == TGSI_PROCESSOR_VERTEX) || + decl->Declaration.Semantic); + + for (i = decl->Range.First; i <= decl->Range.Last; i++) { + unsigned n = so->inputs_count++; + unsigned r = regid(i, 0); + unsigned ncomp, j; + + /* we'll figure out the actual components used after scheduling */ + ncomp = 4; + + DBG("decl in -> r%d", i); + + compile_assert(ctx, n < ARRAY_SIZE(so->inputs)); + + so->inputs[n].semantic = decl_semantic(&decl->Semantic); + so->inputs[n].compmask = (1 << ncomp) - 1; + so->inputs[n].regid = r; + so->inputs[n].inloc = ctx->next_inloc; + + for (j = 0; j < ncomp; j++) { + struct ir3_instruction *instr = NULL; + + if (ctx->type == TGSI_PROCESSOR_FRAGMENT) { + /* for fragment shaders, POSITION and FACE are handled + * specially, not using normal varying / bary.f + */ + if (name == TGSI_SEMANTIC_POSITION) { + so->inputs[n].bary = false; + so->frag_coord = true; + instr = decl_in_frag_coord(ctx, r + j, j); + } else if (name == TGSI_SEMANTIC_FACE) { + so->inputs[n].bary = false; + so->frag_face = true; + instr = decl_in_frag_face(ctx, r + j, j); + } else { + so->inputs[n].bary = true; + instr = decl_in_frag_bary(ctx, r + j, j, + so->inputs[n].inloc + j - 8); + } + } else { + instr = create_input(ctx->block, NULL, (i * 4) + j); + } + + ctx->block->inputs[(i * 4) + j] = instr; + } + + if (so->inputs[n].bary || (ctx->type == TGSI_PROCESSOR_VERTEX)) { + ctx->next_inloc += ncomp; + so->total_in += ncomp; + } + } +} + +static void +decl_out(struct ir3_compile_context *ctx, struct tgsi_full_declaration *decl) +{ + struct ir3_shader_variant *so = ctx->so; + unsigned comp = 0; + unsigned name = decl->Semantic.Name; + unsigned i; + + compile_assert(ctx, decl->Declaration.Semantic); + + DBG("decl out[%d] -> r%d", name, decl->Range.First); + + if (ctx->type == TGSI_PROCESSOR_VERTEX) { + switch (name) { + case TGSI_SEMANTIC_POSITION: + so->writes_pos = true; + break; + case TGSI_SEMANTIC_PSIZE: + so->writes_psize = true; + break; + case TGSI_SEMANTIC_COLOR: + case TGSI_SEMANTIC_BCOLOR: + case TGSI_SEMANTIC_GENERIC: + case TGSI_SEMANTIC_FOG: + case TGSI_SEMANTIC_TEXCOORD: + 
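+			/* these all fall through to break: they need no special
+			 * flags and simply become linked varyings.  E.g. a VS
+			 * writing POSITION plus one GENERIC output (hypothetical)
+			 * ends up with:
+			 *
+			 *    so->writes_pos = true;
+			 *    so->outputs[0].semantic = ir3_semantic_name(POSITION, 0)
+			 *    so->outputs[1].semantic = ir3_semantic_name(GENERIC, 0)
+			 *
+			 * and the GENERIC slot is matched to the FS input with the
+			 * same semantic at link time.
+			 */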
break; + default: + compile_error(ctx, "unknown VS semantic name: %s\n", + tgsi_semantic_names[name]); + } + } else { + switch (name) { + case TGSI_SEMANTIC_POSITION: + comp = 2; /* tgsi will write to .z component */ + so->writes_pos = true; + break; + case TGSI_SEMANTIC_COLOR: + break; + default: + compile_error(ctx, "unknown FS semantic name: %s\n", + tgsi_semantic_names[name]); + } + } + + for (i = decl->Range.First; i <= decl->Range.Last; i++) { + unsigned n = so->outputs_count++; + unsigned ncomp, j; + + ncomp = 4; + + compile_assert(ctx, n < ARRAY_SIZE(so->outputs)); + + so->outputs[n].semantic = decl_semantic(&decl->Semantic); + so->outputs[n].regid = regid(i, comp); + + /* avoid undefined outputs, stick a dummy mov from imm{0.0}, + * which if the output is actually assigned will be over- + * written + */ + for (j = 0; j < ncomp; j++) + ctx->block->outputs[(i * 4) + j] = create_immed(ctx, 0.0); + } +} + +/* from TGSI perspective, we actually have inputs. But most of the "inputs" + * for a fragment shader are just bary.f instructions. The *actual* inputs + * from the hw perspective are the frag_pos and optionally frag_coord and + * frag_face. + */ +static void +fixup_frag_inputs(struct ir3_compile_context *ctx) +{ + struct ir3_shader_variant *so = ctx->so; + struct ir3_block *block = ctx->block; + struct ir3_instruction **inputs; + struct ir3_instruction *instr; + int n, regid = 0; + + block->ninputs = 0; + + n = 4; /* always have frag_pos */ + n += COND(so->frag_face, 4); + n += COND(so->frag_coord, 4); + + inputs = ir3_alloc(ctx->ir, n * (sizeof(struct ir3_instruction *))); + + if (so->frag_face) { + /* this ultimately gets assigned to hr0.x so doesn't conflict + * with frag_coord/frag_pos.. + */ + inputs[block->ninputs++] = ctx->frag_face; + ctx->frag_face->regs[0]->num = 0; + + /* remaining channels not used, but let's avoid confusing + * other parts that expect inputs to come in groups of vec4 + */ + inputs[block->ninputs++] = NULL; + inputs[block->ninputs++] = NULL; + inputs[block->ninputs++] = NULL; + } + + /* since we don't know where to set the regid for frag_coord, + * we have to use r0.x for it. 
But we don't want to *always* + * use r1.x for frag_pos as that could increase the register + * footprint on simple shaders: + */ + if (so->frag_coord) { + ctx->frag_coord[0]->regs[0]->num = regid++; + ctx->frag_coord[1]->regs[0]->num = regid++; + ctx->frag_coord[2]->regs[0]->num = regid++; + ctx->frag_coord[3]->regs[0]->num = regid++; + + inputs[block->ninputs++] = ctx->frag_coord[0]; + inputs[block->ninputs++] = ctx->frag_coord[1]; + inputs[block->ninputs++] = ctx->frag_coord[2]; + inputs[block->ninputs++] = ctx->frag_coord[3]; + } + + /* we always have frag_pos: */ + so->pos_regid = regid; + + /* r0.x */ + instr = create_input(block, NULL, block->ninputs); + instr->regs[0]->num = regid++; + inputs[block->ninputs++] = instr; + ctx->frag_pos->regs[1]->instr = instr; + + /* r0.y */ + instr = create_input(block, NULL, block->ninputs); + instr->regs[0]->num = regid++; + inputs[block->ninputs++] = instr; + ctx->frag_pos->regs[2]->instr = instr; + + block->inputs = inputs; +} + +static void +compile_instructions(struct ir3_compile_context *ctx) +{ + push_block(ctx); + + /* for fragment shader, we have a single input register (usually + * r0.xy) which is used as the base for bary.f varying fetch instrs: + */ + if (ctx->type == TGSI_PROCESSOR_FRAGMENT) { + struct ir3_instruction *instr; + instr = ir3_instr_create(ctx->block, -1, OPC_META_FI); + ir3_reg_create(instr, 0, 0); + ir3_reg_create(instr, 0, IR3_REG_SSA); /* r0.x */ + ir3_reg_create(instr, 0, IR3_REG_SSA); /* r0.y */ + ctx->frag_pos = instr; + } + + while (!tgsi_parse_end_of_tokens(&ctx->parser)) { + tgsi_parse_token(&ctx->parser); + + switch (ctx->parser.FullToken.Token.Type) { + case TGSI_TOKEN_TYPE_DECLARATION: { + struct tgsi_full_declaration *decl = + &ctx->parser.FullToken.FullDeclaration; + if (decl->Declaration.File == TGSI_FILE_OUTPUT) { + decl_out(ctx, decl); + } else if (decl->Declaration.File == TGSI_FILE_INPUT) { + decl_in(ctx, decl); + } + break; + } + case TGSI_TOKEN_TYPE_IMMEDIATE: { + /* TODO: if we know the immediate is small enough, and only + * used with instructions that can embed an immediate, we + * can skip this: + */ + struct tgsi_full_immediate *imm = + &ctx->parser.FullToken.FullImmediate; + unsigned n = ctx->so->immediates_count++; + compile_assert(ctx, n < ARRAY_SIZE(ctx->so->immediates)); + memcpy(ctx->so->immediates[n].val, imm->u, 16); + break; + } + case TGSI_TOKEN_TYPE_INSTRUCTION: { + struct tgsi_full_instruction *inst = + &ctx->parser.FullToken.FullInstruction; + unsigned opc = inst->Instruction.Opcode; + const struct instr_translater *t = &translaters[opc]; + + if (t->fxn) { + t->fxn(t, ctx, inst); + ctx->num_internal_temps = 0; + } else { + compile_error(ctx, "unknown TGSI opc: %s\n", + tgsi_get_opcode_name(opc)); + } + + switch (inst->Instruction.Saturate) { + case TGSI_SAT_ZERO_ONE: + create_clamp_imm(ctx, &inst->Dst[0].Register, + fui(0.0), fui(1.0)); + break; + case TGSI_SAT_MINUS_PLUS_ONE: + create_clamp_imm(ctx, &inst->Dst[0].Register, + fui(-1.0), fui(1.0)); + break; + } + + instr_finish(ctx); + + break; + } + default: + break; + } + } +} + +static void +compile_dump(struct ir3_compile_context *ctx) +{ + const char *name = (ctx->so->type == SHADER_VERTEX) ? 
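+	/* e.g. for a fragment shader, successive calls during one compile
+	 * dump frag-0000.dot, frag-0001.dot, ... (names hypothetical, the
+	 * counter is shared), which graphviz can render:
+	 *
+	 *    dot -Tpng frag-0000.dot -o frag-0000.png
+	 */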
"vert" : "frag"; + static unsigned n = 0; + char fname[16]; + FILE *f; + snprintf(fname, sizeof(fname), "%s-%04u.dot", name, n++); + f = fopen(fname, "w"); + if (!f) + return; + ir3_block_depth(ctx->block); + ir3_dump(ctx->ir, name, ctx->block, f); + fclose(f); +} + +int +ir3_compile_shader(struct ir3_shader_variant *so, + const struct tgsi_token *tokens, struct ir3_shader_key key) +{ + struct ir3_compile_context ctx; + struct ir3_block *block; + struct ir3_instruction **inputs; + unsigned i, j, actual_in; + int ret = 0; + + assert(!so->ir); + + so->ir = ir3_create(); + + assert(so->ir); + + if (compile_init(&ctx, so, tokens) != TGSI_PARSE_OK) { + ret = -1; + goto out; + } + + compile_instructions(&ctx); + + block = ctx.block; + + /* keep track of the inputs from TGSI perspective.. */ + inputs = block->inputs; + + /* but fixup actual inputs for frag shader: */ + if (ctx.type == TGSI_PROCESSOR_FRAGMENT) + fixup_frag_inputs(&ctx); + + /* at this point, for binning pass, throw away unneeded outputs: */ + if (key.binning_pass) { + for (i = 0, j = 0; i < so->outputs_count; i++) { + unsigned name = sem2name(so->outputs[i].semantic); + unsigned idx = sem2name(so->outputs[i].semantic); + + /* throw away everything but first position/psize */ + if ((idx == 0) && ((name == TGSI_SEMANTIC_POSITION) || + (name == TGSI_SEMANTIC_PSIZE))) { + if (i != j) { + so->outputs[j] = so->outputs[i]; + block->outputs[(j*4)+0] = block->outputs[(i*4)+0]; + block->outputs[(j*4)+1] = block->outputs[(i*4)+1]; + block->outputs[(j*4)+2] = block->outputs[(i*4)+2]; + block->outputs[(j*4)+3] = block->outputs[(i*4)+3]; + } + j++; + } + } + so->outputs_count = j; + block->noutputs = j * 4; + } + + /* at this point, we want the kill's in the outputs array too, + * so that they get scheduled (since they have no dst).. we've + * already ensured that the array is big enough in push_block(): + */ + if (ctx.type == TGSI_PROCESSOR_FRAGMENT) { + for (i = 0; i < ctx.kill_count; i++) + block->outputs[block->noutputs++] = ctx.kill[i]; + } + + if (fd_mesa_debug & FD_DBG_OPTDUMP) + compile_dump(&ctx); + + ret = ir3_block_flatten(block); + if (ret < 0) + goto out; + if ((ret > 0) && (fd_mesa_debug & FD_DBG_OPTDUMP)) + compile_dump(&ctx); + + ir3_block_cp(block); + + if (fd_mesa_debug & FD_DBG_OPTDUMP) + compile_dump(&ctx); + + ir3_block_depth(block); + + if (fd_mesa_debug & FD_DBG_OPTMSGS) { + printf("AFTER DEPTH:\n"); + ir3_dump_instr_list(block->head); + } + + ir3_block_sched(block); + + if (fd_mesa_debug & FD_DBG_OPTMSGS) { + printf("AFTER SCHED:\n"); + ir3_dump_instr_list(block->head); + } + + ret = ir3_block_ra(block, so->type, key.half_precision, + so->frag_coord, so->frag_face, &so->has_samp); + if (ret) + goto out; + + if (fd_mesa_debug & FD_DBG_OPTMSGS) { + printf("AFTER RA:\n"); + ir3_dump_instr_list(block->head); + } + + /* fixup input/outputs: */ + for (i = 0; i < so->outputs_count; i++) { + so->outputs[i].regid = block->outputs[i*4]->regs[0]->num; + /* preserve hack for depth output.. 
+		 * but what we give the hw is the scalar register:
+		 */
+		if ((ctx.type == TGSI_PROCESSOR_FRAGMENT) &&
+			(sem2name(so->outputs[i].semantic) == TGSI_SEMANTIC_POSITION))
+			so->outputs[i].regid += 2;
+	}
+	/* Note that some or all channels of an input may be unused: */
+	actual_in = 0;
+	for (i = 0; i < so->inputs_count; i++) {
+		unsigned j, regid = ~0, compmask = 0;
+		so->inputs[i].ncomp = 0;
+		for (j = 0; j < 4; j++) {
+			struct ir3_instruction *in = inputs[(i*4) + j];
+			if (in) {
+				compmask |= (1 << j);
+				regid = in->regs[0]->num - j;
+				actual_in++;
+				so->inputs[i].ncomp++;
+			}
+		}
+		so->inputs[i].regid = regid;
+		so->inputs[i].compmask = compmask;
+	}
+
+	/* fragment shader always gets full vec4's even if it doesn't
+	 * fetch all components, but for the vertex shader we need to
+	 * update with the actual number of components fetched, otherwise
+	 * things will hang due to a mismatch between VFD_DECODE's and
+	 * TOTALATTRTOVS
+	 */
+	if (so->type == SHADER_VERTEX)
+		so->total_in = actual_in;
+
+out:
+	if (ret) {
+		ir3_destroy(so->ir);
+		so->ir = NULL;
+	}
+	compile_free(&ctx);
+
+	return ret;
+}
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_compiler.h b/src/gallium/drivers/freedreno/ir3/ir3_compiler.h
new file mode 100644
index 00000000000..9b11b3d8abf
--- /dev/null
+++ b/src/gallium/drivers/freedreno/ir3/ir3_compiler.h
@@ -0,0 +1,42 @@
+/* -*- mode: C; c-file-style: "k&r"; tab-width 4; indent-tabs-mode: t; -*- */
+
+/*
+ * Copyright (C) 2013 Rob Clark <[email protected]>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ * + * Authors: + * Rob Clark <[email protected]> + */ + +#ifndef FD3_COMPILER_H_ +#define FD3_COMPILER_H_ + +#include "ir3_shader.h" + + +int ir3_compile_shader(struct ir3_shader_variant *so, + const struct tgsi_token *tokens, + struct ir3_shader_key key); +int ir3_compile_shader_old(struct ir3_shader_variant *so, + const struct tgsi_token *tokens, + struct ir3_shader_key key); + +#endif /* FD3_COMPILER_H_ */ diff --git a/src/gallium/drivers/freedreno/ir3/ir3_compiler_old.c b/src/gallium/drivers/freedreno/ir3/ir3_compiler_old.c new file mode 100644 index 00000000000..1e1ca7ad813 --- /dev/null +++ b/src/gallium/drivers/freedreno/ir3/ir3_compiler_old.c @@ -0,0 +1,1524 @@ +/* -*- mode: C; c-file-style: "k&r"; tab-width 4; indent-tabs-mode: t; -*- */ + +/* + * Copyright (C) 2013 Rob Clark <[email protected]> + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Authors: + * Rob Clark <[email protected]> + */ + +#include <stdarg.h> + +#include "pipe/p_state.h" +#include "util/u_string.h" +#include "util/u_memory.h" +#include "util/u_inlines.h" +#include "tgsi/tgsi_parse.h" +#include "tgsi/tgsi_ureg.h" +#include "tgsi/tgsi_info.h" +#include "tgsi/tgsi_strings.h" +#include "tgsi/tgsi_dump.h" +#include "tgsi/tgsi_scan.h" + +#include "freedreno_lowering.h" +#include "freedreno_util.h" + +#include "ir3_compiler.h" +#include "ir3_shader.h" + +#include "instr-a3xx.h" +#include "ir3.h" + + +struct ir3_compile_context { + const struct tgsi_token *tokens; + bool free_tokens; + struct ir3 *ir; + struct ir3_block *block; + struct ir3_shader_variant *so; + + struct tgsi_parse_context parser; + unsigned type; + + struct tgsi_shader_info info; + + /* last input dst (for setting (ei) flag): */ + struct ir3_register *last_input; + + /* last instruction with relative addressing: */ + struct ir3_instruction *last_rel; + + /* for calculating input/output positions/linkages: */ + unsigned next_inloc; + + unsigned num_internal_temps; + struct tgsi_src_register internal_temps[6]; + + /* track registers which need to synchronize w/ "complex alu" cat3 + * instruction pipeline: + */ + regmask_t needs_ss; + + /* track registers which need to synchronize with texture fetch + * pipeline: + */ + regmask_t needs_sy; + + /* inputs start at r0, temporaries start after last input, and + * outputs start after last temporary. 
+	 *
+	 * We could be more clever, because this is not a hw restriction,
+	 * but probably best just to implement an optimizing pass to
+	 * reduce the # of registers used and get rid of redundant mov's
+	 * (to output register).
+	 */
+	unsigned base_reg[TGSI_FILE_COUNT];
+
+	/* idx/slot for last compiler generated immediate */
+	unsigned immediate_idx;
+
+	/* stack of branch instructions that start (potentially nested)
+	 * branches, so that we can fix up the branch target on the
+	 * corresponding END instruction
+	 */
+	struct ir3_instruction *branch[16];
+	unsigned int branch_count;
+
+	/* used when dst is same as one of the src, to avoid overwriting a
+	 * src element before the remaining scalar instructions that make
+	 * up the vector operation
+	 */
+	struct tgsi_dst_register tmp_dst;
+	struct tgsi_src_register *tmp_src;
+};
+
+
+static void vectorize(struct ir3_compile_context *ctx,
+		struct ir3_instruction *instr, struct tgsi_dst_register *dst,
+		int nsrcs, ...);
+static void create_mov(struct ir3_compile_context *ctx,
+		struct tgsi_dst_register *dst, struct tgsi_src_register *src);
+
+static unsigned
+compile_init(struct ir3_compile_context *ctx, struct ir3_shader_variant *so,
+		const struct tgsi_token *tokens)
+{
+	unsigned ret, base = 0;
+	struct tgsi_shader_info *info = &ctx->info;
+	const struct fd_lowering_config lconfig = {
+			.color_two_side = so->key.color_two_side,
+			.lower_DST = true,
+			.lower_XPD = true,
+			.lower_SCS = true,
+			.lower_LRP = true,
+			.lower_FRC = true,
+			.lower_POW = true,
+			.lower_LIT = true,
+			.lower_EXP = true,
+			.lower_LOG = true,
+			.lower_DP4 = true,
+			.lower_DP3 = true,
+			.lower_DPH = true,
+			.lower_DP2 = true,
+			.lower_DP2A = true,
+	};
+
+	ctx->tokens = fd_transform_lowering(&lconfig, tokens, &ctx->info);
+	ctx->free_tokens = !!ctx->tokens;
+	if (!ctx->tokens) {
+		/* no lowering */
+		ctx->tokens = tokens;
+	}
+	ctx->ir = so->ir;
+	ctx->block = ir3_block_create(ctx->ir, 0, 0, 0);
+	ctx->so = so;
+	ctx->last_input = NULL;
+	ctx->last_rel = NULL;
+	ctx->next_inloc = 8;
+	ctx->num_internal_temps = 0;
+	ctx->branch_count = 0;
+
+	regmask_init(&ctx->needs_ss);
+	regmask_init(&ctx->needs_sy);
+	memset(ctx->base_reg, 0, sizeof(ctx->base_reg));
+
+	/* Immediates go after constants: */
+	ctx->base_reg[TGSI_FILE_CONSTANT] = 0;
+	ctx->base_reg[TGSI_FILE_IMMEDIATE] =
+			info->file_max[TGSI_FILE_CONSTANT] + 1;
+
+	/* if full precision and fragment shader, don't clobber
+	 * r0.x w/ bary fetch:
+	 */
+	if ((so->type == SHADER_FRAGMENT) && !so->key.half_precision)
+		base = 1;
+
+	/* Temporaries after outputs after inputs: */
+	ctx->base_reg[TGSI_FILE_INPUT]     = base;
+	ctx->base_reg[TGSI_FILE_OUTPUT]    = base +
+			info->file_max[TGSI_FILE_INPUT] + 1;
+	ctx->base_reg[TGSI_FILE_TEMPORARY] = base +
+			info->file_max[TGSI_FILE_INPUT] + 1 +
+			info->file_max[TGSI_FILE_OUTPUT] + 1;
+
+	so->first_immediate = ctx->base_reg[TGSI_FILE_IMMEDIATE];
+	ctx->immediate_idx = 4 * (ctx->info.file_max[TGSI_FILE_IMMEDIATE] + 1);
+
+	ret = tgsi_parse_init(&ctx->parser, ctx->tokens);
+	if (ret != TGSI_PARSE_OK)
+		return ret;
+
+	ctx->type = ctx->parser.FullHeader.Processor.Processor;
+
+	return ret;
+}
+
+static void
+compile_error(struct ir3_compile_context *ctx, const char *format, ...)
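+/* (a worked example of the base_reg[] layout compile_init sets up,
+ * with hypothetical counts: a full-precision fragment shader with 2
+ * inputs, 1 output, 3 temporaries and 4 constants gets
+ *
+ *    base_reg[TGSI_FILE_CONSTANT]  = 0  ->  consts  c0..c3
+ *    base_reg[TGSI_FILE_IMMEDIATE] = 4  ->  compiler immediates from c4
+ *    base_reg[TGSI_FILE_INPUT]     = 1  ->  inputs  r1..r2  (r0.x is
+ *                                           reserved for the bary src)
+ *    base_reg[TGSI_FILE_OUTPUT]    = 3  ->  outputs r3
+ *    base_reg[TGSI_FILE_TEMPORARY] = 4  ->  temps   r4..r6
+ *
+ * so first_immediate lands at c4.)
+ */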
+{ + va_list ap; + va_start(ap, format); + _debug_vprintf(format, ap); + va_end(ap); + tgsi_dump(ctx->tokens, 0); + debug_assert(0); +} + +#define compile_assert(ctx, cond) do { \ + if (!(cond)) compile_error((ctx), "failed assert: "#cond"\n"); \ + } while (0) + +static void +compile_free(struct ir3_compile_context *ctx) +{ + if (ctx->free_tokens) + free((void *)ctx->tokens); + tgsi_parse_free(&ctx->parser); +} + +struct instr_translater { + void (*fxn)(const struct instr_translater *t, + struct ir3_compile_context *ctx, + struct tgsi_full_instruction *inst); + unsigned tgsi_opc; + opc_t opc; + opc_t hopc; /* opc to use for half_precision mode, if different */ + unsigned arg; +}; + +static void +handle_last_rel(struct ir3_compile_context *ctx) +{ + if (ctx->last_rel) { + ctx->last_rel->flags |= IR3_INSTR_UL; + ctx->last_rel = NULL; + } +} + +static struct ir3_instruction * +instr_create(struct ir3_compile_context *ctx, int category, opc_t opc) +{ + return ir3_instr_create(ctx->block, category, opc); +} + +static void +add_nop(struct ir3_compile_context *ctx, unsigned count) +{ + while (count-- > 0) + instr_create(ctx, 0, OPC_NOP); +} + +static unsigned +src_flags(struct ir3_compile_context *ctx, struct ir3_register *reg) +{ + unsigned flags = 0; + + if (reg->flags & (IR3_REG_CONST | IR3_REG_IMMED)) + return flags; + + if (regmask_get(&ctx->needs_ss, reg)) { + flags |= IR3_INSTR_SS; + regmask_init(&ctx->needs_ss); + } + + if (regmask_get(&ctx->needs_sy, reg)) { + flags |= IR3_INSTR_SY; + regmask_init(&ctx->needs_sy); + } + + return flags; +} + +static struct ir3_register * +add_dst_reg(struct ir3_compile_context *ctx, struct ir3_instruction *instr, + const struct tgsi_dst_register *dst, unsigned chan) +{ + unsigned flags = 0, num = 0; + struct ir3_register *reg; + + switch (dst->File) { + case TGSI_FILE_OUTPUT: + case TGSI_FILE_TEMPORARY: + num = dst->Index + ctx->base_reg[dst->File]; + break; + case TGSI_FILE_ADDRESS: + num = REG_A0; + break; + default: + compile_error(ctx, "unsupported dst register file: %s\n", + tgsi_file_name(dst->File)); + break; + } + + if (dst->Indirect) + flags |= IR3_REG_RELATIV; + if (ctx->so->key.half_precision) + flags |= IR3_REG_HALF; + + reg = ir3_reg_create(instr, regid(num, chan), flags); + + if (dst->Indirect) + ctx->last_rel = instr; + + return reg; +} + +static struct ir3_register * +add_src_reg(struct ir3_compile_context *ctx, struct ir3_instruction *instr, + const struct tgsi_src_register *src, unsigned chan) +{ + unsigned flags = 0, num = 0; + struct ir3_register *reg; + + /* TODO we need to use a mov to temp for const >= 64.. or maybe + * we could use relative addressing.. + */ + compile_assert(ctx, src->Index < 64); + + switch (src->File) { + case TGSI_FILE_IMMEDIATE: + /* TODO if possible, use actual immediate instead of const.. but + * TGSI has vec4 immediates, we can only embed scalar (of limited + * size, depending on instruction..) 
+ */ + case TGSI_FILE_CONSTANT: + flags |= IR3_REG_CONST; + num = src->Index + ctx->base_reg[src->File]; + break; + case TGSI_FILE_OUTPUT: + /* NOTE: we should only end up w/ OUTPUT file for things like + * clamp()'ing saturated dst instructions + */ + case TGSI_FILE_INPUT: + case TGSI_FILE_TEMPORARY: + num = src->Index + ctx->base_reg[src->File]; + break; + default: + compile_error(ctx, "unsupported src register file: %s\n", + tgsi_file_name(src->File)); + break; + } + + if (src->Absolute) + flags |= IR3_REG_ABS; + if (src->Negate) + flags |= IR3_REG_NEGATE; + if (src->Indirect) + flags |= IR3_REG_RELATIV; + if (ctx->so->key.half_precision) + flags |= IR3_REG_HALF; + + reg = ir3_reg_create(instr, regid(num, chan), flags); + + if (src->Indirect) + ctx->last_rel = instr; + + instr->flags |= src_flags(ctx, reg); + + return reg; +} + +static void +src_from_dst(struct tgsi_src_register *src, struct tgsi_dst_register *dst) +{ + src->File = dst->File; + src->Indirect = dst->Indirect; + src->Dimension = dst->Dimension; + src->Index = dst->Index; + src->Absolute = 0; + src->Negate = 0; + src->SwizzleX = TGSI_SWIZZLE_X; + src->SwizzleY = TGSI_SWIZZLE_Y; + src->SwizzleZ = TGSI_SWIZZLE_Z; + src->SwizzleW = TGSI_SWIZZLE_W; +} + +/* Get internal-temp src/dst to use for a sequence of instructions + * generated by a single TGSI op. + */ +static struct tgsi_src_register * +get_internal_temp(struct ir3_compile_context *ctx, + struct tgsi_dst_register *tmp_dst) +{ + struct tgsi_src_register *tmp_src; + int n; + + tmp_dst->File = TGSI_FILE_TEMPORARY; + tmp_dst->WriteMask = TGSI_WRITEMASK_XYZW; + tmp_dst->Indirect = 0; + tmp_dst->Dimension = 0; + + /* assign next temporary: */ + n = ctx->num_internal_temps++; + compile_assert(ctx, n < ARRAY_SIZE(ctx->internal_temps)); + tmp_src = &ctx->internal_temps[n]; + + tmp_dst->Index = ctx->info.file_max[TGSI_FILE_TEMPORARY] + n + 1; + + src_from_dst(tmp_src, tmp_dst); + + return tmp_src; +} + +/* Get internal half-precision temp src/dst to use for a sequence of + * instructions generated by a single TGSI op. + */ +static struct tgsi_src_register * +get_internal_temp_hr(struct ir3_compile_context *ctx, + struct tgsi_dst_register *tmp_dst) +{ + struct tgsi_src_register *tmp_src; + int n; + + if (ctx->so->key.half_precision) + return get_internal_temp(ctx, tmp_dst); + + tmp_dst->File = TGSI_FILE_TEMPORARY; + tmp_dst->WriteMask = TGSI_WRITEMASK_XYZW; + tmp_dst->Indirect = 0; + tmp_dst->Dimension = 0; + + /* assign next temporary: */ + n = ctx->num_internal_temps++; + compile_assert(ctx, n < ARRAY_SIZE(ctx->internal_temps)); + tmp_src = &ctx->internal_temps[n]; + + /* just use hr0 because no one else should be using half- + * precision regs: + */ + tmp_dst->Index = 0; + + src_from_dst(tmp_src, tmp_dst); + + return tmp_src; +} + +static inline bool +is_const(struct tgsi_src_register *src) +{ + return (src->File == TGSI_FILE_CONSTANT) || + (src->File == TGSI_FILE_IMMEDIATE); +} + +static inline bool +is_relative(struct tgsi_src_register *src) +{ + return src->Indirect; +} + +static inline bool +is_rel_or_const(struct tgsi_src_register *src) +{ + return is_relative(src) || is_const(src); +} + +static type_t +get_ftype(struct ir3_compile_context *ctx) +{ + return ctx->so->key.half_precision ? TYPE_F16 : TYPE_F32; +} + +static type_t +get_utype(struct ir3_compile_context *ctx) +{ + return ctx->so->key.half_precision ? 
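+			/* with key.half_precision the whole shader runs in half
+			 * regs, so e.g. (hypothetically) mul.f hr0.x, hr1.x, hr2.x
+			 * instead of mul.f r0.x, r1.x, r2.x, with 16-bit mov/cov
+			 * types throughout: */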
TYPE_U16 : TYPE_U32; +} + +static unsigned +src_swiz(struct tgsi_src_register *src, int chan) +{ + switch (chan) { + case 0: return src->SwizzleX; + case 1: return src->SwizzleY; + case 2: return src->SwizzleZ; + case 3: return src->SwizzleW; + } + assert(0); + return 0; +} + +/* for instructions that cannot take a const register as src, if needed + * generate a move to temporary gpr: + */ +static struct tgsi_src_register * +get_unconst(struct ir3_compile_context *ctx, struct tgsi_src_register *src) +{ + struct tgsi_dst_register tmp_dst; + struct tgsi_src_register *tmp_src; + + compile_assert(ctx, is_rel_or_const(src)); + + tmp_src = get_internal_temp(ctx, &tmp_dst); + + create_mov(ctx, &tmp_dst, src); + + return tmp_src; +} + +static void +get_immediate(struct ir3_compile_context *ctx, + struct tgsi_src_register *reg, uint32_t val) +{ + unsigned neg, swiz, idx, i; + /* actually maps 1:1 currently.. not sure if that is safe to rely on: */ + static const unsigned swiz2tgsi[] = { + TGSI_SWIZZLE_X, TGSI_SWIZZLE_Y, TGSI_SWIZZLE_Z, TGSI_SWIZZLE_W, + }; + + for (i = 0; i < ctx->immediate_idx; i++) { + swiz = i % 4; + idx = i / 4; + + if (ctx->so->immediates[idx].val[swiz] == val) { + neg = 0; + break; + } + + if (ctx->so->immediates[idx].val[swiz] == -val) { + neg = 1; + break; + } + } + + if (i == ctx->immediate_idx) { + /* need to generate a new immediate: */ + swiz = i % 4; + idx = i / 4; + neg = 0; + ctx->so->immediates[idx].val[swiz] = val; + ctx->so->immediates_count = idx + 1; + ctx->immediate_idx++; + } + + reg->File = TGSI_FILE_IMMEDIATE; + reg->Indirect = 0; + reg->Dimension = 0; + reg->Index = idx; + reg->Absolute = 0; + reg->Negate = neg; + reg->SwizzleX = swiz2tgsi[swiz]; + reg->SwizzleY = swiz2tgsi[swiz]; + reg->SwizzleZ = swiz2tgsi[swiz]; + reg->SwizzleW = swiz2tgsi[swiz]; +} + +static void +create_mov(struct ir3_compile_context *ctx, struct tgsi_dst_register *dst, + struct tgsi_src_register *src) +{ + type_t type_mov = get_ftype(ctx); + unsigned i; + + for (i = 0; i < 4; i++) { + /* move to destination: */ + if (dst->WriteMask & (1 << i)) { + struct ir3_instruction *instr; + + if (src->Absolute || src->Negate) { + /* can't have abs or neg on a mov instr, so use + * absneg.f instead to handle these cases: + */ + instr = instr_create(ctx, 2, OPC_ABSNEG_F); + } else { + instr = instr_create(ctx, 1, 0); + instr->cat1.src_type = type_mov; + instr->cat1.dst_type = type_mov; + } + + add_dst_reg(ctx, instr, dst, i); + add_src_reg(ctx, instr, src, src_swiz(src, i)); + } else { + add_nop(ctx, 1); + } + } +} + +static void +create_clamp(struct ir3_compile_context *ctx, + struct tgsi_dst_register *dst, struct tgsi_src_register *val, + struct tgsi_src_register *minval, struct tgsi_src_register *maxval) +{ + struct ir3_instruction *instr; + + instr = instr_create(ctx, 2, OPC_MAX_F); + vectorize(ctx, instr, dst, 2, val, 0, minval, 0); + + instr = instr_create(ctx, 2, OPC_MIN_F); + vectorize(ctx, instr, dst, 2, val, 0, maxval, 0); +} + +static void +create_clamp_imm(struct ir3_compile_context *ctx, + struct tgsi_dst_register *dst, + uint32_t minval, uint32_t maxval) +{ + struct tgsi_src_register minconst, maxconst; + struct tgsi_src_register src; + + src_from_dst(&src, dst); + + get_immediate(ctx, &minconst, minval); + get_immediate(ctx, &maxconst, maxval); + + create_clamp(ctx, dst, &src, &minconst, &maxconst); +} + +static struct tgsi_dst_register * +get_dst(struct ir3_compile_context *ctx, struct tgsi_full_instruction *inst) +{ + struct tgsi_dst_register *dst = &inst->Dst[0].Register; + 
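+	/* a sketch of the hazard handled here (registers hypothetical):
+	 * for something like
+	 *
+	 *    MOV TEMP[0].xy, TEMP[0].yx
+	 *
+	 * the per-component expansion would clobber TEMP[0].x before the
+	 * second scalar mov reads it, so writes are redirected to an
+	 * internal temp and copied back afterwards in put_dst():
+	 */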
unsigned i; + for (i = 0; i < inst->Instruction.NumSrcRegs; i++) { + struct tgsi_src_register *src = &inst->Src[i].Register; + if ((src->File == dst->File) && (src->Index == dst->Index)) { + if ((dst->WriteMask == TGSI_WRITEMASK_XYZW) && + (src->SwizzleX == TGSI_SWIZZLE_X) && + (src->SwizzleY == TGSI_SWIZZLE_Y) && + (src->SwizzleZ == TGSI_SWIZZLE_Z) && + (src->SwizzleW == TGSI_SWIZZLE_W)) + continue; + ctx->tmp_src = get_internal_temp(ctx, &ctx->tmp_dst); + ctx->tmp_dst.WriteMask = dst->WriteMask; + dst = &ctx->tmp_dst; + break; + } + } + return dst; +} + +static void +put_dst(struct ir3_compile_context *ctx, struct tgsi_full_instruction *inst, + struct tgsi_dst_register *dst) +{ + /* if necessary, add mov back into original dst: */ + if (dst != &inst->Dst[0].Register) { + create_mov(ctx, &inst->Dst[0].Register, ctx->tmp_src); + } +} + +/* helper to generate the necessary repeat and/or additional instructions + * to turn a scalar instruction into a vector operation: + */ +static void +vectorize(struct ir3_compile_context *ctx, struct ir3_instruction *instr, + struct tgsi_dst_register *dst, int nsrcs, ...) +{ + va_list ap; + int i, j, n = 0; + bool indirect = dst->Indirect; + + add_dst_reg(ctx, instr, dst, TGSI_SWIZZLE_X); + + va_start(ap, nsrcs); + for (j = 0; j < nsrcs; j++) { + struct tgsi_src_register *src = + va_arg(ap, struct tgsi_src_register *); + unsigned flags = va_arg(ap, unsigned); + struct ir3_register *reg; + if (flags & IR3_REG_IMMED) { + reg = ir3_reg_create(instr, 0, IR3_REG_IMMED); + /* this is an ugly cast.. should have put flags first! */ + reg->iim_val = *(int *)&src; + } else { + reg = add_src_reg(ctx, instr, src, TGSI_SWIZZLE_X); + indirect |= src->Indirect; + } + reg->flags |= flags & ~IR3_REG_NEGATE; + if (flags & IR3_REG_NEGATE) + reg->flags ^= IR3_REG_NEGATE; + } + va_end(ap); + + for (i = 0; i < 4; i++) { + if (dst->WriteMask & (1 << i)) { + struct ir3_instruction *cur; + + if (n++ == 0) { + cur = instr; + } else { + cur = ir3_instr_clone(instr); + cur->flags &= ~(IR3_INSTR_SY | IR3_INSTR_SS | IR3_INSTR_JP); + } + + /* fix-up dst register component: */ + cur->regs[0]->num = regid(cur->regs[0]->num >> 2, i); + + /* fix-up src register component: */ + va_start(ap, nsrcs); + for (j = 0; j < nsrcs; j++) { + struct tgsi_src_register *src = + va_arg(ap, struct tgsi_src_register *); + unsigned flags = va_arg(ap, unsigned); + if (!(flags & IR3_REG_IMMED)) { + cur->regs[j+1]->num = + regid(cur->regs[j+1]->num >> 2, + src_swiz(src, i)); + cur->flags |= src_flags(ctx, cur->regs[j+1]); + } + } + va_end(ap); + + if (indirect) + ctx->last_rel = cur; + } + } + + /* pad w/ nop's.. at least until we are clever enough to + * figure out if we really need to.. + */ + add_nop(ctx, 4 - n); +} + +/* + * Handlers for TGSI instructions which do not have a 1:1 mapping to + * native instructions: + */ + +static void +trans_clamp(const struct instr_translater *t, + struct ir3_compile_context *ctx, + struct tgsi_full_instruction *inst) +{ + struct tgsi_dst_register *dst = get_dst(ctx, inst); + struct tgsi_src_register *src0 = &inst->Src[0].Register; + struct tgsi_src_register *src1 = &inst->Src[1].Register; + struct tgsi_src_register *src2 = &inst->Src[2].Register; + + create_clamp(ctx, dst, src0, src1, src2); + + put_dst(ctx, inst, dst); +} + +/* ARL(x) = x, but mova from hrN.x to a0.. 
*/ +static void +trans_arl(const struct instr_translater *t, + struct ir3_compile_context *ctx, + struct tgsi_full_instruction *inst) +{ + struct ir3_instruction *instr; + struct tgsi_dst_register tmp_dst; + struct tgsi_src_register *tmp_src; + struct tgsi_dst_register *dst = &inst->Dst[0].Register; + struct tgsi_src_register *src = &inst->Src[0].Register; + unsigned chan = src->SwizzleX; + compile_assert(ctx, dst->File == TGSI_FILE_ADDRESS); + + handle_last_rel(ctx); + + tmp_src = get_internal_temp_hr(ctx, &tmp_dst); + + /* cov.{f32,f16}s16 Rtmp, Rsrc */ + instr = instr_create(ctx, 1, 0); + instr->cat1.src_type = get_ftype(ctx); + instr->cat1.dst_type = TYPE_S16; + add_dst_reg(ctx, instr, &tmp_dst, chan)->flags |= IR3_REG_HALF; + add_src_reg(ctx, instr, src, chan); + + add_nop(ctx, 3); + + /* shl.b Rtmp, Rtmp, 2 */ + instr = instr_create(ctx, 2, OPC_SHL_B); + add_dst_reg(ctx, instr, &tmp_dst, chan)->flags |= IR3_REG_HALF; + add_src_reg(ctx, instr, tmp_src, chan)->flags |= IR3_REG_HALF; + ir3_reg_create(instr, 0, IR3_REG_IMMED)->iim_val = 2; + + add_nop(ctx, 3); + + /* mova a0, Rtmp */ + instr = instr_create(ctx, 1, 0); + instr->cat1.src_type = TYPE_S16; + instr->cat1.dst_type = TYPE_S16; + add_dst_reg(ctx, instr, dst, 0)->flags |= IR3_REG_HALF; + add_src_reg(ctx, instr, tmp_src, chan)->flags |= IR3_REG_HALF; + + /* need to ensure 5 instr slots before a0 is used: */ + add_nop(ctx, 6); +} + +/* texture fetch/sample instructions: */ +static void +trans_samp(const struct instr_translater *t, + struct ir3_compile_context *ctx, + struct tgsi_full_instruction *inst) +{ + struct ir3_register *r; + struct ir3_instruction *instr; + struct tgsi_src_register *coord = &inst->Src[0].Register; + struct tgsi_src_register *samp = &inst->Src[1].Register; + unsigned tex = inst->Texture.Texture; + int8_t *order; + unsigned i, flags = 0, src_wrmask; + bool needs_mov = false; + + switch (t->arg) { + case TGSI_OPCODE_TEX: + if (tex == TGSI_TEXTURE_2D) { + order = (int8_t[4]){ 0, 1, -1, -1 }; + src_wrmask = TGSI_WRITEMASK_XY; + } else { + order = (int8_t[4]){ 0, 1, 2, -1 }; + src_wrmask = TGSI_WRITEMASK_XYZ; + } + break; + case TGSI_OPCODE_TXP: + if (tex == TGSI_TEXTURE_2D) { + order = (int8_t[4]){ 0, 1, 3, -1 }; + src_wrmask = TGSI_WRITEMASK_XYZ; + } else { + order = (int8_t[4]){ 0, 1, 2, 3 }; + src_wrmask = TGSI_WRITEMASK_XYZW; + } + flags |= IR3_INSTR_P; + break; + default: + compile_assert(ctx, 0); + break; + } + + if ((tex == TGSI_TEXTURE_3D) || (tex == TGSI_TEXTURE_CUBE)) { + add_nop(ctx, 3); + flags |= IR3_INSTR_3D; + } + + /* cat5 instruction cannot seem to handle const or relative: */ + if (is_rel_or_const(coord)) + needs_mov = true; + + /* The texture sample instructions need to coord in successive + * registers/components (ie. src.xy but not src.yx). And TXP + * needs the .w component in .z for 2D.. 
so in some cases we + * might need to emit some mov instructions to shuffle things + * around: + */ + for (i = 1; (i < 4) && (order[i] >= 0) && !needs_mov; i++) + if (src_swiz(coord, i) != (src_swiz(coord, 0) + order[i])) + needs_mov = true; + + if (needs_mov) { + struct tgsi_dst_register tmp_dst; + struct tgsi_src_register *tmp_src; + unsigned j; + + type_t type_mov = get_ftype(ctx); + + /* need to move things around: */ + tmp_src = get_internal_temp(ctx, &tmp_dst); + + for (j = 0; (j < 4) && (order[j] >= 0); j++) { + instr = instr_create(ctx, 1, 0); + instr->cat1.src_type = type_mov; + instr->cat1.dst_type = type_mov; + add_dst_reg(ctx, instr, &tmp_dst, j); + add_src_reg(ctx, instr, coord, + src_swiz(coord, order[j])); + } + + coord = tmp_src; + + add_nop(ctx, 4 - j); + } + + instr = instr_create(ctx, 5, t->opc); + instr->cat5.type = get_ftype(ctx); + instr->cat5.samp = samp->Index; + instr->cat5.tex = samp->Index; + instr->flags |= flags; + + r = add_dst_reg(ctx, instr, &inst->Dst[0].Register, 0); + r->wrmask = inst->Dst[0].Register.WriteMask; + + add_src_reg(ctx, instr, coord, coord->SwizzleX)->wrmask = src_wrmask; + + /* after add_src_reg() so we don't set (sy) on sam instr itself! */ + regmask_set(&ctx->needs_sy, r); +} + +/* + * SEQ(a,b) = (a == b) ? 1.0 : 0.0 + * cmps.f.eq tmp0, b, a + * cov.u16f16 dst, tmp0 + * + * SNE(a,b) = (a != b) ? 1.0 : 0.0 + * cmps.f.eq tmp0, b, a + * add.s tmp0, tmp0, -1 + * sel.f16 dst, {0.0}, tmp0, {1.0} + * + * SGE(a,b) = (a >= b) ? 1.0 : 0.0 + * cmps.f.ge tmp0, a, b + * cov.u16f16 dst, tmp0 + * + * SLE(a,b) = (a <= b) ? 1.0 : 0.0 + * cmps.f.ge tmp0, b, a + * cov.u16f16 dst, tmp0 + * + * SGT(a,b) = (a > b) ? 1.0 : 0.0 + * cmps.f.ge tmp0, b, a + * add.s tmp0, tmp0, -1 + * sel.f16 dst, {0.0}, tmp0, {1.0} + * + * SLT(a,b) = (a < b) ? 1.0 : 0.0 + * cmps.f.ge tmp0, a, b + * add.s tmp0, tmp0, -1 + * sel.f16 dst, {0.0}, tmp0, {1.0} + * + * CMP(a,b,c) = (a < 0.0) ? 
b : c + * cmps.f.ge tmp0, a, {0.0} + * add.s tmp0, tmp0, -1 + * sel.f16 dst, c, tmp0, b + */ +static void +trans_cmp(const struct instr_translater *t, + struct ir3_compile_context *ctx, + struct tgsi_full_instruction *inst) +{ + struct ir3_instruction *instr; + struct tgsi_dst_register tmp_dst; + struct tgsi_src_register *tmp_src; + struct tgsi_src_register constval0, constval1; + /* final instruction for CMP() uses orig src1 and src2: */ + struct tgsi_dst_register *dst = get_dst(ctx, inst); + struct tgsi_src_register *a0, *a1; + unsigned condition; + + tmp_src = get_internal_temp(ctx, &tmp_dst); + + switch (t->tgsi_opc) { + case TGSI_OPCODE_SEQ: + case TGSI_OPCODE_SNE: + a0 = &inst->Src[1].Register; /* b */ + a1 = &inst->Src[0].Register; /* a */ + condition = IR3_COND_EQ; + break; + case TGSI_OPCODE_SGE: + case TGSI_OPCODE_SLT: + a0 = &inst->Src[0].Register; /* a */ + a1 = &inst->Src[1].Register; /* b */ + condition = IR3_COND_GE; + break; + case TGSI_OPCODE_SLE: + case TGSI_OPCODE_SGT: + a0 = &inst->Src[1].Register; /* b */ + a1 = &inst->Src[0].Register; /* a */ + condition = IR3_COND_GE; + break; + case TGSI_OPCODE_CMP: + get_immediate(ctx, &constval0, fui(0.0)); + a0 = &inst->Src[0].Register; /* a */ + a1 = &constval0; /* {0.0} */ + condition = IR3_COND_GE; + break; + default: + compile_assert(ctx, 0); + return; + } + + if (is_const(a0) && is_const(a1)) + a0 = get_unconst(ctx, a0); + + /* cmps.f.ge tmp, a0, a1 */ + instr = instr_create(ctx, 2, OPC_CMPS_F); + instr->cat2.condition = condition; + vectorize(ctx, instr, &tmp_dst, 2, a0, 0, a1, 0); + + switch (t->tgsi_opc) { + case TGSI_OPCODE_SEQ: + case TGSI_OPCODE_SGE: + case TGSI_OPCODE_SLE: + /* cov.u16f16 dst, tmp0 */ + instr = instr_create(ctx, 1, 0); + instr->cat1.src_type = get_utype(ctx); + instr->cat1.dst_type = get_ftype(ctx); + vectorize(ctx, instr, dst, 1, tmp_src, 0); + break; + case TGSI_OPCODE_SNE: + case TGSI_OPCODE_SGT: + case TGSI_OPCODE_SLT: + case TGSI_OPCODE_CMP: + /* add.s tmp, tmp, -1 */ + instr = instr_create(ctx, 2, OPC_ADD_S); + vectorize(ctx, instr, &tmp_dst, 2, tmp_src, 0, -1, IR3_REG_IMMED); + + if (t->tgsi_opc == TGSI_OPCODE_CMP) { + /* sel.{f32,f16} dst, src2, tmp, src1 */ + instr = instr_create(ctx, 3, + ctx->so->key.half_precision ? OPC_SEL_F16 : OPC_SEL_F32); + vectorize(ctx, instr, dst, 3, + &inst->Src[2].Register, 0, + tmp_src, 0, + &inst->Src[1].Register, 0); + } else { + get_immediate(ctx, &constval0, fui(0.0)); + get_immediate(ctx, &constval1, fui(1.0)); + /* sel.{f32,f16} dst, {0.0}, tmp0, {1.0} */ + instr = instr_create(ctx, 3, + ctx->so->key.half_precision ? OPC_SEL_F16 : OPC_SEL_F32); + vectorize(ctx, instr, dst, 3, + &constval0, 0, tmp_src, 0, &constval1, 0); + } + + break; + } + + put_dst(ctx, inst, dst); +} + +/* + * Conditional / Flow control + */ + +static unsigned +find_instruction(struct ir3_compile_context *ctx, struct ir3_instruction *instr) +{ + unsigned i; + for (i = 0; i < ctx->ir->instrs_count; i++) + if (ctx->ir->instrs[i] == instr) + return i; + return ~0; +} + +static void +push_branch(struct ir3_compile_context *ctx, struct ir3_instruction *instr) +{ + ctx->branch[ctx->branch_count++] = instr; +} + +static void +pop_branch(struct ir3_compile_context *ctx) +{ + struct ir3_instruction *instr; + + /* if we were clever enough, we'd patch this up after the fact, + * and set (jp) flag on whatever the next instruction was, rather + * than inserting an extra nop.. 
+ */ + instr = instr_create(ctx, 0, OPC_NOP); + instr->flags |= IR3_INSTR_JP; + + /* pop the branch instruction from the stack and fix up branch target: */ + instr = ctx->branch[--ctx->branch_count]; + instr->cat0.immed = ctx->ir->instrs_count - find_instruction(ctx, instr) - 1; +} + +/* We probably don't really want to translate if/else/endif into branches.. + * the blob driver evaluates both legs of the if and then uses the sel + * instruction to pick which sides of the branch to "keep".. but figuring + * that out will take somewhat more compiler smarts. So hopefully branches + * don't kill performance too badly. + */ +static void +trans_if(const struct instr_translater *t, + struct ir3_compile_context *ctx, + struct tgsi_full_instruction *inst) +{ + struct ir3_instruction *instr; + struct tgsi_src_register *src = &inst->Src[0].Register; + struct tgsi_src_register constval; + + get_immediate(ctx, &constval, fui(0.0)); + + if (is_const(src)) + src = get_unconst(ctx, src); + + instr = instr_create(ctx, 2, OPC_CMPS_F); + ir3_reg_create(instr, regid(REG_P0, 0), 0); + add_src_reg(ctx, instr, src, src->SwizzleX); + add_src_reg(ctx, instr, &constval, constval.SwizzleX); + instr->cat2.condition = IR3_COND_EQ; + + instr = instr_create(ctx, 0, OPC_BR); + push_branch(ctx, instr); +} + +static void +trans_else(const struct instr_translater *t, + struct ir3_compile_context *ctx, + struct tgsi_full_instruction *inst) +{ + struct ir3_instruction *instr; + + /* for first half of if/else/endif, generate a jump past the else: */ + instr = instr_create(ctx, 0, OPC_JUMP); + + pop_branch(ctx); + push_branch(ctx, instr); +} + +static void +trans_endif(const struct instr_translater *t, + struct ir3_compile_context *ctx, + struct tgsi_full_instruction *inst) +{ + pop_branch(ctx); +} + +/* + * Handlers for TGSI instructions which do have 1:1 mapping to native + * instructions: + */ + +static void +instr_cat0(const struct instr_translater *t, + struct ir3_compile_context *ctx, + struct tgsi_full_instruction *inst) +{ + instr_create(ctx, 0, t->opc); +} + +static void +instr_cat1(const struct instr_translater *t, + struct ir3_compile_context *ctx, + struct tgsi_full_instruction *inst) +{ + struct tgsi_dst_register *dst = get_dst(ctx, inst); + struct tgsi_src_register *src = &inst->Src[0].Register; + + /* mov instructions can't handle a negate on src: */ + if (src->Negate) { + struct tgsi_src_register constval; + struct ir3_instruction *instr; + + /* since right now, we are using uniformly either TYPE_F16 or + * TYPE_F32, and we don't utilize the conversion possibilities + * of mov instructions, we can get away with substituting an + * add.f which can handle negate. Might need to revisit this + * in the future if we start supporting widening/narrowing or + * conversion to/from integer.. 
+ */ + instr = instr_create(ctx, 2, OPC_ADD_F); + get_immediate(ctx, &constval, fui(0.0)); + vectorize(ctx, instr, dst, 2, src, 0, &constval, 0); + } else { + create_mov(ctx, dst, src); + /* create_mov() generates vector sequence, so no vectorize() */ + } + put_dst(ctx, inst, dst); +} + +static void +instr_cat2(const struct instr_translater *t, + struct ir3_compile_context *ctx, + struct tgsi_full_instruction *inst) +{ + struct tgsi_dst_register *dst = get_dst(ctx, inst); + struct tgsi_src_register *src0 = &inst->Src[0].Register; + struct tgsi_src_register *src1 = &inst->Src[1].Register; + struct ir3_instruction *instr; + unsigned src0_flags = 0, src1_flags = 0; + + switch (t->tgsi_opc) { + case TGSI_OPCODE_ABS: + src0_flags = IR3_REG_ABS; + break; + case TGSI_OPCODE_SUB: + src1_flags = IR3_REG_NEGATE; + break; + } + + switch (t->opc) { + case OPC_ABSNEG_F: + case OPC_ABSNEG_S: + case OPC_CLZ_B: + case OPC_CLZ_S: + case OPC_SIGN_F: + case OPC_FLOOR_F: + case OPC_CEIL_F: + case OPC_RNDNE_F: + case OPC_RNDAZ_F: + case OPC_TRUNC_F: + case OPC_NOT_B: + case OPC_BFREV_B: + case OPC_SETRM: + case OPC_CBITS_B: + /* these only have one src reg */ + instr = instr_create(ctx, 2, t->opc); + vectorize(ctx, instr, dst, 1, src0, src0_flags); + break; + default: + if (is_const(src0) && is_const(src1)) + src0 = get_unconst(ctx, src0); + + instr = instr_create(ctx, 2, t->opc); + vectorize(ctx, instr, dst, 2, src0, src0_flags, + src1, src1_flags); + break; + } + + put_dst(ctx, inst, dst); +} + +static void +instr_cat3(const struct instr_translater *t, + struct ir3_compile_context *ctx, + struct tgsi_full_instruction *inst) +{ + struct tgsi_dst_register *dst = get_dst(ctx, inst); + struct tgsi_src_register *src0 = &inst->Src[0].Register; + struct tgsi_src_register *src1 = &inst->Src[1].Register; + struct ir3_instruction *instr; + + /* in particular, can't handle const for src1 for cat3.. + * for mad, we can swap first two src's if needed: + */ + if (is_rel_or_const(src1)) { + if (is_mad(t->opc) && !is_rel_or_const(src0)) { + struct tgsi_src_register *tmp; + tmp = src0; + src0 = src1; + src1 = tmp; + } else { + src1 = get_unconst(ctx, src1); + } + } + + instr = instr_create(ctx, 3, + ctx->so->key.half_precision ? t->hopc : t->opc); + vectorize(ctx, instr, dst, 3, src0, 0, src1, 0, + &inst->Src[2].Register, 0); + put_dst(ctx, inst, dst); +} + +static void +instr_cat4(const struct instr_translater *t, + struct ir3_compile_context *ctx, + struct tgsi_full_instruction *inst) +{ + struct tgsi_dst_register *dst = get_dst(ctx, inst); + struct tgsi_src_register *src = &inst->Src[0].Register; + struct ir3_instruction *instr; + unsigned i, n; + + /* seems like blob compiler avoids const as src.. */ + if (is_const(src)) + src = get_unconst(ctx, src); + + /* worst case: */ + add_nop(ctx, 6); + + /* we need to replicate into each component: */ + for (i = 0, n = 0; i < 4; i++) { + if (dst->WriteMask & (1 << i)) { + if (n++) + add_nop(ctx, 1); + instr = instr_create(ctx, 4, t->opc); + add_dst_reg(ctx, instr, dst, i); + add_src_reg(ctx, instr, src, src->SwizzleX); + } + } + + regmask_set(&ctx->needs_ss, instr->regs[0]); + put_dst(ctx, inst, dst); +} + +static const struct instr_translater translaters[TGSI_OPCODE_LAST] = { +#define INSTR(n, f, ...) 
\ + [TGSI_OPCODE_ ## n] = { .fxn = (f), .tgsi_opc = TGSI_OPCODE_ ## n, ##__VA_ARGS__ } + + INSTR(MOV, instr_cat1), + INSTR(RCP, instr_cat4, .opc = OPC_RCP), + INSTR(RSQ, instr_cat4, .opc = OPC_RSQ), + INSTR(SQRT, instr_cat4, .opc = OPC_SQRT), + INSTR(MUL, instr_cat2, .opc = OPC_MUL_F), + INSTR(ADD, instr_cat2, .opc = OPC_ADD_F), + INSTR(SUB, instr_cat2, .opc = OPC_ADD_F), + INSTR(MIN, instr_cat2, .opc = OPC_MIN_F), + INSTR(MAX, instr_cat2, .opc = OPC_MAX_F), + INSTR(MAD, instr_cat3, .opc = OPC_MAD_F32, .hopc = OPC_MAD_F16), + INSTR(TRUNC, instr_cat2, .opc = OPC_TRUNC_F), + INSTR(CLAMP, trans_clamp), + INSTR(FLR, instr_cat2, .opc = OPC_FLOOR_F), + INSTR(ROUND, instr_cat2, .opc = OPC_RNDNE_F), + INSTR(SSG, instr_cat2, .opc = OPC_SIGN_F), + INSTR(ARL, trans_arl), + INSTR(EX2, instr_cat4, .opc = OPC_EXP2), + INSTR(LG2, instr_cat4, .opc = OPC_LOG2), + INSTR(ABS, instr_cat2, .opc = OPC_ABSNEG_F), + INSTR(COS, instr_cat4, .opc = OPC_COS), + INSTR(SIN, instr_cat4, .opc = OPC_SIN), + INSTR(TEX, trans_samp, .opc = OPC_SAM, .arg = TGSI_OPCODE_TEX), + INSTR(TXP, trans_samp, .opc = OPC_SAM, .arg = TGSI_OPCODE_TXP), + INSTR(SGT, trans_cmp), + INSTR(SLT, trans_cmp), + INSTR(SGE, trans_cmp), + INSTR(SLE, trans_cmp), + INSTR(SNE, trans_cmp), + INSTR(SEQ, trans_cmp), + INSTR(CMP, trans_cmp), + INSTR(IF, trans_if), + INSTR(ELSE, trans_else), + INSTR(ENDIF, trans_endif), + INSTR(END, instr_cat0, .opc = OPC_END), + INSTR(KILL, instr_cat0, .opc = OPC_KILL), +}; + +static ir3_semantic +decl_semantic(const struct tgsi_declaration_semantic *sem) +{ + return ir3_semantic_name(sem->Name, sem->Index); +} + +static int +decl_in(struct ir3_compile_context *ctx, struct tgsi_full_declaration *decl) +{ + struct ir3_shader_variant *so = ctx->so; + unsigned base = ctx->base_reg[TGSI_FILE_INPUT]; + unsigned i, flags = 0; + int nop = 0; + + /* I don't think we should get frag shader input without + * semantic info? Otherwise how do inputs get linked to + * vert outputs? 
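+	 *
+	 * (presumably it is the semantic name+index pair from
+	 * decl_semantic() that gets matched up against the vert
+	 * shader outputs at link time..)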
+ */ + compile_assert(ctx, (ctx->type == TGSI_PROCESSOR_VERTEX) || + decl->Declaration.Semantic); + + if (ctx->so->key.half_precision) + flags |= IR3_REG_HALF; + + for (i = decl->Range.First; i <= decl->Range.Last; i++) { + unsigned n = so->inputs_count++; + unsigned r = regid(i + base, 0); + unsigned ncomp; + + /* TODO use ctx->info.input_usage_mask[decl->Range.n] to figure out ncomp: */ + ncomp = 4; + + DBG("decl in -> r%d", i + base); // XXX + + compile_assert(ctx, n < ARRAY_SIZE(so->inputs)); + + so->inputs[n].semantic = decl_semantic(&decl->Semantic); + so->inputs[n].compmask = (1 << ncomp) - 1; + so->inputs[n].ncomp = ncomp; + so->inputs[n].regid = r; + so->inputs[n].inloc = ctx->next_inloc; + so->inputs[n].bary = true; /* all that is supported */ + ctx->next_inloc += ncomp; + + so->total_in += ncomp; + + /* for frag shaders, we need to generate the corresponding bary instr: */ + if (ctx->type == TGSI_PROCESSOR_FRAGMENT) { + unsigned j; + + for (j = 0; j < ncomp; j++) { + struct ir3_instruction *instr; + struct ir3_register *dst; + + instr = instr_create(ctx, 2, OPC_BARY_F); + + /* dst register: */ + dst = ir3_reg_create(instr, r + j, flags); + ctx->last_input = dst; + + /* input position: */ + ir3_reg_create(instr, 0, IR3_REG_IMMED)->iim_val = + so->inputs[n].inloc + j - 8; + + /* input base (always r0.xy): */ + ir3_reg_create(instr, regid(0,0), 0)->wrmask = 0x3; + } + + nop = 6; + } + } + + return nop; +} + +static void +decl_out(struct ir3_compile_context *ctx, struct tgsi_full_declaration *decl) +{ + struct ir3_shader_variant *so = ctx->so; + unsigned base = ctx->base_reg[TGSI_FILE_OUTPUT]; + unsigned comp = 0; + unsigned name = decl->Semantic.Name; + unsigned i; + + compile_assert(ctx, decl->Declaration.Semantic); // TODO is this ever not true? 
+ + DBG("decl out[%d] -> r%d", name, decl->Range.First + base); // XXX + + if (ctx->type == TGSI_PROCESSOR_VERTEX) { + switch (name) { + case TGSI_SEMANTIC_POSITION: + so->writes_pos = true; + break; + case TGSI_SEMANTIC_PSIZE: + so->writes_psize = true; + break; + case TGSI_SEMANTIC_COLOR: + case TGSI_SEMANTIC_BCOLOR: + case TGSI_SEMANTIC_GENERIC: + case TGSI_SEMANTIC_FOG: + case TGSI_SEMANTIC_TEXCOORD: + break; + default: + compile_error(ctx, "unknown VS semantic name: %s\n", + tgsi_semantic_names[name]); + } + } else { + switch (name) { + case TGSI_SEMANTIC_POSITION: + comp = 2; /* tgsi will write to .z component */ + so->writes_pos = true; + break; + case TGSI_SEMANTIC_COLOR: + break; + default: + compile_error(ctx, "unknown FS semantic name: %s\n", + tgsi_semantic_names[name]); + } + } + + for (i = decl->Range.First; i <= decl->Range.Last; i++) { + unsigned n = so->outputs_count++; + compile_assert(ctx, n < ARRAY_SIZE(so->outputs)); + so->outputs[n].semantic = decl_semantic(&decl->Semantic); + so->outputs[n].regid = regid(i + base, comp); + } +} + +static void +decl_samp(struct ir3_compile_context *ctx, struct tgsi_full_declaration *decl) +{ + ctx->so->has_samp = true; +} + +static void +compile_instructions(struct ir3_compile_context *ctx) +{ + struct ir3 *ir = ctx->ir; + int nop = 0; + + while (!tgsi_parse_end_of_tokens(&ctx->parser)) { + tgsi_parse_token(&ctx->parser); + + switch (ctx->parser.FullToken.Token.Type) { + case TGSI_TOKEN_TYPE_DECLARATION: { + struct tgsi_full_declaration *decl = + &ctx->parser.FullToken.FullDeclaration; + if (decl->Declaration.File == TGSI_FILE_OUTPUT) { + decl_out(ctx, decl); + } else if (decl->Declaration.File == TGSI_FILE_INPUT) { + nop = decl_in(ctx, decl); + } else if (decl->Declaration.File == TGSI_FILE_SAMPLER) { + decl_samp(ctx, decl); + } + break; + } + case TGSI_TOKEN_TYPE_IMMEDIATE: { + /* TODO: if we know the immediate is small enough, and only + * used with instructions that can embed an immediate, we + * can skip this: + */ + struct tgsi_full_immediate *imm = + &ctx->parser.FullToken.FullImmediate; + unsigned n = ctx->so->immediates_count++; + memcpy(ctx->so->immediates[n].val, imm->u, 16); + break; + } + case TGSI_TOKEN_TYPE_INSTRUCTION: { + struct tgsi_full_instruction *inst = + &ctx->parser.FullToken.FullInstruction; + unsigned opc = inst->Instruction.Opcode; + const struct instr_translater *t = &translaters[opc]; + + add_nop(ctx, nop); + nop = 0; + + if (t->fxn) { + t->fxn(t, ctx, inst); + ctx->num_internal_temps = 0; + } else { + compile_error(ctx, "unknown TGSI opc: %s\n", + tgsi_get_opcode_name(opc)); + } + + switch (inst->Instruction.Saturate) { + case TGSI_SAT_ZERO_ONE: + create_clamp_imm(ctx, &inst->Dst[0].Register, + fui(0.0), fui(1.0)); + break; + case TGSI_SAT_MINUS_PLUS_ONE: + create_clamp_imm(ctx, &inst->Dst[0].Register, + fui(-1.0), fui(1.0)); + break; + } + + break; + } + default: + break; + } + } + + if (ir->instrs_count > 0) + ir->instrs[0]->flags |= IR3_INSTR_SS | IR3_INSTR_SY; + + if (ctx->last_input) + ctx->last_input->flags |= IR3_REG_EI; + + handle_last_rel(ctx); +} + +int +ir3_compile_shader_old(struct ir3_shader_variant *so, + const struct tgsi_token *tokens, struct ir3_shader_key key) +{ + struct ir3_compile_context ctx; + + assert(!so->ir); + + so->ir = ir3_create(); + + assert(so->ir); + + if (compile_init(&ctx, so, tokens) != TGSI_PARSE_OK) + return -1; + + compile_instructions(&ctx); + + compile_free(&ctx); + + return 0; +} diff --git a/src/gallium/drivers/freedreno/ir3/ir3_cp.c 
b/src/gallium/drivers/freedreno/ir3/ir3_cp.c new file mode 100644 index 00000000000..73c2a27c6eb --- /dev/null +++ b/src/gallium/drivers/freedreno/ir3/ir3_cp.c @@ -0,0 +1,158 @@ +/* -*- mode: C; c-file-style: "k&r"; tab-width 4; indent-tabs-mode: t; -*- */ + +/* + * Copyright (C) 2014 Rob Clark <[email protected]> + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Authors: + * Rob Clark <[email protected]> + */ + +#include "ir3.h" + +/* + * Copy Propagate: + * + * TODO probably want some sort of visitor sort of interface to + * avoid duplicating the same graph traversal logic everywhere.. + * + */ + +static void block_cp(struct ir3_block *block); +static struct ir3_instruction * instr_cp(struct ir3_instruction *instr, bool keep); + +static bool is_eligible_mov(struct ir3_instruction *instr) +{ + if ((instr->category == 1) && + (instr->cat1.src_type == instr->cat1.dst_type)) { + struct ir3_register *dst = instr->regs[0]; + struct ir3_register *src = instr->regs[1]; + if (dst->flags & IR3_REG_ADDR) + return false; + if ((src->flags & IR3_REG_SSA) && + /* TODO: propagate abs/neg modifiers if possible */ + !(src->flags & (IR3_REG_ABS | IR3_REG_NEGATE | IR3_REG_RELATIV))) + return true; + } + return false; +} + +static void walk_children(struct ir3_instruction *instr, bool keep) +{ + unsigned i; + + /* walk down the graph from each src: */ + for (i = 1; i < instr->regs_count; i++) { + struct ir3_register *src = instr->regs[i]; + if (src->flags & IR3_REG_SSA) + src->instr = instr_cp(src->instr, keep); + } +} + +static struct ir3_instruction * +instr_cp_fanin(struct ir3_instruction *instr) +{ + unsigned i; + + /* we need to handle fanin specially, to detect cases + * when we need to keep a mov + */ + + for (i = 1; i < instr->regs_count; i++) { + struct ir3_register *src = instr->regs[i]; + if (src->flags & IR3_REG_SSA) { + struct ir3_instruction *cand = + instr_cp(src->instr, false); + + /* if the candidate is a fanout, then keep + * the move. + * + * This is a bit, um, fragile, but it should + * catch the extra mov's that the front-end + * puts in for us already in these cases. 
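+			 *
+			 * ie. for a fanin <- mov <- fanout chain, dropping
+			 * the mov would feed the fanout straight into the
+			 * fanin, so re-run with keep=true to hang on to it.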
+			 */
+			if (is_meta(cand) && (cand->opc == OPC_META_FO))
+				cand = instr_cp(src->instr, true);
+
+			src->instr = cand;
+		}
+	}
+
+	walk_children(instr, false);
+
+	return instr;
+
+}
+
+static struct ir3_instruction *
+instr_cp(struct ir3_instruction *instr, bool keep)
+{
+	/* if we've already visited this instruction, bail now: */
+	if (ir3_instr_check_mark(instr))
+		return instr;
+
+	if (is_meta(instr) && (instr->opc == OPC_META_FI))
+		return instr_cp_fanin(instr);
+
+	if (is_eligible_mov(instr) && !keep) {
+		struct ir3_register *src = instr->regs[1];
+		return instr_cp(src->instr, false);
+	}
+
+	walk_children(instr, false);
+
+	return instr;
+}
+
+static void block_cp(struct ir3_block *block)
+{
+	unsigned i, j;
+
+	for (i = 0; i < block->noutputs; i++) {
+		if (block->outputs[i]) {
+			struct ir3_instruction *out =
+					instr_cp(block->outputs[i], false);
+
+			/* To deal with things like this:
+			 *
+			 *   43: MOV OUT[2], TEMP[5]
+			 *   44: MOV OUT[0], TEMP[5]
+			 *
+			 * we need to ensure that no two outputs point to
+			 * the same instruction
+			 */
+			for (j = 0; j < i; j++) {
+				if (block->outputs[j] == out) {
+					out = instr_cp(block->outputs[i], true);
+					break;
+				}
+			}
+
+			block->outputs[i] = out;
+		}
+	}
+}
+
+void ir3_block_cp(struct ir3_block *block)
+{
+	ir3_clear_mark(block->shader);
+	block_cp(block);
+}
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_depth.c b/src/gallium/drivers/freedreno/ir3/ir3_depth.c
new file mode 100644
index 00000000000..dcc0362f0c8
--- /dev/null
+++ b/src/gallium/drivers/freedreno/ir3/ir3_depth.c
@@ -0,0 +1,159 @@
+/* -*- mode: C; c-file-style: "k&r"; tab-width 4; indent-tabs-mode: t; -*- */
+
+/*
+ * Copyright (C) 2014 Rob Clark <[email protected]>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Authors:
+ *    Rob Clark <[email protected]>
+ */
+
+#include "util/u_math.h"
+
+#include "ir3.h"
+
+/*
+ * Instruction Depth:
+ *
+ * Calculates weighted instruction depth, ie. the sum of # of needed
+ * instructions plus delay slots back to original input (ie INPUT or
+ * CONST). That is to say, an instruction's depth is:
+ *
+ *   depth(instr) {
+ *     d = 0;
+ *     // for each src register:
+ *     foreach (src in instr->regs[1..n])
+ *       d = max(d, delayslots(src->instr, n) + depth(src->instr));
+ *     return d + 1;
+ *   }
+ *
+ * After an instruction's depth is calculated, it is inserted into the
+ * block's depth-sorted list, which is used by the scheduling pass.
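+ *
+ * e.g. for a simple INPUT -> mul.f -> add.f chain, with the normal
+ * alu->alu delay of 3 cycles (see ir3_delayslots() below):
+ *
+ *   depth(mul.f) = 0 + 1 = 1     (the meta INPUT adds nothing)
+ *   depth(add.f) = (3 + 1) + 1 = 5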
+ */
+
+/* calculate required # of delay slots between the instruction that
+ * assigns a value and the one that consumes it
+ */
+int ir3_delayslots(struct ir3_instruction *assigner,
+		struct ir3_instruction *consumer, unsigned n)
+{
+	/* worst case is cat1-3 (alu) -> cat4/5 needing 6 cycles, normal
+	 * alu -> alu needs 3 cycles, cat4 -> alu and texture fetch
+	 * handled with sync bits
+	 */
+
+	if (is_meta(assigner))
+		return 0;
+
+	if (writes_addr(assigner))
+		return 6;
+
+	/* handled via sync flags: */
+	if (is_sfu(assigner) || is_tex(assigner))
+		return 0;
+
+	/* assigner must be alu: */
+	if (is_flow(consumer) || is_sfu(consumer) || is_tex(consumer)) {
+		return 6;
+	} else if ((consumer->category == 3) &&
+			is_mad(consumer->opc) && (n == 2)) {
+		/* special case, 3rd src to cat3 not required on first cycle */
+		return 1;
+	} else {
+		return 3;
+	}
+}
+
+static void insert_by_depth(struct ir3_instruction *instr)
+{
+	struct ir3_block *block = instr->block;
+	struct ir3_instruction *n = block->head;
+	struct ir3_instruction *p = NULL;
+
+	while (n && (n != instr) && (n->depth > instr->depth)) {
+		p = n;
+		n = n->next;
+	}
+
+	instr->next = n;
+	if (p)
+		p->next = instr;
+	else
+		block->head = instr;
+}
+
+static void ir3_instr_depth(struct ir3_instruction *instr)
+{
+	unsigned i;
+
+	/* if we've already visited this instruction, bail now: */
+	if (ir3_instr_check_mark(instr))
+		return;
+
+	instr->depth = 0;
+
+	for (i = 1; i < instr->regs_count; i++) {
+		struct ir3_register *src = instr->regs[i];
+		if (src->flags & IR3_REG_SSA) {
+			unsigned sd;
+
+			/* visit child to compute its depth: */
+			ir3_instr_depth(src->instr);
+
+			sd = ir3_delayslots(src->instr, instr, i-1) +
+					src->instr->depth;
+
+			instr->depth = MAX2(instr->depth, sd);
+		}
+	}
+
+	/* meta-instructions don't add cycles, other than PHI.. which
+	 * might translate to a real instruction..
+	 *
+	 * well, not entirely true, fan-in/out, etc might need to
+	 * generate some extra mov's in edge cases, etc.. probably
+	 * we might want to do depth calculation considering the worst
+	 * case for these??
+ */ + if (!is_meta(instr)) + instr->depth++; + + insert_by_depth(instr); +} + +void ir3_block_depth(struct ir3_block *block) +{ + unsigned i; + + block->head = NULL; + + ir3_clear_mark(block->shader); + for (i = 0; i < block->noutputs; i++) + if (block->outputs[i]) + ir3_instr_depth(block->outputs[i]); + + /* at this point, any unvisited input is unused: */ + for (i = 0; i < block->ninputs; i++) { + struct ir3_instruction *in = block->inputs[i]; + if (in && !ir3_instr_check_mark(in)) + block->inputs[i] = NULL; + } +} diff --git a/src/gallium/drivers/freedreno/ir3/ir3_dump.c b/src/gallium/drivers/freedreno/ir3/ir3_dump.c new file mode 100644 index 00000000000..1a6f49d51cd --- /dev/null +++ b/src/gallium/drivers/freedreno/ir3/ir3_dump.c @@ -0,0 +1,425 @@ +/* -*- mode: C; c-file-style: "k&r"; tab-width 4; indent-tabs-mode: t; -*- */ + +/* + * Copyright (C) 2014 Rob Clark <[email protected]> + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Authors: + * Rob Clark <[email protected]> + */ + +#include <stdarg.h> + +#include "ir3.h" + +#define PTRID(x) ((unsigned long)(x)) + +struct ir3_dump_ctx { + FILE *f; + bool verbose; +}; + +static void dump_instr_name(struct ir3_dump_ctx *ctx, + struct ir3_instruction *instr) +{ + /* for debugging: */ + if (ctx->verbose) { +#ifdef DEBUG + fprintf(ctx->f, "%04u:", instr->serialno); +#endif + fprintf(ctx->f, "%03u: ", instr->depth); + } + + if (instr->flags & IR3_INSTR_SY) + fprintf(ctx->f, "(sy)"); + if (instr->flags & IR3_INSTR_SS) + fprintf(ctx->f, "(ss)"); + + if (is_meta(instr)) { + switch(instr->opc) { + case OPC_META_PHI: + fprintf(ctx->f, "Φ"); + break; + case OPC_META_DEREF: + fprintf(ctx->f, "(*)"); + break; + default: + /* shouldn't hit here.. 
just for debugging: */ + switch (instr->opc) { + case OPC_META_INPUT: fprintf(ctx->f, "_meta:in"); break; + case OPC_META_OUTPUT: fprintf(ctx->f, "_meta:out"); break; + case OPC_META_FO: fprintf(ctx->f, "_meta:fo"); break; + case OPC_META_FI: fprintf(ctx->f, "_meta:fi"); break; + case OPC_META_FLOW: fprintf(ctx->f, "_meta:flow"); break; + + default: fprintf(ctx->f, "_meta:%d", instr->opc); break; + } + break; + } + } else if (instr->category == 1) { + static const char *type[] = { + [TYPE_F16] = "f16", + [TYPE_F32] = "f32", + [TYPE_U16] = "u16", + [TYPE_U32] = "u32", + [TYPE_S16] = "s16", + [TYPE_S32] = "s32", + [TYPE_U8] = "u8", + [TYPE_S8] = "s8", + }; + if (instr->cat1.src_type == instr->cat1.dst_type) + fprintf(ctx->f, "mov"); + else + fprintf(ctx->f, "cov"); + fprintf(ctx->f, ".%s%s", type[instr->cat1.src_type], type[instr->cat1.dst_type]); + } else { + fprintf(ctx->f, "%s", ir3_instr_name(instr)); + if (instr->flags & IR3_INSTR_3D) + fprintf(ctx->f, ".3d"); + if (instr->flags & IR3_INSTR_A) + fprintf(ctx->f, ".a"); + if (instr->flags & IR3_INSTR_O) + fprintf(ctx->f, ".o"); + if (instr->flags & IR3_INSTR_P) + fprintf(ctx->f, ".p"); + if (instr->flags & IR3_INSTR_S) + fprintf(ctx->f, ".s"); + if (instr->flags & IR3_INSTR_S2EN) + fprintf(ctx->f, ".s2en"); + } +} + +static void dump_reg_name(struct ir3_dump_ctx *ctx, + struct ir3_register *reg) +{ + if ((reg->flags & IR3_REG_ABS) && (reg->flags & IR3_REG_NEGATE)) + fprintf(ctx->f, "(absneg)"); + else if (reg->flags & IR3_REG_NEGATE) + fprintf(ctx->f, "(neg)"); + else if (reg->flags & IR3_REG_ABS) + fprintf(ctx->f, "(abs)"); + + if (reg->flags & IR3_REG_IMMED) { + fprintf(ctx->f, "imm[%f,%d,0x%x]", reg->fim_val, reg->iim_val, reg->iim_val); + } else if (reg->flags & IR3_REG_SSA) { + if (ctx->verbose) { + fprintf(ctx->f, "_["); + dump_instr_name(ctx, reg->instr); + fprintf(ctx->f, "]"); + } + } else { + if (reg->flags & IR3_REG_HALF) + fprintf(ctx->f, "h"); + if (reg->flags & IR3_REG_CONST) + fprintf(ctx->f, "c%u.%c", reg_num(reg), "xyzw"[reg_comp(reg)]); + else + fprintf(ctx->f, "r%u.%c", reg_num(reg), "xyzw"[reg_comp(reg)]); + } +} + +static void ir3_instr_dump(struct ir3_dump_ctx *ctx, + struct ir3_instruction *instr); +static void ir3_block_dump(struct ir3_dump_ctx *ctx, + struct ir3_block *block, const char *name); + +static void dump_instr(struct ir3_dump_ctx *ctx, + struct ir3_instruction *instr) +{ + /* if we've already visited this instruction, bail now: */ + if (ir3_instr_check_mark(instr)) + return; + + /* some meta-instructions need to be handled specially: */ + if (is_meta(instr)) { + if ((instr->opc == OPC_META_FO) || + (instr->opc == OPC_META_FI)) { + unsigned i; + for (i = 1; i < instr->regs_count; i++) { + struct ir3_register *reg = instr->regs[i]; + if (reg->flags & IR3_REG_SSA) + dump_instr(ctx, reg->instr); + } + } else if (instr->opc == OPC_META_FLOW) { + struct ir3_register *reg = instr->regs[1]; + ir3_block_dump(ctx, instr->flow.if_block, "if"); + if (instr->flow.else_block) + ir3_block_dump(ctx, instr->flow.else_block, "else"); + if (reg->flags & IR3_REG_SSA) + dump_instr(ctx, reg->instr); + } else if ((instr->opc == OPC_META_PHI) || + (instr->opc == OPC_META_DEREF)) { + /* treat like a normal instruction: */ + ir3_instr_dump(ctx, instr); + } + } else { + ir3_instr_dump(ctx, instr); + } +} + +/* arrarraggh! 
if link is to something outside of the current block, we + * need to defer emitting the link until the end of the block, since the + * edge triggers pre-creation of the node it links to inside the cluster, + * even though it is meant to be outside.. + */ +static struct { + char buf[40960]; + unsigned n; +} edge_buf; + +/* helper to print or defer: */ +static void printdef(struct ir3_dump_ctx *ctx, + bool defer, const char *fmt, ...) +{ + va_list ap; + va_start(ap, fmt); + if (defer) { + unsigned n = edge_buf.n; + n += vsnprintf(&edge_buf.buf[n], sizeof(edge_buf.buf) - n, + fmt, ap); + edge_buf.n = n; + } else { + vfprintf(ctx->f, fmt, ap); + } + va_end(ap); +} + +static void dump_link2(struct ir3_dump_ctx *ctx, + struct ir3_instruction *instr, const char *target, bool defer) +{ + /* some meta-instructions need to be handled specially: */ + if (is_meta(instr)) { + if (instr->opc == OPC_META_INPUT) { + printdef(ctx, defer, "input%lx:<in%u>:w -> %s", + PTRID(instr->inout.block), + instr->regs[0]->num, target); + } else if (instr->opc == OPC_META_FO) { + struct ir3_register *reg = instr->regs[1]; + dump_link2(ctx, reg->instr, target, defer); + printdef(ctx, defer, "[label=\".%c\"]", + "xyzw"[instr->fo.off & 0x3]); + } else if (instr->opc == OPC_META_FI) { + unsigned i; + + /* recursively dump all parents and links */ + for (i = 1; i < instr->regs_count; i++) { + struct ir3_register *reg = instr->regs[i]; + if (reg->flags & IR3_REG_SSA) { + dump_link2(ctx, reg->instr, target, defer); + printdef(ctx, defer, "[label=\".%c\"]", + "xyzw"[(i - 1) & 0x3]); + } + } + } else if (instr->opc == OPC_META_OUTPUT) { + printdef(ctx, defer, "output%lx:<out%u>:w -> %s", + PTRID(instr->inout.block), + instr->regs[0]->num, target); + } else if ((instr->opc == OPC_META_PHI) || + (instr->opc == OPC_META_DEREF)) { + /* treat like a normal instruction: */ + printdef(ctx, defer, "instr%lx:<dst0> -> %s", PTRID(instr), target); + } + } else { + printdef(ctx, defer, "instr%lx:<dst0> -> %s", PTRID(instr), target); + } +} + +static void dump_link(struct ir3_dump_ctx *ctx, + struct ir3_instruction *instr, + struct ir3_block *block, const char *target) +{ + bool defer = instr->block != block; + dump_link2(ctx, instr, target, defer); + printdef(ctx, defer, "\n"); +} + +static struct ir3_register *follow_flow(struct ir3_register *reg) +{ + if (reg->flags & IR3_REG_SSA) { + struct ir3_instruction *instr = reg->instr; + /* go with the flow.. 
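+		 * (a flow meta just wraps the value in its first src,
+		 * so link to that instead)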
*/ + if (is_meta(instr) && (instr->opc == OPC_META_FLOW)) + return instr->regs[1]; + } + return reg; +} + +static void ir3_instr_dump(struct ir3_dump_ctx *ctx, + struct ir3_instruction *instr) +{ + unsigned i; + + fprintf(ctx->f, "instr%lx [shape=record,style=filled,fillcolor=lightgrey,label=\"{", + PTRID(instr)); + dump_instr_name(ctx, instr); + + /* destination register: */ + fprintf(ctx->f, "|<dst0>"); + + /* source register(s): */ + for (i = 1; i < instr->regs_count; i++) { + struct ir3_register *reg = follow_flow(instr->regs[i]); + + fprintf(ctx->f, "|"); + + if (reg->flags & IR3_REG_SSA) + fprintf(ctx->f, "<src%u> ", (i - 1)); + + dump_reg_name(ctx, reg); + } + + fprintf(ctx->f, "}\"];\n"); + + /* and recursively dump dependent instructions: */ + for (i = 1; i < instr->regs_count; i++) { + struct ir3_register *reg = instr->regs[i]; + char target[32]; /* link target */ + + if (!(reg->flags & IR3_REG_SSA)) + continue; + + snprintf(target, sizeof(target), "instr%lx:<src%u>", + PTRID(instr), (i - 1)); + + dump_instr(ctx, reg->instr); + dump_link(ctx, follow_flow(reg)->instr, instr->block, target); + } +} + +static void ir3_block_dump(struct ir3_dump_ctx *ctx, + struct ir3_block *block, const char *name) +{ + unsigned i, n; + + n = edge_buf.n; + + fprintf(ctx->f, "subgraph cluster%lx {\n", PTRID(block)); + fprintf(ctx->f, "label=\"%s\";\n", name); + + /* draw inputs: */ + fprintf(ctx->f, "input%lx [shape=record,label=\"inputs", PTRID(block)); + for (i = 0; i < block->ninputs; i++) + if (block->inputs[i]) + fprintf(ctx->f, "|<in%u> i%u.%c", i, (i >> 2), "xyzw"[i & 0x3]); + fprintf(ctx->f, "\"];\n"); + + /* draw instruction graph: */ + for (i = 0; i < block->noutputs; i++) + dump_instr(ctx, block->outputs[i]); + + /* draw outputs: */ + fprintf(ctx->f, "output%lx [shape=record,label=\"outputs", PTRID(block)); + for (i = 0; i < block->noutputs; i++) + fprintf(ctx->f, "|<out%u> o%u.%c", i, (i >> 2), "xyzw"[i & 0x3]); + fprintf(ctx->f, "\"];\n"); + + /* and links to outputs: */ + for (i = 0; i < block->noutputs; i++) { + char target[32]; /* link target */ + + /* NOTE: there could be outputs that are never assigned, + * so skip them + */ + if (!block->outputs[i]) + continue; + + snprintf(target, sizeof(target), "output%lx:<out%u>:e", + PTRID(block), i); + + dump_link(ctx, block->outputs[i], block, target); + } + + fprintf(ctx->f, "}\n"); + + /* and links to inputs: */ + if (block->parent) { + for (i = 0; i < block->ninputs; i++) { + char target[32]; /* link target */ + + if (!block->inputs[i]) + continue; + + dump_instr(ctx, block->inputs[i]); + + snprintf(target, sizeof(target), "input%lx:<in%u>:e", + PTRID(block), i); + + dump_link(ctx, block->inputs[i], block, target); + } + } + + /* dump deferred edges: */ + if (edge_buf.n > n) { + fprintf(ctx->f, "%*s", edge_buf.n - n, &edge_buf.buf[n]); + edge_buf.n = n; + } +} + +void ir3_dump(struct ir3 *shader, const char *name, + struct ir3_block *block /* XXX maybe 'block' ptr should move to ir3? 
*/,
+		FILE *f)
+{
+	struct ir3_dump_ctx ctx = {
+			.f = f,
+	};
+	ir3_clear_mark(shader);
+	fprintf(ctx.f, "digraph G {\n");
+	fprintf(ctx.f, "rankdir=RL;\n");
+	fprintf(ctx.f, "nodesep=0.25;\n");
+	fprintf(ctx.f, "ranksep=1.5;\n");
+	ir3_block_dump(&ctx, block, name);
+	fprintf(ctx.f, "}\n");
+}
+
+/*
+ * For Debugging:
+ */
+
+void
+ir3_dump_instr_single(struct ir3_instruction *instr)
+{
+	struct ir3_dump_ctx ctx = {
+			.f = stdout,
+			.verbose = true,
+	};
+	unsigned i;
+
+	dump_instr_name(&ctx, instr);
+	for (i = 0; i < instr->regs_count; i++) {
+		struct ir3_register *reg = instr->regs[i];
+		printf(i ? ", " : " ");
+		dump_reg_name(&ctx, reg);
+	}
+	printf("\n");
+}
+
+void
+ir3_dump_instr_list(struct ir3_instruction *instr)
+{
+	unsigned n = 0;
+
+	while (instr) {
+		ir3_dump_instr_single(instr);
+		if (!is_meta(instr))
+			n++;
+		instr = instr->next;
+	}
+	printf("%u instructions\n", n);
+}
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_flatten.c b/src/gallium/drivers/freedreno/ir3/ir3_flatten.c
new file mode 100644
index 00000000000..9389227034c
--- /dev/null
+++ b/src/gallium/drivers/freedreno/ir3/ir3_flatten.c
@@ -0,0 +1,155 @@
+/* -*- mode: C; c-file-style: "k&r"; tab-width 4; indent-tabs-mode: t; -*- */
+
+/*
+ * Copyright (C) 2014 Rob Clark <[email protected]>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Authors:
+ *    Rob Clark <[email protected]>
+ */
+
+#include <stdarg.h>
+
+#include "ir3.h"
+
+/*
+ * Flatten: flatten out legs of if/else, etc
+ *
+ * TODO probably should use some heuristic to decide to not flatten
+ * if one side or the other is too large / deeply nested / whatever?
+ */
+
+struct ir3_flatten_ctx {
+	struct ir3_block *block;
+	unsigned cnt;
+};
+
+static struct ir3_register *unwrap(struct ir3_register *reg)
+{
+
+	if (reg->flags & IR3_REG_SSA) {
+		struct ir3_instruction *instr = reg->instr;
+		if (is_meta(instr)) {
+			switch (instr->opc) {
+			case OPC_META_OUTPUT:
+			case OPC_META_FLOW:
+				if (instr->regs_count > 1)
+					return instr->regs[1];
+				return NULL;
+			default:
+				break;
+			}
+		}
+	}
+	return reg;
+}
+
+static void ir3_instr_flatten(struct ir3_flatten_ctx *ctx,
+		struct ir3_instruction *instr)
+{
+	unsigned i;
+
+	/* if we've already visited this instruction, bail now: */
+	if (ir3_instr_check_mark(instr))
+		return;
+
+	instr->block = ctx->block;
+
+	/* TODO: maybe some threshold to decide whether to
+	 * flatten or not??
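+	 *
+	 * e.g. the PHI for "x = p ? t : f" becomes "sel.b32 x, t, p, f"
+	 * below, so both legs get executed and the result selected.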
+	 */
+	if (is_meta(instr)) {
+		if (instr->opc == OPC_META_PHI) {
+			struct ir3_register *cond, *t, *f;
+
+			cond = unwrap(instr->regs[1]);
+			t    = unwrap(instr->regs[2]);  /* true val */
+			f    = unwrap(instr->regs[3]);  /* false val */
+
+			/* must have cond, but t or f may be null if only written
+			 * on one side of the if/else (in which case we can just
+			 * convert the PHI to a simple move).
+			 */
+			assert(cond);
+			assert(t || f);
+
+			if (t && f) {
+				/* convert the PHI instruction to sel.{b16,b32} */
+				instr->category = 3;
+
+				/* instruction type based on dst size: */
+				if (instr->regs[0]->flags & IR3_REG_HALF)
+					instr->opc = OPC_SEL_B16;
+				else
+					instr->opc = OPC_SEL_B32;
+
+				instr->regs[1] = t;
+				instr->regs[2] = cond;
+				instr->regs[3] = f;
+			} else {
+				/* convert to simple mov: */
+				instr->category = 1;
+				instr->cat1.dst_type = TYPE_F32;
+				instr->cat1.src_type = TYPE_F32;
+				instr->regs_count = 2;
+				instr->regs[1] = t ? t : f;
+			}
+
+			ctx->cnt++;
+		} else if ((instr->opc == OPC_META_INPUT) &&
+				(instr->regs_count == 2)) {
+			type_t ftype;
+
+			if (instr->regs[0]->flags & IR3_REG_HALF)
+				ftype = TYPE_F16;
+			else
+				ftype = TYPE_F32;
+
+			/* convert meta:input to mov: */
+			instr->category = 1;
+			instr->cat1.src_type = ftype;
+			instr->cat1.dst_type = ftype;
+		}
+	}
+
+	/* recursively visit children: */
+	for (i = 1; i < instr->regs_count; i++) {
+		struct ir3_register *src = instr->regs[i];
+		if (src->flags & IR3_REG_SSA)
+			ir3_instr_flatten(ctx, src->instr);
+	}
+}
+
+/* return >= 0 is # of phi's flattened, < 0 is error */
+int ir3_block_flatten(struct ir3_block *block)
+{
+	struct ir3_flatten_ctx ctx = {
+			.block = block,
+	};
+	unsigned i;
+
+	ir3_clear_mark(block->shader);
+	for(i = 0; i < block->noutputs; i++)
+		if (block->outputs[i])
+			ir3_instr_flatten(&ctx, block->outputs[i]);
+
+	return ctx.cnt;
+}
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_ra.c b/src/gallium/drivers/freedreno/ir3/ir3_ra.c
new file mode 100644
index 00000000000..b916dd51393
--- /dev/null
+++ b/src/gallium/drivers/freedreno/ir3/ir3_ra.c
@@ -0,0 +1,790 @@
+/* -*- mode: C; c-file-style: "k&r"; tab-width 4; indent-tabs-mode: t; -*- */
+
+/*
+ * Copyright (C) 2014 Rob Clark <[email protected]>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Authors:
+ *    Rob Clark <[email protected]>
+ */
+
+#include "pipe/p_shader_tokens.h"
+#include "util/u_math.h"
+
+#include "ir3.h"
+#include "ir3_visitor.h"
+
+/*
+ * Register Assignment:
+ *
+ * NOTE: currently only works on a single basic block.. need to think
+ * about how multiple basic blocks are going to get scheduled. But
+ * I think I want to re-arrange how blocks work, ie. get rid of the
+ * block nesting thing..
+ *
+ * NOTE: we could do register coalescing (eliminate moves) as part of
+ * the RA step.. OTOH I think we need to do scheduling before register
+ * assignment. And removing a mov affects scheduling (unless we leave
+ * a placeholder nop, which seems lame), so I'm not really sure how
+ * practical it is to do both in a single stage. But OTOH I'm not
+ * really sure of a sane way for the CP stage to realize when it
+ * cannot remove a mov due to multi-register constraints..
+ *
+ */
+
+struct ir3_ra_ctx {
+	struct ir3_block *block;
+	enum shader_t type;
+	bool half_precision;
+	bool frag_coord;
+	bool frag_face;
+	bool has_samp;
+	int cnt;
+	bool error;
+};
+
+/* sorta ugly way to retrofit half-precision support.. rather than
+ * passing extra param around, just OR in a high bit. All the low
+ * value arithmetic (ie. +/- offset within a contiguous vec4, etc)
+ * will continue to work as long as you don't underflow (and that
+ * would go badly anyways).
+ */
+#define REG_HALF  0x8000
+
+struct ir3_ra_assignment {
+	int8_t  off;       /* offset of instruction dst within range */
+	uint8_t num;       /* number of components for the range */
+};
+
+static void ra_assign(struct ir3_ra_ctx *ctx,
+		struct ir3_instruction *assigner, int num);
+static struct ir3_ra_assignment ra_calc(struct ir3_instruction *instr);
+
+/*
+ * Register Allocation:
+ */
+
+#define REG(n, wm, f) (struct ir3_register){ \
+		.flags  = (f), \
+		.num    = (n), \
+		.wrmask = TGSI_WRITEMASK_ ## wm, \
+	}
+
+/* check that the register exists, is a GPR and is not special (a0/p0) */
+static struct ir3_register * reg_check(struct ir3_instruction *instr, unsigned n)
+{
+	if ((n < instr->regs_count) && reg_gpr(instr->regs[n]))
+		return instr->regs[n];
+	return NULL;
+}
+
+static int output_base(struct ir3_ra_ctx *ctx)
+{
+	/* ugg, for fragment shader we need to have input at r0.x
+	 * (or at least if there is a way to configure it, I can't
+	 * see how, because the blob driver always uses r0.x (ie.
+	 * all zeros))
+	 */
+	if (ctx->type == SHADER_FRAGMENT) {
+		if (ctx->half_precision)
+			return ctx->frag_face ? 4 : 3;
+		return ctx->frag_coord ? 8 : 4;
+	}
+	return 0;
+}
+
+/* live means read before written */
+static void compute_liveregs(struct ir3_ra_ctx *ctx,
+		struct ir3_instruction *instr, regmask_t *liveregs)
+{
+	struct ir3_block *block = instr->block;
+	regmask_t written;
+	unsigned i, j;
+
+	regmask_init(liveregs);
+	regmask_init(&written);
+
+	for (instr = instr->next; instr; instr = instr->next) {
+		struct ir3_register *r;
+
+		if (is_meta(instr))
+			continue;
+
+		/* check first src's read: */
+		for (j = 1; j < instr->regs_count; j++) {
+			r = reg_check(instr, j);
+			if (r)
+				regmask_set_if_not(liveregs, r, &written);
+		}
+
+		/* then dst written (if assigned already): */
+		if (instr->flags & IR3_INSTR_MARK) {
+			r = reg_check(instr, 0);
+			if (r)
+				regmask_set(&written, r);
+		}
+	}
+
+	/* be sure to account for output registers too: */
+	for (i = 0; i < block->noutputs; i++) {
+		struct ir3_register reg = REG(output_base(ctx) + i, X, 0);
+		regmask_set_if_not(liveregs, &reg, &written);
+	}
+}
+
+/* calculate registers that are clobbered before last use of 'assigner'.
+ * This needs to be done backwards, although it could possibly be
+ * combined into compute_liveregs(). (Ie. compute_liveregs() could
+ * reverse the list, then do this part backwards reversing the list
+ * again back to original order.) Otoh, probably I should try to
+ * construct a proper interference graph instead.
+ *
+ * XXX this needs to follow the same recursion path that is used
+ * to rename/assign registers (ie. ra_assign_src()).. this is a bit
+ * ugly right now, maybe refactor into node iterator sort of thing
+ * that iterates nodes in the correct order?
+ */
+static bool compute_clobbers(struct ir3_ra_ctx *ctx,
+		struct ir3_instruction *instr, struct ir3_instruction *assigner,
+		regmask_t *liveregs)
+{
+	unsigned i;
+	bool live = false, was_live = false;
+
+	if (instr == NULL) {
+		struct ir3_block *block = ctx->block;
+
+		/* if at the end, check outputs: */
+		for (i = 0; i < block->noutputs; i++)
+			if (block->outputs[i] == assigner)
+				return true;
+		return false;
+	}
+
+	for (i = 1; i < instr->regs_count; i++) {
+		struct ir3_register *reg = instr->regs[i];
+		if ((reg->flags & IR3_REG_SSA) && (reg->instr == assigner)) {
+			if (is_meta(instr)) {
+				switch (instr->opc) {
+				case OPC_META_INPUT:
+					// TODO
+					assert(0);
+					break;
+				case OPC_META_FO:
+				case OPC_META_FI:
+					was_live |= compute_clobbers(ctx, instr->next,
+							instr, liveregs);
+					break;
+				default:
+					break;
+				}
+			}
+			live = true;
+			break;
+		}
+	}
+
+	was_live |= compute_clobbers(ctx, instr->next, assigner, liveregs);
+
+	if (was_live && (instr->regs_count > 0) &&
+			(instr->flags & IR3_INSTR_MARK) &&
+			!is_meta(instr))
+		regmask_set(liveregs, instr->regs[0]);
+
+	return live || was_live;
+}
+
+static int find_available(regmask_t *liveregs, int size, bool half)
+{
+	unsigned i;
+	unsigned f = half ? IR3_REG_HALF : 0;
+	for (i = 0; i < MAX_REG - size; i++) {
+		if (!regmask_get(liveregs, &REG(i, X, f))) {
+			unsigned start = i++;
+			for (; (i < MAX_REG) && ((i - start) < size); i++)
+				if (regmask_get(liveregs, &REG(i, X, f)))
+					break;
+			if ((i - start) >= size)
+				return start;
+		}
+	}
+	assert(0);
+	return -1;
+}
+
+static int alloc_block(struct ir3_ra_ctx *ctx,
+		struct ir3_instruction *instr, int size)
+{
+	if (!instr) {
+		/* special case, allocating shader outputs. At this
+		 * point, nothing is allocated, just start the shader
+		 * outputs at r0.x and let compute_liveregs() take
+		 * care of the rest from here:
+		 */
+		return 0;
+	} else {
+		struct ir3_register *dst = instr->regs[0];
+		regmask_t liveregs;
+
+		compute_liveregs(ctx, instr, &liveregs);
+
+		// XXX XXX XXX XXX XXX XXX XXX XXX XXX
+		// XXX hack.. maybe ra_calc should give us a list of
+		// instrs to compute_clobbers() on?
+		if (is_meta(instr) && (instr->opc == OPC_META_INPUT) &&
+				(instr->regs_count == 1)) {
+			unsigned i, base = instr->regs[0]->num & ~0x3;
+			for (i = 0; i < 4; i++) {
+				struct ir3_instruction *in = ctx->block->inputs[base + i];
+				if (in)
+					compute_clobbers(ctx, in->next, in, &liveregs);
+			}
+		} else
+		// XXX XXX XXX XXX XXX XXX XXX XXX XXX
+			compute_clobbers(ctx, instr->next, instr, &liveregs);
+
+		return find_available(&liveregs, size,
+				!!(dst->flags & IR3_REG_HALF));
+	}
+}
+
+/*
+ * Constraint Calculation:
+ */
+
+struct ra_calc_visitor {
+	struct ir3_visitor base;
+	struct ir3_ra_assignment a;
+};
+
+static inline struct ra_calc_visitor *ra_calc_visitor(struct ir3_visitor *v)
+{
+	return (struct ra_calc_visitor *)v;
+}
+
+/* calculate register assignment for the instruction. If the register
+ * written by this instruction is required to be part of a range, to
+ * handle other (input/output/sam/bary.f/etc) contiguous register range
+ * constraints, that is calculated and handled here.
+ */
+static void ra_calc_dst(struct ir3_visitor *v,
+		struct ir3_instruction *instr, struct ir3_register *reg)
+{
+	struct ra_calc_visitor *c = ra_calc_visitor(v);
+	if (is_tex(instr)) {
+		c->a.off = 0;
+		c->a.num = 4;
+	} else {
+		c->a.off = 0;
+		c->a.num = 1;
+	}
+}
+
+static void
+ra_calc_dst_shader_input(struct ir3_visitor *v,
+		struct ir3_instruction *instr, struct ir3_register *reg)
+{
+	struct ra_calc_visitor *c = ra_calc_visitor(v);
+	struct ir3_block *block = instr->block;
+	struct ir3_register *dst = instr->regs[0];
+	unsigned base = dst->num & ~0x3;
+	unsigned i, num = 0;
+
+	assert(!(dst->flags & IR3_REG_IA));
+
+	/* check what input components we need: */
+	for (i = 0; i < 4; i++) {
+		unsigned idx = base + i;
+		if ((idx < block->ninputs) && block->inputs[idx])
+			num = i + 1;
+	}
+
+	c->a.off = dst->num - base;
+	c->a.num = num;
+}
+
+static void ra_calc_src_fanin(struct ir3_visitor *v,
+		struct ir3_instruction *instr, struct ir3_register *reg)
+{
+	struct ra_calc_visitor *c = ra_calc_visitor(v);
+	unsigned srcn = ir3_instr_regno(instr, reg) - 1;
+	c->a.off += srcn;
+	c->a.num += srcn;
+	c->a.num = MAX2(c->a.num, instr->regs_count - 1);
+}
+
+static const struct ir3_visitor_funcs calc_visitor_funcs = {
+	.instr = ir3_visit_instr,
+	.dst_shader_input = ra_calc_dst_shader_input,
+	.dst_fanout = ra_calc_dst,
+	.dst_fanin = ra_calc_dst,
+	.dst = ra_calc_dst,
+	.src_fanout = ir3_visit_reg,
+	.src_fanin = ra_calc_src_fanin,
+	.src = ir3_visit_reg,
+};
+
+static struct ir3_ra_assignment ra_calc(struct ir3_instruction *assigner)
+{
+	struct ra_calc_visitor v = {
+			.base.funcs = &calc_visitor_funcs,
+	};
+
+	ir3_visit_instr(&v.base, assigner);
+
+	return v.a;
+}
+
+/*
+ * Register Assignment:
+ */
+
+struct ra_assign_visitor {
+	struct ir3_visitor base;
+	struct ir3_ra_ctx *ctx;
+	int num;
+};
+
+static inline struct ra_assign_visitor *ra_assign_visitor(struct ir3_visitor *v)
+{
+	return (struct ra_assign_visitor *)v;
+}
+
+static type_t half_type(type_t type)
+{
+	switch (type) {
+	case TYPE_F32: return TYPE_F16;
+	case TYPE_U32: return TYPE_U16;
+	case TYPE_S32: return TYPE_S16;
+	/*
instructions may already be fixed up: */ + case TYPE_F16: + case TYPE_U16: + case TYPE_S16: + return type; + default: + assert(0); + return ~0; + } +} + +/* some instructions need fix-up if dst register is half precision: */ +static void fixup_half_instr_dst(struct ir3_instruction *instr) +{ + switch (instr->category) { + case 1: /* move instructions */ + instr->cat1.dst_type = half_type(instr->cat1.dst_type); + break; + case 3: + switch (instr->opc) { + case OPC_MAD_F32: + instr->opc = OPC_MAD_F16; + break; + case OPC_SEL_B32: + instr->opc = OPC_SEL_B16; + break; + case OPC_SEL_S32: + instr->opc = OPC_SEL_S16; + break; + case OPC_SEL_F32: + instr->opc = OPC_SEL_F16; + break; + case OPC_SAD_S32: + instr->opc = OPC_SAD_S16; + break; + /* instructions may already be fixed up: */ + case OPC_MAD_F16: + case OPC_SEL_B16: + case OPC_SEL_S16: + case OPC_SEL_F16: + case OPC_SAD_S16: + break; + default: + assert(0); + break; + } + break; + case 5: + instr->cat5.type = half_type(instr->cat5.type); + break; + } +} +/* some instructions need fix-up if src register is half precision: */ +static void fixup_half_instr_src(struct ir3_instruction *instr) +{ + switch (instr->category) { + case 1: /* move instructions */ + instr->cat1.src_type = half_type(instr->cat1.src_type); + break; + } +} + +static void ra_assign_reg(struct ir3_visitor *v, + struct ir3_instruction *instr, struct ir3_register *reg) +{ + struct ra_assign_visitor *a = ra_assign_visitor(v); + + if (is_flow(instr) && (instr->opc == OPC_KILL)) + return; + + reg->flags &= ~IR3_REG_SSA; + reg->num = a->num & ~REG_HALF; + + assert(reg->num >= 0); + + if (a->num & REG_HALF) { + reg->flags |= IR3_REG_HALF; + /* if dst reg being assigned, patch up the instr: */ + if (reg == instr->regs[0]) + fixup_half_instr_dst(instr); + else + fixup_half_instr_src(instr); + } +} + +static void ra_assign_dst_shader_input(struct ir3_visitor *v, + struct ir3_instruction *instr, struct ir3_register *reg) +{ + struct ra_assign_visitor *a = ra_assign_visitor(v); + unsigned i, base = reg->num & ~0x3; + int off = base - reg->num; + + ra_assign_reg(v, instr, reg); + reg->flags |= IR3_REG_IA; + + /* trigger assignment of all our companion input components: */ + for (i = 0; i < 4; i++) { + struct ir3_instruction *in = instr->block->inputs[i+base]; + if (in && is_meta(in) && (in->opc == OPC_META_INPUT)) + ra_assign(a->ctx, in, a->num + off + i); + } +} + +static void ra_assign_dst_fanout(struct ir3_visitor *v, + struct ir3_instruction *instr, struct ir3_register *reg) +{ + struct ra_assign_visitor *a = ra_assign_visitor(v); + struct ir3_register *src = instr->regs[1]; + ra_assign_reg(v, instr, reg); + if (src->flags & IR3_REG_SSA) + ra_assign(a->ctx, src->instr, a->num - instr->fo.off); +} + +static void ra_assign_src_fanout(struct ir3_visitor *v, + struct ir3_instruction *instr, struct ir3_register *reg) +{ + struct ra_assign_visitor *a = ra_assign_visitor(v); + ra_assign_reg(v, instr, reg); + ra_assign(a->ctx, instr, a->num + instr->fo.off); +} + + +static void ra_assign_src_fanin(struct ir3_visitor *v, + struct ir3_instruction *instr, struct ir3_register *reg) +{ + struct ra_assign_visitor *a = ra_assign_visitor(v); + unsigned j, srcn = ir3_instr_regno(instr, reg) - 1; + ra_assign_reg(v, instr, reg); + ra_assign(a->ctx, instr, a->num - srcn); + for (j = 1; j < instr->regs_count; j++) { + struct ir3_register *reg = instr->regs[j]; + if (reg->flags & IR3_REG_SSA) /* could be renamed already */ + ra_assign(a->ctx, reg->instr, a->num - srcn + j - 1); + } +} + +static const 
struct ir3_visitor_funcs assign_visitor_funcs = { + .instr = ir3_visit_instr, + .dst_shader_input = ra_assign_dst_shader_input, + .dst_fanout = ra_assign_dst_fanout, + .dst_fanin = ra_assign_reg, + .dst = ra_assign_reg, + .src_fanout = ra_assign_src_fanout, + .src_fanin = ra_assign_src_fanin, + .src = ra_assign_reg, +}; + +static void ra_assign(struct ir3_ra_ctx *ctx, + struct ir3_instruction *assigner, int num) +{ + struct ra_assign_visitor v = { + .base.funcs = &assign_visitor_funcs, + .ctx = ctx, + .num = num, + }; + + /* if we've already visited this instruction, bail now: */ + if (ir3_instr_check_mark(assigner)) { + debug_assert(assigner->regs[0]->num == (num & ~REG_HALF)); + if (assigner->regs[0]->num != (num & ~REG_HALF)) { + /* impossible situation, should have been resolved + * at an earlier stage by inserting extra mov's: + */ + ctx->error = true; + } + return; + } + + ir3_visit_instr(&v.base, assigner); +} + +/* + * + */ + +static void ir3_instr_ra(struct ir3_ra_ctx *ctx, + struct ir3_instruction *instr) +{ + struct ir3_register *dst; + unsigned num; + + /* skip over nop's */ + if (instr->regs_count == 0) + return; + + dst = instr->regs[0]; + + /* if we've already visited this instruction, bail now: */ + if (instr->flags & IR3_INSTR_MARK) + return; + + /* allocate register(s): */ + if (is_addr(instr)) { + num = instr->regs[2]->num; + } else if (reg_gpr(dst)) { + struct ir3_ra_assignment a; + a = ra_calc(instr); + num = alloc_block(ctx, instr, a.num) + a.off; + } else if (dst->flags & IR3_REG_ADDR) { + dst->flags &= ~IR3_REG_ADDR; + num = regid(REG_A0, 0) | REG_HALF; + } else { + /* predicate register (p0).. etc */ + return; + } + + ra_assign(ctx, instr, num); +} + +/* flatten into shader: */ +// XXX this should probably be somewhere else: +static void legalize(struct ir3_ra_ctx *ctx, struct ir3_block *block) +{ + struct ir3_instruction *n; + struct ir3 *shader = block->shader; + struct ir3_instruction *end = + ir3_instr_create(block, 0, OPC_END); + struct ir3_instruction *last_input = NULL; + struct ir3_instruction *last_rel = NULL; + regmask_t needs_ss_war; /* write after read */ + regmask_t needs_ss; + regmask_t needs_sy; + + regmask_init(&needs_ss_war); + regmask_init(&needs_ss); + regmask_init(&needs_sy); + + shader->instrs_count = 0; + + for (n = block->head; n; n = n->next) { + struct ir3_register *reg; + unsigned i; + + if (is_meta(n)) + continue; + + for (i = 1; i < n->regs_count; i++) { + reg = n->regs[i]; + + if (reg_gpr(reg)) { + + /* TODO: we probably only need (ss) for alu + * instr consuming sfu result.. need to make + * some tests for both this and (sy).. + */ + if (regmask_get(&needs_ss, reg)) { + n->flags |= IR3_INSTR_SS; + regmask_init(&needs_ss); + } + + if (regmask_get(&needs_sy, reg)) { + n->flags |= IR3_INSTR_SY; + regmask_init(&needs_sy); + } + } + + /* TODO: is it valid to have address reg loaded from a + * relative src (ie. mova a0, c<a0.x+4>)? If so, the + * last_rel check below should be moved ahead of this: + */ + if (reg->flags & IR3_REG_RELATIV) + last_rel = n; + } + + if (n->regs_count > 0) { + reg = n->regs[0]; + if (regmask_get(&needs_ss_war, reg)) { + n->flags |= IR3_INSTR_SS; + regmask_init(&needs_ss_war); // ??? I assume? + } + + if (last_rel && (reg->num == regid(REG_A0, 0))) { + last_rel->flags |= IR3_INSTR_UL; + last_rel = NULL; + } + } + + /* cat5+ does not have an (ss) bit, if needed we need to + * insert a nop to carry the sync flag. 
Would be kinda + * clever if we were aware of this during scheduling, but + * this should be a pretty rare case: + */ + if ((n->flags & IR3_INSTR_SS) && (n->category >= 5)) { + struct ir3_instruction *nop; + nop = ir3_instr_create(block, 0, OPC_NOP); + nop->flags |= IR3_INSTR_SS; + n->flags &= ~IR3_INSTR_SS; + } + + /* need to be able to set (ss) on first instruction: */ + if ((shader->instrs_count == 0) && (n->category >= 5)) + ir3_instr_create(block, 0, OPC_NOP); + + if (is_nop(n) && shader->instrs_count) { + struct ir3_instruction *last = + shader->instrs[shader->instrs_count-1]; + if (is_nop(last) && (last->repeat < 5)) { + last->repeat++; + last->flags |= n->flags; + continue; + } + } + + shader->instrs[shader->instrs_count++] = n; + + if (is_sfu(n)) + regmask_set(&needs_ss, n->regs[0]); + + if (is_tex(n)) { + /* this ends up being the # of samp instructions.. but that + * is ok, everything else only cares whether it is zero or + * not. We do this here, rather than when we encounter a + * SAMP decl, because (especially in binning pass shader) + * the samp instruction(s) could get eliminated if the + * result is not used. + */ + ctx->has_samp = true; + regmask_set(&needs_sy, n->regs[0]); + } + + /* both tex/sfu appear to not always immediately consume + * their src register(s): + */ + if (is_tex(n) || is_sfu(n)) { + for (i = 1; i < n->regs_count; i++) { + reg = n->regs[i]; + if (reg_gpr(reg)) + regmask_set(&needs_ss_war, reg); + } + } + + if (is_input(n)) + last_input = n; + } + + if (last_input) + last_input->regs[0]->flags |= IR3_REG_EI; + + if (last_rel) + last_rel->flags |= IR3_INSTR_UL; + + shader->instrs[shader->instrs_count++] = end; + + shader->instrs[0]->flags |= IR3_INSTR_SS | IR3_INSTR_SY; +} + +static int block_ra(struct ir3_ra_ctx *ctx, struct ir3_block *block) +{ + struct ir3_instruction *n; + + if (!block->parent) { + unsigned i, j; + int base, off = output_base(ctx); + + base = alloc_block(ctx, NULL, block->noutputs + off); + + if (ctx->half_precision) + base |= REG_HALF; + + for (i = 0; i < block->noutputs; i++) + if (block->outputs[i] && !is_kill(block->outputs[i])) + ra_assign(ctx, block->outputs[i], base + i + off); + + if (ctx->type == SHADER_FRAGMENT) { + i = 0; + if (ctx->frag_face) { + /* if we have frag_face, it gets hr0.x */ + ra_assign(ctx, block->inputs[i], REG_HALF | 0); + i += 4; + } + for (j = 0; i < block->ninputs; i++, j++) + if (block->inputs[i]) + ra_assign(ctx, block->inputs[i], (base & ~REG_HALF) + j); + } else { + for (i = 0; i < block->ninputs; i++) + if (block->inputs[i]) + ir3_instr_ra(ctx, block->inputs[i]); + } + } + + /* then loop over instruction list and assign registers: + */ + n = block->head; + while (n) { + ir3_instr_ra(ctx, n); + if (ctx->error) + return -1; + n = n->next; + } + + legalize(ctx, block); + + return 0; +} + +int ir3_block_ra(struct ir3_block *block, enum shader_t type, + bool half_precision, bool frag_coord, bool frag_face, + bool *has_samp) +{ + struct ir3_ra_ctx ctx = { + .block = block, + .type = type, + .half_precision = half_precision, + .frag_coord = frag_coord, + .frag_face = frag_face, + }; + int ret; + + ir3_clear_mark(block->shader); + ret = block_ra(&ctx, block); + *has_samp = ctx.has_samp; + + return ret; +} diff --git a/src/gallium/drivers/freedreno/ir3/ir3_sched.c b/src/gallium/drivers/freedreno/ir3/ir3_sched.c new file mode 100644 index 00000000000..3ef67731926 --- /dev/null +++ b/src/gallium/drivers/freedreno/ir3/ir3_sched.c @@ -0,0 +1,401 @@ +/* -*- mode: C; c-file-style: "k&r"; tab-width 4; 
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_sched.c b/src/gallium/drivers/freedreno/ir3/ir3_sched.c
new file mode 100644
index 00000000000..3ef67731926
--- /dev/null
+++ b/src/gallium/drivers/freedreno/ir3/ir3_sched.c
@@ -0,0 +1,401 @@
+/* -*- mode: C; c-file-style: "k&r"; tab-width 4; indent-tabs-mode: t; -*- */
+
+/*
+ * Copyright (C) 2014 Rob Clark <[email protected]>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Authors:
+ *    Rob Clark <[email protected]>
+ */
+
+
+#include "util/u_math.h"
+
+#include "ir3.h"
+
+enum {
+    SCHEDULED = -1,
+    DELAYED = -2,
+};
+
+/*
+ * Instruction Scheduling:
+ *
+ * Using the depth sorted list from the depth pass, attempt to recursively
+ * schedule the deepest unscheduled path.  The first instruction that cannot
+ * be scheduled returns the number of delay slots it requires, at which
+ * point we return back up to the top and attempt to schedule by the next
+ * highest depth.  After a sufficient number of instructions have been
+ * scheduled, return back to the beginning of the list and start again.  If
+ * you reach the end of the depth sorted list without being able to insert
+ * any instruction, insert nops.  Repeat until there are no more unscheduled
+ * instructions.
+ *
+ * There are a few special cases that need to be handled, since sched
+ * is currently independent of register allocation.  Usages of the address
+ * register (a0.x) or predicate register (p0.x) must be serialized.  Ie.
+ * if you have two pairs of instructions that write, then read, the same
+ * special register, those pairs cannot be interleaved.  To solve this,
+ * when we are in such a scheduling "critical section", and we encounter
+ * a conflicting write to a special register, we try to schedule any
+ * remaining instructions that use that value first.
+ */
+
+struct ir3_sched_ctx {
+    struct ir3_instruction *scheduled; /* last scheduled instr */
+    struct ir3_instruction *addr;      /* current a0.x user, if any */
+    struct ir3_instruction *pred;      /* current p0.x user, if any */
+    unsigned cnt;
+};
+
+static struct ir3_instruction *
+deepest(struct ir3_instruction **srcs, unsigned nsrcs)
+{
+    struct ir3_instruction *d = NULL;
+    unsigned i = 0, id = 0;
+
+    while ((i < nsrcs) && !(d = srcs[id = i]))
+        i++;
+
+    if (!d)
+        return NULL;
+
+    for (; i < nsrcs; i++)
+        if (srcs[i] && (srcs[i]->depth > d->depth))
+            d = srcs[id = i];
+
+    srcs[id] = NULL;
+
+    return d;
+}
+
+static unsigned distance(struct ir3_sched_ctx *ctx,
+        struct ir3_instruction *instr, unsigned maxd)
+{
+    struct ir3_instruction *n = ctx->scheduled;
+    unsigned d = 0;
+    while (n && (n != instr) && (d < maxd)) {
+        if (is_alu(n) || is_flow(n))
+            d++;
+        n = n->next;
+    }
+    return d;
+}
+
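
The comment block above describes the scheduling loop compactly: always try
the deepest ready instruction, fall back to shallower ones while delay slots
are unfilled, and emit a nop only when nothing is ready.  A toy model of that
policy (hypothetical node type with precomputed depth and ready-cycle, not
ir3's structures):

    #include <stdio.h>

    struct node { const char *name; int depth; int ready; int done; };

    int main(void)
    {
        struct node n[] = {
            { "a", 3, 0, 0 }, { "b", 2, 3, 0 }, { "c", 1, 0, 0 },
        };
        int cycle = 0, remaining = 3;

        while (remaining) {
            int best = -1, i;
            /* deepest node whose delay slots are already satisfied: */
            for (i = 0; i < 3; i++)
                if (!n[i].done && n[i].ready <= cycle &&
                        (best < 0 || n[i].depth > n[best].depth))
                    best = i;
            if (best >= 0) {
                printf("cycle %d: %s\n", cycle, n[best].name);
                n[best].done = 1;
                remaining--;
            } else {
                printf("cycle %d: nop\n", cycle);  /* nothing ready */
            }
            cycle++;
        }
        return 0;   /* prints: a, c, nop, b */
    }
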
+/* TODO maybe we want a doubly linked list? */
+static struct ir3_instruction * prev(struct ir3_instruction *instr)
+{
+    struct ir3_instruction *p = instr->block->head;
+    while (p && (p->next != instr))
+        p = p->next;
+    return p;
+}
+
+static void schedule(struct ir3_sched_ctx *ctx,
+        struct ir3_instruction *instr, bool remove)
+{
+    struct ir3_block *block = instr->block;
+
+    /* maybe there is a better way to handle this than just stuffing
+     * a nop.. ideally we'd know about this constraint in the
+     * scheduling and depth calculation..
+     */
+    if (ctx->scheduled && is_sfu(ctx->scheduled) && is_sfu(instr))
+        schedule(ctx, ir3_instr_create(block, 0, OPC_NOP), false);
+
+    /* remove from depth list:
+     */
+    if (remove) {
+        struct ir3_instruction *p = prev(instr);
+
+        /* NOTE: this can happen for inputs which are not
+         * read.. in that case there is no need to schedule
+         * the input, so just bail:
+         */
+        if (instr != (p ? p->next : block->head))
+            return;
+
+        if (p)
+            p->next = instr->next;
+        else
+            block->head = instr->next;
+    }
+
+    if (writes_addr(instr)) {
+        assert(ctx->addr == NULL);
+        ctx->addr = instr;
+    }
+
+    if (writes_pred(instr)) {
+        assert(ctx->pred == NULL);
+        ctx->pred = instr;
+    }
+
+    instr->flags |= IR3_INSTR_MARK;
+
+    instr->next = ctx->scheduled;
+    ctx->scheduled = instr;
+
+    ctx->cnt++;
+}
+
+/*
+ * Delay-slot calculation.  Follows fanin/fanout.
+ */
+
+static unsigned delay_calc2(struct ir3_sched_ctx *ctx,
+        struct ir3_instruction *assigner,
+        struct ir3_instruction *consumer, unsigned srcn)
+{
+    unsigned delay = 0;
+
+    if (is_meta(assigner)) {
+        unsigned i;
+        for (i = 1; i < assigner->regs_count; i++) {
+            struct ir3_register *reg = assigner->regs[i];
+            if (reg->flags & IR3_REG_SSA) {
+                unsigned d = delay_calc2(ctx, reg->instr,
+                        consumer, srcn);
+                delay = MAX2(delay, d);
+            }
+        }
+    } else {
+        delay = ir3_delayslots(assigner, consumer, srcn);
+        delay -= distance(ctx, assigner, delay);
+    }
+
+    return delay;
+}
+
+static unsigned delay_calc(struct ir3_sched_ctx *ctx,
+        struct ir3_instruction *instr)
+{
+    unsigned i, delay = 0;
+
+    for (i = 1; i < instr->regs_count; i++) {
+        struct ir3_register *reg = instr->regs[i];
+        if (reg->flags & IR3_REG_SSA) {
+            unsigned d = delay_calc2(ctx, reg->instr,
+                    instr, i - 1);
+            delay = MAX2(delay, d);
+        }
+    }
+
+    return delay;
+}
+
+/* A negative return value (SCHEDULED or DELAYED) signals that an
+ * instruction was handled and we should return back up to the top of
+ * the stack (to block_sched()); a positive value is the number of
+ * delay slots still required:
+ */
+static int trysched(struct ir3_sched_ctx *ctx,
+        struct ir3_instruction *instr)
+{
+    struct ir3_instruction *srcs[ARRAY_SIZE(instr->regs) - 1];
+    struct ir3_instruction *src;
+    unsigned i, delay, nsrcs = 0;
+
+    /* if already scheduled: */
+    if (instr->flags & IR3_INSTR_MARK)
+        return 0;
+
+    /* figure out our src's: */
+    for (i = 1; i < instr->regs_count; i++) {
+        struct ir3_register *reg = instr->regs[i];
+        if (reg->flags & IR3_REG_SSA)
+            srcs[nsrcs++] = reg->instr;
+    }
+
+    /* for each src register in sorted order:
+     */
+    delay = 0;
+    while ((src = deepest(srcs, nsrcs))) {
+        delay = trysched(ctx, src);
+        if (delay)
+            return delay;
+    }
+
+    /* all our dependencies are scheduled, figure out if
+     * we have enough delay slots to schedule ourself:
+     */
+    delay = delay_calc(ctx, instr);
+    if (delay)
+        return delay;
+
+    /* if this is a write to address/predicate register, and that
+     * register is currently in use, we need to defer until it is
+     * free:
+     */
+    if (writes_addr(instr) && ctx->addr) {
+        assert(ctx->addr != instr);
+        return DELAYED;
+    }
+    if (writes_pred(instr) && ctx->pred) {
+        assert(ctx->pred != instr);
+        return DELAYED;
+    }
+
+    schedule(ctx, instr, true);
+    return SCHEDULED;
+}
+
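
The arithmetic in delay_calc2() is worth spelling out: the architectural delay
for a producer/consumer pair is reduced by however many instructions the
scheduler has already placed between them, and only the remainder still needs
filling.  A sketch with made-up slot counts (the real per-opcode numbers live
in ir3_delayslots()):

    #include <stdio.h>

    static unsigned effective_delay(unsigned needed, unsigned distance)
    {
        return (distance >= needed) ? 0 : needed - distance;
    }

    int main(void)
    {
        /* e.g. a result needing 3 slots, with 1 instruction in between: */
        printf("%u more slots to fill\n", effective_delay(3, 1));  /* 2 */
        return 0;
    }
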
+static struct ir3_instruction * reverse(struct ir3_instruction *instr)
+{
+    struct ir3_instruction *reversed = NULL;
+    while (instr) {
+        struct ir3_instruction *next = instr->next;
+        instr->next = reversed;
+        reversed = instr;
+        instr = next;
+    }
+    return reversed;
+}
+
+static bool uses_current_addr(struct ir3_sched_ctx *ctx,
+        struct ir3_instruction *instr)
+{
+    unsigned i;
+    for (i = 1; i < instr->regs_count; i++) {
+        struct ir3_register *reg = instr->regs[i];
+        if (reg->flags & IR3_REG_SSA) {
+            if (is_addr(reg->instr)) {
+                struct ir3_instruction *addr;
+                addr = reg->instr->regs[1]->instr; /* the mova */
+                if (ctx->addr == addr)
+                    return true;
+            }
+        }
+    }
+    return false;
+}
+
+static bool uses_current_pred(struct ir3_sched_ctx *ctx,
+        struct ir3_instruction *instr)
+{
+    unsigned i;
+    for (i = 1; i < instr->regs_count; i++) {
+        struct ir3_register *reg = instr->regs[i];
+        if ((reg->flags & IR3_REG_SSA) && (ctx->pred == reg->instr))
+            return true;
+    }
+    return false;
+}
+
+/* when we encounter an instruction that writes to the address register
+ * while it is in use, we delay that instruction and try to schedule all
+ * other instructions using the current address register first:
+ */
+static int block_sched_undelayed(struct ir3_sched_ctx *ctx,
+        struct ir3_block *block)
+{
+    struct ir3_instruction *instr = block->head;
+    bool addr_in_use = false;
+    bool pred_in_use = false;
+    unsigned cnt = ~0;
+
+    while (instr) {
+        struct ir3_instruction *next = instr->next;
+        bool addr = uses_current_addr(ctx, instr);
+        bool pred = uses_current_pred(ctx, instr);
+
+        if (addr || pred) {
+            int ret = trysched(ctx, instr);
+            if (ret == SCHEDULED)
+                cnt = 0;
+            else if (ret > 0)
+                cnt = MIN2(cnt, ret);
+            if (addr)
+                addr_in_use = true;
+            if (pred)
+                pred_in_use = true;
+        }
+
+        instr = next;
+    }
+
+    if (!addr_in_use)
+        ctx->addr = NULL;
+
+    if (!pred_in_use)
+        ctx->pred = NULL;
+
+    return cnt;
+}
+
+static void block_sched(struct ir3_sched_ctx *ctx, struct ir3_block *block)
+{
+    struct ir3_instruction *instr;
+
+    /* schedule all the shader inputs (meta-instr) first so that
+     * the RA step sees that the input registers contain a value
+     * from the start of the shader:
+     */
+    if (!block->parent) {
+        unsigned i;
+        for (i = 0; i < block->ninputs; i++) {
+            struct ir3_instruction *in = block->inputs[i];
+            if (in)
+                schedule(ctx, in, true);
+        }
+    }
+
+    while ((instr = block->head)) {
+        /* NOTE: always grab next *before* trysched(), in case the
+         * instruction is actually scheduled (and therefore moved
+         * from depth list into scheduled list)
+         */
+        struct ir3_instruction *next = instr->next;
+        int cnt = trysched(ctx, instr);
+
+        if (cnt == DELAYED)
+            cnt = block_sched_undelayed(ctx, block);
+
+        /* -1 is signal to return up stack, but to us means same as 0: */
+        cnt = MAX2(0, cnt);
+        cnt += ctx->cnt;
+        instr = next;
+
+        /* if the deepest remaining instruction cannot be scheduled, try
+         * increasingly shallower instructions until the needed number
+         * of delay slots is filled:
+         */
+        while (instr && (cnt > ctx->cnt)) {
+            next = instr->next;
+            trysched(ctx, instr);
+            instr = next;
+        }
+
+        /* and if we run out of instructions that can be scheduled,
+         * then it is time for nops:
+         */
+        while (cnt > ctx->cnt)
+            schedule(ctx, ir3_instr_create(block, 0, OPC_NOP), false);
+    }
+
+    /* at this point, the scheduled list is in reverse order, so fix that: */
+    block->head = reverse(ctx->scheduled);
+}
+
+void ir3_block_sched(struct ir3_block *block)
+{
+    struct ir3_sched_ctx ctx = {0};
+    ir3_clear_mark(block->shader);
+    block_sched(&ctx, block);
+}
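
Note the list discipline here: schedule() pushes each chosen instruction onto
the front of ctx->scheduled, so the finished list is in reverse program order,
and reverse() restores it in one O(n) pass.  The same idiom on a generic
singly linked list (hypothetical node type, not ir3's):

    #include <stdio.h>

    struct node { int id; struct node *next; };

    static struct node * reverse(struct node *n)
    {
        struct node *out = NULL;
        while (n) {
            struct node *next = n->next;
            n->next = out;     /* push onto the reversed list */
            out = n;
            n = next;
        }
        return out;
    }

    int main(void)
    {
        struct node c = { 3, NULL }, b = { 2, &c }, a = { 1, &b };
        struct node *n;
        for (n = reverse(&a); n; n = n->next)
            printf("%d\n", n->id);   /* 3 2 1 */
        return 0;
    }
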
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_shader.c b/src/gallium/drivers/freedreno/ir3/ir3_shader.c
new file mode 100644
index 00000000000..ddf99dbc46e
--- /dev/null
+++ b/src/gallium/drivers/freedreno/ir3/ir3_shader.c
@@ -0,0 +1,211 @@
+/* -*- mode: C; c-file-style: "k&r"; tab-width 4; indent-tabs-mode: t; -*- */
+
+/*
+ * Copyright (C) 2014 Rob Clark <[email protected]>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Authors:
+ *    Rob Clark <[email protected]>
+ */
+
+#include "pipe/p_state.h"
+#include "util/u_string.h"
+#include "util/u_memory.h"
+#include "util/u_inlines.h"
+#include "util/u_format.h"
+#include "tgsi/tgsi_dump.h"
+#include "tgsi/tgsi_parse.h"
+
+#include "freedreno_context.h"
+#include "freedreno_lowering.h"
+#include "freedreno_util.h"
+
+#include "ir3_shader.h"
+#include "ir3_compiler.h"
+
+
+static void
+delete_variant(struct ir3_shader_variant *v)
+{
+    ir3_destroy(v->ir);
+    fd_bo_del(v->bo);
+    free(v);
+}
+
+static void
+assemble_variant(struct ir3_shader_variant *v)
+{
+    struct fd_context *ctx = fd_context(v->shader->pctx);
+    uint32_t sz, *bin;
+
+    bin = ir3_assemble(v->ir, &v->info);
+    sz = v->info.sizedwords * 4;
+
+    v->bo = fd_bo_new(ctx->dev, sz,
+            DRM_FREEDRENO_GEM_CACHE_WCOMBINE |
+            DRM_FREEDRENO_GEM_TYPE_KMEM);
+
+    memcpy(fd_bo_map(v->bo), bin, sz);
+
+    free(bin);
+
+    v->instrlen = v->info.sizedwords / 8;
+    v->constlen = v->info.max_const + 1;
+}
+
+/* for vertex shader, the inputs are loaded into registers before the shader
+ * is executed, so max_regs from the shader instructions might not properly
+ * reflect the # of registers actually used:
+ */
+static void
+fixup_vp_regfootprint(struct ir3_shader_variant *v)
+{
+    unsigned i;
+    for (i = 0; i < v->inputs_count; i++) {
+        if (v->inputs[i].compmask) {
+            uint32_t regid = (v->inputs[i].regid + 3) >> 2;
+            v->info.max_reg = MAX2(v->info.max_reg, regid);
+        }
+    }
+    for (i = 0; i < v->outputs_count; i++) {
+        uint32_t regid = (v->outputs[i].regid + 3) >> 2;
+        v->info.max_reg = MAX2(v->info.max_reg, regid);
+    }
+}
+
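
The footprint math above assumes each input/output occupies a full vec4
starting at its scalar regid: the last scalar it could touch is regid+3, so
(regid + 3) >> 2 is the vec4 register holding that last component.  A quick
standalone check of the arithmetic:

    #include <stdio.h>

    int main(void)
    {
        unsigned regid;
        for (regid = 0; regid < 8; regid++)
            printf("vec4 starting at r%u.%c -> footprint up to vec4 r%u\n",
                    regid >> 2, "xyzw"[regid & 3], (regid + 3) >> 2);
        return 0;
    }
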
+static struct ir3_shader_variant *
+create_variant(struct ir3_shader *shader, struct ir3_shader_key key)
+{
+    struct ir3_shader_variant *v = CALLOC_STRUCT(ir3_shader_variant);
+    const struct tgsi_token *tokens = shader->tokens;
+    int ret;
+
+    if (!v)
+        return NULL;
+
+    v->shader = shader;
+    v->key = key;
+    v->type = shader->type;
+
+    if (fd_mesa_debug & FD_DBG_DISASM) {
+        DBG("dump tgsi: type=%d, k={bp=%u,cts=%u,hp=%u}", shader->type,
+            key.binning_pass, key.color_two_side, key.half_precision);
+        tgsi_dump(tokens, 0);
+    }
+
+    if (!(fd_mesa_debug & FD_DBG_NOOPT)) {
+        ret = ir3_compile_shader(v, tokens, key);
+        if (ret) {
+            debug_error("new compiler failed, trying fallback!");
+
+            v->inputs_count = 0;
+            v->outputs_count = 0;
+            v->total_in = 0;
+            v->has_samp = false;
+            v->immediates_count = 0;
+        }
+    } else {
+        ret = -1;  /* force fallback to old compiler */
+    }
+
+    if (ret)
+        ret = ir3_compile_shader_old(v, tokens, key);
+
+    if (ret) {
+        debug_error("compile failed!");
+        goto fail;
+    }
+
+    assemble_variant(v);
+    if (!v->bo) {
+        debug_error("assemble failed!");
+        goto fail;
+    }
+
+    if (shader->type == SHADER_VERTEX)
+        fixup_vp_regfootprint(v);
+
+    if (fd_mesa_debug & FD_DBG_DISASM) {
+        DBG("disassemble: type=%d, k={bp=%u,cts=%u,hp=%u}", v->type,
+            key.binning_pass, key.color_two_side, key.half_precision);
+        disasm_a3xx(fd_bo_map(v->bo), v->info.sizedwords, 0, v->type);
+    }
+
+    return v;
+
+fail:
+    delete_variant(v);
+    return NULL;
+}
+
+struct ir3_shader_variant *
+ir3_shader_variant(struct ir3_shader *shader, struct ir3_shader_key key)
+{
+    struct ir3_shader_variant *v;
+
+    /* some shader key values only apply to vertex or frag shader,
+     * so normalize the key to avoid constructing multiple identical
+     * variants:
+     */
+    if (shader->type == SHADER_FRAGMENT) {
+        key.binning_pass = false;
+    }
+    if (shader->type == SHADER_VERTEX) {
+        key.color_two_side = false;
+        key.half_precision = false;
+    }
+
+    for (v = shader->variants; v; v = v->next)
+        if (!memcmp(&key, &v->key, sizeof(key)))
+            return v;
+
+    /* compile new variant if it doesn't exist already (note that
+     * create_variant() can fail, so check before linking it in):
+     */
+    v = create_variant(shader, key);
+    if (v) {
+        v->next = shader->variants;
+        shader->variants = v;
+    }
+
+    return v;
+}
+
+
+void
+ir3_shader_destroy(struct ir3_shader *shader)
+{
+    struct ir3_shader_variant *v, *t;
+    for (v = shader->variants; v; ) {
+        t = v;
+        v = v->next;
+        delete_variant(t);
+    }
+    free((void *)shader->tokens);
+    free(shader);
+}
+
+struct ir3_shader *
+ir3_shader_create(struct pipe_context *pctx, const struct tgsi_token *tokens,
+        enum shader_t type)
+{
+    struct ir3_shader *shader = CALLOC_STRUCT(ir3_shader);
+    shader->pctx = pctx;
+    shader->type = type;
+    shader->tokens = tgsi_dup_tokens(tokens);
+    return shader;
+}
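
ir3_shader_variant() above is a small keyed cache: normalize the key, then
memcmp() it against each existing variant before compiling a new one.  The
normalization step is what makes the memcmp() valid -- bits that don't apply
to the shader stage must be zeroed, or identical variants would be compiled
twice.  A stripped-down sketch with a hypothetical two-flag key (memset()
used so the comparison never sees stale bits):

    #include <stdio.h>
    #include <string.h>

    struct key { unsigned binning_pass : 1; unsigned half_precision : 1; };
    struct variant { struct key key; struct variant *next; };

    static struct variant * lookup(struct variant *list, struct key k)
    {
        for (; list; list = list->next)
            if (memcmp(&k, &list->key, sizeof(k)) == 0)
                return list;
        return NULL;
    }

    int main(void)
    {
        struct variant v0, v1;
        struct key k;

        memset(&v0, 0, sizeof(v0));   /* variant with the default key */
        memset(&v1, 0, sizeof(v1));
        v1.key.binning_pass = 1;      /* binning-pass variant */
        v0.next = &v1;

        memset(&k, 0, sizeof(k));
        k.binning_pass = 1;

        printf("cache hit: %s\n", lookup(&v0, k) ? "yes" : "no");  /* yes */
        return 0;
    }
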
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_shader.h b/src/gallium/drivers/freedreno/ir3/ir3_shader.h
new file mode 100644
index 00000000000..1a91fcbcb13
--- /dev/null
+++ b/src/gallium/drivers/freedreno/ir3/ir3_shader.h
@@ -0,0 +1,163 @@
+/* -*- mode: C; c-file-style: "k&r"; tab-width 4; indent-tabs-mode: t; -*- */
+
+/*
+ * Copyright (C) 2014 Rob Clark <[email protected]>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Authors:
+ *    Rob Clark <[email protected]>
+ */
+
+#ifndef IR3_SHADER_H_
+#define IR3_SHADER_H_
+
+#include "ir3.h"
+#include "disasm.h"
+
+typedef uint16_t ir3_semantic;  /* semantic name + index */
+static inline ir3_semantic
+ir3_semantic_name(uint8_t name, uint16_t index)
+{
+    return (name << 8) | (index & 0xff);
+}
+
+static inline uint8_t sem2name(ir3_semantic sem)
+{
+    return sem >> 8;
+}
+
+static inline uint16_t sem2idx(ir3_semantic sem)
+{
+    return sem & 0xff;
+}
+
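
The helpers above pack a semantic name and index into 16 bits, name in the
high byte and index in the low byte -- which also means the index is limited
to 255.  A round-trip check of the packing (a standalone copy of the same
shifts, not the header itself):

    #include <assert.h>
    #include <stdint.h>

    typedef uint16_t ir3_semantic;

    static ir3_semantic pack(uint8_t name, uint16_t index)
    {
        return (name << 8) | (index & 0xff);
    }

    int main(void)
    {
        ir3_semantic sem = pack(5 /* e.g. a GENERIC name */, 3);
        assert((sem >> 8) == 5);     /* what sem2name() returns */
        assert((sem & 0xff) == 3);   /* what sem2idx() returns */
        return 0;
    }
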
+/* Configuration key used to identify a shader variant.. different
+ * shader variants can be used to implement features not supported
+ * in hw (two sided color), binning-pass vertex shader, etc.
+ */
+struct ir3_shader_key {
+    /* vertex shader variant parameters: */
+    unsigned binning_pass : 1;
+
+    /* fragment shader variant parameters: */
+    unsigned color_two_side : 1;
+    unsigned half_precision : 1;
+};
+
+struct ir3_shader_variant {
+    struct fd_bo *bo;
+
+    struct ir3_shader_key key;
+
+    struct ir3_info info;
+    struct ir3 *ir;
+
+    /* the instructions length is in units of instruction groups
+     * (4 instructions, 8 dwords):
+     */
+    unsigned instrlen;
+
+    /* the constants length is in units of vec4's, and is the sum of
+     * the uniforms and the built-in compiler constants
+     */
+    unsigned constlen;
+
+    /* About Linkage:
+     *   + Let the frag shader determine the position/compmask for the
+     *     varyings, since it is the place where we know if the varying
+     *     is actually used, and if so, which components are used.  So
+     *     what the hw calls "outloc" is taken from the "inloc" of the
+     *     frag shader.
+     *   + From the vert shader, we only need the output regid
+     */
+
+    /* for frag shader, pos_regid holds the frag_pos, ie. what is passed
+     * to bary.f instructions
+     */
+    uint8_t pos_regid;
+    bool frag_coord, frag_face;
+
+    /* varyings/outputs: */
+    unsigned outputs_count;
+    struct {
+        ir3_semantic semantic;
+        uint8_t regid;
+    } outputs[16 + 2];  /* +POSITION +PSIZE */
+    bool writes_pos, writes_psize;
+
+    /* vertices/inputs: */
+    unsigned inputs_count;
+    struct {
+        ir3_semantic semantic;
+        uint8_t regid;
+        uint8_t compmask;
+        uint8_t ncomp;
+        /* in theory inloc of fs should match outloc of vs: */
+        uint8_t inloc;
+        uint8_t bary;
+    } inputs[16 + 2];  /* +POSITION +FACE */
+
+    unsigned total_in;  /* sum of inputs (scalar) */
+
+    /* do we have one or more texture sample instructions: */
+    bool has_samp;
+
+    /* const reg # of first immediate, ie. 1 == c1
+     * (not regid, because TGSI thinks in terms of vec4 registers,
+     * not scalar registers)
+     */
+    unsigned first_immediate;
+    unsigned immediates_count;
+    struct {
+        uint32_t val[4];
+    } immediates[64];
+
+    /* shader variants form a linked list: */
+    struct ir3_shader_variant *next;
+
+    /* replicated here to avoid passing extra ptrs everywhere: */
+    enum shader_t type;
+    struct ir3_shader *shader;
+};
+
+struct ir3_shader {
+    enum shader_t type;
+
+    struct pipe_context *pctx;
+    const struct tgsi_token *tokens;
+
+    struct ir3_shader_variant *variants;
+
+    /* so far, only used for blit_prog shader.. values for
+     * VPC_VARYING_INTERP[i].MODE and VPC_VARYING_PS_REPL[i].MODE
+     */
+    uint32_t vinterp[4], vpsrepl[4];
+};
+
+
+struct ir3_shader * ir3_shader_create(struct pipe_context *pctx,
+        const struct tgsi_token *tokens, enum shader_t type);
+void ir3_shader_destroy(struct ir3_shader *shader);
+
+struct ir3_shader_variant * ir3_shader_variant(struct ir3_shader *shader,
+        struct ir3_shader_key key);
+
+#endif /* IR3_SHADER_H_ */
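
The instrlen/constlen fields in the variant are in hardware units rather than
dwords: groups of 4 instructions (8 dwords), and vec4 constant slots.  The
conversions, as done by assemble_variant() in ir3_shader.c above (note the
integer division; sizes here are made up for illustration):

    #include <stdio.h>

    int main(void)
    {
        unsigned sizedwords = 96;  /* 48 64-bit instructions */
        unsigned max_const = 7;    /* highest vec4 const referenced */

        printf("instrlen = %u groups\n", sizedwords / 8);  /* 12 */
        printf("constlen = %u vec4s\n", max_const + 1);    /* 8 */
        return 0;
    }
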
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_visitor.h b/src/gallium/drivers/freedreno/ir3/ir3_visitor.h
new file mode 100644
index 00000000000..1c60d1620ca
--- /dev/null
+++ b/src/gallium/drivers/freedreno/ir3/ir3_visitor.h
@@ -0,0 +1,154 @@
+/* -*- mode: C; c-file-style: "k&r"; tab-width 4; indent-tabs-mode: t; -*- */
+
+/*
+ * Copyright (C) 2014 Rob Clark <[email protected]>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Authors:
+ *    Rob Clark <[email protected]>
+ */
+
+#ifndef IR3_VISITOR_H_
+#define IR3_VISITOR_H_
+
+/**
+ * Visitor which follows dst to src relationships between instructions,
+ * first visiting the dst (writer) instruction, followed by the src
+ * (reader) instruction(s).
+ *
+ * TODO maybe we want multiple different visitors to walk the
+ * graph in different ways?
+ */
+
+struct ir3_visitor;
+
+typedef void (*ir3_visit_instr_func)(struct ir3_visitor *v,
+        struct ir3_instruction *instr);
+
+typedef void (*ir3_visit_reg_func)(struct ir3_visitor *v,
+        struct ir3_instruction *instr, struct ir3_register *reg);
+
+struct ir3_visitor_funcs {
+    ir3_visit_instr_func instr;  // TODO do we need??
+
+    ir3_visit_reg_func dst_shader_input;
+    ir3_visit_reg_func dst_block_input;
+    ir3_visit_reg_func dst_fanout;
+    ir3_visit_reg_func dst_fanin;
+    ir3_visit_reg_func dst;
+
+    ir3_visit_reg_func src_block_input;
+    ir3_visit_reg_func src_fanout;
+    ir3_visit_reg_func src_fanin;
+    ir3_visit_reg_func src;
+};
+
+struct ir3_visitor {
+    const struct ir3_visitor_funcs *funcs;
+    bool error;
+};
+
+#include "util/u_debug.h"
+
+static void visit_instr_dst(struct ir3_visitor *v,
+        struct ir3_instruction *instr)
+{
+    struct ir3_register *reg = instr->regs[0];
+
+    if (is_meta(instr)) {
+        switch (instr->opc) {
+        case OPC_META_INPUT:
+            if (instr->regs_count == 1)
+                v->funcs->dst_shader_input(v, instr, reg);
+            else
+                v->funcs->dst_block_input(v, instr, reg);
+            return;
+        case OPC_META_FO:
+            v->funcs->dst_fanout(v, instr, reg);
+            return;
+        case OPC_META_FI:
+            v->funcs->dst_fanin(v, instr, reg);
+            return;
+        default:
+            break;
+        }
+    }
+
+    v->funcs->dst(v, instr, reg);
+}
+
+static void visit_instr_src(struct ir3_visitor *v,
+        struct ir3_instruction *instr, struct ir3_register *reg)
+{
+    if (is_meta(instr)) {
+        switch (instr->opc) {
+        case OPC_META_INPUT:
+            /* shader-input does not have a src, only block input: */
+            debug_assert(instr->regs_count == 2);
+            v->funcs->src_block_input(v, instr, reg);
+            return;
+        case OPC_META_FO:
+            v->funcs->src_fanout(v, instr, reg);
+            return;
+        case OPC_META_FI:
+            v->funcs->src_fanin(v, instr, reg);
+            return;
+        default:
+            break;
+        }
+    }
+
+    v->funcs->src(v, instr, reg);
+}
+
+static void ir3_visit_instr(struct ir3_visitor *v,
+        struct ir3_instruction *instr)
+{
+    struct ir3_instruction *n;
+
+    /* visit instruction that assigns value: */
+    if (instr->regs_count > 0)
+        visit_instr_dst(v, instr);
+
+    /* and any following instructions which read that value: */
+    n = instr->next;
+    while (n && !v->error) {
+        unsigned i;
+
+        for (i = 1; i < n->regs_count; i++) {
+            struct ir3_register *reg = n->regs[i];
+            if ((reg->flags & IR3_REG_SSA) && (reg->instr == instr))
+                visit_instr_src(v, n, reg);
+        }
+
+        n = n->next;
+    }
+}
+
+static void ir3_visit_reg(struct ir3_visitor *v,
+        struct ir3_instruction *instr, struct ir3_register *reg)
+{
+    /* no-op */
+}
+
+#endif /* IR3_VISITOR_H_ */
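
The visitor is used by embedding struct ir3_visitor as the first member of a
larger context struct and downcasting in the callbacks, the way ir3_ra.c
builds its ra_assign_visitor.  A minimal standalone version of that shape
(toy types, a single callback instead of the full funcs table):

    #include <stdio.h>

    struct visitor;
    struct visitor_funcs { void (*reg)(struct visitor *v, int reg); };
    struct visitor { const struct visitor_funcs *funcs; };

    struct counting_visitor {
        struct visitor base;   /* first member, so the downcast is valid */
        int count;
    };

    static void count_reg(struct visitor *v, int reg)
    {
        struct counting_visitor *cv = (struct counting_visitor *)v;
        cv->count++;
        (void)reg;
    }

    static const struct visitor_funcs counting_funcs = { .reg = count_reg };

    int main(void)
    {
        struct counting_visitor cv = { { &counting_funcs }, 0 };
        cv.base.funcs->reg(&cv.base, 5);
        cv.base.funcs->reg(&cv.base, 7);
        printf("visited %d regs\n", cv.count);   /* 2 */
        return 0;
    }
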