Diffstat (limited to 'src/freedreno')
24 files changed, 13731 insertions, 1 deletion
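The centerpiece of this import is the ir3 instruction-set definition (ir3/instr-a3xx.h) and disassembler (ir3/disasm-a3xx.c). Every a3xx+ instruction occupies two dwords, with the top three bits of the second dword selecting the instruction category, and disasm_a3xx() decodes a buffer two dwords at a time. A minimal sketch of driving the disassembler, assuming only the disasm_a3xx() prototype declared in this patch (dump_shader and its buffer are hypothetical):

#include <stdio.h>
#include <stdint.h>

#include "instr-a3xx.h"   /* declares disasm_a3xx() */

/* Dump raw shader dwords.  disasm_a3xx() consumes two dwords per
 * instruction (it asserts that sizedwords is even), and level selects
 * the indentation prefix from its internal levels[] table (0 = none).
 */
static void dump_shader(uint32_t *dwords, int sizedwords)
{
	disasm_a3xx(dwords, sizedwords, 0, stdout);
}
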
diff --git a/src/freedreno/Makefile.am b/src/freedreno/Makefile.am index 9ddc3c0ad35..8f027e34f8a 100644 --- a/src/freedreno/Makefile.am +++ b/src/freedreno/Makefile.am @@ -45,7 +45,8 @@ TESTS = BUILT_SOURCES = CLEANFILES = EXTRA_DIST = \ - drm/meson.build + drm/meson.build \ + ir3/meson.build MKDIR_GEN = $(AM_V_at)$(MKDIR_P) $(@D) PYTHON_GEN = $(AM_V_GEN)$(PYTHON) $(PYTHON_FLAGS) @@ -57,3 +58,19 @@ noinst_LTLIBRARIES += libfreedreno_drm.la libfreedreno_drm_la_SOURCES = $(drm_SOURCES) libfreedreno_drm_la_CFLAGS = $(VALGRIND_CFLAGS) $(LIBDRM_CFLAGS) +noinst_LTLIBRARIES += libfreedreno_ir3.la + +libfreedreno_ir3_la_SOURCES = $(ir3_SOURCES) $(ir3_GENERATED_FILES) +libfreedreno_ir3_la_CFLAGS = \ + -I$(top_srcdir)/src/freedreno/ir3 \ + -I$(top_builddir)/src/compiler/nir \ + -I$(top_srcdir)/src/compiler/nir +libfreedreno_ir3_la_LIBADD = \ + $(top_builddir)/src/compiler/nir/libnir.la \ + $(top_builddir)/src/util/libmesautil.la + +MKDIR_GEN = $(AM_V_at)$(MKDIR_P) $(@D) +ir3/ir3_nir_trig.c: ir3/ir3_nir_trig.py $(top_srcdir)/src/compiler/nir/nir_algebraic.py + $(MKDIR_GEN) + $(AM_V_GEN) $(PYTHON) $(PYTHON_FLAGS) $(srcdir)/ir3/ir3_nir_trig.py -p $(top_srcdir)/src/compiler/nir > $@ || ($(RM) $@; false) + diff --git a/src/freedreno/Makefile.sources b/src/freedreno/Makefile.sources index 06a1a99b9e2..1df5e6250b5 100644 --- a/src/freedreno/Makefile.sources +++ b/src/freedreno/Makefile.sources @@ -15,3 +15,27 @@ drm_SOURCES := \ drm/msm_drm.h \ drm/msm_ringbuffer.c +ir3_SOURCES := \ + ir3/disasm-a3xx.c \ + ir3/instr-a3xx.h \ + ir3/ir3.c \ + ir3/ir3_compiler.c \ + ir3/ir3_compiler.h \ + ir3/ir3_compiler_nir.c \ + ir3/ir3_cp.c \ + ir3/ir3_depth.c \ + ir3/ir3_group.c \ + ir3/ir3.h \ + ir3/ir3_legalize.c \ + ir3/ir3_nir.c \ + ir3/ir3_nir.h \ + ir3/ir3_nir_lower_tg4_to_tex.c \ + ir3/ir3_print.c \ + ir3/ir3_ra.c \ + ir3/ir3_sched.c \ + ir3/ir3_shader.c \ + ir3/ir3_shader.h + +ir3_GENERATED_FILES := \ + ir3/ir3_nir_trig.c + diff --git a/src/freedreno/ir3/disasm-a3xx.c b/src/freedreno/ir3/disasm-a3xx.c new file mode 100644 index 00000000000..4cf45ce9227 --- /dev/null +++ b/src/freedreno/ir3/disasm-a3xx.c @@ -0,0 +1,1038 @@ +/* + * Copyright (c) 2013 Rob Clark <[email protected]> + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE.
+ */ + +#include <stdio.h> +#include <stdlib.h> +#include <stdint.h> +#include <stdbool.h> +#include <string.h> +#include <assert.h> + +#include <util/u_debug.h> + +#include "instr-a3xx.h" + +/* bitmask of debug flags */ +enum debug_t { + PRINT_RAW = 0x1, /* dump raw hexdump */ + PRINT_VERBOSE = 0x2, +}; + +static enum debug_t debug; + +#define printf debug_printf + +static const char *levels[] = { + "", + "\t", + "\t\t", + "\t\t\t", + "\t\t\t\t", + "\t\t\t\t\t", + "\t\t\t\t\t\t", + "\t\t\t\t\t\t\t", + "\t\t\t\t\t\t\t\t", + "\t\t\t\t\t\t\t\t\t", + "x", + "x", + "x", + "x", + "x", + "x", +}; + +static const char *component = "xyzw"; + +static const char *type[] = { + [TYPE_F16] = "f16", + [TYPE_F32] = "f32", + [TYPE_U16] = "u16", + [TYPE_U32] = "u32", + [TYPE_S16] = "s16", + [TYPE_S32] = "s32", + [TYPE_U8] = "u8", + [TYPE_S8] = "s8", +}; + +struct disasm_ctx { + FILE *out; + int level; + + /* current instruction repeat flag: */ + unsigned repeat; +}; + +static void print_reg(struct disasm_ctx *ctx, reg_t reg, bool full, bool r, + bool c, bool im, bool neg, bool abs, bool addr_rel) +{ + const char type = c ? 'c' : 'r'; + + // XXX I prefer - and || for neg/abs, but preserving format used + // by libllvm-a3xx for easy diffing.. + + if (abs && neg) + fprintf(ctx->out, "(absneg)"); + else if (neg) + fprintf(ctx->out, "(neg)"); + else if (abs) + fprintf(ctx->out, "(abs)"); + + if (r) + fprintf(ctx->out, "(r)"); + + if (im) { + fprintf(ctx->out, "%d", reg.iim_val); + } else if (addr_rel) { + /* I would just use %+d but trying to make it diff'able with + * libllvm-a3xx... + */ + if (reg.iim_val < 0) + fprintf(ctx->out, "%s%c<a0.x - %d>", full ? "" : "h", type, -reg.iim_val); + else if (reg.iim_val > 0) + fprintf(ctx->out, "%s%c<a0.x + %d>", full ? "" : "h", type, reg.iim_val); + else + fprintf(ctx->out, "%s%c<a0.x>", full ? "" : "h", type); + } else if ((reg.num == REG_A0) && !c) { + fprintf(ctx->out, "a0.%c", component[reg.comp]); + } else if ((reg.num == REG_P0) && !c) { + fprintf(ctx->out, "p0.%c", component[reg.comp]); + } else { + fprintf(ctx->out, "%s%c%d.%c", full ? "" : "h", type, reg.num & 0x3f, component[reg.comp]); + } +} + + +static void print_reg_dst(struct disasm_ctx *ctx, reg_t reg, bool full, bool addr_rel) +{ + print_reg(ctx, reg, full, false, false, false, false, false, addr_rel); +} + +static void print_reg_src(struct disasm_ctx *ctx, reg_t reg, bool full, bool r, + bool c, bool im, bool neg, bool abs, bool addr_rel) +{ + print_reg(ctx, reg, full, r, c, im, neg, abs, addr_rel); +} + +/* TODO switch to using reginfo struct everywhere, since more readable + * than passing a bunch of bools to print_reg_src + */ + +struct reginfo { + reg_t reg; + bool full; + bool r; + bool c; + bool im; + bool neg; + bool abs; + bool addr_rel; +}; + +static void print_src(struct disasm_ctx *ctx, struct reginfo *info) +{ + print_reg_src(ctx, info->reg, info->full, info->r, info->c, info->im, + info->neg, info->abs, info->addr_rel); +} + +//static void print_dst(struct disasm_ctx *ctx, struct reginfo *info) +//{ +// print_reg_dst(ctx, info->reg, info->full, info->addr_rel); +//} + +static void print_instr_cat0(struct disasm_ctx *ctx, instr_t *instr) +{ + instr_cat0_t *cat0 = &instr->cat0; + + switch (cat0->opc) { + case OPC_KILL: + fprintf(ctx->out, " %sp0.%c", cat0->inv ? "!" : "", + component[cat0->comp]); + break; + case OPC_BR: + fprintf(ctx->out, " %sp0.%c, #%d", cat0->inv ? "!" 
: "", + component[cat0->comp], cat0->a3xx.immed); + break; + case OPC_JUMP: + case OPC_CALL: + fprintf(ctx->out, " #%d", cat0->a3xx.immed); + break; + } + + if ((debug & PRINT_VERBOSE) && (cat0->dummy2|cat0->dummy3|cat0->dummy4)) + fprintf(ctx->out, "\t{0: %x,%x,%x}", cat0->dummy2, cat0->dummy3, cat0->dummy4); +} + +static void print_instr_cat1(struct disasm_ctx *ctx, instr_t *instr) +{ + instr_cat1_t *cat1 = &instr->cat1; + + if (cat1->ul) + fprintf(ctx->out, "(ul)"); + + if (cat1->src_type == cat1->dst_type) { + if ((cat1->src_type == TYPE_S16) && (((reg_t)cat1->dst).num == REG_A0)) { + /* special case (nmemonic?): */ + fprintf(ctx->out, "mova"); + } else { + fprintf(ctx->out, "mov.%s%s", type[cat1->src_type], type[cat1->dst_type]); + } + } else { + fprintf(ctx->out, "cov.%s%s", type[cat1->src_type], type[cat1->dst_type]); + } + + fprintf(ctx->out, " "); + + if (cat1->even) + fprintf(ctx->out, "(even)"); + + if (cat1->pos_inf) + fprintf(ctx->out, "(pos_infinity)"); + + print_reg_dst(ctx, (reg_t)(cat1->dst), type_size(cat1->dst_type) == 32, + cat1->dst_rel); + + fprintf(ctx->out, ", "); + + /* ugg, have to special case this.. vs print_reg().. */ + if (cat1->src_im) { + if (type_float(cat1->src_type)) + fprintf(ctx->out, "(%f)", cat1->fim_val); + else if (type_uint(cat1->src_type)) + fprintf(ctx->out, "0x%08x", cat1->uim_val); + else + fprintf(ctx->out, "%d", cat1->iim_val); + } else if (cat1->src_rel && !cat1->src_c) { + /* I would just use %+d but trying to make it diff'able with + * libllvm-a3xx... + */ + char type = cat1->src_rel_c ? 'c' : 'r'; + if (cat1->off < 0) + fprintf(ctx->out, "%c<a0.x - %d>", type, -cat1->off); + else if (cat1->off > 0) + fprintf(ctx->out, "%c<a0.x + %d>", type, cat1->off); + else + fprintf(ctx->out, "%c<a0.x>", type); + } else { + print_reg_src(ctx, (reg_t)(cat1->src), type_size(cat1->src_type) == 32, + cat1->src_r, cat1->src_c, cat1->src_im, false, false, false); + } + + if ((debug & PRINT_VERBOSE) && (cat1->must_be_0)) + fprintf(ctx->out, "\t{1: %x}", cat1->must_be_0); +} + +static void print_instr_cat2(struct disasm_ctx *ctx, instr_t *instr) +{ + instr_cat2_t *cat2 = &instr->cat2; + static const char *cond[] = { + "lt", + "le", + "gt", + "ge", + "eq", + "ne", + "?6?", + }; + + switch (_OPC(2, cat2->opc)) { + case OPC_CMPS_F: + case OPC_CMPS_U: + case OPC_CMPS_S: + case OPC_CMPV_F: + case OPC_CMPV_U: + case OPC_CMPV_S: + fprintf(ctx->out, ".%s", cond[cat2->cond]); + break; + } + + fprintf(ctx->out, " "); + if (cat2->ei) + fprintf(ctx->out, "(ei)"); + print_reg_dst(ctx, (reg_t)(cat2->dst), cat2->full ^ cat2->dst_half, false); + fprintf(ctx->out, ", "); + + if (cat2->c1.src1_c) { + print_reg_src(ctx, (reg_t)(cat2->c1.src1), cat2->full, cat2->src1_r, + cat2->c1.src1_c, cat2->src1_im, cat2->src1_neg, + cat2->src1_abs, false); + } else if (cat2->rel1.src1_rel) { + print_reg_src(ctx, (reg_t)(cat2->rel1.src1), cat2->full, cat2->src1_r, + cat2->rel1.src1_c, cat2->src1_im, cat2->src1_neg, + cat2->src1_abs, cat2->rel1.src1_rel); + } else { + print_reg_src(ctx, (reg_t)(cat2->src1), cat2->full, cat2->src1_r, + false, cat2->src1_im, cat2->src1_neg, + cat2->src1_abs, false); + } + + switch (_OPC(2, cat2->opc)) { + case OPC_ABSNEG_F: + case OPC_ABSNEG_S: + case OPC_CLZ_B: + case OPC_CLZ_S: + case OPC_SIGN_F: + case OPC_FLOOR_F: + case OPC_CEIL_F: + case OPC_RNDNE_F: + case OPC_RNDAZ_F: + case OPC_TRUNC_F: + case OPC_NOT_B: + case OPC_BFREV_B: + case OPC_SETRM: + case OPC_CBITS_B: + /* these only have one src reg */ + break; + default: + fprintf(ctx->out, ", "); + if 
(cat2->c2.src2_c) { + print_reg_src(ctx, (reg_t)(cat2->c2.src2), cat2->full, cat2->src2_r, + cat2->c2.src2_c, cat2->src2_im, cat2->src2_neg, + cat2->src2_abs, false); + } else if (cat2->rel2.src2_rel) { + print_reg_src(ctx, (reg_t)(cat2->rel2.src2), cat2->full, cat2->src2_r, + cat2->rel2.src2_c, cat2->src2_im, cat2->src2_neg, + cat2->src2_abs, cat2->rel2.src2_rel); + } else { + print_reg_src(ctx, (reg_t)(cat2->src2), cat2->full, cat2->src2_r, + false, cat2->src2_im, cat2->src2_neg, + cat2->src2_abs, false); + } + break; + } +} + +static void print_instr_cat3(struct disasm_ctx *ctx, instr_t *instr) +{ + instr_cat3_t *cat3 = &instr->cat3; + bool full = instr_cat3_full(cat3); + + fprintf(ctx->out, " "); + print_reg_dst(ctx, (reg_t)(cat3->dst), full ^ cat3->dst_half, false); + fprintf(ctx->out, ", "); + if (cat3->c1.src1_c) { + print_reg_src(ctx, (reg_t)(cat3->c1.src1), full, + cat3->src1_r, cat3->c1.src1_c, false, cat3->src1_neg, + false, false); + } else if (cat3->rel1.src1_rel) { + print_reg_src(ctx, (reg_t)(cat3->rel1.src1), full, + cat3->src1_r, cat3->rel1.src1_c, false, cat3->src1_neg, + false, cat3->rel1.src1_rel); + } else { + print_reg_src(ctx, (reg_t)(cat3->src1), full, + cat3->src1_r, false, false, cat3->src1_neg, + false, false); + } + fprintf(ctx->out, ", "); + print_reg_src(ctx, (reg_t)cat3->src2, full, + cat3->src2_r, cat3->src2_c, false, cat3->src2_neg, + false, false); + fprintf(ctx->out, ", "); + if (cat3->c2.src3_c) { + print_reg_src(ctx, (reg_t)(cat3->c2.src3), full, + cat3->src3_r, cat3->c2.src3_c, false, cat3->src3_neg, + false, false); + } else if (cat3->rel2.src3_rel) { + print_reg_src(ctx, (reg_t)(cat3->rel2.src3), full, + cat3->src3_r, cat3->rel2.src3_c, false, cat3->src3_neg, + false, cat3->rel2.src3_rel); + } else { + print_reg_src(ctx, (reg_t)(cat3->src3), full, + cat3->src3_r, false, false, cat3->src3_neg, + false, false); + } +} + +static void print_instr_cat4(struct disasm_ctx *ctx, instr_t *instr) +{ + instr_cat4_t *cat4 = &instr->cat4; + + fprintf(ctx->out, " "); + print_reg_dst(ctx, (reg_t)(cat4->dst), cat4->full ^ cat4->dst_half, false); + fprintf(ctx->out, ", "); + + if (cat4->c.src_c) { + print_reg_src(ctx, (reg_t)(cat4->c.src), cat4->full, + cat4->src_r, cat4->c.src_c, cat4->src_im, + cat4->src_neg, cat4->src_abs, false); + } else if (cat4->rel.src_rel) { + print_reg_src(ctx, (reg_t)(cat4->rel.src), cat4->full, + cat4->src_r, cat4->rel.src_c, cat4->src_im, + cat4->src_neg, cat4->src_abs, cat4->rel.src_rel); + } else { + print_reg_src(ctx, (reg_t)(cat4->src), cat4->full, + cat4->src_r, false, cat4->src_im, + cat4->src_neg, cat4->src_abs, false); + } + + if ((debug & PRINT_VERBOSE) && (cat4->dummy1|cat4->dummy2)) + fprintf(ctx->out, "\t{4: %x,%x}", cat4->dummy1, cat4->dummy2); +} + +static void print_instr_cat5(struct disasm_ctx *ctx, instr_t *instr) +{ + static const struct { + bool src1, src2, samp, tex; + } info[0x1f] = { + [opc_op(OPC_ISAM)] = { true, false, true, true, }, + [opc_op(OPC_ISAML)] = { true, true, true, true, }, + [opc_op(OPC_ISAMM)] = { true, false, true, true, }, + [opc_op(OPC_SAM)] = { true, false, true, true, }, + [opc_op(OPC_SAMB)] = { true, true, true, true, }, + [opc_op(OPC_SAML)] = { true, true, true, true, }, + [opc_op(OPC_SAMGQ)] = { true, false, true, true, }, + [opc_op(OPC_GETLOD)] = { true, false, true, true, }, + [opc_op(OPC_CONV)] = { true, true, true, true, }, + [opc_op(OPC_CONVM)] = { true, true, true, true, }, + [opc_op(OPC_GETSIZE)] = { true, false, false, true, }, + [opc_op(OPC_GETBUF)] = { false, false, false, true, }, 
+ [opc_op(OPC_GETPOS)] = { true, false, false, true, }, + [opc_op(OPC_GETINFO)] = { false, false, false, true, }, + [opc_op(OPC_DSX)] = { true, false, false, false, }, + [opc_op(OPC_DSY)] = { true, false, false, false, }, + [opc_op(OPC_GATHER4R)] = { true, false, true, true, }, + [opc_op(OPC_GATHER4G)] = { true, false, true, true, }, + [opc_op(OPC_GATHER4B)] = { true, false, true, true, }, + [opc_op(OPC_GATHER4A)] = { true, false, true, true, }, + [opc_op(OPC_SAMGP0)] = { true, false, true, true, }, + [opc_op(OPC_SAMGP1)] = { true, false, true, true, }, + [opc_op(OPC_SAMGP2)] = { true, false, true, true, }, + [opc_op(OPC_SAMGP3)] = { true, false, true, true, }, + [opc_op(OPC_DSXPP_1)] = { true, false, false, false, }, + [opc_op(OPC_DSYPP_1)] = { true, false, false, false, }, + [opc_op(OPC_RGETPOS)] = { false, false, false, false, }, + [opc_op(OPC_RGETINFO)] = { false, false, false, false, }, + }; + instr_cat5_t *cat5 = &instr->cat5; + int i; + + if (cat5->is_3d) fprintf(ctx->out, ".3d"); + if (cat5->is_a) fprintf(ctx->out, ".a"); + if (cat5->is_o) fprintf(ctx->out, ".o"); + if (cat5->is_p) fprintf(ctx->out, ".p"); + if (cat5->is_s) fprintf(ctx->out, ".s"); + if (cat5->is_s2en) fprintf(ctx->out, ".s2en"); + + fprintf(ctx->out, " "); + + switch (_OPC(5, cat5->opc)) { + case OPC_DSXPP_1: + case OPC_DSYPP_1: + break; + default: + fprintf(ctx->out, "(%s)", type[cat5->type]); + break; + } + + fprintf(ctx->out, "("); + for (i = 0; i < 4; i++) + if (cat5->wrmask & (1 << i)) + fprintf(ctx->out, "%c", "xyzw"[i]); + fprintf(ctx->out, ")"); + + print_reg_dst(ctx, (reg_t)(cat5->dst), type_size(cat5->type) == 32, false); + + if (info[cat5->opc].src1) { + fprintf(ctx->out, ", "); + print_reg_src(ctx, (reg_t)(cat5->src1), cat5->full, false, false, false, + false, false, false); + } + + if (cat5->is_s2en) { + fprintf(ctx->out, ", "); + print_reg_src(ctx, (reg_t)(cat5->s2en.src2), cat5->full, false, false, false, + false, false, false); + fprintf(ctx->out, ", "); + print_reg_src(ctx, (reg_t)(cat5->s2en.src3), false, false, false, false, + false, false, false); + } else { + if (cat5->is_o || info[cat5->opc].src2) { + fprintf(ctx->out, ", "); + print_reg_src(ctx, (reg_t)(cat5->norm.src2), cat5->full, + false, false, false, false, false, false); + } + if (info[cat5->opc].samp) + fprintf(ctx->out, ", s#%d", cat5->norm.samp); + if (info[cat5->opc].tex) + fprintf(ctx->out, ", t#%d", cat5->norm.tex); + } + + if (debug & PRINT_VERBOSE) { + if (cat5->is_s2en) { + if ((debug & PRINT_VERBOSE) && (cat5->s2en.dummy1|cat5->s2en.dummy2|cat5->dummy2)) + fprintf(ctx->out, "\t{5: %x,%x,%x}", cat5->s2en.dummy1, cat5->s2en.dummy2, cat5->dummy2); + } else { + if ((debug & PRINT_VERBOSE) && (cat5->norm.dummy1|cat5->dummy2)) + fprintf(ctx->out, "\t{5: %x,%x}", cat5->norm.dummy1, cat5->dummy2); + } + } +} + +static void print_instr_cat6(struct disasm_ctx *ctx, instr_t *instr) +{ + instr_cat6_t *cat6 = &instr->cat6; + char sd = 0, ss = 0; /* dst/src address space */ + bool nodst = false; + struct reginfo dst, src1, src2; + int src1off = 0, dstoff = 0; + + memset(&dst, 0, sizeof(dst)); + memset(&src1, 0, sizeof(src1)); + memset(&src2, 0, sizeof(src2)); + + switch (_OPC(6, cat6->opc)) { + case OPC_RESINFO: + case OPC_RESFMT: + dst.full = type_size(cat6->type) == 32; + src1.full = type_size(cat6->type) == 32; + src2.full = type_size(cat6->type) == 32; + break; + case OPC_L2G: + case OPC_G2L: + dst.full = true; + src1.full = true; + src2.full = true; + break; + case OPC_STG: + case OPC_STL: + case OPC_STP: + case OPC_STI: + case 
OPC_STLW: + case OPC_STIB: + dst.full = true; + src1.full = type_size(cat6->type) == 32; + src2.full = type_size(cat6->type) == 32; + break; + default: + dst.full = type_size(cat6->type) == 32; + src1.full = true; + src2.full = true; + break; + } + + switch (_OPC(6, cat6->opc)) { + case OPC_PREFETCH: + break; + case OPC_RESINFO: + fprintf(ctx->out, ".%dd", cat6->ldgb.d + 1); + break; + case OPC_LDGB: + fprintf(ctx->out, ".%s", cat6->ldgb.typed ? "typed" : "untyped"); + fprintf(ctx->out, ".%dd", cat6->ldgb.d + 1); + fprintf(ctx->out, ".%s", type[cat6->type]); + fprintf(ctx->out, ".%d", cat6->ldgb.type_size + 1); + break; + case OPC_STGB: + case OPC_STIB: + fprintf(ctx->out, ".%s", cat6->stgb.typed ? "typed" : "untyped"); + fprintf(ctx->out, ".%dd", cat6->stgb.d + 1); + fprintf(ctx->out, ".%s", type[cat6->type]); + fprintf(ctx->out, ".%d", cat6->stgb.type_size + 1); + break; + case OPC_ATOMIC_ADD: + case OPC_ATOMIC_SUB: + case OPC_ATOMIC_XCHG: + case OPC_ATOMIC_INC: + case OPC_ATOMIC_DEC: + case OPC_ATOMIC_CMPXCHG: + case OPC_ATOMIC_MIN: + case OPC_ATOMIC_MAX: + case OPC_ATOMIC_AND: + case OPC_ATOMIC_OR: + case OPC_ATOMIC_XOR: + ss = cat6->g ? 'g' : 'l'; + fprintf(ctx->out, ".%s", cat6->ldgb.typed ? "typed" : "untyped"); + fprintf(ctx->out, ".%dd", cat6->ldgb.d + 1); + fprintf(ctx->out, ".%s", type[cat6->type]); + fprintf(ctx->out, ".%d", cat6->ldgb.type_size + 1); + fprintf(ctx->out, ".%c", ss); + break; + default: + dst.im = cat6->g && !cat6->dst_off; + fprintf(ctx->out, ".%s", type[cat6->type]); + break; + } + fprintf(ctx->out, " "); + + switch (_OPC(6, cat6->opc)) { + case OPC_STG: + sd = 'g'; + break; + case OPC_STP: + sd = 'p'; + break; + case OPC_STL: + case OPC_STLW: + sd = 'l'; + break; + + case OPC_LDG: + case OPC_LDC: + ss = 'g'; + break; + case OPC_LDP: + ss = 'p'; + break; + case OPC_LDL: + case OPC_LDLW: + case OPC_LDLV: + ss = 'l'; + break; + + case OPC_L2G: + ss = 'l'; + sd = 'g'; + break; + + case OPC_G2L: + ss = 'g'; + sd = 'l'; + break; + + case OPC_PREFETCH: + ss = 'g'; + nodst = true; + break; + + case OPC_STI: + dst.full = false; // XXX or inverts?? + break; + } + + if ((_OPC(6, cat6->opc) == OPC_STGB) || (_OPC(6, cat6->opc) == OPC_STIB)) { + struct reginfo src3; + + memset(&src3, 0, sizeof(src3)); + + src1.reg = (reg_t)(cat6->stgb.src1); + src2.reg = (reg_t)(cat6->stgb.src2); + src2.im = cat6->stgb.src2_im; + src3.reg = (reg_t)(cat6->stgb.src3); + src3.im = cat6->stgb.src3_im; + src3.full = true; + + fprintf(ctx->out, "g[%u], ", cat6->stgb.dst_ssbo); + print_src(ctx, &src1); + fprintf(ctx->out, ", "); + print_src(ctx, &src2); + fprintf(ctx->out, ", "); + print_src(ctx, &src3); + + if (debug & PRINT_VERBOSE) + fprintf(ctx->out, " (pad0=%x, pad3=%x)", cat6->stgb.pad0, cat6->stgb.pad3); + + return; + } + + if (is_atomic(_OPC(6, cat6->opc))) { + + src1.reg = (reg_t)(cat6->ldgb.src1); + src1.im = cat6->ldgb.src1_im; + src2.reg = (reg_t)(cat6->ldgb.src2); + src2.im = cat6->ldgb.src2_im; + dst.reg = (reg_t)(cat6->ldgb.dst); + + print_src(ctx, &dst); + fprintf(ctx->out, ", "); + if (ss == 'g') { + struct reginfo src3; + memset(&src3, 0, sizeof(src3)); + + src3.reg = (reg_t)(cat6->ldgb.src3); + src3.full = true; + + /* For images, the ".typed" variant is used and src2 is + * the ivecN coordinates, ie ivec2 for 2d. + * + * For SSBOs, the ".untyped" variant is used and src2 is + * a simple dword offset.. src3 appears to be + * uvec2(offset * 4, 0). Not sure the point of that. 
+ */ + + fprintf(ctx->out, "g[%u], ", cat6->ldgb.src_ssbo); + print_src(ctx, &src1); /* value */ + fprintf(ctx->out, ", "); + print_src(ctx, &src2); /* offset/coords */ + fprintf(ctx->out, ", "); + print_src(ctx, &src3); /* 64b byte offset.. */ + + if (debug & PRINT_VERBOSE) { + fprintf(ctx->out, " (pad0=%x, pad3=%x, mustbe0=%x)", cat6->ldgb.pad0, + cat6->ldgb.pad3, cat6->ldgb.mustbe0); + } + } else { /* ss == 'l' */ + fprintf(ctx->out, "l["); + print_src(ctx, &src1); /* simple byte offset */ + fprintf(ctx->out, "], "); + print_src(ctx, &src2); /* value */ + + if (debug & PRINT_VERBOSE) { + fprintf(ctx->out, " (src3=%x, pad0=%x, pad3=%x, mustbe0=%x)", + cat6->ldgb.src3, cat6->ldgb.pad0, + cat6->ldgb.pad3, cat6->ldgb.mustbe0); + } + } + + return; + } else if (_OPC(6, cat6->opc) == OPC_RESINFO) { + dst.reg = (reg_t)(cat6->ldgb.dst); + + print_src(ctx, &dst); + fprintf(ctx->out, ", "); + fprintf(ctx->out, "g[%u]", cat6->ldgb.src_ssbo); + + return; + } else if (_OPC(6, cat6->opc) == OPC_LDGB) { + + src1.reg = (reg_t)(cat6->ldgb.src1); + src1.im = cat6->ldgb.src1_im; + src2.reg = (reg_t)(cat6->ldgb.src2); + src2.im = cat6->ldgb.src2_im; + dst.reg = (reg_t)(cat6->ldgb.dst); + + print_src(ctx, &dst); + fprintf(ctx->out, ", "); + fprintf(ctx->out, "g[%u], ", cat6->ldgb.src_ssbo); + print_src(ctx, &src1); + fprintf(ctx->out, ", "); + print_src(ctx, &src2); + + if (debug & PRINT_VERBOSE) + fprintf(ctx->out, " (pad0=%x, pad3=%x, mustbe0=%x)", cat6->ldgb.pad0, cat6->ldgb.pad3, cat6->ldgb.mustbe0); + + return; + } + if (cat6->dst_off) { + dst.reg = (reg_t)(cat6->c.dst); + dstoff = cat6->c.off; + } else { + dst.reg = (reg_t)(cat6->d.dst); + } + + if (cat6->src_off) { + src1.reg = (reg_t)(cat6->a.src1); + src1.im = cat6->a.src1_im; + src2.reg = (reg_t)(cat6->a.src2); + src2.im = cat6->a.src2_im; + src1off = cat6->a.off; + } else { + src1.reg = (reg_t)(cat6->b.src1); + src1.im = cat6->b.src1_im; + src2.reg = (reg_t)(cat6->b.src2); + src2.im = cat6->b.src2_im; + } + + if (!nodst) { + if (sd) + fprintf(ctx->out, "%c[", sd); + /* note: dst might actually be a src (ie. 
address to store to) */ + print_src(ctx, &dst); + if (dstoff) + fprintf(ctx->out, "%+d", dstoff); + if (sd) + fprintf(ctx->out, "]"); + fprintf(ctx->out, ", "); + } + + if (ss) + fprintf(ctx->out, "%c[", ss); + + /* can have a larger than normal immed, so hack: */ + if (src1.im) { + fprintf(ctx->out, "%u", src1.reg.dummy13); + } else { + print_src(ctx, &src1); + } + + if (src1off) + fprintf(ctx->out, "%+d", src1off); + if (ss) + fprintf(ctx->out, "]"); + + switch (_OPC(6, cat6->opc)) { + case OPC_RESINFO: + case OPC_RESFMT: + break; + default: + fprintf(ctx->out, ", "); + print_src(ctx, &src2); + break; + } +} + +static void print_instr_cat7(struct disasm_ctx *ctx, instr_t *instr) +{ + instr_cat7_t *cat7 = &instr->cat7; + + if (cat7->g) + fprintf(ctx->out, ".g"); + if (cat7->l) + fprintf(ctx->out, ".l"); + + if (_OPC(7, cat7->opc) == OPC_FENCE) { + if (cat7->r) + fprintf(ctx->out, ".r"); + if (cat7->w) + fprintf(ctx->out, ".w"); + } +} + +/* size of largest OPC field of all the instruction categories: */ +#define NOPC_BITS 6 + +static const struct opc_info { + uint16_t cat; + uint16_t opc; + const char *name; + void (*print)(struct disasm_ctx *ctx, instr_t *instr); +} opcs[1 << (3+NOPC_BITS)] = { +#define OPC(cat, opc, name) [(opc)] = { (cat), (opc), #name, print_instr_cat##cat } + /* category 0: */ + OPC(0, OPC_NOP, nop), + OPC(0, OPC_BR, br), + OPC(0, OPC_JUMP, jump), + OPC(0, OPC_CALL, call), + OPC(0, OPC_RET, ret), + OPC(0, OPC_KILL, kill), + OPC(0, OPC_END, end), + OPC(0, OPC_EMIT, emit), + OPC(0, OPC_CUT, cut), + OPC(0, OPC_CHMASK, chmask), + OPC(0, OPC_CHSH, chsh), + OPC(0, OPC_FLOW_REV, flow_rev), + + /* category 1: */ + OPC(1, OPC_MOV, ), + + /* category 2: */ + OPC(2, OPC_ADD_F, add.f), + OPC(2, OPC_MIN_F, min.f), + OPC(2, OPC_MAX_F, max.f), + OPC(2, OPC_MUL_F, mul.f), + OPC(2, OPC_SIGN_F, sign.f), + OPC(2, OPC_CMPS_F, cmps.f), + OPC(2, OPC_ABSNEG_F, absneg.f), + OPC(2, OPC_CMPV_F, cmpv.f), + OPC(2, OPC_FLOOR_F, floor.f), + OPC(2, OPC_CEIL_F, ceil.f), + OPC(2, OPC_RNDNE_F, rndne.f), + OPC(2, OPC_RNDAZ_F, rndaz.f), + OPC(2, OPC_TRUNC_F, trunc.f), + OPC(2, OPC_ADD_U, add.u), + OPC(2, OPC_ADD_S, add.s), + OPC(2, OPC_SUB_U, sub.u), + OPC(2, OPC_SUB_S, sub.s), + OPC(2, OPC_CMPS_U, cmps.u), + OPC(2, OPC_CMPS_S, cmps.s), + OPC(2, OPC_MIN_U, min.u), + OPC(2, OPC_MIN_S, min.s), + OPC(2, OPC_MAX_U, max.u), + OPC(2, OPC_MAX_S, max.s), + OPC(2, OPC_ABSNEG_S, absneg.s), + OPC(2, OPC_AND_B, and.b), + OPC(2, OPC_OR_B, or.b), + OPC(2, OPC_NOT_B, not.b), + OPC(2, OPC_XOR_B, xor.b), + OPC(2, OPC_CMPV_U, cmpv.u), + OPC(2, OPC_CMPV_S, cmpv.s), + OPC(2, OPC_MUL_U, mul.u), + OPC(2, OPC_MUL_S, mul.s), + OPC(2, OPC_MULL_U, mull.u), + OPC(2, OPC_BFREV_B, bfrev.b), + OPC(2, OPC_CLZ_S, clz.s), + OPC(2, OPC_CLZ_B, clz.b), + OPC(2, OPC_SHL_B, shl.b), + OPC(2, OPC_SHR_B, shr.b), + OPC(2, OPC_ASHR_B, ashr.b), + OPC(2, OPC_BARY_F, bary.f), + OPC(2, OPC_MGEN_B, mgen.b), + OPC(2, OPC_GETBIT_B, getbit.b), + OPC(2, OPC_SETRM, setrm), + OPC(2, OPC_CBITS_B, cbits.b), + OPC(2, OPC_SHB, shb), + OPC(2, OPC_MSAD, msad), + + /* category 3: */ + OPC(3, OPC_MAD_U16, mad.u16), + OPC(3, OPC_MADSH_U16, madsh.u16), + OPC(3, OPC_MAD_S16, mad.s16), + OPC(3, OPC_MADSH_M16, madsh.m16), + OPC(3, OPC_MAD_U24, mad.u24), + OPC(3, OPC_MAD_S24, mad.s24), + OPC(3, OPC_MAD_F16, mad.f16), + OPC(3, OPC_MAD_F32, mad.f32), + OPC(3, OPC_SEL_B16, sel.b16), + OPC(3, OPC_SEL_B32, sel.b32), + OPC(3, OPC_SEL_S16, sel.s16), + OPC(3, OPC_SEL_S32, sel.s32), + OPC(3, OPC_SEL_F16, sel.f16), + OPC(3, OPC_SEL_F32, sel.f32), + OPC(3, OPC_SAD_S16, sad.s16), 
+ OPC(3, OPC_SAD_S32, sad.s32), + + /* category 4: */ + OPC(4, OPC_RCP, rcp), + OPC(4, OPC_RSQ, rsq), + OPC(4, OPC_LOG2, log2), + OPC(4, OPC_EXP2, exp2), + OPC(4, OPC_SIN, sin), + OPC(4, OPC_COS, cos), + OPC(4, OPC_SQRT, sqrt), + + /* category 5: */ + OPC(5, OPC_ISAM, isam), + OPC(5, OPC_ISAML, isaml), + OPC(5, OPC_ISAMM, isamm), + OPC(5, OPC_SAM, sam), + OPC(5, OPC_SAMB, samb), + OPC(5, OPC_SAML, saml), + OPC(5, OPC_SAMGQ, samgq), + OPC(5, OPC_GETLOD, getlod), + OPC(5, OPC_CONV, conv), + OPC(5, OPC_CONVM, convm), + OPC(5, OPC_GETSIZE, getsize), + OPC(5, OPC_GETBUF, getbuf), + OPC(5, OPC_GETPOS, getpos), + OPC(5, OPC_GETINFO, getinfo), + OPC(5, OPC_DSX, dsx), + OPC(5, OPC_DSY, dsy), + OPC(5, OPC_GATHER4R, gather4r), + OPC(5, OPC_GATHER4G, gather4g), + OPC(5, OPC_GATHER4B, gather4b), + OPC(5, OPC_GATHER4A, gather4a), + OPC(5, OPC_SAMGP0, samgp0), + OPC(5, OPC_SAMGP1, samgp1), + OPC(5, OPC_SAMGP2, samgp2), + OPC(5, OPC_SAMGP3, samgp3), + OPC(5, OPC_DSXPP_1, dsxpp.1), + OPC(5, OPC_DSYPP_1, dsypp.1), + OPC(5, OPC_RGETPOS, rgetpos), + OPC(5, OPC_RGETINFO, rgetinfo), + + + /* category 6: */ + OPC(6, OPC_LDG, ldg), + OPC(6, OPC_LDL, ldl), + OPC(6, OPC_LDP, ldp), + OPC(6, OPC_STG, stg), + OPC(6, OPC_STL, stl), + OPC(6, OPC_STP, stp), + OPC(6, OPC_STI, sti), + OPC(6, OPC_G2L, g2l), + OPC(6, OPC_L2G, l2g), + OPC(6, OPC_PREFETCH, prefetch), + OPC(6, OPC_LDLW, ldlw), + OPC(6, OPC_STLW, stlw), + OPC(6, OPC_RESFMT, resfmt), + OPC(6, OPC_RESINFO, resinfo), + OPC(6, OPC_ATOMIC_ADD, atomic.add), + OPC(6, OPC_ATOMIC_SUB, atomic.sub), + OPC(6, OPC_ATOMIC_XCHG, atomic.xchg), + OPC(6, OPC_ATOMIC_INC, atomic.inc), + OPC(6, OPC_ATOMIC_DEC, atomic.dec), + OPC(6, OPC_ATOMIC_CMPXCHG, atomic.cmpxchg), + OPC(6, OPC_ATOMIC_MIN, atomic.min), + OPC(6, OPC_ATOMIC_MAX, atomic.max), + OPC(6, OPC_ATOMIC_AND, atomic.and), + OPC(6, OPC_ATOMIC_OR, atomic.or), + OPC(6, OPC_ATOMIC_XOR, atomic.xor), + OPC(6, OPC_LDGB, ldgb), + OPC(6, OPC_STGB, stgb), + OPC(6, OPC_STIB, stib), + OPC(6, OPC_LDC, ldc), + OPC(6, OPC_LDLV, ldlv), + + OPC(7, OPC_BAR, bar), + OPC(7, OPC_FENCE, fence), + +#undef OPC +}; + +#define GETINFO(instr) (&(opcs[((instr)->opc_cat << NOPC_BITS) | instr_opc(instr)])) + +// XXX hack.. probably should move this table somewhere common: +#include "ir3.h" +const char *ir3_instr_name(struct ir3_instruction *instr) +{ + if (opc_cat(instr->opc) == -1) return "??meta??"; + return opcs[instr->opc].name; +} + +static bool print_instr(struct disasm_ctx *ctx, uint32_t *dwords, int n) +{ + instr_t *instr = (instr_t *)dwords; + uint32_t opc = instr_opc(instr); + const char *name; + + if (debug & PRINT_VERBOSE) + fprintf(ctx->out, "%s%04d[%08xx_%08xx] ", levels[ctx->level], n, dwords[1], dwords[0]); + + /* NOTE: order flags are printed is a bit fugly.. but for now I + * try to match the order in llvm-a3xx disassembler for easy + * diff'ing.. 
+ */ + + ctx->repeat = instr_repeat(instr); + + if (instr->sync) + fprintf(ctx->out, "(sy)"); + if (instr->ss && ((instr->opc_cat <= 4) || (instr->opc_cat == 7))) + fprintf(ctx->out, "(ss)"); + if (instr->jmp_tgt) + fprintf(ctx->out, "(jp)"); + if (instr_sat(instr)) + fprintf(ctx->out, "(sat)"); + if (ctx->repeat) + fprintf(ctx->out, "(rpt%d)", ctx->repeat); + if (instr->ul && ((2 <= instr->opc_cat) && (instr->opc_cat <= 4))) + fprintf(ctx->out, "(ul)"); + + name = GETINFO(instr)->name; + + if (name) { + fprintf(ctx->out, "%s", name); + GETINFO(instr)->print(ctx, instr); + } else { + fprintf(ctx->out, "unknown(%d,%d)", instr->opc_cat, opc); + } + + fprintf(ctx->out, "\n"); + + return (instr->opc_cat == 0) && (opc == OPC_END); +} + +int disasm_a3xx(uint32_t *dwords, int sizedwords, int level, FILE *out) +{ + struct disasm_ctx ctx; + int i; + + assert((sizedwords % 2) == 0); + + memset(&ctx, 0, sizeof(ctx)); + ctx.out = out; + ctx.level = level; + + for (i = 0; i < sizedwords; i += 2) + print_instr(&ctx, &dwords[i], i/2); + + return 0; +} diff --git a/src/freedreno/ir3/instr-a3xx.h b/src/freedreno/ir3/instr-a3xx.h new file mode 100644 index 00000000000..7f60ee5fd4c --- /dev/null +++ b/src/freedreno/ir3/instr-a3xx.h @@ -0,0 +1,872 @@ +/* + * Copyright (c) 2013 Rob Clark <[email protected]> + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#ifndef INSTR_A3XX_H_ +#define INSTR_A3XX_H_ + +#define PACKED __attribute__((__packed__)) + +#include <stdint.h> +#include <stdio.h> +#include <stdbool.h> +#include <assert.h> + +/* size of largest OPC field of all the instruction categories: */ +#define NOPC_BITS 6 + +#define _OPC(cat, opc) (((cat) << NOPC_BITS) | opc) + +typedef enum { + /* category 0: */ + OPC_NOP = _OPC(0, 0), + OPC_BR = _OPC(0, 1), + OPC_JUMP = _OPC(0, 2), + OPC_CALL = _OPC(0, 3), + OPC_RET = _OPC(0, 4), + OPC_KILL = _OPC(0, 5), + OPC_END = _OPC(0, 6), + OPC_EMIT = _OPC(0, 7), + OPC_CUT = _OPC(0, 8), + OPC_CHMASK = _OPC(0, 9), + OPC_CHSH = _OPC(0, 10), + OPC_FLOW_REV = _OPC(0, 11), + + /* category 1: */ + OPC_MOV = _OPC(1, 0), + + /* category 2: */ + OPC_ADD_F = _OPC(2, 0), + OPC_MIN_F = _OPC(2, 1), + OPC_MAX_F = _OPC(2, 2), + OPC_MUL_F = _OPC(2, 3), + OPC_SIGN_F = _OPC(2, 4), + OPC_CMPS_F = _OPC(2, 5), + OPC_ABSNEG_F = _OPC(2, 6), + OPC_CMPV_F = _OPC(2, 7), + /* 8 - invalid */ + OPC_FLOOR_F = _OPC(2, 9), + OPC_CEIL_F = _OPC(2, 10), + OPC_RNDNE_F = _OPC(2, 11), + OPC_RNDAZ_F = _OPC(2, 12), + OPC_TRUNC_F = _OPC(2, 13), + /* 14-15 - invalid */ + OPC_ADD_U = _OPC(2, 16), + OPC_ADD_S = _OPC(2, 17), + OPC_SUB_U = _OPC(2, 18), + OPC_SUB_S = _OPC(2, 19), + OPC_CMPS_U = _OPC(2, 20), + OPC_CMPS_S = _OPC(2, 21), + OPC_MIN_U = _OPC(2, 22), + OPC_MIN_S = _OPC(2, 23), + OPC_MAX_U = _OPC(2, 24), + OPC_MAX_S = _OPC(2, 25), + OPC_ABSNEG_S = _OPC(2, 26), + /* 27 - invalid */ + OPC_AND_B = _OPC(2, 28), + OPC_OR_B = _OPC(2, 29), + OPC_NOT_B = _OPC(2, 30), + OPC_XOR_B = _OPC(2, 31), + /* 32 - invalid */ + OPC_CMPV_U = _OPC(2, 33), + OPC_CMPV_S = _OPC(2, 34), + /* 35-47 - invalid */ + OPC_MUL_U = _OPC(2, 48), + OPC_MUL_S = _OPC(2, 49), + OPC_MULL_U = _OPC(2, 50), + OPC_BFREV_B = _OPC(2, 51), + OPC_CLZ_S = _OPC(2, 52), + OPC_CLZ_B = _OPC(2, 53), + OPC_SHL_B = _OPC(2, 54), + OPC_SHR_B = _OPC(2, 55), + OPC_ASHR_B = _OPC(2, 56), + OPC_BARY_F = _OPC(2, 57), + OPC_MGEN_B = _OPC(2, 58), + OPC_GETBIT_B = _OPC(2, 59), + OPC_SETRM = _OPC(2, 60), + OPC_CBITS_B = _OPC(2, 61), + OPC_SHB = _OPC(2, 62), + OPC_MSAD = _OPC(2, 63), + + /* category 3: */ + OPC_MAD_U16 = _OPC(3, 0), + OPC_MADSH_U16 = _OPC(3, 1), + OPC_MAD_S16 = _OPC(3, 2), + OPC_MADSH_M16 = _OPC(3, 3), /* should this be .s16? 
*/ + OPC_MAD_U24 = _OPC(3, 4), + OPC_MAD_S24 = _OPC(3, 5), + OPC_MAD_F16 = _OPC(3, 6), + OPC_MAD_F32 = _OPC(3, 7), + OPC_SEL_B16 = _OPC(3, 8), + OPC_SEL_B32 = _OPC(3, 9), + OPC_SEL_S16 = _OPC(3, 10), + OPC_SEL_S32 = _OPC(3, 11), + OPC_SEL_F16 = _OPC(3, 12), + OPC_SEL_F32 = _OPC(3, 13), + OPC_SAD_S16 = _OPC(3, 14), + OPC_SAD_S32 = _OPC(3, 15), + + /* category 4: */ + OPC_RCP = _OPC(4, 0), + OPC_RSQ = _OPC(4, 1), + OPC_LOG2 = _OPC(4, 2), + OPC_EXP2 = _OPC(4, 3), + OPC_SIN = _OPC(4, 4), + OPC_COS = _OPC(4, 5), + OPC_SQRT = _OPC(4, 6), + // 7-63 - invalid + + /* category 5: */ + OPC_ISAM = _OPC(5, 0), + OPC_ISAML = _OPC(5, 1), + OPC_ISAMM = _OPC(5, 2), + OPC_SAM = _OPC(5, 3), + OPC_SAMB = _OPC(5, 4), + OPC_SAML = _OPC(5, 5), + OPC_SAMGQ = _OPC(5, 6), + OPC_GETLOD = _OPC(5, 7), + OPC_CONV = _OPC(5, 8), + OPC_CONVM = _OPC(5, 9), + OPC_GETSIZE = _OPC(5, 10), + OPC_GETBUF = _OPC(5, 11), + OPC_GETPOS = _OPC(5, 12), + OPC_GETINFO = _OPC(5, 13), + OPC_DSX = _OPC(5, 14), + OPC_DSY = _OPC(5, 15), + OPC_GATHER4R = _OPC(5, 16), + OPC_GATHER4G = _OPC(5, 17), + OPC_GATHER4B = _OPC(5, 18), + OPC_GATHER4A = _OPC(5, 19), + OPC_SAMGP0 = _OPC(5, 20), + OPC_SAMGP1 = _OPC(5, 21), + OPC_SAMGP2 = _OPC(5, 22), + OPC_SAMGP3 = _OPC(5, 23), + OPC_DSXPP_1 = _OPC(5, 24), + OPC_DSYPP_1 = _OPC(5, 25), + OPC_RGETPOS = _OPC(5, 26), + OPC_RGETINFO = _OPC(5, 27), + + /* category 6: */ + OPC_LDG = _OPC(6, 0), /* load-global */ + OPC_LDL = _OPC(6, 1), + OPC_LDP = _OPC(6, 2), + OPC_STG = _OPC(6, 3), /* store-global */ + OPC_STL = _OPC(6, 4), + OPC_STP = _OPC(6, 5), + OPC_STI = _OPC(6, 6), + OPC_G2L = _OPC(6, 7), + OPC_L2G = _OPC(6, 8), + OPC_PREFETCH = _OPC(6, 9), + OPC_LDLW = _OPC(6, 10), + OPC_STLW = _OPC(6, 11), + OPC_RESFMT = _OPC(6, 14), + OPC_RESINFO = _OPC(6, 15), + OPC_ATOMIC_ADD = _OPC(6, 16), + OPC_ATOMIC_SUB = _OPC(6, 17), + OPC_ATOMIC_XCHG = _OPC(6, 18), + OPC_ATOMIC_INC = _OPC(6, 19), + OPC_ATOMIC_DEC = _OPC(6, 20), + OPC_ATOMIC_CMPXCHG = _OPC(6, 21), + OPC_ATOMIC_MIN = _OPC(6, 22), + OPC_ATOMIC_MAX = _OPC(6, 23), + OPC_ATOMIC_AND = _OPC(6, 24), + OPC_ATOMIC_OR = _OPC(6, 25), + OPC_ATOMIC_XOR = _OPC(6, 26), + OPC_LDGB = _OPC(6, 27), + OPC_STGB = _OPC(6, 28), + OPC_STIB = _OPC(6, 29), + OPC_LDC = _OPC(6, 30), + OPC_LDLV = _OPC(6, 31), + + /* category 7: */ + OPC_BAR = _OPC(7, 0), + OPC_FENCE = _OPC(7, 1), + + /* meta instructions (category -1): */ + /* placeholder instr to mark shader inputs: */ + OPC_META_INPUT = _OPC(-1, 0), + /* The "fan-in" and "fan-out" instructions are used for keeping + * track of instructions that write to multiple dst registers + * (fan-out) like texture sample instructions, or read multiple + * consecutive scalar registers (fan-in) (bary.f, texture samp) + */ + OPC_META_FO = _OPC(-1, 2), + OPC_META_FI = _OPC(-1, 3), + +} opc_t; + +#define opc_cat(opc) ((int)((opc) >> NOPC_BITS)) +#define opc_op(opc) ((unsigned)((opc) & ((1 << NOPC_BITS) - 1))) + +typedef enum { + TYPE_F16 = 0, + TYPE_F32 = 1, + TYPE_U16 = 2, + TYPE_U32 = 3, + TYPE_S16 = 4, + TYPE_S32 = 5, + TYPE_U8 = 6, + TYPE_S8 = 7, // XXX I assume? 
+} type_t; + +static inline uint32_t type_size(type_t type) +{ + switch (type) { + case TYPE_F32: + case TYPE_U32: + case TYPE_S32: + return 32; + case TYPE_F16: + case TYPE_U16: + case TYPE_S16: + return 16; + case TYPE_U8: + case TYPE_S8: + return 8; + default: + assert(0); /* invalid type */ + return 0; + } +} + +static inline int type_float(type_t type) +{ + return (type == TYPE_F32) || (type == TYPE_F16); +} + +static inline int type_uint(type_t type) +{ + return (type == TYPE_U32) || (type == TYPE_U16) || (type == TYPE_U8); +} + +static inline int type_sint(type_t type) +{ + return (type == TYPE_S32) || (type == TYPE_S16) || (type == TYPE_S8); +} + +typedef union PACKED { + /* normal gpr or const src register: */ + struct PACKED { + uint32_t comp : 2; + uint32_t num : 10; + }; + /* for immediate val: */ + int32_t iim_val : 11; + /* to make compiler happy: */ + uint32_t dummy32; + uint32_t dummy10 : 10; + int32_t idummy10 : 10; + uint32_t dummy11 : 11; + uint32_t dummy12 : 12; + uint32_t dummy13 : 13; + uint32_t dummy8 : 8; +} reg_t; + +/* special registers: */ +#define REG_A0 61 /* address register */ +#define REG_P0 62 /* predicate register */ + +static inline int reg_special(reg_t reg) +{ + return (reg.num == REG_A0) || (reg.num == REG_P0); +} + +typedef struct PACKED { + /* dword0: */ + union PACKED { + struct PACKED { + int16_t immed : 16; + uint32_t dummy1 : 16; + } a3xx; + struct PACKED { + int32_t immed : 20; + uint32_t dummy1 : 12; + } a4xx; + struct PACKED { + int32_t immed : 32; + } a5xx; + }; + + /* dword1: */ + uint32_t dummy2 : 8; + uint32_t repeat : 3; + uint32_t dummy3 : 1; + uint32_t ss : 1; + uint32_t dummy4 : 7; + uint32_t inv : 1; + uint32_t comp : 2; + uint32_t opc : 4; + uint32_t jmp_tgt : 1; + uint32_t sync : 1; + uint32_t opc_cat : 3; +} instr_cat0_t; + +typedef struct PACKED { + /* dword0: */ + union PACKED { + /* for normal src register: */ + struct PACKED { + uint32_t src : 11; + /* at least low bit of pad must be zero or it will + * look like an address relative src + */ + uint32_t pad : 21; + }; + /* for address relative: */ + struct PACKED { + int32_t off : 10; + uint32_t src_rel_c : 1; + uint32_t src_rel : 1; + uint32_t unknown : 20; + }; + /* for immediate: */ + int32_t iim_val; + uint32_t uim_val; + float fim_val; + }; + + /* dword1: */ + uint32_t dst : 8; + uint32_t repeat : 3; + uint32_t src_r : 1; + uint32_t ss : 1; + uint32_t ul : 1; + uint32_t dst_type : 3; + uint32_t dst_rel : 1; + uint32_t src_type : 3; + uint32_t src_c : 1; + uint32_t src_im : 1; + uint32_t even : 1; + uint32_t pos_inf : 1; + uint32_t must_be_0 : 2; + uint32_t jmp_tgt : 1; + uint32_t sync : 1; + uint32_t opc_cat : 3; +} instr_cat1_t; + +typedef struct PACKED { + /* dword0: */ + union PACKED { + struct PACKED { + uint32_t src1 : 11; + uint32_t must_be_zero1: 2; + uint32_t src1_im : 1; /* immediate */ + uint32_t src1_neg : 1; /* negate */ + uint32_t src1_abs : 1; /* absolute value */ + }; + struct PACKED { + uint32_t src1 : 10; + uint32_t src1_c : 1; /* relative-const */ + uint32_t src1_rel : 1; /* relative address */ + uint32_t must_be_zero : 1; + uint32_t dummy : 3; + } rel1; + struct PACKED { + uint32_t src1 : 12; + uint32_t src1_c : 1; /* const */ + uint32_t dummy : 3; + } c1; + }; + + union PACKED { + struct PACKED { + uint32_t src2 : 11; + uint32_t must_be_zero2: 2; + uint32_t src2_im : 1; /* immediate */ + uint32_t src2_neg : 1; /* negate */ + uint32_t src2_abs : 1; /* absolute value */ + }; + struct PACKED { + uint32_t src2 : 10; + uint32_t src2_c : 1; /* relative-const */
+ uint32_t src2_rel : 1; /* relative address */ + uint32_t must_be_zero : 1; + uint32_t dummy : 3; + } rel2; + struct PACKED { + uint32_t src2 : 12; + uint32_t src2_c : 1; /* const */ + uint32_t dummy : 3; + } c2; + }; + + /* dword1: */ + uint32_t dst : 8; + uint32_t repeat : 2; + uint32_t sat : 1; + uint32_t src1_r : 1; + uint32_t ss : 1; + uint32_t ul : 1; /* dunno */ + uint32_t dst_half : 1; /* or widen/narrow.. ie. dst hrN <-> rN */ + uint32_t ei : 1; + uint32_t cond : 3; + uint32_t src2_r : 1; + uint32_t full : 1; /* not half */ + uint32_t opc : 6; + uint32_t jmp_tgt : 1; + uint32_t sync : 1; + uint32_t opc_cat : 3; +} instr_cat2_t; + +typedef struct PACKED { + /* dword0: */ + union PACKED { + struct PACKED { + uint32_t src1 : 11; + uint32_t must_be_zero1: 2; + uint32_t src2_c : 1; + uint32_t src1_neg : 1; + uint32_t src2_r : 1; + }; + struct PACKED { + uint32_t src1 : 10; + uint32_t src1_c : 1; + uint32_t src1_rel : 1; + uint32_t must_be_zero : 1; + uint32_t dummy : 3; + } rel1; + struct PACKED { + uint32_t src1 : 12; + uint32_t src1_c : 1; + uint32_t dummy : 3; + } c1; + }; + + union PACKED { + struct PACKED { + uint32_t src3 : 11; + uint32_t must_be_zero2: 2; + uint32_t src3_r : 1; + uint32_t src2_neg : 1; + uint32_t src3_neg : 1; + }; + struct PACKED { + uint32_t src3 : 10; + uint32_t src3_c : 1; + uint32_t src3_rel : 1; + uint32_t must_be_zero : 1; + uint32_t dummy : 3; + } rel2; + struct PACKED { + uint32_t src3 : 12; + uint32_t src3_c : 1; + uint32_t dummy : 3; + } c2; + }; + + /* dword1: */ + uint32_t dst : 8; + uint32_t repeat : 2; + uint32_t sat : 1; + uint32_t src1_r : 1; + uint32_t ss : 1; + uint32_t ul : 1; + uint32_t dst_half : 1; /* or widen/narrow.. ie. dst hrN <-> rN */ + uint32_t src2 : 8; + uint32_t opc : 4; + uint32_t jmp_tgt : 1; + uint32_t sync : 1; + uint32_t opc_cat : 3; +} instr_cat3_t; + +static inline bool instr_cat3_full(instr_cat3_t *cat3) +{ + switch (_OPC(3, cat3->opc)) { + case OPC_MAD_F16: + case OPC_MAD_U16: + case OPC_MAD_S16: + case OPC_SEL_B16: + case OPC_SEL_S16: + case OPC_SEL_F16: + case OPC_SAD_S16: + case OPC_SAD_S32: // really?? + return false; + default: + return true; + } +} + +typedef struct PACKED { + /* dword0: */ + union PACKED { + struct PACKED { + uint32_t src : 11; + uint32_t must_be_zero1: 2; + uint32_t src_im : 1; /* immediate */ + uint32_t src_neg : 1; /* negate */ + uint32_t src_abs : 1; /* absolute value */ + }; + struct PACKED { + uint32_t src : 10; + uint32_t src_c : 1; /* relative-const */ + uint32_t src_rel : 1; /* relative address */ + uint32_t must_be_zero : 1; + uint32_t dummy : 3; + } rel; + struct PACKED { + uint32_t src : 12; + uint32_t src_c : 1; /* const */ + uint32_t dummy : 3; + } c; + }; + uint32_t dummy1 : 16; /* seem to be ignored */ + + /* dword1: */ + uint32_t dst : 8; + uint32_t repeat : 2; + uint32_t sat : 1; + uint32_t src_r : 1; + uint32_t ss : 1; + uint32_t ul : 1; + uint32_t dst_half : 1; /* or widen/narrow.. ie. 
dst hrN <-> rN */ + uint32_t dummy2 : 5; /* seem to be ignored */ + uint32_t full : 1; /* not half */ + uint32_t opc : 6; + uint32_t jmp_tgt : 1; + uint32_t sync : 1; + uint32_t opc_cat : 3; +} instr_cat4_t; + +typedef struct PACKED { + /* dword0: */ + union PACKED { + /* normal case: */ + struct PACKED { + uint32_t full : 1; /* not half */ + uint32_t src1 : 8; + uint32_t src2 : 8; + uint32_t dummy1 : 4; /* seem to be ignored */ + uint32_t samp : 4; + uint32_t tex : 7; + } norm; + /* s2en case: */ + struct PACKED { + uint32_t full : 1; /* not half */ + uint32_t src1 : 8; + uint32_t src2 : 11; + uint32_t dummy1 : 1; + uint32_t src3 : 8; + uint32_t dummy2 : 3; + } s2en; + /* same in either case: */ + // XXX I think, confirm this + struct PACKED { + uint32_t full : 1; /* not half */ + uint32_t src1 : 8; + uint32_t pad : 23; + }; + }; + + /* dword1: */ + uint32_t dst : 8; + uint32_t wrmask : 4; /* write-mask */ + uint32_t type : 3; + uint32_t dummy2 : 1; /* seems to be ignored */ + uint32_t is_3d : 1; + + uint32_t is_a : 1; + uint32_t is_s : 1; + uint32_t is_s2en : 1; + uint32_t is_o : 1; + uint32_t is_p : 1; + + uint32_t opc : 5; + uint32_t jmp_tgt : 1; + uint32_t sync : 1; + uint32_t opc_cat : 3; +} instr_cat5_t; + +/* dword0 encoding for src_off: [src1 + off], src2: */ +typedef struct PACKED { + /* dword0: */ + uint32_t mustbe1 : 1; + int32_t off : 13; + uint32_t src1 : 8; + uint32_t src1_im : 1; + uint32_t src2_im : 1; + uint32_t src2 : 8; + + /* dword1: */ + uint32_t dword1; +} instr_cat6a_t; + +/* dword0 encoding for !src_off: [src1], src2 */ +typedef struct PACKED { + /* dword0: */ + uint32_t mustbe0 : 1; + uint32_t src1 : 13; + uint32_t ignore0 : 8; + uint32_t src1_im : 1; + uint32_t src2_im : 1; + uint32_t src2 : 8; + + /* dword1: */ + uint32_t dword1; +} instr_cat6b_t; + +/* dword1 encoding for dst_off: */ +typedef struct PACKED { + /* dword0: */ + uint32_t dword0; + + /* note: there is some weird stuff going on where sometimes + * cat6->a.off is involved.. but that seems like a bug in + * the blob, since it is used even if !cat6->src_off + * It would make sense for there to be some more bits to + * bring us to 11 bits worth of offset, but not sure.. + */ + int32_t off : 8; + uint32_t mustbe1 : 1; + uint32_t dst : 8; + uint32_t pad1 : 15; +} instr_cat6c_t; + +/* dword1 encoding for !dst_off: */ +typedef struct PACKED { + /* dword0: */ + uint32_t dword0; + + uint32_t dst : 8; + uint32_t mustbe0 : 1; + uint32_t idx : 8; + uint32_t pad0 : 15; +} instr_cat6d_t; + +/* ldgb and atomics.. + * + * ldgb: pad0=0, pad3=1 + * atomic .g: pad0=1, pad3=1 + * .l: pad0=1, pad3=0 + */ +typedef struct PACKED { + /* dword0: */ + uint32_t pad0 : 1; + uint32_t src3 : 8; + uint32_t d : 2; + uint32_t typed : 1; + uint32_t type_size : 2; + uint32_t src1 : 8; + uint32_t src1_im : 1; + uint32_t src2_im : 1; + uint32_t src2 : 8; + + /* dword1: */ + uint32_t dst : 8; + uint32_t mustbe0 : 1; + uint32_t src_ssbo : 8; + uint32_t pad2 : 3; // type + uint32_t g : 1; + uint32_t pad3 : 1; + uint32_t pad4 : 10; // opc/jmp_tgt/sync/opc_cat +} instr_cat6ldgb_t; + +/* stgb, pad0=0, pad3=2 + */ +typedef struct PACKED { + /* dword0: */ + uint32_t mustbe1 : 1; // ??? 
+ uint32_t src1 : 8; + uint32_t d : 2; + uint32_t typed : 1; + uint32_t type_size : 2; + uint32_t pad0 : 9; + uint32_t src2_im : 1; + uint32_t src2 : 8; + + /* dword1: */ + uint32_t src3 : 8; + uint32_t src3_im : 1; + uint32_t dst_ssbo : 8; + uint32_t pad2 : 3; // type + uint32_t pad3 : 2; + uint32_t pad4 : 10; // opc/jmp_tgt/sync/opc_cat +} instr_cat6stgb_t; + +typedef union PACKED { + instr_cat6a_t a; + instr_cat6b_t b; + instr_cat6c_t c; + instr_cat6d_t d; + instr_cat6ldgb_t ldgb; + instr_cat6stgb_t stgb; + struct PACKED { + /* dword0: */ + uint32_t src_off : 1; + uint32_t pad1 : 31; + + /* dword1: */ + uint32_t pad2 : 8; + uint32_t dst_off : 1; + uint32_t pad3 : 8; + uint32_t type : 3; + uint32_t g : 1; /* or in some cases it means dst immed */ + uint32_t pad4 : 1; + uint32_t opc : 5; + uint32_t jmp_tgt : 1; + uint32_t sync : 1; + uint32_t opc_cat : 3; + }; +} instr_cat6_t; + +typedef struct PACKED { + /* dword0: */ + uint32_t pad1 : 32; + + /* dword1: */ + uint32_t pad2 : 12; + uint32_t ss : 1; /* maybe in the encoding, but blob only uses (sy) */ + uint32_t pad3 : 6; + uint32_t w : 1; /* write */ + uint32_t r : 1; /* read */ + uint32_t l : 1; /* local */ + uint32_t g : 1; /* global */ + uint32_t opc : 4; /* presumed, but only a couple known OPCs */ + uint32_t jmp_tgt : 1; /* (jp) */ + uint32_t sync : 1; /* (sy) */ + uint32_t opc_cat : 3; +} instr_cat7_t; + +typedef union PACKED { + instr_cat0_t cat0; + instr_cat1_t cat1; + instr_cat2_t cat2; + instr_cat3_t cat3; + instr_cat4_t cat4; + instr_cat5_t cat5; + instr_cat6_t cat6; + instr_cat7_t cat7; + struct PACKED { + /* dword0: */ + uint32_t pad1 : 32; + + /* dword1: */ + uint32_t pad2 : 12; + uint32_t ss : 1; /* cat1-cat4 (cat0??) and cat7 (?) */ + uint32_t ul : 1; /* cat2-cat4 (and cat1 in blob.. which may be bug??) 
*/ + uint32_t pad3 : 13; + uint32_t jmp_tgt : 1; + uint32_t sync : 1; + uint32_t opc_cat : 3; + + }; +} instr_t; + +static inline uint32_t instr_repeat(instr_t *instr) +{ + switch (instr->opc_cat) { + case 0: return instr->cat0.repeat; + case 1: return instr->cat1.repeat; + case 2: return instr->cat2.repeat; + case 3: return instr->cat3.repeat; + case 4: return instr->cat4.repeat; + default: return 0; + } +} + +static inline bool instr_sat(instr_t *instr) +{ + switch (instr->opc_cat) { + case 2: return instr->cat2.sat; + case 3: return instr->cat3.sat; + case 4: return instr->cat4.sat; + default: return false; + } +} + +static inline uint32_t instr_opc(instr_t *instr) +{ + switch (instr->opc_cat) { + case 0: return instr->cat0.opc; + case 1: return 0; + case 2: return instr->cat2.opc; + case 3: return instr->cat3.opc; + case 4: return instr->cat4.opc; + case 5: return instr->cat5.opc; + case 6: return instr->cat6.opc; + case 7: return instr->cat7.opc; + default: return 0; + } +} + +static inline bool is_mad(opc_t opc) +{ + switch (opc) { + case OPC_MAD_U16: + case OPC_MAD_S16: + case OPC_MAD_U24: + case OPC_MAD_S24: + case OPC_MAD_F16: + case OPC_MAD_F32: + return true; + default: + return false; + } +} + +static inline bool is_madsh(opc_t opc) +{ + switch (opc) { + case OPC_MADSH_U16: + case OPC_MADSH_M16: + return true; + default: + return false; + } +} + +static inline bool is_atomic(opc_t opc) +{ + switch (opc) { + case OPC_ATOMIC_ADD: + case OPC_ATOMIC_SUB: + case OPC_ATOMIC_XCHG: + case OPC_ATOMIC_INC: + case OPC_ATOMIC_DEC: + case OPC_ATOMIC_CMPXCHG: + case OPC_ATOMIC_MIN: + case OPC_ATOMIC_MAX: + case OPC_ATOMIC_AND: + case OPC_ATOMIC_OR: + case OPC_ATOMIC_XOR: + return true; + default: + return false; + } +} + +static inline bool is_ssbo(opc_t opc) +{ + switch (opc) { + case OPC_RESFMT: + case OPC_RESINFO: + case OPC_LDGB: + case OPC_STGB: + case OPC_STIB: + return true; + default: + return false; + } +} + +int disasm_a3xx(uint32_t *dwords, int sizedwords, int level, FILE *out); + +#endif /* INSTR_A3XX_H_ */ diff --git a/src/freedreno/ir3/ir3.c b/src/freedreno/ir3/ir3.c new file mode 100644 index 00000000000..3d1c4449b12 --- /dev/null +++ b/src/freedreno/ir3/ir3.c @@ -0,0 +1,941 @@ +/* + * Copyright (c) 2012 Rob Clark <[email protected]> + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "ir3.h" + +#include <stdlib.h> +#include <stdio.h> +#include <string.h> +#include <assert.h> +#include <stdbool.h> +#include <errno.h> + +#include "util/bitscan.h" +#include "util/ralloc.h" +#include "util/u_math.h" + +#include "instr-a3xx.h" + +/* simple allocator to carve allocations out of an up-front allocated heap, + * so that we can free everything easily in one shot. + */ +void * ir3_alloc(struct ir3 *shader, int sz) +{ + return rzalloc_size(shader, sz); /* TODO: don't use rzalloc */ +} + +struct ir3 * ir3_create(struct ir3_compiler *compiler, + unsigned nin, unsigned nout) +{ + struct ir3 *shader = rzalloc(compiler, struct ir3); + + shader->compiler = compiler; + shader->ninputs = nin; + shader->inputs = ir3_alloc(shader, sizeof(shader->inputs[0]) * nin); + + shader->noutputs = nout; + shader->outputs = ir3_alloc(shader, sizeof(shader->outputs[0]) * nout); + + list_inithead(&shader->block_list); + list_inithead(&shader->array_list); + + return shader; +} + +void ir3_destroy(struct ir3 *shader) +{ + ralloc_free(shader); +} + +#define iassert(cond) do { \ + if (!(cond)) { \ + debug_assert(cond); \ + return -1; \ + } } while (0) + +#define iassert_type(reg, full) do { \ + if ((full)) { \ + iassert(!((reg)->flags & IR3_REG_HALF)); \ + } else { \ + iassert((reg)->flags & IR3_REG_HALF); \ + } } while (0); + +static uint32_t reg(struct ir3_register *reg, struct ir3_info *info, + uint32_t repeat, uint32_t valid_flags) +{ + reg_t val = { .dummy32 = 0 }; + + if (reg->flags & ~valid_flags) { + debug_printf("INVALID FLAGS: %x vs %x\n", + reg->flags, valid_flags); + } + + if (!(reg->flags & IR3_REG_R)) + repeat = 0; + + if (reg->flags & IR3_REG_IMMED) { + val.iim_val = reg->iim_val; + } else { + unsigned components; + int16_t max; + + if (reg->flags & IR3_REG_RELATIV) { + components = reg->size; + val.idummy10 = reg->array.offset; + max = (reg->array.offset + repeat + components - 1) >> 2; + } else { + components = util_last_bit(reg->wrmask); + val.comp = reg->num & 0x3; + val.num = reg->num >> 2; + max = (reg->num + repeat + components - 1) >> 2; + } + + if (reg->flags & IR3_REG_CONST) { + info->max_const = MAX2(info->max_const, max); + } else if (val.num == 63) { + /* ignore writes to dummy register r63.x */ + } else if (max < 48) { + if (reg->flags & IR3_REG_HALF) { + if (info->gpu_id >= 600) { + /* starting w/ a6xx, half regs conflict with full regs: */ + info->max_reg = MAX2(info->max_reg, (max+1)/2); + } else { + info->max_half_reg = MAX2(info->max_half_reg, max); + } + } else { + info->max_reg = MAX2(info->max_reg, max); + } + } + } + + return val.dummy32; +} + +static int emit_cat0(struct ir3_instruction *instr, void *ptr, + struct ir3_info *info) +{ + instr_cat0_t *cat0 = ptr; + + if (info->gpu_id >= 500) { + cat0->a5xx.immed = instr->cat0.immed; + } else if (info->gpu_id >= 400) { + cat0->a4xx.immed = instr->cat0.immed; + } else { + cat0->a3xx.immed = instr->cat0.immed; + } + cat0->repeat = instr->repeat; + cat0->ss = !!(instr->flags & IR3_INSTR_SS); + cat0->inv = instr->cat0.inv; + cat0->comp = instr->cat0.comp; + cat0->opc = instr->opc; + cat0->jmp_tgt = !!(instr->flags & IR3_INSTR_JP); + cat0->sync = !!(instr->flags & IR3_INSTR_SY); + cat0->opc_cat = 0; + + return 0; +} + +static int emit_cat1(struct ir3_instruction *instr, void *ptr, + struct ir3_info *info) +{ + struct ir3_register *dst = instr->regs[0]; + struct ir3_register *src = instr->regs[1]; + instr_cat1_t *cat1 = ptr; + + iassert(instr->regs_count == 2); + iassert_type(dst, type_size(instr->cat1.dst_type) 
== 32); + if (!(src->flags & IR3_REG_IMMED)) + iassert_type(src, type_size(instr->cat1.src_type) == 32); + + if (src->flags & IR3_REG_IMMED) { + cat1->iim_val = src->iim_val; + cat1->src_im = 1; + } else if (src->flags & IR3_REG_RELATIV) { + cat1->off = reg(src, info, instr->repeat, + IR3_REG_R | IR3_REG_CONST | IR3_REG_HALF | IR3_REG_RELATIV); + cat1->src_rel = 1; + cat1->src_rel_c = !!(src->flags & IR3_REG_CONST); + } else { + cat1->src = reg(src, info, instr->repeat, + IR3_REG_R | IR3_REG_CONST | IR3_REG_HALF); + cat1->src_c = !!(src->flags & IR3_REG_CONST); + } + + cat1->dst = reg(dst, info, instr->repeat, + IR3_REG_RELATIV | IR3_REG_EVEN | + IR3_REG_R | IR3_REG_POS_INF | IR3_REG_HALF); + cat1->repeat = instr->repeat; + cat1->src_r = !!(src->flags & IR3_REG_R); + cat1->ss = !!(instr->flags & IR3_INSTR_SS); + cat1->ul = !!(instr->flags & IR3_INSTR_UL); + cat1->dst_type = instr->cat1.dst_type; + cat1->dst_rel = !!(dst->flags & IR3_REG_RELATIV); + cat1->src_type = instr->cat1.src_type; + cat1->even = !!(dst->flags & IR3_REG_EVEN); + cat1->pos_inf = !!(dst->flags & IR3_REG_POS_INF); + cat1->jmp_tgt = !!(instr->flags & IR3_INSTR_JP); + cat1->sync = !!(instr->flags & IR3_INSTR_SY); + cat1->opc_cat = 1; + + return 0; +} + +static int emit_cat2(struct ir3_instruction *instr, void *ptr, + struct ir3_info *info) +{ + struct ir3_register *dst = instr->regs[0]; + struct ir3_register *src1 = instr->regs[1]; + struct ir3_register *src2 = instr->regs[2]; + instr_cat2_t *cat2 = ptr; + unsigned absneg = ir3_cat2_absneg(instr->opc); + + iassert((instr->regs_count == 2) || (instr->regs_count == 3)); + + if (src1->flags & IR3_REG_RELATIV) { + iassert(src1->array.offset < (1 << 10)); + cat2->rel1.src1 = reg(src1, info, instr->repeat, + IR3_REG_RELATIV | IR3_REG_CONST | IR3_REG_R | + IR3_REG_HALF | absneg); + cat2->rel1.src1_c = !!(src1->flags & IR3_REG_CONST); + cat2->rel1.src1_rel = 1; + } else if (src1->flags & IR3_REG_CONST) { + iassert(src1->num < (1 << 12)); + cat2->c1.src1 = reg(src1, info, instr->repeat, + IR3_REG_CONST | IR3_REG_R | IR3_REG_HALF); + cat2->c1.src1_c = 1; + } else { + iassert(src1->num < (1 << 11)); + cat2->src1 = reg(src1, info, instr->repeat, + IR3_REG_IMMED | IR3_REG_R | IR3_REG_HALF | + absneg); + } + cat2->src1_im = !!(src1->flags & IR3_REG_IMMED); + cat2->src1_neg = !!(src1->flags & (IR3_REG_FNEG | IR3_REG_SNEG | IR3_REG_BNOT)); + cat2->src1_abs = !!(src1->flags & (IR3_REG_FABS | IR3_REG_SABS)); + cat2->src1_r = !!(src1->flags & IR3_REG_R); + + if (src2) { + iassert((src2->flags & IR3_REG_IMMED) || + !((src1->flags ^ src2->flags) & IR3_REG_HALF)); + + if (src2->flags & IR3_REG_RELATIV) { + iassert(src2->array.offset < (1 << 10)); + cat2->rel2.src2 = reg(src2, info, instr->repeat, + IR3_REG_RELATIV | IR3_REG_CONST | IR3_REG_R | + IR3_REG_HALF | absneg); + cat2->rel2.src2_c = !!(src2->flags & IR3_REG_CONST); + cat2->rel2.src2_rel = 1; + } else if (src2->flags & IR3_REG_CONST) { + iassert(src2->num < (1 << 12)); + cat2->c2.src2 = reg(src2, info, instr->repeat, + IR3_REG_CONST | IR3_REG_R | IR3_REG_HALF); + cat2->c2.src2_c = 1; + } else { + iassert(src2->num < (1 << 11)); + cat2->src2 = reg(src2, info, instr->repeat, + IR3_REG_IMMED | IR3_REG_R | IR3_REG_HALF | + absneg); + } + + cat2->src2_im = !!(src2->flags & IR3_REG_IMMED); + cat2->src2_neg = !!(src2->flags & (IR3_REG_FNEG | IR3_REG_SNEG | IR3_REG_BNOT)); + cat2->src2_abs = !!(src2->flags & (IR3_REG_FABS | IR3_REG_SABS)); + cat2->src2_r = !!(src2->flags & IR3_REG_R); + } + + cat2->dst = reg(dst, info, instr->repeat, + IR3_REG_R 
| IR3_REG_EI | IR3_REG_HALF); + cat2->repeat = instr->repeat; + cat2->sat = !!(instr->flags & IR3_INSTR_SAT); + cat2->ss = !!(instr->flags & IR3_INSTR_SS); + cat2->ul = !!(instr->flags & IR3_INSTR_UL); + cat2->dst_half = !!((src1->flags ^ dst->flags) & IR3_REG_HALF); + cat2->ei = !!(dst->flags & IR3_REG_EI); + cat2->cond = instr->cat2.condition; + cat2->full = ! (src1->flags & IR3_REG_HALF); + cat2->opc = instr->opc; + cat2->jmp_tgt = !!(instr->flags & IR3_INSTR_JP); + cat2->sync = !!(instr->flags & IR3_INSTR_SY); + cat2->opc_cat = 2; + + return 0; +} + +static int emit_cat3(struct ir3_instruction *instr, void *ptr, + struct ir3_info *info) +{ + struct ir3_register *dst = instr->regs[0]; + struct ir3_register *src1 = instr->regs[1]; + struct ir3_register *src2 = instr->regs[2]; + struct ir3_register *src3 = instr->regs[3]; + unsigned absneg = ir3_cat3_absneg(instr->opc); + instr_cat3_t *cat3 = ptr; + uint32_t src_flags = 0; + + switch (instr->opc) { + case OPC_MAD_F16: + case OPC_MAD_U16: + case OPC_MAD_S16: + case OPC_SEL_B16: + case OPC_SEL_S16: + case OPC_SEL_F16: + case OPC_SAD_S16: + case OPC_SAD_S32: // really?? + src_flags |= IR3_REG_HALF; + break; + default: + break; + } + + iassert(instr->regs_count == 4); + iassert(!((src1->flags ^ src_flags) & IR3_REG_HALF)); + iassert(!((src2->flags ^ src_flags) & IR3_REG_HALF)); + iassert(!((src3->flags ^ src_flags) & IR3_REG_HALF)); + + if (src1->flags & IR3_REG_RELATIV) { + iassert(src1->array.offset < (1 << 10)); + cat3->rel1.src1 = reg(src1, info, instr->repeat, + IR3_REG_RELATIV | IR3_REG_CONST | IR3_REG_R | + IR3_REG_HALF | absneg); + cat3->rel1.src1_c = !!(src1->flags & IR3_REG_CONST); + cat3->rel1.src1_rel = 1; + } else if (src1->flags & IR3_REG_CONST) { + iassert(src1->num < (1 << 12)); + cat3->c1.src1 = reg(src1, info, instr->repeat, + IR3_REG_CONST | IR3_REG_R | IR3_REG_HALF); + cat3->c1.src1_c = 1; + } else { + iassert(src1->num < (1 << 11)); + cat3->src1 = reg(src1, info, instr->repeat, + IR3_REG_R | IR3_REG_HALF | absneg); + } + + cat3->src1_neg = !!(src1->flags & (IR3_REG_FNEG | IR3_REG_SNEG | IR3_REG_BNOT)); + cat3->src1_r = !!(src1->flags & IR3_REG_R); + + cat3->src2 = reg(src2, info, instr->repeat, + IR3_REG_CONST | IR3_REG_R | IR3_REG_HALF | absneg); + cat3->src2_c = !!(src2->flags & IR3_REG_CONST); + cat3->src2_neg = !!(src2->flags & (IR3_REG_FNEG | IR3_REG_SNEG | IR3_REG_BNOT)); + cat3->src2_r = !!(src2->flags & IR3_REG_R); + + + if (src3->flags & IR3_REG_RELATIV) { + iassert(src3->array.offset < (1 << 10)); + cat3->rel2.src3 = reg(src3, info, instr->repeat, + IR3_REG_RELATIV | IR3_REG_CONST | IR3_REG_R | + IR3_REG_HALF | absneg); + cat3->rel2.src3_c = !!(src3->flags & IR3_REG_CONST); + cat3->rel2.src3_rel = 1; + } else if (src3->flags & IR3_REG_CONST) { + iassert(src3->num < (1 << 12)); + cat3->c2.src3 = reg(src3, info, instr->repeat, + IR3_REG_CONST | IR3_REG_R | IR3_REG_HALF); + cat3->c2.src3_c = 1; + } else { + iassert(src3->num < (1 << 11)); + cat3->src3 = reg(src3, info, instr->repeat, + IR3_REG_R | IR3_REG_HALF | absneg); + } + + cat3->src3_neg = !!(src3->flags & (IR3_REG_FNEG | IR3_REG_SNEG | IR3_REG_BNOT)); + cat3->src3_r = !!(src3->flags & IR3_REG_R); + + cat3->dst = reg(dst, info, instr->repeat, IR3_REG_R | IR3_REG_HALF); + cat3->repeat = instr->repeat; + cat3->sat = !!(instr->flags & IR3_INSTR_SAT); + cat3->ss = !!(instr->flags & IR3_INSTR_SS); + cat3->ul = !!(instr->flags & IR3_INSTR_UL); + cat3->dst_half = !!((src_flags ^ dst->flags) & IR3_REG_HALF); + cat3->opc = instr->opc; + cat3->jmp_tgt = !!(instr->flags 
& IR3_INSTR_JP); + cat3->sync = !!(instr->flags & IR3_INSTR_SY); + cat3->opc_cat = 3; + + return 0; +} + +static int emit_cat4(struct ir3_instruction *instr, void *ptr, + struct ir3_info *info) +{ + struct ir3_register *dst = instr->regs[0]; + struct ir3_register *src = instr->regs[1]; + instr_cat4_t *cat4 = ptr; + + iassert(instr->regs_count == 2); + + if (src->flags & IR3_REG_RELATIV) { + iassert(src->array.offset < (1 << 10)); + cat4->rel.src = reg(src, info, instr->repeat, + IR3_REG_RELATIV | IR3_REG_CONST | IR3_REG_FNEG | + IR3_REG_FABS | IR3_REG_R | IR3_REG_HALF); + cat4->rel.src_c = !!(src->flags & IR3_REG_CONST); + cat4->rel.src_rel = 1; + } else if (src->flags & IR3_REG_CONST) { + iassert(src->num < (1 << 12)); + cat4->c.src = reg(src, info, instr->repeat, + IR3_REG_CONST | IR3_REG_FNEG | IR3_REG_FABS | + IR3_REG_R | IR3_REG_HALF); + cat4->c.src_c = 1; + } else { + iassert(src->num < (1 << 11)); + cat4->src = reg(src, info, instr->repeat, + IR3_REG_IMMED | IR3_REG_FNEG | IR3_REG_FABS | + IR3_REG_R | IR3_REG_HALF); + } + + cat4->src_im = !!(src->flags & IR3_REG_IMMED); + cat4->src_neg = !!(src->flags & IR3_REG_FNEG); + cat4->src_abs = !!(src->flags & IR3_REG_FABS); + cat4->src_r = !!(src->flags & IR3_REG_R); + + cat4->dst = reg(dst, info, instr->repeat, IR3_REG_R | IR3_REG_HALF); + cat4->repeat = instr->repeat; + cat4->sat = !!(instr->flags & IR3_INSTR_SAT); + cat4->ss = !!(instr->flags & IR3_INSTR_SS); + cat4->ul = !!(instr->flags & IR3_INSTR_UL); + cat4->dst_half = !!((src->flags ^ dst->flags) & IR3_REG_HALF); + cat4->full = ! (src->flags & IR3_REG_HALF); + cat4->opc = instr->opc; + cat4->jmp_tgt = !!(instr->flags & IR3_INSTR_JP); + cat4->sync = !!(instr->flags & IR3_INSTR_SY); + cat4->opc_cat = 4; + + return 0; +} + +static int emit_cat5(struct ir3_instruction *instr, void *ptr, + struct ir3_info *info) +{ + struct ir3_register *dst = instr->regs[0]; + struct ir3_register *src1 = instr->regs[1]; + struct ir3_register *src2 = instr->regs[2]; + struct ir3_register *src3 = instr->regs[3]; + instr_cat5_t *cat5 = ptr; + + iassert_type(dst, type_size(instr->cat5.type) == 32) + + assume(src1 || !src2); + assume(src2 || !src3); + + if (src1) { + cat5->full = ! 
(src1->flags & IR3_REG_HALF); + cat5->src1 = reg(src1, info, instr->repeat, IR3_REG_HALF); + } + + if (instr->flags & IR3_INSTR_S2EN) { + if (src2) { + iassert(!((src1->flags ^ src2->flags) & IR3_REG_HALF)); + cat5->s2en.src2 = reg(src2, info, instr->repeat, IR3_REG_HALF); + } + if (src3) { + iassert(src3->flags & IR3_REG_HALF); + cat5->s2en.src3 = reg(src3, info, instr->repeat, IR3_REG_HALF); + } + iassert(!(instr->cat5.samp | instr->cat5.tex)); + } else { + iassert(!src3); + if (src2) { + iassert(!((src1->flags ^ src2->flags) & IR3_REG_HALF)); + cat5->norm.src2 = reg(src2, info, instr->repeat, IR3_REG_HALF); + } + cat5->norm.samp = instr->cat5.samp; + cat5->norm.tex = instr->cat5.tex; + } + + cat5->dst = reg(dst, info, instr->repeat, IR3_REG_R | IR3_REG_HALF); + cat5->wrmask = dst->wrmask; + cat5->type = instr->cat5.type; + cat5->is_3d = !!(instr->flags & IR3_INSTR_3D); + cat5->is_a = !!(instr->flags & IR3_INSTR_A); + cat5->is_s = !!(instr->flags & IR3_INSTR_S); + cat5->is_s2en = !!(instr->flags & IR3_INSTR_S2EN); + cat5->is_o = !!(instr->flags & IR3_INSTR_O); + cat5->is_p = !!(instr->flags & IR3_INSTR_P); + cat5->opc = instr->opc; + cat5->jmp_tgt = !!(instr->flags & IR3_INSTR_JP); + cat5->sync = !!(instr->flags & IR3_INSTR_SY); + cat5->opc_cat = 5; + + return 0; +} + +static int emit_cat6(struct ir3_instruction *instr, void *ptr, + struct ir3_info *info) +{ + struct ir3_register *dst, *src1, *src2; + instr_cat6_t *cat6 = ptr; + bool type_full = type_size(instr->cat6.type) == 32; + + cat6->type = instr->cat6.type; + cat6->opc = instr->opc; + cat6->jmp_tgt = !!(instr->flags & IR3_INSTR_JP); + cat6->sync = !!(instr->flags & IR3_INSTR_SY); + cat6->g = !!(instr->flags & IR3_INSTR_G); + cat6->opc_cat = 6; + + switch (instr->opc) { + case OPC_RESINFO: + case OPC_RESFMT: + iassert_type(instr->regs[0], type_full); /* dst */ + iassert_type(instr->regs[1], type_full); /* src1 */ + break; + case OPC_L2G: + case OPC_G2L: + iassert_type(instr->regs[0], true); /* dst */ + iassert_type(instr->regs[1], true); /* src1 */ + break; + case OPC_STG: + case OPC_STL: + case OPC_STP: + case OPC_STI: + case OPC_STLW: + case OPC_STIB: + /* no dst, so regs[0] is dummy */ + iassert_type(instr->regs[1], true); /* dst */ + iassert_type(instr->regs[2], type_full); /* src1 */ + iassert_type(instr->regs[3], true); /* src2 */ + break; + default: + iassert_type(instr->regs[0], type_full); /* dst */ + iassert_type(instr->regs[1], true); /* src1 */ + if (instr->regs_count > 2) + iassert_type(instr->regs[2], true); /* src1 */ + break; + } + + /* the "dst" for a store instruction is (from the perspective + * of data flow in the shader, ie. register use/def, etc) in + * fact a register that is read by the instruction, rather + * than written: + */ + if (is_store(instr)) { + iassert(instr->regs_count >= 3); + + dst = instr->regs[1]; + src1 = instr->regs[2]; + src2 = (instr->regs_count >= 4) ? instr->regs[3] : NULL; + } else { + iassert(instr->regs_count >= 2); + + dst = instr->regs[0]; + src1 = instr->regs[1]; + src2 = (instr->regs_count >= 3) ? instr->regs[2] : NULL; + } + + /* TODO we need a more comprehensive list about which instructions + * can be encoded which way. Or possibly use IR3_INSTR_0 flag to + * indicate to use the src_off encoding even if offset is zero + * (but then what to do about dst_off?) + */ + if (is_atomic(instr->opc)) { + instr_cat6ldgb_t *ldgb = ptr; + + /* maybe these two bits both determine the instruction encoding? 
*/ + cat6->src_off = false; + + ldgb->d = instr->cat6.d - 1; + ldgb->typed = instr->cat6.typed; + ldgb->type_size = instr->cat6.iim_val - 1; + + ldgb->dst = reg(dst, info, instr->repeat, IR3_REG_R | IR3_REG_HALF); + + if (ldgb->g) { + struct ir3_register *src3 = instr->regs[3]; + struct ir3_register *src4 = instr->regs[4]; + + /* first src is src_ssbo: */ + iassert(src1->flags & IR3_REG_IMMED); + ldgb->src_ssbo = src1->uim_val; + + ldgb->src1 = reg(src2, info, instr->repeat, IR3_REG_IMMED); + ldgb->src1_im = !!(src2->flags & IR3_REG_IMMED); + ldgb->src2 = reg(src3, info, instr->repeat, IR3_REG_IMMED); + ldgb->src2_im = !!(src3->flags & IR3_REG_IMMED); + + ldgb->src3 = reg(src4, info, instr->repeat, 0); + ldgb->pad0 = 0x1; + ldgb->pad3 = 0x1; + } else { + ldgb->src1 = reg(src1, info, instr->repeat, IR3_REG_IMMED); + ldgb->src1_im = !!(src1->flags & IR3_REG_IMMED); + ldgb->src2 = reg(src2, info, instr->repeat, IR3_REG_IMMED); + ldgb->src2_im = !!(src2->flags & IR3_REG_IMMED); + ldgb->pad0 = 0x1; + ldgb->pad3 = 0x0; + } + + return 0; + } else if (instr->opc == OPC_LDGB) { + struct ir3_register *src3 = instr->regs[3]; + instr_cat6ldgb_t *ldgb = ptr; + + /* maybe these two bits both determine the instruction encoding? */ + cat6->src_off = false; + + ldgb->d = instr->cat6.d - 1; + ldgb->typed = instr->cat6.typed; + ldgb->type_size = instr->cat6.iim_val - 1; + + ldgb->dst = reg(dst, info, instr->repeat, IR3_REG_R | IR3_REG_HALF); + + /* first src is src_ssbo: */ + iassert(src1->flags & IR3_REG_IMMED); + ldgb->src_ssbo = src1->uim_val; + + /* then next two are src1/src2: */ + ldgb->src1 = reg(src2, info, instr->repeat, IR3_REG_IMMED); + ldgb->src1_im = !!(src2->flags & IR3_REG_IMMED); + ldgb->src2 = reg(src3, info, instr->repeat, IR3_REG_IMMED); + ldgb->src2_im = !!(src3->flags & IR3_REG_IMMED); + + ldgb->pad0 = 0x0; + ldgb->pad3 = 0x1; + + return 0; + } else if (instr->opc == OPC_RESINFO) { + instr_cat6ldgb_t *ldgb = ptr; + + ldgb->d = instr->cat6.d - 1; + + ldgb->dst = reg(dst, info, instr->repeat, IR3_REG_R | IR3_REG_HALF); + + /* first src is src_ssbo: */ + iassert(src1->flags & IR3_REG_IMMED); + ldgb->src_ssbo = src1->uim_val; + + return 0; + } else if ((instr->opc == OPC_STGB) || (instr->opc == OPC_STIB)) { + struct ir3_register *src3 = instr->regs[4]; + instr_cat6stgb_t *stgb = ptr; + + /* maybe these two bits both determine the instruction encoding? 
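+ * (From the code in this function, the pattern seems to be that ldgb + * and the atomics use src_off=false, while stgb/stib use src_off=true + * plus pad3=0x2; just an observation from this file, not confirmed + * against any hw documentation.)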
*/ + cat6->src_off = true; + stgb->pad3 = 0x2; + + stgb->d = instr->cat6.d - 1; + stgb->typed = instr->cat6.typed; + stgb->type_size = instr->cat6.iim_val - 1; + + /* first src is dst_ssbo: */ + iassert(dst->flags & IR3_REG_IMMED); + stgb->dst_ssbo = dst->uim_val; + + /* then src1/src2/src3: */ + stgb->src1 = reg(src1, info, instr->repeat, 0); + stgb->src2 = reg(src2, info, instr->repeat, IR3_REG_IMMED); + stgb->src2_im = !!(src2->flags & IR3_REG_IMMED); + stgb->src3 = reg(src3, info, instr->repeat, IR3_REG_IMMED); + stgb->src3_im = !!(src3->flags & IR3_REG_IMMED); + + return 0; + } else if (instr->cat6.src_offset || (instr->opc == OPC_LDG) || + (instr->opc == OPC_LDL)) { + instr_cat6a_t *cat6a = ptr; + + cat6->src_off = true; + + cat6a->src1 = reg(src1, info, instr->repeat, IR3_REG_IMMED); + cat6a->src1_im = !!(src1->flags & IR3_REG_IMMED); + if (src2) { + cat6a->src2 = reg(src2, info, instr->repeat, IR3_REG_IMMED); + cat6a->src2_im = !!(src2->flags & IR3_REG_IMMED); + } + cat6a->off = instr->cat6.src_offset; + } else { + instr_cat6b_t *cat6b = ptr; + + cat6->src_off = false; + + cat6b->src1 = reg(src1, info, instr->repeat, IR3_REG_IMMED | IR3_REG_HALF); + cat6b->src1_im = !!(src1->flags & IR3_REG_IMMED); + if (src2) { + cat6b->src2 = reg(src2, info, instr->repeat, IR3_REG_IMMED); + cat6b->src2_im = !!(src2->flags & IR3_REG_IMMED); + } + } + + if (instr->cat6.dst_offset || (instr->opc == OPC_STG) || + (instr->opc == OPC_STL)) { + instr_cat6c_t *cat6c = ptr; + cat6->dst_off = true; + cat6c->dst = reg(dst, info, instr->repeat, IR3_REG_R | IR3_REG_HALF); + cat6c->off = instr->cat6.dst_offset; + } else { + instr_cat6d_t *cat6d = ptr; + cat6->dst_off = false; + cat6d->dst = reg(dst, info, instr->repeat, IR3_REG_R | IR3_REG_HALF); + } + + return 0; +} + +static int emit_cat7(struct ir3_instruction *instr, void *ptr, + struct ir3_info *info) +{ + instr_cat7_t *cat7 = ptr; + + cat7->ss = !!(instr->flags & IR3_INSTR_SS); + cat7->w = instr->cat7.w; + cat7->r = instr->cat7.r; + cat7->l = instr->cat7.l; + cat7->g = instr->cat7.g; + cat7->opc = instr->opc; + cat7->jmp_tgt = !!(instr->flags & IR3_INSTR_JP); + cat7->sync = !!(instr->flags & IR3_INSTR_SY); + cat7->opc_cat = 7; + + return 0; +} + +static int (*emit[])(struct ir3_instruction *instr, void *ptr, + struct ir3_info *info) = { + emit_cat0, emit_cat1, emit_cat2, emit_cat3, emit_cat4, emit_cat5, emit_cat6, + emit_cat7, +}; + +void * ir3_assemble(struct ir3 *shader, struct ir3_info *info, + uint32_t gpu_id) +{ + uint32_t *ptr, *dwords; + + info->gpu_id = gpu_id; + info->max_reg = -1; + info->max_half_reg = -1; + info->max_const = -1; + info->instrs_count = 0; + info->sizedwords = 0; + info->ss = info->sy = 0; + + list_for_each_entry (struct ir3_block, block, &shader->block_list, node) { + list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) { + info->sizedwords += 2; + } + } + + /* need an integer number of instruction "groups" (sets of 16 + * instructions on a4xx or sets of 4 instructions on a3xx), + * so pad out w/ NOPs if needed: (NOTE each instruction is 64bits) + */ + if (gpu_id >= 400) { + info->sizedwords = align(info->sizedwords, 16 * 2); + } else { + info->sizedwords = align(info->sizedwords, 4 * 2); + } + + ptr = dwords = calloc(4, info->sizedwords); + + list_for_each_entry (struct ir3_block, block, &shader->block_list, node) { + list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) { + int ret = emit[opc_cat(instr->opc)](instr, dwords, info); + if (ret) + goto fail; + info->instrs_count 
+= 1 + instr->repeat; + dwords += 2; + + if (instr->flags & IR3_INSTR_SS) + info->ss++; + + if (instr->flags & IR3_INSTR_SY) + info->sy++; + } + } + + return ptr; + +fail: + free(ptr); + return NULL; +} + +static struct ir3_register * reg_create(struct ir3 *shader, + int num, int flags) +{ + struct ir3_register *reg = + ir3_alloc(shader, sizeof(struct ir3_register)); + reg->wrmask = 1; + reg->flags = flags; + reg->num = num; + return reg; +} + +static void insert_instr(struct ir3_block *block, + struct ir3_instruction *instr) +{ + struct ir3 *shader = block->shader; +#ifdef DEBUG + instr->serialno = ++shader->instr_count; +#endif + list_addtail(&instr->node, &block->instr_list); + + if (is_input(instr)) + array_insert(shader, shader->baryfs, instr); +} + +struct ir3_block * ir3_block_create(struct ir3 *shader) +{ + struct ir3_block *block = ir3_alloc(shader, sizeof(*block)); +#ifdef DEBUG + block->serialno = ++shader->block_count; +#endif + block->shader = shader; + list_inithead(&block->node); + list_inithead(&block->instr_list); + return block; +} + +static struct ir3_instruction *instr_create(struct ir3_block *block, int nreg) +{ + struct ir3_instruction *instr; + unsigned sz = sizeof(*instr) + (nreg * sizeof(instr->regs[0])); + char *ptr = ir3_alloc(block->shader, sz); + + instr = (struct ir3_instruction *)ptr; + ptr += sizeof(*instr); + instr->regs = (struct ir3_register **)ptr; + +#ifdef DEBUG + instr->regs_max = nreg; +#endif + + return instr; +} + +struct ir3_instruction * ir3_instr_create2(struct ir3_block *block, + opc_t opc, int nreg) +{ + struct ir3_instruction *instr = instr_create(block, nreg); + instr->block = block; + instr->opc = opc; + insert_instr(block, instr); + return instr; +} + +struct ir3_instruction * ir3_instr_create(struct ir3_block *block, opc_t opc) +{ + /* NOTE: we could be slightly more clever, at least for non-meta, + * and choose # of regs based on category. 
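+ * For example, cat2 takes at most a dst plus two srcs (emit_cat2 + * asserts regs_count is 2 or 3), so nreg=3 would suffice there; the + * code below simply uses 4 for everything.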
+ */ + return ir3_instr_create2(block, opc, 4); +} + +struct ir3_instruction * ir3_instr_clone(struct ir3_instruction *instr) +{ + struct ir3_instruction *new_instr = instr_create(instr->block, + instr->regs_count); + struct ir3_register **regs; + unsigned i; + + regs = new_instr->regs; + *new_instr = *instr; + new_instr->regs = regs; + + insert_instr(instr->block, new_instr); + + /* clone registers: */ + new_instr->regs_count = 0; + for (i = 0; i < instr->regs_count; i++) { + struct ir3_register *reg = instr->regs[i]; + struct ir3_register *new_reg = + ir3_reg_create(new_instr, reg->num, reg->flags); + *new_reg = *reg; + } + + return new_instr; +} + +/* Add a false dependency to instruction, to ensure it is scheduled first: */ +void ir3_instr_add_dep(struct ir3_instruction *instr, struct ir3_instruction *dep) +{ + array_insert(instr, instr->deps, dep); +} + +struct ir3_register * ir3_reg_create(struct ir3_instruction *instr, + int num, int flags) +{ + struct ir3 *shader = instr->block->shader; + struct ir3_register *reg = reg_create(shader, num, flags); +#ifdef DEBUG + debug_assert(instr->regs_count < instr->regs_max); +#endif + instr->regs[instr->regs_count++] = reg; + return reg; +} + +struct ir3_register * ir3_reg_clone(struct ir3 *shader, + struct ir3_register *reg) +{ + struct ir3_register *new_reg = reg_create(shader, 0, 0); + *new_reg = *reg; + return new_reg; +} + +void +ir3_instr_set_address(struct ir3_instruction *instr, + struct ir3_instruction *addr) +{ + if (instr->address != addr) { + struct ir3 *ir = instr->block->shader; + instr->address = addr; + array_insert(ir, ir->indirects, instr); + } +} + +void +ir3_block_clear_mark(struct ir3_block *block) +{ + list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) + instr->flags &= ~IR3_INSTR_MARK; +} + +void +ir3_clear_mark(struct ir3 *ir) +{ + list_for_each_entry (struct ir3_block, block, &ir->block_list, node) { + ir3_block_clear_mark(block); + } +} + +/* note: this will destroy instr->depth, don't do it until after sched! 
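+ * (ip and depth share a union in struct ir3_instruction, so the + * instr->ip writes below clobber any previously computed depth.)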
*/ +unsigned +ir3_count_instructions(struct ir3 *ir) +{ + unsigned cnt = 0; + list_for_each_entry (struct ir3_block, block, &ir->block_list, node) { + list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) { + instr->ip = cnt++; + } + block->start_ip = list_first_entry(&block->instr_list, struct ir3_instruction, node)->ip; + block->end_ip = list_last_entry(&block->instr_list, struct ir3_instruction, node)->ip; + } + return cnt; +} + +struct ir3_array * +ir3_lookup_array(struct ir3 *ir, unsigned id) +{ + list_for_each_entry (struct ir3_array, arr, &ir->array_list, node) + if (arr->id == id) + return arr; + return NULL; +} diff --git a/src/freedreno/ir3/ir3.h b/src/freedreno/ir3/ir3.h new file mode 100644 index 00000000000..ea3218828df --- /dev/null +++ b/src/freedreno/ir3/ir3.h @@ -0,0 +1,1394 @@ +/* + * Copyright (c) 2013 Rob Clark <[email protected]> + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef IR3_H_ +#define IR3_H_ + +#include <stdint.h> +#include <stdbool.h> + +#include "compiler/shader_enums.h" + +#include "util/u_debug.h" +#include "util/list.h" + +#include "instr-a3xx.h" + +/* low level intermediate representation of an adreno shader program */ + +struct ir3_compiler; +struct ir3; +struct ir3_instruction; +struct ir3_block; + +struct ir3_info { + uint32_t gpu_id; + uint16_t sizedwords; + uint16_t instrs_count; /* expanded to account for rpt's */ + /* NOTE: max_reg, etc, does not include registers not touched + * by the shader (ie. vertex fetched via VFD_DECODE but not + * touched by shader) + */ + int8_t max_reg; /* highest GPR # used by shader */ + int8_t max_half_reg; + int16_t max_const; + + /* number of sync bits: */ + uint16_t ss, sy; +}; + +struct ir3_register { + enum { + IR3_REG_CONST = 0x001, + IR3_REG_IMMED = 0x002, + IR3_REG_HALF = 0x004, + /* high registers are used for some things in compute shaders, + * for example. Seems to be for things that are global to all + * threads in a wave, so possibly these are global/shared by + * all the threads in the wave? + */ + IR3_REG_HIGH = 0x008, + IR3_REG_RELATIV= 0x010, + IR3_REG_R = 0x020, + /* Most instructions, it seems, can do float abs/neg but not + * integer. The CP pass needs to know what is intended (int or + * float) in order to do the right thing. For this reason the + * abs/neg flags are split out into float and int variants. 
In + * addition, .b (bitwise) operations, the negate is actually a + * bitwise not, so split that out into a new flag to make it + * more clear. + */ + IR3_REG_FNEG = 0x040, + IR3_REG_FABS = 0x080, + IR3_REG_SNEG = 0x100, + IR3_REG_SABS = 0x200, + IR3_REG_BNOT = 0x400, + IR3_REG_EVEN = 0x800, + IR3_REG_POS_INF= 0x1000, + /* (ei) flag, end-input? Set on last bary, presumably to signal + * that the shader needs no more input: + */ + IR3_REG_EI = 0x2000, + /* meta-flags, for intermediate stages of IR, ie. + * before register assignment is done: + */ + IR3_REG_SSA = 0x4000, /* 'instr' is ptr to assigning instr */ + IR3_REG_ARRAY = 0x8000, + + } flags; + + /* normal registers: + * the component is in the low two bits of the reg #, so + * rN.x becomes: (N << 2) | x + */ + int num; + union { + /* immediate: */ + int32_t iim_val; + uint32_t uim_val; + float fim_val; + /* relative: */ + struct { + uint16_t id; + int16_t offset; + } array; + }; + + /* For IR3_REG_SSA, src registers contain ptr back to assigning + * instruction. + * + * For IR3_REG_ARRAY, the pointer is back to the last dependent + * array access (although the net effect is the same, it points + * back to a previous instruction that we depend on). + */ + struct ir3_instruction *instr; + + union { + /* used for cat5 instructions, but also for internal/IR level + * tracking of what registers are read/written by an instruction. + * wrmask may be a bad name since it is used to represent both + * src and dst that touch multiple adjacent registers. + */ + unsigned wrmask; + /* for relative addressing, 32bits for array size is too small, + * but otoh we don't need to deal with disjoint sets, so instead + * use a simple size field (number of scalar components). + */ + unsigned size; + }; +}; + +/* + * Stupid/simple growable array implementation: + */ +#define DECLARE_ARRAY(type, name) \ + unsigned name ## _count, name ## _sz; \ + type * name; + +#define array_insert(ctx, arr, val) do { \ + if (arr ## _count == arr ## _sz) { \ + arr ## _sz = MAX2(2 * arr ## _sz, 16); \ + arr = reralloc_size(ctx, arr, arr ## _sz * sizeof(arr[0])); \ + } \ + arr[arr ##_count++] = val; \ + } while (0) + +struct ir3_instruction { + struct ir3_block *block; + opc_t opc; + enum { + /* (sy) flag is set on first instruction, and after sample + * instructions (probably just on RAW hazard). + */ + IR3_INSTR_SY = 0x001, + /* (ss) flag is set on first instruction, and first instruction + * to depend on the result of "long" instructions (RAW hazard): + * + * rcp, rsq, log2, exp2, sin, cos, sqrt + * + * It seems to synchronize until all in-flight instructions are + * completed, for example: + * + * rsq hr1.w, hr1.w + * add.f hr2.z, (neg)hr2.z, hc0.y + * mul.f hr2.w, (neg)hr2.y, (neg)hr2.y + * rsq hr2.x, hr2.x + * (rpt1)nop + * mad.f16 hr2.w, hr2.z, hr2.z, hr2.w + * nop + * mad.f16 hr2.w, (neg)hr0.w, (neg)hr0.w, hr2.w + * (ss)(rpt2)mul.f hr1.x, (r)hr1.x, hr1.w + * (rpt2)mul.f hr0.x, (neg)(r)hr0.x, hr2.x + * + * The last mul.f does not have (ss) set, presumably because the + * (ss) on the previous instruction does the job. + * + * The blob driver also seems to set it on WAR hazards, although + * not really clear if this is needed or just blob compiler being + * sloppy. 
So far I haven't found a case where removing the (ss) + * causes problems for WAR hazard, but I could just be getting + * lucky: + * + * rcp r1.y, r3.y + * (ss)(rpt2)mad.f32 r3.y, (r)c9.x, r1.x, (r)r3.z + * + */ + IR3_INSTR_SS = 0x002, + /* (jp) flag is set on jump targets: + */ + IR3_INSTR_JP = 0x004, + IR3_INSTR_UL = 0x008, + IR3_INSTR_3D = 0x010, + IR3_INSTR_A = 0x020, + IR3_INSTR_O = 0x040, + IR3_INSTR_P = 0x080, + IR3_INSTR_S = 0x100, + IR3_INSTR_S2EN = 0x200, + IR3_INSTR_G = 0x400, + IR3_INSTR_SAT = 0x800, + /* meta-flags, for intermediate stages of IR, ie. + * before register assignment is done: + */ + IR3_INSTR_MARK = 0x1000, + IR3_INSTR_UNUSED= 0x2000, + } flags; + int repeat; +#ifdef DEBUG + unsigned regs_max; +#endif + unsigned regs_count; + struct ir3_register **regs; + union { + struct { + char inv; + char comp; + int immed; + struct ir3_block *target; + } cat0; + struct { + type_t src_type, dst_type; + } cat1; + struct { + enum { + IR3_COND_LT = 0, + IR3_COND_LE = 1, + IR3_COND_GT = 2, + IR3_COND_GE = 3, + IR3_COND_EQ = 4, + IR3_COND_NE = 5, + } condition; + } cat2; + struct { + unsigned samp, tex; + type_t type; + } cat5; + struct { + type_t type; + int src_offset; + int dst_offset; + int iim_val : 3; /* for ldgb/stgb, # of components */ + int d : 3; + bool typed : 1; + } cat6; + struct { + unsigned w : 1; /* write */ + unsigned r : 1; /* read */ + unsigned l : 1; /* local */ + unsigned g : 1; /* global */ + } cat7; + /* for meta-instructions, just used to hold extra data + * before instruction scheduling, etc + */ + struct { + int off; /* component/offset */ + } fo; + struct { + struct ir3_block *block; + } inout; + }; + + /* transient values used during various algorithms: */ + union { + /* The instruction depth is the max dependency distance to output. + * + * You can also think of it as the "cost", if we did any sort of + * optimization for register footprint. Ie. a value that is just + * result of moving a const to a reg would have a low cost, so + * it could make sense to duplicate the instruction at various + * points where the result is needed to reduce register footprint. + */ + unsigned depth; + /* When we get to the RA stage, we no longer need depth, but + * we do need instruction's position/name: + */ + struct { + uint16_t ip; + uint16_t name; + }; + }; + + /* used for per-pass extra instruction data. + */ + void *data; + + /* Used during CP and RA stages. For fanin and shader inputs/ + * outputs where we need a sequence of consecutive registers, + * keep track of each src instruction's left (ie 'n-1') and right + * (ie 'n+1') neighbor. The front-end must insert enough mov's + * to ensure that each instruction has at most one left and at + * most one right neighbor. During the copy-propagation pass, + * we only remove mov's when we can preserve this constraint. + * And during the RA stage, we use the neighbor information to + * allocate a block of registers in one shot. + * + * TODO: maybe just add something like: + * struct ir3_instruction_ref { + * struct ir3_instruction *instr; + * unsigned cnt; + * } + * + * Or can we get away without the refcnt stuff? It seems like + * it should be overkill.. the problem is if, potentially after + * already eliminating some mov's, you have a single mov that + * needs to be grouped with its neighbors in two different + * places (ex. shader output and a fanin). 
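+ * As a sketch of the neighbor scheme itself: a fanin collecting a + * vec4 into r0.x..r0.w links its four src instructions through + * cp.left/cp.right, and RA then allocates those four registers as + * one contiguous block.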
+ */ + struct { + struct ir3_instruction *left, *right; + uint16_t left_cnt, right_cnt; + } cp; + + /* an instruction can reference at most one address register amongst + * its src/dst registers. Beyond that, you need to insert mov's. + * + * NOTE: do not write this directly, use ir3_instr_set_address() + */ + struct ir3_instruction *address; + + /* Tracking for additional dependent instructions. Used to handle + * barriers, WAR hazards for arrays/SSBOs/etc. + */ + DECLARE_ARRAY(struct ir3_instruction *, deps); + + /* + * From PoV of instruction scheduling, not execution (ie. ignores global/ + * local distinction): + * shared image atomic SSBO everything + * barrier()/ - R/W R/W R/W R/W X + * groupMemoryBarrier() + * memoryBarrier() - R/W R/W + * (but only images declared coherent?) + * memoryBarrierAtomic() - R/W + * memoryBarrierBuffer() - R/W + * memoryBarrierImage() - R/W + * memoryBarrierShared() - R/W + * + * TODO I think for SSBO/image/shared, in cases where we can determine + * which variable is accessed, we don't need to care about accesses to + * different variables (unless declared coherent??) + */ + enum { + IR3_BARRIER_EVERYTHING = 1 << 0, + IR3_BARRIER_SHARED_R = 1 << 1, + IR3_BARRIER_SHARED_W = 1 << 2, + IR3_BARRIER_IMAGE_R = 1 << 3, + IR3_BARRIER_IMAGE_W = 1 << 4, + IR3_BARRIER_BUFFER_R = 1 << 5, + IR3_BARRIER_BUFFER_W = 1 << 6, + IR3_BARRIER_ARRAY_R = 1 << 7, + IR3_BARRIER_ARRAY_W = 1 << 8, + } barrier_class, barrier_conflict; + + /* Entry in ir3_block's instruction list: */ + struct list_head node; + + int use_count; /* currently just updated/used by cp */ + +#ifdef DEBUG + uint32_t serialno; +#endif +}; + +static inline struct ir3_instruction * +ir3_neighbor_first(struct ir3_instruction *instr) +{ + int cnt = 0; + while (instr->cp.left) { + instr = instr->cp.left; + if (++cnt > 0xffff) { + debug_assert(0); + break; + } + } + return instr; +} + +static inline int ir3_neighbor_count(struct ir3_instruction *instr) +{ + int num = 1; + + debug_assert(!instr->cp.left); + + while (instr->cp.right) { + num++; + instr = instr->cp.right; + if (num > 0xffff) { + debug_assert(0); + break; + } + } + + return num; +} + +struct ir3 { + struct ir3_compiler *compiler; + + unsigned ninputs, noutputs; + struct ir3_instruction **inputs; + struct ir3_instruction **outputs; + + /* Track bary.f (and ldlv) instructions.. this is needed in + * scheduling to ensure that all varying fetches happen before + * any potential kill instructions. The hw gets grumpy if all + * threads in a group are killed before the last bary.f gets + * a chance to signal end of input (ei). + */ + DECLARE_ARRAY(struct ir3_instruction *, baryfs); + + /* Track all indirect instructions (read and write). To avoid + * deadlock scenario where an address register gets scheduled, + * but other dependent src instructions cannot be scheduled due + * to dependency on a *different* address register value, the + * scheduler needs to ensure that all dependencies other than + * the address register are scheduled before the one that writes + * the address register. Having a + * convenient list of instructions that reference some address + * register simplifies this. 
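+ * (Concretely: if instrs A and B indirect off different a0.x values, + * scheduling the write of A's a0.x value too early could strand B's + * not-yet-scheduled srcs behind it; a hypothetical two-instruction + * case of the deadlock described above.)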
+ */ + DECLARE_ARRAY(struct ir3_instruction *, indirects); + + /* and same for instructions that consume predicate register: */ + DECLARE_ARRAY(struct ir3_instruction *, predicates); + + /* Track texture sample instructions which need texture state + * patched in (for astc-srgb workaround): + */ + DECLARE_ARRAY(struct ir3_instruction *, astc_srgb); + + /* List of blocks: */ + struct list_head block_list; + + /* List of ir3_array's: */ + struct list_head array_list; + +#ifdef DEBUG + unsigned block_count, instr_count; +#endif +}; + +struct ir3_array { + struct list_head node; + unsigned length; + unsigned id; + + struct nir_register *r; + + /* To avoid array writes from getting DCE'd, keep track of the + * most recent write. Any array access depends on the most + * recent write. This way, nothing depends on writes after the + * last read. But all the writes that happen before that have + * something depending on them. + */ + struct ir3_instruction *last_write; + + /* extra stuff used in RA pass: */ + unsigned base; /* base vreg name */ + unsigned reg; /* base physical reg */ + uint16_t start_ip, end_ip; +}; + +struct ir3_array * ir3_lookup_array(struct ir3 *ir, unsigned id); + +struct ir3_block { + struct list_head node; + struct ir3 *shader; + + const struct nir_block *nblock; + + struct list_head instr_list; /* list of ir3_instruction */ + + /* each block has either one or two successors.. in case of + * two successors, 'condition' decides which one to follow. + * A block preceding an if/else has two successors. + */ + struct ir3_instruction *condition; + struct ir3_block *successors[2]; + + unsigned predecessors_count; + struct ir3_block **predecessors; + + uint16_t start_ip, end_ip; + + /* Track instructions which do not write a register but otherwise + * must not be discarded (such as kill, stg, etc) + */ + DECLARE_ARRAY(struct ir3_instruction *, keeps); + + /* used for per-pass extra block data. Mainly used right + * now in RA step to track livein/liveout. 
+ */ + void *data; + +#ifdef DEBUG + uint32_t serialno; +#endif +}; + +static inline uint32_t +block_id(struct ir3_block *block) +{ +#ifdef DEBUG + return block->serialno; +#else + return (uint32_t)(unsigned long)block; +#endif +} + +struct ir3 * ir3_create(struct ir3_compiler *compiler, + unsigned nin, unsigned nout); +void ir3_destroy(struct ir3 *shader); +void * ir3_assemble(struct ir3 *shader, + struct ir3_info *info, uint32_t gpu_id); +void * ir3_alloc(struct ir3 *shader, int sz); + +struct ir3_block * ir3_block_create(struct ir3 *shader); + +struct ir3_instruction * ir3_instr_create(struct ir3_block *block, opc_t opc); +struct ir3_instruction * ir3_instr_create2(struct ir3_block *block, + opc_t opc, int nreg); +struct ir3_instruction * ir3_instr_clone(struct ir3_instruction *instr); +void ir3_instr_add_dep(struct ir3_instruction *instr, struct ir3_instruction *dep); +const char *ir3_instr_name(struct ir3_instruction *instr); + +struct ir3_register * ir3_reg_create(struct ir3_instruction *instr, + int num, int flags); +struct ir3_register * ir3_reg_clone(struct ir3 *shader, + struct ir3_register *reg); + +void ir3_instr_set_address(struct ir3_instruction *instr, + struct ir3_instruction *addr); + +static inline bool ir3_instr_check_mark(struct ir3_instruction *instr) +{ + if (instr->flags & IR3_INSTR_MARK) + return true; /* already visited */ + instr->flags |= IR3_INSTR_MARK; + return false; +} + +void ir3_block_clear_mark(struct ir3_block *block); +void ir3_clear_mark(struct ir3 *shader); + +unsigned ir3_count_instructions(struct ir3 *ir); + +static inline int ir3_instr_regno(struct ir3_instruction *instr, + struct ir3_register *reg) +{ + unsigned i; + for (i = 0; i < instr->regs_count; i++) + if (reg == instr->regs[i]) + return i; + return -1; +} + + +#define MAX_ARRAYS 16 + +/* comp: + * 0 - x + * 1 - y + * 2 - z + * 3 - w + */ +static inline uint32_t regid(int num, int comp) +{ + return (num << 2) | (comp & 0x3); +} + +static inline uint32_t reg_num(struct ir3_register *reg) +{ + return reg->num >> 2; +} + +static inline uint32_t reg_comp(struct ir3_register *reg) +{ + return reg->num & 0x3; +} + +static inline bool is_flow(struct ir3_instruction *instr) +{ + return (opc_cat(instr->opc) == 0); +} + +static inline bool is_kill(struct ir3_instruction *instr) +{ + return instr->opc == OPC_KILL; +} + +static inline bool is_nop(struct ir3_instruction *instr) +{ + return instr->opc == OPC_NOP; +} + +/* Is it a non-transformative (ie. not type changing) mov? This can + * also include absneg.s/absneg.f, which for the most part can be + * treated as a mov (single src argument). 
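+ * (E.g. absneg.f is in effect a single-src mov whose src may carry an + * abs/neg modifier, which is why it is accepted below as long as the + * (sat) flag is not set.)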
+ */ +static inline bool is_same_type_mov(struct ir3_instruction *instr) +{ + struct ir3_register *dst; + + switch (instr->opc) { + case OPC_MOV: + if (instr->cat1.src_type != instr->cat1.dst_type) + return false; + break; + case OPC_ABSNEG_F: + case OPC_ABSNEG_S: + if (instr->flags & IR3_INSTR_SAT) + return false; + break; + default: + return false; + } + + dst = instr->regs[0]; + + /* mov's that write to a0.x or p0.x are special: */ + if (dst->num == regid(REG_P0, 0)) + return false; + if (dst->num == regid(REG_A0, 0)) + return false; + + if (dst->flags & (IR3_REG_RELATIV | IR3_REG_ARRAY)) + return false; + + return true; +} + +static inline bool is_alu(struct ir3_instruction *instr) +{ + return (1 <= opc_cat(instr->opc)) && (opc_cat(instr->opc) <= 3); +} + +static inline bool is_sfu(struct ir3_instruction *instr) +{ + return (opc_cat(instr->opc) == 4); +} + +static inline bool is_tex(struct ir3_instruction *instr) +{ + return (opc_cat(instr->opc) == 5); +} + +static inline bool is_mem(struct ir3_instruction *instr) +{ + return (opc_cat(instr->opc) == 6); +} + +static inline bool is_barrier(struct ir3_instruction *instr) +{ + return (opc_cat(instr->opc) == 7); +} + +static inline bool +is_store(struct ir3_instruction *instr) +{ + /* these instructions, the "destination" register is + * actually a source, the address to store to. + */ + switch (instr->opc) { + case OPC_STG: + case OPC_STGB: + case OPC_STIB: + case OPC_STP: + case OPC_STL: + case OPC_STLW: + case OPC_L2G: + case OPC_G2L: + return true; + default: + return false; + } +} + +static inline bool is_load(struct ir3_instruction *instr) +{ + switch (instr->opc) { + case OPC_LDG: + case OPC_LDGB: + case OPC_LDL: + case OPC_LDP: + case OPC_L2G: + case OPC_LDLW: + case OPC_LDC: + case OPC_LDLV: + /* probably some others too.. */ + return true; + default: + return false; + } +} + +static inline bool is_input(struct ir3_instruction *instr) +{ + /* in some cases, ldlv is used to fetch varying without + * interpolation.. fortunately inloc is the first src + * register in either case + */ + switch (instr->opc) { + case OPC_LDLV: + case OPC_BARY_F: + return true; + default: + return false; + } +} + +static inline bool is_bool(struct ir3_instruction *instr) +{ + switch (instr->opc) { + case OPC_CMPS_F: + case OPC_CMPS_S: + case OPC_CMPS_U: + return true; + default: + return false; + } +} + +static inline bool is_meta(struct ir3_instruction *instr) +{ + /* TODO how should we count PHI (and maybe fan-in/out) which + * might actually contribute some instructions to the final + * result? 
+ */ + return (opc_cat(instr->opc) == -1); +} + +static inline bool writes_addr(struct ir3_instruction *instr) +{ + if (instr->regs_count > 0) { + struct ir3_register *dst = instr->regs[0]; + return reg_num(dst) == REG_A0; + } + return false; +} + +static inline bool writes_pred(struct ir3_instruction *instr) +{ + if (instr->regs_count > 0) { + struct ir3_register *dst = instr->regs[0]; + return reg_num(dst) == REG_P0; + } + return false; +} + +/* returns defining instruction for reg */ +/* TODO better name */ +static inline struct ir3_instruction *ssa(struct ir3_register *reg) +{ + if (reg->flags & (IR3_REG_SSA | IR3_REG_ARRAY)) { + return reg->instr; + } + return NULL; +} + +static inline bool conflicts(struct ir3_instruction *a, + struct ir3_instruction *b) +{ + return (a && b) && (a != b); +} + +static inline bool reg_gpr(struct ir3_register *r) +{ + if (r->flags & (IR3_REG_CONST | IR3_REG_IMMED)) + return false; + if ((reg_num(r) == REG_A0) || (reg_num(r) == REG_P0)) + return false; + return true; +} + +static inline type_t half_type(type_t type) +{ + switch (type) { + case TYPE_F32: return TYPE_F16; + case TYPE_U32: return TYPE_U16; + case TYPE_S32: return TYPE_S16; + case TYPE_F16: + case TYPE_U16: + case TYPE_S16: + return type; + default: + assert(0); + return ~0; + } +} + +/* some cat2 instructions (ie. those which are not float) can embed an + * immediate: + */ +static inline bool ir3_cat2_int(opc_t opc) +{ + switch (opc) { + case OPC_ADD_U: + case OPC_ADD_S: + case OPC_SUB_U: + case OPC_SUB_S: + case OPC_CMPS_U: + case OPC_CMPS_S: + case OPC_MIN_U: + case OPC_MIN_S: + case OPC_MAX_U: + case OPC_MAX_S: + case OPC_CMPV_U: + case OPC_CMPV_S: + case OPC_MUL_U: + case OPC_MUL_S: + case OPC_MULL_U: + case OPC_CLZ_S: + case OPC_ABSNEG_S: + case OPC_AND_B: + case OPC_OR_B: + case OPC_NOT_B: + case OPC_XOR_B: + case OPC_BFREV_B: + case OPC_CLZ_B: + case OPC_SHL_B: + case OPC_SHR_B: + case OPC_ASHR_B: + case OPC_MGEN_B: + case OPC_GETBIT_B: + case OPC_CBITS_B: + case OPC_BARY_F: + return true; + + default: + return false; + } +} + + +/* map cat2 instruction to valid abs/neg flags: */ +static inline unsigned ir3_cat2_absneg(opc_t opc) +{ + switch (opc) { + case OPC_ADD_F: + case OPC_MIN_F: + case OPC_MAX_F: + case OPC_MUL_F: + case OPC_SIGN_F: + case OPC_CMPS_F: + case OPC_ABSNEG_F: + case OPC_CMPV_F: + case OPC_FLOOR_F: + case OPC_CEIL_F: + case OPC_RNDNE_F: + case OPC_RNDAZ_F: + case OPC_TRUNC_F: + case OPC_BARY_F: + return IR3_REG_FABS | IR3_REG_FNEG; + + case OPC_ADD_U: + case OPC_ADD_S: + case OPC_SUB_U: + case OPC_SUB_S: + case OPC_CMPS_U: + case OPC_CMPS_S: + case OPC_MIN_U: + case OPC_MIN_S: + case OPC_MAX_U: + case OPC_MAX_S: + case OPC_CMPV_U: + case OPC_CMPV_S: + case OPC_MUL_U: + case OPC_MUL_S: + case OPC_MULL_U: + case OPC_CLZ_S: + return 0; + + case OPC_ABSNEG_S: + return IR3_REG_SABS | IR3_REG_SNEG; + + case OPC_AND_B: + case OPC_OR_B: + case OPC_NOT_B: + case OPC_XOR_B: + case OPC_BFREV_B: + case OPC_CLZ_B: + case OPC_SHL_B: + case OPC_SHR_B: + case OPC_ASHR_B: + case OPC_MGEN_B: + case OPC_GETBIT_B: + case OPC_CBITS_B: + return IR3_REG_BNOT; + + default: + return 0; + } +} + +/* map cat3 instructions to valid abs/neg flags: */ +static inline unsigned ir3_cat3_absneg(opc_t opc) +{ + switch (opc) { + case OPC_MAD_F16: + case OPC_MAD_F32: + case OPC_SEL_F16: + case OPC_SEL_F32: + return IR3_REG_FNEG; + + case OPC_MAD_U16: + case OPC_MADSH_U16: + case OPC_MAD_S16: + case OPC_MADSH_M16: + case OPC_MAD_U24: + case OPC_MAD_S24: + case OPC_SEL_S16: + case OPC_SEL_S32: + case 
OPC_SAD_S16: + case OPC_SAD_S32: + /* neg *may* work on 3rd src.. */ + + case OPC_SEL_B16: + case OPC_SEL_B32: + + default: + return 0; + } +} + +#define MASK(n) ((1 << (n)) - 1) + +/* iterator for an instruction's sources (reg), also returns src #: */ +#define foreach_src_n(__srcreg, __n, __instr) \ + if ((__instr)->regs_count) \ + for (unsigned __cnt = (__instr)->regs_count - 1, __n = 0; __n < __cnt; __n++) \ + if ((__srcreg = (__instr)->regs[__n + 1])) + +/* iterator for an instruction's sources (reg): */ +#define foreach_src(__srcreg, __instr) \ + foreach_src_n(__srcreg, __i, __instr) + +static inline unsigned __ssa_src_cnt(struct ir3_instruction *instr) +{ + unsigned cnt = instr->regs_count + instr->deps_count; + if (instr->address) + cnt++; + return cnt; +} + +static inline struct ir3_instruction * __ssa_src_n(struct ir3_instruction *instr, unsigned n) +{ + if (n == (instr->regs_count + instr->deps_count)) + return instr->address; + if (n >= instr->regs_count) + return instr->deps[n - instr->regs_count]; + return ssa(instr->regs[n]); +} + +static inline bool __is_false_dep(struct ir3_instruction *instr, unsigned n) +{ + if (n == (instr->regs_count + instr->deps_count)) + return false; + if (n >= instr->regs_count) + return true; + return false; +} + +#define __src_cnt(__instr) ((__instr)->address ? (__instr)->regs_count : (__instr)->regs_count - 1) + +/* iterator for an instruction's SSA sources (instr), also returns src #: */ +#define foreach_ssa_src_n(__srcinst, __n, __instr) \ + for (unsigned __cnt = __ssa_src_cnt(__instr), __n = 0; __n < __cnt; __n++) \ + if ((__srcinst = __ssa_src_n(__instr, __n))) + +/* iterator for an instruction's SSA sources (instr): */ +#define foreach_ssa_src(__srcinst, __instr) \ + foreach_ssa_src_n(__srcinst, __i, __instr) + + +/* dump: */ +void ir3_print(struct ir3 *ir); +void ir3_print_instr(struct ir3_instruction *instr); + +/* depth calculation: */ +int ir3_delayslots(struct ir3_instruction *assigner, + struct ir3_instruction *consumer, unsigned n); +void ir3_insert_by_depth(struct ir3_instruction *instr, struct list_head *list); +void ir3_depth(struct ir3 *ir); + +/* copy-propagate: */ +struct ir3_shader_variant; +void ir3_cp(struct ir3 *ir, struct ir3_shader_variant *so); + +/* group neighbors and insert mov's to resolve conflicts: */ +void ir3_group(struct ir3 *ir); + +/* scheduling: */ +void ir3_sched_add_deps(struct ir3 *ir); +int ir3_sched(struct ir3 *ir); + +/* register assignment: */ +struct ir3_ra_reg_set * ir3_ra_alloc_reg_set(struct ir3_compiler *compiler); +int ir3_ra(struct ir3 *ir3, gl_shader_stage type, + bool frag_coord, bool frag_face); + +/* legalize: */ +void ir3_legalize(struct ir3 *ir, int *num_samp, bool *has_ssbo, int *max_bary); + +/* ************************************************************************* */ +/* instruction helpers */ + +/* creates SSA src of correct type (ie. 
half vs full precision) */ +static inline struct ir3_register * __ssa_src(struct ir3_instruction *instr, + struct ir3_instruction *src, unsigned flags) +{ + struct ir3_register *reg; + if (src->regs[0]->flags & IR3_REG_HALF) + flags |= IR3_REG_HALF; + reg = ir3_reg_create(instr, 0, IR3_REG_SSA | flags); + reg->instr = src; + return reg; +} + +static inline struct ir3_instruction * +ir3_MOV(struct ir3_block *block, struct ir3_instruction *src, type_t type) +{ + struct ir3_instruction *instr = ir3_instr_create(block, OPC_MOV); + ir3_reg_create(instr, 0, 0); /* dst */ + if (src->regs[0]->flags & IR3_REG_ARRAY) { + struct ir3_register *src_reg = __ssa_src(instr, src, IR3_REG_ARRAY); + src_reg->array = src->regs[0]->array; + } else { + __ssa_src(instr, src, 0); + } + debug_assert(!(src->regs[0]->flags & IR3_REG_RELATIV)); + instr->cat1.src_type = type; + instr->cat1.dst_type = type; + return instr; +} + +static inline struct ir3_instruction * +ir3_COV(struct ir3_block *block, struct ir3_instruction *src, + type_t src_type, type_t dst_type) +{ + struct ir3_instruction *instr = ir3_instr_create(block, OPC_MOV); + unsigned dst_flags = (type_size(dst_type) < 32) ? IR3_REG_HALF : 0; + unsigned src_flags = (type_size(src_type) < 32) ? IR3_REG_HALF : 0; + + debug_assert((src->regs[0]->flags & IR3_REG_HALF) == src_flags); + + ir3_reg_create(instr, 0, dst_flags); /* dst */ + __ssa_src(instr, src, 0); + instr->cat1.src_type = src_type; + instr->cat1.dst_type = dst_type; + debug_assert(!(src->regs[0]->flags & IR3_REG_ARRAY)); + return instr; +} + +static inline struct ir3_instruction * +ir3_NOP(struct ir3_block *block) +{ + return ir3_instr_create(block, OPC_NOP); +} + +#define INSTR0(name) \ +static inline struct ir3_instruction * \ +ir3_##name(struct ir3_block *block) \ +{ \ + struct ir3_instruction *instr = \ + ir3_instr_create(block, OPC_##name); \ + return instr; \ +} + +#define INSTR1(name) \ +static inline struct ir3_instruction * \ +ir3_##name(struct ir3_block *block, \ + struct ir3_instruction *a, unsigned aflags) \ +{ \ + struct ir3_instruction *instr = \ + ir3_instr_create(block, OPC_##name); \ + ir3_reg_create(instr, 0, 0); /* dst */ \ + __ssa_src(instr, a, aflags); \ + return instr; \ +} + +#define INSTR2(name) \ +static inline struct ir3_instruction * \ +ir3_##name(struct ir3_block *block, \ + struct ir3_instruction *a, unsigned aflags, \ + struct ir3_instruction *b, unsigned bflags) \ +{ \ + struct ir3_instruction *instr = \ + ir3_instr_create(block, OPC_##name); \ + ir3_reg_create(instr, 0, 0); /* dst */ \ + __ssa_src(instr, a, aflags); \ + __ssa_src(instr, b, bflags); \ + return instr; \ +} + +#define INSTR3(name) \ +static inline struct ir3_instruction * \ +ir3_##name(struct ir3_block *block, \ + struct ir3_instruction *a, unsigned aflags, \ + struct ir3_instruction *b, unsigned bflags, \ + struct ir3_instruction *c, unsigned cflags) \ +{ \ + struct ir3_instruction *instr = \ + ir3_instr_create(block, OPC_##name); \ + ir3_reg_create(instr, 0, 0); /* dst */ \ + __ssa_src(instr, a, aflags); \ + __ssa_src(instr, b, bflags); \ + __ssa_src(instr, c, cflags); \ + return instr; \ +} + +#define INSTR4(name) \ +static inline struct ir3_instruction * \ +ir3_##name(struct ir3_block *block, \ + struct ir3_instruction *a, unsigned aflags, \ + struct ir3_instruction *b, unsigned bflags, \ + struct ir3_instruction *c, unsigned cflags, \ + struct ir3_instruction *d, unsigned dflags) \ +{ \ + struct ir3_instruction *instr = \ + ir3_instr_create2(block, OPC_##name, 5); \ + ir3_reg_create(instr, 0, 0); /* 
dst */ \ + __ssa_src(instr, a, aflags); \ + __ssa_src(instr, b, bflags); \ + __ssa_src(instr, c, cflags); \ + __ssa_src(instr, d, dflags); \ + return instr; \ +} + +#define INSTR4F(f, name) \ +static inline struct ir3_instruction * \ +ir3_##name##_##f(struct ir3_block *block, \ + struct ir3_instruction *a, unsigned aflags, \ + struct ir3_instruction *b, unsigned bflags, \ + struct ir3_instruction *c, unsigned cflags, \ + struct ir3_instruction *d, unsigned dflags) \ +{ \ + struct ir3_instruction *instr = \ + ir3_instr_create2(block, OPC_##name, 5); \ + ir3_reg_create(instr, 0, 0); /* dst */ \ + __ssa_src(instr, a, aflags); \ + __ssa_src(instr, b, bflags); \ + __ssa_src(instr, c, cflags); \ + __ssa_src(instr, d, dflags); \ + instr->flags |= IR3_INSTR_##f; \ + return instr; \ +} + +/* cat0 instructions: */ +INSTR0(BR) +INSTR0(JUMP) +INSTR1(KILL) +INSTR0(END) + +/* cat2 instructions, most 2 src but some 1 src: */ +INSTR2(ADD_F) +INSTR2(MIN_F) +INSTR2(MAX_F) +INSTR2(MUL_F) +INSTR1(SIGN_F) +INSTR2(CMPS_F) +INSTR1(ABSNEG_F) +INSTR2(CMPV_F) +INSTR1(FLOOR_F) +INSTR1(CEIL_F) +INSTR1(RNDNE_F) +INSTR1(RNDAZ_F) +INSTR1(TRUNC_F) +INSTR2(ADD_U) +INSTR2(ADD_S) +INSTR2(SUB_U) +INSTR2(SUB_S) +INSTR2(CMPS_U) +INSTR2(CMPS_S) +INSTR2(MIN_U) +INSTR2(MIN_S) +INSTR2(MAX_U) +INSTR2(MAX_S) +INSTR1(ABSNEG_S) +INSTR2(AND_B) +INSTR2(OR_B) +INSTR1(NOT_B) +INSTR2(XOR_B) +INSTR2(CMPV_U) +INSTR2(CMPV_S) +INSTR2(MUL_U) +INSTR2(MUL_S) +INSTR2(MULL_U) +INSTR1(BFREV_B) +INSTR1(CLZ_S) +INSTR1(CLZ_B) +INSTR2(SHL_B) +INSTR2(SHR_B) +INSTR2(ASHR_B) +INSTR2(BARY_F) +INSTR2(MGEN_B) +INSTR2(GETBIT_B) +INSTR1(SETRM) +INSTR1(CBITS_B) +INSTR2(SHB) +INSTR2(MSAD) + +/* cat3 instructions: */ +INSTR3(MAD_U16) +INSTR3(MADSH_U16) +INSTR3(MAD_S16) +INSTR3(MADSH_M16) +INSTR3(MAD_U24) +INSTR3(MAD_S24) +INSTR3(MAD_F16) +INSTR3(MAD_F32) +INSTR3(SEL_B16) +INSTR3(SEL_B32) +INSTR3(SEL_S16) +INSTR3(SEL_S32) +INSTR3(SEL_F16) +INSTR3(SEL_F32) +INSTR3(SAD_S16) +INSTR3(SAD_S32) + +/* cat4 instructions: */ +INSTR1(RCP) +INSTR1(RSQ) +INSTR1(LOG2) +INSTR1(EXP2) +INSTR1(SIN) +INSTR1(COS) +INSTR1(SQRT) + +/* cat5 instructions: */ +INSTR1(DSX) +INSTR1(DSY) + +static inline struct ir3_instruction * +ir3_SAM(struct ir3_block *block, opc_t opc, type_t type, + unsigned wrmask, unsigned flags, unsigned samp, unsigned tex, + struct ir3_instruction *src0, struct ir3_instruction *src1) +{ + struct ir3_instruction *sam; + struct ir3_register *reg; + + sam = ir3_instr_create(block, opc); + sam->flags |= flags; + ir3_reg_create(sam, 0, 0)->wrmask = wrmask; + if (src0) { + reg = ir3_reg_create(sam, 0, IR3_REG_SSA); + reg->wrmask = (1 << (src0->regs_count - 1)) - 1; + reg->instr = src0; + } + if (src1) { + reg = ir3_reg_create(sam, 0, IR3_REG_SSA); + reg->instr = src1; + reg->wrmask = (1 << (src1->regs_count - 1)) - 1; + } + sam->cat5.samp = samp; + sam->cat5.tex = tex; + sam->cat5.type = type; + + return sam; +} + +/* cat6 instructions: */ +INSTR2(LDLV) +INSTR2(LDG) +INSTR2(LDL) +INSTR3(STG) +INSTR3(STL) +INSTR3(LDGB) +INSTR4(STGB) +INSTR4(STIB) +INSTR1(RESINFO) +INSTR1(RESFMT) +INSTR2(ATOMIC_ADD) +INSTR2(ATOMIC_SUB) +INSTR2(ATOMIC_XCHG) +INSTR2(ATOMIC_INC) +INSTR2(ATOMIC_DEC) +INSTR2(ATOMIC_CMPXCHG) +INSTR2(ATOMIC_MIN) +INSTR2(ATOMIC_MAX) +INSTR2(ATOMIC_AND) +INSTR2(ATOMIC_OR) +INSTR2(ATOMIC_XOR) +INSTR4F(G, ATOMIC_ADD) +INSTR4F(G, ATOMIC_SUB) +INSTR4F(G, ATOMIC_XCHG) +INSTR4F(G, ATOMIC_INC) +INSTR4F(G, ATOMIC_DEC) +INSTR4F(G, ATOMIC_CMPXCHG) +INSTR4F(G, ATOMIC_MIN) +INSTR4F(G, ATOMIC_MAX) +INSTR4F(G, ATOMIC_AND) +INSTR4F(G, ATOMIC_OR) +INSTR4F(G, ATOMIC_XOR) + +/* cat7 
instructions: */ +INSTR0(BAR) +INSTR0(FENCE) + +/* ************************************************************************* */ +/* split this out or find some helper to use.. like main/bitset.h.. */ + +#include <string.h> + +#define MAX_REG 256 + +typedef uint8_t regmask_t[2 * MAX_REG / 8]; + +static inline unsigned regmask_idx(struct ir3_register *reg) +{ + unsigned num = (reg->flags & IR3_REG_RELATIV) ? reg->array.offset : reg->num; + debug_assert(num < MAX_REG); + if (reg->flags & IR3_REG_HALF) + num += MAX_REG; + return num; +} + +static inline void regmask_init(regmask_t *regmask) +{ + memset(regmask, 0, sizeof(*regmask)); +} + +static inline void regmask_set(regmask_t *regmask, struct ir3_register *reg) +{ + unsigned idx = regmask_idx(reg); + if (reg->flags & IR3_REG_RELATIV) { + unsigned i; + for (i = 0; i < reg->size; i++, idx++) + (*regmask)[idx / 8] |= 1 << (idx % 8); + } else { + unsigned mask; + for (mask = reg->wrmask; mask; mask >>= 1, idx++) + if (mask & 1) + (*regmask)[idx / 8] |= 1 << (idx % 8); + } +} + +static inline void regmask_or(regmask_t *dst, regmask_t *a, regmask_t *b) +{ + unsigned i; + for (i = 0; i < ARRAY_SIZE(*dst); i++) + (*dst)[i] = (*a)[i] | (*b)[i]; +} + +/* set bits in a if not set in b, conceptually: + * a |= (reg & ~b) + */ +static inline void regmask_set_if_not(regmask_t *a, + struct ir3_register *reg, regmask_t *b) +{ + unsigned idx = regmask_idx(reg); + if (reg->flags & IR3_REG_RELATIV) { + unsigned i; + for (i = 0; i < reg->size; i++, idx++) + if (!((*b)[idx / 8] & (1 << (idx % 8)))) + (*a)[idx / 8] |= 1 << (idx % 8); + } else { + unsigned mask; + for (mask = reg->wrmask; mask; mask >>= 1, idx++) + if (mask & 1) + if (!((*b)[idx / 8] & (1 << (idx % 8)))) + (*a)[idx / 8] |= 1 << (idx % 8); + } +} + +static inline bool regmask_get(regmask_t *regmask, + struct ir3_register *reg) +{ + unsigned idx = regmask_idx(reg); + if (reg->flags & IR3_REG_RELATIV) { + unsigned i; + for (i = 0; i < reg->size; i++, idx++) + if ((*regmask)[idx / 8] & (1 << (idx % 8))) + return true; + } else { + unsigned mask; + for (mask = reg->wrmask; mask; mask >>= 1, idx++) + if (mask & 1) + if ((*regmask)[idx / 8] & (1 << (idx % 8))) + return true; + } + return false; +} + +/* ************************************************************************* */ + +#endif /* IR3_H_ */ diff --git a/src/freedreno/ir3/ir3_compiler.c b/src/freedreno/ir3/ir3_compiler.c new file mode 100644 index 00000000000..f00daebabf5 --- /dev/null +++ b/src/freedreno/ir3/ir3_compiler.c @@ -0,0 +1,71 @@ +/* + * Copyright (C) 2015 Rob Clark <[email protected]> + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Authors: + * Rob Clark <[email protected]> + */ + +#include "util/ralloc.h" + +#include "ir3_compiler.h" + +static const struct debug_named_value shader_debug_options[] = { + {"vs", IR3_DBG_SHADER_VS, "Print shader disasm for vertex shaders"}, + {"fs", IR3_DBG_SHADER_FS, "Print shader disasm for fragment shaders"}, + {"cs", IR3_DBG_SHADER_CS, "Print shader disasm for compute shaders"}, + {"disasm", IR3_DBG_DISASM, "Dump NIR and adreno shader disassembly"}, + {"optmsgs", IR3_DBG_OPTMSGS,"Enable optimizer debug messages"}, + DEBUG_NAMED_VALUE_END +}; + +DEBUG_GET_ONCE_FLAGS_OPTION(ir3_shader_debug, "IR3_SHADER_DEBUG", shader_debug_options, 0) + +enum ir3_shader_debug ir3_shader_debug = 0; + +struct ir3_compiler * ir3_compiler_create(struct fd_device *dev, uint32_t gpu_id) +{ + struct ir3_compiler *compiler = rzalloc(NULL, struct ir3_compiler); + + ir3_shader_debug = debug_get_option_ir3_shader_debug(); + + compiler->dev = dev; + compiler->gpu_id = gpu_id; + compiler->set = ir3_ra_alloc_reg_set(compiler); + + if (compiler->gpu_id >= 400) { + /* need special handling for "flat" */ + compiler->flat_bypass = true; + compiler->levels_add_one = false; + compiler->unminify_coords = false; + compiler->txf_ms_with_isaml = false; + compiler->array_index_add_half = true; + } else { + /* no special handling for "flat" */ + compiler->flat_bypass = false; + compiler->levels_add_one = true; + compiler->unminify_coords = true; + compiler->txf_ms_with_isaml = true; + compiler->array_index_add_half = false; + } + + return compiler; +} diff --git a/src/freedreno/ir3/ir3_compiler.h b/src/freedreno/ir3/ir3_compiler.h new file mode 100644 index 00000000000..e2336062b29 --- /dev/null +++ b/src/freedreno/ir3/ir3_compiler.h @@ -0,0 +1,96 @@ +/* + * Copyright (C) 2013 Rob Clark <[email protected]> + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + * Authors: + * Rob Clark <[email protected]> + */ + +#ifndef IR3_COMPILER_H_ +#define IR3_COMPILER_H_ + +#include "ir3_shader.h" + +struct ir3_ra_reg_set; + +struct ir3_compiler { + struct fd_device *dev; + uint32_t gpu_id; + struct ir3_ra_reg_set *set; + uint32_t shader_count; + + /* + * Configuration options for things that are handled differently on + * different generations: + */ + + /* a4xx (and later) drops SP_FS_FLAT_SHAD_MODE_REG_* for flat-interpolate + * so we need to use ldlv.u32 to load the varying directly: + */ + bool flat_bypass; + + /* on a3xx, we need to add one to # of array levels: + */ + bool levels_add_one; + + /* on a3xx, we need to scale up integer coords for isaml based + * on LoD: + */ + bool unminify_coords; + + /* on a3xx do txf_ms w/ isaml and scaled coords: */ + bool txf_ms_with_isaml; + + /* on a4xx, for array textures we need to add 0.5 to the array + * index coordinate: + */ + bool array_index_add_half; +}; + +struct ir3_compiler * ir3_compiler_create(struct fd_device *dev, uint32_t gpu_id); + +int ir3_compile_shader_nir(struct ir3_compiler *compiler, + struct ir3_shader_variant *so); + +enum ir3_shader_debug { + IR3_DBG_SHADER_VS = 0x01, + IR3_DBG_SHADER_FS = 0x02, + IR3_DBG_SHADER_CS = 0x04, + IR3_DBG_DISASM = 0x08, + IR3_DBG_OPTMSGS = 0x10, +}; + +extern enum ir3_shader_debug ir3_shader_debug; + +static inline bool +shader_debug_enabled(gl_shader_stage type) +{ + switch (type) { + case MESA_SHADER_VERTEX: return !!(ir3_shader_debug & IR3_DBG_SHADER_VS); + case MESA_SHADER_FRAGMENT: return !!(ir3_shader_debug & IR3_DBG_SHADER_FS); + case MESA_SHADER_COMPUTE: return !!(ir3_shader_debug & IR3_DBG_SHADER_CS); + default: + debug_assert(0); + return false; + } +} + +#endif /* IR3_COMPILER_H_ */ diff --git a/src/freedreno/ir3/ir3_compiler_nir.c b/src/freedreno/ir3/ir3_compiler_nir.c new file mode 100644 index 00000000000..445a2b291e9 --- /dev/null +++ b/src/freedreno/ir3/ir3_compiler_nir.c @@ -0,0 +1,3818 @@ +/* + * Copyright (C) 2015 Rob Clark <[email protected]> + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Authors: + * Rob Clark <[email protected]> + */ + +#include <stdarg.h> + +#include "util/u_string.h" +#include "util/u_memory.h" +#include "util/u_math.h" + +#include "ir3_compiler.h" +#include "ir3_shader.h" +#include "ir3_nir.h" + +#include "instr-a3xx.h" +#include "ir3.h" + +/* for conditionally setting boolean flag(s): */ +#define COND(bool, val) ((bool) ? 
(val) : 0) + +#define DBG(fmt, ...) \ + do { debug_printf("%s:%d: "fmt "\n", \ + __FUNCTION__, __LINE__, ##__VA_ARGS__); } while (0) + +struct ir3_context { + struct ir3_compiler *compiler; + + struct nir_shader *s; + + struct nir_instr *cur_instr; /* current instruction, just for debug */ + + struct ir3 *ir; + struct ir3_shader_variant *so; + + struct ir3_block *block; /* the current block */ + struct ir3_block *in_block; /* block created for shader inputs */ + + nir_function_impl *impl; + + /* For fragment shaders, varyings are not actual shader inputs, + * instead the hw passes a varying-coord which is used with + * bary.f. + * + * But NIR doesn't know that, it still declares varyings as + * inputs. So we do all the input tracking normally and fix + * things up after compile_instructions() + * + * NOTE that frag_vcoord is the hardware position (possibly it + * is actually an index or tag or some such.. it is *not* + * values that can be directly used for gl_FragCoord..) + */ + struct ir3_instruction *frag_vcoord; + + /* for fragment shaders, for gl_FrontFacing and gl_FragCoord: */ + struct ir3_instruction *frag_face, *frag_coord; + + /* For vertex shaders, keep track of the system values sources */ + struct ir3_instruction *vertex_id, *basevertex, *instance_id; + + /* For fragment shaders: */ + struct ir3_instruction *samp_id, *samp_mask_in; + + /* Compute shader inputs: */ + struct ir3_instruction *local_invocation_id, *work_group_id; + + /* mapping from nir_register to defining instruction: */ + struct hash_table *def_ht; + + unsigned num_arrays; + + /* a common pattern for indirect addressing is to request the + * same address register multiple times. To avoid generating + * duplicate instruction sequences (which our backend does not + * try to clean up, since that should be done as the NIR stage) + * we cache the address value generated for a given src value: + * + * Note that we have to cache these per alignment, since same + * src used for an array of vec1 cannot be also used for an + * array of vec4. + */ + struct hash_table *addr_ht[4]; + + /* last dst array, for indirect we need to insert a var-store. + */ + struct ir3_instruction **last_dst; + unsigned last_dst_n; + + /* maps nir_block to ir3_block, mostly for the purposes of + * figuring out the blocks successors + */ + struct hash_table *block_ht; + + /* on a4xx, bitmask of samplers which need astc+srgb workaround: */ + unsigned astc_srgb; + + unsigned samples; /* bitmask of x,y sample shifts */ + + unsigned max_texture_index; + + /* set if we encounter something we can't handle yet, so we + * can bail cleanly and fallback to TGSI compiler f/e + */ + bool error; +}; + +/* gpu pointer size in units of 32bit registers/slots */ +static unsigned pointer_size(struct ir3_context *ctx) +{ + return (ctx->compiler->gpu_id >= 500) ? 
2 : 1; +} + +static struct ir3_instruction * create_immed(struct ir3_block *block, uint32_t val); +static struct ir3_block * get_block(struct ir3_context *ctx, const nir_block *nblock); + + +static struct ir3_context * +compile_init(struct ir3_compiler *compiler, + struct ir3_shader_variant *so) +{ + struct ir3_context *ctx = rzalloc(NULL, struct ir3_context); + + if (compiler->gpu_id >= 400) { + if (so->type == MESA_SHADER_VERTEX) { + ctx->astc_srgb = so->key.vastc_srgb; + } else if (so->type == MESA_SHADER_FRAGMENT) { + ctx->astc_srgb = so->key.fastc_srgb; + } + + } else { + if (so->type == MESA_SHADER_VERTEX) { + ctx->samples = so->key.vsamples; + } else if (so->type == MESA_SHADER_FRAGMENT) { + ctx->samples = so->key.fsamples; + } + } + + ctx->compiler = compiler; + ctx->so = so; + ctx->def_ht = _mesa_hash_table_create(ctx, + _mesa_hash_pointer, _mesa_key_pointer_equal); + ctx->block_ht = _mesa_hash_table_create(ctx, + _mesa_hash_pointer, _mesa_key_pointer_equal); + + /* TODO: maybe generate some sort of bitmask of what key + * lowers vs what shader has (ie. no need to lower + * texture clamp lowering if no texture sample instrs).. + * although should be done further up the stack to avoid + * creating duplicate variants.. + */ + + if (ir3_key_lowers_nir(&so->key)) { + nir_shader *s = nir_shader_clone(ctx, so->shader->nir); + ctx->s = ir3_optimize_nir(so->shader, s, &so->key); + } else { + /* fast-path for shader key that lowers nothing in NIR: */ + ctx->s = so->shader->nir; + } + + /* this needs to be the last pass run, so do this here instead of + * in ir3_optimize_nir(): + */ + NIR_PASS_V(ctx->s, nir_lower_locals_to_regs); + NIR_PASS_V(ctx->s, nir_convert_from_ssa, true); + + if (ir3_shader_debug & IR3_DBG_DISASM) { + printf("dump nir%dv%d: type=%d, k={cts=%u,hp=%u}", + so->shader->id, so->id, so->type, + so->key.color_two_side, so->key.half_precision); + nir_print_shader(ctx->s, stdout); + } + + if (shader_debug_enabled(so->type)) { + fprintf(stderr, "NIR (final form) for %s shader:\n", + _mesa_shader_stage_to_string(so->type)); + nir_print_shader(ctx->s, stderr); + } + + ir3_nir_scan_driver_consts(ctx->s, &so->const_layout); + + so->num_uniforms = ctx->s->num_uniforms; + so->num_ubos = ctx->s->info.num_ubos; + + /* Layout of constant registers, each section aligned to vec4. Note + * that pointer size (ubo, etc) changes depending on generation. + * + * user consts + * UBO addresses + * SSBO sizes + * if (vertex shader) { + * driver params (IR3_DP_*) + * if (stream_output.num_outputs > 0) + * stream-out addresses + * } + * immediates + * + * Immediates go last mostly because they are inserted in the CP pass + * after the nir -> ir3 frontend. 
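+ * + * A worked example (illustrative numbers only, not taken from this + * patch): a vertex shader with num_uniforms=20 (vec4 units), 2 UBOs, + * no SSBOs/images, and 8 driver param dwords on a 64b-pointer gen + * (ptrsz == 2) would end up with: + * + * constoff = align(20, 4) = 20 (user consts in c0-c19) + * constbase.ubo = 20, constoff += align(2 * 2, 4) / 4 = 1 + * constbase.driver_param = 21, constoff += align(8, 4) / 4 = 2 + * constbase.immediate = 23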
+ */ + unsigned constoff = align(ctx->s->num_uniforms, 4); + unsigned ptrsz = pointer_size(ctx); + + memset(&so->constbase, ~0, sizeof(so->constbase)); + + if (so->num_ubos > 0) { + so->constbase.ubo = constoff; + constoff += align(ctx->s->info.num_ubos * ptrsz, 4) / 4; + } + + if (so->const_layout.ssbo_size.count > 0) { + unsigned cnt = so->const_layout.ssbo_size.count; + so->constbase.ssbo_sizes = constoff; + constoff += align(cnt, 4) / 4; + } + + if (so->const_layout.image_dims.count > 0) { + unsigned cnt = so->const_layout.image_dims.count; + so->constbase.image_dims = constoff; + constoff += align(cnt, 4) / 4; + } + + unsigned num_driver_params = 0; + if (so->type == MESA_SHADER_VERTEX) { + num_driver_params = IR3_DP_VS_COUNT; + } else if (so->type == MESA_SHADER_COMPUTE) { + num_driver_params = IR3_DP_CS_COUNT; + } + + so->constbase.driver_param = constoff; + constoff += align(num_driver_params, 4) / 4; + + if ((so->type == MESA_SHADER_VERTEX) && + (compiler->gpu_id < 500) && + so->shader->stream_output.num_outputs > 0) { + so->constbase.tfbo = constoff; + constoff += align(IR3_MAX_SO_BUFFERS * ptrsz, 4) / 4; + } + + so->constbase.immediate = constoff; + + return ctx; +} + +static void +compile_error(struct ir3_context *ctx, const char *format, ...) +{ + struct hash_table *errors = NULL; + va_list ap; + va_start(ap, format); + if (ctx->cur_instr) { + errors = _mesa_hash_table_create(NULL, + _mesa_hash_pointer, + _mesa_key_pointer_equal); + char *msg = ralloc_vasprintf(errors, format, ap); + _mesa_hash_table_insert(errors, ctx->cur_instr, msg); + } else { + _debug_vprintf(format, ap); + } + va_end(ap); + nir_print_shader_annotated(ctx->s, stdout, errors); + ralloc_free(errors); + ctx->error = true; + debug_assert(0); +} + +#define compile_assert(ctx, cond) do { \ + if (!(cond)) compile_error((ctx), "failed assert: "#cond"\n"); \ + } while (0) + +static void +compile_free(struct ir3_context *ctx) +{ + ralloc_free(ctx); +} + +static void +declare_array(struct ir3_context *ctx, nir_register *reg) +{ + struct ir3_array *arr = rzalloc(ctx, struct ir3_array); + arr->id = ++ctx->num_arrays; + /* NOTE: sometimes we get non array regs, for example for arrays of + * length 1. See fs-const-array-of-struct-of-array.shader_test. So + * treat a non-array as if it was an array of length 1. + * + * It would be nice if there was a nir pass to convert arrays of + * length 1 to ssa. 
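+ * + * Note the length computed below is in scalar components: e.g. + * (illustrative) a vec4[3] register gives 4 * 3 = 12, while a plain + * vec2 with num_array_elems == 0 gives 2 * MAX2(1, 0) = 2.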
+ */ + arr->length = reg->num_components * MAX2(1, reg->num_array_elems); + compile_assert(ctx, arr->length > 0); + arr->r = reg; + list_addtail(&arr->node, &ctx->ir->array_list); +} + +static struct ir3_array * +get_array(struct ir3_context *ctx, nir_register *reg) +{ + list_for_each_entry (struct ir3_array, arr, &ctx->ir->array_list, node) { + if (arr->r == reg) + return arr; + } + compile_error(ctx, "bogus reg: %s\n", reg->name); + return NULL; +} + +/* relative (indirect) if address!=NULL */ +static struct ir3_instruction * +create_array_load(struct ir3_context *ctx, struct ir3_array *arr, int n, + struct ir3_instruction *address) +{ + struct ir3_block *block = ctx->block; + struct ir3_instruction *mov; + struct ir3_register *src; + + mov = ir3_instr_create(block, OPC_MOV); + mov->cat1.src_type = TYPE_U32; + mov->cat1.dst_type = TYPE_U32; + mov->barrier_class = IR3_BARRIER_ARRAY_R; + mov->barrier_conflict = IR3_BARRIER_ARRAY_W; + ir3_reg_create(mov, 0, 0); + src = ir3_reg_create(mov, 0, IR3_REG_ARRAY | + COND(address, IR3_REG_RELATIV)); + src->instr = arr->last_write; + src->size = arr->length; + src->array.id = arr->id; + src->array.offset = n; + + if (address) + ir3_instr_set_address(mov, address); + + return mov; +} + +/* relative (indirect) if address!=NULL */ +static void +create_array_store(struct ir3_context *ctx, struct ir3_array *arr, int n, + struct ir3_instruction *src, struct ir3_instruction *address) +{ + struct ir3_block *block = ctx->block; + struct ir3_instruction *mov; + struct ir3_register *dst; + + /* if not relative store, don't create an extra mov, since that + * ends up being difficult for cp to remove. + */ + if (!address) { + dst = src->regs[0]; + + src->barrier_class |= IR3_BARRIER_ARRAY_W; + src->barrier_conflict |= IR3_BARRIER_ARRAY_R | IR3_BARRIER_ARRAY_W; + + dst->flags |= IR3_REG_ARRAY; + dst->instr = arr->last_write; + dst->size = arr->length; + dst->array.id = arr->id; + dst->array.offset = n; + + arr->last_write = src; + + array_insert(block, block->keeps, src); + + return; + } + + mov = ir3_instr_create(block, OPC_MOV); + mov->cat1.src_type = TYPE_U32; + mov->cat1.dst_type = TYPE_U32; + mov->barrier_class = IR3_BARRIER_ARRAY_W; + mov->barrier_conflict = IR3_BARRIER_ARRAY_R | IR3_BARRIER_ARRAY_W; + dst = ir3_reg_create(mov, 0, IR3_REG_ARRAY | + COND(address, IR3_REG_RELATIV)); + dst->instr = arr->last_write; + dst->size = arr->length; + dst->array.id = arr->id; + dst->array.offset = n; + ir3_reg_create(mov, 0, IR3_REG_SSA)->instr = src; + + if (address) + ir3_instr_set_address(mov, address); + + arr->last_write = mov; + + /* the array store may only matter to something in an earlier + * block (ie. loops), but since arrays are not in SSA, depth + * pass won't know this.. 
so keep all array stores: + */ + array_insert(block, block->keeps, mov); +} + +static inline type_t utype_for_size(unsigned bit_size) +{ + switch (bit_size) { + case 32: return TYPE_U32; + case 16: return TYPE_U16; + case 8: return TYPE_U8; + default: unreachable("bad bitsize"); return ~0; + } +} + +static inline type_t utype_src(nir_src src) +{ return utype_for_size(nir_src_bit_size(src)); } + +static inline type_t utype_dst(nir_dest dst) +{ return utype_for_size(nir_dest_bit_size(dst)); } + +/* allocate a n element value array (to be populated by caller) and + * insert in def_ht + */ +static struct ir3_instruction ** +get_dst_ssa(struct ir3_context *ctx, nir_ssa_def *dst, unsigned n) +{ + struct ir3_instruction **value = + ralloc_array(ctx->def_ht, struct ir3_instruction *, n); + _mesa_hash_table_insert(ctx->def_ht, dst, value); + return value; +} + +static struct ir3_instruction ** +get_dst(struct ir3_context *ctx, nir_dest *dst, unsigned n) +{ + struct ir3_instruction **value; + + if (dst->is_ssa) { + value = get_dst_ssa(ctx, &dst->ssa, n); + } else { + value = ralloc_array(ctx, struct ir3_instruction *, n); + } + + /* NOTE: in non-ssa case, we don't really need to store last_dst + * but this helps us catch cases where put_dst() call is forgotten + */ + compile_assert(ctx, !ctx->last_dst); + ctx->last_dst = value; + ctx->last_dst_n = n; + + return value; +} + +static struct ir3_instruction * get_addr(struct ir3_context *ctx, struct ir3_instruction *src, int align); + +static struct ir3_instruction * const * +get_src(struct ir3_context *ctx, nir_src *src) +{ + if (src->is_ssa) { + struct hash_entry *entry; + entry = _mesa_hash_table_search(ctx->def_ht, src->ssa); + compile_assert(ctx, entry); + return entry->data; + } else { + nir_register *reg = src->reg.reg; + struct ir3_array *arr = get_array(ctx, reg); + unsigned num_components = arr->r->num_components; + struct ir3_instruction *addr = NULL; + struct ir3_instruction **value = + ralloc_array(ctx, struct ir3_instruction *, num_components); + + if (src->reg.indirect) + addr = get_addr(ctx, get_src(ctx, src->reg.indirect)[0], + reg->num_components); + + for (unsigned i = 0; i < num_components; i++) { + unsigned n = src->reg.base_offset * reg->num_components + i; + compile_assert(ctx, n < arr->length); + value[i] = create_array_load(ctx, arr, n, addr); + } + + return value; + } +} + +static void +put_dst(struct ir3_context *ctx, nir_dest *dst) +{ + unsigned bit_size = nir_dest_bit_size(*dst); + + if (bit_size < 32) { + for (unsigned i = 0; i < ctx->last_dst_n; i++) { + struct ir3_instruction *dst = ctx->last_dst[i]; + dst->regs[0]->flags |= IR3_REG_HALF; + if (ctx->last_dst[i]->opc == OPC_META_FO) + dst->regs[1]->instr->regs[0]->flags |= IR3_REG_HALF; + } + } + + if (!dst->is_ssa) { + nir_register *reg = dst->reg.reg; + struct ir3_array *arr = get_array(ctx, reg); + unsigned num_components = ctx->last_dst_n; + struct ir3_instruction *addr = NULL; + + if (dst->reg.indirect) + addr = get_addr(ctx, get_src(ctx, dst->reg.indirect)[0], + reg->num_components); + + for (unsigned i = 0; i < num_components; i++) { + unsigned n = dst->reg.base_offset * reg->num_components + i; + compile_assert(ctx, n < arr->length); + if (!ctx->last_dst[i]) + continue; + create_array_store(ctx, arr, n, ctx->last_dst[i], addr); + } + + ralloc_free(ctx->last_dst); + } + ctx->last_dst = NULL; + ctx->last_dst_n = 0; +} + +static struct ir3_instruction * +create_immed_typed(struct ir3_block *block, uint32_t val, type_t type) +{ + struct ir3_instruction *mov; + unsigned 
flags = (type_size(type) < 32) ? IR3_REG_HALF : 0; + + mov = ir3_instr_create(block, OPC_MOV); + mov->cat1.src_type = type; + mov->cat1.dst_type = type; + ir3_reg_create(mov, 0, flags); + ir3_reg_create(mov, 0, IR3_REG_IMMED)->uim_val = val; + + return mov; +} + +static struct ir3_instruction * +create_immed(struct ir3_block *block, uint32_t val) +{ + return create_immed_typed(block, val, TYPE_U32); +} + +static struct ir3_instruction * +create_addr(struct ir3_block *block, struct ir3_instruction *src, int align) +{ + struct ir3_instruction *instr, *immed; + + /* TODO in at least some cases, the backend could probably be + * made clever enough to propagate IR3_REG_HALF.. + */ + instr = ir3_COV(block, src, TYPE_U32, TYPE_S16); + instr->regs[0]->flags |= IR3_REG_HALF; + + switch(align){ + case 1: + /* src *= 1: */ + break; + case 2: + /* src *= 2 => src <<= 1: */ + immed = create_immed(block, 1); + immed->regs[0]->flags |= IR3_REG_HALF; + + instr = ir3_SHL_B(block, instr, 0, immed, 0); + instr->regs[0]->flags |= IR3_REG_HALF; + instr->regs[1]->flags |= IR3_REG_HALF; + break; + case 3: + /* src *= 3: */ + immed = create_immed(block, 3); + immed->regs[0]->flags |= IR3_REG_HALF; + + instr = ir3_MULL_U(block, instr, 0, immed, 0); + instr->regs[0]->flags |= IR3_REG_HALF; + instr->regs[1]->flags |= IR3_REG_HALF; + break; + case 4: + /* src *= 4 => src <<= 2: */ + immed = create_immed(block, 2); + immed->regs[0]->flags |= IR3_REG_HALF; + + instr = ir3_SHL_B(block, instr, 0, immed, 0); + instr->regs[0]->flags |= IR3_REG_HALF; + instr->regs[1]->flags |= IR3_REG_HALF; + break; + default: + unreachable("bad align"); + return NULL; + } + + instr = ir3_MOV(block, instr, TYPE_S16); + instr->regs[0]->num = regid(REG_A0, 0); + instr->regs[0]->flags |= IR3_REG_HALF; + instr->regs[1]->flags |= IR3_REG_HALF; + + return instr; +} + +/* caches addr values to avoid generating multiple cov/shl/mova + * sequences for each use of a given NIR level src as address + */ +static struct ir3_instruction * +get_addr(struct ir3_context *ctx, struct ir3_instruction *src, int align) +{ + struct ir3_instruction *addr; + unsigned idx = align - 1; + + compile_assert(ctx, idx < ARRAY_SIZE(ctx->addr_ht)); + + if (!ctx->addr_ht[idx]) { + ctx->addr_ht[idx] = _mesa_hash_table_create(ctx, + _mesa_hash_pointer, _mesa_key_pointer_equal); + } else { + struct hash_entry *entry; + entry = _mesa_hash_table_search(ctx->addr_ht[idx], src); + if (entry) + return entry->data; + } + + addr = create_addr(ctx->block, src, align); + _mesa_hash_table_insert(ctx->addr_ht[idx], src, addr); + + return addr; +} + +static struct ir3_instruction * +get_predicate(struct ir3_context *ctx, struct ir3_instruction *src) +{ + struct ir3_block *b = ctx->block; + struct ir3_instruction *cond; + + /* NOTE: only cmps.*.* can write p0.x: */ + cond = ir3_CMPS_S(b, src, 0, create_immed(b, 0), 0); + cond->cat2.condition = IR3_COND_NE; + + /* condition always goes in predicate register: */ + cond->regs[0]->num = regid(REG_P0, 0); + + return cond; +} + +static struct ir3_instruction * +create_uniform(struct ir3_context *ctx, unsigned n) +{ + struct ir3_instruction *mov; + + mov = ir3_instr_create(ctx->block, OPC_MOV); + /* TODO get types right? 
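+ * (for a mov with matching src and dst type there should be no + * actual conversion, so F32 ought to be a plain bitwise copy even + * for integer uniforms -- an assumption, not verified here)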
*/ + mov->cat1.src_type = TYPE_F32; + mov->cat1.dst_type = TYPE_F32; + ir3_reg_create(mov, 0, 0); + ir3_reg_create(mov, n, IR3_REG_CONST); + + return mov; +} + +static struct ir3_instruction * +create_uniform_indirect(struct ir3_context *ctx, int n, + struct ir3_instruction *address) +{ + struct ir3_instruction *mov; + + mov = ir3_instr_create(ctx->block, OPC_MOV); + mov->cat1.src_type = TYPE_U32; + mov->cat1.dst_type = TYPE_U32; + ir3_reg_create(mov, 0, 0); + ir3_reg_create(mov, 0, IR3_REG_CONST | IR3_REG_RELATIV)->array.offset = n; + + ir3_instr_set_address(mov, address); + + return mov; +} + +static struct ir3_instruction * +create_collect(struct ir3_context *ctx, struct ir3_instruction *const *arr, + unsigned arrsz) +{ + struct ir3_block *block = ctx->block; + struct ir3_instruction *collect; + + if (arrsz == 0) + return NULL; + + unsigned flags = arr[0]->regs[0]->flags & IR3_REG_HALF; + + collect = ir3_instr_create2(block, OPC_META_FI, 1 + arrsz); + ir3_reg_create(collect, 0, flags); /* dst */ + for (unsigned i = 0; i < arrsz; i++) { + struct ir3_instruction *elem = arr[i]; + + /* Since arrays are pre-colored in RA, we can't assume that + * things will end up in the right place. (Ie. if a collect + * joins elements from two different arrays.) So insert an + * extra mov. + * + * We could possibly skip this if all the collected elements + * are contiguous elements in a single array.. not sure how + * likely that is to happen. + * + * Fixes a problem with glamor shaders, which in effect do + * something like: + * + * if (foo) + * texcoord = .. + * else + * texcoord = .. + * color = texture2D(tex, texcoord); + * + * In this case, texcoord will end up as nir registers (which + * translate to ir3 arrays of length 1). And we can't assume + * the two (or more) arrays will get allocated in consecutive + * scalar registers. + * + */ + if (elem->regs[0]->flags & IR3_REG_ARRAY) { + type_t type = (flags & IR3_REG_HALF) ? 
TYPE_U16 : TYPE_U32; + elem = ir3_MOV(block, elem, type); + } + + compile_assert(ctx, (elem->regs[0]->flags & IR3_REG_HALF) == flags); + ir3_reg_create(collect, 0, IR3_REG_SSA | flags)->instr = elem; + } + + return collect; +} + +static struct ir3_instruction * +create_indirect_load(struct ir3_context *ctx, unsigned arrsz, int n, + struct ir3_instruction *address, struct ir3_instruction *collect) +{ + struct ir3_block *block = ctx->block; + struct ir3_instruction *mov; + struct ir3_register *src; + + mov = ir3_instr_create(block, OPC_MOV); + mov->cat1.src_type = TYPE_U32; + mov->cat1.dst_type = TYPE_U32; + ir3_reg_create(mov, 0, 0); + src = ir3_reg_create(mov, 0, IR3_REG_SSA | IR3_REG_RELATIV); + src->instr = collect; + src->size = arrsz; + src->array.offset = n; + + ir3_instr_set_address(mov, address); + + return mov; +} + +static struct ir3_instruction * +create_input_compmask(struct ir3_context *ctx, unsigned n, unsigned compmask) +{ + struct ir3_instruction *in; + + in = ir3_instr_create(ctx->in_block, OPC_META_INPUT); + in->inout.block = ctx->in_block; + ir3_reg_create(in, n, 0); + + in->regs[0]->wrmask = compmask; + + return in; +} + +static struct ir3_instruction * +create_input(struct ir3_context *ctx, unsigned n) +{ + return create_input_compmask(ctx, n, 0x1); +} + +static struct ir3_instruction * +create_frag_input(struct ir3_context *ctx, bool use_ldlv) +{ + struct ir3_block *block = ctx->block; + struct ir3_instruction *instr; + /* actual inloc is assigned and fixed up later: */ + struct ir3_instruction *inloc = create_immed(block, 0); + + if (use_ldlv) { + instr = ir3_LDLV(block, inloc, 0, create_immed(block, 1), 0); + instr->cat6.type = TYPE_U32; + instr->cat6.iim_val = 1; + } else { + instr = ir3_BARY_F(block, inloc, 0, ctx->frag_vcoord, 0); + instr->regs[2]->wrmask = 0x3; + } + + return instr; +} + +static struct ir3_instruction * +create_driver_param(struct ir3_context *ctx, enum ir3_driver_param dp) +{ + /* first four vec4 sysvals reserved for UBOs: */ + /* NOTE: dp is in scalar, but there can be >4 dp components: */ + unsigned n = ctx->so->constbase.driver_param; + unsigned r = regid(n + dp / 4, dp % 4); + return create_uniform(ctx, r); +} + +/* helper for instructions that produce multiple consecutive scalar + * outputs which need to have a split/fanout meta instruction inserted + */ +static void +split_dest(struct ir3_block *block, struct ir3_instruction **dst, + struct ir3_instruction *src, unsigned base, unsigned n) +{ + struct ir3_instruction *prev = NULL; + + if ((n == 1) && (src->regs[0]->wrmask == 0x1)) { + dst[0] = src; + return; + } + + for (int i = 0, j = 0; i < n; i++) { + struct ir3_instruction *split = ir3_instr_create(block, OPC_META_FO); + ir3_reg_create(split, 0, IR3_REG_SSA); + ir3_reg_create(split, 0, IR3_REG_SSA)->instr = src; + split->fo.off = i + base; + + if (prev) { + split->cp.left = prev; + split->cp.left_cnt++; + prev->cp.right = split; + prev->cp.right_cnt++; + } + prev = split; + + if (src->regs[0]->wrmask & (1 << (i + base))) + dst[j++] = split; + } +} + +/* + * Adreno uses uint rather than having a dedicated bool type, + * which (potentially) requires some conversion, in particular + * when using the output of a bool instr as an int input, or vice + * versa. 
+ * + * | Adreno | NIR | + * -------+---------+-------+- + * true | 1 | ~0 | + * false | 0 | 0 | + * + * To convert from an adreno bool (uint) to nir, use: + * + * absneg.s dst, (neg)src + * + * To convert back in the other direction: + * + * absneg.s dst, (abs)src + * + * The CP step can clean up the absneg.s that cancel each other + * out, and with a slight bit of extra cleverness (to recognize + * the instructions which produce either a 0 or 1) can eliminate + * the absneg.s's completely when an instruction that wants + * 0/1 consumes the result. For example, when a nir 'bcsel' + * consumes the result of 'feq'. So we should be able to get by + * without a boolean resolve step, and without incurring any + * extra penalty in instruction count. + */ + +/* NIR bool -> native (adreno): */ +static struct ir3_instruction * +ir3_b2n(struct ir3_block *block, struct ir3_instruction *instr) +{ + return ir3_ABSNEG_S(block, instr, IR3_REG_SABS); +} + +/* native (adreno) -> NIR bool: */ +static struct ir3_instruction * +ir3_n2b(struct ir3_block *block, struct ir3_instruction *instr) +{ + return ir3_ABSNEG_S(block, instr, IR3_REG_SNEG); +} + +/* + * alu/sfu instructions: + */ + +static struct ir3_instruction * +create_cov(struct ir3_context *ctx, struct ir3_instruction *src, + unsigned src_bitsize, nir_op op) +{ + type_t src_type, dst_type; + + switch (op) { + case nir_op_f2f32: + case nir_op_f2f16_rtne: + case nir_op_f2f16_rtz: + case nir_op_f2f16: + case nir_op_f2i32: + case nir_op_f2i16: + case nir_op_f2i8: + case nir_op_f2u32: + case nir_op_f2u16: + case nir_op_f2u8: + switch (src_bitsize) { + case 32: + src_type = TYPE_F32; + break; + case 16: + src_type = TYPE_F16; + break; + default: + compile_error(ctx, "invalid src bit size: %u", src_bitsize); + } + break; + + case nir_op_i2f32: + case nir_op_i2f16: + case nir_op_i2i32: + case nir_op_i2i16: + case nir_op_i2i8: + switch (src_bitsize) { + case 32: + src_type = TYPE_S32; + break; + case 16: + src_type = TYPE_S16; + break; + case 8: + src_type = TYPE_S8; + break; + default: + compile_error(ctx, "invalid src bit size: %u", src_bitsize); + } + break; + + case nir_op_u2f32: + case nir_op_u2f16: + case nir_op_u2u32: + case nir_op_u2u16: + case nir_op_u2u8: + switch (src_bitsize) { + case 32: + src_type = TYPE_U32; + break; + case 16: + src_type = TYPE_U16; + break; + case 8: + src_type = TYPE_U8; + break; + default: + compile_error(ctx, "invalid src bit size: %u", src_bitsize); + } + break; + + default: + compile_error(ctx, "invalid conversion op: %u", op); + } + + switch (op) { + case nir_op_f2f32: + case nir_op_i2f32: + case nir_op_u2f32: + dst_type = TYPE_F32; + break; + + case nir_op_f2f16_rtne: + case nir_op_f2f16_rtz: + case nir_op_f2f16: + /* TODO how to handle rounding mode? 
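+ * (note that f2f16_rtne and f2f16_rtz currently take the same path + * as plain f2f16 here, so the requested rounding mode is effectively + * ignored)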
*/ + case nir_op_i2f16: + case nir_op_u2f16: + dst_type = TYPE_F16; + break; + + case nir_op_f2i32: + case nir_op_i2i32: + dst_type = TYPE_S32; + break; + + case nir_op_f2i16: + case nir_op_i2i16: + dst_type = TYPE_S16; + break; + + case nir_op_f2i8: + case nir_op_i2i8: + dst_type = TYPE_S8; + break; + + case nir_op_f2u32: + case nir_op_u2u32: + dst_type = TYPE_U32; + break; + + case nir_op_f2u16: + case nir_op_u2u16: + dst_type = TYPE_U16; + break; + + case nir_op_f2u8: + case nir_op_u2u8: + dst_type = TYPE_U8; + break; + + default: + compile_error(ctx, "invalid conversion op: %u", op); + } + + return ir3_COV(ctx->block, src, src_type, dst_type); +} + +static void +emit_alu(struct ir3_context *ctx, nir_alu_instr *alu) +{ + const nir_op_info *info = &nir_op_infos[alu->op]; + struct ir3_instruction **dst, *src[info->num_inputs]; + unsigned bs[info->num_inputs]; /* bit size */ + struct ir3_block *b = ctx->block; + unsigned dst_sz, wrmask; + + if (alu->dest.dest.is_ssa) { + dst_sz = alu->dest.dest.ssa.num_components; + wrmask = (1 << dst_sz) - 1; + } else { + dst_sz = alu->dest.dest.reg.reg->num_components; + wrmask = alu->dest.write_mask; + } + + dst = get_dst(ctx, &alu->dest.dest, dst_sz); + + /* Vectors are special in that they have non-scalarized writemasks, + * and just take the first swizzle channel for each argument in + * order into each writemask channel. + */ + if ((alu->op == nir_op_vec2) || + (alu->op == nir_op_vec3) || + (alu->op == nir_op_vec4)) { + + for (int i = 0; i < info->num_inputs; i++) { + nir_alu_src *asrc = &alu->src[i]; + + compile_assert(ctx, !asrc->abs); + compile_assert(ctx, !asrc->negate); + + src[i] = get_src(ctx, &asrc->src)[asrc->swizzle[0]]; + if (!src[i]) + src[i] = create_immed(ctx->block, 0); + dst[i] = ir3_MOV(b, src[i], TYPE_U32); + } + + put_dst(ctx, &alu->dest.dest); + return; + } + + /* We also get mov's with more than one component, so handle + * those specially: + */ + if ((alu->op == nir_op_imov) || (alu->op == nir_op_fmov)) { + type_t type = (alu->op == nir_op_imov) ? TYPE_U32 : TYPE_F32; + nir_alu_src *asrc = &alu->src[0]; + struct ir3_instruction *const *src0 = get_src(ctx, &asrc->src); + + for (unsigned i = 0; i < dst_sz; i++) { + if (wrmask & (1 << i)) { + dst[i] = ir3_MOV(b, src0[asrc->swizzle[i]], type); + } else { + dst[i] = NULL; + } + } + + put_dst(ctx, &alu->dest.dest); + return; + } + + /* General case: We can just grab the one used channel per src. 
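+ * E.g. with write_mask 0x2 (.y only), chan below is 1 and each src + * contributes its swizzle[1] component.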
*/ + for (int i = 0; i < info->num_inputs; i++) { + unsigned chan = ffs(alu->dest.write_mask) - 1; + nir_alu_src *asrc = &alu->src[i]; + + compile_assert(ctx, !asrc->abs); + compile_assert(ctx, !asrc->negate); + + src[i] = get_src(ctx, &asrc->src)[asrc->swizzle[chan]]; + bs[i] = nir_src_bit_size(asrc->src); + + compile_assert(ctx, src[i]); + } + + switch (alu->op) { + case nir_op_f2f32: + case nir_op_f2f16_rtne: + case nir_op_f2f16_rtz: + case nir_op_f2f16: + case nir_op_f2i32: + case nir_op_f2i16: + case nir_op_f2i8: + case nir_op_f2u32: + case nir_op_f2u16: + case nir_op_f2u8: + case nir_op_i2f32: + case nir_op_i2f16: + case nir_op_i2i32: + case nir_op_i2i16: + case nir_op_i2i8: + case nir_op_u2f32: + case nir_op_u2f16: + case nir_op_u2u32: + case nir_op_u2u16: + case nir_op_u2u8: + dst[0] = create_cov(ctx, src[0], bs[0], alu->op); + break; + case nir_op_f2b: + dst[0] = ir3_CMPS_F(b, src[0], 0, create_immed(b, fui(0.0)), 0); + dst[0]->cat2.condition = IR3_COND_NE; + dst[0] = ir3_n2b(b, dst[0]); + break; + case nir_op_b2f: + dst[0] = ir3_COV(b, ir3_b2n(b, src[0]), TYPE_U32, TYPE_F32); + break; + case nir_op_b2i: + dst[0] = ir3_b2n(b, src[0]); + break; + case nir_op_i2b: + dst[0] = ir3_CMPS_S(b, src[0], 0, create_immed(b, 0), 0); + dst[0]->cat2.condition = IR3_COND_NE; + dst[0] = ir3_n2b(b, dst[0]); + break; + + case nir_op_fneg: + dst[0] = ir3_ABSNEG_F(b, src[0], IR3_REG_FNEG); + break; + case nir_op_fabs: + dst[0] = ir3_ABSNEG_F(b, src[0], IR3_REG_FABS); + break; + case nir_op_fmax: + dst[0] = ir3_MAX_F(b, src[0], 0, src[1], 0); + break; + case nir_op_fmin: + dst[0] = ir3_MIN_F(b, src[0], 0, src[1], 0); + break; + case nir_op_fsat: + /* if there is just a single use of the src, and it supports + * (sat) bit, we can just fold the (sat) flag back to the + * src instruction and create a mov. This is easier for cp + * to eliminate. + * + * TODO probably opc_cat==4 is ok too + */ + if (alu->src[0].src.is_ssa && + (list_length(&alu->src[0].src.ssa->uses) == 1) && + ((opc_cat(src[0]->opc) == 2) || (opc_cat(src[0]->opc) == 3))) { + src[0]->flags |= IR3_INSTR_SAT; + dst[0] = ir3_MOV(b, src[0], TYPE_U32); + } else { + /* otherwise generate a max.f that saturates.. 
blob does + * similar (generating a cat2 mov using max.f) + */ + dst[0] = ir3_MAX_F(b, src[0], 0, src[0], 0); + dst[0]->flags |= IR3_INSTR_SAT; + } + break; + case nir_op_fmul: + dst[0] = ir3_MUL_F(b, src[0], 0, src[1], 0); + break; + case nir_op_fadd: + dst[0] = ir3_ADD_F(b, src[0], 0, src[1], 0); + break; + case nir_op_fsub: + dst[0] = ir3_ADD_F(b, src[0], 0, src[1], IR3_REG_FNEG); + break; + case nir_op_ffma: + dst[0] = ir3_MAD_F32(b, src[0], 0, src[1], 0, src[2], 0); + break; + case nir_op_fddx: + dst[0] = ir3_DSX(b, src[0], 0); + dst[0]->cat5.type = TYPE_F32; + break; + case nir_op_fddy: + dst[0] = ir3_DSY(b, src[0], 0); + dst[0]->cat5.type = TYPE_F32; + break; + case nir_op_flt: + dst[0] = ir3_CMPS_F(b, src[0], 0, src[1], 0); + dst[0]->cat2.condition = IR3_COND_LT; + dst[0] = ir3_n2b(b, dst[0]); + break; + case nir_op_fge: + dst[0] = ir3_CMPS_F(b, src[0], 0, src[1], 0); + dst[0]->cat2.condition = IR3_COND_GE; + dst[0] = ir3_n2b(b, dst[0]); + break; + case nir_op_feq: + dst[0] = ir3_CMPS_F(b, src[0], 0, src[1], 0); + dst[0]->cat2.condition = IR3_COND_EQ; + dst[0] = ir3_n2b(b, dst[0]); + break; + case nir_op_fne: + dst[0] = ir3_CMPS_F(b, src[0], 0, src[1], 0); + dst[0]->cat2.condition = IR3_COND_NE; + dst[0] = ir3_n2b(b, dst[0]); + break; + case nir_op_fceil: + dst[0] = ir3_CEIL_F(b, src[0], 0); + break; + case nir_op_ffloor: + dst[0] = ir3_FLOOR_F(b, src[0], 0); + break; + case nir_op_ftrunc: + dst[0] = ir3_TRUNC_F(b, src[0], 0); + break; + case nir_op_fround_even: + dst[0] = ir3_RNDNE_F(b, src[0], 0); + break; + case nir_op_fsign: + dst[0] = ir3_SIGN_F(b, src[0], 0); + break; + + case nir_op_fsin: + dst[0] = ir3_SIN(b, src[0], 0); + break; + case nir_op_fcos: + dst[0] = ir3_COS(b, src[0], 0); + break; + case nir_op_frsq: + dst[0] = ir3_RSQ(b, src[0], 0); + break; + case nir_op_frcp: + dst[0] = ir3_RCP(b, src[0], 0); + break; + case nir_op_flog2: + dst[0] = ir3_LOG2(b, src[0], 0); + break; + case nir_op_fexp2: + dst[0] = ir3_EXP2(b, src[0], 0); + break; + case nir_op_fsqrt: + dst[0] = ir3_SQRT(b, src[0], 0); + break; + + case nir_op_iabs: + dst[0] = ir3_ABSNEG_S(b, src[0], IR3_REG_SABS); + break; + case nir_op_iadd: + dst[0] = ir3_ADD_U(b, src[0], 0, src[1], 0); + break; + case nir_op_iand: + dst[0] = ir3_AND_B(b, src[0], 0, src[1], 0); + break; + case nir_op_imax: + dst[0] = ir3_MAX_S(b, src[0], 0, src[1], 0); + break; + case nir_op_umax: + dst[0] = ir3_MAX_U(b, src[0], 0, src[1], 0); + break; + case nir_op_imin: + dst[0] = ir3_MIN_S(b, src[0], 0, src[1], 0); + break; + case nir_op_umin: + dst[0] = ir3_MIN_U(b, src[0], 0, src[1], 0); + break; + case nir_op_imul: + /* + * dst = (al * bl) + (ah * bl << 16) + (al * bh << 16) + * mull.u tmp0, a, b ; mul low, i.e. al * bl + * madsh.m16 tmp1, a, b, tmp0 ; mul-add shift high mix, i.e. ah * bl << 16 + * madsh.m16 dst, b, a, tmp1 ; i.e. al * bh << 16 + */ + dst[0] = ir3_MADSH_M16(b, src[1], 0, src[0], 0, + ir3_MADSH_M16(b, src[0], 0, src[1], 0, + ir3_MULL_U(b, src[0], 0, src[1], 0), 0), 0); + break; + case nir_op_ineg: + dst[0] = ir3_ABSNEG_S(b, src[0], IR3_REG_SNEG); + break; + case nir_op_inot: + dst[0] = ir3_NOT_B(b, src[0], 0); + break; + case nir_op_ior: + dst[0] = ir3_OR_B(b, src[0], 0, src[1], 0); + break; + case nir_op_ishl: + dst[0] = ir3_SHL_B(b, src[0], 0, src[1], 0); + break; + case nir_op_ishr: + dst[0] = ir3_ASHR_B(b, src[0], 0, src[1], 0); + break; + case nir_op_isign: { + /* maybe this would be sane to lower in nir.. 
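+ * (what follows implements isign(x) = (x > 0) - (x < 0): the two + * cmps.s produce 0/1 values and the sub combines them)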
*/ + struct ir3_instruction *neg, *pos; + + neg = ir3_CMPS_S(b, src[0], 0, create_immed(b, 0), 0); + neg->cat2.condition = IR3_COND_LT; + + pos = ir3_CMPS_S(b, src[0], 0, create_immed(b, 0), 0); + pos->cat2.condition = IR3_COND_GT; + + dst[0] = ir3_SUB_U(b, pos, 0, neg, 0); + + break; + } + case nir_op_isub: + dst[0] = ir3_SUB_U(b, src[0], 0, src[1], 0); + break; + case nir_op_ixor: + dst[0] = ir3_XOR_B(b, src[0], 0, src[1], 0); + break; + case nir_op_ushr: + dst[0] = ir3_SHR_B(b, src[0], 0, src[1], 0); + break; + case nir_op_ilt: + dst[0] = ir3_CMPS_S(b, src[0], 0, src[1], 0); + dst[0]->cat2.condition = IR3_COND_LT; + dst[0] = ir3_n2b(b, dst[0]); + break; + case nir_op_ige: + dst[0] = ir3_CMPS_S(b, src[0], 0, src[1], 0); + dst[0]->cat2.condition = IR3_COND_GE; + dst[0] = ir3_n2b(b, dst[0]); + break; + case nir_op_ieq: + dst[0] = ir3_CMPS_S(b, src[0], 0, src[1], 0); + dst[0]->cat2.condition = IR3_COND_EQ; + dst[0] = ir3_n2b(b, dst[0]); + break; + case nir_op_ine: + dst[0] = ir3_CMPS_S(b, src[0], 0, src[1], 0); + dst[0]->cat2.condition = IR3_COND_NE; + dst[0] = ir3_n2b(b, dst[0]); + break; + case nir_op_ult: + dst[0] = ir3_CMPS_U(b, src[0], 0, src[1], 0); + dst[0]->cat2.condition = IR3_COND_LT; + dst[0] = ir3_n2b(b, dst[0]); + break; + case nir_op_uge: + dst[0] = ir3_CMPS_U(b, src[0], 0, src[1], 0); + dst[0]->cat2.condition = IR3_COND_GE; + dst[0] = ir3_n2b(b, dst[0]); + break; + + case nir_op_bcsel: { + struct ir3_instruction *cond = ir3_b2n(b, src[0]); + compile_assert(ctx, bs[1] == bs[2]); + /* the boolean condition is 32b even if src[1] and src[2] are + * half-precision, but sel.b16 wants all three src's to be the + * same type. + */ + if (bs[1] < 32) + cond = ir3_COV(b, cond, TYPE_U32, TYPE_U16); + dst[0] = ir3_SEL_B32(b, src[1], 0, cond, 0, src[2], 0); + break; + } + case nir_op_bit_count: + dst[0] = ir3_CBITS_B(b, src[0], 0); + break; + case nir_op_ifind_msb: { + struct ir3_instruction *cmp; + dst[0] = ir3_CLZ_S(b, src[0], 0); + cmp = ir3_CMPS_S(b, dst[0], 0, create_immed(b, 0), 0); + cmp->cat2.condition = IR3_COND_GE; + dst[0] = ir3_SEL_B32(b, + ir3_SUB_U(b, create_immed(b, 31), 0, dst[0], 0), 0, + cmp, 0, dst[0], 0); + break; + } + case nir_op_ufind_msb: + dst[0] = ir3_CLZ_B(b, src[0], 0); + dst[0] = ir3_SEL_B32(b, + ir3_SUB_U(b, create_immed(b, 31), 0, dst[0], 0), 0, + src[0], 0, dst[0], 0); + break; + case nir_op_find_lsb: + dst[0] = ir3_BFREV_B(b, src[0], 0); + dst[0] = ir3_CLZ_B(b, dst[0], 0); + break; + case nir_op_bitfield_reverse: + dst[0] = ir3_BFREV_B(b, src[0], 0); + break; + + default: + compile_error(ctx, "Unhandled ALU op: %s\n", + nir_op_infos[alu->op].name); + break; + } + + put_dst(ctx, &alu->dest.dest); +} + +/* handles direct/indirect UBO reads: */ +static void +emit_intrinsic_load_ubo(struct ir3_context *ctx, nir_intrinsic_instr *intr, + struct ir3_instruction **dst) +{ + struct ir3_block *b = ctx->block; + struct ir3_instruction *base_lo, *base_hi, *addr, *src0, *src1; + nir_const_value *const_offset; + /* UBO addresses are the first driver params: */ + unsigned ubo = regid(ctx->so->constbase.ubo, 0); + const unsigned ptrsz = pointer_size(ctx); + + int off = 0; + + /* First src is ubo index, which could either be an immed or not: */ + src0 = get_src(ctx, &intr->src[0])[0]; + if (is_same_type_mov(src0) && + (src0->regs[1]->flags & IR3_REG_IMMED)) { + base_lo = create_uniform(ctx, ubo + (src0->regs[1]->iim_val * ptrsz)); + base_hi = create_uniform(ctx, ubo + (src0->regs[1]->iim_val * ptrsz) + 1); + } else { + base_lo = create_uniform_indirect(ctx, ubo, 
get_addr(ctx, src0, 4)); + base_hi = create_uniform_indirect(ctx, ubo + 1, get_addr(ctx, src0, 4)); + } + + /* note: on 32bit gpus base_hi is ignored and DCE'd */ + addr = base_lo; + + const_offset = nir_src_as_const_value(intr->src[1]); + if (const_offset) { + off += const_offset->u32[0]; + } else { + /* For load_ubo_indirect, second src is indirect offset: */ + src1 = get_src(ctx, &intr->src[1])[0]; + + /* and add offset to addr: */ + addr = ir3_ADD_S(b, addr, 0, src1, 0); + } + + /* if offset is too large to encode in the ldg, split it out: */ + if ((off + (intr->num_components * 4)) > 1024) { + /* split out the minimal amount to improve the odds that + * cp can fit the immediate in the add.s instruction: + */ + unsigned off2 = off + (intr->num_components * 4) - 1024; + addr = ir3_ADD_S(b, addr, 0, create_immed(b, off2), 0); + off -= off2; + } + + if (ptrsz == 2) { + struct ir3_instruction *carry; + + /* handle 32b rollover, ie: + * if (addr < base_lo) + * base_hi++ + */ + carry = ir3_CMPS_U(b, addr, 0, base_lo, 0); + carry->cat2.condition = IR3_COND_LT; + base_hi = ir3_ADD_S(b, base_hi, 0, carry, 0); + + addr = create_collect(ctx, (struct ir3_instruction*[]){ addr, base_hi }, 2); + } + + for (int i = 0; i < intr->num_components; i++) { + struct ir3_instruction *load = + ir3_LDG(b, addr, 0, create_immed(b, 1), 0); + load->cat6.type = TYPE_U32; + load->cat6.src_offset = off + i * 4; /* byte offset */ + dst[i] = load; + } +} + +/* src[] = { buffer_index, offset }. No const_index */ +static void +emit_intrinsic_load_ssbo(struct ir3_context *ctx, nir_intrinsic_instr *intr, + struct ir3_instruction **dst) +{ + struct ir3_block *b = ctx->block; + struct ir3_instruction *ldgb, *src0, *src1, *offset; + nir_const_value *const_offset; + + /* can this be non-const buffer_index? how do we handle that? */ + const_offset = nir_src_as_const_value(intr->src[0]); + compile_assert(ctx, const_offset); + + offset = get_src(ctx, &intr->src[1])[0]; + + /* src0 is uvec2(offset*4, 0), src1 is offset.. nir already *= 4: */ + src0 = create_collect(ctx, (struct ir3_instruction*[]){ + offset, + create_immed(b, 0), + }, 2); + src1 = ir3_SHR_B(b, offset, 0, create_immed(b, 2), 0); + + ldgb = ir3_LDGB(b, create_immed(b, const_offset->u32[0]), 0, + src0, 0, src1, 0); + ldgb->regs[0]->wrmask = MASK(intr->num_components); + ldgb->cat6.iim_val = intr->num_components; + ldgb->cat6.d = 4; + ldgb->cat6.type = TYPE_U32; + ldgb->barrier_class = IR3_BARRIER_BUFFER_R; + ldgb->barrier_conflict = IR3_BARRIER_BUFFER_W; + + split_dest(b, dst, ldgb, 0, intr->num_components); +} + +/* src[] = { value, block_index, offset }. const_index[] = { write_mask } */ +static void +emit_intrinsic_store_ssbo(struct ir3_context *ctx, nir_intrinsic_instr *intr) +{ + struct ir3_block *b = ctx->block; + struct ir3_instruction *stgb, *src0, *src1, *src2, *offset; + nir_const_value *const_offset; + /* TODO handle wrmask properly, see _store_shared().. but I think + * it is more a PITA than that, since blob ends up loading the + * masked components and writing them back out. + */ + unsigned wrmask = intr->const_index[0]; + unsigned ncomp = ffs(~wrmask) - 1; + + /* can this be non-const buffer_index? how do we handle that? */ + const_offset = nir_src_as_const_value(intr->src[1]); + compile_assert(ctx, const_offset); + + offset = get_src(ctx, &intr->src[2])[0]; + + /* src0 is value, src1 is offset, src2 is uvec2(offset*4, 0).. 
+ * nir already *= 4: + */ + src0 = create_collect(ctx, get_src(ctx, &intr->src[0]), ncomp); + src1 = ir3_SHR_B(b, offset, 0, create_immed(b, 2), 0); + src2 = create_collect(ctx, (struct ir3_instruction*[]){ + offset, + create_immed(b, 0), + }, 2); + + stgb = ir3_STGB(b, create_immed(b, const_offset->u32[0]), 0, + src0, 0, src1, 0, src2, 0); + stgb->cat6.iim_val = ncomp; + stgb->cat6.d = 4; + stgb->cat6.type = TYPE_U32; + stgb->barrier_class = IR3_BARRIER_BUFFER_W; + stgb->barrier_conflict = IR3_BARRIER_BUFFER_R | IR3_BARRIER_BUFFER_W; + + array_insert(b, b->keeps, stgb); +} + +/* src[] = { block_index } */ +static void +emit_intrinsic_ssbo_size(struct ir3_context *ctx, nir_intrinsic_instr *intr, + struct ir3_instruction **dst) +{ + /* SSBO size stored as a const starting at ssbo_sizes: */ + unsigned blk_idx = nir_src_as_const_value(intr->src[0])->u32[0]; + unsigned idx = regid(ctx->so->constbase.ssbo_sizes, 0) + + ctx->so->const_layout.ssbo_size.off[blk_idx]; + + debug_assert(ctx->so->const_layout.ssbo_size.mask & (1 << blk_idx)); + + dst[0] = create_uniform(ctx, idx); +} + +/* + * SSBO atomic intrinsics + * + * All of the SSBO atomic memory operations read a value from memory, + * compute a new value using one of the operations below, write the new + * value to memory, and return the original value read. + * + * All operations take 3 sources except CompSwap that takes 4. These + * sources represent: + * + * 0: The SSBO buffer index. + * 1: The offset into the SSBO buffer of the variable that the atomic + * operation will operate on. + * 2: The data parameter to the atomic function (i.e. the value to add + * in ssbo_atomic_add, etc). + * 3: For CompSwap only: the second data parameter. + */ +static struct ir3_instruction * +emit_intrinsic_atomic_ssbo(struct ir3_context *ctx, nir_intrinsic_instr *intr) +{ + struct ir3_block *b = ctx->block; + struct ir3_instruction *atomic, *ssbo, *src0, *src1, *src2, *offset; + nir_const_value *const_offset; + type_t type = TYPE_U32; + + /* can this be non-const buffer_index? how do we handle that? 
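+ * (a dynamically indexed SSBO would not survive the compile_assert + * below; the assert documents that assumption rather than handling + * it)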
*/ + const_offset = nir_src_as_const_value(intr->src[0]); + compile_assert(ctx, const_offset); + ssbo = create_immed(b, const_offset->u32[0]); + + offset = get_src(ctx, &intr->src[1])[0]; + + /* src0 is data (or uvec2(data, compare)) + * src1 is offset + * src2 is uvec2(offset*4, 0) (appears to be 64b byte offset) + * + * Note that nir already multiplies the offset by four + */ + src0 = get_src(ctx, &intr->src[2])[0]; + src1 = ir3_SHR_B(b, offset, 0, create_immed(b, 2), 0); + src2 = create_collect(ctx, (struct ir3_instruction*[]){ + offset, + create_immed(b, 0), + }, 2); + + switch (intr->intrinsic) { + case nir_intrinsic_ssbo_atomic_add: + atomic = ir3_ATOMIC_ADD_G(b, ssbo, 0, src0, 0, src1, 0, src2, 0); + break; + case nir_intrinsic_ssbo_atomic_imin: + atomic = ir3_ATOMIC_MIN_G(b, ssbo, 0, src0, 0, src1, 0, src2, 0); + type = TYPE_S32; + break; + case nir_intrinsic_ssbo_atomic_umin: + atomic = ir3_ATOMIC_MIN_G(b, ssbo, 0, src0, 0, src1, 0, src2, 0); + break; + case nir_intrinsic_ssbo_atomic_imax: + atomic = ir3_ATOMIC_MAX_G(b, ssbo, 0, src0, 0, src1, 0, src2, 0); + type = TYPE_S32; + break; + case nir_intrinsic_ssbo_atomic_umax: + atomic = ir3_ATOMIC_MAX_G(b, ssbo, 0, src0, 0, src1, 0, src2, 0); + break; + case nir_intrinsic_ssbo_atomic_and: + atomic = ir3_ATOMIC_AND_G(b, ssbo, 0, src0, 0, src1, 0, src2, 0); + break; + case nir_intrinsic_ssbo_atomic_or: + atomic = ir3_ATOMIC_OR_G(b, ssbo, 0, src0, 0, src1, 0, src2, 0); + break; + case nir_intrinsic_ssbo_atomic_xor: + atomic = ir3_ATOMIC_XOR_G(b, ssbo, 0, src0, 0, src1, 0, src2, 0); + break; + case nir_intrinsic_ssbo_atomic_exchange: + atomic = ir3_ATOMIC_XCHG_G(b, ssbo, 0, src0, 0, src1, 0, src2, 0); + break; + case nir_intrinsic_ssbo_atomic_comp_swap: + /* for cmpxchg, src0 is [ui]vec2(data, compare): */ + src0 = create_collect(ctx, (struct ir3_instruction*[]){ + get_src(ctx, &intr->src[3])[0], + src0, + }, 2); + atomic = ir3_ATOMIC_CMPXCHG_G(b, ssbo, 0, src0, 0, src1, 0, src2, 0); + break; + default: + unreachable("boo"); + } + + atomic->cat6.iim_val = 1; + atomic->cat6.d = 4; + atomic->cat6.type = type; + atomic->barrier_class = IR3_BARRIER_BUFFER_W; + atomic->barrier_conflict = IR3_BARRIER_BUFFER_R | IR3_BARRIER_BUFFER_W; + + /* even if nothing consumes the result, we can't DCE the instruction: */ + array_insert(b, b->keeps, atomic); + + return atomic; +} + +/* src[] = { offset }. const_index[] = { base } */ +static void +emit_intrinsic_load_shared(struct ir3_context *ctx, nir_intrinsic_instr *intr, + struct ir3_instruction **dst) +{ + struct ir3_block *b = ctx->block; + struct ir3_instruction *ldl, *offset; + unsigned base; + + offset = get_src(ctx, &intr->src[0])[0]; + base = nir_intrinsic_base(intr); + + ldl = ir3_LDL(b, offset, 0, create_immed(b, intr->num_components), 0); + ldl->cat6.src_offset = base; + ldl->cat6.type = utype_dst(intr->dest); + ldl->regs[0]->wrmask = MASK(intr->num_components); + + ldl->barrier_class = IR3_BARRIER_SHARED_R; + ldl->barrier_conflict = IR3_BARRIER_SHARED_W; + + split_dest(b, dst, ldl, 0, intr->num_components); +} + +/* src[] = { value, offset }. 
const_index[] = { base, write_mask } */ +static void +emit_intrinsic_store_shared(struct ir3_context *ctx, nir_intrinsic_instr *intr) +{ + struct ir3_block *b = ctx->block; + struct ir3_instruction *stl, *offset; + struct ir3_instruction * const *value; + unsigned base, wrmask; + + value = get_src(ctx, &intr->src[0]); + offset = get_src(ctx, &intr->src[1])[0]; + + base = nir_intrinsic_base(intr); + wrmask = nir_intrinsic_write_mask(intr); + + /* Combine groups of consecutive enabled channels in one write + * message. We use ffs to find the first enabled channel and then ffs on + * the bit-inverse, down-shifted writemask to determine the length of + * the block of enabled bits. + * + * (trick stolen from i965's fs_visitor::nir_emit_cs_intrinsic()) + */ + while (wrmask) { + unsigned first_component = ffs(wrmask) - 1; + unsigned length = ffs(~(wrmask >> first_component)) - 1; + + stl = ir3_STL(b, offset, 0, + create_collect(ctx, &value[first_component], length), 0, + create_immed(b, length), 0); + stl->cat6.dst_offset = first_component + base; + stl->cat6.type = utype_src(intr->src[0]); + stl->barrier_class = IR3_BARRIER_SHARED_W; + stl->barrier_conflict = IR3_BARRIER_SHARED_R | IR3_BARRIER_SHARED_W; + + array_insert(b, b->keeps, stl); + + /* Clear the bits in the writemask that we just wrote, then try + * again to see if more channels are left. + */ + wrmask &= (15 << (first_component + length)); + } +} + +/* + * CS shared variable atomic intrinsics + * + * All of the shared variable atomic memory operations read a value from + * memory, compute a new value using one of the operations below, write the + * new value to memory, and return the original value read. + * + * All operations take 2 sources except CompSwap that takes 3. These + * sources represent: + * + * 0: The offset into the shared variable storage region that the atomic + * operation will operate on. + * 1: The data parameter to the atomic function (i.e. the value to add + * in shared_atomic_add, etc). + * 2: For CompSwap only: the second data parameter. 
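+ * + * As a sketch (hypothetical GLSL, not from this patch), an + * atomicAdd(shared_counter, 1u) reaches us as shared_atomic_add with + * src[0] = the byte offset of shared_counter and src[1] = an + * immediate 1, and is emitted via the atomic.add case below.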
+ */ +static struct ir3_instruction * +emit_intrinsic_atomic_shared(struct ir3_context *ctx, nir_intrinsic_instr *intr) +{ + struct ir3_block *b = ctx->block; + struct ir3_instruction *atomic, *src0, *src1; + type_t type = TYPE_U32; + + src0 = get_src(ctx, &intr->src[0])[0]; /* offset */ + src1 = get_src(ctx, &intr->src[1])[0]; /* value */ + + switch (intr->intrinsic) { + case nir_intrinsic_shared_atomic_add: + atomic = ir3_ATOMIC_ADD(b, src0, 0, src1, 0); + break; + case nir_intrinsic_shared_atomic_imin: + atomic = ir3_ATOMIC_MIN(b, src0, 0, src1, 0); + type = TYPE_S32; + break; + case nir_intrinsic_shared_atomic_umin: + atomic = ir3_ATOMIC_MIN(b, src0, 0, src1, 0); + break; + case nir_intrinsic_shared_atomic_imax: + atomic = ir3_ATOMIC_MAX(b, src0, 0, src1, 0); + type = TYPE_S32; + break; + case nir_intrinsic_shared_atomic_umax: + atomic = ir3_ATOMIC_MAX(b, src0, 0, src1, 0); + break; + case nir_intrinsic_shared_atomic_and: + atomic = ir3_ATOMIC_AND(b, src0, 0, src1, 0); + break; + case nir_intrinsic_shared_atomic_or: + atomic = ir3_ATOMIC_OR(b, src0, 0, src1, 0); + break; + case nir_intrinsic_shared_atomic_xor: + atomic = ir3_ATOMIC_XOR(b, src0, 0, src1, 0); + break; + case nir_intrinsic_shared_atomic_exchange: + atomic = ir3_ATOMIC_XCHG(b, src0, 0, src1, 0); + break; + case nir_intrinsic_shared_atomic_comp_swap: + /* for cmpxchg, src1 is [ui]vec2(data, compare): */ + src1 = create_collect(ctx, (struct ir3_instruction*[]){ + get_src(ctx, &intr->src[2])[0], + src1, + }, 2); + atomic = ir3_ATOMIC_CMPXCHG(b, src0, 0, src1, 0); + break; + default: + unreachable("boo"); + } + + atomic->cat6.iim_val = 1; + atomic->cat6.d = 1; + atomic->cat6.type = type; + atomic->barrier_class = IR3_BARRIER_SHARED_W; + atomic->barrier_conflict = IR3_BARRIER_SHARED_R | IR3_BARRIER_SHARED_W; + + /* even if nothing consumes the result, we can't DCE the instruction: */ + array_insert(b, b->keeps, atomic); + + return atomic; +} + +/* Images get mapped into SSBO/image state (for store/atomic) and texture + * state block (for load). To simplify things, invert the image id and + * map it from end of state block, ie. image 0 becomes num-1, image 1 + * becomes num-2, etc. This potentially avoids needing to re-emit texture + * state when switching shaders. + * + * TODO is max # of samplers and SSBOs the same? This shouldn't be hard- + * coded. Also, since all the gl shader stages (ie. everything but CS) + * share the same SSBO/image state block, this might require some more + * logic if we supported images in anything other than FS.. + */ +static unsigned +get_image_slot(struct ir3_context *ctx, nir_deref_instr *deref) +{ + unsigned int loc = 0; + unsigned inner_size = 1; + + while (deref->deref_type != nir_deref_type_var) { + assert(deref->deref_type == nir_deref_type_array); + nir_const_value *const_index = nir_src_as_const_value(deref->arr.index); + assert(const_index); + + /* Go to the next instruction */ + deref = nir_deref_instr_parent(deref); + + assert(glsl_type_is_array(deref->type)); + const unsigned array_len = glsl_get_length(deref->type); + loc += MIN2(const_index->u32[0], array_len - 1) * inner_size; + + /* Update the inner size */ + inner_size *= array_len; + } + + loc += deref->var->data.driver_location; + + /* TODO figure out real limit per generation, and don't hardcode: */ + const unsigned max_samplers = 16; + return max_samplers - loc - 1; +} + +/* see tex_info() for equiv logic for texture instructions.. it would be + * nice if this could be better unified.. 
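+ * (for example, a 2DArray image below counts 2 coords plus 1 for the + * array index, with IR3_INSTR_A set)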
+ */ +static unsigned +get_image_coords(const nir_variable *var, unsigned *flagsp) +{ + const struct glsl_type *type = glsl_without_array(var->type); + unsigned coords, flags = 0; + + switch (glsl_get_sampler_dim(type)) { + case GLSL_SAMPLER_DIM_1D: + case GLSL_SAMPLER_DIM_BUF: + coords = 1; + break; + case GLSL_SAMPLER_DIM_2D: + case GLSL_SAMPLER_DIM_RECT: + case GLSL_SAMPLER_DIM_EXTERNAL: + case GLSL_SAMPLER_DIM_MS: + coords = 2; + break; + case GLSL_SAMPLER_DIM_3D: + case GLSL_SAMPLER_DIM_CUBE: + flags |= IR3_INSTR_3D; + coords = 3; + break; + default: + unreachable("bad sampler dim"); + return 0; + } + + if (glsl_sampler_type_is_array(type)) { + /* note: unlike tex_info(), adjust # of coords to include array idx: */ + coords++; + flags |= IR3_INSTR_A; + } + + if (flagsp) + *flagsp = flags; + + return coords; +} + +static type_t +get_image_type(const nir_variable *var) +{ + switch (glsl_get_sampler_result_type(glsl_without_array(var->type))) { + case GLSL_TYPE_UINT: + return TYPE_U32; + case GLSL_TYPE_INT: + return TYPE_S32; + case GLSL_TYPE_FLOAT: + return TYPE_F32; + default: + unreachable("bad sampler type."); + return 0; + } +} + +static struct ir3_instruction * +get_image_offset(struct ir3_context *ctx, const nir_variable *var, + struct ir3_instruction * const *coords, bool byteoff) +{ + struct ir3_block *b = ctx->block; + struct ir3_instruction *offset; + unsigned ncoords = get_image_coords(var, NULL); + + /* to calculate the byte offset (yes, uggg) we need (up to) three + * const values to know the bytes per pixel, and y and z stride: + */ + unsigned cb = regid(ctx->so->constbase.image_dims, 0) + + ctx->so->const_layout.image_dims.off[var->data.driver_location]; + + debug_assert(ctx->so->const_layout.image_dims.mask & + (1 << var->data.driver_location)); + + /* offset = coords.x * bytes_per_pixel: */ + offset = ir3_MUL_S(b, coords[0], 0, create_uniform(ctx, cb + 0), 0); + if (ncoords > 1) { + /* offset += coords.y * y_pitch: */ + offset = ir3_MAD_S24(b, create_uniform(ctx, cb + 1), 0, + coords[1], 0, offset, 0); + } + if (ncoords > 2) { + /* offset += coords.z * z_pitch: */ + offset = ir3_MAD_S24(b, create_uniform(ctx, cb + 2), 0, + coords[2], 0, offset, 0); + } + + if (!byteoff) { + /* Some cases, like atomics, seem to use dword offset instead + * of byte offsets.. blob just puts an extra shr.b in there + * in those cases: + */ + offset = ir3_SHR_B(b, offset, 0, create_immed(b, 2), 0); + } + + return create_collect(ctx, (struct ir3_instruction*[]){ + offset, + create_immed(b, 0), + }, 2); +} + +/* src[] = { deref, coord, sample_index }. 
const_index[] = {} */ +static void +emit_intrinsic_load_image(struct ir3_context *ctx, nir_intrinsic_instr *intr, + struct ir3_instruction **dst) +{ + struct ir3_block *b = ctx->block; + const nir_variable *var = nir_intrinsic_get_var(intr, 0); + struct ir3_instruction *sam; + struct ir3_instruction * const *src0 = get_src(ctx, &intr->src[1]); + struct ir3_instruction *coords[4]; + unsigned flags, ncoords = get_image_coords(var, &flags); + unsigned tex_idx = get_image_slot(ctx, nir_src_as_deref(intr->src[0])); + type_t type = get_image_type(var); + + /* hmm, this seems a bit odd, but it is what blob does and (at least + * a5xx) just faults on bogus addresses otherwise: + */ + if (flags & IR3_INSTR_3D) { + flags &= ~IR3_INSTR_3D; + flags |= IR3_INSTR_A; + } + + for (unsigned i = 0; i < ncoords; i++) + coords[i] = src0[i]; + + if (ncoords == 1) + coords[ncoords++] = create_immed(b, 0); + + sam = ir3_SAM(b, OPC_ISAM, type, 0b1111, flags, + tex_idx, tex_idx, create_collect(ctx, coords, ncoords), NULL); + + sam->barrier_class = IR3_BARRIER_IMAGE_R; + sam->barrier_conflict = IR3_BARRIER_IMAGE_W; + + split_dest(b, dst, sam, 0, 4); +} + +/* src[] = { deref, coord, sample_index, value }. const_index[] = {} */ +static void +emit_intrinsic_store_image(struct ir3_context *ctx, nir_intrinsic_instr *intr) +{ + struct ir3_block *b = ctx->block; + const nir_variable *var = nir_intrinsic_get_var(intr, 0); + struct ir3_instruction *stib, *offset; + struct ir3_instruction * const *value = get_src(ctx, &intr->src[3]); + struct ir3_instruction * const *coords = get_src(ctx, &intr->src[1]); + unsigned ncoords = get_image_coords(var, NULL); + unsigned tex_idx = get_image_slot(ctx, nir_src_as_deref(intr->src[0])); + + /* src0 is value + * src1 is coords + * src2 is 64b byte offset + */ + + offset = get_image_offset(ctx, var, coords, true); + + /* NOTE: stib seems to take byte offset, but stgb.typed can be used + * too and takes a dword offset.. not quite sure yet why blob uses + * one over the other in various cases. + */ + + stib = ir3_STIB(b, create_immed(b, tex_idx), 0, + create_collect(ctx, value, 4), 0, + create_collect(ctx, coords, ncoords), 0, + offset, 0); + stib->cat6.iim_val = 4; + stib->cat6.d = ncoords; + stib->cat6.type = get_image_type(var); + stib->cat6.typed = true; + stib->barrier_class = IR3_BARRIER_IMAGE_W; + stib->barrier_conflict = IR3_BARRIER_IMAGE_R | IR3_BARRIER_IMAGE_W; + + array_insert(b, b->keeps, stib); +} + +static void +emit_intrinsic_image_size(struct ir3_context *ctx, nir_intrinsic_instr *intr, + struct ir3_instruction **dst) +{ + struct ir3_block *b = ctx->block; + const nir_variable *var = nir_intrinsic_get_var(intr, 0); + unsigned tex_idx = get_image_slot(ctx, nir_src_as_deref(intr->src[0])); + struct ir3_instruction *sam, *lod; + unsigned flags, ncoords = get_image_coords(var, &flags); + + lod = create_immed(b, 0); + sam = ir3_SAM(b, OPC_GETSIZE, TYPE_U32, 0b1111, flags, + tex_idx, tex_idx, lod, NULL); + + /* Array size actually ends up in .w rather than .z. This doesn't + * matter for miplevel 0, but for higher mips the value in z is + * minified whereas w stays. Also, the value in TEX_CONST_3_DEPTH is + * returned, which means that we have to add 1 to it for arrays for + * a3xx. 
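+	 *
+	 * eg. (a sketch, assuming a 2DArray image): getsize returns
+	 * roughly (w, h, minified_layers, layers), and since we want
+	 * the layer count it is taken from .w rather than .z (plus one
+	 * on gens where levels_add_one is set, see below).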
+ * + * Note use a temporary dst and then copy, since the size of the dst + * array that is passed in is based on nir's understanding of the + * result size, not the hardware's + */ + struct ir3_instruction *tmp[4]; + + split_dest(b, tmp, sam, 0, 4); + + /* get_size instruction returns size in bytes instead of texels + * for imageBuffer, so we need to divide it by the pixel size + * of the image format. + * + * TODO: This is at least true on a5xx. Check other gens. + */ + enum glsl_sampler_dim dim = + glsl_get_sampler_dim(glsl_without_array(var->type)); + if (dim == GLSL_SAMPLER_DIM_BUF) { + /* Since all the possible values the divisor can take are + * power-of-two (4, 8, or 16), the division is implemented + * as a shift-right. + * During shader setup, the log2 of the image format's + * bytes-per-pixel should have been emitted in 2nd slot of + * image_dims. See ir3_shader::emit_image_dims(). + */ + unsigned cb = regid(ctx->so->constbase.image_dims, 0) + + ctx->so->const_layout.image_dims.off[var->data.driver_location]; + struct ir3_instruction *aux = create_uniform(ctx, cb + 1); + + tmp[0] = ir3_SHR_B(b, tmp[0], 0, aux, 0); + } + + for (unsigned i = 0; i < ncoords; i++) + dst[i] = tmp[i]; + + if (flags & IR3_INSTR_A) { + if (ctx->compiler->levels_add_one) { + dst[ncoords-1] = ir3_ADD_U(b, tmp[3], 0, create_immed(b, 1), 0); + } else { + dst[ncoords-1] = ir3_MOV(b, tmp[3], TYPE_U32); + } + } +} + +/* src[] = { deref, coord, sample_index, value, compare }. const_index[] = {} */ +static struct ir3_instruction * +emit_intrinsic_atomic_image(struct ir3_context *ctx, nir_intrinsic_instr *intr) +{ + struct ir3_block *b = ctx->block; + const nir_variable *var = nir_intrinsic_get_var(intr, 0); + struct ir3_instruction *atomic, *image, *src0, *src1, *src2; + struct ir3_instruction * const *coords = get_src(ctx, &intr->src[1]); + unsigned ncoords = get_image_coords(var, NULL); + + image = create_immed(b, get_image_slot(ctx, nir_src_as_deref(intr->src[0]))); + + /* src0 is value (or uvec2(value, compare)) + * src1 is coords + * src2 is 64b byte offset + */ + src0 = get_src(ctx, &intr->src[3])[0]; + src1 = create_collect(ctx, coords, ncoords); + src2 = get_image_offset(ctx, var, coords, false); + + switch (intr->intrinsic) { + case nir_intrinsic_image_deref_atomic_add: + atomic = ir3_ATOMIC_ADD_G(b, image, 0, src0, 0, src1, 0, src2, 0); + break; + case nir_intrinsic_image_deref_atomic_min: + atomic = ir3_ATOMIC_MIN_G(b, image, 0, src0, 0, src1, 0, src2, 0); + break; + case nir_intrinsic_image_deref_atomic_max: + atomic = ir3_ATOMIC_MAX_G(b, image, 0, src0, 0, src1, 0, src2, 0); + break; + case nir_intrinsic_image_deref_atomic_and: + atomic = ir3_ATOMIC_AND_G(b, image, 0, src0, 0, src1, 0, src2, 0); + break; + case nir_intrinsic_image_deref_atomic_or: + atomic = ir3_ATOMIC_OR_G(b, image, 0, src0, 0, src1, 0, src2, 0); + break; + case nir_intrinsic_image_deref_atomic_xor: + atomic = ir3_ATOMIC_XOR_G(b, image, 0, src0, 0, src1, 0, src2, 0); + break; + case nir_intrinsic_image_deref_atomic_exchange: + atomic = ir3_ATOMIC_XCHG_G(b, image, 0, src0, 0, src1, 0, src2, 0); + break; + case nir_intrinsic_image_deref_atomic_comp_swap: + /* for cmpxchg, src0 is [ui]vec2(data, compare): */ + src0 = create_collect(ctx, (struct ir3_instruction*[]){ + get_src(ctx, &intr->src[4])[0], + src0, + }, 2); + atomic = ir3_ATOMIC_CMPXCHG_G(b, image, 0, src0, 0, src1, 0, src2, 0); + break; + default: + unreachable("boo"); + } + + atomic->cat6.iim_val = 1; + atomic->cat6.d = ncoords; + atomic->cat6.type = get_image_type(var); + 
atomic->cat6.typed = true;
+	atomic->barrier_class = IR3_BARRIER_IMAGE_W;
+	atomic->barrier_conflict = IR3_BARRIER_IMAGE_R | IR3_BARRIER_IMAGE_W;
+
+	/* even if nothing consumes the result, we can't DCE the instruction: */
+	array_insert(b, b->keeps, atomic);
+
+	return atomic;
+}
+
+static void
+emit_intrinsic_barrier(struct ir3_context *ctx, nir_intrinsic_instr *intr)
+{
+	struct ir3_block *b = ctx->block;
+	struct ir3_instruction *barrier;
+
+	switch (intr->intrinsic) {
+	case nir_intrinsic_barrier:
+		barrier = ir3_BAR(b);
+		barrier->cat7.g = true;
+		barrier->cat7.l = true;
+		barrier->flags = IR3_INSTR_SS | IR3_INSTR_SY;
+		barrier->barrier_class = IR3_BARRIER_EVERYTHING;
+		break;
+	case nir_intrinsic_memory_barrier:
+		barrier = ir3_FENCE(b);
+		barrier->cat7.g = true;
+		barrier->cat7.r = true;
+		barrier->cat7.w = true;
+		barrier->barrier_class = IR3_BARRIER_IMAGE_W |
+			IR3_BARRIER_BUFFER_W;
+		barrier->barrier_conflict =
+			IR3_BARRIER_IMAGE_R | IR3_BARRIER_IMAGE_W |
+			IR3_BARRIER_BUFFER_R | IR3_BARRIER_BUFFER_W;
+		break;
+	case nir_intrinsic_memory_barrier_atomic_counter:
+	case nir_intrinsic_memory_barrier_buffer:
+		barrier = ir3_FENCE(b);
+		barrier->cat7.g = true;
+		barrier->cat7.r = true;
+		barrier->cat7.w = true;
+		barrier->barrier_class = IR3_BARRIER_BUFFER_W;
+		barrier->barrier_conflict = IR3_BARRIER_BUFFER_R |
+			IR3_BARRIER_BUFFER_W;
+		break;
+	case nir_intrinsic_memory_barrier_image:
+		// TODO: double-check if this should have .g set
+		barrier = ir3_FENCE(b);
+		barrier->cat7.g = true;
+		barrier->cat7.r = true;
+		barrier->cat7.w = true;
+		barrier->barrier_class = IR3_BARRIER_IMAGE_W;
+		barrier->barrier_conflict = IR3_BARRIER_IMAGE_R |
+			IR3_BARRIER_IMAGE_W;
+		break;
+	case nir_intrinsic_memory_barrier_shared:
+		barrier = ir3_FENCE(b);
+		barrier->cat7.g = true;
+		barrier->cat7.l = true;
+		barrier->cat7.r = true;
+		barrier->cat7.w = true;
+		barrier->barrier_class = IR3_BARRIER_SHARED_W;
+		barrier->barrier_conflict = IR3_BARRIER_SHARED_R |
+			IR3_BARRIER_SHARED_W;
+		break;
+	case nir_intrinsic_group_memory_barrier:
+		barrier = ir3_FENCE(b);
+		barrier->cat7.g = true;
+		barrier->cat7.l = true;
+		barrier->cat7.r = true;
+		barrier->cat7.w = true;
+		barrier->barrier_class = IR3_BARRIER_SHARED_W |
+			IR3_BARRIER_IMAGE_W |
+			IR3_BARRIER_BUFFER_W;
+		barrier->barrier_conflict =
+			IR3_BARRIER_SHARED_R | IR3_BARRIER_SHARED_W |
+			IR3_BARRIER_IMAGE_R | IR3_BARRIER_IMAGE_W |
+			IR3_BARRIER_BUFFER_R | IR3_BARRIER_BUFFER_W;
+		break;
+	default:
+		unreachable("boo");
+	}
+
+	/* make sure barrier doesn't get DCE'd */
+	array_insert(b, b->keeps, barrier);
+}
+
+static void add_sysval_input_compmask(struct ir3_context *ctx,
+		gl_system_value slot, unsigned compmask,
+		struct ir3_instruction *instr)
+{
+	struct ir3_shader_variant *so = ctx->so;
+	unsigned r = regid(so->inputs_count, 0);
+	unsigned n = so->inputs_count++;
+
+	so->inputs[n].sysval = true;
+	so->inputs[n].slot = slot;
+	so->inputs[n].compmask = compmask;
+	so->inputs[n].regid = r;
+	so->inputs[n].interpolate = INTERP_MODE_FLAT;
+	so->total_in++;
+
+	ctx->ir->ninputs = MAX2(ctx->ir->ninputs, r + 1);
+	ctx->ir->inputs[r] = instr;
+}
+
+static void add_sysval_input(struct ir3_context *ctx, gl_system_value slot,
+		struct ir3_instruction *instr)
+{
+	add_sysval_input_compmask(ctx, slot, 0x1, instr);
+}
+
+static void
+emit_intrinsic(struct ir3_context *ctx, nir_intrinsic_instr *intr)
+{
+	const nir_intrinsic_info *info = &nir_intrinsic_infos[intr->intrinsic];
+	struct ir3_instruction **dst;
+	struct ir3_instruction * const *src;
+	struct ir3_block *b =
ctx->block; + nir_const_value *const_offset; + int idx, comp; + + if (info->has_dest) { + unsigned n = nir_intrinsic_dest_components(intr); + dst = get_dst(ctx, &intr->dest, n); + } else { + dst = NULL; + } + + switch (intr->intrinsic) { + case nir_intrinsic_load_uniform: + idx = nir_intrinsic_base(intr); + const_offset = nir_src_as_const_value(intr->src[0]); + if (const_offset) { + idx += const_offset->u32[0]; + for (int i = 0; i < intr->num_components; i++) { + unsigned n = idx * 4 + i; + dst[i] = create_uniform(ctx, n); + } + } else { + src = get_src(ctx, &intr->src[0]); + for (int i = 0; i < intr->num_components; i++) { + int n = idx * 4 + i; + dst[i] = create_uniform_indirect(ctx, n, + get_addr(ctx, src[0], 4)); + } + /* NOTE: if relative addressing is used, we set + * constlen in the compiler (to worst-case value) + * since we don't know in the assembler what the max + * addr reg value can be: + */ + ctx->so->constlen = ctx->s->num_uniforms; + } + break; + case nir_intrinsic_load_ubo: + emit_intrinsic_load_ubo(ctx, intr, dst); + break; + case nir_intrinsic_load_input: + idx = nir_intrinsic_base(intr); + comp = nir_intrinsic_component(intr); + const_offset = nir_src_as_const_value(intr->src[0]); + if (const_offset) { + idx += const_offset->u32[0]; + for (int i = 0; i < intr->num_components; i++) { + unsigned n = idx * 4 + i + comp; + dst[i] = ctx->ir->inputs[n]; + } + } else { + src = get_src(ctx, &intr->src[0]); + struct ir3_instruction *collect = + create_collect(ctx, ctx->ir->inputs, ctx->ir->ninputs); + struct ir3_instruction *addr = get_addr(ctx, src[0], 4); + for (int i = 0; i < intr->num_components; i++) { + unsigned n = idx * 4 + i + comp; + dst[i] = create_indirect_load(ctx, ctx->ir->ninputs, + n, addr, collect); + } + } + break; + case nir_intrinsic_load_ssbo: + emit_intrinsic_load_ssbo(ctx, intr, dst); + break; + case nir_intrinsic_store_ssbo: + emit_intrinsic_store_ssbo(ctx, intr); + break; + case nir_intrinsic_get_buffer_size: + emit_intrinsic_ssbo_size(ctx, intr, dst); + break; + case nir_intrinsic_ssbo_atomic_add: + case nir_intrinsic_ssbo_atomic_imin: + case nir_intrinsic_ssbo_atomic_umin: + case nir_intrinsic_ssbo_atomic_imax: + case nir_intrinsic_ssbo_atomic_umax: + case nir_intrinsic_ssbo_atomic_and: + case nir_intrinsic_ssbo_atomic_or: + case nir_intrinsic_ssbo_atomic_xor: + case nir_intrinsic_ssbo_atomic_exchange: + case nir_intrinsic_ssbo_atomic_comp_swap: + dst[0] = emit_intrinsic_atomic_ssbo(ctx, intr); + break; + case nir_intrinsic_load_shared: + emit_intrinsic_load_shared(ctx, intr, dst); + break; + case nir_intrinsic_store_shared: + emit_intrinsic_store_shared(ctx, intr); + break; + case nir_intrinsic_shared_atomic_add: + case nir_intrinsic_shared_atomic_imin: + case nir_intrinsic_shared_atomic_umin: + case nir_intrinsic_shared_atomic_imax: + case nir_intrinsic_shared_atomic_umax: + case nir_intrinsic_shared_atomic_and: + case nir_intrinsic_shared_atomic_or: + case nir_intrinsic_shared_atomic_xor: + case nir_intrinsic_shared_atomic_exchange: + case nir_intrinsic_shared_atomic_comp_swap: + dst[0] = emit_intrinsic_atomic_shared(ctx, intr); + break; + case nir_intrinsic_image_deref_load: + emit_intrinsic_load_image(ctx, intr, dst); + break; + case nir_intrinsic_image_deref_store: + emit_intrinsic_store_image(ctx, intr); + break; + case nir_intrinsic_image_deref_size: + emit_intrinsic_image_size(ctx, intr, dst); + break; + case nir_intrinsic_image_deref_atomic_add: + case nir_intrinsic_image_deref_atomic_min: + case nir_intrinsic_image_deref_atomic_max: + case 
nir_intrinsic_image_deref_atomic_and: + case nir_intrinsic_image_deref_atomic_or: + case nir_intrinsic_image_deref_atomic_xor: + case nir_intrinsic_image_deref_atomic_exchange: + case nir_intrinsic_image_deref_atomic_comp_swap: + dst[0] = emit_intrinsic_atomic_image(ctx, intr); + break; + case nir_intrinsic_barrier: + case nir_intrinsic_memory_barrier: + case nir_intrinsic_group_memory_barrier: + case nir_intrinsic_memory_barrier_atomic_counter: + case nir_intrinsic_memory_barrier_buffer: + case nir_intrinsic_memory_barrier_image: + case nir_intrinsic_memory_barrier_shared: + emit_intrinsic_barrier(ctx, intr); + /* note that blk ptr no longer valid, make that obvious: */ + b = NULL; + break; + case nir_intrinsic_store_output: + idx = nir_intrinsic_base(intr); + comp = nir_intrinsic_component(intr); + const_offset = nir_src_as_const_value(intr->src[1]); + compile_assert(ctx, const_offset != NULL); + idx += const_offset->u32[0]; + + src = get_src(ctx, &intr->src[0]); + for (int i = 0; i < intr->num_components; i++) { + unsigned n = idx * 4 + i + comp; + ctx->ir->outputs[n] = src[i]; + } + break; + case nir_intrinsic_load_base_vertex: + case nir_intrinsic_load_first_vertex: + if (!ctx->basevertex) { + ctx->basevertex = create_driver_param(ctx, IR3_DP_VTXID_BASE); + add_sysval_input(ctx, SYSTEM_VALUE_FIRST_VERTEX, ctx->basevertex); + } + dst[0] = ctx->basevertex; + break; + case nir_intrinsic_load_vertex_id_zero_base: + case nir_intrinsic_load_vertex_id: + if (!ctx->vertex_id) { + gl_system_value sv = (intr->intrinsic == nir_intrinsic_load_vertex_id) ? + SYSTEM_VALUE_VERTEX_ID : SYSTEM_VALUE_VERTEX_ID_ZERO_BASE; + ctx->vertex_id = create_input(ctx, 0); + add_sysval_input(ctx, sv, ctx->vertex_id); + } + dst[0] = ctx->vertex_id; + break; + case nir_intrinsic_load_instance_id: + if (!ctx->instance_id) { + ctx->instance_id = create_input(ctx, 0); + add_sysval_input(ctx, SYSTEM_VALUE_INSTANCE_ID, + ctx->instance_id); + } + dst[0] = ctx->instance_id; + break; + case nir_intrinsic_load_sample_id: + case nir_intrinsic_load_sample_id_no_per_sample: + if (!ctx->samp_id) { + ctx->samp_id = create_input(ctx, 0); + ctx->samp_id->regs[0]->flags |= IR3_REG_HALF; + add_sysval_input(ctx, SYSTEM_VALUE_SAMPLE_ID, + ctx->samp_id); + } + dst[0] = ir3_COV(b, ctx->samp_id, TYPE_U16, TYPE_U32); + break; + case nir_intrinsic_load_sample_mask_in: + if (!ctx->samp_mask_in) { + ctx->samp_mask_in = create_input(ctx, 0); + add_sysval_input(ctx, SYSTEM_VALUE_SAMPLE_MASK_IN, + ctx->samp_mask_in); + } + dst[0] = ctx->samp_mask_in; + break; + case nir_intrinsic_load_user_clip_plane: + idx = nir_intrinsic_ucp_id(intr); + for (int i = 0; i < intr->num_components; i++) { + unsigned n = idx * 4 + i; + dst[i] = create_driver_param(ctx, IR3_DP_UCP0_X + n); + } + break; + case nir_intrinsic_load_front_face: + if (!ctx->frag_face) { + ctx->so->frag_face = true; + ctx->frag_face = create_input(ctx, 0); + add_sysval_input(ctx, SYSTEM_VALUE_FRONT_FACE, ctx->frag_face); + ctx->frag_face->regs[0]->flags |= IR3_REG_HALF; + } + /* for fragface, we get -1 for back and 0 for front. However this is + * the inverse of what nir expects (where ~0 is true). 
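+		 *
+		 * ie. roughly, the two instructions below compute:
+		 *
+		 *    face = (int32)(int16)raw;   // cov.s16s32: -1=back, 0=front
+		 *    face = ~face;               // not.b:       0=back, ~0=front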
+ */ + dst[0] = ir3_COV(b, ctx->frag_face, TYPE_S16, TYPE_S32); + dst[0] = ir3_NOT_B(b, dst[0], 0); + break; + case nir_intrinsic_load_local_invocation_id: + if (!ctx->local_invocation_id) { + ctx->local_invocation_id = create_input_compmask(ctx, 0, 0x7); + add_sysval_input_compmask(ctx, SYSTEM_VALUE_LOCAL_INVOCATION_ID, + 0x7, ctx->local_invocation_id); + } + split_dest(b, dst, ctx->local_invocation_id, 0, 3); + break; + case nir_intrinsic_load_work_group_id: + if (!ctx->work_group_id) { + ctx->work_group_id = create_input_compmask(ctx, 0, 0x7); + add_sysval_input_compmask(ctx, SYSTEM_VALUE_WORK_GROUP_ID, + 0x7, ctx->work_group_id); + ctx->work_group_id->regs[0]->flags |= IR3_REG_HIGH; + } + split_dest(b, dst, ctx->work_group_id, 0, 3); + break; + case nir_intrinsic_load_num_work_groups: + for (int i = 0; i < intr->num_components; i++) { + dst[i] = create_driver_param(ctx, IR3_DP_NUM_WORK_GROUPS_X + i); + } + break; + case nir_intrinsic_load_local_group_size: + for (int i = 0; i < intr->num_components; i++) { + dst[i] = create_driver_param(ctx, IR3_DP_LOCAL_GROUP_SIZE_X + i); + } + break; + case nir_intrinsic_discard_if: + case nir_intrinsic_discard: { + struct ir3_instruction *cond, *kill; + + if (intr->intrinsic == nir_intrinsic_discard_if) { + /* conditional discard: */ + src = get_src(ctx, &intr->src[0]); + cond = ir3_b2n(b, src[0]); + } else { + /* unconditional discard: */ + cond = create_immed(b, 1); + } + + /* NOTE: only cmps.*.* can write p0.x: */ + cond = ir3_CMPS_S(b, cond, 0, create_immed(b, 0), 0); + cond->cat2.condition = IR3_COND_NE; + + /* condition always goes in predicate register: */ + cond->regs[0]->num = regid(REG_P0, 0); + + kill = ir3_KILL(b, cond, 0); + array_insert(ctx->ir, ctx->ir->predicates, kill); + + array_insert(b, b->keeps, kill); + ctx->so->has_kill = true; + + break; + } + default: + compile_error(ctx, "Unhandled intrinsic type: %s\n", + nir_intrinsic_infos[intr->intrinsic].name); + break; + } + + if (info->has_dest) + put_dst(ctx, &intr->dest); +} + +static void +emit_load_const(struct ir3_context *ctx, nir_load_const_instr *instr) +{ + struct ir3_instruction **dst = get_dst_ssa(ctx, &instr->def, + instr->def.num_components); + type_t type = (instr->def.bit_size < 32) ? TYPE_U16 : TYPE_U32; + + for (int i = 0; i < instr->def.num_components; i++) + dst[i] = create_immed_typed(ctx->block, instr->value.u32[i], type); +} + +static void +emit_undef(struct ir3_context *ctx, nir_ssa_undef_instr *undef) +{ + struct ir3_instruction **dst = get_dst_ssa(ctx, &undef->def, + undef->def.num_components); + type_t type = (undef->def.bit_size < 32) ? TYPE_U16 : TYPE_U32; + + /* backend doesn't want undefined instructions, so just plug + * in 0.0.. + */ + for (int i = 0; i < undef->def.num_components; i++) + dst[i] = create_immed_typed(ctx->block, fui(0.0), type); +} + +/* + * texture fetch/sample instructions: + */ + +static void +tex_info(nir_tex_instr *tex, unsigned *flagsp, unsigned *coordsp) +{ + unsigned coords, flags = 0; + + /* note: would use tex->coord_components.. except txs.. 
also, + * since array index goes after shadow ref, we don't want to + * count it: + */ + switch (tex->sampler_dim) { + case GLSL_SAMPLER_DIM_1D: + case GLSL_SAMPLER_DIM_BUF: + coords = 1; + break; + case GLSL_SAMPLER_DIM_2D: + case GLSL_SAMPLER_DIM_RECT: + case GLSL_SAMPLER_DIM_EXTERNAL: + case GLSL_SAMPLER_DIM_MS: + coords = 2; + break; + case GLSL_SAMPLER_DIM_3D: + case GLSL_SAMPLER_DIM_CUBE: + coords = 3; + flags |= IR3_INSTR_3D; + break; + default: + unreachable("bad sampler_dim"); + } + + if (tex->is_shadow && tex->op != nir_texop_lod) + flags |= IR3_INSTR_S; + + if (tex->is_array && tex->op != nir_texop_lod) + flags |= IR3_INSTR_A; + + *flagsp = flags; + *coordsp = coords; +} + +static void +emit_tex(struct ir3_context *ctx, nir_tex_instr *tex) +{ + struct ir3_block *b = ctx->block; + struct ir3_instruction **dst, *sam, *src0[12], *src1[4]; + struct ir3_instruction * const *coord, * const *off, * const *ddx, * const *ddy; + struct ir3_instruction *lod, *compare, *proj, *sample_index; + bool has_bias = false, has_lod = false, has_proj = false, has_off = false; + unsigned i, coords, flags; + unsigned nsrc0 = 0, nsrc1 = 0; + type_t type; + opc_t opc = 0; + + coord = off = ddx = ddy = NULL; + lod = proj = compare = sample_index = NULL; + + /* TODO: might just be one component for gathers? */ + dst = get_dst(ctx, &tex->dest, 4); + + for (unsigned i = 0; i < tex->num_srcs; i++) { + switch (tex->src[i].src_type) { + case nir_tex_src_coord: + coord = get_src(ctx, &tex->src[i].src); + break; + case nir_tex_src_bias: + lod = get_src(ctx, &tex->src[i].src)[0]; + has_bias = true; + break; + case nir_tex_src_lod: + lod = get_src(ctx, &tex->src[i].src)[0]; + has_lod = true; + break; + case nir_tex_src_comparator: /* shadow comparator */ + compare = get_src(ctx, &tex->src[i].src)[0]; + break; + case nir_tex_src_projector: + proj = get_src(ctx, &tex->src[i].src)[0]; + has_proj = true; + break; + case nir_tex_src_offset: + off = get_src(ctx, &tex->src[i].src); + has_off = true; + break; + case nir_tex_src_ddx: + ddx = get_src(ctx, &tex->src[i].src); + break; + case nir_tex_src_ddy: + ddy = get_src(ctx, &tex->src[i].src); + break; + case nir_tex_src_ms_index: + sample_index = get_src(ctx, &tex->src[i].src)[0]; + break; + default: + compile_error(ctx, "Unhandled NIR tex src type: %d\n", + tex->src[i].src_type); + return; + } + } + + switch (tex->op) { + case nir_texop_tex: opc = has_lod ? OPC_SAML : OPC_SAM; break; + case nir_texop_txb: opc = OPC_SAMB; break; + case nir_texop_txl: opc = OPC_SAML; break; + case nir_texop_txd: opc = OPC_SAMGQ; break; + case nir_texop_txf: opc = OPC_ISAML; break; + case nir_texop_lod: opc = OPC_GETLOD; break; + case nir_texop_tg4: + /* NOTE: a4xx might need to emulate gather w/ txf (this is + * what blob does, seems gather is broken?), and a3xx did + * not support it (but probably could also emulate). 
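+		 *
+		 * (The component select just picks which channel gets
+		 * gathered, eg. textureGather(s, p, 1) maps to gather4g
+		 * below.)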
+ */ + switch (tex->component) { + case 0: opc = OPC_GATHER4R; break; + case 1: opc = OPC_GATHER4G; break; + case 2: opc = OPC_GATHER4B; break; + case 3: opc = OPC_GATHER4A; break; + } + break; + case nir_texop_txf_ms: opc = OPC_ISAMM; break; + case nir_texop_txs: + case nir_texop_query_levels: + case nir_texop_texture_samples: + case nir_texop_samples_identical: + case nir_texop_txf_ms_mcs: + compile_error(ctx, "Unhandled NIR tex type: %d\n", tex->op); + return; + } + + tex_info(tex, &flags, &coords); + + /* + * lay out the first argument in the proper order: + * - actual coordinates first + * - shadow reference + * - array index + * - projection w + * - starting at offset 4, dpdx.xy, dpdy.xy + * + * bias/lod go into the second arg + */ + + /* insert tex coords: */ + for (i = 0; i < coords; i++) + src0[i] = coord[i]; + + nsrc0 = i; + + /* NOTE a3xx (and possibly a4xx?) might be different, using isaml + * with scaled x coord according to requested sample: + */ + if (tex->op == nir_texop_txf_ms) { + if (ctx->compiler->txf_ms_with_isaml) { + /* the samples are laid out in x dimension as + * 0 1 2 3 + * x_ms = (x << ms) + sample_index; + */ + struct ir3_instruction *ms; + ms = create_immed(b, (ctx->samples >> (2 * tex->texture_index)) & 3); + + src0[0] = ir3_SHL_B(b, src0[0], 0, ms, 0); + src0[0] = ir3_ADD_U(b, src0[0], 0, sample_index, 0); + + opc = OPC_ISAML; + } else { + src0[nsrc0++] = sample_index; + } + } + + /* scale up integer coords for TXF based on the LOD */ + if (ctx->compiler->unminify_coords && (opc == OPC_ISAML)) { + assert(has_lod); + for (i = 0; i < coords; i++) + src0[i] = ir3_SHL_B(b, src0[i], 0, lod, 0); + } + + if (coords == 1) { + /* hw doesn't do 1d, so we treat it as 2d with + * height of 1, and patch up the y coord. + * TODO: y coord should be (int)0 in some cases.. 
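+		 *
+		 * ie. conceptually, texture(sampler1D, x) is emitted as
+		 * if it were texture(sampler2D, vec2(x, 0.5)):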
+ */ + src0[nsrc0++] = create_immed(b, fui(0.5)); + } + + if (tex->is_shadow && tex->op != nir_texop_lod) + src0[nsrc0++] = compare; + + if (tex->is_array && tex->op != nir_texop_lod) { + struct ir3_instruction *idx = coord[coords]; + + /* the array coord for cube arrays needs 0.5 added to it */ + if (ctx->compiler->array_index_add_half && (opc != OPC_ISAML)) + idx = ir3_ADD_F(b, idx, 0, create_immed(b, fui(0.5)), 0); + + src0[nsrc0++] = idx; + } + + if (has_proj) { + src0[nsrc0++] = proj; + flags |= IR3_INSTR_P; + } + + /* pad to 4, then ddx/ddy: */ + if (tex->op == nir_texop_txd) { + while (nsrc0 < 4) + src0[nsrc0++] = create_immed(b, fui(0.0)); + for (i = 0; i < coords; i++) + src0[nsrc0++] = ddx[i]; + if (coords < 2) + src0[nsrc0++] = create_immed(b, fui(0.0)); + for (i = 0; i < coords; i++) + src0[nsrc0++] = ddy[i]; + if (coords < 2) + src0[nsrc0++] = create_immed(b, fui(0.0)); + } + + /* + * second argument (if applicable): + * - offsets + * - lod + * - bias + */ + if (has_off | has_lod | has_bias) { + if (has_off) { + unsigned off_coords = coords; + if (tex->sampler_dim == GLSL_SAMPLER_DIM_CUBE) + off_coords--; + for (i = 0; i < off_coords; i++) + src1[nsrc1++] = off[i]; + if (off_coords < 2) + src1[nsrc1++] = create_immed(b, fui(0.0)); + flags |= IR3_INSTR_O; + } + + if (has_lod | has_bias) + src1[nsrc1++] = lod; + } + + switch (tex->dest_type) { + case nir_type_invalid: + case nir_type_float: + type = TYPE_F32; + break; + case nir_type_int: + type = TYPE_S32; + break; + case nir_type_uint: + case nir_type_bool: + type = TYPE_U32; + break; + default: + unreachable("bad dest_type"); + } + + if (opc == OPC_GETLOD) + type = TYPE_U32; + + unsigned tex_idx = tex->texture_index; + + ctx->max_texture_index = MAX2(ctx->max_texture_index, tex_idx); + + struct ir3_instruction *col0 = create_collect(ctx, src0, nsrc0); + struct ir3_instruction *col1 = create_collect(ctx, src1, nsrc1); + + sam = ir3_SAM(b, opc, type, 0b1111, flags, + tex_idx, tex_idx, col0, col1); + + if ((ctx->astc_srgb & (1 << tex_idx)) && !nir_tex_instr_is_query(tex)) { + /* only need first 3 components: */ + sam->regs[0]->wrmask = 0x7; + split_dest(b, dst, sam, 0, 3); + + /* we need to sample the alpha separately with a non-ASTC + * texture state: + */ + sam = ir3_SAM(b, opc, type, 0b1000, flags, + tex_idx, tex_idx, col0, col1); + + array_insert(ctx->ir, ctx->ir->astc_srgb, sam); + + /* fixup .w component: */ + split_dest(b, &dst[3], sam, 3, 1); + } else { + /* normal (non-workaround) case: */ + split_dest(b, dst, sam, 0, 4); + } + + /* GETLOD returns results in 4.8 fixed point */ + if (opc == OPC_GETLOD) { + struct ir3_instruction *factor = create_immed(b, fui(1.0 / 256)); + + compile_assert(ctx, tex->dest_type == nir_type_float); + for (i = 0; i < 2; i++) { + dst[i] = ir3_MUL_F(b, ir3_COV(b, dst[i], TYPE_U32, TYPE_F32), 0, + factor, 0); + } + } + + put_dst(ctx, &tex->dest); +} + +static void +emit_tex_query_levels(struct ir3_context *ctx, nir_tex_instr *tex) +{ + struct ir3_block *b = ctx->block; + struct ir3_instruction **dst, *sam; + + dst = get_dst(ctx, &tex->dest, 1); + + sam = ir3_SAM(b, OPC_GETINFO, TYPE_U32, 0b0100, 0, + tex->texture_index, tex->texture_index, NULL, NULL); + + /* even though there is only one component, since it ends + * up in .z rather than .x, we need a split_dest() + */ + split_dest(b, dst, sam, 0, 3); + + /* The # of levels comes from getinfo.z. We need to add 1 to it, since + * the value in TEX_CONST_0 is zero-based. 
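+	 *
+	 * ie. a texture with N mip levels reads back N-1 from getinfo.z,
+	 * so (on gens where levels_add_one is set) the result below is
+	 * computed roughly as:
+	 *
+	 *    levels = getinfo.z + 1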
+ */ + if (ctx->compiler->levels_add_one) + dst[0] = ir3_ADD_U(b, dst[0], 0, create_immed(b, 1), 0); + + put_dst(ctx, &tex->dest); +} + +static void +emit_tex_txs(struct ir3_context *ctx, nir_tex_instr *tex) +{ + struct ir3_block *b = ctx->block; + struct ir3_instruction **dst, *sam; + struct ir3_instruction *lod; + unsigned flags, coords; + + tex_info(tex, &flags, &coords); + + /* Actually we want the number of dimensions, not coordinates. This + * distinction only matters for cubes. + */ + if (tex->sampler_dim == GLSL_SAMPLER_DIM_CUBE) + coords = 2; + + dst = get_dst(ctx, &tex->dest, 4); + + compile_assert(ctx, tex->num_srcs == 1); + compile_assert(ctx, tex->src[0].src_type == nir_tex_src_lod); + + lod = get_src(ctx, &tex->src[0].src)[0]; + + sam = ir3_SAM(b, OPC_GETSIZE, TYPE_U32, 0b1111, flags, + tex->texture_index, tex->texture_index, lod, NULL); + + split_dest(b, dst, sam, 0, 4); + + /* Array size actually ends up in .w rather than .z. This doesn't + * matter for miplevel 0, but for higher mips the value in z is + * minified whereas w stays. Also, the value in TEX_CONST_3_DEPTH is + * returned, which means that we have to add 1 to it for arrays. + */ + if (tex->is_array) { + if (ctx->compiler->levels_add_one) { + dst[coords] = ir3_ADD_U(b, dst[3], 0, create_immed(b, 1), 0); + } else { + dst[coords] = ir3_MOV(b, dst[3], TYPE_U32); + } + } + + put_dst(ctx, &tex->dest); +} + +static void +emit_jump(struct ir3_context *ctx, nir_jump_instr *jump) +{ + switch (jump->type) { + case nir_jump_break: + case nir_jump_continue: + case nir_jump_return: + /* I *think* we can simply just ignore this, and use the + * successor block link to figure out where we need to + * jump to for break/continue + */ + break; + default: + compile_error(ctx, "Unhandled NIR jump type: %d\n", jump->type); + break; + } +} + +static void +emit_instr(struct ir3_context *ctx, nir_instr *instr) +{ + switch (instr->type) { + case nir_instr_type_alu: + emit_alu(ctx, nir_instr_as_alu(instr)); + break; + case nir_instr_type_deref: + /* ignored, handled as part of the intrinsic they are src to */ + break; + case nir_instr_type_intrinsic: + emit_intrinsic(ctx, nir_instr_as_intrinsic(instr)); + break; + case nir_instr_type_load_const: + emit_load_const(ctx, nir_instr_as_load_const(instr)); + break; + case nir_instr_type_ssa_undef: + emit_undef(ctx, nir_instr_as_ssa_undef(instr)); + break; + case nir_instr_type_tex: { + nir_tex_instr *tex = nir_instr_as_tex(instr); + /* couple tex instructions get special-cased: + */ + switch (tex->op) { + case nir_texop_txs: + emit_tex_txs(ctx, tex); + break; + case nir_texop_query_levels: + emit_tex_query_levels(ctx, tex); + break; + default: + emit_tex(ctx, tex); + break; + } + break; + } + case nir_instr_type_jump: + emit_jump(ctx, nir_instr_as_jump(instr)); + break; + case nir_instr_type_phi: + /* we have converted phi webs to regs in NIR by now */ + compile_error(ctx, "Unexpected NIR instruction type: %d\n", instr->type); + break; + case nir_instr_type_call: + case nir_instr_type_parallel_copy: + compile_error(ctx, "Unhandled NIR instruction type: %d\n", instr->type); + break; + } +} + +static struct ir3_block * +get_block(struct ir3_context *ctx, const nir_block *nblock) +{ + struct ir3_block *block; + struct hash_entry *hentry; + unsigned i; + + hentry = _mesa_hash_table_search(ctx->block_ht, nblock); + if (hentry) + return hentry->data; + + block = ir3_block_create(ctx->ir); + block->nblock = nblock; + _mesa_hash_table_insert(ctx->block_ht, nblock, block); + + block->predecessors_count 
= nblock->predecessors->entries; + block->predecessors = ralloc_array_size(block, + sizeof(block->predecessors[0]), block->predecessors_count); + i = 0; + set_foreach(nblock->predecessors, sentry) { + block->predecessors[i++] = get_block(ctx, sentry->key); + } + + return block; +} + +static void +emit_block(struct ir3_context *ctx, nir_block *nblock) +{ + struct ir3_block *block = get_block(ctx, nblock); + + for (int i = 0; i < ARRAY_SIZE(block->successors); i++) { + if (nblock->successors[i]) { + block->successors[i] = + get_block(ctx, nblock->successors[i]); + } + } + + ctx->block = block; + list_addtail(&block->node, &ctx->ir->block_list); + + /* re-emit addr register in each block if needed: */ + for (int i = 0; i < ARRAY_SIZE(ctx->addr_ht); i++) { + _mesa_hash_table_destroy(ctx->addr_ht[i], NULL); + ctx->addr_ht[i] = NULL; + } + + nir_foreach_instr(instr, nblock) { + ctx->cur_instr = instr; + emit_instr(ctx, instr); + ctx->cur_instr = NULL; + if (ctx->error) + return; + } +} + +static void emit_cf_list(struct ir3_context *ctx, struct exec_list *list); + +static void +emit_if(struct ir3_context *ctx, nir_if *nif) +{ + struct ir3_instruction *condition = get_src(ctx, &nif->condition)[0]; + + ctx->block->condition = + get_predicate(ctx, ir3_b2n(condition->block, condition)); + + emit_cf_list(ctx, &nif->then_list); + emit_cf_list(ctx, &nif->else_list); +} + +static void +emit_loop(struct ir3_context *ctx, nir_loop *nloop) +{ + emit_cf_list(ctx, &nloop->body); +} + +static void +emit_cf_list(struct ir3_context *ctx, struct exec_list *list) +{ + foreach_list_typed(nir_cf_node, node, node, list) { + switch (node->type) { + case nir_cf_node_block: + emit_block(ctx, nir_cf_node_as_block(node)); + break; + case nir_cf_node_if: + emit_if(ctx, nir_cf_node_as_if(node)); + break; + case nir_cf_node_loop: + emit_loop(ctx, nir_cf_node_as_loop(node)); + break; + case nir_cf_node_function: + compile_error(ctx, "TODO\n"); + break; + } + } +} + +/* emit stream-out code. At this point, the current block is the original + * (nir) end block, and nir ensures that all flow control paths terminate + * into the end block. We re-purpose the original end block to generate + * the 'if (vtxcnt < maxvtxcnt)' condition, then append the conditional + * block holding stream-out write instructions, followed by the new end + * block: + * + * blockOrigEnd { + * p0.x = (vtxcnt < maxvtxcnt) + * // succs: blockStreamOut, blockNewEnd + * } + * blockStreamOut { + * ... stream-out instructions ... + * // succs: blockNewEnd + * } + * blockNewEnd { + * } + */ +static void +emit_stream_out(struct ir3_context *ctx) +{ + struct ir3_shader_variant *v = ctx->so; + struct ir3 *ir = ctx->ir; + struct ir3_stream_output_info *strmout = + &ctx->so->shader->stream_output; + struct ir3_block *orig_end_block, *stream_out_block, *new_end_block; + struct ir3_instruction *vtxcnt, *maxvtxcnt, *cond; + struct ir3_instruction *bases[IR3_MAX_SO_BUFFERS]; + + /* create vtxcnt input in input block at top of shader, + * so that it is seen as live over the entire duration + * of the shader: + */ + vtxcnt = create_input(ctx, 0); + add_sysval_input(ctx, SYSTEM_VALUE_VERTEX_CNT, vtxcnt); + + maxvtxcnt = create_driver_param(ctx, IR3_DP_VTXCNT_MAX); + + /* at this point, we are at the original 'end' block, + * re-purpose this block to stream-out condition, then + * append stream-out block and new-end block + */ + orig_end_block = ctx->block; + +// TODO these blocks need to update predecessors.. 
+// maybe w/ store_global intrinsic, we could do this +// stuff in nir->nir pass + + stream_out_block = ir3_block_create(ir); + list_addtail(&stream_out_block->node, &ir->block_list); + + new_end_block = ir3_block_create(ir); + list_addtail(&new_end_block->node, &ir->block_list); + + orig_end_block->successors[0] = stream_out_block; + orig_end_block->successors[1] = new_end_block; + stream_out_block->successors[0] = new_end_block; + + /* setup 'if (vtxcnt < maxvtxcnt)' condition: */ + cond = ir3_CMPS_S(ctx->block, vtxcnt, 0, maxvtxcnt, 0); + cond->regs[0]->num = regid(REG_P0, 0); + cond->cat2.condition = IR3_COND_LT; + + /* condition goes on previous block to the conditional, + * since it is used to pick which of the two successor + * paths to take: + */ + orig_end_block->condition = cond; + + /* switch to stream_out_block to generate the stream-out + * instructions: + */ + ctx->block = stream_out_block; + + /* Calculate base addresses based on vtxcnt. Instructions + * generated for bases not used in following loop will be + * stripped out in the backend. + */ + for (unsigned i = 0; i < IR3_MAX_SO_BUFFERS; i++) { + unsigned stride = strmout->stride[i]; + struct ir3_instruction *base, *off; + + base = create_uniform(ctx, regid(v->constbase.tfbo, i)); + + /* 24-bit should be enough: */ + off = ir3_MUL_U(ctx->block, vtxcnt, 0, + create_immed(ctx->block, stride * 4), 0); + + bases[i] = ir3_ADD_S(ctx->block, off, 0, base, 0); + } + + /* Generate the per-output store instructions: */ + for (unsigned i = 0; i < strmout->num_outputs; i++) { + for (unsigned j = 0; j < strmout->output[i].num_components; j++) { + unsigned c = j + strmout->output[i].start_component; + struct ir3_instruction *base, *out, *stg; + + base = bases[strmout->output[i].output_buffer]; + out = ctx->ir->outputs[regid(strmout->output[i].register_index, c)]; + + stg = ir3_STG(ctx->block, base, 0, out, 0, + create_immed(ctx->block, 1), 0); + stg->cat6.type = TYPE_U32; + stg->cat6.dst_offset = (strmout->output[i].dst_offset + j) * 4; + + array_insert(ctx->block, ctx->block->keeps, stg); + } + } + + /* and finally switch to the new_end_block: */ + ctx->block = new_end_block; +} + +static void +emit_function(struct ir3_context *ctx, nir_function_impl *impl) +{ + nir_metadata_require(impl, nir_metadata_block_index); + + emit_cf_list(ctx, &impl->body); + emit_block(ctx, impl->end_block); + + /* at this point, we should have a single empty block, + * into which we emit the 'end' instruction. + */ + compile_assert(ctx, list_empty(&ctx->block->instr_list)); + + /* If stream-out (aka transform-feedback) enabled, emit the + * stream-out instructions, followed by a new empty block (into + * which the 'end' instruction lands). + * + * NOTE: it is done in this order, rather than inserting before + * we emit end_block, because NIR guarantees that all blocks + * flow into end_block, and that end_block has no successors. + * So by re-purposing end_block as the first block of stream- + * out, we guarantee that all exit paths flow into the stream- + * out instructions. 
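+	 *
+	 * (ie. we end up with the CFG sketched above emit_stream_out():
+	 * blockOrigEnd -> { blockStreamOut, blockNewEnd }, with the 'end'
+	 * instruction landing in blockNewEnd.)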
+ */ + if ((ctx->compiler->gpu_id < 500) && + (ctx->so->shader->stream_output.num_outputs > 0) && + !ctx->so->binning_pass) { + debug_assert(ctx->so->type == MESA_SHADER_VERTEX); + emit_stream_out(ctx); + } + + ir3_END(ctx->block); +} + +static struct ir3_instruction * +create_frag_coord(struct ir3_context *ctx, unsigned comp) +{ + struct ir3_block *block = ctx->block; + struct ir3_instruction *instr; + + if (!ctx->frag_coord) { + ctx->frag_coord = create_input_compmask(ctx, 0, 0xf); + /* defer add_sysval_input() until after all inputs created */ + } + + split_dest(block, &instr, ctx->frag_coord, comp, 1); + + switch (comp) { + case 0: /* .x */ + case 1: /* .y */ + /* for frag_coord, we get unsigned values.. we need + * to subtract (integer) 8 and divide by 16 (right- + * shift by 4) then convert to float: + * + * sub.s tmp, src, 8 + * shr.b tmp, tmp, 4 + * mov.u32f32 dst, tmp + * + */ + instr = ir3_SUB_S(block, instr, 0, + create_immed(block, 8), 0); + instr = ir3_SHR_B(block, instr, 0, + create_immed(block, 4), 0); + instr = ir3_COV(block, instr, TYPE_U32, TYPE_F32); + + return instr; + case 2: /* .z */ + case 3: /* .w */ + default: + /* seems that we can use these as-is: */ + return instr; + } +} + +static void +setup_input(struct ir3_context *ctx, nir_variable *in) +{ + struct ir3_shader_variant *so = ctx->so; + unsigned ncomp = glsl_get_components(in->type); + unsigned n = in->data.driver_location; + unsigned slot = in->data.location; + + /* let's pretend things other than vec4 don't exist: */ + ncomp = MAX2(ncomp, 4); + + /* skip unread inputs, we could end up with (for example), unsplit + * matrix/etc inputs in the case they are not read, so just silently + * skip these. + */ + if (ncomp > 4) + return; + + compile_assert(ctx, ncomp == 4); + + so->inputs[n].slot = slot; + so->inputs[n].compmask = (1 << ncomp) - 1; + so->inputs_count = MAX2(so->inputs_count, n + 1); + so->inputs[n].interpolate = in->data.interpolation; + + if (ctx->so->type == MESA_SHADER_FRAGMENT) { + for (int i = 0; i < ncomp; i++) { + struct ir3_instruction *instr = NULL; + unsigned idx = (n * 4) + i; + + if (slot == VARYING_SLOT_POS) { + so->inputs[n].bary = false; + so->frag_coord = true; + instr = create_frag_coord(ctx, i); + } else if (slot == VARYING_SLOT_PNTC) { + /* see for example st_nir_fixup_varying_slots().. this is + * maybe a bit mesa/st specific. But we need things to line + * up for this in fdN_program: + * unsigned texmask = 1 << (slot - VARYING_SLOT_VAR0); + * if (emit->sprite_coord_enable & texmask) { + * ... 
+ * } + */ + so->inputs[n].slot = VARYING_SLOT_VAR8; + so->inputs[n].bary = true; + instr = create_frag_input(ctx, false); + } else { + bool use_ldlv = false; + + /* detect the special case for front/back colors where + * we need to do flat vs smooth shading depending on + * rast state: + */ + if (in->data.interpolation == INTERP_MODE_NONE) { + switch (slot) { + case VARYING_SLOT_COL0: + case VARYING_SLOT_COL1: + case VARYING_SLOT_BFC0: + case VARYING_SLOT_BFC1: + so->inputs[n].rasterflat = true; + break; + default: + break; + } + } + + if (ctx->compiler->flat_bypass) { + if ((so->inputs[n].interpolate == INTERP_MODE_FLAT) || + (so->inputs[n].rasterflat && ctx->so->key.rasterflat)) + use_ldlv = true; + } + + so->inputs[n].bary = true; + + instr = create_frag_input(ctx, use_ldlv); + } + + compile_assert(ctx, idx < ctx->ir->ninputs); + + ctx->ir->inputs[idx] = instr; + } + } else if (ctx->so->type == MESA_SHADER_VERTEX) { + for (int i = 0; i < ncomp; i++) { + unsigned idx = (n * 4) + i; + compile_assert(ctx, idx < ctx->ir->ninputs); + ctx->ir->inputs[idx] = create_input(ctx, idx); + } + } else { + compile_error(ctx, "unknown shader type: %d\n", ctx->so->type); + } + + if (so->inputs[n].bary || (ctx->so->type == MESA_SHADER_VERTEX)) { + so->total_in += ncomp; + } +} + +static void +setup_output(struct ir3_context *ctx, nir_variable *out) +{ + struct ir3_shader_variant *so = ctx->so; + unsigned ncomp = glsl_get_components(out->type); + unsigned n = out->data.driver_location; + unsigned slot = out->data.location; + unsigned comp = 0; + + /* let's pretend things other than vec4 don't exist: */ + ncomp = MAX2(ncomp, 4); + compile_assert(ctx, ncomp == 4); + + if (ctx->so->type == MESA_SHADER_FRAGMENT) { + switch (slot) { + case FRAG_RESULT_DEPTH: + comp = 2; /* tgsi will write to .z component */ + so->writes_pos = true; + break; + case FRAG_RESULT_COLOR: + so->color0_mrt = 1; + break; + default: + if (slot >= FRAG_RESULT_DATA0) + break; + compile_error(ctx, "unknown FS output name: %s\n", + gl_frag_result_name(slot)); + } + } else if (ctx->so->type == MESA_SHADER_VERTEX) { + switch (slot) { + case VARYING_SLOT_POS: + so->writes_pos = true; + break; + case VARYING_SLOT_PSIZ: + so->writes_psize = true; + break; + case VARYING_SLOT_COL0: + case VARYING_SLOT_COL1: + case VARYING_SLOT_BFC0: + case VARYING_SLOT_BFC1: + case VARYING_SLOT_FOGC: + case VARYING_SLOT_CLIP_DIST0: + case VARYING_SLOT_CLIP_DIST1: + case VARYING_SLOT_CLIP_VERTEX: + break; + default: + if (slot >= VARYING_SLOT_VAR0) + break; + if ((VARYING_SLOT_TEX0 <= slot) && (slot <= VARYING_SLOT_TEX7)) + break; + compile_error(ctx, "unknown VS output name: %s\n", + gl_varying_slot_name(slot)); + } + } else { + compile_error(ctx, "unknown shader type: %d\n", ctx->so->type); + } + + compile_assert(ctx, n < ARRAY_SIZE(so->outputs)); + + so->outputs[n].slot = slot; + so->outputs[n].regid = regid(n, comp); + so->outputs_count = MAX2(so->outputs_count, n + 1); + + for (int i = 0; i < ncomp; i++) { + unsigned idx = (n * 4) + i; + compile_assert(ctx, idx < ctx->ir->noutputs); + ctx->ir->outputs[idx] = create_immed(ctx->block, fui(0.0)); + } +} + +static int +max_drvloc(struct exec_list *vars) +{ + int drvloc = -1; + nir_foreach_variable(var, vars) { + drvloc = MAX2(drvloc, (int)var->data.driver_location); + } + return drvloc; +} + +static const unsigned max_sysvals[] = { + [MESA_SHADER_FRAGMENT] = 24, // TODO + [MESA_SHADER_VERTEX] = 16, + [MESA_SHADER_COMPUTE] = 16, // TODO how many do we actually need? 
+}; + +static void +emit_instructions(struct ir3_context *ctx) +{ + unsigned ninputs, noutputs; + nir_function_impl *fxn = nir_shader_get_entrypoint(ctx->s); + + ninputs = (max_drvloc(&ctx->s->inputs) + 1) * 4; + noutputs = (max_drvloc(&ctx->s->outputs) + 1) * 4; + + /* we need to leave room for sysvals: + */ + ninputs += max_sysvals[ctx->so->type]; + + ctx->ir = ir3_create(ctx->compiler, ninputs, noutputs); + + /* Create inputs in first block: */ + ctx->block = get_block(ctx, nir_start_block(fxn)); + ctx->in_block = ctx->block; + list_addtail(&ctx->block->node, &ctx->ir->block_list); + + ninputs -= max_sysvals[ctx->so->type]; + + /* for fragment shader, the vcoord input register is used as the + * base for bary.f varying fetch instrs: + */ + struct ir3_instruction *vcoord = NULL; + if (ctx->so->type == MESA_SHADER_FRAGMENT) { + struct ir3_instruction *xy[2]; + + vcoord = create_input_compmask(ctx, 0, 0x3); + split_dest(ctx->block, xy, vcoord, 0, 2); + + ctx->frag_vcoord = create_collect(ctx, xy, 2); + } + + /* Setup inputs: */ + nir_foreach_variable(var, &ctx->s->inputs) { + setup_input(ctx, var); + } + + /* Defer add_sysval_input() stuff until after setup_inputs(), + * because sysvals need to be appended after varyings: + */ + if (vcoord) { + add_sysval_input_compmask(ctx, SYSTEM_VALUE_VARYING_COORD, + 0x3, vcoord); + } + + if (ctx->frag_coord) { + add_sysval_input_compmask(ctx, SYSTEM_VALUE_FRAG_COORD, + 0xf, ctx->frag_coord); + } + + /* Setup outputs: */ + nir_foreach_variable(var, &ctx->s->outputs) { + setup_output(ctx, var); + } + + /* Setup registers (which should only be arrays): */ + nir_foreach_register(reg, &ctx->s->registers) { + declare_array(ctx, reg); + } + + /* NOTE: need to do something more clever when we support >1 fxn */ + nir_foreach_register(reg, &fxn->registers) { + declare_array(ctx, reg); + } + /* And emit the body: */ + ctx->impl = fxn; + emit_function(ctx, fxn); +} + +/* from NIR perspective, we actually have varying inputs. But the varying + * inputs, from an IR standpoint, are just bary.f/ldlv instructions. The + * only actual inputs are the sysvals. + */ +static void +fixup_frag_inputs(struct ir3_context *ctx) +{ + struct ir3_shader_variant *so = ctx->so; + struct ir3 *ir = ctx->ir; + unsigned i = 0; + + /* sysvals should appear at the end of the inputs, drop everything else: */ + while ((i < so->inputs_count) && !so->inputs[i].sysval) + i++; + + /* at IR level, inputs are always blocks of 4 scalars: */ + i *= 4; + + ir->inputs = &ir->inputs[i]; + ir->ninputs -= i; +} + +/* Fixup tex sampler state for astc/srgb workaround instructions. We + * need to assign the tex state indexes for these after we know the + * max tex index. + */ +static void +fixup_astc_srgb(struct ir3_context *ctx) +{ + struct ir3_shader_variant *so = ctx->so; + /* indexed by original tex idx, value is newly assigned alpha sampler + * state tex idx. Zero is invalid since there is at least one sampler + * if we get here. 
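+	 *
+	 * eg. (hypothetical numbers): with max_texture_index == 3 and the
+	 * workaround SAM's referencing tex idx 1 and 2, base becomes 4 and
+	 * the two alternate state slots assigned are 4 and 5.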
+ */ + unsigned alt_tex_state[16] = {0}; + unsigned tex_idx = ctx->max_texture_index + 1; + unsigned idx = 0; + + so->astc_srgb.base = tex_idx; + + for (unsigned i = 0; i < ctx->ir->astc_srgb_count; i++) { + struct ir3_instruction *sam = ctx->ir->astc_srgb[i]; + + compile_assert(ctx, sam->cat5.tex < ARRAY_SIZE(alt_tex_state)); + + if (alt_tex_state[sam->cat5.tex] == 0) { + /* assign new alternate/alpha tex state slot: */ + alt_tex_state[sam->cat5.tex] = tex_idx++; + so->astc_srgb.orig_idx[idx++] = sam->cat5.tex; + so->astc_srgb.count++; + } + + sam->cat5.tex = alt_tex_state[sam->cat5.tex]; + } +} + +static void +fixup_binning_pass(struct ir3_context *ctx) +{ + struct ir3_shader_variant *so = ctx->so; + struct ir3 *ir = ctx->ir; + unsigned i, j; + + for (i = 0, j = 0; i < so->outputs_count; i++) { + unsigned slot = so->outputs[i].slot; + + /* throw away everything but first position/psize */ + if ((slot == VARYING_SLOT_POS) || (slot == VARYING_SLOT_PSIZ)) { + if (i != j) { + so->outputs[j] = so->outputs[i]; + ir->outputs[(j*4)+0] = ir->outputs[(i*4)+0]; + ir->outputs[(j*4)+1] = ir->outputs[(i*4)+1]; + ir->outputs[(j*4)+2] = ir->outputs[(i*4)+2]; + ir->outputs[(j*4)+3] = ir->outputs[(i*4)+3]; + } + j++; + } + } + so->outputs_count = j; + ir->noutputs = j * 4; +} + +int +ir3_compile_shader_nir(struct ir3_compiler *compiler, + struct ir3_shader_variant *so) +{ + struct ir3_context *ctx; + struct ir3 *ir; + struct ir3_instruction **inputs; + unsigned i, actual_in, inloc; + int ret = 0, max_bary; + + assert(!so->ir); + + ctx = compile_init(compiler, so); + if (!ctx) { + DBG("INIT failed!"); + ret = -1; + goto out; + } + + emit_instructions(ctx); + + if (ctx->error) { + DBG("EMIT failed!"); + ret = -1; + goto out; + } + + ir = so->ir = ctx->ir; + + /* keep track of the inputs from TGSI perspective.. */ + inputs = ir->inputs; + + /* but fixup actual inputs for frag shader: */ + if (so->type == MESA_SHADER_FRAGMENT) + fixup_frag_inputs(ctx); + + /* at this point, for binning pass, throw away unneeded outputs: */ + if (so->binning_pass && (ctx->compiler->gpu_id < 600)) + fixup_binning_pass(ctx); + + /* if we want half-precision outputs, mark the output registers + * as half: + */ + if (so->key.half_precision) { + for (i = 0; i < ir->noutputs; i++) { + struct ir3_instruction *out = ir->outputs[i]; + + if (!out) + continue; + + /* if frag shader writes z, that needs to be full precision: */ + if (so->outputs[i/4].slot == FRAG_RESULT_DEPTH) + continue; + + out->regs[0]->flags |= IR3_REG_HALF; + /* output could be a fanout (ie. texture fetch output) + * in which case we need to propagate the half-reg flag + * up to the definer so that RA sees it: + */ + if (out->opc == OPC_META_FO) { + out = out->regs[1]->instr; + out->regs[0]->flags |= IR3_REG_HALF; + } + + if (out->opc == OPC_MOV) { + out->cat1.dst_type = half_type(out->cat1.dst_type); + } + } + } + + if (ir3_shader_debug & IR3_DBG_OPTMSGS) { + printf("BEFORE CP:\n"); + ir3_print(ir); + } + + ir3_cp(ir, so); + + /* at this point, for binning pass, throw away unneeded outputs: + * Note that for a6xx and later, we do this after ir3_cp to ensure + * that the uniform/constant layout for BS and VS matches, so that + * we can re-use same VS_CONST state group. + */ + if (so->binning_pass && (ctx->compiler->gpu_id >= 600)) + fixup_binning_pass(ctx); + + /* Insert mov if there's same instruction for each output. + * eg. 
dEQP-GLES31.functional.shaders.opaque_type_indexing.sampler.const_expression.vertex.sampler2dshadow + */ + for (int i = ir->noutputs - 1; i >= 0; i--) { + if (!ir->outputs[i]) + continue; + for (unsigned j = 0; j < i; j++) { + if (ir->outputs[i] == ir->outputs[j]) { + ir->outputs[i] = + ir3_MOV(ir->outputs[i]->block, ir->outputs[i], TYPE_F32); + } + } + } + + if (ir3_shader_debug & IR3_DBG_OPTMSGS) { + printf("BEFORE GROUPING:\n"); + ir3_print(ir); + } + + ir3_sched_add_deps(ir); + + /* Group left/right neighbors, inserting mov's where needed to + * solve conflicts: + */ + ir3_group(ir); + + if (ir3_shader_debug & IR3_DBG_OPTMSGS) { + printf("AFTER GROUPING:\n"); + ir3_print(ir); + } + + ir3_depth(ir); + + if (ir3_shader_debug & IR3_DBG_OPTMSGS) { + printf("AFTER DEPTH:\n"); + ir3_print(ir); + } + + ret = ir3_sched(ir); + if (ret) { + DBG("SCHED failed!"); + goto out; + } + + if (ir3_shader_debug & IR3_DBG_OPTMSGS) { + printf("AFTER SCHED:\n"); + ir3_print(ir); + } + + ret = ir3_ra(ir, so->type, so->frag_coord, so->frag_face); + if (ret) { + DBG("RA failed!"); + goto out; + } + + if (ir3_shader_debug & IR3_DBG_OPTMSGS) { + printf("AFTER RA:\n"); + ir3_print(ir); + } + + /* fixup input/outputs: */ + for (i = 0; i < so->outputs_count; i++) { + so->outputs[i].regid = ir->outputs[i*4]->regs[0]->num; + } + + /* Note that some or all channels of an input may be unused: */ + actual_in = 0; + inloc = 0; + for (i = 0; i < so->inputs_count; i++) { + unsigned j, reg = regid(63,0), compmask = 0, maxcomp = 0; + so->inputs[i].ncomp = 0; + so->inputs[i].inloc = inloc; + for (j = 0; j < 4; j++) { + struct ir3_instruction *in = inputs[(i*4) + j]; + if (in && !(in->flags & IR3_INSTR_UNUSED)) { + compmask |= (1 << j); + reg = in->regs[0]->num - j; + actual_in++; + so->inputs[i].ncomp++; + if ((so->type == MESA_SHADER_FRAGMENT) && so->inputs[i].bary) { + /* assign inloc: */ + assert(in->regs[1]->flags & IR3_REG_IMMED); + in->regs[1]->iim_val = inloc + j; + maxcomp = j + 1; + } + } + } + if ((so->type == MESA_SHADER_FRAGMENT) && compmask && so->inputs[i].bary) { + so->varying_in++; + so->inputs[i].compmask = (1 << maxcomp) - 1; + inloc += maxcomp; + } else if (!so->inputs[i].sysval) { + so->inputs[i].compmask = compmask; + } + so->inputs[i].regid = reg; + } + + if (ctx->astc_srgb) + fixup_astc_srgb(ctx); + + /* We need to do legalize after (for frag shader's) the "bary.f" + * offsets (inloc) have been assigned. 
+ */ + ir3_legalize(ir, &so->num_samp, &so->has_ssbo, &max_bary); + + if (ir3_shader_debug & IR3_DBG_OPTMSGS) { + printf("AFTER LEGALIZE:\n"); + ir3_print(ir); + } + + /* Note that actual_in counts inputs that are not bary.f'd for FS: */ + if (so->type == MESA_SHADER_VERTEX) + so->total_in = actual_in; + else + so->total_in = max_bary + 1; + +out: + if (ret) { + if (so->ir) + ir3_destroy(so->ir); + so->ir = NULL; + } + compile_free(ctx); + + return ret; +} diff --git a/src/freedreno/ir3/ir3_cp.c b/src/freedreno/ir3/ir3_cp.c new file mode 100644 index 00000000000..e8e8cc311e3 --- /dev/null +++ b/src/freedreno/ir3/ir3_cp.c @@ -0,0 +1,653 @@ +/* + * Copyright (C) 2014 Rob Clark <[email protected]> + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Authors: + * Rob Clark <[email protected]> + */ + +#include <math.h> + +#include "ir3.h" +#include "ir3_shader.h" + +/* + * Copy Propagate: + */ + +struct ir3_cp_ctx { + struct ir3 *shader; + struct ir3_shader_variant *so; + unsigned immediate_idx; +}; + +/* is it a type preserving mov, with ok flags? */ +static bool is_eligible_mov(struct ir3_instruction *instr, bool allow_flags) +{ + if (is_same_type_mov(instr)) { + struct ir3_register *dst = instr->regs[0]; + struct ir3_register *src = instr->regs[1]; + struct ir3_instruction *src_instr = ssa(src); + + /* only if mov src is SSA (not const/immed): */ + if (!src_instr) + return false; + + /* no indirect: */ + if (dst->flags & IR3_REG_RELATIV) + return false; + if (src->flags & IR3_REG_RELATIV) + return false; + + if (src->flags & IR3_REG_ARRAY) + return false; + + if (!allow_flags) + if (src->flags & (IR3_REG_FABS | IR3_REG_FNEG | + IR3_REG_SABS | IR3_REG_SNEG | IR3_REG_BNOT)) + return false; + + /* TODO: remove this hack: */ + if (src_instr->opc == OPC_META_FO) + return false; + + return true; + } + return false; +} + +static unsigned cp_flags(unsigned flags) +{ + /* only considering these flags (at least for now): */ + flags &= (IR3_REG_CONST | IR3_REG_IMMED | + IR3_REG_FNEG | IR3_REG_FABS | + IR3_REG_SNEG | IR3_REG_SABS | + IR3_REG_BNOT | IR3_REG_RELATIV); + return flags; +} + +static bool valid_flags(struct ir3_instruction *instr, unsigned n, + unsigned flags) +{ + unsigned valid_flags; + flags = cp_flags(flags); + + /* If destination is indirect, then source cannot be.. at least + * I don't think so.. 
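+	 *
+	 * ie. something like a mov with both an a0.x-relative dst and an
+	 * a0.x-relative src is assumed to not be encodable.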
+ */ + if ((instr->regs[0]->flags & IR3_REG_RELATIV) && + (flags & IR3_REG_RELATIV)) + return false; + + /* TODO it seems to *mostly* work to cp RELATIV, except we get some + * intermittent piglit variable-indexing fails. Newer blob driver + * doesn't seem to cp these. Possibly this is hw workaround? Not + * sure, but until that is understood better, lets just switch off + * cp for indirect src's: + */ + if (flags & IR3_REG_RELATIV) + return false; + + switch (opc_cat(instr->opc)) { + case 1: + valid_flags = IR3_REG_IMMED | IR3_REG_CONST | IR3_REG_RELATIV; + if (flags & ~valid_flags) + return false; + break; + case 2: + valid_flags = ir3_cat2_absneg(instr->opc) | + IR3_REG_CONST | IR3_REG_RELATIV; + + if (ir3_cat2_int(instr->opc)) + valid_flags |= IR3_REG_IMMED; + + if (flags & ~valid_flags) + return false; + + if (flags & (IR3_REG_CONST | IR3_REG_IMMED)) { + unsigned m = (n ^ 1) + 1; + /* cannot deal w/ const in both srcs: + * (note that some cat2 actually only have a single src) + */ + if (m < instr->regs_count) { + struct ir3_register *reg = instr->regs[m]; + if ((flags & IR3_REG_CONST) && (reg->flags & IR3_REG_CONST)) + return false; + if ((flags & IR3_REG_IMMED) && (reg->flags & IR3_REG_IMMED)) + return false; + } + /* cannot be const + ABS|NEG: */ + if (flags & (IR3_REG_FABS | IR3_REG_FNEG | + IR3_REG_SABS | IR3_REG_SNEG | IR3_REG_BNOT)) + return false; + } + break; + case 3: + valid_flags = ir3_cat3_absneg(instr->opc) | + IR3_REG_CONST | IR3_REG_RELATIV; + + if (flags & ~valid_flags) + return false; + + if (flags & (IR3_REG_CONST | IR3_REG_RELATIV)) { + /* cannot deal w/ const/relativ in 2nd src: */ + if (n == 1) + return false; + } + + if (flags & IR3_REG_CONST) { + /* cannot be const + ABS|NEG: */ + if (flags & (IR3_REG_FABS | IR3_REG_FNEG | + IR3_REG_SABS | IR3_REG_SNEG | IR3_REG_BNOT)) + return false; + } + break; + case 4: + /* seems like blob compiler avoids const as src.. */ + /* TODO double check if this is still the case on a4xx */ + if (flags & (IR3_REG_CONST | IR3_REG_IMMED)) + return false; + if (flags & (IR3_REG_SABS | IR3_REG_SNEG)) + return false; + break; + case 5: + /* no flags allowed */ + if (flags) + return false; + break; + case 6: + valid_flags = IR3_REG_IMMED; + if (flags & ~valid_flags) + return false; + + if (flags & IR3_REG_IMMED) { + /* doesn't seem like we can have immediate src for store + * instructions: + * + * TODO this restriction could also apply to load instructions, + * but for load instructions this arg is the address (and not + * really sure any good way to test a hard-coded immed addr src) + */ + if (is_store(instr) && (n == 1)) + return false; + + if ((instr->opc == OPC_LDL) && (n != 1)) + return false; + + if ((instr->opc == OPC_STL) && (n != 2)) + return false; + + /* disallow CP into anything but the SSBO slot argument for + * atomics: + */ + if (is_atomic(instr->opc) && (n != 0)) + return false; + + if (is_atomic(instr->opc) && !(instr->flags & IR3_INSTR_G)) + return false; + } + + break; + } + + return true; +} + +/* propagate register flags from src to dst.. negates need special + * handling to cancel each other out. 
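+ *
+ * eg. (neg) applied to a src that already carries (neg) yields the
+ * original value, which is why FNEG/SNEG/BNOT are xor'd (rather than
+ * or'd) below:
+ *
+ *    (neg)(neg)x == x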
+ */ +static void combine_flags(unsigned *dstflags, struct ir3_instruction *src) +{ + unsigned srcflags = src->regs[1]->flags; + + /* if what we are combining into already has (abs) flags, + * we can drop (neg) from src: + */ + if (*dstflags & IR3_REG_FABS) + srcflags &= ~IR3_REG_FNEG; + if (*dstflags & IR3_REG_SABS) + srcflags &= ~IR3_REG_SNEG; + + if (srcflags & IR3_REG_FABS) + *dstflags |= IR3_REG_FABS; + if (srcflags & IR3_REG_SABS) + *dstflags |= IR3_REG_SABS; + if (srcflags & IR3_REG_FNEG) + *dstflags ^= IR3_REG_FNEG; + if (srcflags & IR3_REG_SNEG) + *dstflags ^= IR3_REG_SNEG; + if (srcflags & IR3_REG_BNOT) + *dstflags ^= IR3_REG_BNOT; + + *dstflags &= ~IR3_REG_SSA; + *dstflags |= srcflags & IR3_REG_SSA; + *dstflags |= srcflags & IR3_REG_CONST; + *dstflags |= srcflags & IR3_REG_IMMED; + *dstflags |= srcflags & IR3_REG_RELATIV; + *dstflags |= srcflags & IR3_REG_ARRAY; + + /* if src of the src is boolean we can drop the (abs) since we know + * the source value is already a postitive integer. This cleans + * up the absnegs that get inserted when converting between nir and + * native boolean (see ir3_b2n/n2b) + */ + struct ir3_instruction *srcsrc = ssa(src->regs[1]); + if (srcsrc && is_bool(srcsrc)) + *dstflags &= ~IR3_REG_SABS; +} + +static struct ir3_register * +lower_immed(struct ir3_cp_ctx *ctx, struct ir3_register *reg, unsigned new_flags) +{ + unsigned swiz, idx, i; + + reg = ir3_reg_clone(ctx->shader, reg); + + /* in some cases, there are restrictions on (abs)/(neg) plus const.. + * so just evaluate those and clear the flags: + */ + if (new_flags & IR3_REG_SABS) { + reg->iim_val = abs(reg->iim_val); + new_flags &= ~IR3_REG_SABS; + } + + if (new_flags & IR3_REG_FABS) { + reg->fim_val = fabs(reg->fim_val); + new_flags &= ~IR3_REG_FABS; + } + + if (new_flags & IR3_REG_SNEG) { + reg->iim_val = -reg->iim_val; + new_flags &= ~IR3_REG_SNEG; + } + + if (new_flags & IR3_REG_FNEG) { + reg->fim_val = -reg->fim_val; + new_flags &= ~IR3_REG_FNEG; + } + + /* Reallocate for 4 more elements whenever it's necessary */ + if (ctx->immediate_idx == ctx->so->immediates_size * 4) { + ctx->so->immediates_size += 4; + ctx->so->immediates = realloc (ctx->so->immediates, + ctx->so->immediates_size * sizeof (ctx->so->immediates[0])); + } + + for (i = 0; i < ctx->immediate_idx; i++) { + swiz = i % 4; + idx = i / 4; + + if (ctx->so->immediates[idx].val[swiz] == reg->uim_val) { + break; + } + } + + if (i == ctx->immediate_idx) { + /* need to generate a new immediate: */ + swiz = i % 4; + idx = i / 4; + ctx->so->immediates[idx].val[swiz] = reg->uim_val; + ctx->so->immediates_count = idx + 1; + ctx->immediate_idx++; + } + + new_flags &= ~IR3_REG_IMMED; + new_flags |= IR3_REG_CONST; + reg->flags = new_flags; + reg->num = i + (4 * ctx->so->constbase.immediate); + + return reg; +} + +static void +unuse(struct ir3_instruction *instr) +{ + debug_assert(instr->use_count > 0); + + if (--instr->use_count == 0) { + struct ir3_block *block = instr->block; + + instr->barrier_class = 0; + instr->barrier_conflict = 0; + + /* we don't want to remove anything in keeps (which could + * be things like array store's) + */ + for (unsigned i = 0; i < block->keeps_count; i++) { + debug_assert(block->keeps[i] != instr); + } + } +} + +/** + * Handle cp for a given src register. This additionally handles + * the cases of collapsing immedate/const (which replace the src + * register with a non-ssa src) or collapsing mov's from relative + * src (which needs to also fixup the address src reference by the + * instruction). 
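+ *
+ * A hypothetical before/after sketch (register numbers made up):
+ *
+ *    mov.f32f32 r0.x, c0.x
+ *    add.f r1.x, r0.x, r2.x
+ *
+ * becomes, assuming valid_flags() accepts a const in that src slot:
+ *
+ *    add.f r1.x, c0.x, r2.x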
+ */ +static void +reg_cp(struct ir3_cp_ctx *ctx, struct ir3_instruction *instr, + struct ir3_register *reg, unsigned n) +{ + struct ir3_instruction *src = ssa(reg); + + if (is_eligible_mov(src, true)) { + /* simple case, no immed/const/relativ, only mov's w/ ssa src: */ + struct ir3_register *src_reg = src->regs[1]; + unsigned new_flags = reg->flags; + + combine_flags(&new_flags, src); + + if (valid_flags(instr, n, new_flags)) { + if (new_flags & IR3_REG_ARRAY) { + debug_assert(!(reg->flags & IR3_REG_ARRAY)); + reg->array = src_reg->array; + } + reg->flags = new_flags; + reg->instr = ssa(src_reg); + + instr->barrier_class |= src->barrier_class; + instr->barrier_conflict |= src->barrier_conflict; + + unuse(src); + reg->instr->use_count++; + } + + } else if (is_same_type_mov(src) && + /* cannot collapse const/immed/etc into meta instrs: */ + !is_meta(instr)) { + /* immed/const/etc cases, which require some special handling: */ + struct ir3_register *src_reg = src->regs[1]; + unsigned new_flags = reg->flags; + + combine_flags(&new_flags, src); + + if (!valid_flags(instr, n, new_flags)) { + /* See if lowering an immediate to const would help. */ + if (valid_flags(instr, n, (new_flags & ~IR3_REG_IMMED) | IR3_REG_CONST)) { + debug_assert(new_flags & IR3_REG_IMMED); + instr->regs[n + 1] = lower_immed(ctx, src_reg, new_flags); + return; + } + + /* special case for "normal" mad instructions, we can + * try swapping the first two args if that fits better. + * + * the "plain" MAD's (ie. the ones that don't shift first + * src prior to multiply) can swap their first two srcs if + * src[0] is !CONST and src[1] is CONST: + */ + if ((n == 1) && is_mad(instr->opc) && + !(instr->regs[0 + 1]->flags & (IR3_REG_CONST | IR3_REG_RELATIV)) && + valid_flags(instr, 0, new_flags & ~IR3_REG_IMMED)) { + /* swap src[0] and src[1]: */ + struct ir3_register *tmp; + tmp = instr->regs[0 + 1]; + instr->regs[0 + 1] = instr->regs[1 + 1]; + instr->regs[1 + 1] = tmp; + + n = 0; + } else { + return; + } + } + + /* Here we handle the special case of mov from + * CONST and/or RELATIV. These need to be handled + * specially, because in the case of move from CONST + * there is no src ir3_instruction so we need to + * replace the ir3_register. And in the case of + * RELATIV we need to handle the address register + * dependency. + */ + if (src_reg->flags & IR3_REG_CONST) { + /* an instruction cannot reference two different + * address registers: + */ + if ((src_reg->flags & IR3_REG_RELATIV) && + conflicts(instr->address, reg->instr->address)) + return; + + /* This seems to be a hw bug, or something where the timings + * just somehow don't work out. This restriction may only + * apply if the first src is also CONST. + */ + if ((opc_cat(instr->opc) == 3) && (n == 2) && + (src_reg->flags & IR3_REG_RELATIV) && + (src_reg->array.offset == 0)) + return; + + src_reg = ir3_reg_clone(instr->block->shader, src_reg); + src_reg->flags = new_flags; + instr->regs[n+1] = src_reg; + + if (src_reg->flags & IR3_REG_RELATIV) + ir3_instr_set_address(instr, reg->instr->address); + + return; + } + + if ((src_reg->flags & IR3_REG_RELATIV) && + !conflicts(instr->address, reg->instr->address)) { + src_reg = ir3_reg_clone(instr->block->shader, src_reg); + src_reg->flags = new_flags; + instr->regs[n+1] = src_reg; + ir3_instr_set_address(instr, reg->instr->address); + + return; + } + + /* NOTE: seems we can only do immed integers, so don't + * need to care about float. 
But we do need to handle + * abs/neg *before* checking that the immediate requires + * few enough bits to encode: + * + * TODO: do we need to do something to avoid accidentally + * catching a float immed? + */ + if (src_reg->flags & IR3_REG_IMMED) { + int32_t iim_val = src_reg->iim_val; + + debug_assert((opc_cat(instr->opc) == 1) || + (opc_cat(instr->opc) == 6) || + ir3_cat2_int(instr->opc) || + (is_mad(instr->opc) && (n == 0))); + + if (new_flags & IR3_REG_SABS) + iim_val = abs(iim_val); + + if (new_flags & IR3_REG_SNEG) + iim_val = -iim_val; + + if (new_flags & IR3_REG_BNOT) + iim_val = ~iim_val; + + /* other than category 1 (mov) we can only encode up to 10 bits: */ + if ((instr->opc == OPC_MOV) || + !((iim_val & ~0x3ff) && (-iim_val & ~0x3ff))) { + new_flags &= ~(IR3_REG_SABS | IR3_REG_SNEG | IR3_REG_BNOT); + src_reg = ir3_reg_clone(instr->block->shader, src_reg); + src_reg->flags = new_flags; + src_reg->iim_val = iim_val; + instr->regs[n+1] = src_reg; + } else if (valid_flags(instr, n, (new_flags & ~IR3_REG_IMMED) | IR3_REG_CONST)) { + /* See if lowering an immediate to const would help. */ + instr->regs[n+1] = lower_immed(ctx, src_reg, new_flags); + } + + return; + } + } +} + +/* Handle special case of eliminating output mov, and similar cases where + * there isn't a normal "consuming" instruction. In this case we cannot + * collapse flags (ie. output mov from const, or w/ abs/neg flags, cannot + * be eliminated) + */ +static struct ir3_instruction * +eliminate_output_mov(struct ir3_instruction *instr) +{ + if (is_eligible_mov(instr, false)) { + struct ir3_register *reg = instr->regs[1]; + if (!(reg->flags & IR3_REG_ARRAY)) { + struct ir3_instruction *src_instr = ssa(reg); + debug_assert(src_instr); + return src_instr; + } + } + return instr; +} + +/** + * Find instruction src's which are mov's that can be collapsed, replacing + * the mov dst with the mov src + */ +static void +instr_cp(struct ir3_cp_ctx *ctx, struct ir3_instruction *instr) +{ + struct ir3_register *reg; + + if (instr->regs_count == 0) + return; + + if (ir3_instr_check_mark(instr)) + return; + + /* walk down the graph from each src: */ + foreach_src_n(reg, n, instr) { + struct ir3_instruction *src = ssa(reg); + + if (!src) + continue; + + instr_cp(ctx, src); + + /* TODO non-indirect access we could figure out which register + * we actually want and allow cp.. + */ + if (reg->flags & IR3_REG_ARRAY) + continue; + + /* Don't CP absneg into meta instructions, that won't end well: */ + if (is_meta(instr) && (src->opc != OPC_MOV)) + continue; + + reg_cp(ctx, instr, reg, n); + } + + if (instr->regs[0]->flags & IR3_REG_ARRAY) { + struct ir3_instruction *src = ssa(instr->regs[0]); + if (src) + instr_cp(ctx, src); + } + + if (instr->address) { + instr_cp(ctx, instr->address); + ir3_instr_set_address(instr, eliminate_output_mov(instr->address)); + } + + /* we can end up with extra cmps.s from frontend, which uses a + * + * cmps.s p0.x, cond, 0 + * + * as a way to mov into the predicate register. But frequently 'cond' + * is itself a cmps.s/cmps.f/cmps.u. So detect this special case and + * just re-write the instruction writing predicate register to get rid + * of the double cmps. 
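+ *
+ * i.e. (hypothetical registers):
+ *
+ *    cmps.f.lt r0.x, r1.x, r2.x
+ *    cmps.s p0.x, r0.x, 0
+ *
+ * gets rewritten into a single:
+ *
+ *    cmps.f.lt p0.x, r1.x, r2.x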
+ */ + if ((instr->opc == OPC_CMPS_S) && + (instr->regs[0]->num == regid(REG_P0, 0)) && + ssa(instr->regs[1]) && + (instr->regs[2]->flags & IR3_REG_IMMED) && + (instr->regs[2]->iim_val == 0)) { + struct ir3_instruction *cond = ssa(instr->regs[1]); + switch (cond->opc) { + case OPC_CMPS_S: + case OPC_CMPS_F: + case OPC_CMPS_U: + instr->opc = cond->opc; + instr->flags = cond->flags; + instr->cat2 = cond->cat2; + instr->address = cond->address; + instr->regs[1] = cond->regs[1]; + instr->regs[2] = cond->regs[2]; + instr->barrier_class |= cond->barrier_class; + instr->barrier_conflict |= cond->barrier_conflict; + unuse(cond); + break; + default: + break; + } + } +} + +void +ir3_cp(struct ir3 *ir, struct ir3_shader_variant *so) +{ + struct ir3_cp_ctx ctx = { + .shader = ir, + .so = so, + }; + + /* This is a bit annoying, and probably wouldn't be necessary if we + * tracked a reverse link from producing instruction to consumer. + * But we need to know when we've eliminated the last consumer of + * a mov, so we need to do a pass to first count consumers of a + * mov. + */ + list_for_each_entry (struct ir3_block, block, &ir->block_list, node) { + list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) { + struct ir3_instruction *src; + + /* by the way, we don't account for false-dep's, so the CP + * pass should always happen before false-dep's are inserted + */ + debug_assert(instr->deps_count == 0); + + foreach_ssa_src(src, instr) { + src->use_count++; + } + } + } + + ir3_clear_mark(ir); + + for (unsigned i = 0; i < ir->noutputs; i++) { + if (ir->outputs[i]) { + instr_cp(&ctx, ir->outputs[i]); + ir->outputs[i] = eliminate_output_mov(ir->outputs[i]); + } + } + + list_for_each_entry (struct ir3_block, block, &ir->block_list, node) { + if (block->condition) { + instr_cp(&ctx, block->condition); + block->condition = eliminate_output_mov(block->condition); + } + + for (unsigned i = 0; i < block->keeps_count; i++) { + instr_cp(&ctx, block->keeps[i]); + block->keeps[i] = eliminate_output_mov(block->keeps[i]); + } + } +} diff --git a/src/freedreno/ir3/ir3_depth.c b/src/freedreno/ir3/ir3_depth.c new file mode 100644 index 00000000000..73bf5e19926 --- /dev/null +++ b/src/freedreno/ir3/ir3_depth.c @@ -0,0 +1,245 @@ +/* + * Copyright (C) 2014 Rob Clark <[email protected]> + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + * Authors: + * Rob Clark <[email protected]> + */ + +#include "util/u_math.h" + +#include "ir3.h" + +/* + * Instruction Depth: + * + * Calculates weighted instruction depth, ie. the sum of # of needed + * instructions plus delay slots back to original input (ie INPUT or + * CONST). That is to say, an instructions depth is: + * + * depth(instr) { + * d = 0; + * // for each src register: + * foreach (src in instr->regs[1..n]) + * d = max(d, delayslots(src->instr, n) + depth(src->instr)); + * return d + 1; + * } + * + * After an instruction's depth is calculated, it is inserted into the + * blocks depth sorted list, which is used by the scheduling pass. + */ + +/* generally don't count false dependencies, since this can just be + * something like a barrier, or SSBO store. The exception is array + * dependencies if the assigner is an array write and the consumer + * reads the same array. + */ +static bool +ignore_dep(struct ir3_instruction *assigner, + struct ir3_instruction *consumer, unsigned n) +{ + if (!__is_false_dep(consumer, n)) + return false; + + if (assigner->barrier_class & IR3_BARRIER_ARRAY_W) { + struct ir3_register *dst = assigner->regs[0]; + struct ir3_register *src; + + debug_assert(dst->flags & IR3_REG_ARRAY); + + foreach_src(src, consumer) { + if ((src->flags & IR3_REG_ARRAY) && + (dst->array.id == src->array.id)) { + return false; + } + } + } + + return true; +} + +/* calculate required # of delay slots between the instruction that + * assigns a value and the one that consumes + */ +int ir3_delayslots(struct ir3_instruction *assigner, + struct ir3_instruction *consumer, unsigned n) +{ + if (ignore_dep(assigner, consumer, n)) + return 0; + + /* worst case is cat1-3 (alu) -> cat4/5 needing 6 cycles, normal + * alu -> alu needs 3 cycles, cat4 -> alu and texture fetch + * handled with sync bits + */ + + if (is_meta(assigner)) + return 0; + + if (writes_addr(assigner)) + return 6; + + /* handled via sync flags: */ + if (is_sfu(assigner) || is_tex(assigner) || is_mem(assigner)) + return 0; + + /* assigner must be alu: */ + if (is_flow(consumer) || is_sfu(consumer) || is_tex(consumer) || + is_mem(consumer)) { + return 6; + } else if ((is_mad(consumer->opc) || is_madsh(consumer->opc)) && + (n == 3)) { + /* special case, 3rd src to cat3 not required on first cycle */ + return 1; + } else { + return 3; + } +} + +void +ir3_insert_by_depth(struct ir3_instruction *instr, struct list_head *list) +{ + /* remove from existing spot in list: */ + list_delinit(&instr->node); + + /* find where to re-insert instruction: */ + list_for_each_entry (struct ir3_instruction, pos, list, node) { + if (pos->depth > instr->depth) { + list_add(&instr->node, &pos->node); + return; + } + } + /* if we get here, we didn't find an insertion spot: */ + list_addtail(&instr->node, list); +} + +static void +ir3_instr_depth(struct ir3_instruction *instr, unsigned boost, bool falsedep) +{ + struct ir3_instruction *src; + + /* don't mark falsedep's as used, but otherwise process them normally: */ + if (!falsedep) + instr->flags &= ~IR3_INSTR_UNUSED; + + if (ir3_instr_check_mark(instr)) + return; + + instr->depth = 0; + + foreach_ssa_src_n(src, i, instr) { + unsigned sd; + + /* visit child to compute it's depth: */ + ir3_instr_depth(src, boost, __is_false_dep(instr, i)); + + /* for array writes, no need to delay on previous write: */ + if (i == 0) + continue; + + sd = ir3_delayslots(src, instr, i) + src->depth; + sd += boost; + + instr->depth = MAX2(instr->depth, sd); + } + + if (!is_meta(instr)) + 
instr->depth++; + + ir3_insert_by_depth(instr, &instr->block->instr_list); +} + +static bool +remove_unused_by_block(struct ir3_block *block) +{ + bool progress = false; + list_for_each_entry_safe (struct ir3_instruction, instr, &block->instr_list, node) { + if (instr->opc == OPC_END) + continue; + if (instr->flags & IR3_INSTR_UNUSED) { + list_delinit(&instr->node); + progress = true; + } + } + return progress; +} + +static bool +compute_depth_and_remove_unused(struct ir3 *ir) +{ + unsigned i; + bool progress = false; + + ir3_clear_mark(ir); + + /* initially mark everything as unused, we'll clear the flag as we + * visit the instructions: + */ + list_for_each_entry (struct ir3_block, block, &ir->block_list, node) { + list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) { + instr->flags |= IR3_INSTR_UNUSED; + } + } + + for (i = 0; i < ir->noutputs; i++) + if (ir->outputs[i]) + ir3_instr_depth(ir->outputs[i], 0, false); + + list_for_each_entry (struct ir3_block, block, &ir->block_list, node) { + for (i = 0; i < block->keeps_count; i++) + ir3_instr_depth(block->keeps[i], 0, false); + + /* We also need to account for if-condition: */ + if (block->condition) + ir3_instr_depth(block->condition, 6, false); + } + + /* mark un-used instructions: */ + list_for_each_entry (struct ir3_block, block, &ir->block_list, node) { + progress |= remove_unused_by_block(block); + } + + /* note that we can end up with unused indirects, but we should + * not end up with unused predicates. + */ + for (i = 0; i < ir->indirects_count; i++) { + struct ir3_instruction *instr = ir->indirects[i]; + if (instr && (instr->flags & IR3_INSTR_UNUSED)) + ir->indirects[i] = NULL; + } + + /* cleanup unused inputs: */ + for (i = 0; i < ir->ninputs; i++) { + struct ir3_instruction *in = ir->inputs[i]; + if (in && (in->flags & IR3_INSTR_UNUSED)) + ir->inputs[i] = NULL; + } + + return progress; +} + +void +ir3_depth(struct ir3 *ir) +{ + bool progress; + do { + progress = compute_depth_and_remove_unused(ir); + } while (progress); +} diff --git a/src/freedreno/ir3/ir3_group.c b/src/freedreno/ir3/ir3_group.c new file mode 100644 index 00000000000..570055973e8 --- /dev/null +++ b/src/freedreno/ir3/ir3_group.c @@ -0,0 +1,274 @@ +/* + * Copyright (C) 2014 Rob Clark <[email protected]> + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + * Authors: + * Rob Clark <[email protected]> + */ + +#include "ir3.h" + +/* + * Find/group instruction neighbors: + */ + +/* bleh.. we need to do the same group_n() thing for both inputs/outputs + * (where we have a simple instr[] array), and fanin nodes (where we have + * an extra indirection via reg->instr). + */ +struct group_ops { + struct ir3_instruction *(*get)(void *arr, int idx); + void (*insert_mov)(void *arr, int idx, struct ir3_instruction *instr); +}; + +static struct ir3_instruction *arr_get(void *arr, int idx) +{ + return ((struct ir3_instruction **)arr)[idx]; +} +static void arr_insert_mov_out(void *arr, int idx, struct ir3_instruction *instr) +{ + ((struct ir3_instruction **)arr)[idx] = + ir3_MOV(instr->block, instr, TYPE_F32); +} +static void arr_insert_mov_in(void *arr, int idx, struct ir3_instruction *instr) +{ + /* so, we can't insert a mov in front of a meta:in.. and the downstream + * instruction already has a pointer to 'instr'. So we cheat a bit and + * morph the meta:in instruction into a mov and insert a new meta:in + * in front. + */ + struct ir3_instruction *in; + + debug_assert(instr->regs_count == 1); + + in = ir3_instr_create(instr->block, OPC_META_INPUT); + in->inout.block = instr->block; + ir3_reg_create(in, instr->regs[0]->num, 0); + + /* create src reg for meta:in and fixup to now be a mov: */ + ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = in; + instr->opc = OPC_MOV; + instr->cat1.src_type = TYPE_F32; + instr->cat1.dst_type = TYPE_F32; + + ((struct ir3_instruction **)arr)[idx] = in; +} +static struct group_ops arr_ops_out = { arr_get, arr_insert_mov_out }; +static struct group_ops arr_ops_in = { arr_get, arr_insert_mov_in }; + +static struct ir3_instruction *instr_get(void *arr, int idx) +{ + return ssa(((struct ir3_instruction *)arr)->regs[idx+1]); +} +static void +instr_insert_mov(void *arr, int idx, struct ir3_instruction *instr) +{ + ((struct ir3_instruction *)arr)->regs[idx+1]->instr = + ir3_MOV(instr->block, instr, TYPE_F32); +} +static struct group_ops instr_ops = { instr_get, instr_insert_mov }; + +/* verify that cur != instr, but cur is also not in instr's neighbor-list: */ +static bool +in_neighbor_list(struct ir3_instruction *instr, struct ir3_instruction *cur, int pos) +{ + int idx = 0; + + if (!instr) + return false; + + if (instr == cur) + return true; + + for (instr = ir3_neighbor_first(instr); instr; instr = instr->cp.right) + if ((idx++ != pos) && (instr == cur)) + return true; + + return false; +} + +static void +group_n(struct group_ops *ops, void *arr, unsigned n) +{ + unsigned i, j; + + /* first pass, figure out what has conflicts and needs a mov + * inserted. Do this up front, before starting to setup + * left/right neighbor pointers. Trying to do it in a single + * pass could result in a situation where we can't even setup + * the mov's right neighbor ptr if the next instr also needs + * a mov. + */ +restart: + for (i = 0; i < n; i++) { + struct ir3_instruction *instr = ops->get(arr, i); + if (instr) { + struct ir3_instruction *left = (i > 0) ? ops->get(arr, i - 1) : NULL; + struct ir3_instruction *right = (i < (n-1)) ? ops->get(arr, i + 1) : NULL; + bool conflict; + + /* check for left/right neighbor conflicts: */ + conflict = conflicts(instr->cp.left, left) || + conflicts(instr->cp.right, right); + + /* Mixing array elements and higher register classes + * (ie. groups) doesn't really work out in RA. 
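(Presumably the array pins its elements to consecutive registers, which fights the group's own contiguity constraint.)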
See: + * + * https://trello.com/c/DqeDkeVf/156-bug-with-stk-70frag + */ + if (instr->regs[0]->flags & IR3_REG_ARRAY) + conflict = true; + + /* we also can't have an instr twice in the group: */ + for (j = i + 1; (j < n) && !conflict; j++) + if (in_neighbor_list(ops->get(arr, j), instr, i)) + conflict = true; + + if (conflict) { + ops->insert_mov(arr, i, instr); + /* inserting the mov may have caused a conflict + * against the previous: + */ + goto restart; + } + } + } + + /* second pass, now that we've inserted mov's, fixup left/right + * neighbors. This is guaranteed to succeed, since by definition + * the newly inserted mov's cannot conflict with anything. + */ + for (i = 0; i < n; i++) { + struct ir3_instruction *instr = ops->get(arr, i); + if (instr) { + struct ir3_instruction *left = (i > 0) ? ops->get(arr, i - 1) : NULL; + struct ir3_instruction *right = (i < (n-1)) ? ops->get(arr, i + 1) : NULL; + + debug_assert(!conflicts(instr->cp.left, left)); + if (left) { + instr->cp.left_cnt++; + instr->cp.left = left; + } + + debug_assert(!conflicts(instr->cp.right, right)); + if (right) { + instr->cp.right_cnt++; + instr->cp.right = right; + } + } + } +} + +static void +instr_find_neighbors(struct ir3_instruction *instr) +{ + struct ir3_instruction *src; + + if (ir3_instr_check_mark(instr)) + return; + + if (instr->opc == OPC_META_FI) + group_n(&instr_ops, instr, instr->regs_count - 1); + + foreach_ssa_src(src, instr) + instr_find_neighbors(src); +} + +/* a bit of sadness.. we can't have "holes" in inputs from PoV of + * register assignment, they still need to be grouped together. So + * we need to insert dummy/padding instruction for grouping, and + * then take it back out again before anyone notices. + */ +static void +pad_and_group_input(struct ir3_instruction **input, unsigned n) +{ + int i, mask = 0; + struct ir3_block *block = NULL; + + for (i = n - 1; i >= 0; i--) { + struct ir3_instruction *instr = input[i]; + if (instr) { + block = instr->block; + } else if (block) { + instr = ir3_NOP(block); + ir3_reg_create(instr, 0, IR3_REG_SSA); /* dummy dst */ + input[i] = instr; + mask |= (1 << i); + } + } + + group_n(&arr_ops_in, input, n); + + for (i = 0; i < n; i++) { + if (mask & (1 << i)) + input[i] = NULL; + } +} + +static void +find_neighbors(struct ir3 *ir) +{ + unsigned i; + + /* shader inputs/outputs themselves must be contiguous as well: + * + * NOTE: group inputs first, since we only insert mov's + * *before* the conflicted instr (and that would go badly + * for inputs). By doing inputs first, we should never + * have a conflict on inputs.. pushing any conflict to + * resolve to the outputs, for stuff like: + * + * MOV OUT[n], IN[m].wzyx + * + * NOTE: we assume here inputs/outputs are grouped in vec4. 
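+ * (hence the loops below step by 4, padding inputs out to full
+ * vec4 slots.)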
+ * This logic won't quite cut it if we don't align smaller + * on vec4 boundaries + */ + for (i = 0; i < ir->ninputs; i += 4) + pad_and_group_input(&ir->inputs[i], 4); + for (i = 0; i < ir->noutputs; i += 4) + group_n(&arr_ops_out, &ir->outputs[i], 4); + + for (i = 0; i < ir->noutputs; i++) { + if (ir->outputs[i]) { + struct ir3_instruction *instr = ir->outputs[i]; + instr_find_neighbors(instr); + } + } + + list_for_each_entry (struct ir3_block, block, &ir->block_list, node) { + for (i = 0; i < block->keeps_count; i++) { + struct ir3_instruction *instr = block->keeps[i]; + instr_find_neighbors(instr); + } + + /* We also need to account for if-condition: */ + if (block->condition) + instr_find_neighbors(block->condition); + } +} + +void +ir3_group(struct ir3 *ir) +{ + ir3_clear_mark(ir); + find_neighbors(ir); +} diff --git a/src/freedreno/ir3/ir3_legalize.c b/src/freedreno/ir3/ir3_legalize.c new file mode 100644 index 00000000000..ff4c644eab5 --- /dev/null +++ b/src/freedreno/ir3/ir3_legalize.c @@ -0,0 +1,496 @@ +/* + * Copyright (C) 2014 Rob Clark <[email protected]> + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Authors: + * Rob Clark <[email protected]> + */ + +#include "util/ralloc.h" +#include "util/u_math.h" + +#include "ir3.h" + +/* + * Legalize: + * + * We currently require that scheduling ensures that we have enough nop's + * in all the right places. The legalize step mostly handles fixing up + * instruction flags ((ss)/(sy)/(ei)), and collapses sequences of nop's + * into fewer nop's w/ rpt flag. + */ + +struct ir3_legalize_ctx { + int num_samp; + bool has_ssbo; + int max_bary; +}; + +struct ir3_legalize_state { + regmask_t needs_ss; + regmask_t needs_ss_war; /* write after read */ + regmask_t needs_sy; +}; + +struct ir3_legalize_block_data { + bool valid; + struct ir3_legalize_state state; +}; + +/* We want to evaluate each block from the position of any other + * predecessor block, in order that the flags set are the union of + * all possible program paths. + * + * To do this, we need to know the output state (needs_ss/ss_war/sy) + * of all predecessor blocks. The tricky thing is loops, which mean + * that we can't simply recursively process each predecessor block + * before legalizing the current block. + * + * How we handle that is by looping over all the blocks until the + * results converge. 
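(Each block's per-block data carries a 'valid' flag, so a block is only re-processed after a predecessor's output state changes.)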
If the output state of a given block changes + * in a given pass, this means that all successor blocks are not + * yet fully legalized. + */ + +static bool +legalize_block(struct ir3_legalize_ctx *ctx, struct ir3_block *block) +{ + struct ir3_legalize_block_data *bd = block->data; + + if (bd->valid) + return false; + + struct ir3_instruction *last_input = NULL; + struct ir3_instruction *last_rel = NULL; + struct ir3_instruction *last_n = NULL; + struct list_head instr_list; + struct ir3_legalize_state prev_state = bd->state; + struct ir3_legalize_state *state = &bd->state; + + /* our input state is the OR of all predecessor blocks' state: */ + for (unsigned i = 0; i < block->predecessors_count; i++) { + struct ir3_legalize_block_data *pbd = block->predecessors[i]->data; + struct ir3_legalize_state *pstate = &pbd->state; + + /* Our input (ss)/(sy) state is based on OR'ing the output + * state of all our predecessor blocks + */ + regmask_or(&state->needs_ss, + &state->needs_ss, &pstate->needs_ss); + regmask_or(&state->needs_ss_war, + &state->needs_ss_war, &pstate->needs_ss_war); + regmask_or(&state->needs_sy, + &state->needs_sy, &pstate->needs_sy); + } + + /* remove all the instructions from the list, we'll be adding + * them back in as we go + */ + list_replace(&block->instr_list, &instr_list); + list_inithead(&block->instr_list); + + list_for_each_entry_safe (struct ir3_instruction, n, &instr_list, node) { + struct ir3_register *reg; + unsigned i; + + n->flags &= ~(IR3_INSTR_SS | IR3_INSTR_SY); + + if (is_meta(n)) + continue; + + if (is_input(n)) { + struct ir3_register *inloc = n->regs[1]; + assert(inloc->flags & IR3_REG_IMMED); + ctx->max_bary = MAX2(ctx->max_bary, inloc->iim_val); + } + + if (last_n && is_barrier(last_n)) + n->flags |= IR3_INSTR_SS | IR3_INSTR_SY; + + /* NOTE: consider dst register too.. it could happen that + * texture sample instruction (for example) writes some + * components which are unused. A subsequent instruction + * that writes the same register can race w/ the sam instr + * resulting in undefined results: + */ + for (i = 0; i < n->regs_count; i++) { + reg = n->regs[i]; + + if (reg_gpr(reg)) { + + /* TODO: we probably only need (ss) for alu + * instr consuming sfu result.. need to make + * some tests for both this and (sy).. + */ + if (regmask_get(&state->needs_ss, reg)) { + n->flags |= IR3_INSTR_SS; + regmask_init(&state->needs_ss_war); + regmask_init(&state->needs_ss); + } + + if (regmask_get(&state->needs_sy, reg)) { + n->flags |= IR3_INSTR_SY; + regmask_init(&state->needs_sy); + } + } + + /* TODO: is it valid to have address reg loaded from a + * relative src (ie. mova a0, c<a0.x+4>)? If so, the + * last_rel check below should be moved ahead of this: + */ + if (reg->flags & IR3_REG_RELATIV) + last_rel = n; + } + + if (n->regs_count > 0) { + reg = n->regs[0]; + if (regmask_get(&state->needs_ss_war, reg)) { + n->flags |= IR3_INSTR_SS; + regmask_init(&state->needs_ss_war); + regmask_init(&state->needs_ss); + } + + if (last_rel && (reg->num == regid(REG_A0, 0))) { + last_rel->flags |= IR3_INSTR_UL; + last_rel = NULL; + } + } + + /* cat5+ does not have an (ss) bit, if needed we need to + * insert a nop to carry the sync flag. 
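For example, an "(ss)sam" ends up emitted as an "(ss)nop" followed by a plain "sam".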
Would be kinda + * clever if we were aware of this during scheduling, but + * this should be a pretty rare case: + */ + if ((n->flags & IR3_INSTR_SS) && (opc_cat(n->opc) >= 5)) { + struct ir3_instruction *nop; + nop = ir3_NOP(block); + nop->flags |= IR3_INSTR_SS; + n->flags &= ~IR3_INSTR_SS; + } + + /* need to be able to set (ss) on first instruction: */ + if (list_empty(&block->instr_list) && (opc_cat(n->opc) >= 5)) + ir3_NOP(block); + + if (is_nop(n) && !list_empty(&block->instr_list)) { + struct ir3_instruction *last = list_last_entry(&block->instr_list, + struct ir3_instruction, node); + if (is_nop(last) && (last->repeat < 5)) { + last->repeat++; + last->flags |= n->flags; + continue; + } + } + + list_addtail(&n->node, &block->instr_list); + + if (is_sfu(n)) + regmask_set(&state->needs_ss, n->regs[0]); + + if (is_tex(n)) { + /* this ends up being the # of samp instructions.. but that + * is ok, everything else only cares whether it is zero or + * not. We do this here, rather than when we encounter a + * SAMP decl, because (especially in binning pass shader) + * the samp instruction(s) could get eliminated if the + * result is not used. + */ + ctx->num_samp = MAX2(ctx->num_samp, n->cat5.samp + 1); + regmask_set(&state->needs_sy, n->regs[0]); + } else if (n->opc == OPC_RESINFO) { + regmask_set(&state->needs_ss, n->regs[0]); + ir3_NOP(block)->flags |= IR3_INSTR_SS; + } else if (is_load(n)) { + /* seems like ldlv needs (ss) bit instead?? which is odd but + * makes a bunch of flat-varying tests start working on a4xx. + */ + if ((n->opc == OPC_LDLV) || (n->opc == OPC_LDL)) + regmask_set(&state->needs_ss, n->regs[0]); + else + regmask_set(&state->needs_sy, n->regs[0]); + } else if (is_atomic(n->opc)) { + if (n->flags & IR3_INSTR_G) + regmask_set(&state->needs_sy, n->regs[0]); + else + regmask_set(&state->needs_ss, n->regs[0]); + } + + if (is_ssbo(n->opc) || (is_atomic(n->opc) && (n->flags & IR3_INSTR_G))) + ctx->has_ssbo = true; + + /* both tex/sfu appear to not always immediately consume + * their src register(s): + */ + if (is_tex(n) || is_sfu(n) || is_mem(n)) { + foreach_src(reg, n) { + if (reg_gpr(reg)) + regmask_set(&state->needs_ss_war, reg); + } + } + + if (is_input(n)) + last_input = n; + + last_n = n; + } + + if (last_input) { + /* special hack.. if using ldlv to bypass interpolation, + * we need to insert a dummy bary.f on which we can set + * the (ei) flag: + */ + if (is_mem(last_input) && (last_input->opc == OPC_LDLV)) { + struct ir3_instruction *baryf; + + /* (ss)bary.f (ei)r63.x, 0, r0.x */ + baryf = ir3_instr_create(block, OPC_BARY_F); + baryf->flags |= IR3_INSTR_SS; + ir3_reg_create(baryf, regid(63, 0), 0); + ir3_reg_create(baryf, 0, IR3_REG_IMMED)->iim_val = 0; + ir3_reg_create(baryf, regid(0, 0), 0); + + /* insert the dummy bary.f after last_input: */ + list_delinit(&baryf->node); + list_add(&baryf->node, &last_input->node); + + last_input = baryf; + } + last_input->regs[0]->flags |= IR3_REG_EI; + } + + if (last_rel) + last_rel->flags |= IR3_INSTR_UL; + + bd->valid = true; + + if (memcmp(&prev_state, state, sizeof(*state))) { + /* our output state changed, this invalidates all of our + * successors: + */ + for (unsigned i = 0; i < ARRAY_SIZE(block->successors); i++) { + if (!block->successors[i]) + break; + struct ir3_legalize_block_data *pbd = block->successors[i]->data; + pbd->valid = false; + } + } + + return true; +} + +/* NOTE: branch instructions are always the last instruction(s) + * in the block. 
We take advantage of this as we resolve the + * branches, since "if (foo) break;" constructs turn into + * something like: + * + * block3 { + * ... + * 0029:021: mov.s32s32 r62.x, r1.y + * 0082:022: br !p0.x, target=block5 + * 0083:023: br p0.x, target=block4 + * // succs: if _[0029:021: mov.s32s32] block4; else block5; + * } + * block4 { + * 0084:024: jump, target=block6 + * // succs: block6; + * } + * block5 { + * 0085:025: jump, target=block7 + * // succs: block7; + * } + * + * ie. only instruction in block4/block5 is a jump, so when + * resolving branches we can easily detect this by checking + * that the first instruction in the target block is itself + * a jump, and setup the br directly to the jump's target + * (and strip back out the now unreached jump) + * + * TODO sometimes we end up with things like: + * + * br !p0.x, #2 + * br p0.x, #12 + * add.u r0.y, r0.y, 1 + * + * If we swapped the order of the branches, we could drop one. + */ +static struct ir3_block * +resolve_dest_block(struct ir3_block *block) +{ + /* special case for last block: */ + if (!block->successors[0]) + return block; + + /* NOTE that we may or may not have inserted the jump + * in the target block yet, so conditions to resolve + * the dest to the dest block's successor are: + * + * (1) successor[1] == NULL && + * (2) (block-is-empty || only-instr-is-jump) + */ + if (block->successors[1] == NULL) { + if (list_empty(&block->instr_list)) { + return block->successors[0]; + } else if (list_length(&block->instr_list) == 1) { + struct ir3_instruction *instr = list_first_entry( + &block->instr_list, struct ir3_instruction, node); + if (instr->opc == OPC_JUMP) + return block->successors[0]; + } + } + return block; +} + +static bool +resolve_jump(struct ir3_instruction *instr) +{ + struct ir3_block *tblock = + resolve_dest_block(instr->cat0.target); + struct ir3_instruction *target; + + if (tblock != instr->cat0.target) { + list_delinit(&instr->cat0.target->node); + instr->cat0.target = tblock; + return true; + } + + target = list_first_entry(&tblock->instr_list, + struct ir3_instruction, node); + + /* TODO maybe a less fragile way to do this. But we are expecting + * a pattern from sched_block() that looks like: + * + * br !p0.x, #else-block + * br p0.x, #if-block + * + * if the first branch target is +2, or if 2nd branch target is +1 + * then we can just drop the jump. + */ + unsigned next_block; + if (instr->cat0.inv == true) + next_block = 2; + else + next_block = 1; + + if ((!target) || (target->ip == (instr->ip + next_block))) { + list_delinit(&instr->node); + return true; + } else { + instr->cat0.immed = + (int)target->ip - (int)instr->ip; + } + return false; +} + +/* resolve jumps, removing jumps/branches to immediately following + * instruction which we end up with from earlier stages. Since + * removing an instruction can invalidate earlier instruction's + * branch offsets, we need to do this iteratively until no more + * branches are removed. + */ +static bool +resolve_jumps(struct ir3 *ir) +{ + list_for_each_entry (struct ir3_block, block, &ir->block_list, node) + list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) + if (is_flow(instr) && instr->cat0.target) + if (resolve_jump(instr)) + return true; + + return false; +} + +/* we want to mark points where divergent flow control re-converges + * with (jp) flags. For now, since we don't do any optimization for + * things that start out as a 'do {} while()', re-convergence points + * will always be a branch or jump target. 
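(Any path that diverged did so at a branch, and can only rejoin the common flow by branching or jumping back into it.)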
Note that this is overly + * conservative, since unconditional jump targets are not convergence + * points, we are just assuming that the other path to reach the jump + * target was divergent. If we were clever enough to optimize the + * jump at end of a loop back to a conditional branch into a single + * conditional branch, ie. like: + * + * add.f r1.w, r0.x, (neg)(r)c2.x <= loop start + * mul.f r1.z, r1.z, r0.x + * mul.f r1.y, r1.y, r0.x + * mul.f r0.z, r1.x, r0.x + * mul.f r0.w, r0.y, r0.x + * cmps.f.ge r0.x, (r)c2.y, (r)r1.w + * add.s r0.x, (r)r0.x, (r)-1 + * sel.f32 r0.x, (r)c3.y, (r)r0.x, c3.x + * cmps.f.eq p0.x, r0.x, c3.y + * mov.f32f32 r0.x, r1.w + * mov.f32f32 r0.y, r0.w + * mov.f32f32 r1.x, r0.z + * (rpt2)nop + * br !p0.x, #-13 + * (jp)mul.f r0.x, c263.y, r1.y + * + * Then we'd have to be more clever, as the convergence point is no + * longer a branch or jump target. + */ +static void +mark_convergence_points(struct ir3 *ir) +{ + list_for_each_entry (struct ir3_block, block, &ir->block_list, node) { + list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) { + if (is_flow(instr) && instr->cat0.target) { + struct ir3_instruction *target = + list_first_entry(&instr->cat0.target->instr_list, + struct ir3_instruction, node); + target->flags |= IR3_INSTR_JP; + } + } + } +} + +void +ir3_legalize(struct ir3 *ir, int *num_samp, bool *has_ssbo, int *max_bary) +{ + struct ir3_legalize_ctx *ctx = rzalloc(ir, struct ir3_legalize_ctx); + bool progress; + + ctx->max_bary = -1; + + /* allocate per-block data: */ + list_for_each_entry (struct ir3_block, block, &ir->block_list, node) { + block->data = rzalloc(ctx, struct ir3_legalize_block_data); + } + + /* process each block: */ + do { + progress = false; + list_for_each_entry (struct ir3_block, block, &ir->block_list, node) { + progress |= legalize_block(ctx, block); + } + } while (progress); + + *num_samp = ctx->num_samp; + *has_ssbo = ctx->has_ssbo; + *max_bary = ctx->max_bary; + + do { + ir3_count_instructions(ir); + } while(resolve_jumps(ir)); + + mark_convergence_points(ir); + + ralloc_free(ctx); +} diff --git a/src/freedreno/ir3/ir3_nir.c b/src/freedreno/ir3/ir3_nir.c new file mode 100644 index 00000000000..70c01ee0593 --- /dev/null +++ b/src/freedreno/ir3/ir3_nir.c @@ -0,0 +1,263 @@ +/* + * Copyright (C) 2015 Rob Clark <[email protected]> + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + * Authors: + * Rob Clark <[email protected]> + */ + + +#include "util/debug.h" + +#include "ir3_nir.h" +#include "ir3_compiler.h" +#include "ir3_shader.h" + +static const nir_shader_compiler_options options = { + .lower_fpow = true, + .lower_scmp = true, + .lower_flrp32 = true, + .lower_flrp64 = true, + .lower_ffract = true, + .lower_fmod32 = true, + .lower_fmod64 = true, + .lower_fdiv = true, + .lower_ldexp = true, + .fuse_ffma = true, + .native_integers = true, + .vertex_id_zero_based = true, + .lower_extract_byte = true, + .lower_extract_word = true, + .lower_all_io_to_temps = true, + .lower_helper_invocation = true, +}; + +const nir_shader_compiler_options * +ir3_get_compiler_options(struct ir3_compiler *compiler) +{ + return &options; +} + +/* for given shader key, are any steps handled in nir? */ +bool +ir3_key_lowers_nir(const struct ir3_shader_key *key) +{ + return key->fsaturate_s | key->fsaturate_t | key->fsaturate_r | + key->vsaturate_s | key->vsaturate_t | key->vsaturate_r | + key->ucp_enables | key->color_two_side | + key->fclamp_color | key->vclamp_color; +} + +#define OPT(nir, pass, ...) ({ \ + bool this_progress = false; \ + NIR_PASS(this_progress, nir, pass, ##__VA_ARGS__); \ + this_progress; \ +}) + +#define OPT_V(nir, pass, ...) NIR_PASS_V(nir, pass, ##__VA_ARGS__) + +static void +ir3_optimize_loop(nir_shader *s) +{ + bool progress; + do { + progress = false; + + OPT_V(s, nir_lower_vars_to_ssa); + progress |= OPT(s, nir_opt_copy_prop_vars); + progress |= OPT(s, nir_opt_dead_write_vars); + progress |= OPT(s, nir_lower_alu_to_scalar); + progress |= OPT(s, nir_lower_phis_to_scalar); + + progress |= OPT(s, nir_copy_prop); + progress |= OPT(s, nir_opt_dce); + progress |= OPT(s, nir_opt_cse); + static int gcm = -1; + if (gcm == -1) + gcm = env_var_as_unsigned("GCM", 0); + if (gcm == 1) + progress |= OPT(s, nir_opt_gcm, true); + else if (gcm == 2) + progress |= OPT(s, nir_opt_gcm, false); + progress |= OPT(s, nir_opt_peephole_select, 16); + progress |= OPT(s, nir_opt_intrinsics); + progress |= OPT(s, nir_opt_algebraic); + progress |= OPT(s, nir_opt_constant_folding); + progress |= OPT(s, nir_opt_dead_cf); + if (OPT(s, nir_opt_trivial_continues)) { + progress |= true; + /* If nir_opt_trivial_continues makes progress, then we need to clean + * things up if we want any hope of nir_opt_if or nir_opt_loop_unroll + * to make progress. 
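+ * (note that the copy-prop and DCE just below don't feed into
+ * 'progress', presumably to avoid re-looping purely on their own
+ * cleanups.)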
+ */ + OPT(s, nir_copy_prop); + OPT(s, nir_opt_dce); + } + progress |= OPT(s, nir_opt_if); + progress |= OPT(s, nir_opt_remove_phis); + progress |= OPT(s, nir_opt_undef); + + } while (progress); +} + +struct nir_shader * +ir3_optimize_nir(struct ir3_shader *shader, nir_shader *s, + const struct ir3_shader_key *key) +{ + struct nir_lower_tex_options tex_options = { + .lower_rect = 0, + }; + + if (key) { + switch (shader->type) { + case MESA_SHADER_FRAGMENT: + tex_options.saturate_s = key->fsaturate_s; + tex_options.saturate_t = key->fsaturate_t; + tex_options.saturate_r = key->fsaturate_r; + break; + case MESA_SHADER_VERTEX: + tex_options.saturate_s = key->vsaturate_s; + tex_options.saturate_t = key->vsaturate_t; + tex_options.saturate_r = key->vsaturate_r; + break; + default: + /* TODO */ + break; + } + } + + if (shader->compiler->gpu_id >= 400) { + /* a4xx seems to have *no* sam.p */ + tex_options.lower_txp = ~0; /* lower all txp */ + } else { + /* a3xx just needs to avoid sam.p for 3d tex */ + tex_options.lower_txp = (1 << GLSL_SAMPLER_DIM_3D); + } + + if (ir3_shader_debug & IR3_DBG_DISASM) { + debug_printf("----------------------\n"); + nir_print_shader(s, stdout); + debug_printf("----------------------\n"); + } + + OPT_V(s, nir_opt_global_to_local); + OPT_V(s, nir_lower_regs_to_ssa); + + if (key) { + if (s->info.stage == MESA_SHADER_VERTEX) { + OPT_V(s, nir_lower_clip_vs, key->ucp_enables, false); + if (key->vclamp_color) + OPT_V(s, nir_lower_clamp_color_outputs); + } else if (s->info.stage == MESA_SHADER_FRAGMENT) { + OPT_V(s, nir_lower_clip_fs, key->ucp_enables); + if (key->fclamp_color) + OPT_V(s, nir_lower_clamp_color_outputs); + } + if (key->color_two_side) { + OPT_V(s, nir_lower_two_sided_color); + } + } else { + /* only want to do this the first time (when key is null) + * and not again on any potential 2nd variant lowering pass: + */ + OPT_V(s, ir3_nir_apply_trig_workarounds); + } + + OPT_V(s, nir_lower_tex, &tex_options); + OPT_V(s, nir_lower_load_const_to_scalar); + if (shader->compiler->gpu_id < 500) + OPT_V(s, ir3_nir_lower_tg4_to_tex); + + ir3_optimize_loop(s); + + /* do idiv lowering after first opt loop to give a chance for + * divide by immed power-of-two to be caught first: + */ + if (OPT(s, nir_lower_idiv)) + ir3_optimize_loop(s); + + OPT_V(s, nir_remove_dead_variables, nir_var_local); + + OPT_V(s, nir_move_load_const); + + if (ir3_shader_debug & IR3_DBG_DISASM) { + debug_printf("----------------------\n"); + nir_print_shader(s, stdout); + debug_printf("----------------------\n"); + } + + nir_sweep(s); + + return s; +} + +void +ir3_nir_scan_driver_consts(nir_shader *shader, + struct ir3_driver_const_layout *layout) +{ + nir_foreach_function(function, shader) { + if (!function->impl) + continue; + + nir_foreach_block(block, function->impl) { + nir_foreach_instr(instr, block) { + if (instr->type != nir_instr_type_intrinsic) + continue; + + nir_intrinsic_instr *intr = + nir_instr_as_intrinsic(instr); + unsigned idx; + + switch (intr->intrinsic) { + case nir_intrinsic_get_buffer_size: + idx = nir_src_as_const_value(intr->src[0])->u32[0]; + if (layout->ssbo_size.mask & (1 << idx)) + break; + layout->ssbo_size.mask |= (1 << idx); + layout->ssbo_size.off[idx] = + layout->ssbo_size.count; + layout->ssbo_size.count += 1; /* one const per */ + break; + case nir_intrinsic_image_deref_atomic_add: + case nir_intrinsic_image_deref_atomic_min: + case nir_intrinsic_image_deref_atomic_max: + case nir_intrinsic_image_deref_atomic_and: + case nir_intrinsic_image_deref_atomic_or: + case 
nir_intrinsic_image_deref_atomic_xor: + case nir_intrinsic_image_deref_atomic_exchange: + case nir_intrinsic_image_deref_atomic_comp_swap: + case nir_intrinsic_image_deref_store: + case nir_intrinsic_image_deref_size: + idx = nir_intrinsic_get_var(intr, 0)->data.driver_location; + if (layout->image_dims.mask & (1 << idx)) + break; + layout->image_dims.mask |= (1 << idx); + layout->image_dims.off[idx] = + layout->image_dims.count; + layout->image_dims.count += 3; /* three const per */ + break; + default: + break; + } + } + } + } +} diff --git a/src/freedreno/ir3/ir3_nir.h b/src/freedreno/ir3/ir3_nir.h new file mode 100644 index 00000000000..74201d34160 --- /dev/null +++ b/src/freedreno/ir3/ir3_nir.h @@ -0,0 +1,45 @@ +/* + * Copyright (C) 2015 Rob Clark <[email protected]> + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Authors: + * Rob Clark <[email protected]> + */ + +#ifndef IR3_NIR_H_ +#define IR3_NIR_H_ + +#include "compiler/nir/nir.h" +#include "compiler/shader_enums.h" + +#include "ir3_shader.h" + +void ir3_nir_scan_driver_consts(nir_shader *shader, struct ir3_driver_const_layout *layout); + +bool ir3_nir_apply_trig_workarounds(nir_shader *shader); +bool ir3_nir_lower_tg4_to_tex(nir_shader *shader); + +const nir_shader_compiler_options * ir3_get_compiler_options(struct ir3_compiler *compiler); +bool ir3_key_lowers_nir(const struct ir3_shader_key *key); +struct nir_shader * ir3_optimize_nir(struct ir3_shader *shader, nir_shader *s, + const struct ir3_shader_key *key); + +#endif /* IR3_NIR_H_ */ diff --git a/src/freedreno/ir3/ir3_nir_lower_tg4_to_tex.c b/src/freedreno/ir3/ir3_nir_lower_tg4_to_tex.c new file mode 100644 index 00000000000..37a3dcb26f8 --- /dev/null +++ b/src/freedreno/ir3/ir3_nir_lower_tg4_to_tex.c @@ -0,0 +1,138 @@ +/* + * Copyright © 2017 Ilia Mirkin + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include "ir3_nir.h" +#include "compiler/nir/nir_builder.h" + +/* A4XX has a broken GATHER4 operation. It performs the texture swizzle on the + * gather results, rather than before. As a result, it must be emulated with + * direct texture calls. + */ + +static bool +lower_tg4(nir_block *block, nir_builder *b, void *mem_ctx) +{ + bool progress = false; + + static const int offsets[3][2] = { {0, 1}, {1, 1}, {1, 0} }; + + nir_foreach_instr_safe(instr, block) { + if (instr->type != nir_instr_type_tex) + continue; + + nir_tex_instr *tg4 = (nir_tex_instr *)instr; + + if (tg4->op != nir_texop_tg4) + continue; + + b->cursor = nir_before_instr(&tg4->instr); + + nir_ssa_def *results[4]; + int offset_index = nir_tex_instr_src_index(tg4, nir_tex_src_offset); + for (int i = 0; i < 4; i++) { + int num_srcs = tg4->num_srcs + 1 /* lod */; + if (offset_index < 0 && i < 3) + num_srcs++; + + nir_tex_instr *tex = nir_tex_instr_create(b->shader, num_srcs); + tex->op = nir_texop_txl; + tex->sampler_dim = tg4->sampler_dim; + tex->coord_components = tg4->coord_components; + tex->is_array = tg4->is_array; + tex->is_shadow = tg4->is_shadow; + tex->is_new_style_shadow = tg4->is_new_style_shadow; + tex->texture_index = tg4->texture_index; + tex->sampler_index = tg4->sampler_index; + tex->dest_type = tg4->dest_type; + + for (int j = 0; j < tg4->num_srcs; j++) { + nir_src_copy(&tex->src[j].src, &tg4->src[j].src, tex); + tex->src[j].src_type = tg4->src[j].src_type; + } + if (i != 3) { + nir_ssa_def *offset = + nir_vec2(b, nir_imm_int(b, offsets[i][0]), + nir_imm_int(b, offsets[i][1])); + if (offset_index < 0) { + tex->src[tg4->num_srcs].src = nir_src_for_ssa(offset); + tex->src[tg4->num_srcs].src_type = nir_tex_src_offset; + } else { + assert(nir_tex_instr_src_size(tex, offset_index) == 2); + nir_ssa_def *orig = nir_ssa_for_src( + b, tex->src[offset_index].src, 2); + tex->src[offset_index].src = + nir_src_for_ssa(nir_iadd(b, orig, offset)); + } + } + tex->src[num_srcs - 1].src = nir_src_for_ssa(nir_imm_float(b, 0)); + tex->src[num_srcs - 1].src_type = nir_tex_src_lod; + + nir_ssa_dest_init(&tex->instr, &tex->dest, + nir_tex_instr_dest_size(tex), 32, NULL); + nir_builder_instr_insert(b, &tex->instr); + + results[i] = nir_channel(b, &tex->dest.ssa, tg4->component); + } + + nir_ssa_def *result = nir_vec4(b, results[0], results[1], results[2], results[3]); + nir_ssa_def_rewrite_uses(&tg4->dest.ssa, nir_src_for_ssa(result)); + + nir_instr_remove(&tg4->instr); + + progress = true; + } + + return progress; +} + +static bool +lower_tg4_func(nir_function_impl *impl) +{ + void *mem_ctx = ralloc_parent(impl); + nir_builder b; + nir_builder_init(&b, impl); + + bool progress = false; + nir_foreach_block_safe(block, impl) { + progress |= lower_tg4(block, &b, mem_ctx); + } + + if (progress) + nir_metadata_preserve(impl, nir_metadata_block_index | + nir_metadata_dominance); + + return progress; +} + +bool +ir3_nir_lower_tg4_to_tex(nir_shader *shader) +{ + bool progress = false; + + nir_foreach_function(function, shader) { + if (function->impl) 
+ progress |= lower_tg4_func(function->impl); + } + + return progress; +} diff --git a/src/freedreno/ir3/ir3_nir_trig.py b/src/freedreno/ir3/ir3_nir_trig.py new file mode 100644 index 00000000000..3968aea543c --- /dev/null +++ b/src/freedreno/ir3/ir3_nir_trig.py @@ -0,0 +1,51 @@ +# +# Copyright (C) 2016 Intel Corporation +# +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice (including the next +# paragraph) shall be included in all copies or substantial portions of the +# Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +# IN THE SOFTWARE. + +from __future__ import print_function + +import argparse +import sys + +trig_workarounds = [ + (('fsin', 'x'), ('fsin', ('fsub', ('fmul', 6.283185, ('ffract', ('fadd', ('fmul', 0.159155, 'x'), 0.5))), 3.141593))), + (('fcos', 'x'), ('fcos', ('fsub', ('fmul', 6.283185, ('ffract', ('fadd', ('fmul', 0.159155, 'x'), 0.5))), 3.141593))), +] + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument('-p', '--import-path', required=True) + args = parser.parse_args() + sys.path.insert(0, args.import_path) + run() + + +def run(): + import nir_algebraic # pylint: disable=import-error + + print('#include "ir3_nir.h"') + print(nir_algebraic.AlgebraicPass("ir3_nir_apply_trig_workarounds", + trig_workarounds).render()) + + +if __name__ == '__main__': + main() diff --git a/src/freedreno/ir3/ir3_print.c b/src/freedreno/ir3/ir3_print.c new file mode 100644 index 00000000000..b6ef6e4b5a7 --- /dev/null +++ b/src/freedreno/ir3/ir3_print.c @@ -0,0 +1,264 @@ +/* + * Copyright (C) 2014 Rob Clark <[email protected]> + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Authors: + * Rob Clark <[email protected]> + */ + +#include <stdarg.h> +#include <stdio.h> + +#include "ir3.h" + +#define PTRID(x) ((unsigned long)(x)) + +static void print_instr_name(struct ir3_instruction *instr) +{ + if (!instr) + return; +#ifdef DEBUG + printf("%04u:", instr->serialno); +#endif + printf("%04u:", instr->name); + printf("%04u:", instr->ip); + printf("%03u: ", instr->depth); + + if (instr->flags & IR3_INSTR_SY) + printf("(sy)"); + if (instr->flags & IR3_INSTR_SS) + printf("(ss)"); + + if (is_meta(instr)) { + switch (instr->opc) { + case OPC_META_INPUT: printf("_meta:in"); break; + case OPC_META_FO: printf("_meta:fo"); break; + case OPC_META_FI: printf("_meta:fi"); break; + + /* shouldn't hit here.. just for debugging: */ + default: printf("_meta:%d", instr->opc); break; + } + } else if (instr->opc == OPC_MOV) { + static const char *type[] = { + [TYPE_F16] = "f16", + [TYPE_F32] = "f32", + [TYPE_U16] = "u16", + [TYPE_U32] = "u32", + [TYPE_S16] = "s16", + [TYPE_S32] = "s32", + [TYPE_U8] = "u8", + [TYPE_S8] = "s8", + }; + if (instr->cat1.src_type == instr->cat1.dst_type) + printf("mov"); + else + printf("cov"); + printf(".%s%s", type[instr->cat1.src_type], type[instr->cat1.dst_type]); + } else { + printf("%s", ir3_instr_name(instr)); + if (instr->flags & IR3_INSTR_3D) + printf(".3d"); + if (instr->flags & IR3_INSTR_A) + printf(".a"); + if (instr->flags & IR3_INSTR_O) + printf(".o"); + if (instr->flags & IR3_INSTR_P) + printf(".p"); + if (instr->flags & IR3_INSTR_S) + printf(".s"); + if (instr->flags & IR3_INSTR_S2EN) + printf(".s2en"); + } +} + +static void print_reg_name(struct ir3_register *reg) +{ + if ((reg->flags & (IR3_REG_FABS | IR3_REG_SABS)) && + (reg->flags & (IR3_REG_FNEG | IR3_REG_SNEG | IR3_REG_BNOT))) + printf("(absneg)"); + else if (reg->flags & (IR3_REG_FNEG | IR3_REG_SNEG | IR3_REG_BNOT)) + printf("(neg)"); + else if (reg->flags & (IR3_REG_FABS | IR3_REG_SABS)) + printf("(abs)"); + + if (reg->flags & IR3_REG_IMMED) { + printf("imm[%f,%d,0x%x]", reg->fim_val, reg->iim_val, reg->iim_val); + } else if (reg->flags & IR3_REG_ARRAY) { + printf("arr[id=%u, offset=%d, size=%u", reg->array.id, + reg->array.offset, reg->size); + /* for ARRAY we could have null src, for example first write + * instruction.. 
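+ * (in that case we just print the array id/offset/size below
+ * without a defining instruction)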
+ */ + if (reg->instr) { + printf(", _["); + print_instr_name(reg->instr); + printf("]"); + } + printf("]"); + } else if (reg->flags & IR3_REG_SSA) { + printf("_["); + print_instr_name(reg->instr); + printf("]"); + } else if (reg->flags & IR3_REG_RELATIV) { + if (reg->flags & IR3_REG_HALF) + printf("h"); + if (reg->flags & IR3_REG_CONST) + printf("c<a0.x + %d>", reg->array.offset); + else + printf("\x1b[0;31mr<a0.x + %d>\x1b[0m (%u)", reg->array.offset, reg->size); + } else { + if (reg->flags & IR3_REG_HALF) + printf("h"); + if (reg->flags & IR3_REG_CONST) + printf("c%u.%c", reg_num(reg), "xyzw"[reg_comp(reg)]); + else + printf("\x1b[0;31mr%u.%c\x1b[0m", reg_num(reg), "xyzw"[reg_comp(reg)]); + } +} + +static void +tab(int lvl) +{ + for (int i = 0; i < lvl; i++) + printf("\t"); +} + +static void +print_instr(struct ir3_instruction *instr, int lvl) +{ + unsigned i; + + tab(lvl); + + print_instr_name(instr); + for (i = 0; i < instr->regs_count; i++) { + struct ir3_register *reg = instr->regs[i]; + printf(i ? ", " : " "); + print_reg_name(reg); + } + + if (instr->address) { + printf(", address=_"); + printf("["); + print_instr_name(instr->address); + printf("]"); + } + + if (instr->cp.left) { + printf(", left=_"); + printf("["); + print_instr_name(instr->cp.left); + printf("]"); + } + + if (instr->cp.right) { + printf(", right=_"); + printf("["); + print_instr_name(instr->cp.right); + printf("]"); + } + + if (instr->opc == OPC_META_FO) { + printf(", off=%d", instr->fo.off); + } + + if (is_flow(instr) && instr->cat0.target) { + /* the predicate register src is implied: */ + if (instr->opc == OPC_BR) { + printf(" %sp0.x", instr->cat0.inv ? "!" : ""); + } + printf(", target=block%u", block_id(instr->cat0.target)); + } + + if (instr->deps_count) { + printf(", false-deps:"); + for (unsigned i = 0; i < instr->deps_count; i++) { + if (i > 0) + printf(", "); + printf("_["); + print_instr_name(instr->deps[i]); + printf("]"); + } + } + + printf("\n"); +} + +void ir3_print_instr(struct ir3_instruction *instr) +{ + print_instr(instr, 0); +} + +static void +print_block(struct ir3_block *block, int lvl) +{ + tab(lvl); printf("block%u {\n", block_id(block)); + + if (block->predecessors_count > 0) { + tab(lvl+1); + printf("pred: "); + for (unsigned i = 0; i < block->predecessors_count; i++) { + if (i) + printf(", "); + printf("block%u", block_id(block->predecessors[i])); + } + printf("\n"); + } + + list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) { + print_instr(instr, lvl+1); + } + + tab(lvl+1); printf("/* keeps:\n"); + for (unsigned i = 0; i < block->keeps_count; i++) { + print_instr(block->keeps[i], lvl+2); + } + tab(lvl+1); printf(" */\n"); + + if (block->successors[1]) { + /* leading into if/else: */ + tab(lvl+1); + printf("/* succs: if _["); + print_instr_name(block->condition); + printf("] block%u; else block%u; */\n", + block_id(block->successors[0]), + block_id(block->successors[1])); + } else if (block->successors[0]) { + tab(lvl+1); + printf("/* succs: block%u; */\n", + block_id(block->successors[0])); + } + tab(lvl); printf("}\n"); +} + +void +ir3_print(struct ir3 *ir) +{ + list_for_each_entry (struct ir3_block, block, &ir->block_list, node) + print_block(block, 0); + + for (unsigned i = 0; i < ir->noutputs; i++) { + if (!ir->outputs[i]) + continue; + printf("out%d: ", i); + print_instr(ir->outputs[i], 0); + } +} diff --git a/src/freedreno/ir3/ir3_ra.c b/src/freedreno/ir3/ir3_ra.c new file mode 100644 index 00000000000..ad09c4018d3 --- /dev/null +++ 
b/src/freedreno/ir3/ir3_ra.c @@ -0,0 +1,1124 @@ +/* + * Copyright (C) 2014 Rob Clark <[email protected]> + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Authors: + * Rob Clark <[email protected]> + */ + +#include "util/u_math.h" +#include "util/register_allocate.h" +#include "util/ralloc.h" +#include "util/bitset.h" + +#include "ir3.h" +#include "ir3_compiler.h" + +/* + * Register Assignment: + * + * Uses the register_allocate util, which implements graph coloring + * algo with interference classes. To handle the cases where we need + * consecutive registers (for example, texture sample instructions), + * we model these as larger (double/quad/etc) registers which conflict + * with the corresponding registers in other classes. + * + * Additionally we create additional classes for half-regs, which + * do not conflict with the full-reg classes. We do need at least + * sizes 1-4 (to deal w/ texture sample instructions output to half- + * reg). At the moment we don't create the higher order half-reg + * classes as half-reg frequently does not have enough precision + * for texture coords at higher resolutions. + * + * There are some additional cases that we need to handle specially, + * as the graph coloring algo doesn't understand "partial writes". + * For example, a sequence like: + * + * add r0.z, ... + * sam (f32)(xy)r0.x, ... + * ... + * sam (f32)(xyzw)r0.w, r0.x, ... ; 3d texture, so r0.xyz are coord + * + * In this scenario, we treat r0.xyz as class size 3, which is written + * (from a use/def perspective) at the 'add' instruction and ignore the + * subsequent partial writes to r0.xy. So the 'add r0.z, ...' is the + * defining instruction, as it is the first to partially write r0.xyz. + * + * Note i965 has a similar scenario, which they solve with a virtual + * LOAD_PAYLOAD instruction which gets turned into multiple MOV's after + * register assignment. But for us that is horrible from a scheduling + * standpoint. Instead what we do is use idea of 'definer' instruction. + * Ie. the first instruction (lowest ip) to write to the variable is the + * one we consider from use/def perspective when building interference + * graph. (Other instructions which write other variable components + * just define the variable some more.) + * + * Arrays of arbitrary size are handled via pre-coloring a consecutive + * sequence of registers. 
Additional scalar (single component) reg + * names are allocated starting at ctx->class_base[total_class_count] + * (see arr->base), which are pre-colored. In the use/def graph direct + * access is treated as a single element use/def, and indirect access + * is treated as use or def of all array elements. (Only the first + * def is tracked, in case of multiple indirect writes, etc.) + * + * TODO arrays that fit in one of the pre-defined class sizes should + * not need to be pre-colored, but instead could be given a normal + * vreg name. (Ignoring this for now since it is a good way to work + * out the kinks with arbitrary sized arrays.) + * + * TODO might be easier for debugging to split this into two passes, + * the first assigning vreg names in a way that we could ir3_print() + * the result. + */ + +static const unsigned class_sizes[] = { + 1, 2, 3, 4, + 4 + 4, /* txd + 1d/2d */ + 4 + 6, /* txd + 3d */ +}; +#define class_count ARRAY_SIZE(class_sizes) + +static const unsigned half_class_sizes[] = { + 1, 2, 3, 4, +}; +#define half_class_count ARRAY_SIZE(half_class_sizes) + +/* seems to just be used for compute shaders? Seems like vec1 and vec3 + * are sufficient (for now?) + */ +static const unsigned high_class_sizes[] = { + 1, 3, +}; +#define high_class_count ARRAY_SIZE(high_class_sizes) + +#define total_class_count (class_count + half_class_count + high_class_count) + +/* Below a0.x are normal regs. RA doesn't need to assign a0.x/p0.x. */ +#define NUM_REGS (4 * 48) /* r0 to r47 */ +#define NUM_HIGH_REGS (4 * 8) /* r48 to r55 */ +#define FIRST_HIGH_REG (4 * 48) +/* Number of virtual regs in a given class: */ +#define CLASS_REGS(i) (NUM_REGS - (class_sizes[i] - 1)) +#define HALF_CLASS_REGS(i) (NUM_REGS - (half_class_sizes[i] - 1)) +#define HIGH_CLASS_REGS(i) (NUM_HIGH_REGS - (high_class_sizes[i] - 1)) + +#define HALF_OFFSET (class_count) +#define HIGH_OFFSET (class_count + half_class_count) + +/* register-set, created one time, used for all shaders: */ +struct ir3_ra_reg_set { + struct ra_regs *regs; + unsigned int classes[class_count]; + unsigned int half_classes[half_class_count]; + unsigned int high_classes[high_class_count]; + /* maps flat virtual register space to base gpr: */ + uint16_t *ra_reg_to_gpr; + /* maps cls,gpr to flat virtual register space: */ + uint16_t **gpr_to_ra_reg; +}; + +static void +build_q_values(unsigned int **q_values, unsigned off, + const unsigned *sizes, unsigned count) +{ + for (unsigned i = 0; i < count; i++) { + q_values[i + off] = rzalloc_array(q_values, unsigned, total_class_count); + + /* From register_allocate.c: + * + * q(B,C) (indexed by C, B is this register class) in + * Runeson/Nyström paper. This is "how many registers of B could + * the worst choice register from C conflict with". + * + * If we just let the register allocation algorithm compute these + * values, is extremely expensive. However, since all of our + * registers are laid out, we can very easily compute them + * ourselves. View the register from C as fixed starting at GRF n + * somewhere in the middle, and the register from B as sliding back + * and forth. Then the first register to conflict from B is the + * one starting at n - class_size[B] + 1 and the last register to + * conflict will start at n + class_size[B] - 1. Therefore, the + * number of conflicts from B is class_size[B] + class_size[C] - 1. 
+ * + * +-+-+-+-+-+-+ +-+-+-+-+-+-+ + * B | | | | | |n| --> | | | | | | | + * +-+-+-+-+-+-+ +-+-+-+-+-+-+ + * +-+-+-+-+-+ + * C |n| | | | | + * +-+-+-+-+-+ + * + * (Idea copied from brw_fs_reg_allocate.cpp) + */ + for (unsigned j = 0; j < count; j++) + q_values[i + off][j + off] = sizes[i] + sizes[j] - 1; + } +} + +/* One-time setup of RA register-set, which describes all the possible + * "virtual" registers and their interferences. Ie. double register + * occupies (and conflicts with) two single registers, and so forth. + * Since registers do not need to be aligned to their class size, they + * can conflict with other registers in the same class too. Ie: + * + * Single (base) | Double + * --------------+--------------- + * R0 | D0 + * R1 | D0 D1 + * R2 | D1 D2 + * R3 | D2 + * .. and so on.. + * + * (NOTE the disassembler uses notation like r0.x/y/z/w but those are + * really just four scalar registers. Don't let that confuse you.) + */ +struct ir3_ra_reg_set * +ir3_ra_alloc_reg_set(struct ir3_compiler *compiler) +{ + struct ir3_ra_reg_set *set = rzalloc(compiler, struct ir3_ra_reg_set); + unsigned ra_reg_count, reg, first_half_reg, first_high_reg, base; + unsigned int **q_values; + + /* calculate # of regs across all classes: */ + ra_reg_count = 0; + for (unsigned i = 0; i < class_count; i++) + ra_reg_count += CLASS_REGS(i); + for (unsigned i = 0; i < half_class_count; i++) + ra_reg_count += HALF_CLASS_REGS(i); + for (unsigned i = 0; i < high_class_count; i++) + ra_reg_count += HIGH_CLASS_REGS(i); + + /* allocate and populate q_values: */ + q_values = ralloc_array(set, unsigned *, total_class_count); + + build_q_values(q_values, 0, class_sizes, class_count); + build_q_values(q_values, HALF_OFFSET, half_class_sizes, half_class_count); + build_q_values(q_values, HIGH_OFFSET, high_class_sizes, high_class_count); + + /* allocate the reg-set.. */ + set->regs = ra_alloc_reg_set(set, ra_reg_count, true); + set->ra_reg_to_gpr = ralloc_array(set, uint16_t, ra_reg_count); + set->gpr_to_ra_reg = ralloc_array(set, uint16_t *, total_class_count); + + /* .. 
and classes */ + reg = 0; + for (unsigned i = 0; i < class_count; i++) { + set->classes[i] = ra_alloc_reg_class(set->regs); + + set->gpr_to_ra_reg[i] = ralloc_array(set, uint16_t, CLASS_REGS(i)); + + for (unsigned j = 0; j < CLASS_REGS(i); j++) { + ra_class_add_reg(set->regs, set->classes[i], reg); + + set->ra_reg_to_gpr[reg] = j; + set->gpr_to_ra_reg[i][j] = reg; + + for (unsigned br = j; br < j + class_sizes[i]; br++) + ra_add_transitive_reg_conflict(set->regs, br, reg); + + reg++; + } + } + + first_half_reg = reg; + base = HALF_OFFSET; + + for (unsigned i = 0; i < half_class_count; i++) { + set->half_classes[i] = ra_alloc_reg_class(set->regs); + + set->gpr_to_ra_reg[base + i] = + ralloc_array(set, uint16_t, HALF_CLASS_REGS(i)); + + for (unsigned j = 0; j < HALF_CLASS_REGS(i); j++) { + ra_class_add_reg(set->regs, set->half_classes[i], reg); + + set->ra_reg_to_gpr[reg] = j; + set->gpr_to_ra_reg[base + i][j] = reg; + + for (unsigned br = j; br < j + half_class_sizes[i]; br++) + ra_add_transitive_reg_conflict(set->regs, br + first_half_reg, reg); + + reg++; + } + } + + first_high_reg = reg; + base = HIGH_OFFSET; + + for (unsigned i = 0; i < high_class_count; i++) { + set->high_classes[i] = ra_alloc_reg_class(set->regs); + + set->gpr_to_ra_reg[base + i] = + ralloc_array(set, uint16_t, HIGH_CLASS_REGS(i)); + + for (unsigned j = 0; j < HIGH_CLASS_REGS(i); j++) { + ra_class_add_reg(set->regs, set->high_classes[i], reg); + + set->ra_reg_to_gpr[reg] = j; + set->gpr_to_ra_reg[base + i][j] = reg; + + for (unsigned br = j; br < j + high_class_sizes[i]; br++) + ra_add_transitive_reg_conflict(set->regs, br + first_high_reg, reg); + + reg++; + } + } + + /* starting a6xx, half precision regs conflict w/ full precision regs: */ + if (compiler->gpu_id >= 600) { + /* because of transitivity, we can get away with just setting up + * conflicts between the first class of full and half regs: + */ + for (unsigned j = 0; j < CLASS_REGS(0) / 2; j++) { + unsigned freg = set->gpr_to_ra_reg[0][j]; + unsigned hreg0 = set->gpr_to_ra_reg[HALF_OFFSET][(j * 2) + 0]; + unsigned hreg1 = set->gpr_to_ra_reg[HALF_OFFSET][(j * 2) + 1]; + + ra_add_transitive_reg_conflict(set->regs, freg, hreg0); + ra_add_transitive_reg_conflict(set->regs, freg, hreg1); + } + + // TODO also need to update q_values, but for now: + ra_set_finalize(set->regs, NULL); + } else { + ra_set_finalize(set->regs, q_values); + } + + ralloc_free(q_values); + + return set; +} + +/* additional block-data (per-block) */ +struct ir3_ra_block_data { + BITSET_WORD *def; /* variables defined before used in block */ + BITSET_WORD *use; /* variables used before defined in block */ + BITSET_WORD *livein; /* which defs reach entry point of block */ + BITSET_WORD *liveout; /* which defs reach exit point of block */ +}; + +/* additional instruction-data (per-instruction) */ +struct ir3_ra_instr_data { + /* cached instruction 'definer' info: */ + struct ir3_instruction *defn; + int off, sz, cls; +}; + +/* register-assign context, per-shader */ +struct ir3_ra_ctx { + struct ir3 *ir; + gl_shader_stage type; + bool frag_face; + + struct ir3_ra_reg_set *set; + struct ra_graph *g; + unsigned alloc_count; + /* one per class, plus one slot for arrays: */ + unsigned class_alloc_count[total_class_count + 1]; + unsigned class_base[total_class_count + 1]; + unsigned instr_cnt; + unsigned *def, *use; /* def/use table */ + struct ir3_ra_instr_data *instrd; +}; + +/* does it conflict? 
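+ * (ie. do the half-open ranges [a_start, a_end) and
+ * [b_start, b_end) overlap?)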
*/ +static inline bool +intersects(unsigned a_start, unsigned a_end, unsigned b_start, unsigned b_end) +{ + return !((a_start >= b_end) || (b_start >= a_end)); +} + +static bool +is_half(struct ir3_instruction *instr) +{ + return !!(instr->regs[0]->flags & IR3_REG_HALF); +} + +static bool +is_high(struct ir3_instruction *instr) +{ + return !!(instr->regs[0]->flags & IR3_REG_HIGH); +} + +static int +size_to_class(unsigned sz, bool half, bool high) +{ + if (high) { + for (unsigned i = 0; i < high_class_count; i++) + if (high_class_sizes[i] >= sz) + return i + HIGH_OFFSET; + } else if (half) { + for (unsigned i = 0; i < half_class_count; i++) + if (half_class_sizes[i] >= sz) + return i + HALF_OFFSET; + } else { + for (unsigned i = 0; i < class_count; i++) + if (class_sizes[i] >= sz) + return i; + } + debug_assert(0); + return -1; +} + +static bool +writes_gpr(struct ir3_instruction *instr) +{ + if (is_store(instr)) + return false; + /* is dest a normal temp register: */ + struct ir3_register *reg = instr->regs[0]; + if (reg->flags & (IR3_REG_CONST | IR3_REG_IMMED)) + return false; + if ((reg->num == regid(REG_A0, 0)) || + (reg->num == regid(REG_P0, 0))) + return false; + return true; +} + +static bool +instr_before(struct ir3_instruction *a, struct ir3_instruction *b) +{ + if (a->flags & IR3_INSTR_UNUSED) + return false; + return (a->ip < b->ip); +} + +static struct ir3_instruction * +get_definer(struct ir3_ra_ctx *ctx, struct ir3_instruction *instr, + int *sz, int *off) +{ + struct ir3_ra_instr_data *id = &ctx->instrd[instr->ip]; + struct ir3_instruction *d = NULL; + + if (id->defn) { + *sz = id->sz; + *off = id->off; + return id->defn; + } + + if (instr->opc == OPC_META_FI) { + /* What about the case where collect is subset of array, we + * need to find the distance between where actual array starts + * and fanin.. that probably doesn't happen currently. + */ + struct ir3_register *src; + int dsz, doff; + + /* note: don't use foreach_ssa_src as this gets called once + * while assigning regs (which clears SSA flag) + */ + foreach_src_n(src, n, instr) { + struct ir3_instruction *dd; + if (!src->instr) + continue; + + dd = get_definer(ctx, src->instr, &dsz, &doff); + + if ((!d) || instr_before(dd, d)) { + d = dd; + *sz = dsz; + *off = doff - n; + } + } + + } else if (instr->cp.right || instr->cp.left) { + /* covers also the meta:fo case, which ends up w/ single + * scalar instructions for each component: + */ + struct ir3_instruction *f = ir3_neighbor_first(instr); + + /* by definition, the entire sequence forms one linked list + * of single scalar register nodes (even if some of them may + * be fanouts from a texture sample (for example) instr. We + * just need to walk the list finding the first element of + * the group defined (lowest ip) + */ + int cnt = 0; + + /* need to skip over unused in the group: */ + while (f && (f->flags & IR3_INSTR_UNUSED)) { + f = f->cp.right; + cnt++; + } + + while (f) { + if ((!d) || instr_before(f, d)) + d = f; + if (f == instr) + *off = cnt; + f = f->cp.right; + cnt++; + } + + *sz = cnt; + + } else { + /* second case is looking directly at the instruction which + * produces multiple values (eg, texture sample), rather + * than the fanout nodes that point back to that instruction. + * This isn't quite right, because it may be part of a larger + * group, such as: + * + * sam (f32)(xyzw)r0.x, ... + * add r1.x, ... + * add r1.y, ... + * sam (f32)(xyzw)r2.x, r0.w <-- (r0.w, r1.x, r1.y) + * + * need to come up with a better way to handle that case. 
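+ *
+ * (for an indirect dst we can't derive the size from wrmask,
+ * so use the declared register size instead)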
+ */ + if (instr->address) { + *sz = instr->regs[0]->size; + } else { + *sz = util_last_bit(instr->regs[0]->wrmask); + } + *off = 0; + d = instr; + } + + if (d->opc == OPC_META_FO) { + struct ir3_instruction *dd; + int dsz, doff; + + dd = get_definer(ctx, d->regs[1]->instr, &dsz, &doff); + + /* by definition, should come before: */ + debug_assert(instr_before(dd, d)); + + *sz = MAX2(*sz, dsz); + + debug_assert(instr->opc == OPC_META_FO); + *off = MAX2(*off, instr->fo.off); + + d = dd; + } + + id->defn = d; + id->sz = *sz; + id->off = *off; + + return d; +} + +static void +ra_block_find_definers(struct ir3_ra_ctx *ctx, struct ir3_block *block) +{ + list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) { + struct ir3_ra_instr_data *id = &ctx->instrd[instr->ip]; + if (instr->regs_count == 0) + continue; + /* couple special cases: */ + if (writes_addr(instr) || writes_pred(instr)) { + id->cls = -1; + } else if (instr->regs[0]->flags & IR3_REG_ARRAY) { + id->cls = total_class_count; + } else { + id->defn = get_definer(ctx, instr, &id->sz, &id->off); + id->cls = size_to_class(id->sz, is_half(id->defn), is_high(id->defn)); + } + } +} + +/* give each instruction a name (and ip), and count up the # of names + * of each class + */ +static void +ra_block_name_instructions(struct ir3_ra_ctx *ctx, struct ir3_block *block) +{ + list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) { + struct ir3_ra_instr_data *id = &ctx->instrd[instr->ip]; + +#ifdef DEBUG + instr->name = ~0; +#endif + + ctx->instr_cnt++; + + if (instr->regs_count == 0) + continue; + + if (!writes_gpr(instr)) + continue; + + if (id->defn != instr) + continue; + + /* arrays which don't fit in one of the pre-defined class + * sizes are pre-colored: + */ + if ((id->cls >= 0) && (id->cls < total_class_count)) { + instr->name = ctx->class_alloc_count[id->cls]++; + ctx->alloc_count++; + } + } +} + +static void +ra_init(struct ir3_ra_ctx *ctx) +{ + unsigned n, base; + + ir3_clear_mark(ctx->ir); + n = ir3_count_instructions(ctx->ir); + + ctx->instrd = rzalloc_array(NULL, struct ir3_ra_instr_data, n); + + list_for_each_entry (struct ir3_block, block, &ctx->ir->block_list, node) { + ra_block_find_definers(ctx, block); + } + + list_for_each_entry (struct ir3_block, block, &ctx->ir->block_list, node) { + ra_block_name_instructions(ctx, block); + } + + /* figure out the base register name for each class. The + * actual ra name is class_base[cls] + instr->name; + */ + ctx->class_base[0] = 0; + for (unsigned i = 1; i <= total_class_count; i++) { + ctx->class_base[i] = ctx->class_base[i-1] + + ctx->class_alloc_count[i-1]; + } + + /* and vreg names for array elements: */ + base = ctx->class_base[total_class_count]; + list_for_each_entry (struct ir3_array, arr, &ctx->ir->array_list, node) { + arr->base = base; + ctx->class_alloc_count[total_class_count] += arr->length; + base += arr->length; + } + ctx->alloc_count += ctx->class_alloc_count[total_class_count]; + + ctx->g = ra_alloc_interference_graph(ctx->set->regs, ctx->alloc_count); + ralloc_steal(ctx->g, ctx->instrd); + ctx->def = rzalloc_array(ctx->g, unsigned, ctx->alloc_count); + ctx->use = rzalloc_array(ctx->g, unsigned, ctx->alloc_count); +} + +static unsigned +__ra_name(struct ir3_ra_ctx *ctx, int cls, struct ir3_instruction *defn) +{ + unsigned name; + debug_assert(cls >= 0); + debug_assert(cls < total_class_count); /* we shouldn't get arrays here.. 
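+ * (array elements get their names directly from arr->base
+ * instead, see ra_init())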
*/ + name = ctx->class_base[cls] + defn->name; + debug_assert(name < ctx->alloc_count); + return name; +} + +static int +ra_name(struct ir3_ra_ctx *ctx, struct ir3_ra_instr_data *id) +{ + /* TODO handle name mapping for arrays */ + return __ra_name(ctx, id->cls, id->defn); +} + +static void +ra_destroy(struct ir3_ra_ctx *ctx) +{ + ralloc_free(ctx->g); +} + +static void +ra_block_compute_live_ranges(struct ir3_ra_ctx *ctx, struct ir3_block *block) +{ + struct ir3_ra_block_data *bd; + unsigned bitset_words = BITSET_WORDS(ctx->alloc_count); + +#define def(name, instr) \ + do { \ + /* defined on first write: */ \ + if (!ctx->def[name]) \ + ctx->def[name] = instr->ip; \ + ctx->use[name] = instr->ip; \ + BITSET_SET(bd->def, name); \ + } while(0); + +#define use(name, instr) \ + do { \ + ctx->use[name] = MAX2(ctx->use[name], instr->ip); \ + if (!BITSET_TEST(bd->def, name)) \ + BITSET_SET(bd->use, name); \ + } while(0); + + bd = rzalloc(ctx->g, struct ir3_ra_block_data); + + bd->def = rzalloc_array(bd, BITSET_WORD, bitset_words); + bd->use = rzalloc_array(bd, BITSET_WORD, bitset_words); + bd->livein = rzalloc_array(bd, BITSET_WORD, bitset_words); + bd->liveout = rzalloc_array(bd, BITSET_WORD, bitset_words); + + block->data = bd; + + list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) { + struct ir3_instruction *src; + struct ir3_register *reg; + + if (instr->regs_count == 0) + continue; + + /* There are a couple special cases to deal with here: + * + * fanout: used to split values from a higher class to a lower + * class, for example split the results of a texture fetch + * into individual scalar values; We skip over these from + * a 'def' perspective, and for a 'use' we walk the chain + * up to the defining instruction. + * + * fanin: used to collect values from lower class and assemble + * them together into a higher class, for example arguments + * to texture sample instructions; We consider these to be + * defined at the earliest fanin source. + * + * Most of this is handled in the get_definer() helper. + * + * In either case, we trace the instruction back to the original + * definer and consider that as the def/use ip. + */ + + if (writes_gpr(instr)) { + struct ir3_ra_instr_data *id = &ctx->instrd[instr->ip]; + struct ir3_register *dst = instr->regs[0]; + + if (dst->flags & IR3_REG_ARRAY) { + struct ir3_array *arr = + ir3_lookup_array(ctx->ir, dst->array.id); + unsigned i; + + arr->start_ip = MIN2(arr->start_ip, instr->ip); + arr->end_ip = MAX2(arr->end_ip, instr->ip); + + /* set the node class now.. in case we don't encounter + * this array dst again. 
From register_alloc algo's + * perspective, these are all single/scalar regs: + */ + for (i = 0; i < arr->length; i++) { + unsigned name = arr->base + i; + ra_set_node_class(ctx->g, name, ctx->set->classes[0]); + } + + /* indirect write is treated like a write to all array + * elements, since we don't know which one is actually + * written: + */ + if (dst->flags & IR3_REG_RELATIV) { + for (i = 0; i < arr->length; i++) { + unsigned name = arr->base + i; + def(name, instr); + } + } else { + unsigned name = arr->base + dst->array.offset; + def(name, instr); + } + + } else if (id->defn == instr) { + unsigned name = ra_name(ctx, id); + + /* since we are in SSA at this point: */ + debug_assert(!BITSET_TEST(bd->use, name)); + + def(name, id->defn); + + if (is_high(id->defn)) { + ra_set_node_class(ctx->g, name, + ctx->set->high_classes[id->cls - HIGH_OFFSET]); + } else if (is_half(id->defn)) { + ra_set_node_class(ctx->g, name, + ctx->set->half_classes[id->cls - HALF_OFFSET]); + } else { + ra_set_node_class(ctx->g, name, + ctx->set->classes[id->cls]); + } + } + } + + foreach_src(reg, instr) { + if (reg->flags & IR3_REG_ARRAY) { + struct ir3_array *arr = + ir3_lookup_array(ctx->ir, reg->array.id); + arr->start_ip = MIN2(arr->start_ip, instr->ip); + arr->end_ip = MAX2(arr->end_ip, instr->ip); + + /* indirect read is treated like a read fromall array + * elements, since we don't know which one is actually + * read: + */ + if (reg->flags & IR3_REG_RELATIV) { + unsigned i; + for (i = 0; i < arr->length; i++) { + unsigned name = arr->base + i; + use(name, instr); + } + } else { + unsigned name = arr->base + reg->array.offset; + use(name, instr); + /* NOTE: arrays are not SSA so unconditionally + * set use bit: + */ + BITSET_SET(bd->use, name); + debug_assert(reg->array.offset < arr->length); + } + } else if ((src = ssa(reg)) && writes_gpr(src)) { + unsigned name = ra_name(ctx, &ctx->instrd[src->ip]); + use(name, instr); + } + } + } +} + +static bool +ra_compute_livein_liveout(struct ir3_ra_ctx *ctx) +{ + unsigned bitset_words = BITSET_WORDS(ctx->alloc_count); + bool progress = false; + + list_for_each_entry (struct ir3_block, block, &ctx->ir->block_list, node) { + struct ir3_ra_block_data *bd = block->data; + + /* update livein: */ + for (unsigned i = 0; i < bitset_words; i++) { + BITSET_WORD new_livein = + (bd->use[i] | (bd->liveout[i] & ~bd->def[i])); + + if (new_livein & ~bd->livein[i]) { + bd->livein[i] |= new_livein; + progress = true; + } + } + + /* update liveout: */ + for (unsigned j = 0; j < ARRAY_SIZE(block->successors); j++) { + struct ir3_block *succ = block->successors[j]; + struct ir3_ra_block_data *succ_bd; + + if (!succ) + continue; + + succ_bd = succ->data; + + for (unsigned i = 0; i < bitset_words; i++) { + BITSET_WORD new_liveout = + (succ_bd->livein[i] & ~bd->liveout[i]); + + if (new_liveout) { + bd->liveout[i] |= new_liveout; + progress = true; + } + } + } + } + + return progress; +} + +static void +print_bitset(const char *name, BITSET_WORD *bs, unsigned cnt) +{ + bool first = true; + debug_printf(" %s:", name); + for (unsigned i = 0; i < cnt; i++) { + if (BITSET_TEST(bs, i)) { + if (!first) + debug_printf(","); + debug_printf(" %04u", i); + first = false; + } + } + debug_printf("\n"); +} + +static void +ra_add_interference(struct ir3_ra_ctx *ctx) +{ + struct ir3 *ir = ctx->ir; + + /* initialize array live ranges: */ + list_for_each_entry (struct ir3_array, arr, &ir->array_list, node) { + arr->start_ip = ~0; + arr->end_ip = 0; + } + + /* compute live ranges (use/def) on a block 
level, also updating + * block's def/use bitmasks (used below to calculate per-block + * livein/liveout): + */ + list_for_each_entry (struct ir3_block, block, &ir->block_list, node) { + ra_block_compute_live_ranges(ctx, block); + } + + /* update per-block livein/liveout: */ + while (ra_compute_livein_liveout(ctx)) {} + + if (ir3_shader_debug & IR3_DBG_OPTMSGS) { + debug_printf("AFTER LIVEIN/OUT:\n"); + ir3_print(ir); + list_for_each_entry (struct ir3_block, block, &ir->block_list, node) { + struct ir3_ra_block_data *bd = block->data; + debug_printf("block%u:\n", block_id(block)); + print_bitset(" def", bd->def, ctx->alloc_count); + print_bitset(" use", bd->use, ctx->alloc_count); + print_bitset(" l/i", bd->livein, ctx->alloc_count); + print_bitset(" l/o", bd->liveout, ctx->alloc_count); + } + list_for_each_entry (struct ir3_array, arr, &ir->array_list, node) { + debug_printf("array%u:\n", arr->id); + debug_printf(" length: %u\n", arr->length); + debug_printf(" start_ip: %u\n", arr->start_ip); + debug_printf(" end_ip: %u\n", arr->end_ip); + } + } + + /* extend start/end ranges based on livein/liveout info from cfg: */ + list_for_each_entry (struct ir3_block, block, &ir->block_list, node) { + struct ir3_ra_block_data *bd = block->data; + + for (unsigned i = 0; i < ctx->alloc_count; i++) { + if (BITSET_TEST(bd->livein, i)) { + ctx->def[i] = MIN2(ctx->def[i], block->start_ip); + ctx->use[i] = MAX2(ctx->use[i], block->start_ip); + } + + if (BITSET_TEST(bd->liveout, i)) { + ctx->def[i] = MIN2(ctx->def[i], block->end_ip); + ctx->use[i] = MAX2(ctx->use[i], block->end_ip); + } + } + + list_for_each_entry (struct ir3_array, arr, &ctx->ir->array_list, node) { + for (unsigned i = 0; i < arr->length; i++) { + if (BITSET_TEST(bd->livein, i + arr->base)) { + arr->start_ip = MIN2(arr->start_ip, block->start_ip); + } + if (BITSET_TEST(bd->livein, i + arr->base)) { + arr->end_ip = MAX2(arr->end_ip, block->end_ip); + } + } + } + } + + /* need to fix things up to keep outputs live: */ + for (unsigned i = 0; i < ir->noutputs; i++) { + struct ir3_instruction *instr = ir->outputs[i]; + unsigned name = ra_name(ctx, &ctx->instrd[instr->ip]); + ctx->use[name] = ctx->instr_cnt; + } + + for (unsigned i = 0; i < ctx->alloc_count; i++) { + for (unsigned j = 0; j < ctx->alloc_count; j++) { + if (intersects(ctx->def[i], ctx->use[i], + ctx->def[j], ctx->use[j])) { + ra_add_node_interference(ctx->g, i, j); + } + } + } +} + +/* some instructions need fix-up if dst register is half precision: */ +static void fixup_half_instr_dst(struct ir3_instruction *instr) +{ + switch (opc_cat(instr->opc)) { + case 1: /* move instructions */ + instr->cat1.dst_type = half_type(instr->cat1.dst_type); + break; + case 3: + switch (instr->opc) { + case OPC_MAD_F32: + instr->opc = OPC_MAD_F16; + break; + case OPC_SEL_B32: + instr->opc = OPC_SEL_B16; + break; + case OPC_SEL_S32: + instr->opc = OPC_SEL_S16; + break; + case OPC_SEL_F32: + instr->opc = OPC_SEL_F16; + break; + case OPC_SAD_S32: + instr->opc = OPC_SAD_S16; + break; + /* instructions may already be fixed up: */ + case OPC_MAD_F16: + case OPC_SEL_B16: + case OPC_SEL_S16: + case OPC_SEL_F16: + case OPC_SAD_S16: + break; + default: + assert(0); + break; + } + break; + case 5: + instr->cat5.type = half_type(instr->cat5.type); + break; + } +} +/* some instructions need fix-up if src register is half precision: */ +static void fixup_half_instr_src(struct ir3_instruction *instr) +{ + switch (instr->opc) { + case OPC_MOV: + instr->cat1.src_type = half_type(instr->cat1.src_type); + break; + 
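+	/* for other opcodes the half flag on the src register
+	 * itself should be enough, so no fixup: */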
default: + break; + } +} + +/* NOTE: instr could be NULL for IR3_REG_ARRAY case, for the first + * array access(es) which do not have any previous access to depend + * on from scheduling point of view + */ +static void +reg_assign(struct ir3_ra_ctx *ctx, struct ir3_register *reg, + struct ir3_instruction *instr) +{ + struct ir3_ra_instr_data *id; + + if (reg->flags & IR3_REG_ARRAY) { + struct ir3_array *arr = + ir3_lookup_array(ctx->ir, reg->array.id); + unsigned name = arr->base + reg->array.offset; + unsigned r = ra_get_node_reg(ctx->g, name); + unsigned num = ctx->set->ra_reg_to_gpr[r]; + + if (reg->flags & IR3_REG_RELATIV) { + reg->array.offset = num; + } else { + reg->num = num; + reg->flags &= ~IR3_REG_SSA; + } + + reg->flags &= ~IR3_REG_ARRAY; + } else if ((id = &ctx->instrd[instr->ip]) && id->defn) { + unsigned name = ra_name(ctx, id); + unsigned r = ra_get_node_reg(ctx->g, name); + unsigned num = ctx->set->ra_reg_to_gpr[r] + id->off; + + debug_assert(!(reg->flags & IR3_REG_RELATIV)); + + if (is_high(id->defn)) + num += FIRST_HIGH_REG; + + reg->num = num; + reg->flags &= ~IR3_REG_SSA; + + if (is_half(id->defn)) + reg->flags |= IR3_REG_HALF; + } +} + +static void +ra_block_alloc(struct ir3_ra_ctx *ctx, struct ir3_block *block) +{ + list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) { + struct ir3_register *reg; + + if (instr->regs_count == 0) + continue; + + if (writes_gpr(instr)) { + reg_assign(ctx, instr->regs[0], instr); + if (instr->regs[0]->flags & IR3_REG_HALF) + fixup_half_instr_dst(instr); + } + + foreach_src_n(reg, n, instr) { + struct ir3_instruction *src = reg->instr; + /* Note: reg->instr could be null for IR3_REG_ARRAY */ + if (!(src || (reg->flags & IR3_REG_ARRAY))) + continue; + reg_assign(ctx, instr->regs[n+1], src); + if (instr->regs[n+1]->flags & IR3_REG_HALF) + fixup_half_instr_src(instr); + } + } +} + +static int +ra_alloc(struct ir3_ra_ctx *ctx) +{ + /* pre-assign array elements: + */ + list_for_each_entry (struct ir3_array, arr, &ctx->ir->array_list, node) { + unsigned base = 0; + + if (arr->end_ip == 0) + continue; + + /* figure out what else we conflict with which has already + * been assigned: + */ +retry: + list_for_each_entry (struct ir3_array, arr2, &ctx->ir->array_list, node) { + if (arr2 == arr) + break; + if (arr2->end_ip == 0) + continue; + /* if it intersects with liverange AND register range.. 
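+ * (if so, bump base past arr2 and rescan from the first array,
+ * since the new base may now overlap one we already checked)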
*/ + if (intersects(arr->start_ip, arr->end_ip, + arr2->start_ip, arr2->end_ip) && + intersects(base, base + arr->length, + arr2->reg, arr2->reg + arr2->length)) { + base = MAX2(base, arr2->reg + arr2->length); + goto retry; + } + } + + arr->reg = base; + + for (unsigned i = 0; i < arr->length; i++) { + unsigned name, reg; + + name = arr->base + i; + reg = ctx->set->gpr_to_ra_reg[0][base++]; + + ra_set_node_reg(ctx->g, name, reg); + } + } + + if (!ra_allocate(ctx->g)) + return -1; + + list_for_each_entry (struct ir3_block, block, &ctx->ir->block_list, node) { + ra_block_alloc(ctx, block); + } + + return 0; +} + +int ir3_ra(struct ir3 *ir, gl_shader_stage type, + bool frag_coord, bool frag_face) +{ + struct ir3_ra_ctx ctx = { + .ir = ir, + .type = type, + .frag_face = frag_face, + .set = ir->compiler->set, + }; + int ret; + + ra_init(&ctx); + ra_add_interference(&ctx); + ret = ra_alloc(&ctx); + ra_destroy(&ctx); + + return ret; +} diff --git a/src/freedreno/ir3/ir3_sched.c b/src/freedreno/ir3/ir3_sched.c new file mode 100644 index 00000000000..6552980d90c --- /dev/null +++ b/src/freedreno/ir3/ir3_sched.c @@ -0,0 +1,818 @@ +/* + * Copyright (C) 2014 Rob Clark <[email protected]> + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Authors: + * Rob Clark <[email protected]> + */ + + +#include "util/u_math.h" + +#include "ir3.h" + +/* + * Instruction Scheduling: + * + * A recursive depth based scheduling algo. Recursively find an eligible + * instruction to schedule from the deepest instruction (recursing through + * it's unscheduled src instructions). Normally this would result in a + * lot of re-traversal of the same instructions, so we cache results in + * instr->data (and clear cached results that would be no longer valid + * after scheduling an instruction). + * + * There are a few special cases that need to be handled, since sched + * is currently independent of register allocation. Usages of address + * register (a0.x) or predicate register (p0.x) must be serialized. Ie. + * if you have two pairs of instructions that write the same special + * register and then read it, then those pairs cannot be interleaved. + * To solve this, when we are in such a scheduling "critical section", + * and we encounter a conflicting write to a special register, we try + * to schedule any remaining instructions that use that value first. 
+ */ + +struct ir3_sched_ctx { + struct ir3_block *block; /* the current block */ + struct list_head depth_list; /* depth sorted unscheduled instrs */ + struct ir3_instruction *scheduled; /* last scheduled instr XXX remove*/ + struct ir3_instruction *addr; /* current a0.x user, if any */ + struct ir3_instruction *pred; /* current p0.x user, if any */ + bool error; +}; + +static bool is_sfu_or_mem(struct ir3_instruction *instr) +{ + return is_sfu(instr) || is_mem(instr); +} + +#define NULL_INSTR ((void *)~0) + +static void +clear_cache(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr) +{ + list_for_each_entry (struct ir3_instruction, instr2, &ctx->depth_list, node) { + if ((instr2->data == instr) || (instr2->data == NULL_INSTR) || !instr) + instr2->data = NULL; + } +} + +static void +schedule(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr) +{ + debug_assert(ctx->block == instr->block); + + /* maybe there is a better way to handle this than just stuffing + * a nop.. ideally we'd know about this constraint in the + * scheduling and depth calculation.. + */ + if (ctx->scheduled && is_sfu_or_mem(ctx->scheduled) && is_sfu_or_mem(instr)) + ir3_NOP(ctx->block); + + /* remove from depth list: + */ + list_delinit(&instr->node); + + if (writes_addr(instr)) { + debug_assert(ctx->addr == NULL); + ctx->addr = instr; + } + + if (writes_pred(instr)) { + debug_assert(ctx->pred == NULL); + ctx->pred = instr; + } + + instr->flags |= IR3_INSTR_MARK; + + list_addtail(&instr->node, &instr->block->instr_list); + ctx->scheduled = instr; + + if (writes_addr(instr) || writes_pred(instr) || is_input(instr)) { + clear_cache(ctx, NULL); + } else { + /* invalidate only the necessary entries.. */ + clear_cache(ctx, instr); + } +} + +static struct ir3_instruction * +deepest(struct ir3_instruction **srcs, unsigned nsrcs) +{ + struct ir3_instruction *d = NULL; + unsigned i = 0, id = 0; + + while ((i < nsrcs) && !(d = srcs[id = i])) + i++; + + if (!d) + return NULL; + + for (; i < nsrcs; i++) + if (srcs[i] && (srcs[i]->depth > d->depth)) + d = srcs[id = i]; + + srcs[id] = NULL; + + return d; +} + +/** + * @block: the block to search in, starting from end; in first pass, + * this will be the block the instruction would be inserted into + * (but has not yet, ie. it only contains already scheduled + * instructions). For intra-block scheduling (second pass), this + * would be one of the predecessor blocks. + * @instr: the instruction to search for + * @maxd: max distance, bail after searching this # of instruction + * slots, since it means the instruction we are looking for is + * far enough away + * @pred: if true, recursively search into predecessor blocks to + * find the worst case (shortest) distance (only possible after + * individual blocks are all scheduled + */ +static unsigned +distance(struct ir3_block *block, struct ir3_instruction *instr, + unsigned maxd, bool pred) +{ + unsigned d = 0; + + list_for_each_entry_rev (struct ir3_instruction, n, &block->instr_list, node) { + if ((n == instr) || (d >= maxd)) + return d; + /* NOTE: don't count branch/jump since we don't know yet if they will + * be eliminated later in resolve_jumps().. really should do that + * earlier so we don't have this constraint. + */ + if (is_alu(n) || (is_flow(n) && (n->opc != OPC_JUMP) && (n->opc != OPC_BR))) + d++; + } + + /* if coming from a predecessor block, assume it is assigned far + * enough away.. we'll fix up later. 
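+ * (sched_intra_block() stuffs in any nop's still missing once
+ * all blocks have been scheduled)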
+ */ + if (!pred) + return maxd; + + if (pred && (block->data != block)) { + /* Search into predecessor blocks, finding the one with the + * shortest distance, since that will be the worst case + */ + unsigned min = maxd - d; + + /* (ab)use block->data to prevent recursion: */ + block->data = block; + + for (unsigned i = 0; i < block->predecessors_count; i++) { + unsigned n; + + n = distance(block->predecessors[i], instr, min, pred); + + min = MIN2(min, n); + } + + block->data = NULL; + d += min; + } + + return d; +} + +/* calculate delay for specified src: */ +static unsigned +delay_calc_srcn(struct ir3_block *block, + struct ir3_instruction *assigner, + struct ir3_instruction *consumer, + unsigned srcn, bool soft, bool pred) +{ + unsigned delay = 0; + + if (is_meta(assigner)) { + struct ir3_instruction *src; + foreach_ssa_src(src, assigner) { + unsigned d; + d = delay_calc_srcn(block, src, consumer, srcn, soft, pred); + delay = MAX2(delay, d); + } + } else { + if (soft) { + if (is_sfu(assigner)) { + delay = 4; + } else { + delay = ir3_delayslots(assigner, consumer, srcn); + } + } else { + delay = ir3_delayslots(assigner, consumer, srcn); + } + delay -= distance(block, assigner, delay, pred); + } + + return delay; +} + +/* calculate delay for instruction (maximum of delay for all srcs): */ +static unsigned +delay_calc(struct ir3_block *block, struct ir3_instruction *instr, + bool soft, bool pred) +{ + unsigned delay = 0; + struct ir3_instruction *src; + + foreach_ssa_src_n(src, i, instr) { + unsigned d; + d = delay_calc_srcn(block, src, instr, i, soft, pred); + delay = MAX2(delay, d); + } + + return delay; +} + +struct ir3_sched_notes { + /* there is at least one kill which could be scheduled, except + * for unscheduled bary.f's: + */ + bool blocked_kill; + /* there is at least one instruction that could be scheduled, + * except for conflicting address/predicate register usage: + */ + bool addr_conflict, pred_conflict; +}; + +static bool is_scheduled(struct ir3_instruction *instr) +{ + return !!(instr->flags & IR3_INSTR_MARK); +} + +/* could an instruction be scheduled if specified ssa src was scheduled? */ +static bool +could_sched(struct ir3_instruction *instr, struct ir3_instruction *src) +{ + struct ir3_instruction *other_src; + foreach_ssa_src(other_src, instr) { + /* if dependency not scheduled, we aren't ready yet: */ + if ((src != other_src) && !is_scheduled(other_src)) { + return false; + } + } + return true; +} + +/* Check if instruction is ok to schedule. Make sure it is not blocked + * by use of addr/predicate register, etc. + */ +static bool +check_instr(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes, + struct ir3_instruction *instr) +{ + /* For instructions that write address register we need to + * make sure there is at least one instruction that uses the + * addr value which is otherwise ready. + * + * TODO if any instructions use pred register and have other + * src args, we would need to do the same for writes_pred().. 
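+ *
+ * As a made-up example: if the only unscheduled indirect using
+ * this a0.x value is itself still waiting on another unscheduled
+ * src, scheduling the a0.x write now would tie up the address
+ * register without enabling any progress.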
+ */ + if (writes_addr(instr)) { + struct ir3 *ir = instr->block->shader; + bool ready = false; + for (unsigned i = 0; (i < ir->indirects_count) && !ready; i++) { + struct ir3_instruction *indirect = ir->indirects[i]; + if (!indirect) + continue; + if (indirect->address != instr) + continue; + ready = could_sched(indirect, instr); + } + + /* nothing could be scheduled, so keep looking: */ + if (!ready) + return false; + } + + /* if this is a write to address/predicate register, and that + * register is currently in use, we need to defer until it is + * free: + */ + if (writes_addr(instr) && ctx->addr) { + debug_assert(ctx->addr != instr); + notes->addr_conflict = true; + return false; + } + + if (writes_pred(instr) && ctx->pred) { + debug_assert(ctx->pred != instr); + notes->pred_conflict = true; + return false; + } + + /* if the instruction is a kill, we need to ensure *every* + * bary.f is scheduled. The hw seems unhappy if the thread + * gets killed before the end-input (ei) flag is hit. + * + * We could do this by adding each bary.f instruction as + * virtual ssa src for the kill instruction. But we have + * fixed length instr->regs[]. + * + * TODO this wouldn't be quite right if we had multiple + * basic blocks, if any block was conditional. We'd need + * to schedule the bary.f's outside of any block which + * was conditional that contained a kill.. I think.. + */ + if (is_kill(instr)) { + struct ir3 *ir = instr->block->shader; + + for (unsigned i = 0; i < ir->baryfs_count; i++) { + struct ir3_instruction *baryf = ir->baryfs[i]; + if (baryf->flags & IR3_INSTR_UNUSED) + continue; + if (!is_scheduled(baryf)) { + notes->blocked_kill = true; + return false; + } + } + } + + return true; +} + +/* Find the best instruction to schedule from specified instruction or + * recursively it's ssa sources. + */ +static struct ir3_instruction * +find_instr_recursive(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes, + struct ir3_instruction *instr) +{ + struct ir3_instruction *srcs[__ssa_src_cnt(instr)]; + struct ir3_instruction *src; + unsigned nsrcs = 0; + + if (is_scheduled(instr)) + return NULL; + + /* use instr->data to cache the results of recursing up the + * instr src's. Otherwise the recursive algo can scale quite + * badly w/ shader size. But this takes some care to clear + * the cache appropriately when instructions are scheduled. + */ + if (instr->data) { + if (instr->data == NULL_INSTR) + return NULL; + return instr->data; + } + + /* find unscheduled srcs: */ + foreach_ssa_src(src, instr) { + if (!is_scheduled(src)) { + debug_assert(nsrcs < ARRAY_SIZE(srcs)); + srcs[nsrcs++] = src; + } + } + + /* if all our src's are already scheduled: */ + if (nsrcs == 0) { + if (check_instr(ctx, notes, instr)) { + instr->data = instr; + return instr; + } + return NULL; + } + + while ((src = deepest(srcs, nsrcs))) { + struct ir3_instruction *candidate; + + candidate = find_instr_recursive(ctx, notes, src); + if (!candidate) + continue; + + if (check_instr(ctx, notes, candidate)) { + instr->data = candidate; + return candidate; + } + } + + instr->data = NULL_INSTR; + return NULL; +} + +/* find instruction to schedule: */ +static struct ir3_instruction * +find_eligible_instr(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes, + bool soft) +{ + struct ir3_instruction *best_instr = NULL; + unsigned min_delay = ~0; + + /* TODO we'd really rather use the list/array of block outputs. But we + * don't have such a thing. 
Recursing *every* instruction in the list + * will result in a lot of repeated traversal, since instructions will + * get traversed both when they appear as ssa src to a later instruction + * as well as where they appear in the depth_list. + */ + list_for_each_entry_rev (struct ir3_instruction, instr, &ctx->depth_list, node) { + struct ir3_instruction *candidate; + unsigned delay; + + candidate = find_instr_recursive(ctx, notes, instr); + if (!candidate) + continue; + + delay = delay_calc(ctx->block, candidate, soft, false); + if (delay < min_delay) { + best_instr = candidate; + min_delay = delay; + } + + if (min_delay == 0) + break; + } + + return best_instr; +} + +/* "spill" the address register by remapping any unscheduled + * instructions which depend on the current address register + * to a clone of the instruction which wrote the address reg. + */ +static struct ir3_instruction * +split_addr(struct ir3_sched_ctx *ctx) +{ + struct ir3 *ir; + struct ir3_instruction *new_addr = NULL; + unsigned i; + + debug_assert(ctx->addr); + + ir = ctx->addr->block->shader; + + for (i = 0; i < ir->indirects_count; i++) { + struct ir3_instruction *indirect = ir->indirects[i]; + + if (!indirect) + continue; + + /* skip instructions already scheduled: */ + if (is_scheduled(indirect)) + continue; + + /* remap remaining instructions using current addr + * to new addr: + */ + if (indirect->address == ctx->addr) { + if (!new_addr) { + new_addr = ir3_instr_clone(ctx->addr); + /* original addr is scheduled, but new one isn't: */ + new_addr->flags &= ~IR3_INSTR_MARK; + } + ir3_instr_set_address(indirect, new_addr); + } + } + + /* all remaining indirects remapped to new addr: */ + ctx->addr = NULL; + + return new_addr; +} + +/* "spill" the predicate register by remapping any unscheduled + * instructions which depend on the current predicate register + * to a clone of the instruction which wrote the address reg. + */ +static struct ir3_instruction * +split_pred(struct ir3_sched_ctx *ctx) +{ + struct ir3 *ir; + struct ir3_instruction *new_pred = NULL; + unsigned i; + + debug_assert(ctx->pred); + + ir = ctx->pred->block->shader; + + for (i = 0; i < ir->predicates_count; i++) { + struct ir3_instruction *predicated = ir->predicates[i]; + + /* skip instructions already scheduled: */ + if (is_scheduled(predicated)) + continue; + + /* remap remaining instructions using current pred + * to new pred: + * + * TODO is there ever a case when pred isn't first + * (and only) src? + */ + if (ssa(predicated->regs[1]) == ctx->pred) { + if (!new_pred) { + new_pred = ir3_instr_clone(ctx->pred); + /* original pred is scheduled, but new one isn't: */ + new_pred->flags &= ~IR3_INSTR_MARK; + } + predicated->regs[1]->instr = new_pred; + } + } + + /* all remaining predicated remapped to new pred: */ + ctx->pred = NULL; + + return new_pred; +} + +static void +sched_block(struct ir3_sched_ctx *ctx, struct ir3_block *block) +{ + struct list_head unscheduled_list; + + ctx->block = block; + + /* addr/pred writes are per-block: */ + ctx->addr = NULL; + ctx->pred = NULL; + + /* move all instructions to the unscheduled list, and + * empty the block's instruction list (to which we will + * be inserting). 
+ */ + list_replace(&block->instr_list, &unscheduled_list); + list_inithead(&block->instr_list); + list_inithead(&ctx->depth_list); + + /* first a pre-pass to schedule all meta:input instructions + * (which need to appear first so that RA knows the register is + * occupied), and move remaining to depth sorted list: + */ + list_for_each_entry_safe (struct ir3_instruction, instr, &unscheduled_list, node) { + if (instr->opc == OPC_META_INPUT) { + schedule(ctx, instr); + } else { + ir3_insert_by_depth(instr, &ctx->depth_list); + } + } + + while (!list_empty(&ctx->depth_list)) { + struct ir3_sched_notes notes = {0}; + struct ir3_instruction *instr; + + instr = find_eligible_instr(ctx, ¬es, true); + if (!instr) + instr = find_eligible_instr(ctx, ¬es, false); + + if (instr) { + unsigned delay = delay_calc(ctx->block, instr, false, false); + + /* and if we run out of instructions that can be scheduled, + * then it is time for nop's: + */ + debug_assert(delay <= 6); + while (delay > 0) { + ir3_NOP(block); + delay--; + } + + schedule(ctx, instr); + } else { + struct ir3_instruction *new_instr = NULL; + + /* nothing available to schedule.. if we are blocked on + * address/predicate register conflict, then break the + * deadlock by cloning the instruction that wrote that + * reg: + */ + if (notes.addr_conflict) { + new_instr = split_addr(ctx); + } else if (notes.pred_conflict) { + new_instr = split_pred(ctx); + } else { + debug_assert(0); + ctx->error = true; + return; + } + + if (new_instr) { + /* clearing current addr/pred can change what is + * available to schedule, so clear cache.. + */ + clear_cache(ctx, NULL); + + ir3_insert_by_depth(new_instr, &ctx->depth_list); + /* the original instr that wrote addr/pred may have + * originated from a different block: + */ + new_instr->block = block; + } + } + } + + /* And lastly, insert branch/jump instructions to take us to + * the next block. Later we'll strip back out the branches + * that simply jump to next instruction. + */ + if (block->successors[1]) { + /* if/else, conditional branches to "then" or "else": */ + struct ir3_instruction *br; + unsigned delay = 6; + + debug_assert(ctx->pred); + debug_assert(block->condition); + + delay -= distance(ctx->block, ctx->pred, delay, false); + + while (delay > 0) { + ir3_NOP(block); + delay--; + } + + /* create "else" branch first (since "then" block should + * frequently/always end up being a fall-thru): + */ + br = ir3_BR(block); + br->cat0.inv = true; + br->cat0.target = block->successors[1]; + + /* NOTE: we have to hard code delay of 6 above, since + * we want to insert the nop's before constructing the + * branch. Throw in an assert so we notice if this + * ever breaks on future generation: + */ + debug_assert(ir3_delayslots(ctx->pred, br, 0) == 6); + + br = ir3_BR(block); + br->cat0.target = block->successors[0]; + + } else if (block->successors[0]) { + /* otherwise unconditional jump to next block: */ + struct ir3_instruction *jmp; + + jmp = ir3_JUMP(block); + jmp->cat0.target = block->successors[0]; + } + + /* NOTE: if we kept track of the predecessors, we could do a better + * job w/ (jp) flags.. every node w/ > predecessor is a join point. + * Note that as we eliminate blocks which contain only an unconditional + * jump we probably need to propagate (jp) flag.. + */ +} + +/* After scheduling individual blocks, we still could have cases where + * one (or more) paths into a block, a value produced by a previous + * has too few delay slots to be legal. 
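+ * (e.g. a value computed at the very tail of one predecessor block and
+ * consumed by the first instruction of this block)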
We can't deal with this in the + * first pass, because loops (ie. we can't ensure all predecessor blocks + * are already scheduled in the first pass). All we can really do at + * this point is stuff in extra nop's until things are legal. + */ +static void +sched_intra_block(struct ir3_sched_ctx *ctx, struct ir3_block *block) +{ + unsigned n = 0; + + ctx->block = block; + + list_for_each_entry_safe (struct ir3_instruction, instr, &block->instr_list, node) { + unsigned delay = 0; + + for (unsigned i = 0; i < block->predecessors_count; i++) { + unsigned d = delay_calc(block->predecessors[i], instr, false, true); + delay = MAX2(d, delay); + } + + while (delay > n) { + struct ir3_instruction *nop = ir3_NOP(block); + + /* move to before instr: */ + list_delinit(&nop->node); + list_addtail(&nop->node, &instr->node); + + n++; + } + + /* we can bail once we hit worst case delay: */ + if (++n > 6) + break; + } +} + +int ir3_sched(struct ir3 *ir) +{ + struct ir3_sched_ctx ctx = {0}; + + ir3_clear_mark(ir); + + list_for_each_entry (struct ir3_block, block, &ir->block_list, node) { + sched_block(&ctx, block); + } + + list_for_each_entry (struct ir3_block, block, &ir->block_list, node) { + sched_intra_block(&ctx, block); + } + + if (ctx.error) + return -1; + return 0; +} + +/* does instruction 'prior' need to be scheduled before 'instr'? */ +static bool +depends_on(struct ir3_instruction *instr, struct ir3_instruction *prior) +{ + /* TODO for dependencies that are related to a specific object, ie + * a specific SSBO/image/array, we could relax this constraint to + * make accesses to unrelated objects not depend on each other (at + * least as long as not declared coherent) + */ + if (((instr->barrier_class & IR3_BARRIER_EVERYTHING) && prior->barrier_class) || + ((prior->barrier_class & IR3_BARRIER_EVERYTHING) && instr->barrier_class)) + return true; + return !!(instr->barrier_class & prior->barrier_conflict); +} + +static void +add_barrier_deps(struct ir3_block *block, struct ir3_instruction *instr) +{ + struct list_head *prev = instr->node.prev; + struct list_head *next = instr->node.next; + + /* add dependencies on previous instructions that must be scheduled + * prior to the current instruction + */ + while (prev != &block->instr_list) { + struct ir3_instruction *pi = + LIST_ENTRY(struct ir3_instruction, prev, node); + + prev = prev->prev; + + if (is_meta(pi)) + continue; + + if (instr->barrier_class == pi->barrier_class) { + ir3_instr_add_dep(instr, pi); + break; + } + + if (depends_on(instr, pi)) + ir3_instr_add_dep(instr, pi); + } + + /* add dependencies on this instruction to following instructions + * that must be scheduled after the current instruction: + */ + while (next != &block->instr_list) { + struct ir3_instruction *ni = + LIST_ENTRY(struct ir3_instruction, next, node); + + next = next->next; + + if (is_meta(ni)) + continue; + + if (instr->barrier_class == ni->barrier_class) { + ir3_instr_add_dep(ni, instr); + break; + } + + if (depends_on(ni, instr)) + ir3_instr_add_dep(ni, instr); + } +} + +/* before scheduling a block, we need to add any necessary false-dependencies + * to ensure that: + * + * (1) barriers are scheduled in the right order wrt instructions related + * to the barrier + * + * (2) reads that come before a write actually get scheduled before the + * write + */ +static void +calculate_deps(struct ir3_block *block) +{ + list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) { + if (instr->barrier_class) { + add_barrier_deps(block, instr); + } + } +} + 
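The two entry points that follow are meant to run back-to-back: ir3_sched_add_deps() first, so the false dependencies from calculate_deps() are in place, then the per-block list scheduling in ir3_sched(). A minimal sketch of that calling sequence; the wrapper name is hypothetical and only the two ir3_* calls are from this patch:

#include "ir3.h"

/* hypothetical driver-side wrapper, for illustration only: */
static int
schedule_shader(struct ir3 *ir)
{
	/* add barrier false-deps to each block: */
	ir3_sched_add_deps(ir);

	/* list-schedule each block; returns -1 if the addr/pred
	 * deadlock fallback could not make progress:
	 */
	return ir3_sched(ir);
}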
+void
+ir3_sched_add_deps(struct ir3 *ir)
+{
+	list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
+		calculate_deps(block);
+	}
+}
diff --git a/src/freedreno/ir3/ir3_shader.c b/src/freedreno/ir3/ir3_shader.c
new file mode 100644
index 00000000000..8b18e950cca
--- /dev/null
+++ b/src/freedreno/ir3/ir3_shader.c
@@ -0,0 +1,436 @@
+/*
+ * Copyright (C) 2014 Rob Clark <[email protected]>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Authors:
+ *    Rob Clark <[email protected]>
+ */
+
+#include "util/u_string.h"
+#include "util/u_memory.h"
+#include "util/u_format.h"
+
+#include "drm/freedreno_drmif.h"
+
+#include "ir3_shader.h"
+#include "ir3_compiler.h"
+#include "ir3_nir.h"
+
+int
+ir3_glsl_type_size(const struct glsl_type *type)
+{
+	return glsl_count_attribute_slots(type, false);
+}
+
+static void
+delete_variant(struct ir3_shader_variant *v)
+{
+	if (v->ir)
+		ir3_destroy(v->ir);
+	if (v->bo)
+		fd_bo_del(v->bo);
+	if (v->immediates)
+		free(v->immediates);
+	free(v);
+}
+
+/* for vertex shaders, the inputs are loaded into registers before the shader
+ * is executed, so max_reg from the shader instructions might not properly
+ * reflect the # of registers actually used, especially in the case of
+ * passthrough varyings.
+ *
+ * Likewise, for fragment shaders, we can have some regs which are passed
+ * input values but never touched by the resulting shader (ie. as a result
+ * of dead code elimination or simply because we don't know how to turn
+ * the reg off).
+ */
+static void
+fixup_regfootprint(struct ir3_shader_variant *v)
+{
+	unsigned i;
+
+	for (i = 0; i < v->inputs_count; i++) {
+		/* skip frag inputs fetched via bary.f since their regs are
+		 * not written by the gpu before the shader starts (and in
+		 * fact the regids might not even be valid)
+		 */
+		if (v->inputs[i].bary)
+			continue;
+
+		/* ignore high regs that are global to all threads in a warp
+		 * (they exist by default) (a5xx+)
+		 */
+		if (v->inputs[i].regid >= regid(48,0))
+			continue;
+
+		if (v->inputs[i].compmask) {
+			unsigned n = util_last_bit(v->inputs[i].compmask) - 1;
+			int32_t regid = (v->inputs[i].regid + n) >> 2;
+			v->info.max_reg = MAX2(v->info.max_reg, regid);
+		}
+	}
+
+	for (i = 0; i < v->outputs_count; i++) {
+		int32_t regid = (v->outputs[i].regid + 3) >> 2;
+		v->info.max_reg = MAX2(v->info.max_reg, regid);
+	}
+}
+
+/* wrapper for ir3_assemble() which does some info fixup based on
+ * shader state.  Non-static since used by ir3_cmdline too.
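+ * (it derives instrlen from the assembled size, clamps constlen against
+ * the hw limit, and calls fixup_regfootprint())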
+ */ +void * ir3_shader_assemble(struct ir3_shader_variant *v, uint32_t gpu_id) +{ + void *bin; + + bin = ir3_assemble(v->ir, &v->info, gpu_id); + if (!bin) + return NULL; + + if (gpu_id >= 400) { + v->instrlen = v->info.sizedwords / (2 * 16); + } else { + v->instrlen = v->info.sizedwords / (2 * 4); + } + + /* NOTE: if relative addressing is used, we set constlen in + * the compiler (to worst-case value) since we don't know in + * the assembler what the max addr reg value can be: + */ + v->constlen = MIN2(255, MAX2(v->constlen, v->info.max_const + 1)); + + fixup_regfootprint(v); + + return bin; +} + +static void +assemble_variant(struct ir3_shader_variant *v) +{ + struct ir3_compiler *compiler = v->shader->compiler; + uint32_t gpu_id = compiler->gpu_id; + uint32_t sz, *bin; + + bin = ir3_shader_assemble(v, gpu_id); + sz = v->info.sizedwords * 4; + + v->bo = fd_bo_new(compiler->dev, sz, + DRM_FREEDRENO_GEM_CACHE_WCOMBINE | + DRM_FREEDRENO_GEM_TYPE_KMEM); + + memcpy(fd_bo_map(v->bo), bin, sz); + + if (ir3_shader_debug & IR3_DBG_DISASM) { + struct ir3_shader_key key = v->key; + printf("disassemble: type=%d, k={bp=%u,cts=%u,hp=%u}", v->type, + v->binning_pass, key.color_two_side, key.half_precision); + ir3_shader_disasm(v, bin, stdout); + } + + if (shader_debug_enabled(v->shader->type)) { + fprintf(stderr, "Native code for unnamed %s shader %s:\n", + _mesa_shader_stage_to_string(v->shader->type), + v->shader->nir->info.name); + if (v->shader->type == MESA_SHADER_FRAGMENT) + fprintf(stderr, "SIMD0\n"); + ir3_shader_disasm(v, bin, stderr); + } + + free(bin); + + /* no need to keep the ir around beyond this point: */ + ir3_destroy(v->ir); + v->ir = NULL; +} + +static struct ir3_shader_variant * +create_variant(struct ir3_shader *shader, struct ir3_shader_key *key, + bool binning_pass) +{ + struct ir3_shader_variant *v = CALLOC_STRUCT(ir3_shader_variant); + int ret; + + if (!v) + return NULL; + + v->id = ++shader->variant_count; + v->shader = shader; + v->binning_pass = binning_pass; + v->key = *key; + v->type = shader->type; + + ret = ir3_compile_shader_nir(shader->compiler, v); + if (ret) { + debug_error("compile failed!"); + goto fail; + } + + assemble_variant(v); + if (!v->bo) { + debug_error("assemble failed!"); + goto fail; + } + + return v; + +fail: + delete_variant(v); + return NULL; +} + +static inline struct ir3_shader_variant * +shader_variant(struct ir3_shader *shader, struct ir3_shader_key *key, + bool *created) +{ + struct ir3_shader_variant *v; + + *created = false; + + for (v = shader->variants; v; v = v->next) + if (ir3_shader_key_equal(key, &v->key)) + return v; + + /* compile new variant if it doesn't exist already: */ + v = create_variant(shader, key, false); + if (v) { + v->next = shader->variants; + shader->variants = v; + *created = true; + } + + return v; +} + +struct ir3_shader_variant * +ir3_shader_get_variant(struct ir3_shader *shader, struct ir3_shader_key *key, + bool binning_pass, bool *created) +{ + struct ir3_shader_variant *v = + shader_variant(shader, key, created); + + if (binning_pass) { + if (!v->binning) + v->binning = create_variant(shader, key, true); + return v->binning; + } + + return v; +} + +void +ir3_shader_destroy(struct ir3_shader *shader) +{ + struct ir3_shader_variant *v, *t; + for (v = shader->variants; v; ) { + t = v; + v = v->next; + delete_variant(t); + } + ralloc_free(shader->nir); + free(shader); +} + +struct ir3_shader * +ir3_shader_from_nir(struct ir3_compiler *compiler, nir_shader *nir) +{ + struct ir3_shader *shader = 
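/* note: CALLOC_STRUCT returns zeroed memory, so the variants list starts out empty */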
CALLOC_STRUCT(ir3_shader); + + shader->compiler = compiler; + shader->id = ++shader->compiler->shader_count; + shader->type = nir->info.stage; + + NIR_PASS_V(nir, nir_lower_io, nir_var_all, ir3_glsl_type_size, + (nir_lower_io_options)0); + + /* do first pass optimization, ignoring the key: */ + shader->nir = ir3_optimize_nir(shader, nir, NULL); + if (ir3_shader_debug & IR3_DBG_DISASM) { + printf("dump nir%d: type=%d", shader->id, shader->type); + nir_print_shader(shader->nir, stdout); + } + + return shader; +} + +static void dump_reg(FILE *out, const char *name, uint32_t r) +{ + if (r != regid(63,0)) + fprintf(out, "; %s: r%d.%c\n", name, r >> 2, "xyzw"[r & 0x3]); +} + +static void dump_output(FILE *out, struct ir3_shader_variant *so, + unsigned slot, const char *name) +{ + uint32_t regid; + regid = ir3_find_output_regid(so, slot); + dump_reg(out, name, regid); +} + +void +ir3_shader_disasm(struct ir3_shader_variant *so, uint32_t *bin, FILE *out) +{ + struct ir3 *ir = so->ir; + struct ir3_register *reg; + const char *type = ir3_shader_stage(so->shader); + uint8_t regid; + unsigned i; + + for (i = 0; i < ir->ninputs; i++) { + if (!ir->inputs[i]) { + fprintf(out, "; in%d unused\n", i); + continue; + } + reg = ir->inputs[i]->regs[0]; + regid = reg->num; + fprintf(out, "@in(%sr%d.%c)\tin%d\n", + (reg->flags & IR3_REG_HALF) ? "h" : "", + (regid >> 2), "xyzw"[regid & 0x3], i); + } + + for (i = 0; i < ir->noutputs; i++) { + if (!ir->outputs[i]) { + fprintf(out, "; out%d unused\n", i); + continue; + } + /* kill shows up as a virtual output.. skip it! */ + if (is_kill(ir->outputs[i])) + continue; + reg = ir->outputs[i]->regs[0]; + regid = reg->num; + fprintf(out, "@out(%sr%d.%c)\tout%d\n", + (reg->flags & IR3_REG_HALF) ? "h" : "", + (regid >> 2), "xyzw"[regid & 0x3], i); + } + + for (i = 0; i < so->immediates_count; i++) { + fprintf(out, "@const(c%d.x)\t", so->constbase.immediate + i); + fprintf(out, "0x%08x, 0x%08x, 0x%08x, 0x%08x\n", + so->immediates[i].val[0], + so->immediates[i].val[1], + so->immediates[i].val[2], + so->immediates[i].val[3]); + } + + disasm_a3xx(bin, so->info.sizedwords, 0, out); + + switch (so->type) { + case MESA_SHADER_VERTEX: + fprintf(out, "; %s: outputs:", type); + for (i = 0; i < so->outputs_count; i++) { + uint8_t regid = so->outputs[i].regid; + fprintf(out, " r%d.%c (%s)", + (regid >> 2), "xyzw"[regid & 0x3], + gl_varying_slot_name(so->outputs[i].slot)); + } + fprintf(out, "\n"); + fprintf(out, "; %s: inputs:", type); + for (i = 0; i < so->inputs_count; i++) { + uint8_t regid = so->inputs[i].regid; + fprintf(out, " r%d.%c (cm=%x,il=%u,b=%u)", + (regid >> 2), "xyzw"[regid & 0x3], + so->inputs[i].compmask, + so->inputs[i].inloc, + so->inputs[i].bary); + } + fprintf(out, "\n"); + break; + case MESA_SHADER_FRAGMENT: + fprintf(out, "; %s: outputs:", type); + for (i = 0; i < so->outputs_count; i++) { + uint8_t regid = so->outputs[i].regid; + fprintf(out, " r%d.%c (%s)", + (regid >> 2), "xyzw"[regid & 0x3], + gl_frag_result_name(so->outputs[i].slot)); + } + fprintf(out, "\n"); + fprintf(out, "; %s: inputs:", type); + for (i = 0; i < so->inputs_count; i++) { + uint8_t regid = so->inputs[i].regid; + fprintf(out, " r%d.%c (%s,cm=%x,il=%u,b=%u)", + (regid >> 2), "xyzw"[regid & 0x3], + gl_varying_slot_name(so->inputs[i].slot), + so->inputs[i].compmask, + so->inputs[i].inloc, + so->inputs[i].bary); + } + fprintf(out, "\n"); + break; + default: + /* TODO */ + break; + } + + /* print generic shader info: */ + fprintf(out, "; %s prog %d/%d: %u instructions, %d half, %d full\n", + 
type, so->shader->id, so->id, + so->info.instrs_count, + so->info.max_half_reg + 1, + so->info.max_reg + 1); + + fprintf(out, "; %d const, %u constlen\n", + so->info.max_const + 1, + so->constlen); + + fprintf(out, "; %u (ss), %u (sy)\n", so->info.ss, so->info.sy); + + /* print shader type specific info: */ + switch (so->type) { + case MESA_SHADER_VERTEX: + dump_output(out, so, VARYING_SLOT_POS, "pos"); + dump_output(out, so, VARYING_SLOT_PSIZ, "psize"); + break; + case MESA_SHADER_FRAGMENT: + dump_reg(out, "pos (bary)", + ir3_find_sysval_regid(so, SYSTEM_VALUE_VARYING_COORD)); + dump_output(out, so, FRAG_RESULT_DEPTH, "posz"); + if (so->color0_mrt) { + dump_output(out, so, FRAG_RESULT_COLOR, "color"); + } else { + dump_output(out, so, FRAG_RESULT_DATA0, "data0"); + dump_output(out, so, FRAG_RESULT_DATA1, "data1"); + dump_output(out, so, FRAG_RESULT_DATA2, "data2"); + dump_output(out, so, FRAG_RESULT_DATA3, "data3"); + dump_output(out, so, FRAG_RESULT_DATA4, "data4"); + dump_output(out, so, FRAG_RESULT_DATA5, "data5"); + dump_output(out, so, FRAG_RESULT_DATA6, "data6"); + dump_output(out, so, FRAG_RESULT_DATA7, "data7"); + } + /* these two are hard-coded since we don't know how to + * program them to anything but all 0's... + */ + if (so->frag_coord) + fprintf(out, "; fragcoord: r0.x\n"); + if (so->frag_face) + fprintf(out, "; fragface: hr0.x\n"); + break; + default: + /* TODO */ + break; + } + + fprintf(out, "\n"); +} + +uint64_t +ir3_shader_outputs(const struct ir3_shader *so) +{ + return so->nir->info.outputs_written; +} diff --git a/src/freedreno/ir3/ir3_shader.h b/src/freedreno/ir3/ir3_shader.h new file mode 100644 index 00000000000..bc47160d6ea --- /dev/null +++ b/src/freedreno/ir3/ir3_shader.h @@ -0,0 +1,587 @@ +/* + * Copyright (C) 2014 Rob Clark <[email protected]> + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + * Authors: + * Rob Clark <[email protected]> + */ + +#ifndef IR3_SHADER_H_ +#define IR3_SHADER_H_ + +#include <stdio.h> + +#include "compiler/shader_enums.h" +#include "compiler/nir/nir.h" +#include "util/bitscan.h" + +#include "ir3.h" + +struct glsl_type; + +/* driver param indices: */ +enum ir3_driver_param { + /* compute shader driver params: */ + IR3_DP_NUM_WORK_GROUPS_X = 0, + IR3_DP_NUM_WORK_GROUPS_Y = 1, + IR3_DP_NUM_WORK_GROUPS_Z = 2, + IR3_DP_LOCAL_GROUP_SIZE_X = 4, + IR3_DP_LOCAL_GROUP_SIZE_Y = 5, + IR3_DP_LOCAL_GROUP_SIZE_Z = 6, + /* NOTE: gl_NumWorkGroups should be vec4 aligned because + * glDispatchComputeIndirect() needs to load these from + * the info->indirect buffer. Keep that in mind when/if + * adding any addition CS driver params. + */ + IR3_DP_CS_COUNT = 8, /* must be aligned to vec4 */ + + /* vertex shader driver params: */ + IR3_DP_VTXID_BASE = 0, + IR3_DP_VTXCNT_MAX = 1, + /* user-clip-plane components, up to 8x vec4's: */ + IR3_DP_UCP0_X = 4, + /* .... */ + IR3_DP_UCP7_W = 35, + IR3_DP_VS_COUNT = 36 /* must be aligned to vec4 */ +}; + +#define IR3_MAX_SHADER_BUFFERS 32 +#define IR3_MAX_SHADER_IMAGES 32 +#define IR3_MAX_SO_BUFFERS 4 +#define IR3_MAX_SO_OUTPUTS 64 + +/** + * For consts needed to pass internal values to shader which may or may not + * be required, rather than allocating worst-case const space, we scan the + * shader and allocate consts as-needed: + * + * + SSBO sizes: only needed if shader has a get_buffer_size intrinsic + * for a given SSBO + * + * + Image dimensions: needed to calculate pixel offset, but only for + * images that have a image_store intrinsic + */ +struct ir3_driver_const_layout { + struct { + uint32_t mask; /* bitmask of SSBOs that have get_buffer_size */ + uint32_t count; /* number of consts allocated */ + /* one const allocated per SSBO which has get_buffer_size, + * ssbo_sizes.off[ssbo_id] is offset from start of ssbo_sizes + * consts: + */ + uint32_t off[IR3_MAX_SHADER_BUFFERS]; + } ssbo_size; + + struct { + uint32_t mask; /* bitmask of images that have image_store */ + uint32_t count; /* number of consts allocated */ + /* three const allocated per image which has image_store: + * + cpp (bytes per pixel) + * + pitch (y pitch) + * + array_pitch (z pitch) + */ + uint32_t off[IR3_MAX_SHADER_IMAGES]; + } image_dims; +}; + +/** + * A single output for vertex transform feedback. + */ +struct ir3_stream_output { + unsigned register_index:6; /**< 0 to 63 (OUT index) */ + unsigned start_component:2; /** 0 to 3 */ + unsigned num_components:3; /** 1 to 4 */ + unsigned output_buffer:3; /**< 0 to PIPE_MAX_SO_BUFFERS */ + unsigned dst_offset:16; /**< offset into the buffer in dwords */ + unsigned stream:2; /**< 0 to 3 */ +}; + +/** + * Stream output for vertex transform feedback. + */ +struct ir3_stream_output_info { + unsigned num_outputs; + /** stride for an entire vertex for each buffer in dwords */ + uint16_t stride[IR3_MAX_SO_BUFFERS]; + + /** + * Array of stream outputs, in the order they are to be written in. + * Selected components are tightly packed into the output buffer. + */ + struct ir3_stream_output output[IR3_MAX_SO_OUTPUTS]; +}; + +/* Configuration key used to identify a shader variant.. different + * shader variants can be used to implement features not supported + * in hw (two sided color), binning-pass vertex shader, etc. + */ +struct ir3_shader_key { + union { + struct { + /* + * Combined Vertex/Fragment shader parameters: + */ + unsigned ucp_enables : 8; + + /* do we need to check {v,f}saturate_{s,t,r}? 
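+			 * (if set, ir3_shader_key_equal() falls back to a full
+			 * memcmp of the key)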
*/ + unsigned has_per_samp : 1; + + /* + * Vertex shader variant parameters: + */ + unsigned vclamp_color : 1; + + /* + * Fragment shader variant parameters: + */ + unsigned color_two_side : 1; + unsigned half_precision : 1; + /* used when shader needs to handle flat varyings (a4xx) + * for front/back color inputs to frag shader: + */ + unsigned rasterflat : 1; + unsigned fclamp_color : 1; + }; + uint32_t global; + }; + + /* bitmask of sampler which needs coords clamped for vertex + * shader: + */ + uint16_t vsaturate_s, vsaturate_t, vsaturate_r; + + /* bitmask of sampler which needs coords clamped for frag + * shader: + */ + uint16_t fsaturate_s, fsaturate_t, fsaturate_r; + + /* bitmask of ms shifts */ + uint32_t vsamples, fsamples; + + /* bitmask of samplers which need astc srgb workaround: */ + uint16_t vastc_srgb, fastc_srgb; +}; + +static inline bool +ir3_shader_key_equal(struct ir3_shader_key *a, struct ir3_shader_key *b) +{ + /* slow-path if we need to check {v,f}saturate_{s,t,r} */ + if (a->has_per_samp || b->has_per_samp) + return memcmp(a, b, sizeof(struct ir3_shader_key)) == 0; + return a->global == b->global; +} + +/* will the two keys produce different lowering for a fragment shader? */ +static inline bool +ir3_shader_key_changes_fs(struct ir3_shader_key *key, struct ir3_shader_key *last_key) +{ + if (last_key->has_per_samp || key->has_per_samp) { + if ((last_key->fsaturate_s != key->fsaturate_s) || + (last_key->fsaturate_t != key->fsaturate_t) || + (last_key->fsaturate_r != key->fsaturate_r) || + (last_key->fsamples != key->fsamples) || + (last_key->fastc_srgb != key->fastc_srgb)) + return true; + } + + if (last_key->fclamp_color != key->fclamp_color) + return true; + + if (last_key->color_two_side != key->color_two_side) + return true; + + if (last_key->half_precision != key->half_precision) + return true; + + if (last_key->rasterflat != key->rasterflat) + return true; + + if (last_key->ucp_enables != key->ucp_enables) + return true; + + return false; +} + +/* will the two keys produce different lowering for a vertex shader? 
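+ * (same idea as ir3_shader_key_changes_fs above, restricted to the
+ * fields the vertex stage is keyed on)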
*/ +static inline bool +ir3_shader_key_changes_vs(struct ir3_shader_key *key, struct ir3_shader_key *last_key) +{ + if (last_key->has_per_samp || key->has_per_samp) { + if ((last_key->vsaturate_s != key->vsaturate_s) || + (last_key->vsaturate_t != key->vsaturate_t) || + (last_key->vsaturate_r != key->vsaturate_r) || + (last_key->vsamples != key->vsamples) || + (last_key->vastc_srgb != key->vastc_srgb)) + return true; + } + + if (last_key->vclamp_color != key->vclamp_color) + return true; + + if (last_key->ucp_enables != key->ucp_enables) + return true; + + return false; +} + +/* clears shader-key flags which don't apply to the given shader + * stage + */ +static inline void +ir3_normalize_key(struct ir3_shader_key *key, gl_shader_stage type) +{ + switch (type) { + case MESA_SHADER_FRAGMENT: + if (key->has_per_samp) { + key->vsaturate_s = 0; + key->vsaturate_t = 0; + key->vsaturate_r = 0; + key->vastc_srgb = 0; + key->vsamples = 0; + } + break; + case MESA_SHADER_VERTEX: + key->color_two_side = false; + key->half_precision = false; + key->rasterflat = false; + if (key->has_per_samp) { + key->fsaturate_s = 0; + key->fsaturate_t = 0; + key->fsaturate_r = 0; + key->fastc_srgb = 0; + key->fsamples = 0; + } + break; + default: + /* TODO */ + break; + } + +} + +struct ir3_shader_variant { + struct fd_bo *bo; + + /* variant id (for debug) */ + uint32_t id; + + struct ir3_shader_key key; + + /* vertex shaders can have an extra version for hwbinning pass, + * which is pointed to by so->binning: + */ + bool binning_pass; + struct ir3_shader_variant *binning; + + struct ir3_driver_const_layout const_layout; + struct ir3_info info; + struct ir3 *ir; + + /* the instructions length is in units of instruction groups + * (4 instructions for a3xx, 16 instructions for a4xx.. each + * instruction is 2 dwords): + */ + unsigned instrlen; + + /* the constants length is in units of vec4's, and is the sum of + * the uniforms and the built-in compiler constants + */ + unsigned constlen; + + /* number of uniforms (in vec4), not including built-in compiler + * constants, etc. + */ + unsigned num_uniforms; + + unsigned num_ubos; + + /* About Linkage: + * + Let the frag shader determine the position/compmask for the + * varyings, since it is the place where we know if the varying + * is actually used, and if so, which components are used. So + * what the hw calls "outloc" is taken from the "inloc" of the + * frag shader. + * + From the vert shader, we only need the output regid + */ + + bool frag_coord, frag_face, color0_mrt; + + /* NOTE: for input/outputs, slot is: + * gl_vert_attrib - for VS inputs + * gl_varying_slot - for VS output / FS input + * gl_frag_result - for FS output + */ + + /* varyings/outputs: */ + unsigned outputs_count; + struct { + uint8_t slot; + uint8_t regid; + } outputs[16 + 2]; /* +POSITION +PSIZE */ + bool writes_pos, writes_psize; + + /* attributes (VS) / varyings (FS): + * Note that sysval's should come *after* normal inputs. + */ + unsigned inputs_count; + struct { + uint8_t slot; + uint8_t regid; + uint8_t compmask; + uint8_t ncomp; + /* location of input (ie. offset passed to bary.f, etc). 
This + * matches the SP_VS_VPC_DST_REG.OUTLOCn value (a3xx and a4xx + * have the OUTLOCn value offset by 8, presumably to account + * for gl_Position/gl_PointSize) + */ + uint8_t inloc; + /* vertex shader specific: */ + bool sysval : 1; /* slot is a gl_system_value */ + /* fragment shader specific: */ + bool bary : 1; /* fetched varying (vs one loaded into reg) */ + bool rasterflat : 1; /* special handling for emit->rasterflat */ + enum glsl_interp_mode interpolate; + } inputs[16 + 2]; /* +POSITION +FACE */ + + /* sum of input components (scalar). For frag shaders, it only counts + * the varying inputs: + */ + unsigned total_in; + + /* For frag shaders, the total number of inputs (not scalar, + * ie. SP_VS_PARAM_REG.TOTALVSOUTVAR) + */ + unsigned varying_in; + + /* number of samplers/textures (which are currently 1:1): */ + int num_samp; + + /* do we have one or more SSBO instructions: */ + bool has_ssbo; + + /* do we have kill instructions: */ + bool has_kill; + + /* Layout of constant registers, each section (in vec4). Pointer size + * is 32b (a3xx, a4xx), or 64b (a5xx+), which effects the size of the + * UBO and stream-out consts. + */ + struct { + /* user const start at zero */ + unsigned ubo; + /* NOTE that a3xx might need a section for SSBO addresses too */ + unsigned ssbo_sizes; + unsigned image_dims; + unsigned driver_param; + unsigned tfbo; + unsigned immediate; + } constbase; + + unsigned immediates_count; + unsigned immediates_size; + struct { + uint32_t val[4]; + } *immediates; + + /* for astc srgb workaround, the number/base of additional + * alpha tex states we need, and index of original tex states + */ + struct { + unsigned base, count; + unsigned orig_idx[16]; + } astc_srgb; + + /* shader variants form a linked list: */ + struct ir3_shader_variant *next; + + /* replicated here to avoid passing extra ptrs everywhere: */ + gl_shader_stage type; + struct ir3_shader *shader; +}; + +struct ir3_shader { + gl_shader_stage type; + + /* shader id (for debug): */ + uint32_t id; + uint32_t variant_count; + + /* so we know when we can disable TGSI related hacks: */ + bool from_tgsi; + + struct ir3_compiler *compiler; + + struct nir_shader *nir; + struct ir3_stream_output_info stream_output; + + struct ir3_shader_variant *variants; +}; + +void * ir3_shader_assemble(struct ir3_shader_variant *v, uint32_t gpu_id); +struct ir3_shader_variant * ir3_shader_get_variant(struct ir3_shader *shader, + struct ir3_shader_key *key, bool binning_pass, bool *created); +struct ir3_shader * ir3_shader_from_nir(struct ir3_compiler *compiler, nir_shader *nir); +void ir3_shader_destroy(struct ir3_shader *shader); +void ir3_shader_disasm(struct ir3_shader_variant *so, uint32_t *bin, FILE *out); +uint64_t ir3_shader_outputs(const struct ir3_shader *so); + +int +ir3_glsl_type_size(const struct glsl_type *type); + +static inline const char * +ir3_shader_stage(struct ir3_shader *shader) +{ + switch (shader->type) { + case MESA_SHADER_VERTEX: return "VERT"; + case MESA_SHADER_FRAGMENT: return "FRAG"; + case MESA_SHADER_COMPUTE: return "CL"; + default: + unreachable("invalid type"); + return NULL; + } +} + +/* + * Helper/util: + */ + +static inline int +ir3_find_output(const struct ir3_shader_variant *so, gl_varying_slot slot) +{ + int j; + + for (j = 0; j < so->outputs_count; j++) + if (so->outputs[j].slot == slot) + return j; + + /* it seems optional to have a OUT.BCOLOR[n] for each OUT.COLOR[n] + * in the vertex shader.. 
but the fragment shader doesn't know this + * so it will always have both IN.COLOR[n] and IN.BCOLOR[n]. So + * at link time if there is no matching OUT.BCOLOR[n], we must map + * OUT.COLOR[n] to IN.BCOLOR[n]. And visa versa if there is only + * a OUT.BCOLOR[n] but no matching OUT.COLOR[n] + */ + if (slot == VARYING_SLOT_BFC0) { + slot = VARYING_SLOT_COL0; + } else if (slot == VARYING_SLOT_BFC1) { + slot = VARYING_SLOT_COL1; + } else if (slot == VARYING_SLOT_COL0) { + slot = VARYING_SLOT_BFC0; + } else if (slot == VARYING_SLOT_COL1) { + slot = VARYING_SLOT_BFC1; + } else { + return 0; + } + + for (j = 0; j < so->outputs_count; j++) + if (so->outputs[j].slot == slot) + return j; + + debug_assert(0); + + return 0; +} + +static inline int +ir3_next_varying(const struct ir3_shader_variant *so, int i) +{ + while (++i < so->inputs_count) + if (so->inputs[i].compmask && so->inputs[i].bary) + break; + return i; +} + +struct ir3_shader_linkage { + uint8_t max_loc; + uint8_t cnt; + struct { + uint8_t regid; + uint8_t compmask; + uint8_t loc; + } var[32]; +}; + +static inline void +ir3_link_add(struct ir3_shader_linkage *l, uint8_t regid, uint8_t compmask, uint8_t loc) +{ + int i = l->cnt++; + + debug_assert(i < ARRAY_SIZE(l->var)); + + l->var[i].regid = regid; + l->var[i].compmask = compmask; + l->var[i].loc = loc; + l->max_loc = MAX2(l->max_loc, loc + util_last_bit(compmask)); +} + +static inline void +ir3_link_shaders(struct ir3_shader_linkage *l, + const struct ir3_shader_variant *vs, + const struct ir3_shader_variant *fs) +{ + int j = -1, k; + + while (l->cnt < ARRAY_SIZE(l->var)) { + j = ir3_next_varying(fs, j); + + if (j >= fs->inputs_count) + break; + + if (fs->inputs[j].inloc >= fs->total_in) + continue; + + k = ir3_find_output(vs, fs->inputs[j].slot); + + ir3_link_add(l, vs->outputs[k].regid, + fs->inputs[j].compmask, fs->inputs[j].inloc); + } +} + +static inline uint32_t +ir3_find_output_regid(const struct ir3_shader_variant *so, unsigned slot) +{ + int j; + for (j = 0; j < so->outputs_count; j++) + if (so->outputs[j].slot == slot) + return so->outputs[j].regid; + return regid(63, 0); +} + +static inline uint32_t +ir3_find_sysval_regid(const struct ir3_shader_variant *so, unsigned slot) +{ + int j; + for (j = 0; j < so->inputs_count; j++) + if (so->inputs[j].sysval && (so->inputs[j].slot == slot)) + return so->inputs[j].regid; + return regid(63, 0); +} + +/* calculate register footprint in terms of half-regs (ie. one full + * reg counts as two half-regs). + */ +static inline uint32_t +ir3_shader_halfregs(const struct ir3_shader_variant *v) +{ + return (2 * (v->info.max_reg + 1)) + (v->info.max_half_reg + 1); +} + +#endif /* IR3_SHADER_H_ */ diff --git a/src/freedreno/ir3/meson.build b/src/freedreno/ir3/meson.build new file mode 100644 index 00000000000..07319dff595 --- /dev/null +++ b/src/freedreno/ir3/meson.build @@ -0,0 +1,64 @@ +# Copyright © 2018 Rob Clark + +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: + +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. 
+
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+ir3_nir_trig_c = custom_target(
+  'ir3_nir_trig.c',
+  input : 'ir3_nir_trig.py',
+  output : 'ir3_nir_trig.c',
+  command : [
+    prog_python, '@INPUT@',
+    '-p', join_paths(meson.source_root(), 'src/compiler/nir/'),
+  ],
+  capture : true,
+  depend_files : nir_algebraic_py,
+)
+
+libfreedreno_ir3_files = files(
+  'disasm-a3xx.c',
+  'instr-a3xx.h',
+  'ir3.c',
+  'ir3_compiler_nir.c',
+  'ir3_compiler.c',
+  'ir3_compiler.h',
+  'ir3_cp.c',
+  'ir3_depth.c',
+  'ir3_group.c',
+  'ir3.h',
+  'ir3_legalize.c',
+  'ir3_nir.c',
+  'ir3_nir.h',
+  'ir3_nir_lower_tg4_to_tex.c',
+  'ir3_print.c',
+  'ir3_ra.c',
+  'ir3_sched.c',
+  'ir3_shader.c',
+  'ir3_shader.h',
+)
+
+libfreedreno_ir3 = static_library(
+  'freedreno_ir3',
+  [libfreedreno_ir3_files, ir3_nir_trig_c],
+  include_directories : [inc_freedreno, inc_common],
+  c_args : [c_vis_args, no_override_init_args],
+  cpp_args : [cpp_vis_args],
+  dependencies : idep_nir_headers,
+  build_by_default : false,
+)
+
diff --git a/src/freedreno/meson.build b/src/freedreno/meson.build
index bb2cb201c0d..26ee6213890 100644
--- a/src/freedreno/meson.build
+++ b/src/freedreno/meson.build
@@ -21,3 +21,4 @@ inc_freedreno = include_directories('.')
 subdir('drm')
+subdir('ir3')
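As a usage sketch of the variant cache in ir3_shader.c from a driver's point of view: ir3_shader_from_nir() wraps and pre-optimizes the nir shader once, and ir3_shader_get_variant() compiles a variant on first use for a given key and returns the cached one on later key matches. The helper below is hypothetical; the compiler and nir objects are assumed to be supplied by the driver:

#include "ir3/ir3_shader.h"

/* hypothetical helper, for illustration only: */
static struct ir3_shader_variant *
get_default_variant(struct ir3_compiler *compiler, nir_shader *nir)
{
	struct ir3_shader *shader = ir3_shader_from_nir(compiler, nir);
	struct ir3_shader_key key = {0};
	bool created = false;

	/* non-binning variant for an all-defaults key; a repeated call
	 * with an equal key returns the cached variant:
	 */
	return ir3_shader_get_variant(shader, &key, false, &created);
}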