freedreno: move ir3 to common location

Move (most of) the ir3 compiler to src/freedreno/ir3 so that it can be re-used by some future vulkan driver. The parts that are gallium specific have been refactored out and remain in the gallium driver. Getting the move done now so that it can happen before further refactoring to support a6xx specific instructions. NOTE also removes ir3_cmdline compiler tool from autotools build since that was easier than fixing it and I normally use meson build. Waiting patiently for the day that we can remove *everything* from the autotools build. Signed-off-by: Rob Clark <[email protected]>
author: Rob Clark <[email protected]> 2018-11-10 12:05:59 -0500
committer: Rob Clark <[email protected]> 2018-11-27 15:44:02 -0500
commit: aa0fed10d3574aec8c129bace78018ae060484c0 (patch)
tree: 2fee64028d47f6112f881903848a126da35eb5ea /src/freedreno
parent: 556eec249d6d81be88389784ce5f2583712d85d5 (diff)
24 files changed, 13731 insertions, 1 deletions
diff --git a/src/freedreno/Makefile.am b/src/freedreno/Makefile.am
index 9ddc3c0ad35..8f027e34f8a 100644
--- a/src/freedreno/Makefile.am
+++ b/src/freedreno/Makefile.am
@@ -45,7 +45,8 @@ TESTS =
 BUILT_SOURCES =
 CLEANFILES =
 EXTRA_DIST = \
-	drm/meson.build
+	drm/meson.build \
+	ir3/meson.build
 
 MKDIR_GEN = $(AM_V_at)$(MKDIR_P) $(@D)
 PYTHON_GEN = $(AM_V_GEN)$(PYTHON) $(PYTHON_FLAGS)
@@ -57,3 +58,19 @@ noinst_LTLIBRARIES += libfreedreno_drm.la
 libfreedreno_drm_la_SOURCES = $(drm_SOURCES)
 libfreedreno_drm_la_CFLAGS = $(VALGRIND_CFLAGS) $(LIBDRM_CFLAGS)
 
+# ir3 shader compiler, built as a convenience library so that it can be
+# re-used outside of the gallium driver.
+noinst_LTLIBRARIES += libfreedreno_ir3.la
+
+libfreedreno_ir3_la_SOURCES = $(ir3_SOURCES) $(ir3_GENERATED_FILES)
+libfreedreno_ir3_la_CFLAGS = \
+	-I$(top_srcdir)/src/freedreno/ir3 \
+	-I$(top_builddir)/src/compiler/nir \
+	-I$(top_srcdir)/src/compiler/nir
+# Per-library variables use the canonicalized library name, i.e. the
+# "libfreedreno_ir3_la_" prefix; without "_la" automake ignores the variable.
+# NOTE(review): confirm that the final link does not also pull in libnir.la /
+# libmesautil.la a second time now that this assignment takes effect.
+libfreedreno_ir3_la_LIBADD = \
+	$(top_builddir)/src/compiler/nir/libnir.la \
+	$(top_builddir)/src/util/libmesautil.la
+
+# ir3_nir_trig.c is generated from ir3_nir_trig.py (which uses
+# nir_algebraic.py).  MKDIR_GEN is already defined earlier in this file, so
+# it is not defined a second time here.
+ir3/ir3_nir_trig.c: ir3/ir3_nir_trig.py $(top_srcdir)/src/compiler/nir/nir_algebraic.py
+	$(MKDIR_GEN)
+	$(AM_V_GEN) $(PYTHON) $(PYTHON_FLAGS) $(srcdir)/ir3/ir3_nir_trig.py -p $(top_srcdir)/src/compiler/nir > $@ || ($(RM) $@; false)
+
diff --git a/src/freedreno/Makefile.sources b/src/freedreno/Makefile.sources
index 06a1a99b9e2..1df5e6250b5 100644
--- a/src/freedreno/Makefile.sources
+++ b/src/freedreno/Makefile.sources
@@ -15,3 +15,27 @@ drm_SOURCES := \
 	drm/msm_drm.h \
 	drm/msm_ringbuffer.c
 
+ir3_SOURCES := \
+	ir3/disasm-a3xx.c \
+	ir3/instr-a3xx.h \
+	ir3/ir3.c \
+	ir3/ir3_compiler.c \
+	ir3/ir3_compiler.h \
+	ir3/ir3_compiler_nir.c \
+	ir3/ir3_cp.c \
+	ir3/ir3_depth.c \
+	ir3/ir3_group.c \
+	ir3/ir3.h \
+	ir3/ir3_legalize.c \
+	ir3/ir3_nir.c \
+	ir3/ir3_nir.h \
+	ir3/ir3_nir_lower_tg4_to_tex.c \
+	ir3/ir3_print.c \
+	ir3/ir3_ra.c \
+	ir3/ir3_sched.c \
+	ir3/ir3_shader.c \
+	ir3/ir3_shader.h
+
+ir3_GENERATED_FILES := \
+	ir3/ir3_nir_trig.c
+
diff --git a/src/freedreno/ir3/disasm-a3xx.c b/src/freedreno/ir3/disasm-a3xx.c
new file mode 100644
index 00000000000..4cf45ce9227
--- /dev/null
+++ b/src/freedreno/ir3/disasm-a3xx.c
@@ -0,0 +1,1038 @@
+/*
+ * Copyright (c) 2013 Rob Clark <[email protected]>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <stdbool.h>
+#include <string.h>
+#include <assert.h>
+
+#include <util/u_debug.h>
+
+#include "instr-a3xx.h"
+
+/* bitmask of debug flags, tested against the file-local 'debug' variable */
+enum debug_t {
+	PRINT_RAW      = 0x1,    /* dump raw hexdump */
+	PRINT_VERBOSE  = 0x2,
+};
+
+/* Active debug flags.  Nothing in this file assigns it, so it keeps its
+ * zero initial value and the PRINT_VERBOSE paths below stay disabled.
+ */
+static enum debug_t debug;
+
+/* Redirect any bare printf() in this file to debug_printf().  The printing
+ * code below uses fprintf(ctx->out, ...), which this macro does not touch.
+ */
+#define printf debug_printf
+
+/* Line prefixes indexed by disasm_ctx::level (used for the verbose
+ * per-instruction prefix in print_instr).  Slots 0..9 are that many tabs,
+ * slots 10..15 are "x".  The level is not range-checked before indexing.
+ */
+static const char *levels[] = {
+		"",
+		"\t",
+		"\t\t",
+		"\t\t\t",
+		"\t\t\t\t",
+		"\t\t\t\t\t",
+		"\t\t\t\t\t\t",
+		"\t\t\t\t\t\t\t",
+		"\t\t\t\t\t\t\t\t",
+		"\t\t\t\t\t\t\t\t\t",
+		"x",
+		"x",
+		"x",
+		"x",
+		"x",
+		"x",
+};
+
+/* Component letters, indexed by a register's 'comp' field. */
+static const char *component = "xyzw";
+
+/* Printable names of the operand types, indexed by TYPE_* value. */
+static const char *type[] = {
+		[TYPE_F16] = "f16",
+		[TYPE_F32] = "f32",
+		[TYPE_U16] = "u16",
+		[TYPE_U32] = "u32",
+		[TYPE_S16] = "s16",
+		[TYPE_S32] = "s32",
+		[TYPE_U8]  = "u8",
+		[TYPE_S8]  = "s8",
+};
+
+/* State shared by the print_* helpers while disassembling. */
+struct disasm_ctx {
+	/* stream that all disassembly text is written to */
+	FILE *out;
+	/* index into levels[] */
+	int level;
+
+	/* current instruction repeat flag: */
+	unsigned repeat;
+};
+
+/*
+ * Print one register operand to ctx->out.
+ *
+ *   full      false adds the "h" prefix to the register name
+ *   r         prints the "(r)" flag
+ *   c         selects the 'c' register-file letter instead of 'r'
+ *   im        prints reg.iim_val as a plain decimal immediate
+ *   neg/abs   print the "(neg)", "(abs)" or "(absneg)" modifier
+ *   addr_rel  prints the operand as an offset from a0.x
+ */
+static void print_reg(struct disasm_ctx *ctx, reg_t reg, bool full, bool r,
+		bool c, bool im, bool neg, bool abs, bool addr_rel)
+{
+	FILE *out = ctx->out;
+	const char file = c ? 'c' : 'r';
+	const char *half = full ? "" : "h";
+
+	// XXX I prefer - and || for neg/abs, but preserving format used
+	// by libllvm-a3xx for easy diffing..
+	if (neg)
+		fprintf(out, "%s", abs ? "(absneg)" : "(neg)");
+	else if (abs)
+		fprintf(out, "(abs)");
+
+	if (r)
+		fprintf(out, "(r)");
+
+	if (im) {
+		fprintf(out, "%d", reg.iim_val);
+		return;
+	}
+
+	if (addr_rel) {
+		/* %+d would be simpler, but the explicit sign keeps the
+		 * output diff'able with libllvm-a3xx:
+		 */
+		if (reg.iim_val < 0)
+			fprintf(out, "%s%c<a0.x - %d>", half, file, -reg.iim_val);
+		else if (reg.iim_val > 0)
+			fprintf(out, "%s%c<a0.x + %d>", half, file, reg.iim_val);
+		else
+			fprintf(out, "%s%c<a0.x>", half, file);
+		return;
+	}
+
+	/* a0 and p0 get their own names, but only in the 'r' file: */
+	if (!c && (reg.num == REG_A0)) {
+		fprintf(out, "a0.%c", component[reg.comp]);
+	} else if (!c && (reg.num == REG_P0)) {
+		fprintf(out, "p0.%c", component[reg.comp]);
+	} else {
+		fprintf(out, "%s%c%d.%c", half, file, reg.num & 0x3f, component[reg.comp]);
+	}
+}
+
+
+/* Print a destination register: print_reg() with the r, c, im, neg and
+ * abs arguments all forced to false.
+ */
+static void print_reg_dst(struct disasm_ctx *ctx, reg_t reg, bool full, bool addr_rel)
+{
+	print_reg(ctx, reg, full, false, false, false, false, false, addr_rel);
+}
+
+/* Print a source register.  Forwards every argument unchanged to
+ * print_reg(); see there for the meaning of the flags.
+ */
+static void print_reg_src(struct disasm_ctx *ctx, reg_t reg, bool full, bool r,
+		bool c, bool im, bool neg, bool abs, bool addr_rel)
+{
+	print_reg(ctx, reg, full, r, c, im, neg, abs, addr_rel);
+}
+
+/* TODO switch to using reginfo struct everywhere, since more readable
+ * than passing a bunch of bools to print_reg_src
+ */
+
+/* One operand plus the flags that print_reg() takes, bundled together. */
+struct reginfo {
+	reg_t reg;
+	bool full;      /* false prints the "h" prefix */
+	bool r;         /* prints "(r)" */
+	bool c;         /* 'c' register-file letter instead of 'r' */
+	bool im;        /* operand is printed as an immediate */
+	bool neg;       /* prints "(neg)" (or "(absneg)" together with abs) */
+	bool abs;       /* prints "(abs)" (or "(absneg)" together with neg) */
+	bool addr_rel;  /* operand is printed relative to a0.x */
+};
+
+/* Print the source operand described by *info via print_reg_src(). */
+static void print_src(struct disasm_ctx *ctx, struct reginfo *info)
+{
+	const struct reginfo ri = *info;
+
+	print_reg_src(ctx, ri.reg, ri.full, ri.r, ri.c, ri.im, ri.neg, ri.abs,
+			ri.addr_rel);
+}
+
+//static void print_dst(struct disasm_ctx *ctx, struct reginfo *info)
+//{
+//	print_reg_dst(ctx, info->reg, info->full, info->addr_rel);
+//}
+
+/*
+ * Print the operands of a category 0 instruction.  Only kill, br, jump
+ * and call print anything; other opcodes have no operands here.
+ */
+static void print_instr_cat0(struct disasm_ctx *ctx, instr_t *instr)
+{
+	instr_cat0_t *cat0 = &instr->cat0;
+	FILE *out = ctx->out;
+	const char *invert = cat0->inv ? "!" : "";
+
+	switch (cat0->opc) {
+	case OPC_JUMP:
+	case OPC_CALL:
+		fprintf(out, " #%d", cat0->a3xx.immed);
+		break;
+	case OPC_BR:
+		fprintf(out, " %sp0.%c, #%d", invert,
+				component[cat0->comp], cat0->a3xx.immed);
+		break;
+	case OPC_KILL:
+		fprintf(out, " %sp0.%c", invert, component[cat0->comp]);
+		break;
+	}
+
+	if (!(debug & PRINT_VERBOSE))
+		return;
+
+	/* in verbose mode, show the dummy fields if any of them is set: */
+	if (cat0->dummy2|cat0->dummy3|cat0->dummy4)
+		fprintf(out, "\t{0: %x,%x,%x}", cat0->dummy2, cat0->dummy3, cat0->dummy4);
+}
+
+/*
+ * Print a category 1 instruction.  Unlike the other categories the
+ * mnemonic itself (mov/cov/mova plus type suffixes) is printed here.
+ */
+static void print_instr_cat1(struct disasm_ctx *ctx, instr_t *instr)
+{
+	instr_cat1_t *cat1 = &instr->cat1;
+	FILE *out = ctx->out;
+
+	if (cat1->ul)
+		fprintf(out, "(ul)");
+
+	if (cat1->src_type != cat1->dst_type) {
+		fprintf(out, "cov.%s%s", type[cat1->src_type], type[cat1->dst_type]);
+	} else if ((cat1->src_type == TYPE_S16) && (((reg_t)cat1->dst).num == REG_A0)) {
+		/* special case (mnemonic?): same-type s16 move into a0 */
+		fprintf(out, "mova");
+	} else {
+		fprintf(out, "mov.%s%s", type[cat1->src_type], type[cat1->dst_type]);
+	}
+
+	fprintf(out, " ");
+
+	if (cat1->even)
+		fprintf(out, "(even)");
+
+	if (cat1->pos_inf)
+		fprintf(out, "(pos_infinity)");
+
+	print_reg_dst(ctx, (reg_t)(cat1->dst), type_size(cat1->dst_type) == 32,
+			cat1->dst_rel);
+
+	fprintf(out, ", ");
+
+	/* the source needs special handling that print_reg() can't do: */
+	if (cat1->src_im) {
+		/* the immediate is formatted according to the source type */
+		if (type_float(cat1->src_type))
+			fprintf(out, "(%f)", cat1->fim_val);
+		else if (type_uint(cat1->src_type))
+			fprintf(out, "0x%08x", cat1->uim_val);
+		else
+			fprintf(out, "%d", cat1->iim_val);
+	} else if (cat1->src_rel && !cat1->src_c) {
+		/* %+d would be simpler, but the explicit sign keeps the
+		 * output diff'able with libllvm-a3xx:
+		 */
+		const char file = cat1->src_rel_c ? 'c' : 'r';
+
+		if (cat1->off < 0)
+			fprintf(out, "%c<a0.x - %d>", file, -cat1->off);
+		else if (cat1->off > 0)
+			fprintf(out, "%c<a0.x + %d>", file, cat1->off);
+		else
+			fprintf(out, "%c<a0.x>", file);
+	} else {
+		print_reg_src(ctx, (reg_t)(cat1->src), type_size(cat1->src_type) == 32,
+				cat1->src_r, cat1->src_c, cat1->src_im, false, false, false);
+	}
+
+	if ((debug & PRINT_VERBOSE) && (cat1->must_be_0))
+		fprintf(out, "\t{1: %x}", cat1->must_be_0);
+}
+
+/*
+ * Print the operands of a category 2 instruction.
+ *
+ * Output: a ".<cond>" suffix for the cmps/cmpv opcodes, a space, the
+ * optional "(ei)" flag, the destination, src1 and - unless the opcode is
+ * one of the single-source ones listed below - src2.
+ */
+static void print_instr_cat2(struct disasm_ctx *ctx, instr_t *instr)
+{
+	instr_cat2_t *cat2 = &instr->cat2;
+	static const char *cond[] = {
+			"lt",
+			"le",
+			"gt",
+			"ge",
+			"eq",
+			"ne",
+			"?6?",
+	};
+	const unsigned ncond = sizeof(cond) / sizeof(cond[0]);
+
+	switch (_OPC(2, cat2->opc)) {
+	case OPC_CMPS_F:
+	case OPC_CMPS_U:
+	case OPC_CMPS_S:
+	case OPC_CMPV_F:
+	case OPC_CMPV_U:
+	case OPC_CMPV_S:
+		/* cond[] only names 7 values; guard the lookup so that a
+		 * larger encoded value cannot read past the end of the table:
+		 */
+		if (cat2->cond < ncond)
+			fprintf(ctx->out, ".%s", cond[cat2->cond]);
+		else
+			fprintf(ctx->out, ".?%u?", (unsigned)cat2->cond);
+		break;
+	}
+
+	fprintf(ctx->out, " ");
+	if (cat2->ei)
+		fprintf(ctx->out, "(ei)");
+	print_reg_dst(ctx, (reg_t)(cat2->dst), cat2->full ^ cat2->dst_half, false);
+	fprintf(ctx->out, ", ");
+
+	/* src1 is read through the c1, rel1 or plain view of the encoding,
+	 * depending on which flag is set:
+	 */
+	if (cat2->c1.src1_c) {
+		print_reg_src(ctx, (reg_t)(cat2->c1.src1), cat2->full, cat2->src1_r,
+				cat2->c1.src1_c, cat2->src1_im, cat2->src1_neg,
+				cat2->src1_abs, false);
+	} else if (cat2->rel1.src1_rel) {
+		print_reg_src(ctx, (reg_t)(cat2->rel1.src1), cat2->full, cat2->src1_r,
+				cat2->rel1.src1_c, cat2->src1_im, cat2->src1_neg,
+				cat2->src1_abs, cat2->rel1.src1_rel);
+	} else {
+		print_reg_src(ctx, (reg_t)(cat2->src1), cat2->full, cat2->src1_r,
+				false, cat2->src1_im, cat2->src1_neg,
+				cat2->src1_abs, false);
+	}
+
+	switch (_OPC(2, cat2->opc)) {
+	case OPC_ABSNEG_F:
+	case OPC_ABSNEG_S:
+	case OPC_CLZ_B:
+	case OPC_CLZ_S:
+	case OPC_SIGN_F:
+	case OPC_FLOOR_F:
+	case OPC_CEIL_F:
+	case OPC_RNDNE_F:
+	case OPC_RNDAZ_F:
+	case OPC_TRUNC_F:
+	case OPC_NOT_B:
+	case OPC_BFREV_B:
+	case OPC_SETRM:
+	case OPC_CBITS_B:
+		/* these only have one src reg */
+		break;
+	default:
+		/* src2 uses the same three-way selection as src1: */
+		fprintf(ctx->out, ", ");
+		if (cat2->c2.src2_c) {
+			print_reg_src(ctx, (reg_t)(cat2->c2.src2), cat2->full, cat2->src2_r,
+					cat2->c2.src2_c, cat2->src2_im, cat2->src2_neg,
+					cat2->src2_abs, false);
+		} else if (cat2->rel2.src2_rel) {
+			print_reg_src(ctx, (reg_t)(cat2->rel2.src2), cat2->full, cat2->src2_r,
+					cat2->rel2.src2_c, cat2->src2_im, cat2->src2_neg,
+					cat2->src2_abs, cat2->rel2.src2_rel);
+		} else {
+			print_reg_src(ctx, (reg_t)(cat2->src2), cat2->full, cat2->src2_r,
+					false, cat2->src2_im, cat2->src2_neg,
+					cat2->src2_abs, false);
+		}
+		break;
+	}
+}
+
+/*
+ * Print the operands of a category 3 instruction: dst, src1, src2, src3.
+ */
+static void print_instr_cat3(struct disasm_ctx *ctx, instr_t *instr)
+{
+	instr_cat3_t *cat3 = &instr->cat3;
+	const bool full = instr_cat3_full(cat3);
+	reg_t src1, src3;
+	bool src1_c = false, src1_rel = false;
+	bool src3_c = false, src3_rel = false;
+
+	/* src1 is read through the c1, rel1 or plain view of the encoding,
+	 * depending on which flag is set:
+	 */
+	if (cat3->c1.src1_c) {
+		src1   = (reg_t)(cat3->c1.src1);
+		src1_c = cat3->c1.src1_c;
+	} else if (cat3->rel1.src1_rel) {
+		src1     = (reg_t)(cat3->rel1.src1);
+		src1_c   = cat3->rel1.src1_c;
+		src1_rel = cat3->rel1.src1_rel;
+	} else {
+		src1 = (reg_t)(cat3->src1);
+	}
+
+	/* ... and likewise src3 through c2, rel2 or the plain view: */
+	if (cat3->c2.src3_c) {
+		src3   = (reg_t)(cat3->c2.src3);
+		src3_c = cat3->c2.src3_c;
+	} else if (cat3->rel2.src3_rel) {
+		src3     = (reg_t)(cat3->rel2.src3);
+		src3_c   = cat3->rel2.src3_c;
+		src3_rel = cat3->rel2.src3_rel;
+	} else {
+		src3 = (reg_t)(cat3->src3);
+	}
+
+	fprintf(ctx->out, " ");
+	print_reg_dst(ctx, (reg_t)(cat3->dst), full ^ cat3->dst_half, false);
+
+	fprintf(ctx->out, ", ");
+	print_reg_src(ctx, src1, full, cat3->src1_r, src1_c, false,
+			cat3->src1_neg, false, src1_rel);
+
+	fprintf(ctx->out, ", ");
+	print_reg_src(ctx, (reg_t)cat3->src2, full, cat3->src2_r, cat3->src2_c,
+			false, cat3->src2_neg, false, false);
+
+	fprintf(ctx->out, ", ");
+	print_reg_src(ctx, src3, full, cat3->src3_r, src3_c, false,
+			cat3->src3_neg, false, src3_rel);
+}
+
+/*
+ * Print the operands of a category 4 instruction: dst and a single src.
+ */
+static void print_instr_cat4(struct disasm_ctx *ctx, instr_t *instr)
+{
+	instr_cat4_t *cat4 = &instr->cat4;
+	reg_t src;
+	bool src_c = false, src_rel = false;
+
+	/* the source is read through the c, rel or plain view of the
+	 * encoding, depending on which flag is set:
+	 */
+	if (cat4->c.src_c) {
+		src   = (reg_t)(cat4->c.src);
+		src_c = cat4->c.src_c;
+	} else if (cat4->rel.src_rel) {
+		src     = (reg_t)(cat4->rel.src);
+		src_c   = cat4->rel.src_c;
+		src_rel = cat4->rel.src_rel;
+	} else {
+		src = (reg_t)(cat4->src);
+	}
+
+	fprintf(ctx->out, " ");
+	print_reg_dst(ctx, (reg_t)(cat4->dst), cat4->full ^ cat4->dst_half, false);
+	fprintf(ctx->out, ", ");
+
+	print_reg_src(ctx, src, cat4->full, cat4->src_r, src_c, cat4->src_im,
+			cat4->src_neg, cat4->src_abs, src_rel);
+
+	if (!(debug & PRINT_VERBOSE))
+		return;
+
+	/* in verbose mode, show the dummy fields if either is set: */
+	if (cat4->dummy1|cat4->dummy2)
+		fprintf(ctx->out, "\t{4: %x,%x}", cat4->dummy1, cat4->dummy2);
+}
+
+/*
+ * Print the operands of a category 5 instruction: the flag suffixes, the
+ * "(type)" and "(writemask)" prefixes, the destination, and whichever of
+ * src1 / src2 / sampler / texture the opcode takes according to info[].
+ */
+static void print_instr_cat5(struct disasm_ctx *ctx, instr_t *instr)
+{
+	/* Which operands each opcode takes, indexed by cat5->opc.  Opcodes
+	 * that are not listed are all-false.
+	 *
+	 * NOTE(review): sized 0x20 on the assumption that cat5->opc is a
+	 * 5-bit field (it is declared in instr-a3xx.h), so that every
+	 * encodable value, including 0x1f, has a slot - confirm.
+	 */
+	static const struct {
+		bool src1, src2, samp, tex;
+	} info[0x20] = {
+			[opc_op(OPC_ISAM)]     = { true,  false, true,  true,  },
+			[opc_op(OPC_ISAML)]    = { true,  true,  true,  true,  },
+			[opc_op(OPC_ISAMM)]    = { true,  false, true,  true,  },
+			[opc_op(OPC_SAM)]      = { true,  false, true,  true,  },
+			[opc_op(OPC_SAMB)]     = { true,  true,  true,  true,  },
+			[opc_op(OPC_SAML)]     = { true,  true,  true,  true,  },
+			[opc_op(OPC_SAMGQ)]    = { true,  false, true,  true,  },
+			[opc_op(OPC_GETLOD)]   = { true,  false, true,  true,  },
+			[opc_op(OPC_CONV)]     = { true,  true,  true,  true,  },
+			[opc_op(OPC_CONVM)]    = { true,  true,  true,  true,  },
+			[opc_op(OPC_GETSIZE)]  = { true,  false, false, true,  },
+			[opc_op(OPC_GETBUF)]   = { false, false, false, true,  },
+			[opc_op(OPC_GETPOS)]   = { true,  false, false, true,  },
+			[opc_op(OPC_GETINFO)]  = { false, false, false, true,  },
+			[opc_op(OPC_DSX)]      = { true,  false, false, false, },
+			[opc_op(OPC_DSY)]      = { true,  false, false, false, },
+			[opc_op(OPC_GATHER4R)] = { true,  false, true,  true,  },
+			[opc_op(OPC_GATHER4G)] = { true,  false, true,  true,  },
+			[opc_op(OPC_GATHER4B)] = { true,  false, true,  true,  },
+			[opc_op(OPC_GATHER4A)] = { true,  false, true,  true,  },
+			[opc_op(OPC_SAMGP0)]   = { true,  false, true,  true,  },
+			[opc_op(OPC_SAMGP1)]   = { true,  false, true,  true,  },
+			[opc_op(OPC_SAMGP2)]   = { true,  false, true,  true,  },
+			[opc_op(OPC_SAMGP3)]   = { true,  false, true,  true,  },
+			[opc_op(OPC_DSXPP_1)]  = { true,  false, false, false, },
+			[opc_op(OPC_DSYPP_1)]  = { true,  false, false, false, },
+			[opc_op(OPC_RGETPOS)]  = { false, false, false, false, },
+			[opc_op(OPC_RGETINFO)] = { false, false, false, false, },
+	};
+	instr_cat5_t *cat5 = &instr->cat5;
+	int i;
+
+	if (cat5->is_3d)   fprintf(ctx->out, ".3d");
+	if (cat5->is_a)    fprintf(ctx->out, ".a");
+	if (cat5->is_o)    fprintf(ctx->out, ".o");
+	if (cat5->is_p)    fprintf(ctx->out, ".p");
+	if (cat5->is_s)    fprintf(ctx->out, ".s");
+	if (cat5->is_s2en) fprintf(ctx->out, ".s2en");
+
+	fprintf(ctx->out, " ");
+
+	/* dsxpp.1 / dsypp.1 are printed without the "(type)" prefix: */
+	switch (_OPC(5, cat5->opc)) {
+	case OPC_DSXPP_1:
+	case OPC_DSYPP_1:
+		break;
+	default:
+		fprintf(ctx->out, "(%s)", type[cat5->type]);
+		break;
+	}
+
+	/* write mask, one letter per enabled component: */
+	fprintf(ctx->out, "(");
+	for (i = 0; i < 4; i++)
+		if (cat5->wrmask & (1 << i))
+			fprintf(ctx->out, "%c", component[i]);
+	fprintf(ctx->out, ")");
+
+	print_reg_dst(ctx, (reg_t)(cat5->dst), type_size(cat5->type) == 32, false);
+
+	if (info[cat5->opc].src1) {
+		fprintf(ctx->out, ", ");
+		print_reg_src(ctx, (reg_t)(cat5->src1), cat5->full, false, false, false,
+				false, false, false);
+	}
+
+	if (cat5->is_s2en) {
+		/* .s2en form: src2 and src3 are registers, and no s#/t#
+		 * operands are printed
+		 */
+		fprintf(ctx->out, ", ");
+		print_reg_src(ctx, (reg_t)(cat5->s2en.src2), cat5->full, false, false, false,
+				false, false, false);
+		fprintf(ctx->out, ", ");
+		print_reg_src(ctx, (reg_t)(cat5->s2en.src3), false, false, false, false,
+				false, false, false);
+	} else {
+		if (cat5->is_o || info[cat5->opc].src2) {
+			fprintf(ctx->out, ", ");
+			print_reg_src(ctx, (reg_t)(cat5->norm.src2), cat5->full,
+					false, false, false, false, false, false);
+		}
+		if (info[cat5->opc].samp)
+			fprintf(ctx->out, ", s#%d", cat5->norm.samp);
+		if (info[cat5->opc].tex)
+			fprintf(ctx->out, ", t#%d", cat5->norm.tex);
+	}
+
+	/* in verbose mode, show the dummy fields if any of them is set: */
+	if (debug & PRINT_VERBOSE) {
+		if (cat5->is_s2en) {
+			if (cat5->s2en.dummy1|cat5->s2en.dummy2|cat5->dummy2)
+				fprintf(ctx->out, "\t{5: %x,%x,%x}", cat5->s2en.dummy1, cat5->s2en.dummy2, cat5->dummy2);
+		} else {
+			if (cat5->norm.dummy1|cat5->dummy2)
+				fprintf(ctx->out, "\t{5: %x,%x}", cat5->norm.dummy1, cat5->dummy2);
+		}
+	}
+}
+
+/*
+ * Print the operands of a category 6 instruction.
+ *
+ * The function works in stages: pick full/half for each operand, print the
+ * opcode-specific suffixes, pick the address-space letters that wrap the
+ * operands as "X[...]", and then print the operands.  stgb/stib, the
+ * atomics, resinfo and ldgb each have their own operand layout and return
+ * early; everything else falls through to the generic dst/src1/src2 path
+ * at the end.
+ */
+static void print_instr_cat6(struct disasm_ctx *ctx, instr_t *instr)
+{
+	instr_cat6_t *cat6 = &instr->cat6;
+	char sd = 0, ss = 0;  /* dst/src address space */
+	bool nodst = false;
+	struct reginfo dst, src1, src2;
+	int src1off = 0, dstoff = 0;
+
+	memset(&dst, 0, sizeof(dst));
+	memset(&src1, 0, sizeof(src1));
+	memset(&src2, 0, sizeof(src2));
+
+	/* stage 1: which operands are printed as full vs. half registers */
+	switch (_OPC(6, cat6->opc)) {
+	case OPC_RESINFO:
+	case OPC_RESFMT:
+		dst.full  = type_size(cat6->type) == 32;
+		src1.full = type_size(cat6->type) == 32;
+		src2.full = type_size(cat6->type) == 32;
+		break;
+	case OPC_L2G:
+	case OPC_G2L:
+		dst.full = true;
+		src1.full = true;
+		src2.full = true;
+		break;
+	case OPC_STG:
+	case OPC_STL:
+	case OPC_STP:
+	case OPC_STI:
+	case OPC_STLW:
+	case OPC_STIB:
+		dst.full  = true;
+		src1.full = type_size(cat6->type) == 32;
+		src2.full = type_size(cat6->type) == 32;
+		break;
+	default:
+		dst.full  = type_size(cat6->type) == 32;
+		src1.full = true;
+		src2.full = true;
+		break;
+	}
+
+	/* stage 2: opcode suffixes (".typed", ".Nd", ".<type>", ...) */
+	switch (_OPC(6, cat6->opc)) {
+	case OPC_PREFETCH:
+		break;
+	case OPC_RESINFO:
+		fprintf(ctx->out, ".%dd", cat6->ldgb.d + 1);
+		break;
+	case OPC_LDGB:
+		fprintf(ctx->out, ".%s", cat6->ldgb.typed ? "typed" : "untyped");
+		fprintf(ctx->out, ".%dd", cat6->ldgb.d + 1);
+		fprintf(ctx->out, ".%s", type[cat6->type]);
+		fprintf(ctx->out, ".%d", cat6->ldgb.type_size + 1);
+		break;
+	case OPC_STGB:
+	case OPC_STIB:
+		fprintf(ctx->out, ".%s", cat6->stgb.typed ? "typed" : "untyped");
+		fprintf(ctx->out, ".%dd", cat6->stgb.d + 1);
+		fprintf(ctx->out, ".%s", type[cat6->type]);
+		fprintf(ctx->out, ".%d", cat6->stgb.type_size + 1);
+		break;
+	case OPC_ATOMIC_ADD:
+	case OPC_ATOMIC_SUB:
+	case OPC_ATOMIC_XCHG:
+	case OPC_ATOMIC_INC:
+	case OPC_ATOMIC_DEC:
+	case OPC_ATOMIC_CMPXCHG:
+	case OPC_ATOMIC_MIN:
+	case OPC_ATOMIC_MAX:
+	case OPC_ATOMIC_AND:
+	case OPC_ATOMIC_OR:
+	case OPC_ATOMIC_XOR:
+		/* for atomics the address space comes from the 'g' bit and is
+		 * also printed as a suffix
+		 */
+		ss = cat6->g ? 'g' : 'l';
+		fprintf(ctx->out, ".%s", cat6->ldgb.typed ? "typed" : "untyped");
+		fprintf(ctx->out, ".%dd", cat6->ldgb.d + 1);
+		fprintf(ctx->out, ".%s", type[cat6->type]);
+		fprintf(ctx->out, ".%d", cat6->ldgb.type_size + 1);
+		fprintf(ctx->out, ".%c", ss);
+		break;
+	default:
+		dst.im = cat6->g && !cat6->dst_off;
+		fprintf(ctx->out, ".%s", type[cat6->type]);
+		break;
+	}
+	fprintf(ctx->out, " ");
+
+	/* stage 3: address-space letters for the dst (sd) and src (ss)
+	 * operands of the load/store style opcodes
+	 */
+	switch (_OPC(6, cat6->opc)) {
+	case OPC_STG:
+		sd = 'g';
+		break;
+	case OPC_STP:
+		sd = 'p';
+		break;
+	case OPC_STL:
+	case OPC_STLW:
+		sd = 'l';
+		break;
+
+	case OPC_LDG:
+	case OPC_LDC:
+		ss = 'g';
+		break;
+	case OPC_LDP:
+		ss = 'p';
+		break;
+	case OPC_LDL:
+	case OPC_LDLW:
+	case OPC_LDLV:
+		ss = 'l';
+		break;
+
+	case OPC_L2G:
+		ss = 'l';
+		sd = 'g';
+		break;
+
+	case OPC_G2L:
+		ss = 'g';
+		sd = 'l';
+		break;
+
+	case OPC_PREFETCH:
+		ss = 'g';
+		nodst = true;
+		break;
+
+	case OPC_STI:
+		dst.full = false;  // XXX or inverts??
+		break;
+	}
+
+	/* stgb/stib: "g[ssbo], src1, src2, src3" */
+	if ((_OPC(6, cat6->opc) == OPC_STGB) || (_OPC(6, cat6->opc) == OPC_STIB)) {
+		struct reginfo src3;
+
+		memset(&src3, 0, sizeof(src3));
+
+		src1.reg = (reg_t)(cat6->stgb.src1);
+		src2.reg = (reg_t)(cat6->stgb.src2);
+		src2.im  = cat6->stgb.src2_im;
+		src3.reg = (reg_t)(cat6->stgb.src3);
+		src3.im  = cat6->stgb.src3_im;
+		src3.full = true;
+
+		fprintf(ctx->out, "g[%u], ", cat6->stgb.dst_ssbo);
+		print_src(ctx, &src1);
+		fprintf(ctx->out, ", ");
+		print_src(ctx, &src2);
+		fprintf(ctx->out, ", ");
+		print_src(ctx, &src3);
+
+		if (debug & PRINT_VERBOSE)
+			fprintf(ctx->out, " (pad0=%x, pad3=%x)", cat6->stgb.pad0, cat6->stgb.pad3);
+
+		return;
+	}
+
+	/* atomics: operand layout depends on the address space chosen above */
+	if (is_atomic(_OPC(6, cat6->opc))) {
+
+		src1.reg = (reg_t)(cat6->ldgb.src1);
+		src1.im  = cat6->ldgb.src1_im;
+		src2.reg = (reg_t)(cat6->ldgb.src2);
+		src2.im  = cat6->ldgb.src2_im;
+		dst.reg  = (reg_t)(cat6->ldgb.dst);
+
+		print_src(ctx, &dst);
+		fprintf(ctx->out, ", ");
+		if (ss == 'g') {
+			struct reginfo src3;
+			memset(&src3, 0, sizeof(src3));
+
+			src3.reg = (reg_t)(cat6->ldgb.src3);
+			src3.full = true;
+
+			/* For images, the ".typed" variant is used and src2 is
+			 * the ivecN coordinates, ie ivec2 for 2d.
+			 *
+			 * For SSBOs, the ".untyped" variant is used and src2 is
+			 * a simple dword offset..  src3 appears to be
+			 * uvec2(offset * 4, 0).  Not sure the point of that.
+			 */
+
+			fprintf(ctx->out, "g[%u], ", cat6->ldgb.src_ssbo);
+			print_src(ctx, &src1);  /* value */
+			fprintf(ctx->out, ", ");
+			print_src(ctx, &src2);  /* offset/coords */
+			fprintf(ctx->out, ", ");
+			print_src(ctx, &src3);  /* 64b byte offset.. */
+
+			if (debug & PRINT_VERBOSE) {
+				fprintf(ctx->out, " (pad0=%x, pad3=%x, mustbe0=%x)", cat6->ldgb.pad0,
+						cat6->ldgb.pad3, cat6->ldgb.mustbe0);
+			}
+		} else { /* ss == 'l' */
+			fprintf(ctx->out, "l[");
+			print_src(ctx, &src1);  /* simple byte offset */
+			fprintf(ctx->out, "], ");
+			print_src(ctx, &src2);  /* value */
+
+			if (debug & PRINT_VERBOSE) {
+				fprintf(ctx->out, " (src3=%x, pad0=%x, pad3=%x, mustbe0=%x)",
+						cat6->ldgb.src3, cat6->ldgb.pad0,
+						cat6->ldgb.pad3, cat6->ldgb.mustbe0);
+			}
+		}
+
+		return;
+	} else if (_OPC(6, cat6->opc) == OPC_RESINFO) {
+		/* resinfo: "dst, g[ssbo]" */
+		dst.reg  = (reg_t)(cat6->ldgb.dst);
+
+		print_src(ctx, &dst);
+		fprintf(ctx->out, ", ");
+		fprintf(ctx->out, "g[%u]", cat6->ldgb.src_ssbo);
+
+		return;
+	} else if (_OPC(6, cat6->opc) == OPC_LDGB) {
+		/* ldgb: "dst, g[ssbo], src1, src2" */
+
+		src1.reg = (reg_t)(cat6->ldgb.src1);
+		src1.im  = cat6->ldgb.src1_im;
+		src2.reg = (reg_t)(cat6->ldgb.src2);
+		src2.im  = cat6->ldgb.src2_im;
+		dst.reg  = (reg_t)(cat6->ldgb.dst);
+
+		print_src(ctx, &dst);
+		fprintf(ctx->out, ", ");
+		fprintf(ctx->out, "g[%u], ", cat6->ldgb.src_ssbo);
+		print_src(ctx, &src1);
+		fprintf(ctx->out, ", ");
+		print_src(ctx, &src2);
+
+		if (debug & PRINT_VERBOSE)
+			fprintf(ctx->out, " (pad0=%x, pad3=%x, mustbe0=%x)", cat6->ldgb.pad0, cat6->ldgb.pad3, cat6->ldgb.mustbe0);
+
+		return;
+	}
+	/* generic path: the dst_off / src_off bits select which view of the
+	 * encoding holds the operands and whether an offset is present
+	 */
+	if (cat6->dst_off) {
+		dst.reg = (reg_t)(cat6->c.dst);
+		dstoff  = cat6->c.off;
+	} else {
+		dst.reg = (reg_t)(cat6->d.dst);
+	}
+
+	if (cat6->src_off) {
+		src1.reg = (reg_t)(cat6->a.src1);
+		src1.im  = cat6->a.src1_im;
+		src2.reg = (reg_t)(cat6->a.src2);
+		src2.im  = cat6->a.src2_im;
+		src1off  = cat6->a.off;
+	} else {
+		src1.reg = (reg_t)(cat6->b.src1);
+		src1.im  = cat6->b.src1_im;
+		src2.reg = (reg_t)(cat6->b.src2);
+		src2.im  = cat6->b.src2_im;
+	}
+
+	if (!nodst) {
+		if (sd)
+			fprintf(ctx->out, "%c[", sd);
+		/* note: dst might actually be a src (ie. address to store to) */
+		print_src(ctx, &dst);
+		if (dstoff)
+			fprintf(ctx->out, "%+d", dstoff);
+		if (sd)
+			fprintf(ctx->out, "]");
+		fprintf(ctx->out, ", ");
+	}
+
+	if (ss)
+		fprintf(ctx->out, "%c[", ss);
+
+	/* can have a larger than normal immed, so hack: */
+	if (src1.im) {
+		fprintf(ctx->out, "%u", src1.reg.dummy13);
+	} else {
+		print_src(ctx, &src1);
+	}
+
+	if (src1off)
+		fprintf(ctx->out, "%+d", src1off);
+	if (ss)
+		fprintf(ctx->out, "]");
+
+	/* resfmt takes no src2.  The OPC_RESINFO label is never reached
+	 * here, since resinfo already returned above.
+	 */
+	switch (_OPC(6, cat6->opc)) {
+	case OPC_RESINFO:
+	case OPC_RESFMT:
+		break;
+	default:
+		fprintf(ctx->out, ", ");
+		print_src(ctx, &src2);
+		break;
+	}
+}
+
+/*
+ * Print the flag suffixes of a category 7 instruction: ".g" and ".l" for
+ * any opcode, plus ".r" and ".w" for fence only.
+ */
+static void print_instr_cat7(struct disasm_ctx *ctx, instr_t *instr)
+{
+	instr_cat7_t *cat7 = &instr->cat7;
+	FILE *out = ctx->out;
+
+	if (cat7->g)
+		fprintf(out, ".g");
+	if (cat7->l)
+		fprintf(out, ".l");
+
+	if (_OPC(7, cat7->opc) != OPC_FENCE)
+		return;
+
+	if (cat7->r)
+		fprintf(out, ".r");
+	if (cat7->w)
+		fprintf(out, ".w");
+}
+
+/* size of largest OPC field of all the instruction categories: */
+#define NOPC_BITS 6
+
+/*
+ * Per-opcode disassembly info.  The OPC() helper below uses the OPC_*
+ * value itself as the array designator, so the table is indexed by
+ * (category << NOPC_BITS) | opcode and has room for 3 bits of category.
+ *
+ * Slots that are not listed stay zero-initialized, i.e. have a NULL name,
+ * which print_instr() reports as "unknown(cat,opc)".  The category 1 entry
+ * has an empty name because print_instr_cat1() prints the mnemonic itself.
+ */
+static const struct opc_info {
+	uint16_t cat;
+	uint16_t opc;
+	const char *name;
+	void (*print)(struct disasm_ctx *ctx, instr_t *instr);
+} opcs[1 << (3+NOPC_BITS)] = {
+#define OPC(cat, opc, name) [(opc)] = { (cat), (opc), #name, print_instr_cat##cat }
+	/* category 0: */
+	OPC(0, OPC_NOP,          nop),
+	OPC(0, OPC_BR,           br),
+	OPC(0, OPC_JUMP,         jump),
+	OPC(0, OPC_CALL,         call),
+	OPC(0, OPC_RET,          ret),
+	OPC(0, OPC_KILL,         kill),
+	OPC(0, OPC_END,          end),
+	OPC(0, OPC_EMIT,         emit),
+	OPC(0, OPC_CUT,          cut),
+	OPC(0, OPC_CHMASK,       chmask),
+	OPC(0, OPC_CHSH,         chsh),
+	OPC(0, OPC_FLOW_REV,     flow_rev),
+
+	/* category 1: */
+	OPC(1, OPC_MOV, ),
+
+	/* category 2: */
+	OPC(2, OPC_ADD_F,        add.f),
+	OPC(2, OPC_MIN_F,        min.f),
+	OPC(2, OPC_MAX_F,        max.f),
+	OPC(2, OPC_MUL_F,        mul.f),
+	OPC(2, OPC_SIGN_F,       sign.f),
+	OPC(2, OPC_CMPS_F,       cmps.f),
+	OPC(2, OPC_ABSNEG_F,     absneg.f),
+	OPC(2, OPC_CMPV_F,       cmpv.f),
+	OPC(2, OPC_FLOOR_F,      floor.f),
+	OPC(2, OPC_CEIL_F,       ceil.f),
+	OPC(2, OPC_RNDNE_F,      rndne.f),
+	OPC(2, OPC_RNDAZ_F,      rndaz.f),
+	OPC(2, OPC_TRUNC_F,      trunc.f),
+	OPC(2, OPC_ADD_U,        add.u),
+	OPC(2, OPC_ADD_S,        add.s),
+	OPC(2, OPC_SUB_U,        sub.u),
+	OPC(2, OPC_SUB_S,        sub.s),
+	OPC(2, OPC_CMPS_U,       cmps.u),
+	OPC(2, OPC_CMPS_S,       cmps.s),
+	OPC(2, OPC_MIN_U,        min.u),
+	OPC(2, OPC_MIN_S,        min.s),
+	OPC(2, OPC_MAX_U,        max.u),
+	OPC(2, OPC_MAX_S,        max.s),
+	OPC(2, OPC_ABSNEG_S,     absneg.s),
+	OPC(2, OPC_AND_B,        and.b),
+	OPC(2, OPC_OR_B,         or.b),
+	OPC(2, OPC_NOT_B,        not.b),
+	OPC(2, OPC_XOR_B,        xor.b),
+	OPC(2, OPC_CMPV_U,       cmpv.u),
+	OPC(2, OPC_CMPV_S,       cmpv.s),
+	OPC(2, OPC_MUL_U,        mul.u),
+	OPC(2, OPC_MUL_S,        mul.s),
+	OPC(2, OPC_MULL_U,       mull.u),
+	OPC(2, OPC_BFREV_B,      bfrev.b),
+	OPC(2, OPC_CLZ_S,        clz.s),
+	OPC(2, OPC_CLZ_B,        clz.b),
+	OPC(2, OPC_SHL_B,        shl.b),
+	OPC(2, OPC_SHR_B,        shr.b),
+	OPC(2, OPC_ASHR_B,       ashr.b),
+	OPC(2, OPC_BARY_F,       bary.f),
+	OPC(2, OPC_MGEN_B,       mgen.b),
+	OPC(2, OPC_GETBIT_B,     getbit.b),
+	OPC(2, OPC_SETRM,        setrm),
+	OPC(2, OPC_CBITS_B,      cbits.b),
+	OPC(2, OPC_SHB,          shb),
+	OPC(2, OPC_MSAD,         msad),
+
+	/* category 3: */
+	OPC(3, OPC_MAD_U16,      mad.u16),
+	OPC(3, OPC_MADSH_U16,    madsh.u16),
+	OPC(3, OPC_MAD_S16,      mad.s16),
+	OPC(3, OPC_MADSH_M16,    madsh.m16),
+	OPC(3, OPC_MAD_U24,      mad.u24),
+	OPC(3, OPC_MAD_S24,      mad.s24),
+	OPC(3, OPC_MAD_F16,      mad.f16),
+	OPC(3, OPC_MAD_F32,      mad.f32),
+	OPC(3, OPC_SEL_B16,      sel.b16),
+	OPC(3, OPC_SEL_B32,      sel.b32),
+	OPC(3, OPC_SEL_S16,      sel.s16),
+	OPC(3, OPC_SEL_S32,      sel.s32),
+	OPC(3, OPC_SEL_F16,      sel.f16),
+	OPC(3, OPC_SEL_F32,      sel.f32),
+	OPC(3, OPC_SAD_S16,      sad.s16),
+	OPC(3, OPC_SAD_S32,      sad.s32),
+
+	/* category 4: */
+	OPC(4, OPC_RCP,          rcp),
+	OPC(4, OPC_RSQ,          rsq),
+	OPC(4, OPC_LOG2,         log2),
+	OPC(4, OPC_EXP2,         exp2),
+	OPC(4, OPC_SIN,          sin),
+	OPC(4, OPC_COS,          cos),
+	OPC(4, OPC_SQRT,         sqrt),
+
+	/* category 5: */
+	OPC(5, OPC_ISAM,         isam),
+	OPC(5, OPC_ISAML,        isaml),
+	OPC(5, OPC_ISAMM,        isamm),
+	OPC(5, OPC_SAM,          sam),
+	OPC(5, OPC_SAMB,         samb),
+	OPC(5, OPC_SAML,         saml),
+	OPC(5, OPC_SAMGQ,        samgq),
+	OPC(5, OPC_GETLOD,       getlod),
+	OPC(5, OPC_CONV,         conv),
+	OPC(5, OPC_CONVM,        convm),
+	OPC(5, OPC_GETSIZE,      getsize),
+	OPC(5, OPC_GETBUF,       getbuf),
+	OPC(5, OPC_GETPOS,       getpos),
+	OPC(5, OPC_GETINFO,      getinfo),
+	OPC(5, OPC_DSX,          dsx),
+	OPC(5, OPC_DSY,          dsy),
+	OPC(5, OPC_GATHER4R,     gather4r),
+	OPC(5, OPC_GATHER4G,     gather4g),
+	OPC(5, OPC_GATHER4B,     gather4b),
+	OPC(5, OPC_GATHER4A,     gather4a),
+	OPC(5, OPC_SAMGP0,       samgp0),
+	OPC(5, OPC_SAMGP1,       samgp1),
+	OPC(5, OPC_SAMGP2,       samgp2),
+	OPC(5, OPC_SAMGP3,       samgp3),
+	OPC(5, OPC_DSXPP_1,      dsxpp.1),
+	OPC(5, OPC_DSYPP_1,      dsypp.1),
+	OPC(5, OPC_RGETPOS,      rgetpos),
+	OPC(5, OPC_RGETINFO,     rgetinfo),
+
+
+	/* category 6: */
+	OPC(6, OPC_LDG,          ldg),
+	OPC(6, OPC_LDL,          ldl),
+	OPC(6, OPC_LDP,          ldp),
+	OPC(6, OPC_STG,          stg),
+	OPC(6, OPC_STL,          stl),
+	OPC(6, OPC_STP,          stp),
+	OPC(6, OPC_STI,          sti),
+	OPC(6, OPC_G2L,          g2l),
+	OPC(6, OPC_L2G,          l2g),
+	OPC(6, OPC_PREFETCH,     prefetch),
+	OPC(6, OPC_LDLW,         ldlw),
+	OPC(6, OPC_STLW,         stlw),
+	OPC(6, OPC_RESFMT,       resfmt),
+	OPC(6, OPC_RESINFO,      resinfo),
+	OPC(6, OPC_ATOMIC_ADD,     atomic.add),
+	OPC(6, OPC_ATOMIC_SUB,     atomic.sub),
+	OPC(6, OPC_ATOMIC_XCHG,    atomic.xchg),
+	OPC(6, OPC_ATOMIC_INC,     atomic.inc),
+	OPC(6, OPC_ATOMIC_DEC,     atomic.dec),
+	OPC(6, OPC_ATOMIC_CMPXCHG, atomic.cmpxchg),
+	OPC(6, OPC_ATOMIC_MIN,     atomic.min),
+	OPC(6, OPC_ATOMIC_MAX,     atomic.max),
+	OPC(6, OPC_ATOMIC_AND,     atomic.and),
+	OPC(6, OPC_ATOMIC_OR,      atomic.or),
+	OPC(6, OPC_ATOMIC_XOR,     atomic.xor),
+	OPC(6, OPC_LDGB,         ldgb),
+	OPC(6, OPC_STGB,         stgb),
+	OPC(6, OPC_STIB,         stib),
+	OPC(6, OPC_LDC,          ldc),
+	OPC(6, OPC_LDLV,         ldlv),
+
+	/* category 7: */
+	OPC(7, OPC_BAR,          bar),
+	OPC(7, OPC_FENCE,        fence),
+
+#undef OPC
+};
+
+/* Look up the opcs[] entry for an encoded instruction from its category
+ * and opcode fields.
+ */
+#define GETINFO(instr) (&(opcs[((instr)->opc_cat << NOPC_BITS) | instr_opc(instr)]))
+
+// XXX hack.. probably should move this table somewhere common:
+#include "ir3.h"
+/* Return the mnemonic of an ir3 instruction from the opcs[] table, or
+ * "??meta??" when opc_cat() reports -1 for its opcode.  The result is
+ * whatever the table holds, so it is NULL for an opcode with no entry.
+ */
+const char *ir3_instr_name(struct ir3_instruction *instr)
+{
+	const bool is_meta = (opc_cat(instr->opc) == -1);
+
+	return is_meta ? "??meta??" : opcs[instr->opc].name;
+}
+
+/*
+ * Disassemble the single instruction stored in dwords[0..1] and print it,
+ * followed by a newline, to ctx->out.  'n' is the instruction number shown
+ * in the verbose prefix.
+ *
+ * Returns true if the instruction is the category 0 'end' opcode.
+ */
+static bool print_instr(struct disasm_ctx *ctx, uint32_t *dwords, int n)
+{
+	instr_t *instr = (instr_t *)dwords;
+	const struct opc_info *info = GETINFO(instr);
+	const uint32_t opc = instr_opc(instr);
+	FILE *out = ctx->out;
+
+	if (debug & PRINT_VERBOSE)
+		fprintf(out, "%s%04d[%08xx_%08xx] ", levels[ctx->level], n, dwords[1], dwords[0]);
+
+	ctx->repeat = instr_repeat(instr);
+
+	/* NOTE: the order in which the flags are printed is a bit fugly..
+	 * but for now it matches the llvm-a3xx disassembler so that the
+	 * output is easy to diff..
+	 */
+	if (instr->sync)
+		fprintf(out, "(sy)");
+	if (instr->ss && ((instr->opc_cat <= 4) || (instr->opc_cat == 7)))
+		fprintf(out, "(ss)");
+	if (instr->jmp_tgt)
+		fprintf(out, "(jp)");
+	if (instr_sat(instr))
+		fprintf(out, "(sat)");
+	if (ctx->repeat)
+		fprintf(out, "(rpt%d)", ctx->repeat);
+	if (instr->ul && ((2 <= instr->opc_cat) && (instr->opc_cat <= 4)))
+		fprintf(out, "(ul)");
+
+	/* a table slot without a name means the opcode is not known: */
+	if (!info->name) {
+		fprintf(out, "unknown(%d,%d)", instr->opc_cat, opc);
+	} else {
+		fprintf(out, "%s", info->name);
+		info->print(ctx, instr);
+	}
+
+	fprintf(out, "\n");
+
+	return (instr->opc_cat == 0) && (opc == OPC_END);
+}
+
+/*
+ * Disassemble 'sizedwords' dwords of shader code, two dwords per
+ * instruction, and write the text to 'out'.  'level' is stored in the
+ * context and selects the prefix used by the verbose output.
+ *
+ * sizedwords must be even (asserted).  Every instruction is printed; the
+ * return value of print_instr() is not used to stop early.  Always
+ * returns 0.
+ */
+int disasm_a3xx(uint32_t *dwords, int sizedwords, int level, FILE *out)
+{
+	struct disasm_ctx ctx = {
+		.out = out,
+		.level = level,
+	};
+	int idx = 0;
+
+	assert((sizedwords % 2) == 0);
+
+	while (idx < sizedwords) {
+		print_instr(&ctx, &dwords[idx], idx / 2);
+		idx += 2;
+	}
+
+	return 0;
+}
diff --git a/src/freedreno/ir3/instr-a3xx.h b/src/freedreno/ir3/instr-a3xx.h
new file mode 100644
index 00000000000..7f60ee5fd4c
--- /dev/null
+++ b/src/freedreno/ir3/instr-a3xx.h
@@ -0,0 +1,872 @@
+/*
+ * Copyright (c) 2013 Rob Clark <[email protected]>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef INSTR_A3XX_H_
+#define INSTR_A3XX_H_
+
+#define PACKED __attribute__((__packed__))
+
+#include <stdint.h>
+#include <stdio.h>
+#include <stdbool.h>
+#include <assert.h>
+
+/* size of largest OPC field of all the instruction categories: */
+#define NOPC_BITS 6
+
+/* Pack an instruction category and its per-category opcode into a single
+ * opc_t value: the opcode sits in the low NOPC_BITS bits, the category
+ * above them.  The category is scaled by multiplication rather than
+ * shifted because the meta category is -1 and left-shifting a negative
+ * value is undefined behaviour in C; the resulting values are the same.
+ * Both arguments are parenthesized so that any expression may be passed.
+ */
+#define _OPC(cat, opc)   (((cat) * (1 << NOPC_BITS)) | (opc))
+
+/* Opcode enumeration.  Every value is built with _OPC(), i.e. the
+ * instruction category combined with the opcode number inside that
+ * category; opc_cat() and opc_op() split a value back into those two
+ * parts.  Some gaps in the per-category numbering are annotated as
+ * invalid below.  The meta opcodes at the end use category -1.
+ */
+typedef enum {
+	/* category 0: */
+	OPC_NOP             = _OPC(0, 0),
+	OPC_BR              = _OPC(0, 1),
+	OPC_JUMP            = _OPC(0, 2),
+	OPC_CALL            = _OPC(0, 3),
+	OPC_RET             = _OPC(0, 4),
+	OPC_KILL            = _OPC(0, 5),
+	OPC_END             = _OPC(0, 6),
+	OPC_EMIT            = _OPC(0, 7),
+	OPC_CUT             = _OPC(0, 8),
+	OPC_CHMASK          = _OPC(0, 9),
+	OPC_CHSH            = _OPC(0, 10),
+	OPC_FLOW_REV        = _OPC(0, 11),
+
+	/* category 1: */
+	OPC_MOV             = _OPC(1, 0),
+
+	/* category 2: */
+	OPC_ADD_F           = _OPC(2, 0),
+	OPC_MIN_F           = _OPC(2, 1),
+	OPC_MAX_F           = _OPC(2, 2),
+	OPC_MUL_F           = _OPC(2, 3),
+	OPC_SIGN_F          = _OPC(2, 4),
+	OPC_CMPS_F          = _OPC(2, 5),
+	OPC_ABSNEG_F        = _OPC(2, 6),
+	OPC_CMPV_F          = _OPC(2, 7),
+	/* 8 - invalid */
+	OPC_FLOOR_F         = _OPC(2, 9),
+	OPC_CEIL_F          = _OPC(2, 10),
+	OPC_RNDNE_F         = _OPC(2, 11),
+	OPC_RNDAZ_F         = _OPC(2, 12),
+	OPC_TRUNC_F         = _OPC(2, 13),
+	/* 14-15 - invalid */
+	OPC_ADD_U           = _OPC(2, 16),
+	OPC_ADD_S           = _OPC(2, 17),
+	OPC_SUB_U           = _OPC(2, 18),
+	OPC_SUB_S           = _OPC(2, 19),
+	OPC_CMPS_U          = _OPC(2, 20),
+	OPC_CMPS_S          = _OPC(2, 21),
+	OPC_MIN_U           = _OPC(2, 22),
+	OPC_MIN_S           = _OPC(2, 23),
+	OPC_MAX_U           = _OPC(2, 24),
+	OPC_MAX_S           = _OPC(2, 25),
+	OPC_ABSNEG_S        = _OPC(2, 26),
+	/* 27 - invalid */
+	OPC_AND_B           = _OPC(2, 28),
+	OPC_OR_B            = _OPC(2, 29),
+	OPC_NOT_B           = _OPC(2, 30),
+	OPC_XOR_B           = _OPC(2, 31),
+	/* 32 - invalid */
+	OPC_CMPV_U          = _OPC(2, 33),
+	OPC_CMPV_S          = _OPC(2, 34),
+	/* 35-47 - invalid */
+	OPC_MUL_U           = _OPC(2, 48),
+	OPC_MUL_S           = _OPC(2, 49),
+	OPC_MULL_U          = _OPC(2, 50),
+	OPC_BFREV_B         = _OPC(2, 51),
+	OPC_CLZ_S           = _OPC(2, 52),
+	OPC_CLZ_B           = _OPC(2, 53),
+	OPC_SHL_B           = _OPC(2, 54),
+	OPC_SHR_B           = _OPC(2, 55),
+	OPC_ASHR_B          = _OPC(2, 56),
+	OPC_BARY_F          = _OPC(2, 57),
+	OPC_MGEN_B          = _OPC(2, 58),
+	OPC_GETBIT_B        = _OPC(2, 59),
+	OPC_SETRM           = _OPC(2, 60),
+	OPC_CBITS_B         = _OPC(2, 61),
+	OPC_SHB             = _OPC(2, 62),
+	OPC_MSAD            = _OPC(2, 63),
+
+	/* category 3: */
+	OPC_MAD_U16         = _OPC(3, 0),
+	OPC_MADSH_U16       = _OPC(3, 1),
+	OPC_MAD_S16         = _OPC(3, 2),
+	OPC_MADSH_M16       = _OPC(3, 3),   /* should this be .s16? */
+	OPC_MAD_U24         = _OPC(3, 4),
+	OPC_MAD_S24         = _OPC(3, 5),
+	OPC_MAD_F16         = _OPC(3, 6),
+	OPC_MAD_F32         = _OPC(3, 7),
+	OPC_SEL_B16         = _OPC(3, 8),
+	OPC_SEL_B32         = _OPC(3, 9),
+	OPC_SEL_S16         = _OPC(3, 10),
+	OPC_SEL_S32         = _OPC(3, 11),
+	OPC_SEL_F16         = _OPC(3, 12),
+	OPC_SEL_F32         = _OPC(3, 13),
+	OPC_SAD_S16         = _OPC(3, 14),
+	OPC_SAD_S32         = _OPC(3, 15),
+
+	/* category 4: */
+	OPC_RCP             = _OPC(4, 0),
+	OPC_RSQ             = _OPC(4, 1),
+	OPC_LOG2            = _OPC(4, 2),
+	OPC_EXP2            = _OPC(4, 3),
+	OPC_SIN             = _OPC(4, 4),
+	OPC_COS             = _OPC(4, 5),
+	OPC_SQRT            = _OPC(4, 6),
+	// 7-63 - invalid
+
+	/* category 5: */
+	OPC_ISAM            = _OPC(5, 0),
+	OPC_ISAML           = _OPC(5, 1),
+	OPC_ISAMM           = _OPC(5, 2),
+	OPC_SAM             = _OPC(5, 3),
+	OPC_SAMB            = _OPC(5, 4),
+	OPC_SAML            = _OPC(5, 5),
+	OPC_SAMGQ           = _OPC(5, 6),
+	OPC_GETLOD          = _OPC(5, 7),
+	OPC_CONV            = _OPC(5, 8),
+	OPC_CONVM           = _OPC(5, 9),
+	OPC_GETSIZE         = _OPC(5, 10),
+	OPC_GETBUF          = _OPC(5, 11),
+	OPC_GETPOS          = _OPC(5, 12),
+	OPC_GETINFO         = _OPC(5, 13),
+	OPC_DSX             = _OPC(5, 14),
+	OPC_DSY             = _OPC(5, 15),
+	OPC_GATHER4R        = _OPC(5, 16),
+	OPC_GATHER4G        = _OPC(5, 17),
+	OPC_GATHER4B        = _OPC(5, 18),
+	OPC_GATHER4A        = _OPC(5, 19),
+	OPC_SAMGP0          = _OPC(5, 20),
+	OPC_SAMGP1          = _OPC(5, 21),
+	OPC_SAMGP2          = _OPC(5, 22),
+	OPC_SAMGP3          = _OPC(5, 23),
+	OPC_DSXPP_1         = _OPC(5, 24),
+	OPC_DSYPP_1         = _OPC(5, 25),
+	OPC_RGETPOS         = _OPC(5, 26),
+	OPC_RGETINFO        = _OPC(5, 27),
+
+	/* category 6: */
+	OPC_LDG             = _OPC(6, 0),        /* load-global */
+	OPC_LDL             = _OPC(6, 1),
+	OPC_LDP             = _OPC(6, 2),
+	OPC_STG             = _OPC(6, 3),        /* store-global */
+	OPC_STL             = _OPC(6, 4),
+	OPC_STP             = _OPC(6, 5),
+	OPC_STI             = _OPC(6, 6),
+	OPC_G2L             = _OPC(6, 7),
+	OPC_L2G             = _OPC(6, 8),
+	OPC_PREFETCH        = _OPC(6, 9),
+	OPC_LDLW            = _OPC(6, 10),
+	OPC_STLW            = _OPC(6, 11),
+	OPC_RESFMT          = _OPC(6, 14),
+	OPC_RESINFO         = _OPC(6, 15),
+	OPC_ATOMIC_ADD      = _OPC(6, 16),
+	OPC_ATOMIC_SUB      = _OPC(6, 17),
+	OPC_ATOMIC_XCHG     = _OPC(6, 18),
+	OPC_ATOMIC_INC      = _OPC(6, 19),
+	OPC_ATOMIC_DEC      = _OPC(6, 20),
+	OPC_ATOMIC_CMPXCHG  = _OPC(6, 21),
+	OPC_ATOMIC_MIN      = _OPC(6, 22),
+	OPC_ATOMIC_MAX      = _OPC(6, 23),
+	OPC_ATOMIC_AND      = _OPC(6, 24),
+	OPC_ATOMIC_OR       = _OPC(6, 25),
+	OPC_ATOMIC_XOR      = _OPC(6, 26),
+	OPC_LDGB            = _OPC(6, 27),
+	OPC_STGB            = _OPC(6, 28),
+	OPC_STIB            = _OPC(6, 29),
+	OPC_LDC             = _OPC(6, 30),
+	OPC_LDLV            = _OPC(6, 31),
+
+	/* category 7: */
+	OPC_BAR             = _OPC(7, 0),
+	OPC_FENCE           = _OPC(7, 1),
+
+	/* meta instructions (category -1): */
+	/* placeholder instr to mark shader inputs: */
+	OPC_META_INPUT      = _OPC(-1, 0),
+	/* The "fan-in" and "fan-out" instructions are used for keeping
+	 * track of instructions that write to multiple dst registers
+	 * (fan-out) like texture sample instructions, or read multiple
+	 * consecutive scalar registers (fan-in) (bary.f, texture samp)
+	 */
+	OPC_META_FO         = _OPC(-1, 2),
+	OPC_META_FI         = _OPC(-1, 3),
+
+} opc_t;
+
+/* Split an opc_t built by _OPC(): opc_cat() yields the category and
+ * opc_op() the opcode number within the category (the low NOPC_BITS
+ * bits).  For the meta opcodes opc_cat() is expected to give -1, which
+ * depends on >> of a negative value behaving as an arithmetic shift.
+ */
+#define opc_cat(opc) ((int)((opc) >> NOPC_BITS))
+#define opc_op(opc)  ((unsigned)((opc) & ((1 << NOPC_BITS) - 1)))
+
+/* Operand data types.  The values 0..7 fit the 3-bit type fields of the
+ * instruction encodings (e.g. src_type/dst_type in instr_cat1_t and type
+ * in instr_cat5_t); type_size() gives the width of each type in bits.
+ */
+typedef enum {
+	TYPE_F16 = 0,
+	TYPE_F32 = 1,
+	TYPE_U16 = 2,
+	TYPE_U32 = 3,
+	TYPE_S16 = 4,
+	TYPE_S32 = 5,
+	TYPE_U8  = 6,
+	TYPE_S8  = 7,  // XXX I assume?
+} type_t;
+
+/* Width in bits of a value of the given type: 32, 16 or 8.  A value
+ * outside the type_t enumeration trips an assert; when asserts are
+ * compiled out it yields 0.
+ */
+static inline uint32_t type_size(type_t type)
+{
+	if ((type == TYPE_F32) || (type == TYPE_U32) || (type == TYPE_S32))
+		return 32;
+
+	if ((type == TYPE_F16) || (type == TYPE_U16) || (type == TYPE_S16))
+		return 16;
+
+	if ((type == TYPE_U8) || (type == TYPE_S8))
+		return 8;
+
+	assert(0); /* invalid type */
+	return 0;
+}
+
+/* Returns 1 for the floating point types (TYPE_F16, TYPE_F32), else 0. */
+static inline int type_float(type_t type)
+{
+	switch (type) {
+	case TYPE_F16:
+	case TYPE_F32:
+		return 1;
+	default:
+		return 0;
+	}
+}
+
+/* Returns 1 for the unsigned integer types (TYPE_U8, TYPE_U16,
+ * TYPE_U32), else 0.
+ */
+static inline int type_uint(type_t type)
+{
+	switch (type) {
+	case TYPE_U8:
+	case TYPE_U16:
+	case TYPE_U32:
+		return 1;
+	default:
+		return 0;
+	}
+}
+
+/* Returns 1 for the signed integer types (TYPE_S8, TYPE_S16, TYPE_S32),
+ * else 0.
+ */
+static inline int type_sint(type_t type)
+{
+	switch (type) {
+	case TYPE_S8:
+	case TYPE_S16:
+	case TYPE_S32:
+		return 1;
+	default:
+		return 0;
+	}
+}
+
+/* A register operand as it is packed into an instruction field.  All
+ * members overlay the same low bits, so the union can be filled through
+ * one view and read back through another (reg() in ir3.c fills comp/num,
+ * iim_val or idummy10 and returns dummy32).
+ */
+typedef union PACKED {
+	/* normal gpr or const src register: */
+	struct PACKED {
+		uint32_t comp  : 2;   /* component within the register (reg() stores num & 3) */
+		uint32_t num   : 10;  /* register number (reg() stores num >> 2) */
+	};
+	/* for immediate val: */
+	int32_t  iim_val   : 11;
+	/* to make compiler happy: */
+	uint32_t dummy32;          /* the whole packed value */
+	uint32_t dummy10   : 10;
+	int32_t  idummy10  : 10;  /* signed view; reg() stores the relative-address offset here */
+	uint32_t dummy11   : 11;
+	uint32_t dummy12   : 12;
+	uint32_t dummy13   : 13;
+	uint32_t dummy8    : 8;
+} reg_t;
+
+/* special registers: */
+#define REG_A0 61       /* address register */
+#define REG_P0 62       /* predicate register */
+
+/* Returns 1 when the register number is one of the special registers,
+ * REG_A0 (address) or REG_P0 (predicate), else 0.
+ */
+static inline int reg_special(reg_t reg)
+{
+	switch (reg.num) {
+	case REG_A0:
+	case REG_P0:
+		return 1;
+	default:
+		return 0;
+	}
+}
+
+/* Encoding of a category 0 instruction (OPC_NOP .. OPC_FLOW_REV).
+ * dword0 carries the signed immediate, whose width depends on the GPU
+ * generation (16 bits in the a3xx view, 20 in a4xx, 32 in a5xx); dword1
+ * carries the flags, the 4-bit opcode and the category.
+ */
+typedef struct PACKED {
+	/* dword0: */
+	union PACKED {
+		struct PACKED {
+			int16_t  immed    : 16;
+			uint32_t dummy1   : 16;
+		} a3xx;
+		struct PACKED {
+			int32_t  immed    : 20;
+			uint32_t dummy1   : 12;
+		} a4xx;
+		struct PACKED {
+			int32_t immed     : 32;
+		} a5xx;
+	};
+
+	/* dword1: */
+	uint32_t dummy2   : 8;
+	uint32_t repeat   : 3;
+	uint32_t dummy3   : 1;
+	uint32_t ss       : 1;
+	uint32_t dummy4   : 7;
+	uint32_t inv      : 1;
+	uint32_t comp     : 2;
+	uint32_t opc      : 4;
+	uint32_t jmp_tgt  : 1;
+	uint32_t sync     : 1;
+	uint32_t opc_cat  : 3;
+} instr_cat0_t;
+
+/* Encoding of a category 1 instruction (OPC_MOV, the only opcode in the
+ * category; there is no opc field and instr_opc() reports 0).  dword0 is
+ * the source, viewed as a register, an address-relative offset or a
+ * 32-bit immediate; dword1 holds the destination, the source and
+ * destination types and the flags.
+ */
+typedef struct PACKED {
+	/* dword0: */
+	union PACKED {
+		/* for normal src register: */
+		struct PACKED {
+			uint32_t src : 11;
+			/* at least low bit of pad must be zero or it will
+			 * look like a address relative src
+			 */
+			uint32_t pad : 21;
+		};
+		/* for address relative: */
+		struct PACKED {
+			int32_t  off : 10;
+			uint32_t src_rel_c : 1;
+			uint32_t src_rel : 1;
+			uint32_t unknown : 20;
+		};
+		/* for immediate: */
+		int32_t  iim_val;
+		uint32_t uim_val;
+		float    fim_val;
+	};
+
+	/* dword1: */
+	uint32_t dst        : 8;
+	uint32_t repeat     : 3;
+	uint32_t src_r      : 1;
+	uint32_t ss         : 1;
+	uint32_t ul         : 1;
+	uint32_t dst_type   : 3;
+	uint32_t dst_rel    : 1;
+	uint32_t src_type   : 3;
+	uint32_t src_c      : 1;
+	uint32_t src_im     : 1;
+	uint32_t even       : 1;
+	uint32_t pos_inf    : 1;
+	uint32_t must_be_0  : 2;
+	uint32_t jmp_tgt    : 1;
+	uint32_t sync       : 1;
+	uint32_t opc_cat    : 3;
+} instr_cat1_t;
+
+/* Encoding of a category 2 instruction (OPC_ADD_F .. OPC_MSAD).
+ * dword0 holds two 16-bit source descriptors.  Each one has three
+ * overlapping views: the plain/immediate form, the address-relative
+ * form (rel1/rel2) and the const form (c1/c2).  dword1 holds the
+ * destination, the flags, the condition and the 6-bit opcode.
+ */
+typedef struct PACKED {
+	/* dword0: */
+	union PACKED {
+		struct PACKED {
+			uint32_t src1         : 11;
+			uint32_t must_be_zero1: 2;
+			uint32_t src1_im      : 1;   /* immediate */
+			uint32_t src1_neg     : 1;   /* negate */
+			uint32_t src1_abs     : 1;   /* absolute value */
+		};
+		struct PACKED {
+			uint32_t src1         : 10;
+			uint32_t src1_c       : 1;   /* relative-const */
+			uint32_t src1_rel     : 1;   /* relative address */
+			uint32_t must_be_zero : 1;
+			uint32_t dummy        : 3;
+		} rel1;
+		struct PACKED {
+			uint32_t src1         : 12;
+			uint32_t src1_c       : 1;   /* const */
+			uint32_t dummy        : 3;
+		} c1;
+	};
+
+	union PACKED {
+		struct PACKED {
+			uint32_t src2         : 11;
+			uint32_t must_be_zero2: 2;
+			uint32_t src2_im      : 1;   /* immediate */
+			uint32_t src2_neg     : 1;   /* negate */
+			uint32_t src2_abs     : 1;   /* absolute value */
+		};
+		struct PACKED {
+			uint32_t src2         : 10;
+			uint32_t src2_c       : 1;   /* relative-const */
+			uint32_t src2_rel     : 1;   /* relative address */
+			uint32_t must_be_zero : 1;
+			uint32_t dummy        : 3;
+		} rel2;
+		struct PACKED {
+			uint32_t src2         : 12;
+			uint32_t src2_c       : 1;   /* const */
+			uint32_t dummy        : 3;
+		} c2;
+	};
+
+	/* dword1: */
+	uint32_t dst      : 8;
+	uint32_t repeat   : 2;
+	uint32_t sat      : 1;
+	uint32_t src1_r   : 1;
+	uint32_t ss       : 1;
+	uint32_t ul       : 1;   /* dunno */
+	uint32_t dst_half : 1;   /* or widen/narrow.. ie. dst hrN <-> rN */
+	uint32_t ei       : 1;
+	uint32_t cond     : 3;
+	uint32_t src2_r   : 1;
+	uint32_t full     : 1;   /* not half */
+	uint32_t opc      : 6;
+	uint32_t jmp_tgt  : 1;
+	uint32_t sync     : 1;
+	uint32_t opc_cat  : 3;
+} instr_cat2_t;
+
+/* Encoding of a category 3 instruction (OPC_MAD_U16 .. OPC_SAD_S32),
+ * which has three sources.  dword0 holds the 16-bit descriptors for src1
+ * and src3, each with plain, address-relative (rel1/rel2) and const
+ * (c1/c2) views; src2 is an 8-bit field in dword1.  Note that the flag
+ * bits belonging to src2 (src2_c, src2_r, src2_neg) live inside the src1
+ * and src3 descriptors.  The opcode is 4 bits wide.
+ */
+typedef struct PACKED {
+	/* dword0: */
+	union PACKED {
+		struct PACKED {
+			uint32_t src1         : 11;
+			uint32_t must_be_zero1: 2;
+			uint32_t src2_c       : 1;
+			uint32_t src1_neg     : 1;
+			uint32_t src2_r       : 1;
+		};
+		struct PACKED {
+			uint32_t src1         : 10;
+			uint32_t src1_c       : 1;
+			uint32_t src1_rel     : 1;
+			uint32_t must_be_zero : 1;
+			uint32_t dummy        : 3;
+		} rel1;
+		struct PACKED {
+			uint32_t src1         : 12;
+			uint32_t src1_c       : 1;
+			uint32_t dummy        : 3;
+		} c1;
+	};
+
+	union PACKED {
+		struct PACKED {
+			uint32_t src3         : 11;
+			uint32_t must_be_zero2: 2;
+			uint32_t src3_r       : 1;
+			uint32_t src2_neg     : 1;
+			uint32_t src3_neg     : 1;
+		};
+		struct PACKED {
+			uint32_t src3         : 10;
+			uint32_t src3_c       : 1;
+			uint32_t src3_rel     : 1;
+			uint32_t must_be_zero : 1;
+			uint32_t dummy        : 3;
+		} rel2;
+		struct PACKED {
+			uint32_t src3         : 12;
+			uint32_t src3_c       : 1;
+			uint32_t dummy        : 3;
+		} c2;
+	};
+
+	/* dword1: */
+	uint32_t dst      : 8;
+	uint32_t repeat   : 2;
+	uint32_t sat      : 1;
+	uint32_t src1_r   : 1;
+	uint32_t ss       : 1;
+	uint32_t ul       : 1;
+	uint32_t dst_half : 1;   /* or widen/narrow.. ie. dst hrN <-> rN */
+	uint32_t src2     : 8;
+	uint32_t opc      : 4;
+	uint32_t jmp_tgt  : 1;
+	uint32_t sync     : 1;
+	uint32_t opc_cat  : 3;
+} instr_cat3_t;
+
+/* Tells whether an encoded category 3 instruction is treated as
+ * operating on full registers.  The encoding has no explicit flag for
+ * this, so it is derived from the opcode: the 16-bit variants (and
+ * OPC_SAD_S32, which the original author was unsure about) report
+ * false, everything else reports true.
+ */
+static inline bool instr_cat3_full(instr_cat3_t *cat3)
+{
+	const int opc = _OPC(3, cat3->opc);
+	const bool is_half_mad = (opc == OPC_MAD_F16) || (opc == OPC_MAD_U16) ||
+			(opc == OPC_MAD_S16);
+	const bool is_half_sel = (opc == OPC_SEL_B16) || (opc == OPC_SEL_S16) ||
+			(opc == OPC_SEL_F16);
+	const bool is_half_sad = (opc == OPC_SAD_S16) ||
+			(opc == OPC_SAD_S32);  // really??
+
+	return !(is_half_mad || is_half_sel || is_half_sad);
+}
+
+/* Encoding of a category 4 instruction (OPC_RCP .. OPC_SQRT), which has
+ * a single source.  The low half of dword0 is the source descriptor with
+ * plain/immediate, address-relative (rel) and const (c) views; the high
+ * half is unused.  dword1 holds the destination, the flags and the
+ * 6-bit opcode.
+ */
+typedef struct PACKED {
+	/* dword0: */
+	union PACKED {
+		struct PACKED {
+			uint32_t src          : 11;
+			uint32_t must_be_zero1: 2;
+			uint32_t src_im       : 1;   /* immediate */
+			uint32_t src_neg      : 1;   /* negate */
+			uint32_t src_abs      : 1;   /* absolute value */
+		};
+		struct PACKED {
+			uint32_t src          : 10;
+			uint32_t src_c        : 1;   /* relative-const */
+			uint32_t src_rel      : 1;   /* relative address */
+			uint32_t must_be_zero : 1;
+			uint32_t dummy        : 3;
+		} rel;
+		struct PACKED {
+			uint32_t src          : 12;
+			uint32_t src_c        : 1;   /* const */
+			uint32_t dummy        : 3;
+		} c;
+	};
+	uint32_t dummy1   : 16;  /* seem to be ignored */
+
+	/* dword1: */
+	uint32_t dst      : 8;
+	uint32_t repeat   : 2;
+	uint32_t sat      : 1;
+	uint32_t src_r    : 1;
+	uint32_t ss       : 1;
+	uint32_t ul       : 1;
+	uint32_t dst_half : 1;   /* or widen/narrow.. ie. dst hrN <-> rN */
+	uint32_t dummy2   : 5;   /* seem to be ignored */
+	uint32_t full     : 1;   /* not half */
+	uint32_t opc      : 6;
+	uint32_t jmp_tgt  : 1;
+	uint32_t sync     : 1;
+	uint32_t opc_cat  : 3;
+} instr_cat4_t;
+
+/* Encoding of a category 5 instruction (OPC_ISAM .. OPC_RGETINFO).
+ * dword0 has two layouts, `norm` and `s2en`, plus an anonymous view of
+ * the leading fields they share (presumably the is_s2en bit in dword1
+ * selects between them -- confirm against the emit/disasm code).
+ * dword1 holds the destination, write-mask, type, the is_* flags and
+ * the 5-bit opcode.
+ */
+typedef struct PACKED {
+	/* dword0: */
+	union PACKED {
+		/* normal case: */
+		struct PACKED {
+			uint32_t full     : 1;   /* not half */
+			uint32_t src1     : 8;
+			uint32_t src2     : 8;
+			uint32_t dummy1   : 4;   /* seem to be ignored */
+			uint32_t samp     : 4;
+			uint32_t tex      : 7;
+		} norm;
+		/* s2en case: */
+		struct PACKED {
+			uint32_t full     : 1;   /* not half */
+			uint32_t src1     : 8;
+			uint32_t src2     : 11;
+			uint32_t dummy1   : 1;
+			uint32_t src3     : 8;
+			uint32_t dummy2   : 3;
+		} s2en;
+		/* same in either case: */
+		// XXX I think, confirm this
+		struct PACKED {
+			uint32_t full     : 1;   /* not half */
+			uint32_t src1     : 8;
+			uint32_t pad      : 23;
+		};
+	};
+
+	/* dword1: */
+	uint32_t dst      : 8;
+	uint32_t wrmask   : 4;   /* write-mask */
+	uint32_t type     : 3;
+	uint32_t dummy2   : 1;   /* seems to be ignored */
+	uint32_t is_3d    : 1;
+
+	uint32_t is_a     : 1;
+	uint32_t is_s     : 1;
+	uint32_t is_s2en  : 1;
+	uint32_t is_o     : 1;
+	uint32_t is_p     : 1;
+
+	uint32_t opc      : 5;
+	uint32_t jmp_tgt  : 1;
+	uint32_t sync     : 1;
+	uint32_t opc_cat  : 3;
+} instr_cat5_t;
+
+/* dword0 encoding for src_off: [src1 + off], src2:
+ *
+ * One of the category 6 views collected in instr_cat6_t.  `mustbe1` is
+ * the bit that the generic view names src_off; dword1 is left opaque
+ * here.
+ */
+typedef struct PACKED {
+	/* dword0: */
+	uint32_t mustbe1  : 1;
+	int32_t  off      : 13;
+	uint32_t src1     : 8;
+	uint32_t src1_im  : 1;
+	uint32_t src2_im  : 1;
+	uint32_t src2     : 8;
+
+	/* dword1: */
+	uint32_t dword1;
+} instr_cat6a_t;
+
+/* dword0 encoding for !src_off: [src1], src2
+ *
+ * One of the category 6 views collected in instr_cat6_t.  `mustbe0` is
+ * the bit that the generic view names src_off; dword1 is left opaque
+ * here.
+ */
+typedef struct PACKED {
+	/* dword0: */
+	uint32_t mustbe0  : 1;
+	uint32_t src1     : 13;
+	uint32_t ignore0  : 8;
+	uint32_t src1_im  : 1;
+	uint32_t src2_im  : 1;
+	uint32_t src2     : 8;
+
+	/* dword1: */
+	uint32_t dword1;
+} instr_cat6b_t;
+
+/* dword1 encoding for dst_off:
+ *
+ * One of the category 6 views collected in instr_cat6_t.  `mustbe1` is
+ * the bit that the generic view names dst_off; dword0 is left opaque
+ * here.
+ */
+typedef struct PACKED {
+	/* dword0: */
+	uint32_t dword0;
+
+	/* note: there is some weird stuff going on where sometimes
+	 * cat6->a.off is involved.. but that seems like a bug in
+	 * the blob, since it is used even if !cat6->src_off
+	 * It would make sense for there to be some more bits to
+	 * bring us to 11 bits worth of offset, but not sure..
+	 */
+	int32_t off       : 8;
+	uint32_t mustbe1  : 1;
+	uint32_t dst      : 8;
+	uint32_t pad1     : 15;
+} instr_cat6c_t;
+
+/* dword1 encoding for !dst_off:
+ *
+ * One of the category 6 views collected in instr_cat6_t.  `mustbe0` is
+ * the bit that the generic view names dst_off; dword0 is left opaque
+ * here.
+ */
+typedef struct PACKED {
+	/* dword0: */
+	uint32_t dword0;
+
+	uint32_t dst      : 8;
+	uint32_t mustbe0  : 1;
+	uint32_t idx      : 8;
+	uint32_t pad0     : 15;
+} instr_cat6d_t;
+
+/* ldgb and atomics..
+ *
+ * ldgb:      pad0=0, pad3=1
+ * atomic .g: pad0=1, pad3=1
+ *        .l: pad0=1, pad3=0
+ *
+ * The trailing pad4 bits of dword1 overlay the opc/jmp_tgt/sync/opc_cat
+ * fields of the generic instr_cat6_t view.
+ */
+typedef struct PACKED {
+	/* dword0: */
+	uint32_t pad0     : 1;
+	uint32_t src3     : 8;
+	uint32_t d        : 2;
+	uint32_t typed    : 1;
+	uint32_t type_size : 2;
+	uint32_t src1     : 8;
+	uint32_t src1_im  : 1;
+	uint32_t src2_im  : 1;
+	uint32_t src2     : 8;
+
+	/* dword1: */
+	uint32_t dst      : 8;
+	uint32_t mustbe0  : 1;
+	uint32_t src_ssbo : 8;
+	uint32_t pad2     : 3;  // type
+	uint32_t g        : 1;
+	uint32_t pad3     : 1;
+	uint32_t pad4     : 10; // opc/jmp_tgt/sync/opc_cat
+} instr_cat6ldgb_t;
+
+/* stgb, pad0=0, pad3=2
+ *
+ * Here pad3 is two bits wide; it covers the positions that the ldgb
+ * layout splits into g and pad3.  The trailing pad4 bits overlay the
+ * opc/jmp_tgt/sync/opc_cat fields of the generic instr_cat6_t view.
+ */
+typedef struct PACKED {
+	/* dword0: */
+	uint32_t mustbe1  : 1;  // ???
+	uint32_t src1     : 8;
+	uint32_t d        : 2;
+	uint32_t typed    : 1;
+	uint32_t type_size : 2;
+	uint32_t pad0     : 9;
+	uint32_t src2_im  : 1;
+	uint32_t src2     : 8;
+
+	/* dword1: */
+	uint32_t src3     : 8;
+	uint32_t src3_im  : 1;
+	uint32_t dst_ssbo : 8;
+	uint32_t pad2     : 3;  // type
+	uint32_t pad3     : 2;
+	uint32_t pad4     : 10; // opc/jmp_tgt/sync/opc_cat
+} instr_cat6stgb_t;
+
+/* Encoding of a category 6 instruction (OPC_LDG .. OPC_LDLV).  The
+ * category has several layouts, all overlaid here: a/b describe dword0
+ * (with or without a source offset), c/d describe dword1 (with or
+ * without a destination offset), and ldgb/stgb are complete layouts of
+ * their own.  The anonymous struct is the generic view: it exposes the
+ * two selector bits (src_off, dst_off), the type and the 5-bit opcode.
+ */
+typedef union PACKED {
+	instr_cat6a_t a;
+	instr_cat6b_t b;
+	instr_cat6c_t c;
+	instr_cat6d_t d;
+	instr_cat6ldgb_t ldgb;
+	instr_cat6stgb_t stgb;
+	struct PACKED {
+		/* dword0: */
+		uint32_t src_off  : 1;
+		uint32_t pad1     : 31;
+
+		/* dword1: */
+		uint32_t pad2     : 8;
+		uint32_t dst_off  : 1;
+		uint32_t pad3     : 8;
+		uint32_t type     : 3;
+		uint32_t g        : 1;  /* or in some cases it means dst immed */
+		uint32_t pad4     : 1;
+		uint32_t opc      : 5;
+		uint32_t jmp_tgt  : 1;
+		uint32_t sync     : 1;
+		uint32_t opc_cat  : 3;
+	};
+} instr_cat6_t;
+
+/* Encoding of a category 7 instruction (OPC_BAR, OPC_FENCE).  dword0 is
+ * entirely padding; dword1 carries the w/r/l/g flags, the 4-bit opcode
+ * and the common jmp_tgt/sync/opc_cat fields.
+ */
+typedef struct PACKED {
+	/* dword0: */
+	uint32_t pad1     : 32;
+
+	/* dword1: */
+	uint32_t pad2     : 12;
+	uint32_t ss       : 1;  /* maybe in the encoding, but blob only uses (sy) */
+	uint32_t pad3     : 6;
+	uint32_t w        : 1;  /* write */
+	uint32_t r        : 1;  /* read */
+	uint32_t l        : 1;  /* local */
+	uint32_t g        : 1;  /* global */
+	uint32_t opc      : 4;  /* presumed, but only a couple known OPCs */
+	uint32_t jmp_tgt  : 1;  /* (jp) */
+	uint32_t sync     : 1;  /* (sy) */
+	uint32_t opc_cat  : 3;
+} instr_cat7_t;
+
+/* One 64-bit instruction, viewed either through the layout of its
+ * category (cat0 .. cat7) or through the anonymous generic view.  The
+ * generic view names the fields that sit at the same bit positions in
+ * every category: jmp_tgt, sync and opc_cat (the top three bits, which
+ * select the category layout to use), plus ss and ul, which are only
+ * meaningful for the categories noted on each field.
+ */
+typedef union PACKED {
+	instr_cat0_t cat0;
+	instr_cat1_t cat1;
+	instr_cat2_t cat2;
+	instr_cat3_t cat3;
+	instr_cat4_t cat4;
+	instr_cat5_t cat5;
+	instr_cat6_t cat6;
+	instr_cat7_t cat7;
+	struct PACKED {
+		/* dword0: */
+		uint32_t pad1     : 32;
+
+		/* dword1: */
+		uint32_t pad2     : 12;
+		uint32_t ss       : 1;  /* cat1-cat4 (cat0??) and cat7 (?) */
+		uint32_t ul       : 1;  /* cat2-cat4 (and cat1 in blob.. which may be bug??) */
+		uint32_t pad3     : 13;
+		uint32_t jmp_tgt  : 1;
+		uint32_t sync     : 1;
+		uint32_t opc_cat  : 3;
+
+	};
+} instr_t;
+
+/* Repeat count of an encoded instruction.  Only categories 0-4 have a
+ * repeat field; every other category reports 0.
+ */
+static inline uint32_t instr_repeat(instr_t *instr)
+{
+	const uint32_t cat = instr->opc_cat;
+
+	if (cat == 0)
+		return instr->cat0.repeat;
+	if (cat == 1)
+		return instr->cat1.repeat;
+	if (cat == 2)
+		return instr->cat2.repeat;
+	if (cat == 3)
+		return instr->cat3.repeat;
+	if (cat == 4)
+		return instr->cat4.repeat;
+
+	return 0;
+}
+
+/* Saturate flag of an encoded instruction.  Only categories 2-4 have a
+ * sat bit; every other category reports false.
+ */
+static inline bool instr_sat(instr_t *instr)
+{
+	const uint32_t cat = instr->opc_cat;
+
+	if (cat == 2)
+		return instr->cat2.sat;
+	if (cat == 3)
+		return instr->cat3.sat;
+	if (cat == 4)
+		return instr->cat4.sat;
+
+	return false;
+}
+
+/* Per-category opcode number of an encoded instruction, read from the
+ * opc field of the layout selected by opc_cat.  Category 1 has no opc
+ * field and always reports 0.
+ */
+static inline uint32_t instr_opc(instr_t *instr)
+{
+	uint32_t opc = 0;
+
+	switch (instr->opc_cat) {
+	case 0:  opc = instr->cat0.opc; break;
+	case 2:  opc = instr->cat2.opc; break;
+	case 3:  opc = instr->cat3.opc; break;
+	case 4:  opc = instr->cat4.opc; break;
+	case 5:  opc = instr->cat5.opc; break;
+	case 6:  opc = instr->cat6.opc; break;
+	case 7:  opc = instr->cat7.opc; break;
+	default: break;   /* category 1 (and anything else): 0 */
+	}
+
+	return opc;
+}
+
+/* True when opc is one of the OPC_MAD_* opcodes (U16, S16, U24, S24,
+ * F16, F32).  The OPC_MADSH_* opcodes are not included; see is_madsh().
+ */
+static inline bool is_mad(opc_t opc)
+{
+	const bool is_int_mad = (opc == OPC_MAD_U16) || (opc == OPC_MAD_S16) ||
+			(opc == OPC_MAD_U24) || (opc == OPC_MAD_S24);
+	const bool is_float_mad = (opc == OPC_MAD_F16) || (opc == OPC_MAD_F32);
+
+	return is_int_mad || is_float_mad;
+}
+
+/* True when opc is OPC_MADSH_U16 or OPC_MADSH_M16. */
+static inline bool is_madsh(opc_t opc)
+{
+	return (opc == OPC_MADSH_U16) || (opc == OPC_MADSH_M16);
+}
+
+/* True when opc is one of the OPC_ATOMIC_* opcodes. */
+static inline bool is_atomic(opc_t opc)
+{
+	/* The atomics occupy the contiguous category 6 range 16..26
+	 * (OPC_ATOMIC_ADD .. OPC_ATOMIC_XOR) with no other opcode in
+	 * between, so a range test is equivalent to listing all eleven.
+	 */
+	return (opc >= OPC_ATOMIC_ADD) && (opc <= OPC_ATOMIC_XOR);
+}
+
+/* True when opc is one of OPC_RESFMT, OPC_RESINFO, OPC_LDGB, OPC_STGB or
+ * OPC_STIB.
+ */
+static inline bool is_ssbo(opc_t opc)
+{
+	const bool is_query = (opc == OPC_RESFMT) || (opc == OPC_RESINFO);
+	const bool is_access = (opc == OPC_LDGB) || (opc == OPC_STGB) ||
+			(opc == OPC_STIB);
+
+	return is_query || is_access;
+}
+
+/* Disassemble `sizedwords` dwords of shader code (two dwords per
+ * instruction) from `dwords`, writing the text to `out`.  `level` is
+ * handed to the instruction printer, which uses it to choose the line
+ * prefix in verbose mode.  Returns 0.
+ */
+int disasm_a3xx(uint32_t *dwords, int sizedwords, int level, FILE *out);
+
+#endif /* INSTR_A3XX_H_ */
diff --git a/src/freedreno/ir3/ir3.c b/src/freedreno/ir3/ir3.c
new file mode 100644
index 00000000000..3d1c4449b12
--- /dev/null
+++ b/src/freedreno/ir3/ir3.c
@@ -0,0 +1,941 @@
+/*
+ * Copyright (c) 2012 Rob Clark <[email protected]>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "ir3.h"
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <assert.h>
+#include <stdbool.h>
+#include <errno.h>
+
+#include "util/bitscan.h"
+#include "util/ralloc.h"
+#include "util/u_math.h"
+
+#include "instr-a3xx.h"
+
+/* Allocate sz bytes on behalf of a shader.  The memory comes from
+ * rzalloc_size() with `shader` as the ralloc parent, so it is zero-filled
+ * and does not need to be freed individually: it is released in one shot
+ * together with the shader by ir3_destroy().
+ */
+void * ir3_alloc(struct ir3 *shader, int sz)
+{
+	return rzalloc_size(shader, sz); /* TODO: don't use rzalloc */
+}
+
+/* Create an empty shader IR, allocated with `compiler` as its ralloc
+ * parent.  nin and nout size the inputs[] and outputs[] arrays, which
+ * are allocated through ir3_alloc(); the block and array lists start
+ * out empty.
+ */
+struct ir3 * ir3_create(struct ir3_compiler *compiler,
+		unsigned nin, unsigned nout)
+{
+	struct ir3 *ir = rzalloc(compiler, struct ir3);
+
+	ir->compiler = compiler;
+
+	list_inithead(&ir->block_list);
+	list_inithead(&ir->array_list);
+
+	ir->ninputs  = nin;
+	ir->noutputs = nout;
+	ir->inputs   = ir3_alloc(ir, sizeof(ir->inputs[0]) * nin);
+	ir->outputs  = ir3_alloc(ir, sizeof(ir->outputs[0]) * nout);
+
+	return ir;
+}
+
+/* Free a shader created by ir3_create().  Everything obtained through
+ * ir3_alloc() was allocated with the shader as its ralloc parent, so
+ * this single ralloc_free() is all the cleanup there is.
+ */
+void ir3_destroy(struct ir3 *shader)
+{
+	ralloc_free(shader);
+}
+
+/* Consistency check for the emit_*() functions: when `cond` is false,
+ * run debug_assert() on it and then make the *enclosing function*
+ * return -1.  Only usable inside functions that return int.  Note that
+ * `cond` is evaluated a second time on the failure path.
+ */
+#define iassert(cond) do { \
+	if (!(cond)) { \
+		debug_assert(cond); \
+		return -1; \
+	} } while (0)
+
+/* Check that a register's IR3_REG_HALF flag agrees with the expected
+ * width: when `full` is true the flag must be clear, otherwise it must
+ * be set.  Fails like iassert(), i.e. the enclosing function returns -1.
+ *
+ * NOTE(review): the expansion already ends in ';', so `iassert_type(..);`
+ * produces an extra empty statement; avoid using it as the body of an
+ * `if` that has an `else`.
+ */
+#define iassert_type(reg, full) do { \
+	if ((full)) { \
+		iassert(!((reg)->flags & IR3_REG_HALF)); \
+	} else { \
+		iassert((reg)->flags & IR3_REG_HALF); \
+	} } while (0);
+
+/* Pack one IR register operand into its instruction-field encoding and
+ * update the register usage statistics in `info`.
+ *
+ * reg:         the operand to encode
+ * info:        receives the max_const / max_reg / max_half_reg updates
+ * repeat:      the instruction's repeat count; it is only taken into
+ *              account when the operand has IR3_REG_R set
+ * valid_flags: the flags allowed on this operand; any other flag is
+ *              reported with debug_printf() but the operand is still
+ *              encoded
+ *
+ * Returns the packed reg_t as a 32-bit value; callers store it into
+ * narrower bitfields.
+ */
+static uint32_t reg(struct ir3_register *reg, struct ir3_info *info,
+		uint32_t repeat, uint32_t valid_flags)
+{
+	reg_t val = { .dummy32 = 0 };
+
+	if (reg->flags & ~valid_flags) {
+		debug_printf("INVALID FLAGS: %x vs %x\n",
+				reg->flags, valid_flags);
+	}
+
+	if (!(reg->flags & IR3_REG_R))
+		repeat = 0;
+
+	if (reg->flags & IR3_REG_IMMED) {
+		/* immediates are truncated to the 11-bit signed field and do
+		 * not count towards register usage
+		 */
+		val.iim_val = reg->iim_val;
+	} else {
+		unsigned components;
+		int16_t max;
+
+		/* `max` is the last scalar slot touched (base + repeat +
+		 * components - 1), divided by four
+		 */
+		if (reg->flags & IR3_REG_RELATIV) {
+			components = reg->size;
+			val.idummy10 = reg->array.offset;
+			max = (reg->array.offset + repeat + components - 1) >> 2;
+		} else {
+			components = util_last_bit(reg->wrmask);
+			val.comp = reg->num & 0x3;
+			val.num  = reg->num >> 2;
+			max = (reg->num + repeat + components - 1) >> 2;
+		}
+
+		if (reg->flags & IR3_REG_CONST) {
+			info->max_const = MAX2(info->max_const, max);
+		} else if (val.num == 63) {
+			/* ignore writes to dummy register r63.x */
+		} else if (max < 48) {
+			/* NOTE(review): values of 48 and above are left out of
+			 * the statistics; the reason is not visible here
+			 */
+			if (reg->flags & IR3_REG_HALF) {
+				if (info->gpu_id >= 600) {
+					/* starting w/ a6xx, half regs conflict with full regs: */
+					info->max_reg = MAX2(info->max_reg, (max+1)/2);
+				} else {
+					info->max_half_reg = MAX2(info->max_half_reg, max);
+				}
+			} else {
+				info->max_reg = MAX2(info->max_reg, max);
+			}
+		}
+	}
+
+	return val.dummy32;
+}
+
+/* Encode a category 0 instruction into the 64-bit slot at `ptr`.
+ *
+ * The immediate is written through the dword0 view that matches
+ * info->gpu_id (a3xx below 400, a4xx below 500, a5xx otherwise).  The
+ * opc field is only 4 bits wide, so assigning the full opc value keeps
+ * just its low bits.
+ *
+ * Always returns 0.
+ */
+static int emit_cat0(struct ir3_instruction *instr, void *ptr,
+		struct ir3_info *info)
+{
+	instr_cat0_t *cat0 = ptr;
+
+	/* dword0: */
+	if (info->gpu_id < 400) {
+		cat0->a3xx.immed = instr->cat0.immed;
+	} else if (info->gpu_id < 500) {
+		cat0->a4xx.immed = instr->cat0.immed;
+	} else {
+		cat0->a5xx.immed = instr->cat0.immed;
+	}
+
+	/* dword1: */
+	cat0->repeat   = instr->repeat;
+	cat0->ss       = (instr->flags & IR3_INSTR_SS) != 0;
+	cat0->inv      = instr->cat0.inv;
+	cat0->comp     = instr->cat0.comp;
+	cat0->opc      = instr->opc;
+	cat0->jmp_tgt  = (instr->flags & IR3_INSTR_JP) != 0;
+	cat0->sync     = (instr->flags & IR3_INSTR_SY) != 0;
+	cat0->opc_cat  = 0;
+
+	return 0;
+}
+
+/* Encode a category 1 instruction into the 64-bit slot at `ptr`.
+ *
+ * instr: IR instruction with exactly two registers, regs[0] = dst and
+ *        regs[1] = src
+ * ptr:   the instruction slot to fill in (instr_cat1_t)
+ * info:  register usage statistics, updated through reg()
+ *
+ * Returns 0 on success, or -1 when one of the iassert() checks fails
+ * (wrong register count, or a register whose half/full flag does not
+ * match the size of its declared type).
+ */
+static int emit_cat1(struct ir3_instruction *instr, void *ptr,
+		struct ir3_info *info)
+{
+	struct ir3_register *dst = instr->regs[0];
+	struct ir3_register *src = instr->regs[1];
+	instr_cat1_t *cat1 = ptr;
+
+	iassert(instr->regs_count == 2);
+	iassert_type(dst, type_size(instr->cat1.dst_type) == 32);
+	/* an immediate source is not checked against src_type */
+	if (!(src->flags & IR3_REG_IMMED))
+		iassert_type(src, type_size(instr->cat1.src_type) == 32);
+
+	/* dword0 is written through exactly one of its three views: */
+	if (src->flags & IR3_REG_IMMED) {
+		cat1->iim_val = src->iim_val;
+		cat1->src_im  = 1;
+	} else if (src->flags & IR3_REG_RELATIV) {
+		cat1->off       = reg(src, info, instr->repeat,
+				IR3_REG_R | IR3_REG_CONST | IR3_REG_HALF | IR3_REG_RELATIV);
+		cat1->src_rel   = 1;
+		cat1->src_rel_c = !!(src->flags & IR3_REG_CONST);
+	} else {
+		cat1->src  = reg(src, info, instr->repeat,
+				IR3_REG_R | IR3_REG_CONST | IR3_REG_HALF);
+		cat1->src_c     = !!(src->flags & IR3_REG_CONST);
+	}
+
+	cat1->dst      = reg(dst, info, instr->repeat,
+			IR3_REG_RELATIV | IR3_REG_EVEN |
+			IR3_REG_R | IR3_REG_POS_INF | IR3_REG_HALF);
+	cat1->repeat   = instr->repeat;
+	cat1->src_r    = !!(src->flags & IR3_REG_R);
+	cat1->ss       = !!(instr->flags & IR3_INSTR_SS);
+	cat1->ul       = !!(instr->flags & IR3_INSTR_UL);
+	cat1->dst_type = instr->cat1.dst_type;
+	cat1->dst_rel  = !!(dst->flags & IR3_REG_RELATIV);
+	cat1->src_type = instr->cat1.src_type;
+	cat1->even     = !!(dst->flags & IR3_REG_EVEN);
+	cat1->pos_inf  = !!(dst->flags & IR3_REG_POS_INF);
+	cat1->jmp_tgt  = !!(instr->flags & IR3_INSTR_JP);
+	cat1->sync     = !!(instr->flags & IR3_INSTR_SY);
+	cat1->opc_cat  = 1;
+
+	return 0;
+}
+
+/* Encode a category 2 instruction (a destination plus one or two
+ * sources) into the 64-bit slot at `ptr`.
+ *
+ * instr: IR instruction with regs[0] = dst, regs[1] = src1 and, when
+ *        regs_count is 3, regs[2] = src2
+ * ptr:   the instruction slot to fill in (instr_cat2_t)
+ * info:  register usage statistics, updated through reg()
+ *
+ * Each source is written through the view of its 16-bit descriptor that
+ * matches its kind: address-relative (relN), const (cN) or the plain /
+ * immediate form.
+ *
+ * Returns 0 on success, or -1 when one of the iassert() checks fails.
+ */
+static int emit_cat2(struct ir3_instruction *instr, void *ptr,
+		struct ir3_info *info)
+{
+	struct ir3_register *dst = instr->regs[0];
+	struct ir3_register *src1 = instr->regs[1];
+	struct ir3_register *src2 = NULL;
+	instr_cat2_t *cat2 = ptr;
+	unsigned absneg = ir3_cat2_absneg(instr->opc);
+
+	iassert((instr->regs_count == 2) || (instr->regs_count == 3));
+
+	/* Only look at regs[2] when the instruction really has a second
+	 * source.  With regs_count == 2 that slot is not part of the
+	 * instruction, so it must not be read.
+	 */
+	if (instr->regs_count == 3)
+		src2 = instr->regs[2];
+
+	if (src1->flags & IR3_REG_RELATIV) {
+		iassert(src1->array.offset < (1 << 10));
+		cat2->rel1.src1      = reg(src1, info, instr->repeat,
+				IR3_REG_RELATIV | IR3_REG_CONST | IR3_REG_R |
+				IR3_REG_HALF | absneg);
+		cat2->rel1.src1_c    = !!(src1->flags & IR3_REG_CONST);
+		cat2->rel1.src1_rel  = 1;
+	} else if (src1->flags & IR3_REG_CONST) {
+		iassert(src1->num < (1 << 12));
+		cat2->c1.src1   = reg(src1, info, instr->repeat,
+				IR3_REG_CONST | IR3_REG_R | IR3_REG_HALF);
+		cat2->c1.src1_c = 1;
+	} else {
+		iassert(src1->num < (1 << 11));
+		cat2->src1 = reg(src1, info, instr->repeat,
+				IR3_REG_IMMED | IR3_REG_R | IR3_REG_HALF |
+				absneg);
+	}
+	cat2->src1_im  = !!(src1->flags & IR3_REG_IMMED);
+	cat2->src1_neg = !!(src1->flags & (IR3_REG_FNEG | IR3_REG_SNEG | IR3_REG_BNOT));
+	cat2->src1_abs = !!(src1->flags & (IR3_REG_FABS | IR3_REG_SABS));
+	cat2->src1_r   = !!(src1->flags & IR3_REG_R);
+
+	if (src2) {
+		/* unless src2 is an immediate, both sources must agree on
+		 * half vs. full precision
+		 */
+		iassert((src2->flags & IR3_REG_IMMED) ||
+				!((src1->flags ^ src2->flags) & IR3_REG_HALF));
+
+		if (src2->flags & IR3_REG_RELATIV) {
+			iassert(src2->array.offset < (1 << 10));
+			cat2->rel2.src2      = reg(src2, info, instr->repeat,
+					IR3_REG_RELATIV | IR3_REG_CONST | IR3_REG_R |
+					IR3_REG_HALF | absneg);
+			cat2->rel2.src2_c    = !!(src2->flags & IR3_REG_CONST);
+			cat2->rel2.src2_rel  = 1;
+		} else if (src2->flags & IR3_REG_CONST) {
+			iassert(src2->num < (1 << 12));
+			cat2->c2.src2   = reg(src2, info, instr->repeat,
+					IR3_REG_CONST | IR3_REG_R | IR3_REG_HALF);
+			cat2->c2.src2_c = 1;
+		} else {
+			iassert(src2->num < (1 << 11));
+			cat2->src2 = reg(src2, info, instr->repeat,
+					IR3_REG_IMMED | IR3_REG_R | IR3_REG_HALF |
+					absneg);
+		}
+
+		cat2->src2_im  = !!(src2->flags & IR3_REG_IMMED);
+		cat2->src2_neg = !!(src2->flags & (IR3_REG_FNEG | IR3_REG_SNEG | IR3_REG_BNOT));
+		cat2->src2_abs = !!(src2->flags & (IR3_REG_FABS | IR3_REG_SABS));
+		cat2->src2_r   = !!(src2->flags & IR3_REG_R);
+	}
+
+	cat2->dst      = reg(dst, info, instr->repeat,
+			IR3_REG_R | IR3_REG_EI | IR3_REG_HALF);
+	cat2->repeat   = instr->repeat;
+	cat2->sat      = !!(instr->flags & IR3_INSTR_SAT);
+	cat2->ss       = !!(instr->flags & IR3_INSTR_SS);
+	cat2->ul       = !!(instr->flags & IR3_INSTR_UL);
+	/* set when dst and src1 differ in half vs. full precision */
+	cat2->dst_half = !!((src1->flags ^ dst->flags) & IR3_REG_HALF);
+	cat2->ei       = !!(dst->flags & IR3_REG_EI);
+	cat2->cond     = instr->cat2.condition;
+	cat2->full     = ! (src1->flags & IR3_REG_HALF);
+	/* the field is 6 bits wide, so only the low bits of opc are kept */
+	cat2->opc      = instr->opc;
+	cat2->jmp_tgt  = !!(instr->flags & IR3_INSTR_JP);
+	cat2->sync     = !!(instr->flags & IR3_INSTR_SY);
+	cat2->opc_cat  = 2;
+
+	return 0;
+}
+
+/* Encode a category 3 (three source) instruction into the instruction
+ * word at 'ptr'.
+ *
+ * The instruction must carry exactly four registers: dst, src1, src2
+ * and src3.  'info' is handed through to reg() for every register that
+ * is encoded.  Returns 0 on success; a failed iassert() presumably bails
+ * out with an error code (macro is defined outside this view - confirm).
+ */
+static int emit_cat3(struct ir3_instruction *instr, void *ptr,
+		struct ir3_info *info)
+{
+	struct ir3_register *dst = instr->regs[0];
+	struct ir3_register *src1 = instr->regs[1];
+	struct ir3_register *src2 = instr->regs[2];
+	struct ir3_register *src3 = instr->regs[3];
+	unsigned absneg = ir3_cat3_absneg(instr->opc);
+	instr_cat3_t *cat3 = ptr;
+	uint32_t src_flags = 0;
+
+	/* opcodes listed here are expected to have half-precision sources: */
+	switch (instr->opc) {
+	case OPC_MAD_F16:
+	case OPC_MAD_U16:
+	case OPC_MAD_S16:
+	case OPC_SEL_B16:
+	case OPC_SEL_S16:
+	case OPC_SEL_F16:
+	case OPC_SAD_S16:
+	case OPC_SAD_S32:  // really??
+		src_flags |= IR3_REG_HALF;
+		break;
+	default:
+		break;
+	}
+
+	/* all three sources must match the precision implied by the opcode: */
+	iassert(instr->regs_count == 4);
+	iassert(!((src1->flags ^ src_flags) & IR3_REG_HALF));
+	iassert(!((src2->flags ^ src_flags) & IR3_REG_HALF));
+	iassert(!((src3->flags ^ src_flags) & IR3_REG_HALF));
+
+	/* src1 has three alternative encodings: relative, const, or plain
+	 * register:
+	 */
+	if (src1->flags & IR3_REG_RELATIV) {
+		iassert(src1->array.offset < (1 << 10));
+		cat3->rel1.src1      = reg(src1, info, instr->repeat,
+				IR3_REG_RELATIV | IR3_REG_CONST | IR3_REG_R |
+				IR3_REG_HALF | absneg);
+		cat3->rel1.src1_c    = !!(src1->flags & IR3_REG_CONST);
+		cat3->rel1.src1_rel  = 1;
+	} else if (src1->flags & IR3_REG_CONST) {
+		iassert(src1->num < (1 << 12));
+		cat3->c1.src1   = reg(src1, info, instr->repeat,
+				IR3_REG_CONST | IR3_REG_R | IR3_REG_HALF);
+		cat3->c1.src1_c = 1;
+	} else {
+		iassert(src1->num < (1 << 11));
+		cat3->src1 = reg(src1, info, instr->repeat,
+				IR3_REG_R | IR3_REG_HALF | absneg);
+	}
+
+	cat3->src1_neg = !!(src1->flags & (IR3_REG_FNEG | IR3_REG_SNEG | IR3_REG_BNOT));
+	cat3->src1_r   = !!(src1->flags & IR3_REG_R);
+
+	/* src2 has a single encoding, with a separate const bit: */
+	cat3->src2     = reg(src2, info, instr->repeat,
+			IR3_REG_CONST | IR3_REG_R | IR3_REG_HALF | absneg);
+	cat3->src2_c   = !!(src2->flags & IR3_REG_CONST);
+	cat3->src2_neg = !!(src2->flags & (IR3_REG_FNEG | IR3_REG_SNEG | IR3_REG_BNOT));
+	cat3->src2_r   = !!(src2->flags & IR3_REG_R);
+
+
+	/* src3 mirrors src1: relative, const, or plain register: */
+	if (src3->flags & IR3_REG_RELATIV) {
+		iassert(src3->array.offset < (1 << 10));
+		cat3->rel2.src3      = reg(src3, info, instr->repeat,
+				IR3_REG_RELATIV | IR3_REG_CONST | IR3_REG_R |
+				IR3_REG_HALF | absneg);
+		cat3->rel2.src3_c    = !!(src3->flags & IR3_REG_CONST);
+		cat3->rel2.src3_rel  = 1;
+	} else if (src3->flags & IR3_REG_CONST) {
+		iassert(src3->num < (1 << 12));
+		cat3->c2.src3   = reg(src3, info, instr->repeat,
+				IR3_REG_CONST | IR3_REG_R | IR3_REG_HALF);
+		cat3->c2.src3_c = 1;
+	} else {
+		iassert(src3->num < (1 << 11));
+		cat3->src3 = reg(src3, info, instr->repeat,
+				IR3_REG_R | IR3_REG_HALF | absneg);
+	}
+
+	cat3->src3_neg = !!(src3->flags & (IR3_REG_FNEG | IR3_REG_SNEG | IR3_REG_BNOT));
+	cat3->src3_r   = !!(src3->flags & IR3_REG_R);
+
+	cat3->dst      = reg(dst, info, instr->repeat, IR3_REG_R | IR3_REG_HALF);
+	cat3->repeat   = instr->repeat;
+	cat3->sat      = !!(instr->flags & IR3_INSTR_SAT);
+	cat3->ss       = !!(instr->flags & IR3_INSTR_SS);
+	cat3->ul       = !!(instr->flags & IR3_INSTR_UL);
+	/* set when dst precision differs from the source precision: */
+	cat3->dst_half = !!((src_flags ^ dst->flags) & IR3_REG_HALF);
+	cat3->opc      = instr->opc;
+	cat3->jmp_tgt  = !!(instr->flags & IR3_INSTR_JP);
+	cat3->sync     = !!(instr->flags & IR3_INSTR_SY);
+	cat3->opc_cat  = 3;
+
+	return 0;
+}
+
+/* Encode a category 4 (single source) instruction into the instruction
+ * word at 'ptr'.
+ *
+ * The instruction must carry exactly two registers: dst and src.  'info'
+ * is handed through to reg() for every register that is encoded.
+ * Returns 0 on success.
+ */
+static int emit_cat4(struct ir3_instruction *instr, void *ptr,
+		struct ir3_info *info)
+{
+	struct ir3_register *dst = instr->regs[0];
+	struct ir3_register *src = instr->regs[1];
+	instr_cat4_t *cat4 = ptr;
+
+	iassert(instr->regs_count == 2);
+
+	/* the source has three alternative encodings: relative, const, or
+	 * plain register / immediate:
+	 */
+	if (src->flags & IR3_REG_RELATIV) {
+		iassert(src->array.offset < (1 << 10));
+		cat4->rel.src      = reg(src, info, instr->repeat,
+				IR3_REG_RELATIV | IR3_REG_CONST | IR3_REG_FNEG |
+				IR3_REG_FABS | IR3_REG_R | IR3_REG_HALF);
+		cat4->rel.src_c    = !!(src->flags & IR3_REG_CONST);
+		cat4->rel.src_rel  = 1;
+	} else if (src->flags & IR3_REG_CONST) {
+		iassert(src->num < (1 << 12));
+		cat4->c.src   = reg(src, info, instr->repeat,
+				IR3_REG_CONST | IR3_REG_FNEG | IR3_REG_FABS |
+				IR3_REG_R | IR3_REG_HALF);
+		cat4->c.src_c = 1;
+	} else {
+		iassert(src->num < (1 << 11));
+		cat4->src = reg(src, info, instr->repeat,
+				IR3_REG_IMMED | IR3_REG_FNEG | IR3_REG_FABS |
+				IR3_REG_R | IR3_REG_HALF);
+	}
+
+	/* only the float abs/neg modifiers are encoded for cat4: */
+	cat4->src_im   = !!(src->flags & IR3_REG_IMMED);
+	cat4->src_neg  = !!(src->flags & IR3_REG_FNEG);
+	cat4->src_abs  = !!(src->flags & IR3_REG_FABS);
+	cat4->src_r    = !!(src->flags & IR3_REG_R);
+
+	cat4->dst      = reg(dst, info, instr->repeat, IR3_REG_R | IR3_REG_HALF);
+	cat4->repeat   = instr->repeat;
+	cat4->sat      = !!(instr->flags & IR3_INSTR_SAT);
+	cat4->ss       = !!(instr->flags & IR3_INSTR_SS);
+	cat4->ul       = !!(instr->flags & IR3_INSTR_UL);
+	/* set when dst precision differs from src precision: */
+	cat4->dst_half = !!((src->flags ^ dst->flags) & IR3_REG_HALF);
+	cat4->full     = ! (src->flags & IR3_REG_HALF);
+	cat4->opc      = instr->opc;
+	cat4->jmp_tgt  = !!(instr->flags & IR3_INSTR_JP);
+	cat4->sync     = !!(instr->flags & IR3_INSTR_SY);
+	cat4->opc_cat  = 4;
+
+	return 0;
+}
+
+/* Encode a category 5 instruction (the ones carrying sampler/texture
+ * indices) into the instruction word at 'ptr'.
+ *
+ * regs[0] is the dst; up to three optional sources follow.  With
+ * IR3_INSTR_S2EN set, src2/src3 use the s2en encoding and the immediate
+ * samp/tex indices must be zero; otherwise src3 must be absent and the
+ * samp/tex indices are encoded directly.  Returns 0 on success.
+ */
+static int emit_cat5(struct ir3_instruction *instr, void *ptr,
+		struct ir3_info *info)
+{
+	struct ir3_register *dst = instr->regs[0];
+	/* Sources are optional.  Only slots below regs_count are known to be
+	 * populated (ir3_instr_clone() allocates exactly regs_count slots),
+	 * so never read past that and treat missing slots as absent:
+	 */
+	struct ir3_register *src1 =
+			(instr->regs_count > 1) ? instr->regs[1] : NULL;
+	struct ir3_register *src2 =
+			(instr->regs_count > 2) ? instr->regs[2] : NULL;
+	struct ir3_register *src3 =
+			(instr->regs_count > 3) ? instr->regs[3] : NULL;
+	instr_cat5_t *cat5 = ptr;
+
+	iassert_type(dst, type_size(instr->cat5.type) == 32);
+
+	/* sources are filled in order, a later one implies the earlier: */
+	assume(src1 || !src2);
+	assume(src2 || !src3);
+
+	if (src1) {
+		cat5->full = ! (src1->flags & IR3_REG_HALF);
+		cat5->src1 = reg(src1, info, instr->repeat, IR3_REG_HALF);
+	}
+
+	if (instr->flags & IR3_INSTR_S2EN) {
+		if (src2) {
+			iassert(!((src1->flags ^ src2->flags) & IR3_REG_HALF));
+			cat5->s2en.src2 = reg(src2, info, instr->repeat, IR3_REG_HALF);
+		}
+		if (src3) {
+			iassert(src3->flags & IR3_REG_HALF);
+			cat5->s2en.src3 = reg(src3, info, instr->repeat, IR3_REG_HALF);
+		}
+		iassert(!(instr->cat5.samp | instr->cat5.tex));
+	} else {
+		iassert(!src3);
+		if (src2) {
+			iassert(!((src1->flags ^ src2->flags) & IR3_REG_HALF));
+			cat5->norm.src2 = reg(src2, info, instr->repeat, IR3_REG_HALF);
+		}
+		cat5->norm.samp = instr->cat5.samp;
+		cat5->norm.tex  = instr->cat5.tex;
+	}
+
+	cat5->dst      = reg(dst, info, instr->repeat, IR3_REG_R | IR3_REG_HALF);
+	cat5->wrmask   = dst->wrmask;
+	cat5->type     = instr->cat5.type;
+	cat5->is_3d    = !!(instr->flags & IR3_INSTR_3D);
+	cat5->is_a     = !!(instr->flags & IR3_INSTR_A);
+	cat5->is_s     = !!(instr->flags & IR3_INSTR_S);
+	cat5->is_s2en  = !!(instr->flags & IR3_INSTR_S2EN);
+	cat5->is_o     = !!(instr->flags & IR3_INSTR_O);
+	cat5->is_p     = !!(instr->flags & IR3_INSTR_P);
+	cat5->opc      = instr->opc;
+	cat5->jmp_tgt  = !!(instr->flags & IR3_INSTR_JP);
+	cat5->sync     = !!(instr->flags & IR3_INSTR_SY);
+	cat5->opc_cat  = 5;
+
+	return 0;
+}
+
+/* Encode a category 6 instruction into the instruction word at 'ptr'.
+ *
+ * Several different bit layouts overlay the same instruction word
+ * (cat6, cat6a/b/c/d, cat6ldgb, cat6stgb); which one is used depends on
+ * the opcode and on whether a src/dst offset is present.  For stores the
+ * register roles are shifted by one, see below.  Returns 0 on success.
+ */
+static int emit_cat6(struct ir3_instruction *instr, void *ptr,
+		struct ir3_info *info)
+{
+	struct ir3_register *dst, *src1, *src2;
+	instr_cat6_t *cat6 = ptr;
+	bool type_full = type_size(instr->cat6.type) == 32;
+
+	/* fields common to all cat6 layouts: */
+	cat6->type     = instr->cat6.type;
+	cat6->opc      = instr->opc;
+	cat6->jmp_tgt  = !!(instr->flags & IR3_INSTR_JP);
+	cat6->sync     = !!(instr->flags & IR3_INSTR_SY);
+	cat6->g        = !!(instr->flags & IR3_INSTR_G);
+	cat6->opc_cat  = 6;
+
+	/* sanity check register precision against what each opcode expects: */
+	switch (instr->opc) {
+	case OPC_RESINFO:
+	case OPC_RESFMT:
+		iassert_type(instr->regs[0], type_full); /* dst */
+		iassert_type(instr->regs[1], type_full); /* src1 */
+		break;
+	case OPC_L2G:
+	case OPC_G2L:
+		iassert_type(instr->regs[0], true);      /* dst */
+		iassert_type(instr->regs[1], true);      /* src1 */
+		break;
+	case OPC_STG:
+	case OPC_STL:
+	case OPC_STP:
+	case OPC_STI:
+	case OPC_STLW:
+	case OPC_STIB:
+		/* no dst, so regs[0] is dummy */
+		iassert_type(instr->regs[1], true);      /* dst */
+		iassert_type(instr->regs[2], type_full); /* src1 */
+		iassert_type(instr->regs[3], true);      /* src2 */
+		break;
+	default:
+		iassert_type(instr->regs[0], type_full); /* dst */
+		iassert_type(instr->regs[1], true);      /* src1 */
+		if (instr->regs_count > 2)
+			iassert_type(instr->regs[2], true);  /* src2 */
+		break;
+	}
+
+	/* the "dst" for a store instruction is (from the perspective
+	 * of data flow in the shader, ie. register use/def, etc) in
+	 * fact a register that is read by the instruction, rather
+	 * than written:
+	 */
+	if (is_store(instr)) {
+		iassert(instr->regs_count >= 3);
+
+		dst  = instr->regs[1];
+		src1 = instr->regs[2];
+		src2 = (instr->regs_count >= 4) ? instr->regs[3] : NULL;
+	} else {
+		iassert(instr->regs_count >= 2);
+
+		dst  = instr->regs[0];
+		src1 = instr->regs[1];
+		src2 = (instr->regs_count >= 3) ? instr->regs[2] : NULL;
+	}
+
+	/* TODO we need a more comprehensive list about which instructions
+	 * can be encoded which way.  Or possibly use IR3_INSTR_0 flag to
+	 * indicate to use the src_off encoding even if offset is zero
+	 * (but then what to do about dst_off?)
+	 */
+	if (is_atomic(instr->opc)) {
+		instr_cat6ldgb_t *ldgb = ptr;
+
+		/* maybe these two bits both determine the instruction encoding? */
+		cat6->src_off = false;
+
+		ldgb->d = instr->cat6.d - 1;
+		ldgb->typed = instr->cat6.typed;
+		ldgb->type_size = instr->cat6.iim_val - 1;
+
+		ldgb->dst = reg(dst, info, instr->repeat, IR3_REG_R | IR3_REG_HALF);
+
+		/* NOTE(review): this reads back from the output word; presumably
+		 * ldgb->g overlays the cat6->g bit written above - confirm
+		 * against the layout in instr-a3xx.h.
+		 */
+		if (ldgb->g) {
+			struct ir3_register *src3 = instr->regs[3];
+			struct ir3_register *src4 = instr->regs[4];
+
+			/* first src is src_ssbo: */
+			iassert(src1->flags & IR3_REG_IMMED);
+			ldgb->src_ssbo = src1->uim_val;
+
+			ldgb->src1 = reg(src2, info, instr->repeat, IR3_REG_IMMED);
+			ldgb->src1_im = !!(src2->flags & IR3_REG_IMMED);
+			ldgb->src2 = reg(src3, info, instr->repeat, IR3_REG_IMMED);
+			ldgb->src2_im = !!(src3->flags & IR3_REG_IMMED);
+
+			ldgb->src3 = reg(src4, info, instr->repeat, 0);
+			ldgb->pad0 = 0x1;
+			ldgb->pad3 = 0x1;
+		} else {
+			ldgb->src1 = reg(src1, info, instr->repeat, IR3_REG_IMMED);
+			ldgb->src1_im = !!(src1->flags & IR3_REG_IMMED);
+			ldgb->src2 = reg(src2, info, instr->repeat, IR3_REG_IMMED);
+			ldgb->src2_im = !!(src2->flags & IR3_REG_IMMED);
+			ldgb->pad0 = 0x1;
+			ldgb->pad3 = 0x0;
+		}
+
+		return 0;
+	} else if (instr->opc == OPC_LDGB) {
+		struct ir3_register *src3 = instr->regs[3];
+		instr_cat6ldgb_t *ldgb = ptr;
+
+		/* maybe these two bits both determine the instruction encoding? */
+		cat6->src_off = false;
+
+		ldgb->d = instr->cat6.d - 1;
+		ldgb->typed = instr->cat6.typed;
+		ldgb->type_size = instr->cat6.iim_val - 1;
+
+		ldgb->dst = reg(dst, info, instr->repeat, IR3_REG_R | IR3_REG_HALF);
+
+		/* first src is src_ssbo: */
+		iassert(src1->flags & IR3_REG_IMMED);
+		ldgb->src_ssbo = src1->uim_val;
+
+		/* then next two are src1/src2: */
+		ldgb->src1 = reg(src2, info, instr->repeat, IR3_REG_IMMED);
+		ldgb->src1_im = !!(src2->flags & IR3_REG_IMMED);
+		ldgb->src2 = reg(src3, info, instr->repeat, IR3_REG_IMMED);
+		ldgb->src2_im = !!(src3->flags & IR3_REG_IMMED);
+
+		ldgb->pad0 = 0x0;
+		ldgb->pad3 = 0x1;
+
+		return 0;
+	} else if (instr->opc == OPC_RESINFO) {
+		instr_cat6ldgb_t *ldgb = ptr;
+
+		ldgb->d = instr->cat6.d - 1;
+
+		ldgb->dst = reg(dst, info, instr->repeat, IR3_REG_R | IR3_REG_HALF);
+
+		/* first src is src_ssbo: */
+		iassert(src1->flags & IR3_REG_IMMED);
+		ldgb->src_ssbo = src1->uim_val;
+
+		return 0;
+	} else if ((instr->opc == OPC_STGB) || (instr->opc == OPC_STIB)) {
+		/* stores shift register roles by one, so the third source
+		 * lives in regs[4]:
+		 */
+		struct ir3_register *src3 = instr->regs[4];
+		instr_cat6stgb_t *stgb = ptr;
+
+		/* maybe these two bits both determine the instruction encoding? */
+		cat6->src_off = true;
+		stgb->pad3 = 0x2;
+
+		stgb->d = instr->cat6.d - 1;
+		stgb->typed = instr->cat6.typed;
+		stgb->type_size = instr->cat6.iim_val - 1;
+
+		/* first src is dst_ssbo: */
+		iassert(dst->flags & IR3_REG_IMMED);
+		stgb->dst_ssbo = dst->uim_val;
+
+		/* then src1/src2/src3: */
+		stgb->src1 = reg(src1, info, instr->repeat, 0);
+		stgb->src2 = reg(src2, info, instr->repeat, IR3_REG_IMMED);
+		stgb->src2_im = !!(src2->flags & IR3_REG_IMMED);
+		stgb->src3 = reg(src3, info, instr->repeat, IR3_REG_IMMED);
+		stgb->src3_im = !!(src3->flags & IR3_REG_IMMED);
+
+		return 0;
+	} else if (instr->cat6.src_offset || (instr->opc == OPC_LDG) ||
+			(instr->opc == OPC_LDL)) {
+		/* source-with-offset layout: */
+		instr_cat6a_t *cat6a = ptr;
+
+		cat6->src_off = true;
+
+		cat6a->src1 = reg(src1, info, instr->repeat, IR3_REG_IMMED);
+		cat6a->src1_im = !!(src1->flags & IR3_REG_IMMED);
+		if (src2) {
+			cat6a->src2 = reg(src2, info, instr->repeat, IR3_REG_IMMED);
+			cat6a->src2_im = !!(src2->flags & IR3_REG_IMMED);
+		}
+		cat6a->off = instr->cat6.src_offset;
+	} else {
+		/* source-without-offset layout: */
+		instr_cat6b_t *cat6b = ptr;
+
+		cat6->src_off = false;
+
+		cat6b->src1 = reg(src1, info, instr->repeat, IR3_REG_IMMED | IR3_REG_HALF);
+		cat6b->src1_im = !!(src1->flags & IR3_REG_IMMED);
+		if (src2) {
+			cat6b->src2 = reg(src2, info, instr->repeat, IR3_REG_IMMED);
+			cat6b->src2_im = !!(src2->flags & IR3_REG_IMMED);
+		}
+	}
+
+	/* only the last two source layouts fall through to here; pick the
+	 * dst layout with or without offset:
+	 */
+	if (instr->cat6.dst_offset || (instr->opc == OPC_STG) ||
+			(instr->opc == OPC_STL)) {
+		instr_cat6c_t *cat6c = ptr;
+		cat6->dst_off = true;
+		cat6c->dst = reg(dst, info, instr->repeat, IR3_REG_R | IR3_REG_HALF);
+		cat6c->off = instr->cat6.dst_offset;
+	} else {
+		instr_cat6d_t *cat6d = ptr;
+		cat6->dst_off = false;
+		cat6d->dst = reg(dst, info, instr->repeat, IR3_REG_R | IR3_REG_HALF);
+	}
+
+	return 0;
+}
+
+/* Encode a category 7 instruction at 'ptr'.  It has no registers; only
+ * the w/r/l/g bits, the opcode and the (ss)/(jp)/(sy) flags are copied
+ * over.  'info' is unused.  Always returns 0.
+ */
+static int emit_cat7(struct ir3_instruction *instr, void *ptr,
+		struct ir3_info *info)
+{
+	instr_cat7_t *enc = ptr;
+
+	enc->ss      = (instr->flags & IR3_INSTR_SS) != 0;
+
+	/* write / read / local / global bits: */
+	enc->w       = instr->cat7.w;
+	enc->r       = instr->cat7.r;
+	enc->l       = instr->cat7.l;
+	enc->g       = instr->cat7.g;
+
+	enc->opc     = instr->opc;
+	enc->jmp_tgt = (instr->flags & IR3_INSTR_JP) != 0;
+	enc->sync    = (instr->flags & IR3_INSTR_SY) != 0;
+	enc->opc_cat = 7;
+
+	return 0;
+}
+
+/* Per-category encoder table, indexed by instruction category (0..7): */
+static int (*emit[])(struct ir3_instruction *instr, void *ptr,
+		struct ir3_info *info) = {
+	emit_cat0,
+	emit_cat1,
+	emit_cat2,
+	emit_cat3,
+	emit_cat4,
+	emit_cat5,
+	emit_cat6,
+	emit_cat7,
+};
+
+/* Assemble all instructions of 'shader' into a freshly allocated buffer
+ * of encoded instruction words.
+ *
+ * 'info' is reset and then filled in with statistics about the shader
+ * (size in dwords including padding, instruction count expanded for
+ * repeats, number of (ss)/(sy) sync bits; the max_* fields start at -1
+ * and are handed to the per-category encoders).
+ *
+ * Returns a calloc()'d buffer of info->sizedwords dwords which the
+ * caller must free(), or NULL if allocation fails or an instruction
+ * cannot be encoded.
+ */
+void * ir3_assemble(struct ir3 *shader, struct ir3_info *info,
+		uint32_t gpu_id)
+{
+	uint32_t *ptr, *dwords;
+
+	info->gpu_id        = gpu_id;
+	info->max_reg       = -1;
+	info->max_half_reg  = -1;
+	info->max_const     = -1;
+	info->instrs_count  = 0;
+	info->sizedwords    = 0;
+	info->ss = info->sy = 0;
+
+	/* first pass: two dwords per instruction */
+	list_for_each_entry (struct ir3_block, block, &shader->block_list, node) {
+		list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) {
+			info->sizedwords += 2;
+		}
+	}
+
+	/* need an integer number of instruction "groups" (sets of 16
+	 * instructions on a4xx or sets of 4 instructions on a3xx),
+	 * so pad out w/ NOPs if needed: (NOTE each instruction is 64bits)
+	 */
+	if (gpu_id >= 400) {
+		info->sizedwords = align(info->sizedwords, 16 * 2);
+	} else {
+		info->sizedwords = align(info->sizedwords, 4 * 2);
+	}
+
+	/* zero-filled, so the padding needs no explicit encoding: */
+	ptr = dwords = calloc(4, info->sizedwords);
+	if (!ptr)
+		return NULL;
+
+	/* second pass: encode each instruction via its category's encoder */
+	list_for_each_entry (struct ir3_block, block, &shader->block_list, node) {
+		list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) {
+			int ret = emit[opc_cat(instr->opc)](instr, dwords, info);
+			if (ret)
+				goto fail;
+			info->instrs_count += 1 + instr->repeat;
+			dwords += 2;
+
+			if (instr->flags & IR3_INSTR_SS)
+				info->ss++;
+
+			if (instr->flags & IR3_INSTR_SY)
+				info->sy++;
+		}
+	}
+
+	return ptr;
+
+fail:
+	free(ptr);
+	return NULL;
+}
+
+/* Allocate a register from the shader's context and initialize its
+ * number and flags.  wrmask starts out as 1.
+ */
+static struct ir3_register * reg_create(struct ir3 *shader,
+		int num, int flags)
+{
+	struct ir3_register *r = ir3_alloc(shader, sizeof(*r));
+
+	r->num    = num;
+	r->flags  = flags;
+	r->wrmask = 1;
+
+	return r;
+}
+
+/* Append 'instr' to the end of the block's instruction list.  In DEBUG
+ * builds the instruction also gets the next serial number.  Instructions
+ * for which is_input() is true are additionally recorded in the shader's
+ * baryfs array.
+ */
+static void insert_instr(struct ir3_block *block,
+		struct ir3_instruction *instr)
+{
+	struct ir3 *ir = block->shader;
+
+#ifdef DEBUG
+	ir->instr_count++;
+	instr->serialno = ir->instr_count;
+#endif
+	list_addtail(&instr->node, &block->instr_list);
+
+	if (is_input(instr)) {
+		array_insert(ir, ir->baryfs, instr);
+	}
+}
+
+/* Allocate a new block owned by 'shader' with empty list heads.  The
+ * block is not linked into the shader's block list here.
+ */
+struct ir3_block * ir3_block_create(struct ir3 *shader)
+{
+	struct ir3_block *blk = ir3_alloc(shader, sizeof(struct ir3_block));
+
+#ifdef DEBUG
+	shader->block_count++;
+	blk->serialno = shader->block_count;
+#endif
+	blk->shader = shader;
+
+	list_inithead(&blk->node);
+	list_inithead(&blk->instr_list);
+
+	return blk;
+}
+
+/* Allocate an instruction together with room for 'nreg' register
+ * pointers in a single allocation; the regs[] table sits directly
+ * behind the instruction struct.
+ */
+static struct ir3_instruction *instr_create(struct ir3_block *block, int nreg)
+{
+	unsigned regs_bytes = nreg * sizeof(struct ir3_register *);
+	unsigned total = sizeof(struct ir3_instruction) + regs_bytes;
+	struct ir3_instruction *instr = ir3_alloc(block->shader, total);
+
+	/* register pointer table starts right after the struct: */
+	instr->regs = (struct ir3_register **)(instr + 1);
+
+#ifdef DEBUG
+	instr->regs_max = nreg;
+#endif
+
+	return instr;
+}
+
+/* Create an instruction with opcode 'opc' and space for 'nreg'
+ * registers, and append it to 'block'.
+ */
+struct ir3_instruction * ir3_instr_create2(struct ir3_block *block,
+		opc_t opc, int nreg)
+{
+	struct ir3_instruction *new_instr;
+
+	new_instr = instr_create(block, nreg);
+
+	/* opc must be set before inserting, insert_instr() inspects it: */
+	new_instr->opc = opc;
+	new_instr->block = block;
+
+	insert_instr(block, new_instr);
+
+	return new_instr;
+}
+
+/* Create an instruction with the default amount of register slots and
+ * append it to 'block'.
+ */
+struct ir3_instruction * ir3_instr_create(struct ir3_block *block, opc_t opc)
+{
+	/* NOTE: a smarter choice would be possible (at least for non-meta
+	 * instructions) by picking the # of regs based on category.
+	 */
+	const int default_nreg = 4;
+
+	return ir3_instr_create2(block, opc, default_nreg);
+}
+
+/* Duplicate 'instr', including copies of all of its registers, and
+ * append the duplicate to the same block as the original.
+ */
+struct ir3_instruction * ir3_instr_clone(struct ir3_instruction *instr)
+{
+	struct ir3_instruction *copy =
+			instr_create(instr->block, instr->regs_count);
+	struct ir3_register **copy_regs = copy->regs;
+	unsigned n;
+
+	/* bulk copy all fields, then restore the copy's own regs table
+	 * which the struct assignment just overwrote:
+	 */
+	*copy = *instr;
+	copy->regs = copy_regs;
+
+	insert_instr(instr->block, copy);
+
+	/* clone registers, ir3_reg_create() re-counts them as it appends: */
+	copy->regs_count = 0;
+	for (n = 0; n < instr->regs_count; n++) {
+		struct ir3_register *src = instr->regs[n];
+		struct ir3_register *dup =
+				ir3_reg_create(copy, src->num, src->flags);
+		*dup = *src;
+	}
+
+	return copy;
+}
+
+/* Record 'dep' as an extra (false) dependency of 'instr', so that 'dep'
+ * gets scheduled first.  The entry is appended to instr->deps.
+ */
+void
+ir3_instr_add_dep(struct ir3_instruction *instr,
+		struct ir3_instruction *dep)
+{
+	array_insert(instr, instr->deps, dep);
+}
+
+/* Create a register and append it to the instruction's register list.
+ * DEBUG builds assert that the instruction still has a free slot.
+ */
+struct ir3_register * ir3_reg_create(struct ir3_instruction *instr,
+		int num, int flags)
+{
+	struct ir3_register *new_reg =
+			reg_create(instr->block->shader, num, flags);
+
+#ifdef DEBUG
+	debug_assert(instr->regs_count < instr->regs_max);
+#endif
+	instr->regs[instr->regs_count] = new_reg;
+	instr->regs_count++;
+
+	return new_reg;
+}
+
+/* Return a copy of 'reg' allocated from 'shader'.  The copy is not
+ * attached to any instruction.
+ */
+struct ir3_register * ir3_reg_clone(struct ir3 *shader,
+		struct ir3_register *reg)
+{
+	struct ir3_register *copy = reg_create(shader, 0, 0);
+
+	/* overwrite the placeholder num/flags with the full contents: */
+	*copy = *reg;
+
+	return copy;
+}
+
+/* Set the address-register instruction that 'instr' depends on.  When
+ * the address actually changes, 'instr' is also appended to the shader's
+ * list of indirect instructions.
+ */
+void
+ir3_instr_set_address(struct ir3_instruction *instr,
+		struct ir3_instruction *addr)
+{
+	struct ir3 *shader;
+
+	/* nothing to do if already set to this address: */
+	if (instr->address == addr)
+		return;
+
+	shader = instr->block->shader;
+	instr->address = addr;
+	array_insert(shader, shader->indirects, instr);
+}
+
+/* Clear IR3_INSTR_MARK on every instruction in 'block'. */
+void
+ir3_block_clear_mark(struct ir3_block *block)
+{
+	list_for_each_entry (struct ir3_instruction, cur, &block->instr_list, node) {
+		cur->flags &= ~IR3_INSTR_MARK;
+	}
+}
+
+/* Clear IR3_INSTR_MARK on every instruction of every block in 'ir'. */
+void
+ir3_clear_mark(struct ir3 *ir)
+{
+	list_for_each_entry (struct ir3_block, blk, &ir->block_list, node)
+		ir3_block_clear_mark(blk);
+}
+
+/* Number all instructions sequentially (instr->ip) across all blocks
+ * and record each block's first/last ip.  Returns the total number of
+ * instructions.
+ *
+ * note: ip shares storage with instr->depth, so this destroys depth;
+ * don't call it until after sched!
+ *
+ * NOTE(review): start_ip/end_ip are read back through the first/last
+ * list entry; presumably every block is non-empty by the time this
+ * runs - confirm.
+ */
+unsigned
+ir3_count_instructions(struct ir3 *ir)
+{
+	unsigned next_ip = 0;
+
+	list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
+		struct ir3_instruction *first, *last;
+
+		list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) {
+			instr->ip = next_ip;
+			next_ip++;
+		}
+
+		first = list_first_entry(&block->instr_list, struct ir3_instruction, node);
+		last  = list_last_entry(&block->instr_list, struct ir3_instruction, node);
+
+		block->start_ip = first->ip;
+		block->end_ip   = last->ip;
+	}
+
+	return next_ip;
+}
+
+/* Find the array with the given id in the shader's array list.  Returns
+ * NULL when there is no such array.
+ */
+struct ir3_array *
+ir3_lookup_array(struct ir3 *ir, unsigned id)
+{
+	list_for_each_entry (struct ir3_array, cur, &ir->array_list, node) {
+		if (cur->id != id)
+			continue;
+		return cur;
+	}
+
+	return NULL;
+}
diff --git a/src/freedreno/ir3/ir3.h b/src/freedreno/ir3/ir3.h
new file mode 100644
index 00000000000..ea3218828df
--- /dev/null
+++ b/src/freedreno/ir3/ir3.h
@@ -0,0 +1,1394 @@
+/*
+ * Copyright (c) 2013 Rob Clark <[email protected]>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef IR3_H_
+#define IR3_H_
+
+#include <stdint.h>
+#include <stdbool.h>
+
+#include "compiler/shader_enums.h"
+
+#include "util/u_debug.h"
+#include "util/list.h"
+
+#include "instr-a3xx.h"
+
+/* low level intermediate representation of an adreno shader program */
+
+struct ir3_compiler;
+struct ir3;
+struct ir3_instruction;
+struct ir3_block;
+
+/* Statistics about an assembled shader, filled in by ir3_assemble(): */
+struct ir3_info {
+	uint32_t gpu_id;         /* as passed to ir3_assemble() */
+	uint16_t sizedwords;     /* size in dwords, including alignment padding */
+	uint16_t instrs_count;   /* expanded to account for rpt's */
+	/* NOTE: max_reg, etc, does not include registers not touched
+	 * by the shader (ie. vertex fetched via VFD_DECODE but not
+	 * touched by shader)
+	 *
+	 * All three are initialized to -1 by ir3_assemble().
+	 */
+	int8_t   max_reg;   /* highest GPR # used by shader */
+	int8_t   max_half_reg;
+	int16_t  max_const;
+
+	/* number of sync bits: */
+	uint16_t ss, sy;
+};
+
+/* A single register operand (src or dst) of an ir3 instruction.  The
+ * 'flags' select which of the fields/union members below are meaningful.
+ */
+struct ir3_register {
+	enum {
+		IR3_REG_CONST  = 0x001,
+		IR3_REG_IMMED  = 0x002,
+		IR3_REG_HALF   = 0x004,
+		/* high registers are used for some things in compute shaders,
+		 * for example.  Seems to be for things that are global to all
+		 * threads in a wave, so possibly these are global/shared by
+		 * all the threads in the wave?
+		 */
+		IR3_REG_HIGH   = 0x008,
+		IR3_REG_RELATIV= 0x010,
+		IR3_REG_R      = 0x020,
+		/* Most instructions, it seems, can do float abs/neg but not
+		 * integer.  The CP pass needs to know what is intended (int or
+		 * float) in order to do the right thing.  For this reason the
+		 * abs/neg flags are split out into float and int variants.  In
+		 * addition, for .b (bitwise) operations, the negate is actually
+		 * a bitwise not, so split that out into a new flag to make it
+		 * more clear.
+		 */
+		IR3_REG_FNEG   = 0x040,
+		IR3_REG_FABS   = 0x080,
+		IR3_REG_SNEG   = 0x100,
+		IR3_REG_SABS   = 0x200,
+		IR3_REG_BNOT   = 0x400,
+		IR3_REG_EVEN   = 0x800,
+		IR3_REG_POS_INF= 0x1000,
+		/* (ei) flag, end-input?  Set on last bary, presumably to signal
+		 * that the shader needs no more input:
+		 */
+		IR3_REG_EI     = 0x2000,
+		/* meta-flags, for intermediate stages of IR, ie.
+		 * before register assignment is done:
+		 */
+		IR3_REG_SSA    = 0x4000,   /* 'instr' is ptr to assigning instr */
+		IR3_REG_ARRAY  = 0x8000,
+
+	} flags;
+
+	/* normal registers:
+	 * the component is in the low two bits of the reg #, so
+	 * rN.x becomes: (N << 2) | x
+	 */
+	int   num;
+	union {
+		/* immediate: */
+		int32_t  iim_val;
+		uint32_t uim_val;
+		float    fim_val;
+		/* relative: */
+		struct {
+			uint16_t id;
+			int16_t offset;
+		} array;
+	};
+
+	/* For IR3_REG_SSA, src registers contain ptr back to assigning
+	 * instruction.
+	 *
+	 * For IR3_REG_ARRAY, the pointer is back to the last dependent
+	 * array access (although the net effect is the same, it points
+	 * back to a previous instruction that we depend on).
+	 */
+	struct ir3_instruction *instr;
+
+	union {
+		/* used for cat5 instructions, but also for internal/IR level
+		 * tracking of what registers are read/written by an instruction.
+		 * wrmask may be a bad name since it is used to represent both
+		 * src and dst that touch multiple adjacent registers.
+		 */
+		unsigned wrmask;
+		/* for relative addressing, 32bits for array size is too small,
+		 * but otoh we don't need to deal with disjoint sets, so instead
+		 * use a simple size field (number of scalar components).
+		 */
+		unsigned size;
+	};
+};
+
+/*
+ * Stupid/simple growable array implementation:
+ *
+ * DECLARE_ARRAY(type, name) declares three members: the element pointer
+ * 'name', the number of used elements 'name_count' and the allocated
+ * capacity 'name_sz'.
+ *
+ * array_insert(ctx, arr, val) appends 'val', first growing the storage
+ * with reralloc_size() (to twice the old capacity, but at least 16
+ * elements) when it is full.  'ctx' is passed through as the first
+ * argument of reralloc_size().  Note that 'arr' is expanded several
+ * times, so it must not have side effects, and that the result of
+ * reralloc_size() is not checked.
+ */
+#define DECLARE_ARRAY(type, name) \
+	unsigned name ## _count, name ## _sz; \
+	type * name;
+
+#define array_insert(ctx, arr, val) do { \
+		if (arr ## _count == arr ## _sz) { \
+			arr ## _sz = MAX2(2 * arr ## _sz, 16); \
+			arr = reralloc_size(ctx, arr, arr ## _sz * sizeof(arr[0])); \
+		} \
+		arr[arr ##_count++] = val; \
+	} while (0)
+
+/* A single ir3 instruction.  It belongs to a block, carries an opcode,
+ * flags, a table of register operands, category specific data, and
+ * bookkeeping used by the various compiler passes.
+ */
+struct ir3_instruction {
+	struct ir3_block *block;
+	opc_t opc;
+	enum {
+		/* (sy) flag is set on first instruction, and after sample
+		 * instructions (probably just on RAW hazard).
+		 */
+		IR3_INSTR_SY    = 0x001,
+		/* (ss) flag is set on first instruction, and first instruction
+		 * to depend on the result of "long" instructions (RAW hazard):
+		 *
+		 *   rcp, rsq, log2, exp2, sin, cos, sqrt
+		 *
+		 * It seems to synchronize until all in-flight instructions are
+		 * completed, for example:
+		 *
+		 *   rsq hr1.w, hr1.w
+		 *   add.f hr2.z, (neg)hr2.z, hc0.y
+		 *   mul.f hr2.w, (neg)hr2.y, (neg)hr2.y
+		 *   rsq hr2.x, hr2.x
+		 *   (rpt1)nop
+		 *   mad.f16 hr2.w, hr2.z, hr2.z, hr2.w
+		 *   nop
+		 *   mad.f16 hr2.w, (neg)hr0.w, (neg)hr0.w, hr2.w
+		 *   (ss)(rpt2)mul.f hr1.x, (r)hr1.x, hr1.w
+		 *   (rpt2)mul.f hr0.x, (neg)(r)hr0.x, hr2.x
+		 *
+		 * The last mul.f does not have (ss) set, presumably because the
+		 * (ss) on the previous instruction does the job.
+		 *
+		 * The blob driver also seems to set it on WAR hazards, although
+		 * not really clear if this is needed or just blob compiler being
+		 * sloppy.  So far I haven't found a case where removing the (ss)
+		 * causes problems for WAR hazard, but I could just be getting
+		 * lucky:
+		 *
+		 *   rcp r1.y, r3.y
+		 *   (ss)(rpt2)mad.f32 r3.y, (r)c9.x, r1.x, (r)r3.z
+		 *
+		 */
+		IR3_INSTR_SS    = 0x002,
+		/* (jp) flag is set on jump targets:
+		 */
+		IR3_INSTR_JP    = 0x004,
+		IR3_INSTR_UL    = 0x008,
+		IR3_INSTR_3D    = 0x010,
+		IR3_INSTR_A     = 0x020,
+		IR3_INSTR_O     = 0x040,
+		IR3_INSTR_P     = 0x080,
+		IR3_INSTR_S     = 0x100,
+		IR3_INSTR_S2EN  = 0x200,
+		IR3_INSTR_G     = 0x400,
+		IR3_INSTR_SAT   = 0x800,
+		/* meta-flags, for intermediate stages of IR, ie.
+		 * before register assignment is done:
+		 */
+		IR3_INSTR_MARK  = 0x1000,
+		IR3_INSTR_UNUSED= 0x2000,
+	} flags;
+	int repeat;
+#ifdef DEBUG
+	/* number of slots allocated in regs[], only tracked in DEBUG builds: */
+	unsigned regs_max;
+#endif
+	/* number of entries of regs[] currently in use: */
+	unsigned regs_count;
+	struct ir3_register **regs;
+	/* category specific data, which member is valid depends on opc: */
+	union {
+		struct {
+			char inv;
+			char comp;
+			int  immed;
+			struct ir3_block *target;
+		} cat0;
+		struct {
+			type_t src_type, dst_type;
+		} cat1;
+		struct {
+			enum {
+				IR3_COND_LT = 0,
+				IR3_COND_LE = 1,
+				IR3_COND_GT = 2,
+				IR3_COND_GE = 3,
+				IR3_COND_EQ = 4,
+				IR3_COND_NE = 5,
+			} condition;
+		} cat2;
+		struct {
+			unsigned samp, tex;
+			type_t type;
+		} cat5;
+		struct {
+			type_t type;
+			int src_offset;
+			int dst_offset;
+			int iim_val : 3;      /* for ldgb/stgb, # of components */
+			int d : 3;
+			bool typed : 1;
+		} cat6;
+		struct {
+			unsigned w : 1;       /* write */
+			unsigned r : 1;       /* read */
+			unsigned l : 1;       /* local */
+			unsigned g : 1;       /* global */
+		} cat7;
+		/* for meta-instructions, just used to hold extra data
+		 * before instruction scheduling, etc
+		 */
+		struct {
+			int off;              /* component/offset */
+		} fo;
+		struct {
+			struct ir3_block *block;
+		} inout;
+	};
+
+	/* transient values used during various algorithms: */
+	union {
+		/* The instruction depth is the max dependency distance to output.
+		 *
+		 * You can also think of it as the "cost", if we did any sort of
+		 * optimization for register footprint.  Ie. a value that is just
+		 * the result of moving a const to a reg would have a low cost, so
+		 * it could make sense to duplicate the instruction at various
+		 * points where the result is needed to reduce register footprint.
+		 */
+		unsigned depth;
+		/* When we get to the RA stage, we no longer need depth, but
+		 * we do need instruction's position/name:
+		 */
+		struct {
+			uint16_t ip;
+			uint16_t name;
+		};
+	};
+
+	/* used for per-pass extra instruction data.
+	 */
+	void *data;
+
+	/* Used during CP and RA stages.  For fanin and shader inputs/
+	 * outputs where we need a sequence of consecutive registers,
+	 * keep track of each src instruction's left (ie 'n-1') and right
+	 * (ie 'n+1') neighbor.  The front-end must insert enough mov's
+	 * to ensure that each instruction has at most one left and at
+	 * most one right neighbor.  During the copy-propagation pass,
+	 * we only remove mov's when we can preserve this constraint.
+	 * And during the RA stage, we use the neighbor information to
+	 * allocate a block of registers in one shot.
+	 *
+	 * TODO: maybe just add something like:
+	 *   struct ir3_instruction_ref {
+	 *       struct ir3_instruction *instr;
+	 *       unsigned cnt;
+	 *   }
+	 *
+	 * Or can we get away without the refcnt stuff?  It seems like
+	 * it should be overkill..  the problem is if, potentially after
+	 * already eliminating some mov's, you have a single mov that
+	 * needs to be grouped with its neighbors in two different
+	 * places (ex. shader output and a fanin).
+	 */
+	struct {
+		struct ir3_instruction *left, *right;
+		uint16_t left_cnt, right_cnt;
+	} cp;
+
+	/* an instruction can reference at most one address register amongst
+	 * its src/dst registers.  Beyond that, you need to insert mov's.
+	 *
+	 * NOTE: do not write this directly, use ir3_instr_set_address()
+	 */
+	struct ir3_instruction *address;
+
+	/* Tracking for additional dependent instructions.  Used to handle
+	 * barriers, WAR hazards for arrays/SSBOs/etc.
+	 */
+	DECLARE_ARRAY(struct ir3_instruction *, deps);
+
+	/*
+	 * From PoV of instruction scheduling, not execution (ie. ignores global/
+	 * local distinction):
+	 *                            shared  image  atomic  SSBO  everything
+	 *   barrier()/            -   R/W     R/W    R/W     R/W       X
+	 *     groupMemoryBarrier()
+	 *   memoryBarrier()       -           R/W    R/W
+	 *     (but only images declared coherent?)
+	 *   memoryBarrierAtomic() -                  R/W
+	 *   memoryBarrierBuffer() -                          R/W
+	 *   memoryBarrierImage()  -           R/W
+	 *   memoryBarrierShared() -   R/W
+	 *
+	 * TODO I think for SSBO/image/shared, in cases where we can determine
+	 * which variable is accessed, we don't need to care about accesses to
+	 * different variables (unless declared coherent??)
+	 */
+	enum {
+		IR3_BARRIER_EVERYTHING = 1 << 0,
+		IR3_BARRIER_SHARED_R   = 1 << 1,
+		IR3_BARRIER_SHARED_W   = 1 << 2,
+		IR3_BARRIER_IMAGE_R    = 1 << 3,
+		IR3_BARRIER_IMAGE_W    = 1 << 4,
+		IR3_BARRIER_BUFFER_R   = 1 << 5,
+		IR3_BARRIER_BUFFER_W   = 1 << 6,
+		IR3_BARRIER_ARRAY_R    = 1 << 7,
+		IR3_BARRIER_ARRAY_W    = 1 << 8,
+	} barrier_class, barrier_conflict;
+
+	/* Entry in ir3_block's instruction list: */
+	struct list_head node;
+
+	int use_count;      /* currently just updated/used by cp */
+
+#ifdef DEBUG
+	/* per-shader sequence number, assigned when the instruction is
+	 * inserted into a block:
+	 */
+	uint32_t serialno;
+#endif
+};
+
+/* Follow the cp.left links and return the left-most neighbor of 'instr'
+ * (which is 'instr' itself if it has no left neighbor).  The walk gives
+ * up, with a debug assert, after more than 0xffff steps.
+ */
+static inline struct ir3_instruction *
+ir3_neighbor_first(struct ir3_instruction *instr)
+{
+	int steps;
+
+	for (steps = 1; instr->cp.left; steps++) {
+		instr = instr->cp.left;
+		if (steps > 0xffff) {
+			/* chain is implausibly long, bail: */
+			debug_assert(0);
+			break;
+		}
+	}
+
+	return instr;
+}
+
+/* Count the instructions in a neighbor chain, walking cp.right links.
+ * 'instr' must be the left-most one (asserted), so the result is at
+ * least 1.  Counting stops (with a debug assert) once it passes 0xffff.
+ */
+static inline int ir3_neighbor_count(struct ir3_instruction *instr)
+{
+	int total;
+
+	debug_assert(!instr->cp.left);
+
+	for (total = 1; instr->cp.right; ) {
+		total++;
+		instr = instr->cp.right;
+
+		if (total > 0xffff) {
+			debug_assert(0);
+			break;
+		}
+	}
+
+	return total;
+}
+
+struct ir3 {	/* one shader's IR: inputs/outputs, blocks, arrays and tracking lists */
+	struct ir3_compiler *compiler;
+
+	unsigned ninputs, noutputs;	/* presumably the sizes of inputs[]/outputs[] -- TODO confirm */
+	struct ir3_instruction **inputs;
+	struct ir3_instruction **outputs;
+
+	/* Track bary.f (and ldlv) instructions.. this is needed in
+	 * scheduling to ensure that all varying fetches happen before
+	 * any potential kill instructions.  The hw gets grumpy if all
+	 * threads in a group are killed before the last bary.f gets
+	 * a chance to signal end of input (ei).
+	 */
+	DECLARE_ARRAY(struct ir3_instruction *, baryfs);
+
+	/* Track all indirect instructions (read and write).  To avoid
+	 * deadlock scenario where an address register gets scheduled,
+	 * but other dependent src instructions cannot be scheduled due
+	 * to dependency on a *different* address register value, the
+	 * scheduler needs to ensure that all dependencies of the
+	 * instruction other than the address register are scheduled
+	 * before the one that writes the address register.  Having a
+	 * convenient list of instructions that reference some address
+	 * register simplifies this.
+	 */
+	DECLARE_ARRAY(struct ir3_instruction *, indirects);
+
+	/* and same for instructions that consume predicate register: */
+	DECLARE_ARRAY(struct ir3_instruction *, predicates);
+
+	/* Track texture sample instructions which need texture state
+	 * patched in (for astc-srgb workaround):
+	 */
+	DECLARE_ARRAY(struct ir3_instruction *, astc_srgb);
+
+	/* List of blocks: */
+	struct list_head block_list;
+
+	/* List of ir3_array's: */
+	struct list_head array_list;
+
+#ifdef DEBUG
+	unsigned block_count, instr_count;	/* only present in DEBUG builds */
+#endif
+};
+
+struct ir3_array {
+	struct list_head node;	/* presumably the link in ir3::array_list -- TODO confirm */
+	unsigned length;
+	unsigned id;	/* the key taken by ir3_lookup_array() -- presumably; confirm */
+
+	struct nir_register *r;
+
+	/* To avoid array writes from getting DCE'd, keep track of the
+	 * most recent write.  Any array access depends on the most
+	 * recent write.  This way, nothing depends on writes after the
+	 * last read.  But all the writes that happen before that have
+	 * something depending on them.
+	 */
+	struct ir3_instruction *last_write;
+
+	/* extra stuff used in RA pass: */
+	unsigned base;      /* base vreg name */
+	unsigned reg;       /* base physical reg */
+	uint16_t start_ip, end_ip;
+};
+
+struct ir3_array * ir3_lookup_array(struct ir3 *ir, unsigned id);
+
+struct ir3_block {
+	struct list_head node;	/* presumably the link in ir3::block_list -- TODO confirm */
+	struct ir3 *shader;
+
+	const struct nir_block *nblock;
+
+	struct list_head instr_list;  /* list of ir3_instruction */
+
+	/* each block has either one or two successors.. in case of
+	 * two successors, 'condition' decides which one to follow.
+	 * A block preceding an if/else has two successors.
+	 */
+	struct ir3_instruction *condition;
+	struct ir3_block *successors[2];
+
+	unsigned predecessors_count;	/* presumably the number of entries in predecessors[] */
+	struct ir3_block **predecessors;
+
+	uint16_t start_ip, end_ip;
+
+	/* Track instructions which do not write a register but other-
+	 * wise must not be discarded (such as kill, stg, etc)
+	 */
+	DECLARE_ARRAY(struct ir3_instruction *, keeps);
+
+	/* used for per-pass extra block data.  Mainly used right
+	 * now in RA step to track livein/liveout.
+	 */
+	void *data;
+
+#ifdef DEBUG
+	uint32_t serialno;	/* DEBUG builds only; this is what block_id() returns */
+#endif
+};
+
+/* Returns a 32-bit identifier for 'block'.  DEBUG builds return the
+ * block's serialno; other builds derive the id from the block's
+ * address, truncated to 32 bits.
+ */
+static inline uint32_t
+block_id(struct ir3_block *block)
+{
+#ifdef DEBUG
+	return block->serialno;
+#else
+	/* convert via uintptr_t rather than unsigned long: on LLP64
+	 * targets unsigned long is narrower than a pointer, so the old
+	 * cast was a pointer-to-smaller-integer conversion
+	 */
+	return (uint32_t)(uintptr_t)block;
+#endif
+}
+
+struct ir3 * ir3_create(struct ir3_compiler *compiler,
+		unsigned nin, unsigned nout);	/* nin/nout: presumably stored as ninputs/noutputs -- confirm */
+void ir3_destroy(struct ir3 *shader);
+void * ir3_assemble(struct ir3 *shader,
+		struct ir3_info *info, uint32_t gpu_id);
+void * ir3_alloc(struct ir3 *shader, int sz);
+
+struct ir3_block * ir3_block_create(struct ir3 *shader);
+
+struct ir3_instruction * ir3_instr_create(struct ir3_block *block, opc_t opc);
+struct ir3_instruction * ir3_instr_create2(struct ir3_block *block,
+		opc_t opc, int nreg);	/* the INSTR4 builders pass nreg = 5 (dst + 4 srcs) */
+struct ir3_instruction * ir3_instr_clone(struct ir3_instruction *instr);
+void ir3_instr_add_dep(struct ir3_instruction *instr, struct ir3_instruction *dep);
+const char *ir3_instr_name(struct ir3_instruction *instr);
+
+struct ir3_register * ir3_reg_create(struct ir3_instruction *instr,
+		int num, int flags);
+struct ir3_register * ir3_reg_clone(struct ir3 *shader,
+		struct ir3_register *reg);
+
+void ir3_instr_set_address(struct ir3_instruction *instr,
+		struct ir3_instruction *addr);	/* use this instead of writing instr->address directly */
+
+/* Test-and-set of IR3_INSTR_MARK: returns true if 'instr' was already
+ * marked (ie. already visited), otherwise marks it and returns false.
+ */
+static inline bool ir3_instr_check_mark(struct ir3_instruction *instr)
+{
+	bool visited = !!(instr->flags & IR3_INSTR_MARK);
+
+	if (!visited)
+		instr->flags |= IR3_INSTR_MARK;
+
+	return visited;
+}
+
+void ir3_block_clear_mark(struct ir3_block *block);	/* presumably clears IR3_INSTR_MARK in one block -- confirm */
+void ir3_clear_mark(struct ir3 *shader);	/* presumably the same for the whole shader -- confirm */
+
+unsigned ir3_count_instructions(struct ir3 *ir);
+
+/* Returns the index of 'reg' within instr->regs[] (compared by
+ * pointer), or -1 if the instruction does not hold that register.
+ */
+static inline int ir3_instr_regno(struct ir3_instruction *instr,
+		struct ir3_register *reg)
+{
+	unsigned idx = 0;
+
+	while (idx < instr->regs_count) {
+		if (instr->regs[idx] == reg)
+			return idx;
+		idx++;
+	}
+
+	return -1;
+}
+
+
+#define MAX_ARRAYS 16
+
+/* Pack a register number and a component into a single register id:
+ * the component occupies the low two bits, the number the bits above.
+ *
+ * comp:
+ *   0 - x
+ *   1 - y
+ *   2 - z
+ *   3 - w
+ */
+static inline uint32_t regid(int num, int comp)
+{
+	int xyzw = comp & 0x3;
+
+	return (num << 2) | xyzw;
+}
+
+/* Register-number part of reg->num (the inverse of regid() for 'num'). */
+static inline uint32_t reg_num(struct ir3_register *reg)
+{
+	return (uint32_t)(reg->num >> 2);
+}
+
+/* Component part (0..3, ie. x/y/z/w) of reg->num. */
+static inline uint32_t reg_comp(struct ir3_register *reg)
+{
+	return (uint32_t)(reg->num & 0x3);
+}
+
+/* True for category 0 opcodes (the cat0 builders in this header are
+ * br, jump, kill and end).
+ */
+static inline bool is_flow(struct ir3_instruction *instr)
+{
+	return opc_cat(instr->opc) == 0;
+}
+
+/* True if 'instr' is a kill. */
+static inline bool is_kill(struct ir3_instruction *instr)
+{
+	const bool kill = (instr->opc == OPC_KILL);
+
+	return kill;
+}
+
+/* True if 'instr' is a nop. */
+static inline bool is_nop(struct ir3_instruction *instr)
+{
+	const bool nop = (instr->opc == OPC_NOP);
+
+	return nop;
+}
+
+/* Is 'instr' a mov that does not change the type of the value?
+ *
+ * Besides a plain mov with matching src/dst types, an unsaturated
+ * absneg.s/absneg.f also qualifies, since for the most part it can be
+ * treated as a mov (single src argument).  Anything whose destination
+ * is p0.x, a0.x, or a relative/array register is rejected.
+ */
+static inline bool is_same_type_mov(struct ir3_instruction *instr)
+{
+	struct ir3_register *dst;
+
+	if (instr->opc == OPC_MOV) {
+		if (instr->cat1.src_type != instr->cat1.dst_type)
+			return false;
+	} else if ((instr->opc == OPC_ABSNEG_F) ||
+			(instr->opc == OPC_ABSNEG_S)) {
+		if (instr->flags & IR3_INSTR_SAT)
+			return false;
+	} else {
+		return false;
+	}
+
+	dst = instr->regs[0];
+
+	/* mov's that write to a0.x or p0.x are special: */
+	if ((dst->num == regid(REG_P0, 0)) || (dst->num == regid(REG_A0, 0)))
+		return false;
+
+	return !(dst->flags & (IR3_REG_RELATIV | IR3_REG_ARRAY));
+}
+
+/* True for opcodes in categories 1 through 3 (inclusive). */
+static inline bool is_alu(struct ir3_instruction *instr)
+{
+	if (opc_cat(instr->opc) < 1)
+		return false;
+
+	return opc_cat(instr->opc) <= 3;
+}
+
+/* True for category 4 opcodes (rcp, rsq, log2, exp2, sin, cos, sqrt
+ * are the cat4 builders in this header).
+ */
+static inline bool is_sfu(struct ir3_instruction *instr)
+{
+	return opc_cat(instr->opc) == 4;
+}
+
+/* True for category 5 opcodes. */
+static inline bool is_tex(struct ir3_instruction *instr)
+{
+	return opc_cat(instr->opc) == 5;
+}
+
+/* True for category 6 opcodes (loads, stores, atomics, ...). */
+static inline bool is_mem(struct ir3_instruction *instr)
+{
+	return opc_cat(instr->opc) == 6;
+}
+
+/* True for category 7 opcodes (bar and fence are the cat7 builders in
+ * this header).
+ */
+static inline bool is_barrier(struct ir3_instruction *instr)
+{
+	return opc_cat(instr->opc) == 7;
+}
+
+/* True for the store-like opcodes.  For these instructions the
+ * "destination" register is actually a source: the address to store
+ * to.
+ */
+static inline bool
+is_store(struct ir3_instruction *instr)
+{
+	bool store = false;
+
+	switch (instr->opc) {
+	case OPC_STG:
+	case OPC_STGB:
+	case OPC_STIB:
+	case OPC_STP:
+	case OPC_STL:
+	case OPC_STLW:
+	case OPC_L2G:
+	case OPC_G2L:
+		store = true;
+		break;
+	default:
+		break;
+	}
+
+	return store;
+}
+
+/* True for the load-like opcodes listed below (the list is known to
+ * be incomplete).
+ *
+ * NOTE(review): OPC_L2G is listed here and in is_store(), while
+ * OPC_G2L only appears in is_store() -- confirm that is intended.
+ */
+static inline bool is_load(struct ir3_instruction *instr)
+{
+	bool load = false;
+
+	switch (instr->opc) {
+	case OPC_LDG:
+	case OPC_LDGB:
+	case OPC_LDL:
+	case OPC_LDP:
+	case OPC_L2G:
+	case OPC_LDLW:
+	case OPC_LDC:
+	case OPC_LDLV:
+		/* probably some others too.. */
+		load = true;
+		break;
+	default:
+		break;
+	}
+
+	return load;
+}
+
+/* True for instructions that fetch a varying: bary.f, or ldlv (which
+ * in some cases is used to fetch a varying without interpolation).
+ * Fortunately inloc is the first src register in either case.
+ */
+static inline bool is_input(struct ir3_instruction *instr)
+{
+	return (instr->opc == OPC_LDLV) || (instr->opc == OPC_BARY_F);
+}
+
+/* True for the cmps.f / cmps.s / cmps.u compare opcodes. */
+static inline bool is_bool(struct ir3_instruction *instr)
+{
+	return (instr->opc == OPC_CMPS_F) ||
+		(instr->opc == OPC_CMPS_S) ||
+		(instr->opc == OPC_CMPS_U);
+}
+
+/* True for meta instructions, ie. those whose opcode category is -1.
+ *
+ * TODO how should we count PHI (and maybe fan-in/out) which might
+ * actually contribute some instructions to the final result?
+ */
+static inline bool is_meta(struct ir3_instruction *instr)
+{
+	return opc_cat(instr->opc) == -1;
+}
+
+/* True if the instruction's first register (regs[0], the dst) is in
+ * REG_A0.  An instruction without any registers writes nothing.
+ */
+static inline bool writes_addr(struct ir3_instruction *instr)
+{
+	if (!(instr->regs_count > 0))
+		return false;
+
+	return reg_num(instr->regs[0]) == REG_A0;
+}
+
+/* True if the instruction's first register (regs[0], the dst) is in
+ * REG_P0.  An instruction without any registers writes nothing.
+ */
+static inline bool writes_pred(struct ir3_instruction *instr)
+{
+	if (!(instr->regs_count > 0))
+		return false;
+
+	return reg_num(instr->regs[0]) == REG_P0;
+}
+
+/* Returns the defining instruction for 'reg' when the register is
+ * flagged SSA or ARRAY, otherwise NULL.
+ */
+/* TODO better name */
+static inline struct ir3_instruction *ssa(struct ir3_register *reg)
+{
+	return (reg->flags & (IR3_REG_SSA | IR3_REG_ARRAY)) ? reg->instr : NULL;
+}
+
+/* Two instructions conflict when both are non-NULL and they are not
+ * the same instruction.
+ */
+static inline bool conflicts(struct ir3_instruction *a,
+		struct ir3_instruction *b)
+{
+	if (!a || !b)
+		return false;
+
+	return a != b;
+}
+
+/* True unless 'r' is a const, an immediate, or lives in a0/p0. */
+static inline bool reg_gpr(struct ir3_register *r)
+{
+	if (r->flags & (IR3_REG_CONST | IR3_REG_IMMED))
+		return false;
+
+	return (reg_num(r) != REG_A0) && (reg_num(r) != REG_P0);
+}
+
+/* Map a 32-bit type to its 16-bit counterpart; 16-bit types map to
+ * themselves.  Any other type trips the assert (and yields ~0 when
+ * asserts are compiled out).
+ */
+static inline type_t half_type(type_t type)
+{
+	switch (type) {
+	case TYPE_F16:
+	case TYPE_U16:
+	case TYPE_S16:
+		return type;
+	case TYPE_F32:
+		return TYPE_F16;
+	case TYPE_U32:
+		return TYPE_U16;
+	case TYPE_S32:
+		return TYPE_S16;
+	default:
+		break;
+	}
+
+	assert(0);
+	return ~0;
+}
+
+/* some cat2 instructions (ie. those which are not float) can embed an
+ * immediate.  Returns true for the opcodes listed below and false for
+ * everything else.
+ *
+ * NOTE(review): OPC_BARY_F is on the list even though it is a float
+ * op -- presumably intentional, confirm.
+ */
+static inline bool ir3_cat2_int(opc_t opc)
+{
+	switch (opc) {
+	case OPC_ADD_U:
+	case OPC_ADD_S:
+	case OPC_SUB_U:
+	case OPC_SUB_S:
+	case OPC_CMPS_U:
+	case OPC_CMPS_S:
+	case OPC_MIN_U:
+	case OPC_MIN_S:
+	case OPC_MAX_U:
+	case OPC_MAX_S:
+	case OPC_CMPV_U:
+	case OPC_CMPV_S:
+	case OPC_MUL_U:
+	case OPC_MUL_S:
+	case OPC_MULL_U:
+	case OPC_CLZ_S:
+	case OPC_ABSNEG_S:
+	case OPC_AND_B:
+	case OPC_OR_B:
+	case OPC_NOT_B:
+	case OPC_XOR_B:
+	case OPC_BFREV_B:
+	case OPC_CLZ_B:
+	case OPC_SHL_B:
+	case OPC_SHR_B:
+	case OPC_ASHR_B:
+	case OPC_MGEN_B:
+	case OPC_GETBIT_B:
+	case OPC_CBITS_B:
+	case OPC_BARY_F:
+		return true;
+
+	default:
+		return false;
+	}
+}
+
+
+/* map cat2 instruction to valid abs/neg flags.  The result is a mask
+ * of IR3_REG_* modifier flags:
+ *   float ops         - IR3_REG_FABS | IR3_REG_FNEG
+ *   integer add/mul.. - 0 (no modifiers)
+ *   absneg.s          - IR3_REG_SABS | IR3_REG_SNEG
+ *   bitwise ops       - IR3_REG_BNOT
+ *   anything else     - 0
+ */
+static inline unsigned ir3_cat2_absneg(opc_t opc)
+{
+	switch (opc) {
+	case OPC_ADD_F:
+	case OPC_MIN_F:
+	case OPC_MAX_F:
+	case OPC_MUL_F:
+	case OPC_SIGN_F:
+	case OPC_CMPS_F:
+	case OPC_ABSNEG_F:
+	case OPC_CMPV_F:
+	case OPC_FLOOR_F:
+	case OPC_CEIL_F:
+	case OPC_RNDNE_F:
+	case OPC_RNDAZ_F:
+	case OPC_TRUNC_F:
+	case OPC_BARY_F:
+		return IR3_REG_FABS | IR3_REG_FNEG;
+
+	case OPC_ADD_U:
+	case OPC_ADD_S:
+	case OPC_SUB_U:
+	case OPC_SUB_S:
+	case OPC_CMPS_U:
+	case OPC_CMPS_S:
+	case OPC_MIN_U:
+	case OPC_MIN_S:
+	case OPC_MAX_U:
+	case OPC_MAX_S:
+	case OPC_CMPV_U:
+	case OPC_CMPV_S:
+	case OPC_MUL_U:
+	case OPC_MUL_S:
+	case OPC_MULL_U:
+	case OPC_CLZ_S:
+		return 0;
+
+	case OPC_ABSNEG_S:
+		return IR3_REG_SABS | IR3_REG_SNEG;
+
+	case OPC_AND_B:
+	case OPC_OR_B:
+	case OPC_NOT_B:
+	case OPC_XOR_B:
+	case OPC_BFREV_B:
+	case OPC_CLZ_B:
+	case OPC_SHL_B:
+	case OPC_SHR_B:
+	case OPC_ASHR_B:
+	case OPC_MGEN_B:
+	case OPC_GETBIT_B:
+	case OPC_CBITS_B:
+		return IR3_REG_BNOT;
+
+	default:
+		return 0;
+	}
+}
+
+/* map cat3 instructions to valid abs/neg flags: IR3_REG_FNEG for the
+ * float mad/sel variants, 0 (no modifiers) for everything else.  The
+ * integer and bitwise cases are listed only for documentation; they
+ * deliberately share the default's return.
+ */
+static inline unsigned ir3_cat3_absneg(opc_t opc)
+{
+	switch (opc) {
+	case OPC_MAD_F16:
+	case OPC_MAD_F32:
+	case OPC_SEL_F16:
+	case OPC_SEL_F32:
+		return IR3_REG_FNEG;
+
+	case OPC_MAD_U16:
+	case OPC_MADSH_U16:
+	case OPC_MAD_S16:
+	case OPC_MADSH_M16:
+	case OPC_MAD_U24:
+	case OPC_MAD_S24:
+	case OPC_SEL_S16:
+	case OPC_SEL_S32:
+	case OPC_SAD_S16:
+	case OPC_SAD_S32:
+		/* neg *may* work on 3rd src.. */
+
+	case OPC_SEL_B16:
+	case OPC_SEL_B32:
+
+	default:
+		return 0;
+	}
+}
+
+#define MASK(n) ((1 << (n)) - 1)	/* low n bits set; signed int math, so only meaningful for n < 31 */
+
+/* iterator for an instruction's sources (reg), also returns src #.
+ * Sources are regs[1..regs_count-1] (regs[0] is skipped), so __n is
+ * zero-based relative to the first source.  NULL entries are skipped.
+ * __srcreg must be declared by the caller; __n is declared by the
+ * macro itself.
+ */
+#define foreach_src_n(__srcreg, __n, __instr) \
+	if ((__instr)->regs_count) \
+		for (unsigned __cnt = (__instr)->regs_count - 1, __n = 0; __n < __cnt; __n++) \
+			if ((__srcreg = (__instr)->regs[__n + 1]))
+
+/* iterator for an instruction's sources (reg): */
+#define foreach_src(__srcreg, __instr) \
+	foreach_src_n(__srcreg, __i, __instr)
+
+/* Number of slots foreach_ssa_src_n() walks: every register, every
+ * entry of deps[], plus one more for the address if there is one.
+ */
+static inline unsigned __ssa_src_cnt(struct ir3_instruction *instr)
+{
+	return instr->regs_count + instr->deps_count +
+		(instr->address ? 1 : 0);
+}
+
+/* Returns the n'th SSA source of 'instr' (may be NULL):
+ *   n <  regs_count              - ssa() of regs[n]
+ *   n <  regs_count + deps_count - deps[n - regs_count]
+ *   n == regs_count + deps_count - the address instruction
+ */
+static inline struct ir3_instruction * __ssa_src_n(struct ir3_instruction *instr, unsigned n)
+{
+	unsigned addr_slot = instr->regs_count + instr->deps_count;
+
+	if (n == addr_slot)
+		return instr->address;
+
+	return (n >= instr->regs_count) ?
+		instr->deps[n - instr->regs_count] : ssa(instr->regs[n]);
+}
+
+/* True if source slot 'n' (numbered as in __ssa_src_n()) comes from
+ * deps[] rather than from a register or the address.
+ */
+static inline bool __is_false_dep(struct ir3_instruction *instr, unsigned n)
+{
+	return (n != (instr->regs_count + instr->deps_count)) &&
+		(n >= instr->regs_count);
+}
+
+#define __src_cnt(__instr) ((__instr)->address ? (__instr)->regs_count : (__instr)->regs_count - 1)
+
+/* iterator for an instruction's SSA sources (instr), also returns src #.
+ * Visits every slot counted by __ssa_src_cnt() (registers, then deps,
+ * then the address) and skips slots for which __ssa_src_n() is NULL.
+ * __srcinst must be declared by the caller; __n is declared here.
+ */
+#define foreach_ssa_src_n(__srcinst, __n, __instr) \
+	for (unsigned __cnt = __ssa_src_cnt(__instr), __n = 0; __n < __cnt; __n++) \
+		if ((__srcinst = __ssa_src_n(__instr, __n)))
+
+/* iterator for an instruction's SSA sources (instr): */
+#define foreach_ssa_src(__srcinst, __instr) \
+	foreach_ssa_src_n(__srcinst, __i, __instr)
+
+
+/* dump (whole IR, or a single instruction): */
+void ir3_print(struct ir3 *ir);
+void ir3_print_instr(struct ir3_instruction *instr);
+
+/* depth calculation: */
+int ir3_delayslots(struct ir3_instruction *assigner,
+		struct ir3_instruction *consumer, unsigned n);
+void ir3_insert_by_depth(struct ir3_instruction *instr, struct list_head *list);
+void ir3_depth(struct ir3 *ir);
+
+/* copy-propagate: */
+struct ir3_shader_variant;
+void ir3_cp(struct ir3 *ir, struct ir3_shader_variant *so);
+
+/* group neighbors and insert mov's to resolve conflicts: */
+void ir3_group(struct ir3 *ir);
+
+/* scheduling: */
+void ir3_sched_add_deps(struct ir3 *ir);
+int ir3_sched(struct ir3 *ir);
+
+/* register assignment: */
+struct ir3_ra_reg_set * ir3_ra_alloc_reg_set(struct ir3_compiler *compiler);
+int ir3_ra(struct ir3 *ir3, gl_shader_stage type,
+		bool frag_coord, bool frag_face);
+
+/* legalize (the pointer arguments are presumably outputs -- confirm): */
+void ir3_legalize(struct ir3 *ir, int *num_samp, bool *has_ssbo, int *max_bary);
+
+/* ************************************************************************* */
+/* instruction helpers */
+
+/* Appends an SSA source register to 'instr' that refers to 'src'.
+ * The new register gets IR3_REG_SSA plus the caller's 'flags', and
+ * inherits IR3_REG_HALF from src's dst so that it has the correct
+ * type (ie. half vs full precision).
+ */
+static inline struct ir3_register * __ssa_src(struct ir3_instruction *instr,
+		struct ir3_instruction *src, unsigned flags)
+{
+	unsigned reg_flags = IR3_REG_SSA | flags;
+	struct ir3_register *reg;
+
+	if (src->regs[0]->flags & IR3_REG_HALF)
+		reg_flags |= IR3_REG_HALF;
+
+	reg = ir3_reg_create(instr, 0, reg_flags);
+	reg->instr = src;
+
+	return reg;
+}
+
+/* Builds a same-type mov (src_type == dst_type == 'type') of 'src' in
+ * 'block'.  If src's dst is an array register, the new source is
+ * flagged as an array too and the array info is copied over.  A
+ * relative src dst is not expected (debug assert).
+ */
+static inline struct ir3_instruction *
+ir3_MOV(struct ir3_block *block, struct ir3_instruction *src, type_t type)
+{
+	struct ir3_instruction *mov = ir3_instr_create(block, OPC_MOV);
+	struct ir3_register *src_reg;
+	unsigned src_flags = 0;
+
+	ir3_reg_create(mov, 0, 0);   /* dst */
+
+	if (src->regs[0]->flags & IR3_REG_ARRAY)
+		src_flags = IR3_REG_ARRAY;
+
+	src_reg = __ssa_src(mov, src, src_flags);
+	if (src_flags)
+		src_reg->array = src->regs[0]->array;
+
+	debug_assert(!(src->regs[0]->flags & IR3_REG_RELATIV));
+
+	mov->cat1.src_type = type;
+	mov->cat1.dst_type = type;
+
+	return mov;
+}
+
+/* Builds a converting mov of 'src' from src_type to dst_type.  The
+ * dst is a half register when dst_type is narrower than 32 bits.
+ * src's dst must already match src_type's width and must not be an
+ * array register (both debug asserts).
+ */
+static inline struct ir3_instruction *
+ir3_COV(struct ir3_block *block, struct ir3_instruction *src,
+		type_t src_type, type_t dst_type)
+{
+	struct ir3_instruction *cov = ir3_instr_create(block, OPC_MOV);
+	unsigned dst_flags = 0;
+	unsigned src_flags = 0;
+
+	if (type_size(dst_type) < 32)
+		dst_flags = IR3_REG_HALF;
+	if (type_size(src_type) < 32)
+		src_flags = IR3_REG_HALF;
+
+	debug_assert((src->regs[0]->flags & IR3_REG_HALF) == src_flags);
+
+	ir3_reg_create(cov, 0, dst_flags);   /* dst */
+	__ssa_src(cov, src, 0);
+
+	cov->cat1.src_type = src_type;
+	cov->cat1.dst_type = dst_type;
+
+	debug_assert(!(src->regs[0]->flags & IR3_REG_ARRAY));
+
+	return cov;
+}
+
+/* Builds a nop (no registers) in 'block'. */
+static inline struct ir3_instruction *
+ir3_NOP(struct ir3_block *block)
+{
+	struct ir3_instruction *nop = ir3_instr_create(block, OPC_NOP);
+
+	return nop;
+}
+
+#define INSTR0(name)  /* defines ir3_<name>(block): no registers */      \
+static inline struct ir3_instruction *                                   \
+ir3_##name(struct ir3_block *block)                                      \
+{                                                                        \
+	struct ir3_instruction *instr =                                      \
+		ir3_instr_create(block, OPC_##name);                             \
+	return instr;                                                        \
+}
+
+#define INSTR1(name)  /* defines ir3_<name>(): dst + 1 SSA src */        \
+static inline struct ir3_instruction *                                   \
+ir3_##name(struct ir3_block *block,                                      \
+		struct ir3_instruction *a, unsigned aflags)                      \
+{                                                                        \
+	struct ir3_instruction *instr =                                      \
+		ir3_instr_create(block, OPC_##name);                             \
+	ir3_reg_create(instr, 0, 0);   /* dst */                             \
+	__ssa_src(instr, a, aflags);                                         \
+	return instr;                                                        \
+}
+
+#define INSTR2(name)  /* defines ir3_<name>(): dst + 2 SSA srcs */       \
+static inline struct ir3_instruction *                                   \
+ir3_##name(struct ir3_block *block,                                      \
+		struct ir3_instruction *a, unsigned aflags,                      \
+		struct ir3_instruction *b, unsigned bflags)                      \
+{                                                                        \
+	struct ir3_instruction *instr =                                      \
+		ir3_instr_create(block, OPC_##name);                             \
+	ir3_reg_create(instr, 0, 0);   /* dst */                             \
+	__ssa_src(instr, a, aflags);                                         \
+	__ssa_src(instr, b, bflags);                                         \
+	return instr;                                                        \
+}
+
+#define INSTR3(name)  /* defines ir3_<name>(): dst + 3 SSA srcs */       \
+static inline struct ir3_instruction *                                   \
+ir3_##name(struct ir3_block *block,                                      \
+		struct ir3_instruction *a, unsigned aflags,                      \
+		struct ir3_instruction *b, unsigned bflags,                      \
+		struct ir3_instruction *c, unsigned cflags)                      \
+{                                                                        \
+	struct ir3_instruction *instr =                                      \
+		ir3_instr_create(block, OPC_##name);                             \
+	ir3_reg_create(instr, 0, 0);   /* dst */                             \
+	__ssa_src(instr, a, aflags);                                         \
+	__ssa_src(instr, b, bflags);                                         \
+	__ssa_src(instr, c, cflags);                                         \
+	return instr;                                                        \
+}
+
+#define INSTR4(name)  /* dst + 4 SSA srcs; asks create2 for 5 regs */    \
+static inline struct ir3_instruction *                                   \
+ir3_##name(struct ir3_block *block,                                      \
+		struct ir3_instruction *a, unsigned aflags,                      \
+		struct ir3_instruction *b, unsigned bflags,                      \
+		struct ir3_instruction *c, unsigned cflags,                      \
+		struct ir3_instruction *d, unsigned dflags)                      \
+{                                                                        \
+	struct ir3_instruction *instr =                                      \
+		ir3_instr_create2(block, OPC_##name, 5);                         \
+	ir3_reg_create(instr, 0, 0);   /* dst */                             \
+	__ssa_src(instr, a, aflags);                                         \
+	__ssa_src(instr, b, bflags);                                         \
+	__ssa_src(instr, c, cflags);                                         \
+	__ssa_src(instr, d, dflags);                                         \
+	return instr;                                                        \
+}
+
+#define INSTR4F(f, name)  /* like INSTR4, named ir3_<name>_<f>() */      \
+static inline struct ir3_instruction *                                   \
+ir3_##name##_##f(struct ir3_block *block,                                \
+		struct ir3_instruction *a, unsigned aflags,                      \
+		struct ir3_instruction *b, unsigned bflags,                      \
+		struct ir3_instruction *c, unsigned cflags,                      \
+		struct ir3_instruction *d, unsigned dflags)                      \
+{                                                                        \
+	struct ir3_instruction *instr =                                      \
+		ir3_instr_create2(block, OPC_##name, 5);                         \
+	ir3_reg_create(instr, 0, 0);   /* dst */                             \
+	__ssa_src(instr, a, aflags);                                         \
+	__ssa_src(instr, b, bflags);                                         \
+	__ssa_src(instr, c, cflags);                                         \
+	__ssa_src(instr, d, dflags);                                         \
+	instr->flags |= IR3_INSTR_##f;   /* also sets IR3_INSTR_<f> */       \
+	return instr;                                                        \
+}
+
+/* cat0 instructions.  Here and below, each INSTRn(X) line defines an
+ * inline builder ir3_X() that creates an OPC_X instruction with a dst
+ * and n SSA sources (INSTR0: no registers at all):
+ */
+INSTR0(BR)
+INSTR0(JUMP)
+INSTR1(KILL)
+INSTR0(END)
+
+/* cat2 instructions, most 2 src but some 1 src: */
+INSTR2(ADD_F)
+INSTR2(MIN_F)
+INSTR2(MAX_F)
+INSTR2(MUL_F)
+INSTR1(SIGN_F)
+INSTR2(CMPS_F)
+INSTR1(ABSNEG_F)
+INSTR2(CMPV_F)
+INSTR1(FLOOR_F)
+INSTR1(CEIL_F)
+INSTR1(RNDNE_F)
+INSTR1(RNDAZ_F)
+INSTR1(TRUNC_F)
+INSTR2(ADD_U)
+INSTR2(ADD_S)
+INSTR2(SUB_U)
+INSTR2(SUB_S)
+INSTR2(CMPS_U)
+INSTR2(CMPS_S)
+INSTR2(MIN_U)
+INSTR2(MIN_S)
+INSTR2(MAX_U)
+INSTR2(MAX_S)
+INSTR1(ABSNEG_S)
+INSTR2(AND_B)
+INSTR2(OR_B)
+INSTR1(NOT_B)
+INSTR2(XOR_B)
+INSTR2(CMPV_U)
+INSTR2(CMPV_S)
+INSTR2(MUL_U)
+INSTR2(MUL_S)
+INSTR2(MULL_U)
+INSTR1(BFREV_B)
+INSTR1(CLZ_S)
+INSTR1(CLZ_B)
+INSTR2(SHL_B)
+INSTR2(SHR_B)
+INSTR2(ASHR_B)
+INSTR2(BARY_F)
+INSTR2(MGEN_B)
+INSTR2(GETBIT_B)
+INSTR1(SETRM)
+INSTR1(CBITS_B)
+INSTR2(SHB)
+INSTR2(MSAD)
+
+/* cat3 instructions: */
+INSTR3(MAD_U16)
+INSTR3(MADSH_U16)
+INSTR3(MAD_S16)
+INSTR3(MADSH_M16)
+INSTR3(MAD_U24)
+INSTR3(MAD_S24)
+INSTR3(MAD_F16)
+INSTR3(MAD_F32)
+INSTR3(SEL_B16)
+INSTR3(SEL_B32)
+INSTR3(SEL_S16)
+INSTR3(SEL_S32)
+INSTR3(SEL_F16)
+INSTR3(SEL_F32)
+INSTR3(SAD_S16)
+INSTR3(SAD_S32)
+
+/* cat4 instructions: */
+INSTR1(RCP)
+INSTR1(RSQ)
+INSTR1(LOG2)
+INSTR1(EXP2)
+INSTR1(SIN)
+INSTR1(COS)
+INSTR1(SQRT)
+
+/* cat5 instructions (sample instructions are built by ir3_SAM()): */
+INSTR1(DSX)
+INSTR1(DSY)
+
+/* Builds a cat5 sample instruction 'opc' in 'block'.
+ *
+ *   type      - stored in cat5.type
+ *   wrmask    - write mask for the dst register
+ *   flags     - extra instruction flags, or'd into sam->flags
+ *   samp, tex - stored in cat5.samp / cat5.tex
+ *   src0/src1 - optional sources (either may be NULL).  Each gets an
+ *               SSA register whose wrmask has one bit for every
+ *               register of the source instruction after its first.
+ */
+static inline struct ir3_instruction *
+ir3_SAM(struct ir3_block *block, opc_t opc, type_t type,
+		unsigned wrmask, unsigned flags, unsigned samp, unsigned tex,
+		struct ir3_instruction *src0, struct ir3_instruction *src1)
+{
+	struct ir3_instruction *sam = ir3_instr_create(block, opc);
+	struct ir3_register *dst, *srcreg;
+
+	sam->flags |= flags;
+
+	dst = ir3_reg_create(sam, 0, 0);
+	dst->wrmask = wrmask;
+
+	if (src0) {
+		srcreg = ir3_reg_create(sam, 0, IR3_REG_SSA);
+		srcreg->instr = src0;
+		srcreg->wrmask = (1 << (src0->regs_count - 1)) - 1;
+	}
+
+	if (src1) {
+		srcreg = ir3_reg_create(sam, 0, IR3_REG_SSA);
+		srcreg->instr = src1;
+		srcreg->wrmask = (1 << (src1->regs_count - 1)) - 1;
+	}
+
+	sam->cat5.samp = samp;
+	sam->cat5.tex  = tex;
+	sam->cat5.type  = type;
+
+	return sam;
+}
+
+/* cat6 instructions.  Each atomic op gets two builders: INSTR2(X)
+ * defines the 2-src ir3_X(), and INSTR4F(G, X) defines the 4-src
+ * ir3_X_G(), which additionally sets IR3_INSTR_G on the instruction:
+ */
+INSTR2(LDLV)
+INSTR2(LDG)
+INSTR2(LDL)
+INSTR3(STG)
+INSTR3(STL)
+INSTR3(LDGB)
+INSTR4(STGB)
+INSTR4(STIB)
+INSTR1(RESINFO)
+INSTR1(RESFMT)
+INSTR2(ATOMIC_ADD)
+INSTR2(ATOMIC_SUB)
+INSTR2(ATOMIC_XCHG)
+INSTR2(ATOMIC_INC)
+INSTR2(ATOMIC_DEC)
+INSTR2(ATOMIC_CMPXCHG)
+INSTR2(ATOMIC_MIN)
+INSTR2(ATOMIC_MAX)
+INSTR2(ATOMIC_AND)
+INSTR2(ATOMIC_OR)
+INSTR2(ATOMIC_XOR)
+INSTR4F(G, ATOMIC_ADD)
+INSTR4F(G, ATOMIC_SUB)
+INSTR4F(G, ATOMIC_XCHG)
+INSTR4F(G, ATOMIC_INC)
+INSTR4F(G, ATOMIC_DEC)
+INSTR4F(G, ATOMIC_CMPXCHG)
+INSTR4F(G, ATOMIC_MIN)
+INSTR4F(G, ATOMIC_MAX)
+INSTR4F(G, ATOMIC_AND)
+INSTR4F(G, ATOMIC_OR)
+INSTR4F(G, ATOMIC_XOR)
+
+/* cat7 instructions: */
+INSTR0(BAR)
+INSTR0(FENCE)
+
+/* ************************************************************************* */
+/* split this out or find some helper to use.. like main/bitset.h.. */
+
+#include <string.h>
+
+#define MAX_REG 256	/* number of register slots per bank (full / half) */
+
+typedef uint8_t regmask_t[2 * MAX_REG / 8];	/* one bit per slot: full regs at [0, MAX_REG), half regs at [MAX_REG, 2*MAX_REG) */
+
+/* Bit index of 'reg' in a regmask_t: the array offset for relative
+ * registers, otherwise reg->num; half registers are placed MAX_REG
+ * bits further up so they do not overlap the full registers.
+ */
+static inline unsigned regmask_idx(struct ir3_register *reg)
+{
+	unsigned idx;
+
+	if (reg->flags & IR3_REG_RELATIV)
+		idx = reg->array.offset;
+	else
+		idx = reg->num;
+
+	debug_assert(idx < MAX_REG);
+
+	return (reg->flags & IR3_REG_HALF) ? (idx + MAX_REG) : idx;
+}
+
+/* Clear every bit of 'regmask'. */
+static inline void regmask_init(regmask_t *regmask)
+{
+	memset(regmask, 0, sizeof *regmask);
+}
+
+/* Set the bits covered by 'reg' in 'regmask'.  A relative register
+ * covers reg->size consecutive slots starting at its index; any other
+ * register covers the slots selected by the set bits of reg->wrmask.
+ */
+static inline void regmask_set(regmask_t *regmask, struct ir3_register *reg)
+{
+	unsigned idx = regmask_idx(reg);
+
+	if (reg->flags & IR3_REG_RELATIV) {
+		for (unsigned left = reg->size; left; left--, idx++)
+			(*regmask)[idx / 8] |= 1 << (idx % 8);
+		return;
+	}
+
+	unsigned pending = reg->wrmask;
+
+	while (pending) {
+		if (pending & 1)
+			(*regmask)[idx / 8] |= 1 << (idx % 8);
+		pending >>= 1;
+		idx++;
+	}
+}
+
+/* dst = a | b, byte by byte.  'dst' may be the same mask as 'a' or
+ * 'b', since each byte is read before it is written.
+ */
+static inline void regmask_or(regmask_t *dst, regmask_t *a, regmask_t *b)
+{
+	for (unsigned byte = 0; byte < ARRAY_SIZE(*dst); byte++) {
+		uint8_t merged = (*a)[byte] | (*b)[byte];
+
+		(*dst)[byte] = merged;
+	}
+}
+
+/* For every slot covered by 'reg' (see regmask_set()), set the bit in
+ * 'a' unless that bit is already set in 'b'.  Conceptually:
+ *   a |= (reg & ~b)
+ */
+static inline void regmask_set_if_not(regmask_t *a,
+		struct ir3_register *reg, regmask_t *b)
+{
+	unsigned idx = regmask_idx(reg);
+
+	if (reg->flags & IR3_REG_RELATIV) {
+		for (unsigned left = reg->size; left; left--, idx++) {
+			if ((*b)[idx / 8] & (1 << (idx % 8)))
+				continue;
+			(*a)[idx / 8] |= 1 << (idx % 8);
+		}
+		return;
+	}
+
+	unsigned pending = reg->wrmask;
+
+	while (pending) {
+		if ((pending & 1) && !((*b)[idx / 8] & (1 << (idx % 8))))
+			(*a)[idx / 8] |= 1 << (idx % 8);
+		pending >>= 1;
+		idx++;
+	}
+}
+
+/* Returns true if any slot covered by 'reg' (see regmask_set()) has
+ * its bit set in 'regmask'.
+ */
+static inline bool regmask_get(regmask_t *regmask,
+		struct ir3_register *reg)
+{
+	unsigned idx = regmask_idx(reg);
+
+	if (reg->flags & IR3_REG_RELATIV) {
+		for (unsigned left = reg->size; left; left--, idx++)
+			if ((*regmask)[idx / 8] & (1 << (idx % 8)))
+				return true;
+		return false;
+	}
+
+	unsigned pending = reg->wrmask;
+
+	while (pending) {
+		if ((pending & 1) && ((*regmask)[idx / 8] & (1 << (idx % 8))))
+			return true;
+		pending >>= 1;
+		idx++;
+	}
+
+	return false;
+}
+
+/* ************************************************************************* */
+
+#endif /* IR3_H_ */
diff --git a/src/freedreno/ir3/ir3_compiler.c b/src/freedreno/ir3/ir3_compiler.c
new file mode 100644
index 00000000000..f00daebabf5
--- /dev/null
+++ b/src/freedreno/ir3/ir3_compiler.c
@@ -0,0 +1,71 @@
+/*
+ * Copyright (C) 2015 Rob Clark <[email protected]>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Authors:
+ *    Rob Clark <[email protected]>
+ */
+
+#include "util/ralloc.h"
+
+#include "ir3_compiler.h"
+
+static const struct debug_named_value shader_debug_options[] = {	/* flag names accepted by the IR3_SHADER_DEBUG option */
+		{"vs", IR3_DBG_SHADER_VS, "Print shader disasm for vertex shaders"},
+		{"fs", IR3_DBG_SHADER_FS, "Print shader disasm for fragment shaders"},
+		{"cs", IR3_DBG_SHADER_CS, "Print shader disasm for compute shaders"},
+		{"disasm",  IR3_DBG_DISASM, "Dump NIR and adreno shader disassembly"},
+		{"optmsgs", IR3_DBG_OPTMSGS,"Enable optimizer debug messages"},
+		DEBUG_NAMED_VALUE_END
+};
+
+DEBUG_GET_ONCE_FLAGS_OPTION(ir3_shader_debug, "IR3_SHADER_DEBUG", shader_debug_options, 0)	/* presumably generates debug_get_option_ir3_shader_debug(), default 0 -- confirm */
+
+enum ir3_shader_debug ir3_shader_debug = 0;	/* global; (re)loaded by ir3_compiler_create() */
+
+/* Allocates and sets up an ir3_compiler for the given device / GPU.
+ *
+ * Besides filling in the compiler struct, this (re)loads the global
+ * ir3_shader_debug flags from the IR3_SHADER_DEBUG option, creates
+ * the RA register set, and picks the per-generation behaviour
+ * switches (gpu_id >= 400 vs. older parts).
+ *
+ * Returns NULL if the compiler struct cannot be allocated.
+ */
+struct ir3_compiler * ir3_compiler_create(struct fd_device *dev, uint32_t gpu_id)
+{
+	struct ir3_compiler *compiler = rzalloc(NULL, struct ir3_compiler);
+
+	/* rzalloc() can fail; bail out rather than dereference NULL below */
+	if (!compiler)
+		return NULL;
+
+	ir3_shader_debug = debug_get_option_ir3_shader_debug();
+
+	compiler->dev = dev;
+	compiler->gpu_id = gpu_id;
+	compiler->set = ir3_ra_alloc_reg_set(compiler);
+
+	if (compiler->gpu_id >= 400) {
+		/* need special handling for "flat" */
+		compiler->flat_bypass = true;
+		compiler->levels_add_one = false;
+		compiler->unminify_coords = false;
+		compiler->txf_ms_with_isaml = false;
+		compiler->array_index_add_half = true;
+	} else {
+		/* no special handling for "flat" */
+		compiler->flat_bypass = false;
+		compiler->levels_add_one = true;
+		compiler->unminify_coords = true;
+		compiler->txf_ms_with_isaml = true;
+		compiler->array_index_add_half = false;
+	}
+
+	return compiler;
+}
diff --git a/src/freedreno/ir3/ir3_compiler.h b/src/freedreno/ir3/ir3_compiler.h
new file mode 100644
index 00000000000..e2336062b29
--- /dev/null
+++ b/src/freedreno/ir3/ir3_compiler.h
@@ -0,0 +1,96 @@
+/*
+ * Copyright (C) 2013 Rob Clark <[email protected]>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Authors:
+ *    Rob Clark <[email protected]>
+ */
+
+#ifndef IR3_COMPILER_H_
+#define IR3_COMPILER_H_
+
+#include "ir3_shader.h"
+
+struct ir3_ra_reg_set;
+
+struct ir3_compiler {
+	struct fd_device *dev;
+	uint32_t gpu_id;              /* selects the per-generation options below */
+	struct ir3_ra_reg_set *set;   /* filled in by ir3_ra_alloc_reg_set() at create time */
+	uint32_t shader_count;
+
+	/*
+	 * Configuration options for things that are handled differently on
+	 * different generations (all are set in ir3_compiler_create()):
+	 */
+
+	/* a4xx (and later) drops SP_FS_FLAT_SHAD_MODE_REG_* for flat-interpolate
+	 * so we need to use ldlv.u32 to load the varying directly:
+	 */
+	bool flat_bypass;
+
+	/* on a3xx (gpu_id < 400), we need to add one to # of array levels:
+	 */
+	bool levels_add_one;
+
+	/* on a3xx (gpu_id < 400), we need to scale up integer coords for
+	 * isaml based on LoD:
+	 */
+	bool unminify_coords;
+
+	/* on a3xx (gpu_id < 400) do txf_ms w/ isaml and scaled coords: */
+	bool txf_ms_with_isaml;
+
+	/* on a4xx and later (gpu_id >= 400), for array textures we need to
+	 * add 0.5 to the array index coordinate:
+	 */
+	bool array_index_add_half;
+};
+
+struct ir3_compiler * ir3_compiler_create(struct fd_device *dev, uint32_t gpu_id);
+
+int ir3_compile_shader_nir(struct ir3_compiler *compiler,
+		struct ir3_shader_variant *so);
+
+enum ir3_shader_debug {	/* bit flags, may be OR'd together */
+	IR3_DBG_SHADER_VS = 0x01,	/* tested by shader_debug_enabled() for vertex shaders */
+	IR3_DBG_SHADER_FS = 0x02,	/* tested by shader_debug_enabled() for fragment shaders */
+	IR3_DBG_SHADER_CS = 0x04,	/* tested by shader_debug_enabled() for compute shaders */
+	IR3_DBG_DISASM    = 0x08,
+	IR3_DBG_OPTMSGS   = 0x10,
+};
+
+extern enum ir3_shader_debug ir3_shader_debug;	/* currently enabled flags */
+
+static inline bool
+shader_debug_enabled(gl_shader_stage type)
+{	/* true if the per-stage debug flag for 'type' is set in ir3_shader_debug */
+	switch (type) {
+	case MESA_SHADER_VERTEX:      return !!(ir3_shader_debug & IR3_DBG_SHADER_VS);
+	case MESA_SHADER_FRAGMENT:    return !!(ir3_shader_debug & IR3_DBG_SHADER_FS);
+	case MESA_SHADER_COMPUTE:     return !!(ir3_shader_debug & IR3_DBG_SHADER_CS);
+	default:	/* any other stage has no flag here: assert, then report disabled */
+		debug_assert(0);
+		return false;
+	}
+}
+
+#endif /* IR3_COMPILER_H_ */
diff --git a/src/freedreno/ir3/ir3_compiler_nir.c b/src/freedreno/ir3/ir3_compiler_nir.c
new file mode 100644
index 00000000000..445a2b291e9
--- /dev/null
+++ b/src/freedreno/ir3/ir3_compiler_nir.c
@@ -0,0 +1,3818 @@
+/*
+ * Copyright (C) 2015 Rob Clark <[email protected]>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Authors:
+ *    Rob Clark <[email protected]>
+ */
+
+#include <stdarg.h>
+
+#include "util/u_string.h"
+#include "util/u_memory.h"
+#include "util/u_math.h"
+
+#include "ir3_compiler.h"
+#include "ir3_shader.h"
+#include "ir3_nir.h"
+
+#include "instr-a3xx.h"
+#include "ir3.h"
+
+/* for conditionally setting boolean flag(s): yields val if bool is true, else 0 */
+#define COND(bool, val) ((bool) ? (val) : 0)
+
+#define DBG(fmt, ...) /* debug_printf() prefixed with function name and line number */ \
+		do { debug_printf("%s:%d: "fmt "\n", \
+				__FUNCTION__, __LINE__, ##__VA_ARGS__); } while (0)
+
+struct ir3_context {
+	struct ir3_compiler *compiler;
+
+	struct nir_shader *s;         /* the NIR being compiled (a clone, or the shader's own NIR) */
+
+	struct nir_instr *cur_instr;  /* current instruction, just for debug */
+
+	struct ir3 *ir;
+	struct ir3_shader_variant *so;
+
+	struct ir3_block *block;      /* the current block */
+	struct ir3_block *in_block;   /* block created for shader inputs */
+
+	nir_function_impl *impl;
+
+	/* For fragment shaders, varyings are not actual shader inputs,
+	 * instead the hw passes a varying-coord which is used with
+	 * bary.f.
+	 *
+	 * But NIR doesn't know that, it still declares varyings as
+	 * inputs.  So we do all the input tracking normally and fix
+	 * things up after compile_instructions()
+	 *
+	 * NOTE that frag_vcoord is the hardware position (possibly it
+	 * is actually an index or tag or some such.. it is *not*
+	 * values that can be directly used for gl_FragCoord..)
+	 */
+	struct ir3_instruction *frag_vcoord;
+
+	/* for fragment shaders, for gl_FrontFacing and gl_FragCoord: */
+	struct ir3_instruction *frag_face, *frag_coord;
+
+	/* For vertex shaders, keep track of the system values sources */
+	struct ir3_instruction *vertex_id, *basevertex, *instance_id;
+
+	/* For fragment shaders: */
+	struct ir3_instruction *samp_id, *samp_mask_in;
+
+	/* Compute shader inputs: */
+	struct ir3_instruction *local_invocation_id, *work_group_id;
+
+	/* mapping from nir_ssa_def to its array of defining instructions: */
+	struct hash_table *def_ht;
+
+	unsigned num_arrays;          /* count of declared arrays; also the source of array ids */
+
+	/* a common pattern for indirect addressing is to request the
+	 * same address register multiple times.  To avoid generating
+	 * duplicate instruction sequences (which our backend does not
+	 * try to clean up, since that should be done at the NIR stage)
+	 * we cache the address value generated for a given src value:
+	 *
+	 * Note that we have to cache these per alignment, since same
+	 * src used for an array of vec1 cannot be also used for an
+	 * array of vec4.  (Indexed by align - 1, see get_addr().)
+	 */
+	struct hash_table *addr_ht[4];
+
+	/* last dst array, for indirect we need to insert a var-store.
+	 */
+	struct ir3_instruction **last_dst;
+	unsigned last_dst_n;
+
+	/* maps nir_block to ir3_block, mostly for the purposes of
+	 * figuring out the block's successors
+	 */
+	struct hash_table *block_ht;
+
+	/* on a4xx, bitmask of samplers which need astc+srgb workaround: */
+	unsigned astc_srgb;
+
+	unsigned samples;             /* bitmask of x,y sample shifts */
+
+	unsigned max_texture_index;
+
+	/* set if we encounter something we can't handle yet, so we
+	 * can bail cleanly and fall back to the TGSI compiler f/e
+	 */
+	bool error;
+};
+
+/* gpu pointer size in units of 32bit registers/slots: 2 for gpu_id >= 500, else 1 */
+static unsigned pointer_size(struct ir3_context *ctx)
+{
+	return (ctx->compiler->gpu_id >= 500) ? 2 : 1;
+}
+
+static struct ir3_instruction * create_immed(struct ir3_block *block, uint32_t val);
+static struct ir3_block * get_block(struct ir3_context *ctx, const nir_block *nblock);
+
+
+static struct ir3_context *
+compile_init(struct ir3_compiler *compiler,
+		struct ir3_shader_variant *so)
+{	/* builds the per-variant context: picks/lowers the NIR and lays out so->constbase */
+	struct ir3_context *ctx = rzalloc(NULL, struct ir3_context);	/* root allocation; released via compile_free() */
+
+	if (compiler->gpu_id >= 400) {
+		if (so->type == MESA_SHADER_VERTEX) {
+			ctx->astc_srgb = so->key.vastc_srgb;
+		} else if (so->type == MESA_SHADER_FRAGMENT) {
+			ctx->astc_srgb = so->key.fastc_srgb;
+		}
+
+	} else {
+		if (so->type == MESA_SHADER_VERTEX) {
+			ctx->samples = so->key.vsamples;
+		} else if (so->type == MESA_SHADER_FRAGMENT) {
+			ctx->samples = so->key.fsamples;
+		}
+	}
+
+	ctx->compiler = compiler;
+	ctx->so = so;
+	ctx->def_ht = _mesa_hash_table_create(ctx,
+			_mesa_hash_pointer, _mesa_key_pointer_equal);
+	ctx->block_ht = _mesa_hash_table_create(ctx,
+			_mesa_hash_pointer, _mesa_key_pointer_equal);
+
+	/* TODO: maybe generate some sort of bitmask of what key
+	 * lowers vs what shader has (ie. no need to lower
+	 * texture clamp lowering if no texture sample instrs)..
+	 * although should be done further up the stack to avoid
+	 * creating duplicate variants..
+	 */
+
+	if (ir3_key_lowers_nir(&so->key)) {
+		nir_shader *s = nir_shader_clone(ctx, so->shader->nir);	/* clone is parented to ctx */
+		ctx->s = ir3_optimize_nir(so->shader, s, &so->key);
+	} else {
+		/* fast-path for shader key that lowers nothing in NIR: */
+		ctx->s = so->shader->nir;
+	}
+
+	/* this needs to be the last pass run, so do this here instead of
+	 * in ir3_optimize_nir():
+	 */
+	NIR_PASS_V(ctx->s, nir_lower_locals_to_regs);
+	NIR_PASS_V(ctx->s, nir_convert_from_ssa, true);
+
+	if (ir3_shader_debug & IR3_DBG_DISASM) {
+		printf("dump nir%dv%d: type=%d, k={cts=%u,hp=%u}",
+			so->shader->id, so->id, so->type,
+			so->key.color_two_side, so->key.half_precision);
+		nir_print_shader(ctx->s, stdout);
+	}
+
+	if (shader_debug_enabled(so->type)) {
+		fprintf(stderr, "NIR (final form) for %s shader:\n",
+			_mesa_shader_stage_to_string(so->type));
+		nir_print_shader(ctx->s, stderr);
+	}
+
+	ir3_nir_scan_driver_consts(ctx->s, &so->const_layout);
+
+	so->num_uniforms = ctx->s->num_uniforms;
+	so->num_ubos = ctx->s->info.num_ubos;
+
+	/* Layout of constant registers, each section aligned to vec4.  Note
+	 * that pointer size (ubo, etc) changes depending on generation.
+	 *
+	 *    user consts
+	 *    UBO addresses
+	 *    SSBO sizes, then image dims
+	 *    driver params (IR3_DP_*; non-empty for vertex and compute only)
+	 *    if (vertex shader && gpu_id < 500) {
+	 *        if (stream_output.num_outputs > 0)
+	 *           stream-out addresses
+	 *    }
+	 *    immediates
+	 *
+	 * Immediates go last mostly because they are inserted in the CP pass
+	 * after the nir -> ir3 frontend.
+	 */
+	unsigned constoff = align(ctx->s->num_uniforms, 4);	/* NOTE(review): not divided by 4 like the sections below -- confirm num_uniforms units */
+	unsigned ptrsz = pointer_size(ctx);
+
+	memset(&so->constbase, ~0, sizeof(so->constbase));	/* every field starts as all-ones until assigned below */
+
+	if (so->num_ubos > 0) {
+		so->constbase.ubo = constoff;
+		constoff += align(ctx->s->info.num_ubos * ptrsz, 4) / 4;
+	}
+
+	if (so->const_layout.ssbo_size.count > 0) {
+		unsigned cnt = so->const_layout.ssbo_size.count;
+		so->constbase.ssbo_sizes = constoff;
+		constoff += align(cnt, 4) / 4;
+	}
+
+	if (so->const_layout.image_dims.count > 0) {
+		unsigned cnt = so->const_layout.image_dims.count;
+		so->constbase.image_dims = constoff;
+		constoff += align(cnt, 4) / 4;
+	}
+
+	unsigned num_driver_params = 0;
+	if (so->type == MESA_SHADER_VERTEX) {
+		num_driver_params = IR3_DP_VS_COUNT;
+	} else if (so->type == MESA_SHADER_COMPUTE) {
+		num_driver_params = IR3_DP_CS_COUNT;
+	}
+
+	so->constbase.driver_param = constoff;	/* assigned for every stage; takes no space when num_driver_params is 0 */
+	constoff += align(num_driver_params, 4) / 4;
+
+	if ((so->type == MESA_SHADER_VERTEX) &&
+			(compiler->gpu_id < 500) &&
+			so->shader->stream_output.num_outputs > 0) {
+		so->constbase.tfbo = constoff;
+		constoff += align(IR3_MAX_SO_BUFFERS * ptrsz, 4) / 4;
+	}
+
+	so->constbase.immediate = constoff;
+
+	return ctx;
+}
+
+static void
+compile_error(struct ir3_context *ctx, const char *format, ...)
+{	/* reports a printf-style error, dumps the NIR to stdout and sets ctx->error */
+	struct hash_table *errors = NULL;
+	va_list ap;
+	va_start(ap, format);
+	if (ctx->cur_instr) {	/* attach the message to the current instruction in the dump */
+		errors = _mesa_hash_table_create(NULL,
+				_mesa_hash_pointer,
+				_mesa_key_pointer_equal);
+		char *msg = ralloc_vasprintf(errors, format, ap);
+		_mesa_hash_table_insert(errors, ctx->cur_instr, msg);
+	} else {	/* no current instruction: print the message on its own */
+		_debug_vprintf(format, ap);
+	}
+	va_end(ap);
+	nir_print_shader_annotated(ctx->s, stdout, errors);
+	ralloc_free(errors);	/* msg was allocated with errors as its ralloc parent */
+	ctx->error = true;
+	debug_assert(0);	/* NOTE: the function returns to its caller if this does not abort */
+}
+
+#define compile_assert(ctx, cond) do { /* on failure, report the stringified cond via compile_error() */ \
+		if (!(cond)) compile_error((ctx), "failed assert: %s\n", #cond); /* cond passed as an argument so a '%' in it is not parsed as a conversion */ \
+	} while (0)
+
+static void
+compile_free(struct ir3_context *ctx)
+{
+	ralloc_free(ctx);	/* frees ctx together with everything ralloc-parented to it */
+}
+
+static void
+declare_array(struct ir3_context *ctx, nir_register *reg)
+{	/* creates the ir3_array backing 'reg' and appends it to ctx->ir->array_list */
+	struct ir3_array *arr = rzalloc(ctx, struct ir3_array);
+	arr->id = ++ctx->num_arrays;	/* pre-increment, so the first array gets id 1 */
+	/* NOTE: sometimes we get non array regs, for example for arrays of
+	 * length 1.  See fs-const-array-of-struct-of-array.shader_test.  So
+	 * treat a non-array as if it was an array of length 1.
+	 *
+	 * It would be nice if there was a nir pass to convert arrays of
+	 * length 1 to ssa.
+	 */
+	arr->length = reg->num_components * MAX2(1, reg->num_array_elems);	/* length is in scalar components */
+	compile_assert(ctx, arr->length > 0);
+	arr->r = reg;
+	list_addtail(&arr->node, &ctx->ir->array_list);
+}
+
+static struct ir3_array *
+get_array(struct ir3_context *ctx, nir_register *reg)
+{	/* linear search of ctx->ir->array_list for the array whose ->r is 'reg' */
+	list_for_each_entry (struct ir3_array, arr, &ctx->ir->array_list, node) {
+		if (arr->r == reg)
+			return arr;
+	}
+	compile_error(ctx, "bogus reg: %s\n", reg->name);
+	return NULL;	/* not found: error already reported above */
+}
+
+/* emit a mov reading element n of arr; relative (indirect) if address!=NULL */
+static struct ir3_instruction *
+create_array_load(struct ir3_context *ctx, struct ir3_array *arr, int n,
+		struct ir3_instruction *address)
+{
+	struct ir3_block *block = ctx->block;
+	struct ir3_instruction *mov;
+	struct ir3_register *src;
+
+	mov = ir3_instr_create(block, OPC_MOV);
+	mov->cat1.src_type = TYPE_U32;
+	mov->cat1.dst_type = TYPE_U32;
+	mov->barrier_class = IR3_BARRIER_ARRAY_R;	/* an array read ... */
+	mov->barrier_conflict = IR3_BARRIER_ARRAY_W;	/* ... that conflicts with array writes */
+	ir3_reg_create(mov, 0, 0);	/* dst */
+	src = ir3_reg_create(mov, 0, IR3_REG_ARRAY |
+			COND(address, IR3_REG_RELATIV));
+	src->instr = arr->last_write;	/* links the read to the most recent store to arr */
+	src->size  = arr->length;
+	src->array.id = arr->id;
+	src->array.offset = n;
+
+	if (address)
+		ir3_instr_set_address(mov, address);
+
+	return mov;
+}
+
+/* store src into element n of arr; relative (indirect) if address!=NULL */
+static void
+create_array_store(struct ir3_context *ctx, struct ir3_array *arr, int n,
+		struct ir3_instruction *src, struct ir3_instruction *address)
+{
+	struct ir3_block *block = ctx->block;
+	struct ir3_instruction *mov;
+	struct ir3_register *dst;
+
+	/* if not relative store, don't create an extra mov, since that
+	 * ends up being difficult for cp to remove.
+	 */
+	if (!address) {
+		dst = src->regs[0];	/* retarget src's own dst register at the array */
+
+		src->barrier_class |= IR3_BARRIER_ARRAY_W;
+		src->barrier_conflict |= IR3_BARRIER_ARRAY_R | IR3_BARRIER_ARRAY_W;
+
+		dst->flags |= IR3_REG_ARRAY;
+		dst->instr = arr->last_write;
+		dst->size = arr->length;
+		dst->array.id = arr->id;
+		dst->array.offset = n;
+
+		arr->last_write = src;
+
+		array_insert(block, block->keeps, src);
+
+		return;
+	}
+
+	mov = ir3_instr_create(block, OPC_MOV);	/* from here on address is non-NULL */
+	mov->cat1.src_type = TYPE_U32;
+	mov->cat1.dst_type = TYPE_U32;
+	mov->barrier_class = IR3_BARRIER_ARRAY_W;
+	mov->barrier_conflict = IR3_BARRIER_ARRAY_R | IR3_BARRIER_ARRAY_W;
+	dst = ir3_reg_create(mov, 0, IR3_REG_ARRAY |
+			COND(address, IR3_REG_RELATIV));
+	dst->instr = arr->last_write;
+	dst->size  = arr->length;
+	dst->array.id = arr->id;
+	dst->array.offset = n;
+	ir3_reg_create(mov, 0, IR3_REG_SSA)->instr = src;
+
+	if (address)
+		ir3_instr_set_address(mov, address);
+
+	arr->last_write = mov;
+
+	/* the array store may only matter to something in an earlier
+	 * block (ie. loops), but since arrays are not in SSA, depth
+	 * pass won't know this.. so keep all array stores:
+	 */
+	array_insert(block, block->keeps, mov);
+}
+
+static inline type_t utype_for_size(unsigned bit_size)
+{	/* unsigned ir3 type for a bit size of 32, 16 or 8 */
+	switch (bit_size) {
+	case 32: return TYPE_U32;
+	case 16: return TYPE_U16;
+	case  8: return TYPE_U8;
+	default: unreachable("bad bitsize"); return ~0;	/* any other size is treated as unreachable */
+	}
+}
+
+static inline type_t utype_src(nir_src src)	/* unsigned type matching the bit size of a NIR source */
+{ return utype_for_size(nir_src_bit_size(src)); }
+
+static inline type_t utype_dst(nir_dest dst)	/* unsigned type matching the bit size of a NIR destination */
+{ return utype_for_size(nir_dest_bit_size(dst)); }
+
+/* allocate an n element value array (to be populated by caller) and
+ * insert it in def_ht, keyed by the nir_ssa_def
+ */
+static struct ir3_instruction **
+get_dst_ssa(struct ir3_context *ctx, nir_ssa_def *dst, unsigned n)
+{
+	struct ir3_instruction **value =
+		ralloc_array(ctx->def_ht, struct ir3_instruction *, n);	/* parented to the hash table */
+	_mesa_hash_table_insert(ctx->def_ht, dst, value);
+	return value;
+}
+
+static struct ir3_instruction **
+get_dst(struct ir3_context *ctx, nir_dest *dst, unsigned n)
+{	/* returns an n-entry value array for dst and records it as ctx->last_dst */
+	struct ir3_instruction **value;
+
+	if (dst->is_ssa) {
+		value = get_dst_ssa(ctx, &dst->ssa, n);
+	} else {
+		value = ralloc_array(ctx, struct ir3_instruction *, n);	/* temporary; put_dst() frees it */
+	}
+
+	/* NOTE: in non-ssa case, we don't really need to store last_dst
+	 * but this helps us catch cases where put_dst() call is forgotten
+	 */
+	compile_assert(ctx, !ctx->last_dst);
+	ctx->last_dst = value;
+	ctx->last_dst_n = n;
+
+	return value;
+}
+
+static struct ir3_instruction * get_addr(struct ir3_context *ctx, struct ir3_instruction *src, int align);
+
+static struct ir3_instruction * const *
+get_src(struct ir3_context *ctx, nir_src *src)
+{	/* returns the per-component value array for a NIR source */
+	if (src->is_ssa) {
+		struct hash_entry *entry;
+		entry = _mesa_hash_table_search(ctx->def_ht, src->ssa);
+		compile_assert(ctx, entry);	/* NOTE(review): entry is still dereferenced if this assert fails and returns */
+		return entry->data;
+	} else {	/* register source: emit one array load per component */
+		nir_register *reg = src->reg.reg;
+		struct ir3_array *arr = get_array(ctx, reg);
+		unsigned num_components = arr->r->num_components;
+		struct ir3_instruction *addr = NULL;
+		struct ir3_instruction **value =
+			ralloc_array(ctx, struct ir3_instruction *, num_components);
+
+		if (src->reg.indirect)
+			addr = get_addr(ctx, get_src(ctx, src->reg.indirect)[0],
+					reg->num_components);
+
+		for (unsigned i = 0; i < num_components; i++) {
+			unsigned n = src->reg.base_offset * reg->num_components + i;	/* scalar offset into the array */
+			compile_assert(ctx, n < arr->length);
+			value[i] = create_array_load(ctx, arr, n, addr);
+		}
+
+		return value;
+	}
+}
+
+static void
+put_dst(struct ir3_context *ctx, nir_dest *dst)
+{	/* finishes the values recorded by get_dst(): half-flag fixup, register stores, reset of last_dst */
+	unsigned bit_size = nir_dest_bit_size(*dst);
+
+	if (bit_size < 32) {	/* sub-32-bit dest: mark each result's dst register as half */
+		for (unsigned i = 0; i < ctx->last_dst_n; i++) {
+			struct ir3_instruction *dst = ctx->last_dst[i];	/* NOTE(review): shadows the 'dst' parameter; no NULL check here, unlike the store loop below */
+			dst->regs[0]->flags |= IR3_REG_HALF;
+			if (ctx->last_dst[i]->opc == OPC_META_FO)
+				dst->regs[1]->instr->regs[0]->flags |= IR3_REG_HALF;	/* also flag the dst of the instruction feeding the fanout */
+		}
+	}
+
+	if (!dst->is_ssa) {	/* register dest: write each component back to its array */
+		nir_register *reg = dst->reg.reg;
+		struct ir3_array *arr = get_array(ctx, reg);
+		unsigned num_components = ctx->last_dst_n;
+		struct ir3_instruction *addr = NULL;
+
+		if (dst->reg.indirect)
+			addr = get_addr(ctx, get_src(ctx, dst->reg.indirect)[0],
+					reg->num_components);
+
+		for (unsigned i = 0; i < num_components; i++) {
+			unsigned n = dst->reg.base_offset * reg->num_components + i;
+			compile_assert(ctx, n < arr->length);
+			if (!ctx->last_dst[i])	/* component was not written */
+				continue;
+			create_array_store(ctx, arr, n, ctx->last_dst[i], addr);
+		}
+
+		ralloc_free(ctx->last_dst);
+	}
+	ctx->last_dst = NULL;
+	ctx->last_dst_n = 0;
+}
+
+static struct ir3_instruction *
+create_immed_typed(struct ir3_block *block, uint32_t val, type_t type)
+{	/* emits "mov dst, #val" with src and dst type both set to 'type' */
+	struct ir3_instruction *mov;
+	unsigned flags = (type_size(type) < 32) ? IR3_REG_HALF : 0;	/* types narrower than 32 bits get a half dst */
+
+	mov = ir3_instr_create(block, OPC_MOV);
+	mov->cat1.src_type = type;
+	mov->cat1.dst_type = type;
+	ir3_reg_create(mov, 0, flags);	/* dst */
+	ir3_reg_create(mov, 0, IR3_REG_IMMED)->uim_val = val;
+
+	return mov;
+}
+
+static struct ir3_instruction *
+create_immed(struct ir3_block *block, uint32_t val)
+{
+	return create_immed_typed(block, val, TYPE_U32);	/* 32-bit unsigned immediate */
+}
+
+static struct ir3_instruction *
+create_addr(struct ir3_block *block, struct ir3_instruction *src, int align)
+{	/* converts src to s16, scales it by align (1..4) and moves it into a0.x */
+	struct ir3_instruction *instr, *immed;
+
+	/* TODO in at least some cases, the backend could probably be
+	 * made clever enough to propagate IR3_REG_HALF..
+	 */
+	instr = ir3_COV(block, src, TYPE_U32, TYPE_S16);
+	instr->regs[0]->flags |= IR3_REG_HALF;
+
+	switch(align){
+	case 1:
+		/* src *= 1: */
+		break;
+	case 2:
+		/* src *= 2	=> src <<= 1: */
+		immed = create_immed(block, 1);
+		immed->regs[0]->flags |= IR3_REG_HALF;
+
+		instr = ir3_SHL_B(block, instr, 0, immed, 0);
+		instr->regs[0]->flags |= IR3_REG_HALF;
+		instr->regs[1]->flags |= IR3_REG_HALF;
+		break;
+	case 3:
+		/* src *= 3 (not a power of two, so multiply instead of shift): */
+		immed = create_immed(block, 3);
+		immed->regs[0]->flags |= IR3_REG_HALF;
+
+		instr = ir3_MULL_U(block, instr, 0, immed, 0);
+		instr->regs[0]->flags |= IR3_REG_HALF;
+		instr->regs[1]->flags |= IR3_REG_HALF;
+		break;
+	case 4:
+		/* src *= 4 => src <<= 2: */
+		immed = create_immed(block, 2);
+		immed->regs[0]->flags |= IR3_REG_HALF;
+
+		instr = ir3_SHL_B(block, instr, 0, immed, 0);
+		instr->regs[0]->flags |= IR3_REG_HALF;
+		instr->regs[1]->flags |= IR3_REG_HALF;
+		break;
+	default:
+		unreachable("bad align");
+		return NULL;
+	}
+
+	instr = ir3_MOV(block, instr, TYPE_S16);
+	instr->regs[0]->num = regid(REG_A0, 0);	/* destination is the address register */
+	instr->regs[0]->flags |= IR3_REG_HALF;
+	instr->regs[1]->flags |= IR3_REG_HALF;
+
+	return instr;
+}
+
+/* caches addr values to avoid generating multiple cov/shl/mova
+ * sequences for each use of a given NIR level src as address;
+ * there is one cache per align value, created on first use
+ */
+static struct ir3_instruction *
+get_addr(struct ir3_context *ctx, struct ir3_instruction *src, int align)
+{
+	struct ir3_instruction *addr;
+	unsigned idx = align - 1;	/* align 1..4 -> addr_ht slot 0..3 */
+
+	compile_assert(ctx, idx < ARRAY_SIZE(ctx->addr_ht));	/* NOTE(review): idx is still used below if this assert fails and returns */
+
+	if (!ctx->addr_ht[idx]) {
+		ctx->addr_ht[idx] = _mesa_hash_table_create(ctx,
+				_mesa_hash_pointer, _mesa_key_pointer_equal);
+	} else {
+		struct hash_entry *entry;
+		entry = _mesa_hash_table_search(ctx->addr_ht[idx], src);
+		if (entry)
+			return entry->data;	/* cache hit */
+	}
+
+	addr = create_addr(ctx->block, src, align);
+	_mesa_hash_table_insert(ctx->addr_ht[idx], src, addr);
+
+	return addr;
+}
+
+static struct ir3_instruction *
+get_predicate(struct ir3_context *ctx, struct ir3_instruction *src)
+{	/* emits "src != 0" as a cmps.s whose result is written to p0.x */
+	struct ir3_block *b = ctx->block;
+	struct ir3_instruction *cond;
+
+	/* NOTE: only cmps.*.* can write p0.x: */
+	cond = ir3_CMPS_S(b, src, 0, create_immed(b, 0), 0);
+	cond->cat2.condition = IR3_COND_NE;
+
+	/* condition always goes in predicate register: */
+	cond->regs[0]->num = regid(REG_P0, 0);
+
+	return cond;
+}
+
+static struct ir3_instruction *
+create_uniform(struct ir3_context *ctx, unsigned n)
+{	/* emits a mov from const register number n */
+	struct ir3_instruction *mov;
+
+	mov = ir3_instr_create(ctx->block, OPC_MOV);
+	/* TODO get types right? */
+	mov->cat1.src_type = TYPE_F32;
+	mov->cat1.dst_type = TYPE_F32;
+	ir3_reg_create(mov, 0, 0);	/* dst */
+	ir3_reg_create(mov, n, IR3_REG_CONST);
+
+	return mov;
+}
+
+static struct ir3_instruction *
+create_uniform_indirect(struct ir3_context *ctx, int n,
+		struct ir3_instruction *address)
+{	/* emits a mov from a const register addressed relative to 'address', with offset n */
+	struct ir3_instruction *mov;
+
+	mov = ir3_instr_create(ctx->block, OPC_MOV);
+	mov->cat1.src_type = TYPE_U32;
+	mov->cat1.dst_type = TYPE_U32;
+	ir3_reg_create(mov, 0, 0);	/* dst */
+	ir3_reg_create(mov, 0, IR3_REG_CONST | IR3_REG_RELATIV)->array.offset = n;
+
+	ir3_instr_set_address(mov, address);
+
+	return mov;
+}
+
+static struct ir3_instruction *
+create_collect(struct ir3_context *ctx, struct ir3_instruction *const *arr,
+		unsigned arrsz)
+{	/* gathers arrsz scalar values into one meta:fi instruction; NULL when arrsz is 0 */
+	struct ir3_block *block = ctx->block;
+	struct ir3_instruction *collect;
+
+	if (arrsz == 0)
+		return NULL;
+
+	unsigned flags = arr[0]->regs[0]->flags & IR3_REG_HALF;	/* half-ness is taken from the first element */
+
+	collect = ir3_instr_create2(block, OPC_META_FI, 1 + arrsz);
+	ir3_reg_create(collect, 0, flags);     /* dst */
+	for (unsigned i = 0; i < arrsz; i++) {
+		struct ir3_instruction *elem = arr[i];
+
+		/* Since arrays are pre-colored in RA, we can't assume that
+		 * things will end up in the right place.  (Ie. if a collect
+		 * joins elements from two different arrays.)  So insert an
+		 * extra mov.
+		 *
+		 * We could possibly skip this if all the collected elements
+		 * are contiguous elements in a single array.. not sure how
+		 * likely that is to happen.
+		 *
+		 * Fixes a problem with glamor shaders, that in effect do
+		 * something like:
+		 *
+		 *   if (foo)
+		 *     texcoord = ..
+		 *   else
+		 *     texcoord = ..
+		 *   color = texture2D(tex, texcoord);
+		 *
+		 * In this case, texcoord will end up as nir registers (which
+		 * translate to ir3 arrays of length 1).  And we can't assume
+		 * the two (or more) arrays will get allocated in consecutive
+		 * scalar registers.
+		 *
+		 */
+		if (elem->regs[0]->flags & IR3_REG_ARRAY) {
+			type_t type = (flags & IR3_REG_HALF) ? TYPE_U16 : TYPE_U32;
+			elem = ir3_MOV(block, elem, type);
+		}
+
+		compile_assert(ctx, (elem->regs[0]->flags & IR3_REG_HALF) == flags);	/* every element must match the first one's half-ness */
+		ir3_reg_create(collect, 0, IR3_REG_SSA | flags)->instr = elem;
+	}
+
+	return collect;
+}
+
+static struct ir3_instruction *
+create_indirect_load(struct ir3_context *ctx, unsigned arrsz, int n,
+		struct ir3_instruction *address, struct ir3_instruction *collect)
+{	/* emits a mov reading from 'collect' at offset n, relative to 'address' */
+	struct ir3_block *block = ctx->block;
+	struct ir3_instruction *mov;
+	struct ir3_register *src;
+
+	mov = ir3_instr_create(block, OPC_MOV);
+	mov->cat1.src_type = TYPE_U32;
+	mov->cat1.dst_type = TYPE_U32;
+	ir3_reg_create(mov, 0, 0);	/* dst */
+	src = ir3_reg_create(mov, 0, IR3_REG_SSA | IR3_REG_RELATIV);
+	src->instr = collect;
+	src->size  = arrsz;
+	src->array.offset = n;
+
+	ir3_instr_set_address(mov, address);
+
+	return mov;
+}
+
+static struct ir3_instruction *
+create_input_compmask(struct ir3_context *ctx, unsigned n, unsigned compmask)
+{	/* creates a meta:input in ctx->in_block for register n with write mask compmask */
+	struct ir3_instruction *in;
+
+	in = ir3_instr_create(ctx->in_block, OPC_META_INPUT);
+	in->inout.block = ctx->in_block;
+	ir3_reg_create(in, n, 0);
+
+	in->regs[0]->wrmask = compmask;
+
+	return in;
+}
+
+static struct ir3_instruction *
+create_input(struct ir3_context *ctx, unsigned n)
+{
+	return create_input_compmask(ctx, n, 0x1);	/* single-component input */
+}
+
+static struct ir3_instruction *
+create_frag_input(struct ir3_context *ctx, bool use_ldlv)
+{	/* emits the varying fetch: ldlv when use_ldlv is set, otherwise bary.f */
+	struct ir3_block *block = ctx->block;
+	struct ir3_instruction *instr;
+	/* actual inloc is assigned and fixed up later: */
+	struct ir3_instruction *inloc = create_immed(block, 0);
+
+	if (use_ldlv) {
+		instr = ir3_LDLV(block, inloc, 0, create_immed(block, 1), 0);
+		instr->cat6.type = TYPE_U32;
+		instr->cat6.iim_val = 1;
+	} else {
+		instr = ir3_BARY_F(block, inloc, 0, ctx->frag_vcoord, 0);
+		instr->regs[2]->wrmask = 0x3;	/* regs[2] is the frag_vcoord source; two components */
+	}
+
+	return instr;
+}
+
+static struct ir3_instruction *
+create_driver_param(struct ir3_context *ctx, enum ir3_driver_param dp)
+{
+	/* driver params start at so->constbase.driver_param: */
+	/* NOTE: dp is in scalar, but there can be >4 dp components: */
+	unsigned n = ctx->so->constbase.driver_param;
+	unsigned r = regid(n + dp / 4, dp % 4);	/* dp / 4 advances the register, dp % 4 picks the component */
+	return create_uniform(ctx, r);
+}
+
+/* helper for instructions that produce multiple consecutive scalar
+ * outputs which need to have a split/fanout meta instruction inserted;
+ * dst[] receives one fanout per component that is set in src's wrmask
+ */
+static void
+split_dest(struct ir3_block *block, struct ir3_instruction **dst,
+		struct ir3_instruction *src, unsigned base, unsigned n)
+{
+	struct ir3_instruction *prev = NULL;
+
+	if ((n == 1) && (src->regs[0]->wrmask == 0x1)) {	/* plain scalar: no fanout needed */
+		dst[0] = src;
+		return;
+	}
+
+	for (int i = 0, j = 0; i < n; i++) {	/* i walks components, j walks dst[] slots */
+		struct ir3_instruction *split = ir3_instr_create(block, OPC_META_FO);
+		ir3_reg_create(split, 0, IR3_REG_SSA);
+		ir3_reg_create(split, 0, IR3_REG_SSA)->instr = src;
+		split->fo.off = i + base;
+
+		if (prev) {	/* chain neighbouring fanouts through cp.left / cp.right */
+			split->cp.left = prev;
+			split->cp.left_cnt++;
+			prev->cp.right = split;
+			prev->cp.right_cnt++;
+		}
+		prev = split;
+
+		if (src->regs[0]->wrmask & (1 << (i + base)))
+			dst[j++] = split;
+	}
+}
+
+/*
+ * Adreno uses uint rather than having dedicated bool type,
+ * which (potentially) requires some conversion, in particular
+ * when using output of a bool instr to int input, or vice
+ * versa.
+ *
+ *         | Adreno  |  NIR  |
+ *  -------+---------+-------+-
+ *   true  |    1    |  ~0   |
+ *   false |    0    |   0   |
+ *
+ * To convert from an adreno bool (uint) to nir, use:
+ *
+ *    absneg.s dst, (neg)src
+ *
+ * To convert back in the other direction:
+ *
+ *    absneg.s dst, (abs)src
+ *
+ * The CP step can clean up the absneg.s that cancel each other
+ * out, and with a slight bit of extra cleverness (to recognize
+ * the instructions which produce either a 0 or 1) can eliminate
+ * the absneg.s's completely when an instruction that wants
+ * 0/1 consumes the result.  For example, when a nir 'bcsel'
+ * consumes the result of 'feq'.  So we should be able to get by
+ * without a boolean resolve step, and without incurring any
+ * extra penalty in instruction count.
+ */
+
+/* NIR bool -> native (adreno): absneg.s with the (abs) source modifier */
+static struct ir3_instruction *
+ir3_b2n(struct ir3_block *block, struct ir3_instruction *instr)
+{
+	return ir3_ABSNEG_S(block, instr, IR3_REG_SABS);
+}
+
+/* native (adreno) -> NIR bool: absneg.s with the (neg) source modifier */
+static struct ir3_instruction *
+ir3_n2b(struct ir3_block *block, struct ir3_instruction *instr)
+{
+	return ir3_ABSNEG_S(block, instr, IR3_REG_SNEG);
+}
+
+/*
+ * alu/sfu instructions:
+ */
+
+static struct ir3_instruction *
+create_cov(struct ir3_context *ctx, struct ir3_instruction *src,
+		unsigned src_bitsize, nir_op op)
+{	/* emits a cov for a NIR conversion op; types come from op and src_bitsize */
+	type_t src_type = TYPE_U32, dst_type = TYPE_U32;	/* defined fallback: compile_error() can return, and these are read below */
+
+	switch (op) {	/* first pass: source type from the op family and src_bitsize */
+	case nir_op_f2f32:
+	case nir_op_f2f16_rtne:
+	case nir_op_f2f16_rtz:
+	case nir_op_f2f16:
+	case nir_op_f2i32:
+	case nir_op_f2i16:
+	case nir_op_f2i8:
+	case nir_op_f2u32:
+	case nir_op_f2u16:
+	case nir_op_f2u8:
+		switch (src_bitsize) {
+		case 32:
+			src_type = TYPE_F32;
+			break;
+		case 16:
+			src_type = TYPE_F16;
+			break;
+		default:
+			compile_error(ctx, "invalid src bit size: %u", src_bitsize);
+		}
+		break;
+
+	case nir_op_i2f32:
+	case nir_op_i2f16:
+	case nir_op_i2i32:
+	case nir_op_i2i16:
+	case nir_op_i2i8:
+		switch (src_bitsize) {
+		case 32:
+			src_type = TYPE_S32;
+			break;
+		case 16:
+			src_type = TYPE_S16;
+			break;
+		case 8:
+			src_type = TYPE_S8;
+			break;
+		default:
+			compile_error(ctx, "invalid src bit size: %u", src_bitsize);
+		}
+		break;
+
+	case nir_op_u2f32:
+	case nir_op_u2f16:
+	case nir_op_u2u32:
+	case nir_op_u2u16:
+	case nir_op_u2u8:
+		switch (src_bitsize) {
+		case 32:
+			src_type = TYPE_U32;
+			break;
+		case 16:
+			src_type = TYPE_U16;
+			break;
+		case 8:
+			src_type = TYPE_U8;
+			break;
+		default:
+			compile_error(ctx, "invalid src bit size: %u", src_bitsize);
+		}
+		break;
+
+	default:
+		compile_error(ctx, "invalid conversion op: %u", op);
+	}
+
+	switch (op) {	/* second pass: destination type from the op alone */
+	case nir_op_f2f32:
+	case nir_op_i2f32:
+	case nir_op_u2f32:
+		dst_type = TYPE_F32;
+		break;
+
+	case nir_op_f2f16_rtne:
+	case nir_op_f2f16_rtz:
+	case nir_op_f2f16:
+		/* TODO how to handle rounding mode? */
+	case nir_op_i2f16:
+	case nir_op_u2f16:
+		dst_type = TYPE_F16;
+		break;
+
+	case nir_op_f2i32:
+	case nir_op_i2i32:
+		dst_type = TYPE_S32;
+		break;
+
+	case nir_op_f2i16:
+	case nir_op_i2i16:
+		dst_type = TYPE_S16;
+		break;
+
+	case nir_op_f2i8:
+	case nir_op_i2i8:
+		dst_type = TYPE_S8;
+		break;
+
+	case nir_op_f2u32:
+	case nir_op_u2u32:
+		dst_type = TYPE_U32;
+		break;
+
+	case nir_op_f2u16:
+	case nir_op_u2u16:
+		dst_type = TYPE_U16;
+		break;
+
+	case nir_op_f2u8:
+	case nir_op_u2u8:
+		dst_type = TYPE_U8;
+		break;
+
+	default:
+		compile_error(ctx, "invalid conversion op: %u", op);
+	}
+
+	return ir3_COV(ctx->block, src, src_type, dst_type);
+}
+
+/* Translate one NIR ALU instruction into ir3 instruction(s) in ctx->block.
+ *
+ * vecN and imov/fmov are handled component-wise up front.  Every other
+ * opcode is treated as scalar: a single channel is read from each source
+ * and the result is placed in dst[0].  Opcodes without a case in the
+ * switch below are reported through compile_error().
+ */
+static void
+emit_alu(struct ir3_context *ctx, nir_alu_instr *alu)
+{
+	const nir_op_info *info = &nir_op_infos[alu->op];
+	struct ir3_instruction **dst, *src[info->num_inputs];
+	unsigned bs[info->num_inputs];     /* bit size */
+	struct ir3_block *b = ctx->block;
+	unsigned dst_sz, wrmask;
+
+	if (alu->dest.dest.is_ssa) {
+		dst_sz = alu->dest.dest.ssa.num_components;
+		wrmask = (1 << dst_sz) - 1;
+	} else {
+		dst_sz = alu->dest.dest.reg.reg->num_components;
+		wrmask = alu->dest.write_mask;
+	}
+
+	dst = get_dst(ctx, &alu->dest.dest, dst_sz);
+
+	/* Vectors are special in that they have non-scalarized writemasks,
+	 * and just take the first swizzle channel for each argument in
+	 * order into each writemask channel.
+	 */
+	if ((alu->op == nir_op_vec2) ||
+			(alu->op == nir_op_vec3) ||
+			(alu->op == nir_op_vec4)) {
+
+		for (int i = 0; i < info->num_inputs; i++) {
+			nir_alu_src *asrc = &alu->src[i];
+
+			compile_assert(ctx, !asrc->abs);
+			compile_assert(ctx, !asrc->negate);
+
+			src[i] = get_src(ctx, &asrc->src)[asrc->swizzle[0]];
+			if (!src[i])
+				src[i] = create_immed(ctx->block, 0);
+			dst[i] = ir3_MOV(b, src[i], TYPE_U32);
+		}
+
+		put_dst(ctx, &alu->dest.dest);
+		return;
+	}
+
+	/* We also get mov's with more than one component, so handle
+	 * those specially:
+	 */
+	if ((alu->op == nir_op_imov) || (alu->op == nir_op_fmov)) {
+		type_t type = (alu->op == nir_op_imov) ? TYPE_U32 : TYPE_F32;
+		nir_alu_src *asrc = &alu->src[0];
+		struct ir3_instruction *const *src0 = get_src(ctx, &asrc->src);
+
+		for (unsigned i = 0; i < dst_sz; i++) {
+			if (wrmask & (1 << i)) {
+				dst[i] = ir3_MOV(b, src0[asrc->swizzle[i]], type);
+			} else {
+				dst[i] = NULL;
+			}
+		}
+
+		put_dst(ctx, &alu->dest.dest);
+		return;
+	}
+
+	/* General case: We can just grab the one used channel per src.  The
+	 * channel depends only on the destination writemask, so it is looked
+	 * up once instead of once per source.
+	 */
+	unsigned chan = ffs(alu->dest.write_mask) - 1;
+
+	for (int i = 0; i < info->num_inputs; i++) {
+		nir_alu_src *asrc = &alu->src[i];
+
+		compile_assert(ctx, !asrc->abs);
+		compile_assert(ctx, !asrc->negate);
+
+		src[i] = get_src(ctx, &asrc->src)[asrc->swizzle[chan]];
+		bs[i] = nir_src_bit_size(asrc->src);
+
+		compile_assert(ctx, src[i]);
+	}
+
+	switch (alu->op) {
+	case nir_op_f2f32:
+	case nir_op_f2f16_rtne:
+	case nir_op_f2f16_rtz:
+	case nir_op_f2f16:
+	case nir_op_f2i32:
+	case nir_op_f2i16:
+	case nir_op_f2i8:
+	case nir_op_f2u32:
+	case nir_op_f2u16:
+	case nir_op_f2u8:
+	case nir_op_i2f32:
+	case nir_op_i2f16:
+	case nir_op_i2i32:
+	case nir_op_i2i16:
+	case nir_op_i2i8:
+	case nir_op_u2f32:
+	case nir_op_u2f16:
+	case nir_op_u2u32:
+	case nir_op_u2u16:
+	case nir_op_u2u8:
+		dst[0] = create_cov(ctx, src[0], bs[0], alu->op);
+		break;
+	case nir_op_f2b:
+		dst[0] = ir3_CMPS_F(b, src[0], 0, create_immed(b, fui(0.0)), 0);
+		dst[0]->cat2.condition = IR3_COND_NE;
+		dst[0] = ir3_n2b(b, dst[0]);
+		break;
+	case nir_op_b2f:
+		dst[0] = ir3_COV(b, ir3_b2n(b, src[0]), TYPE_U32, TYPE_F32);
+		break;
+	case nir_op_b2i:
+		dst[0] = ir3_b2n(b, src[0]);
+		break;
+	case nir_op_i2b:
+		dst[0] = ir3_CMPS_S(b, src[0], 0, create_immed(b, 0), 0);
+		dst[0]->cat2.condition = IR3_COND_NE;
+		dst[0] = ir3_n2b(b, dst[0]);
+		break;
+
+	case nir_op_fneg:
+		dst[0] = ir3_ABSNEG_F(b, src[0], IR3_REG_FNEG);
+		break;
+	case nir_op_fabs:
+		dst[0] = ir3_ABSNEG_F(b, src[0], IR3_REG_FABS);
+		break;
+	case nir_op_fmax:
+		dst[0] = ir3_MAX_F(b, src[0], 0, src[1], 0);
+		break;
+	case nir_op_fmin:
+		dst[0] = ir3_MIN_F(b, src[0], 0, src[1], 0);
+		break;
+	case nir_op_fsat:
+		/* if there is just a single use of the src, and it supports
+		 * (sat) bit, we can just fold the (sat) flag back to the
+		 * src instruction and create a mov.  This is easier for cp
+		 * to eliminate.
+		 *
+		 * TODO probably opc_cat==4 is ok too
+		 */
+		if (alu->src[0].src.is_ssa &&
+				(list_length(&alu->src[0].src.ssa->uses) == 1) &&
+				((opc_cat(src[0]->opc) == 2) || (opc_cat(src[0]->opc) == 3))) {
+			src[0]->flags |= IR3_INSTR_SAT;
+			dst[0] = ir3_MOV(b, src[0], TYPE_U32);
+		} else {
+			/* otherwise generate a max.f that saturates.. blob does
+			 * similar (generating a cat2 mov using max.f)
+			 */
+			dst[0] = ir3_MAX_F(b, src[0], 0, src[0], 0);
+			dst[0]->flags |= IR3_INSTR_SAT;
+		}
+		break;
+	case nir_op_fmul:
+		dst[0] = ir3_MUL_F(b, src[0], 0, src[1], 0);
+		break;
+	case nir_op_fadd:
+		dst[0] = ir3_ADD_F(b, src[0], 0, src[1], 0);
+		break;
+	case nir_op_fsub:
+		dst[0] = ir3_ADD_F(b, src[0], 0, src[1], IR3_REG_FNEG);
+		break;
+	case nir_op_ffma:
+		dst[0] = ir3_MAD_F32(b, src[0], 0, src[1], 0, src[2], 0);
+		break;
+	case nir_op_fddx:
+		dst[0] = ir3_DSX(b, src[0], 0);
+		dst[0]->cat5.type = TYPE_F32;
+		break;
+	case nir_op_fddy:
+		dst[0] = ir3_DSY(b, src[0], 0);
+		dst[0]->cat5.type = TYPE_F32;
+		break;
+	case nir_op_flt:
+		dst[0] = ir3_CMPS_F(b, src[0], 0, src[1], 0);
+		dst[0]->cat2.condition = IR3_COND_LT;
+		dst[0] = ir3_n2b(b, dst[0]);
+		break;
+	case nir_op_fge:
+		dst[0] = ir3_CMPS_F(b, src[0], 0, src[1], 0);
+		dst[0]->cat2.condition = IR3_COND_GE;
+		dst[0] = ir3_n2b(b, dst[0]);
+		break;
+	case nir_op_feq:
+		dst[0] = ir3_CMPS_F(b, src[0], 0, src[1], 0);
+		dst[0]->cat2.condition = IR3_COND_EQ;
+		dst[0] = ir3_n2b(b, dst[0]);
+		break;
+	case nir_op_fne:
+		dst[0] = ir3_CMPS_F(b, src[0], 0, src[1], 0);
+		dst[0]->cat2.condition = IR3_COND_NE;
+		dst[0] = ir3_n2b(b, dst[0]);
+		break;
+	case nir_op_fceil:
+		dst[0] = ir3_CEIL_F(b, src[0], 0);
+		break;
+	case nir_op_ffloor:
+		dst[0] = ir3_FLOOR_F(b, src[0], 0);
+		break;
+	case nir_op_ftrunc:
+		dst[0] = ir3_TRUNC_F(b, src[0], 0);
+		break;
+	case nir_op_fround_even:
+		dst[0] = ir3_RNDNE_F(b, src[0], 0);
+		break;
+	case nir_op_fsign:
+		dst[0] = ir3_SIGN_F(b, src[0], 0);
+		break;
+
+	case nir_op_fsin:
+		dst[0] = ir3_SIN(b, src[0], 0);
+		break;
+	case nir_op_fcos:
+		dst[0] = ir3_COS(b, src[0], 0);
+		break;
+	case nir_op_frsq:
+		dst[0] = ir3_RSQ(b, src[0], 0);
+		break;
+	case nir_op_frcp:
+		dst[0] = ir3_RCP(b, src[0], 0);
+		break;
+	case nir_op_flog2:
+		dst[0] = ir3_LOG2(b, src[0], 0);
+		break;
+	case nir_op_fexp2:
+		dst[0] = ir3_EXP2(b, src[0], 0);
+		break;
+	case nir_op_fsqrt:
+		dst[0] = ir3_SQRT(b, src[0], 0);
+		break;
+
+	case nir_op_iabs:
+		dst[0] = ir3_ABSNEG_S(b, src[0], IR3_REG_SABS);
+		break;
+	case nir_op_iadd:
+		dst[0] = ir3_ADD_U(b, src[0], 0, src[1], 0);
+		break;
+	case nir_op_iand:
+		dst[0] = ir3_AND_B(b, src[0], 0, src[1], 0);
+		break;
+	case nir_op_imax:
+		dst[0] = ir3_MAX_S(b, src[0], 0, src[1], 0);
+		break;
+	case nir_op_umax:
+		dst[0] = ir3_MAX_U(b, src[0], 0, src[1], 0);
+		break;
+	case nir_op_imin:
+		dst[0] = ir3_MIN_S(b, src[0], 0, src[1], 0);
+		break;
+	case nir_op_umin:
+		dst[0] = ir3_MIN_U(b, src[0], 0, src[1], 0);
+		break;
+	case nir_op_imul:
+		/*
+		 * dst = (al * bl) + (ah * bl << 16) + (al * bh << 16)
+		 *   mull.u tmp0, a, b           ; mul low, i.e. al * bl
+		 *   madsh.m16 tmp1, a, b, tmp0  ; mul-add shift high mix, i.e. ah * bl << 16
+		 *   madsh.m16 dst, b, a, tmp1   ; i.e. al * bh << 16
+		 */
+		dst[0] = ir3_MADSH_M16(b, src[1], 0, src[0], 0,
+					ir3_MADSH_M16(b, src[0], 0, src[1], 0,
+						ir3_MULL_U(b, src[0], 0, src[1], 0), 0), 0);
+		break;
+	case nir_op_ineg:
+		dst[0] = ir3_ABSNEG_S(b, src[0], IR3_REG_SNEG);
+		break;
+	case nir_op_inot:
+		dst[0] = ir3_NOT_B(b, src[0], 0);
+		break;
+	case nir_op_ior:
+		dst[0] = ir3_OR_B(b, src[0], 0, src[1], 0);
+		break;
+	case nir_op_ishl:
+		dst[0] = ir3_SHL_B(b, src[0], 0, src[1], 0);
+		break;
+	case nir_op_ishr:
+		dst[0] = ir3_ASHR_B(b, src[0], 0, src[1], 0);
+		break;
+	case nir_op_isign: {
+		/* maybe this would be sane to lower in nir.. */
+		struct ir3_instruction *neg, *pos;
+
+		neg = ir3_CMPS_S(b, src[0], 0, create_immed(b, 0), 0);
+		neg->cat2.condition = IR3_COND_LT;
+
+		pos = ir3_CMPS_S(b, src[0], 0, create_immed(b, 0), 0);
+		pos->cat2.condition = IR3_COND_GT;
+
+		dst[0] = ir3_SUB_U(b, pos, 0, neg, 0);
+
+		break;
+	}
+	case nir_op_isub:
+		dst[0] = ir3_SUB_U(b, src[0], 0, src[1], 0);
+		break;
+	case nir_op_ixor:
+		dst[0] = ir3_XOR_B(b, src[0], 0, src[1], 0);
+		break;
+	case nir_op_ushr:
+		dst[0] = ir3_SHR_B(b, src[0], 0, src[1], 0);
+		break;
+	case nir_op_ilt:
+		dst[0] = ir3_CMPS_S(b, src[0], 0, src[1], 0);
+		dst[0]->cat2.condition = IR3_COND_LT;
+		dst[0] = ir3_n2b(b, dst[0]);
+		break;
+	case nir_op_ige:
+		dst[0] = ir3_CMPS_S(b, src[0], 0, src[1], 0);
+		dst[0]->cat2.condition = IR3_COND_GE;
+		dst[0] = ir3_n2b(b, dst[0]);
+		break;
+	case nir_op_ieq:
+		dst[0] = ir3_CMPS_S(b, src[0], 0, src[1], 0);
+		dst[0]->cat2.condition = IR3_COND_EQ;
+		dst[0] = ir3_n2b(b, dst[0]);
+		break;
+	case nir_op_ine:
+		dst[0] = ir3_CMPS_S(b, src[0], 0, src[1], 0);
+		dst[0]->cat2.condition = IR3_COND_NE;
+		dst[0] = ir3_n2b(b, dst[0]);
+		break;
+	case nir_op_ult:
+		dst[0] = ir3_CMPS_U(b, src[0], 0, src[1], 0);
+		dst[0]->cat2.condition = IR3_COND_LT;
+		dst[0] = ir3_n2b(b, dst[0]);
+		break;
+	case nir_op_uge:
+		dst[0] = ir3_CMPS_U(b, src[0], 0, src[1], 0);
+		dst[0]->cat2.condition = IR3_COND_GE;
+		dst[0] = ir3_n2b(b, dst[0]);
+		break;
+
+	case nir_op_bcsel: {
+		struct ir3_instruction *cond = ir3_b2n(b, src[0]);
+		compile_assert(ctx, bs[1] == bs[2]);
+		/* the boolean condition is 32b even if src[1] and src[2] are
+		 * half-precision, but sel.b16 wants all three src's to be the
+		 * same type.
+		 */
+		if (bs[1] < 32)
+			cond = ir3_COV(b, cond, TYPE_U32, TYPE_U16);
+		dst[0] = ir3_SEL_B32(b, src[1], 0, cond, 0, src[2], 0);
+		break;
+	}
+	case nir_op_bit_count:
+		dst[0] = ir3_CBITS_B(b, src[0], 0);
+		break;
+	case nir_op_ifind_msb: {
+		struct ir3_instruction *cmp;
+		dst[0] = ir3_CLZ_S(b, src[0], 0);
+		cmp = ir3_CMPS_S(b, dst[0], 0, create_immed(b, 0), 0);
+		cmp->cat2.condition = IR3_COND_GE;
+		dst[0] = ir3_SEL_B32(b,
+				ir3_SUB_U(b, create_immed(b, 31), 0, dst[0], 0), 0,
+				cmp, 0, dst[0], 0);
+		break;
+	}
+	case nir_op_ufind_msb:
+		dst[0] = ir3_CLZ_B(b, src[0], 0);
+		dst[0] = ir3_SEL_B32(b,
+				ir3_SUB_U(b, create_immed(b, 31), 0, dst[0], 0), 0,
+				src[0], 0, dst[0], 0);
+		break;
+	case nir_op_find_lsb:
+		dst[0] = ir3_BFREV_B(b, src[0], 0);
+		dst[0] = ir3_CLZ_B(b, dst[0], 0);
+		break;
+	case nir_op_bitfield_reverse:
+		dst[0] = ir3_BFREV_B(b, src[0], 0);
+		break;
+
+	default:
+		compile_error(ctx, "Unhandled ALU op: %s\n",
+				nir_op_infos[alu->op].name);
+		break;
+	}
+
+	put_dst(ctx, &alu->dest.dest);
+}
+
+/* Handles direct/indirect UBO reads.
+ *
+ * intr->src[0] is the UBO index and intr->src[1] the offset into that
+ * buffer; each may be either a constant or a runtime value.  One ldg is
+ * emitted per component and stored in dst[0..num_components-1].
+ */
+static void
+emit_intrinsic_load_ubo(struct ir3_context *ctx, nir_intrinsic_instr *intr,
+		struct ir3_instruction **dst)
+{
+	struct ir3_block *b = ctx->block;
+	struct ir3_instruction *base_lo, *base_hi, *addr, *src0, *src1;
+	nir_const_value *const_offset;
+	/* UBO addresses are the first driver params: */
+	unsigned ubo = regid(ctx->so->constbase.ubo, 0);
+	const unsigned ptrsz = pointer_size(ctx);
+
+	int off = 0;
+
+	/* First src is ubo index, which could either be an immed or not: */
+	src0 = get_src(ctx, &intr->src[0])[0];
+	if (is_same_type_mov(src0) &&
+			(src0->regs[1]->flags & IR3_REG_IMMED)) {
+		base_lo = create_uniform(ctx, ubo + (src0->regs[1]->iim_val * ptrsz));
+		base_hi = create_uniform(ctx, ubo + (src0->regs[1]->iim_val * ptrsz) + 1);
+	} else {
+		/* Scale the runtime index by the same per-UBO stride (ptrsz)
+		 * that the immediate path above applies, rather than by a
+		 * hard-coded 4.
+		 * NOTE(review): relies on get_addr()'s last argument being the
+		 * multiplier applied to the index - confirm against get_addr().
+		 */
+		base_lo = create_uniform_indirect(ctx, ubo, get_addr(ctx, src0, ptrsz));
+		base_hi = create_uniform_indirect(ctx, ubo + 1, get_addr(ctx, src0, ptrsz));
+	}
+
+	/* note: on 32bit gpu's base_hi is ignored and DCE'd */
+	addr = base_lo;
+
+	const_offset = nir_src_as_const_value(intr->src[1]);
+	if (const_offset) {
+		off += const_offset->u32[0];
+	} else {
+		/* For load_ubo_indirect, second src is indirect offset: */
+		src1 = get_src(ctx, &intr->src[1])[0];
+
+		/* and add offset to addr: */
+		addr = ir3_ADD_S(b, addr, 0, src1, 0);
+	}
+
+	/* if offset is too large to encode in the ldg, split it out: */
+	if ((off + (intr->num_components * 4)) > 1024) {
+		/* split out the minimal amount to improve the odds that
+		 * cp can fit the immediate in the add.s instruction:
+		 */
+		unsigned off2 = off + (intr->num_components * 4) - 1024;
+		addr = ir3_ADD_S(b, addr, 0, create_immed(b, off2), 0);
+		off -= off2;
+	}
+
+	if (ptrsz == 2) {
+		struct ir3_instruction *carry;
+
+		/* handle 32b rollover, ie:
+		 *   if (addr < base_lo)
+		 *      base_hi++
+		 */
+		carry = ir3_CMPS_U(b, addr, 0, base_lo, 0);
+		carry->cat2.condition = IR3_COND_LT;
+		base_hi = ir3_ADD_S(b, base_hi, 0, carry, 0);
+
+		addr = create_collect(ctx, (struct ir3_instruction*[]){ addr, base_hi }, 2);
+	}
+
+	/* one scalar load per component, 4 bytes apart: */
+	for (int i = 0; i < intr->num_components; i++) {
+		struct ir3_instruction *load =
+				ir3_LDG(b, addr, 0, create_immed(b, 1), 0);
+		load->cat6.type = TYPE_U32;
+		load->cat6.src_offset = off + i * 4;     /* byte offset */
+		dst[i] = load;
+	}
+}
+
+/* Emit an SSBO load (ldgb) and split its result into dst[].
+ *
+ * src[] = { buffer_index, offset }. No const_index
+ */
+static void
+emit_intrinsic_load_ssbo(struct ir3_context *ctx, nir_intrinsic_instr *intr,
+		struct ir3_instruction **dst)
+{
+	struct ir3_block *b = ctx->block;
+	struct ir3_instruction *offset, *zero, *offset_vec, *shifted, *ldgb;
+	nir_const_value *const_offset;
+
+	/* can this be non-const buffer_index?  how do we handle that? */
+	const_offset = nir_src_as_const_value(intr->src[0]);
+	compile_assert(ctx, const_offset);
+
+	offset = get_src(ctx, &intr->src[1])[0];
+
+	/* first operand is uvec2(offset, 0); nir has already multiplied
+	 * the offset by four:
+	 */
+	zero = create_immed(b, 0);
+	offset_vec = create_collect(ctx,
+			(struct ir3_instruction*[]){ offset, zero }, 2);
+
+	/* second operand is the same offset shifted right by two: */
+	shifted = ir3_SHR_B(b, offset, 0, create_immed(b, 2), 0);
+
+	ldgb = ir3_LDGB(b, create_immed(b, const_offset->u32[0]), 0,
+			offset_vec, 0, shifted, 0);
+
+	ldgb->barrier_class = IR3_BARRIER_BUFFER_R;
+	ldgb->barrier_conflict = IR3_BARRIER_BUFFER_W;
+	ldgb->cat6.type = TYPE_U32;
+	ldgb->cat6.d = 4;
+	ldgb->cat6.iim_val = intr->num_components;
+	ldgb->regs[0]->wrmask = MASK(intr->num_components);
+
+	split_dest(b, dst, ldgb, 0, intr->num_components);
+}
+
+/* Emit an SSBO store (stgb) and register it in the block's keeps list.
+ *
+ * src[] = { value, block_index, offset }. const_index[] = { write_mask }
+ */
+static void
+emit_intrinsic_store_ssbo(struct ir3_context *ctx, nir_intrinsic_instr *intr)
+{
+	struct ir3_block *b = ctx->block;
+	struct ir3_instruction *offset, *value, *shifted, *offset_vec, *stgb;
+	nir_const_value *const_offset;
+
+	/* TODO handle wrmask properly, see _store_shared().. but I think
+	 * it is more a PITA than that, since blob ends up loading the
+	 * masked components and writing them back out.
+	 *
+	 * For now only the run of enabled channels starting at bit 0 is
+	 * counted:
+	 */
+	unsigned wrmask = intr->const_index[0];
+	unsigned ncomp = ffs(~wrmask) - 1;
+
+	/* can this be non-const buffer_index?  how do we handle that? */
+	const_offset = nir_src_as_const_value(intr->src[1]);
+	compile_assert(ctx, const_offset);
+
+	offset = get_src(ctx, &intr->src[2])[0];
+
+	/* operands after the buffer index: the value, the offset shifted
+	 * right by two, and uvec2(offset, 0).. nir already *= 4:
+	 */
+	value = create_collect(ctx, get_src(ctx, &intr->src[0]), ncomp);
+	shifted = ir3_SHR_B(b, offset, 0, create_immed(b, 2), 0);
+	offset_vec = create_collect(ctx, (struct ir3_instruction*[]){
+		offset,
+		create_immed(b, 0),
+	}, 2);
+
+	stgb = ir3_STGB(b, create_immed(b, const_offset->u32[0]), 0,
+			value, 0, shifted, 0, offset_vec, 0);
+
+	stgb->barrier_class = IR3_BARRIER_BUFFER_W;
+	stgb->barrier_conflict = IR3_BARRIER_BUFFER_R | IR3_BARRIER_BUFFER_W;
+	stgb->cat6.type = TYPE_U32;
+	stgb->cat6.d = 4;
+	stgb->cat6.iim_val = ncomp;
+
+	/* the store has no consumer, so record it in keeps: */
+	array_insert(b, b->keeps, stgb);
+}
+
+/* Emit the size of an SSBO, read back from a const register.
+ *
+ * src[] = { block_index }
+ */
+static void
+emit_intrinsic_ssbo_size(struct ir3_context *ctx, nir_intrinsic_instr *intr,
+		struct ir3_instruction **dst)
+{
+	nir_const_value *const_offset;
+	unsigned blk_idx, idx;
+
+	/* The block index is dereferenced below, so it must be a constant.
+	 * Check it the same way the SSBO load/store/atomic paths in this
+	 * file check their buffer index:
+	 */
+	const_offset = nir_src_as_const_value(intr->src[0]);
+	compile_assert(ctx, const_offset);
+
+	/* SSBO size stored as a const starting at ssbo_sizes: */
+	blk_idx = const_offset->u32[0];
+	idx = regid(ctx->so->constbase.ssbo_sizes, 0) +
+		ctx->so->const_layout.ssbo_size.off[blk_idx];
+
+	debug_assert(ctx->so->const_layout.ssbo_size.mask & (1 << blk_idx));
+
+	dst[0] = create_uniform(ctx, idx);
+}
+
+/*
+ * SSBO atomic intrinsics
+ *
+ * Each SSBO atomic reads a value from memory, combines it with the
+ * supplied data using the selected operation, writes the result back to
+ * memory, and returns the value that was originally read.
+ *
+ * Sources (3 for every operation except CompSwap, which has 4):
+ *
+ * 0: The SSBO buffer index.
+ * 1: The offset, within the SSBO buffer, of the variable that the atomic
+ *    operation works on.
+ * 2: The data parameter of the atomic function (e.g. the value to add
+ *    for ssbo_atomic_add).
+ * 3: CompSwap only: the second data parameter.
+ */
+static struct ir3_instruction *
+emit_intrinsic_atomic_ssbo(struct ir3_context *ctx, nir_intrinsic_instr *intr)
+{
+	struct ir3_block *b = ctx->block;
+	struct ir3_instruction *buf, *offset, *data, *shifted, *offset_vec;
+	struct ir3_instruction *atomic;
+	nir_const_value *const_offset;
+	type_t type = TYPE_U32;     /* signed min/max switch this to S32 */
+
+	/* can this be non-const buffer_index?  how do we handle that? */
+	const_offset = nir_src_as_const_value(intr->src[0]);
+	compile_assert(ctx, const_offset);
+	buf = create_immed(b, const_offset->u32[0]);
+
+	offset = get_src(ctx, &intr->src[1])[0];
+
+	/* Operands following the buffer index:
+	 *   data       - the value (or uvec2(data, compare) for cmpxchg)
+	 *   shifted    - offset >> 2
+	 *   offset_vec - uvec2(offset, 0) (appears to be 64b byte offset)
+	 *
+	 * Note that nir already multiplies the offset by four
+	 */
+	data = get_src(ctx, &intr->src[2])[0];
+	shifted = ir3_SHR_B(b, offset, 0, create_immed(b, 2), 0);
+	offset_vec = create_collect(ctx, (struct ir3_instruction*[]){
+		offset,
+		create_immed(b, 0),
+	}, 2);
+
+	switch (intr->intrinsic) {
+	case nir_intrinsic_ssbo_atomic_add:
+		atomic = ir3_ATOMIC_ADD_G(b, buf, 0, data, 0, shifted, 0, offset_vec, 0);
+		break;
+	case nir_intrinsic_ssbo_atomic_imin:
+		atomic = ir3_ATOMIC_MIN_G(b, buf, 0, data, 0, shifted, 0, offset_vec, 0);
+		type = TYPE_S32;
+		break;
+	case nir_intrinsic_ssbo_atomic_umin:
+		atomic = ir3_ATOMIC_MIN_G(b, buf, 0, data, 0, shifted, 0, offset_vec, 0);
+		break;
+	case nir_intrinsic_ssbo_atomic_imax:
+		atomic = ir3_ATOMIC_MAX_G(b, buf, 0, data, 0, shifted, 0, offset_vec, 0);
+		type = TYPE_S32;
+		break;
+	case nir_intrinsic_ssbo_atomic_umax:
+		atomic = ir3_ATOMIC_MAX_G(b, buf, 0, data, 0, shifted, 0, offset_vec, 0);
+		break;
+	case nir_intrinsic_ssbo_atomic_and:
+		atomic = ir3_ATOMIC_AND_G(b, buf, 0, data, 0, shifted, 0, offset_vec, 0);
+		break;
+	case nir_intrinsic_ssbo_atomic_or:
+		atomic = ir3_ATOMIC_OR_G(b, buf, 0, data, 0, shifted, 0, offset_vec, 0);
+		break;
+	case nir_intrinsic_ssbo_atomic_xor:
+		atomic = ir3_ATOMIC_XOR_G(b, buf, 0, data, 0, shifted, 0, offset_vec, 0);
+		break;
+	case nir_intrinsic_ssbo_atomic_exchange:
+		atomic = ir3_ATOMIC_XCHG_G(b, buf, 0, data, 0, shifted, 0, offset_vec, 0);
+		break;
+	case nir_intrinsic_ssbo_atomic_comp_swap:
+		/* cmpxchg packs src[3] and src[2] into a two-component value: */
+		data = create_collect(ctx, (struct ir3_instruction*[]){
+			get_src(ctx, &intr->src[3])[0],
+			data,
+		}, 2);
+		atomic = ir3_ATOMIC_CMPXCHG_G(b, buf, 0, data, 0, shifted, 0, offset_vec, 0);
+		break;
+	default:
+		unreachable("boo");
+	}
+
+	atomic->barrier_class = IR3_BARRIER_BUFFER_W;
+	atomic->barrier_conflict = IR3_BARRIER_BUFFER_R | IR3_BARRIER_BUFFER_W;
+	atomic->cat6.type = type;
+	atomic->cat6.d = 4;
+	atomic->cat6.iim_val = 1;
+
+	/* the instruction can't be DCE'd even when its result is unused: */
+	array_insert(b, b->keeps, atomic);
+
+	return atomic;
+}
+
+/* Emit a shared-variable load (ldl) and split its result into dst[].
+ *
+ * src[] = { offset }. const_index[] = { base }
+ */
+static void
+emit_intrinsic_load_shared(struct ir3_context *ctx, nir_intrinsic_instr *intr,
+		struct ir3_instruction **dst)
+{
+	struct ir3_block *b = ctx->block;
+	struct ir3_instruction *offset = get_src(ctx, &intr->src[0])[0];
+	unsigned base = nir_intrinsic_base(intr);
+	struct ir3_instruction *ldl;
+
+	/* second operand is the number of components to read: */
+	ldl = ir3_LDL(b, offset, 0, create_immed(b, intr->num_components), 0);
+
+	ldl->regs[0]->wrmask = MASK(intr->num_components);
+	ldl->cat6.type = utype_dst(intr->dest);
+	ldl->cat6.src_offset = base;
+
+	ldl->barrier_conflict = IR3_BARRIER_SHARED_W;
+	ldl->barrier_class = IR3_BARRIER_SHARED_R;
+
+	split_dest(b, dst, ldl, 0, intr->num_components);
+}
+
+/* Emit shared-variable stores (stl), one per run of consecutive enabled
+ * writemask channels.
+ *
+ * src[] = { value, offset }. const_index[] = { base, write_mask }
+ */
+static void
+emit_intrinsic_store_shared(struct ir3_context *ctx, nir_intrinsic_instr *intr)
+{
+	struct ir3_block *b = ctx->block;
+	struct ir3_instruction * const *value = get_src(ctx, &intr->src[0]);
+	struct ir3_instruction *offset = get_src(ctx, &intr->src[1])[0];
+	unsigned base = nir_intrinsic_base(intr);
+	unsigned wrmask = nir_intrinsic_write_mask(intr);
+
+	/* Each iteration peels one block of consecutive enabled channels
+	 * off the writemask and emits a single store for it.  ffs finds
+	 * the first enabled channel; ffs on the inverted, down-shifted
+	 * mask then gives the length of that block.
+	 *
+	 * (trick stolen from i965's fs_visitor::nir_emit_cs_intrinsic())
+	 */
+	while (wrmask != 0) {
+		struct ir3_instruction *stl;
+		unsigned first_component = ffs(wrmask) - 1;
+		unsigned length = ffs(~(wrmask >> first_component)) - 1;
+
+		stl = ir3_STL(b, offset, 0,
+			create_collect(ctx, &value[first_component], length), 0,
+			create_immed(b, length), 0);
+
+		stl->barrier_class = IR3_BARRIER_SHARED_W;
+		stl->barrier_conflict = IR3_BARRIER_SHARED_R | IR3_BARRIER_SHARED_W;
+		stl->cat6.type = utype_src(intr->src[0]);
+		stl->cat6.dst_offset = first_component + base;
+
+		array_insert(b, b->keeps, stl);
+
+		/* Drop the channels just written; anything left in the mask
+		 * is handled by the next iteration.
+		 */
+		wrmask &= (15 << (first_component + length));
+	}
+}
+
+/*
+ * CS shared variable atomic intrinsics
+ *
+ * Each shared variable atomic reads a value from memory, combines it
+ * with the supplied data using the selected operation, writes the result
+ * back to memory, and returns the value that was originally read.
+ *
+ * Sources (2 for every operation except CompSwap, which has 3):
+ *
+ * 0: The offset, within the shared variable storage region, that the
+ *    atomic operation works on.
+ * 1: The data parameter of the atomic function (e.g. the value to add
+ *    for shared_atomic_add).
+ * 2: CompSwap only: the second data parameter.
+ */
+static struct ir3_instruction *
+emit_intrinsic_atomic_shared(struct ir3_context *ctx, nir_intrinsic_instr *intr)
+{
+	struct ir3_block *b = ctx->block;
+	struct ir3_instruction *offset = get_src(ctx, &intr->src[0])[0];
+	struct ir3_instruction *data = get_src(ctx, &intr->src[1])[0];
+	struct ir3_instruction *atomic;
+	type_t type = TYPE_U32;     /* signed min/max switch this to S32 */
+
+	switch (intr->intrinsic) {
+	case nir_intrinsic_shared_atomic_add:
+		atomic = ir3_ATOMIC_ADD(b, offset, 0, data, 0);
+		break;
+	case nir_intrinsic_shared_atomic_imin:
+		atomic = ir3_ATOMIC_MIN(b, offset, 0, data, 0);
+		type = TYPE_S32;
+		break;
+	case nir_intrinsic_shared_atomic_umin:
+		atomic = ir3_ATOMIC_MIN(b, offset, 0, data, 0);
+		break;
+	case nir_intrinsic_shared_atomic_imax:
+		atomic = ir3_ATOMIC_MAX(b, offset, 0, data, 0);
+		type = TYPE_S32;
+		break;
+	case nir_intrinsic_shared_atomic_umax:
+		atomic = ir3_ATOMIC_MAX(b, offset, 0, data, 0);
+		break;
+	case nir_intrinsic_shared_atomic_and:
+		atomic = ir3_ATOMIC_AND(b, offset, 0, data, 0);
+		break;
+	case nir_intrinsic_shared_atomic_or:
+		atomic = ir3_ATOMIC_OR(b, offset, 0, data, 0);
+		break;
+	case nir_intrinsic_shared_atomic_xor:
+		atomic = ir3_ATOMIC_XOR(b, offset, 0, data, 0);
+		break;
+	case nir_intrinsic_shared_atomic_exchange:
+		atomic = ir3_ATOMIC_XCHG(b, offset, 0, data, 0);
+		break;
+	case nir_intrinsic_shared_atomic_comp_swap:
+		/* cmpxchg packs src[2] and src[1] into a two-component value: */
+		data = create_collect(ctx, (struct ir3_instruction*[]){
+			get_src(ctx, &intr->src[2])[0],
+			data,
+		}, 2);
+		atomic = ir3_ATOMIC_CMPXCHG(b, offset, 0, data, 0);
+		break;
+	default:
+		unreachable("boo");
+	}
+
+	atomic->barrier_class = IR3_BARRIER_SHARED_W;
+	atomic->barrier_conflict = IR3_BARRIER_SHARED_R | IR3_BARRIER_SHARED_W;
+	atomic->cat6.type = type;
+	atomic->cat6.d = 1;
+	atomic->cat6.iim_val = 1;
+
+	/* the instruction can't be DCE'd even when its result is unused: */
+	array_insert(b, b->keeps, atomic);
+
+	return atomic;
+}
+
+/* Images are mapped into the SSBO/image state (for store/atomic) and
+ * into the texture state block (for load).  To keep things simple the
+ * image id is inverted and counted from the end of the state block:
+ * image 0 becomes num-1, image 1 becomes num-2, and so on.  This
+ * potentially avoids needing to re-emit texture state when switching
+ * shaders.
+ *
+ * TODO is max # of samplers and SSBOs the same.  This shouldn't be hard-
+ * coded.  Also, since all the gl shader stages (ie. everything but CS)
+ * share the same SSBO/image state block, this might require some more
+ * logic if we supported images in anything other than FS..
+ */
+static unsigned
+get_image_slot(struct ir3_context *ctx, nir_deref_instr *deref)
+{
+	/* TODO figure out real limit per generation, and don't hardcode: */
+	const unsigned max_samplers = 16;
+	unsigned int flat_index = 0;
+	unsigned stride = 1;
+
+	/* Walk up the deref chain to the variable, flattening the
+	 * (constant) array indices along the way:
+	 */
+	while (deref->deref_type != nir_deref_type_var) {
+		assert(deref->deref_type == nir_deref_type_array);
+		nir_const_value *const_index = nir_src_as_const_value(deref->arr.index);
+		assert(const_index);
+
+		/* Step to the parent, whose type is the array being indexed */
+		deref = nir_deref_instr_parent(deref);
+
+		assert(glsl_type_is_array(deref->type));
+		const unsigned array_len = glsl_get_length(deref->type);
+
+		/* the index is clamped to the last element of the array: */
+		flat_index += MIN2(const_index->u32[0], array_len - 1) * stride;
+
+		/* elements of the next-outer array are this much bigger */
+		stride *= array_len;
+	}
+
+	flat_index += deref->var->data.driver_location;
+
+	return max_samplers - flat_index - 1;
+}
+
+/* Return the number of coordinates for an image variable and, when
+ * flagsp is non-NULL, store the matching IR3_INSTR_x flags through it.
+ *
+ * see tex_info() for equiv logic for texture instructions.. it would be
+ * nice if this could be better unified..
+ */
+static unsigned
+get_image_coords(const nir_variable *var, unsigned *flagsp)
+{
+	const struct glsl_type *type = glsl_without_array(var->type);
+	unsigned flags = 0;
+	unsigned coords;
+
+	switch (glsl_get_sampler_dim(type)) {
+	case GLSL_SAMPLER_DIM_BUF:
+	case GLSL_SAMPLER_DIM_1D:
+		coords = 1;
+		break;
+	case GLSL_SAMPLER_DIM_MS:
+	case GLSL_SAMPLER_DIM_EXTERNAL:
+	case GLSL_SAMPLER_DIM_RECT:
+	case GLSL_SAMPLER_DIM_2D:
+		coords = 2;
+		break;
+	case GLSL_SAMPLER_DIM_CUBE:
+	case GLSL_SAMPLER_DIM_3D:
+		coords = 3;
+		flags |= IR3_INSTR_3D;
+		break;
+	default:
+		unreachable("bad sampler dim");
+		return 0;
+	}
+
+	/* note: unlike tex_info(), the array index counts as a coord: */
+	if (glsl_sampler_type_is_array(type)) {
+		flags |= IR3_INSTR_A;
+		coords += 1;
+	}
+
+	if (flagsp != NULL)
+		*flagsp = flags;
+
+	return coords;
+}
+
+/* Map the sampler result type of an image variable to the ir3 type. */
+static type_t
+get_image_type(const nir_variable *var)
+{
+	const struct glsl_type *type = glsl_without_array(var->type);
+
+	switch (glsl_get_sampler_result_type(type)) {
+	case GLSL_TYPE_FLOAT:
+		return TYPE_F32;
+	case GLSL_TYPE_INT:
+		return TYPE_S32;
+	case GLSL_TYPE_UINT:
+		return TYPE_U32;
+	default:
+		unreachable("bad sampler type.");
+		return 0;
+	}
+}
+
+/* Build the two-component (offset, 0) operand for an image access at the
+ * given coords.  The offset is computed from consts in the image_dims
+ * range; with byteoff == false it is additionally shifted right by two.
+ */
+static struct ir3_instruction *
+get_image_offset(struct ir3_context *ctx, const nir_variable *var,
+		struct ir3_instruction * const *coords, bool byteoff)
+{
+	struct ir3_block *b = ctx->block;
+	const unsigned ncoords = get_image_coords(var, NULL);
+	struct ir3_instruction *offset;
+
+	/* to calculate the byte offset (yes, uggg) we need (up to) three
+	 * const values to know the bytes per pixel, and y and z stride:
+	 */
+	const unsigned cb = regid(ctx->so->constbase.image_dims, 0) +
+		ctx->so->const_layout.image_dims.off[var->data.driver_location];
+
+	debug_assert(ctx->so->const_layout.image_dims.mask &
+			(1 << var->data.driver_location));
+
+	/* offset = coords.x * bytes_per_pixel: */
+	offset = ir3_MUL_S(b, coords[0], 0, create_uniform(ctx, cb + 0), 0);
+
+	/* offset += coords.y * y_pitch: */
+	if (ncoords > 1)
+		offset = ir3_MAD_S24(b, create_uniform(ctx, cb + 1), 0,
+				coords[1], 0, offset, 0);
+
+	/* offset += coords.z * z_pitch: */
+	if (ncoords > 2)
+		offset = ir3_MAD_S24(b, create_uniform(ctx, cb + 2), 0,
+				coords[2], 0, offset, 0);
+
+	/* Some cases, like atomics, seem to use dword offset instead
+	 * of byte offsets.. blob just puts an extra shr.b in there
+	 * in those cases:
+	 */
+	if (!byteoff)
+		offset = ir3_SHR_B(b, offset, 0, create_immed(b, 2), 0);
+
+	return create_collect(ctx, (struct ir3_instruction*[]){
+		offset,
+		create_immed(b, 0),
+	}, 2);
+}
+
+/* Emit an image load (isam) and split its four result channels into
+ * dst[].
+ *
+ * src[] = { deref, coord, sample_index }. const_index[] = {}
+ */
+static void
+emit_intrinsic_load_image(struct ir3_context *ctx, nir_intrinsic_instr *intr,
+		struct ir3_instruction **dst)
+{
+	struct ir3_block *b = ctx->block;
+	const nir_variable *var = nir_intrinsic_get_var(intr, 0);
+	struct ir3_instruction * const *src0 = get_src(ctx, &intr->src[1]);
+	struct ir3_instruction *coords[4];
+	struct ir3_instruction *sam;
+	unsigned flags;
+	unsigned ncoords = get_image_coords(var, &flags);
+	unsigned tex_idx = get_image_slot(ctx, nir_src_as_deref(intr->src[0]));
+	type_t type = get_image_type(var);
+	unsigned i;
+
+	/* hmm, this seems a bit odd, but it is what blob does and (at least
+	 * a5xx) just faults on bogus addresses otherwise:
+	 */
+	if (flags & IR3_INSTR_3D)
+		flags = (flags & ~IR3_INSTR_3D) | IR3_INSTR_A;
+
+	for (i = 0; i < ncoords; i++)
+		coords[i] = src0[i];
+
+	/* a single coordinate gets a zero appended as second component: */
+	if (ncoords == 1) {
+		coords[1] = create_immed(b, 0);
+		ncoords = 2;
+	}
+
+	sam = ir3_SAM(b, OPC_ISAM, type, 0xf, flags,
+			tex_idx, tex_idx, create_collect(ctx, coords, ncoords), NULL);
+
+	sam->barrier_conflict = IR3_BARRIER_IMAGE_W;
+	sam->barrier_class = IR3_BARRIER_IMAGE_R;
+
+	split_dest(b, dst, sam, 0, 4);
+}
+
+/* Emit an image store (stib) and register it in the block's keeps list.
+ *
+ * src[] = { deref, coord, sample_index, value }. const_index[] = {}
+ */
+static void
+emit_intrinsic_store_image(struct ir3_context *ctx, nir_intrinsic_instr *intr)
+{
+	struct ir3_block *b = ctx->block;
+	const nir_variable *var = nir_intrinsic_get_var(intr, 0);
+	struct ir3_instruction * const *value = get_src(ctx, &intr->src[3]);
+	struct ir3_instruction * const *coords = get_src(ctx, &intr->src[1]);
+	const unsigned ncoords = get_image_coords(var, NULL);
+	const unsigned tex_idx = get_image_slot(ctx, nir_src_as_deref(intr->src[0]));
+	struct ir3_instruction *offset, *stib;
+
+	/* stib operands, after the image slot:
+	 *   value
+	 *   coords
+	 *   64b byte offset
+	 */
+	offset = get_image_offset(ctx, var, coords, true);
+
+	/* NOTE: stib seems to take byte offset, but stgb.typed can be used
+	 * too and takes a dword offset.. not quite sure yet why blob uses
+	 * one over the other in various cases.
+	 */
+	stib = ir3_STIB(b, create_immed(b, tex_idx), 0,
+			create_collect(ctx, value, 4), 0,
+			create_collect(ctx, coords, ncoords), 0,
+			offset, 0);
+
+	stib->barrier_class = IR3_BARRIER_IMAGE_W;
+	stib->barrier_conflict = IR3_BARRIER_IMAGE_R | IR3_BARRIER_IMAGE_W;
+	stib->cat6.typed = true;
+	stib->cat6.type = get_image_type(var);
+	stib->cat6.d = ncoords;
+	stib->cat6.iim_val = 4;
+
+	/* the store has no consumer, so record it in keeps: */
+	array_insert(b, b->keeps, stib);
+}
+
+/* Emit an image size query (getsize) and copy the relevant channels of
+ * its result into dst[].
+ */
+static void
+emit_intrinsic_image_size(struct ir3_context *ctx, nir_intrinsic_instr *intr,
+		struct ir3_instruction **dst)
+{
+	struct ir3_block *b = ctx->block;
+	const nir_variable *var = nir_intrinsic_get_var(intr, 0);
+	const unsigned tex_idx = get_image_slot(ctx, nir_src_as_deref(intr->src[0]));
+	unsigned flags;
+	const unsigned ncoords = get_image_coords(var, &flags);
+	struct ir3_instruction *lod, *sam;
+
+	/* Array size actually ends up in .w rather than .z. This doesn't
+	 * matter for miplevel 0, but for higher mips the value in z is
+	 * minified whereas w stays. Also, the value in TEX_CONST_3_DEPTH is
+	 * returned, which means that we have to add 1 to it for arrays for
+	 * a3xx.
+	 *
+	 * The result is split into a temporary and copied afterwards,
+	 * since the size of the dst array that is passed in is based on
+	 * nir's understanding of the result size, not the hardware's
+	 */
+	struct ir3_instruction *tmp[4];
+
+	lod = create_immed(b, 0);
+	sam = ir3_SAM(b, OPC_GETSIZE, TYPE_U32, 0xf, flags,
+			tex_idx, tex_idx, lod, NULL);
+
+	split_dest(b, tmp, sam, 0, 4);
+
+	/* get_size instruction returns size in bytes instead of texels
+	 * for imageBuffer, so we need to divide it by the pixel size
+	 * of the image format.
+	 *
+	 * TODO: This is at least true on a5xx. Check other gens.
+	 */
+	if (glsl_get_sampler_dim(glsl_without_array(var->type)) ==
+			GLSL_SAMPLER_DIM_BUF) {
+		/* Since all the possible values the divisor can take are
+		 * power-of-two (4, 8, or 16), the division is implemented
+		 * as a shift-right.
+		 * During shader setup, the log2 of the image format's
+		 * bytes-per-pixel should have been emitted in 2nd slot of
+		 * image_dims. See ir3_shader::emit_image_dims().
+		 */
+		const unsigned cb = regid(ctx->so->constbase.image_dims, 0) +
+			ctx->so->const_layout.image_dims.off[var->data.driver_location];
+		struct ir3_instruction *shift = create_uniform(ctx, cb + 1);
+
+		tmp[0] = ir3_SHR_B(b, tmp[0], 0, shift, 0);
+	}
+
+	for (unsigned i = 0; i < ncoords; i++)
+		dst[i] = tmp[i];
+
+	/* for arrays the last component is replaced by the value from .w: */
+	if (!(flags & IR3_INSTR_A))
+		return;
+
+	if (ctx->compiler->levels_add_one)
+		dst[ncoords-1] = ir3_ADD_U(b, tmp[3], 0, create_immed(b, 1), 0);
+	else
+		dst[ncoords-1] = ir3_MOV(b, tmp[3], TYPE_U32);
+}
+
+/* Emit an image atomic and return the instruction producing its result.
+ *
+ * src[] = { deref, coord, sample_index, value, compare }. const_index[] = {}
+ */
+static struct ir3_instruction *
+emit_intrinsic_atomic_image(struct ir3_context *ctx, nir_intrinsic_instr *intr)
+{
+	struct ir3_block *block = ctx->block;
+	const nir_variable *image_var = nir_intrinsic_get_var(intr, 0);
+	struct ir3_instruction *result, *slot, *data, *coord_vec, *byte_off;
+	struct ir3_instruction * const *coords = get_src(ctx, &intr->src[1]);
+	const unsigned ncoords = get_image_coords(image_var, NULL);
+
+	slot = create_immed(block,
+			get_image_slot(ctx, nir_src_as_deref(intr->src[0])));
+
+	/* operands of the atomic, in order:
+	 *   data      - value (or uvec2(value, compare))
+	 *   coord_vec - coords
+	 *   byte_off  - 64b byte offset
+	 */
+	data = get_src(ctx, &intr->src[3])[0];
+	coord_vec = create_collect(ctx, coords, ncoords);
+	byte_off = get_image_offset(ctx, image_var, coords, false);
+
+	switch (intr->intrinsic) {
+	case nir_intrinsic_image_deref_atomic_add:
+		result = ir3_ATOMIC_ADD_G(block, slot, 0, data, 0,
+				coord_vec, 0, byte_off, 0);
+		break;
+	case nir_intrinsic_image_deref_atomic_min:
+		result = ir3_ATOMIC_MIN_G(block, slot, 0, data, 0,
+				coord_vec, 0, byte_off, 0);
+		break;
+	case nir_intrinsic_image_deref_atomic_max:
+		result = ir3_ATOMIC_MAX_G(block, slot, 0, data, 0,
+				coord_vec, 0, byte_off, 0);
+		break;
+	case nir_intrinsic_image_deref_atomic_and:
+		result = ir3_ATOMIC_AND_G(block, slot, 0, data, 0,
+				coord_vec, 0, byte_off, 0);
+		break;
+	case nir_intrinsic_image_deref_atomic_or:
+		result = ir3_ATOMIC_OR_G(block, slot, 0, data, 0,
+				coord_vec, 0, byte_off, 0);
+		break;
+	case nir_intrinsic_image_deref_atomic_xor:
+		result = ir3_ATOMIC_XOR_G(block, slot, 0, data, 0,
+				coord_vec, 0, byte_off, 0);
+		break;
+	case nir_intrinsic_image_deref_atomic_exchange:
+		result = ir3_ATOMIC_XCHG_G(block, slot, 0, data, 0,
+				coord_vec, 0, byte_off, 0);
+		break;
+	case nir_intrinsic_image_deref_atomic_comp_swap: {
+		/* cmpxchg takes [ui]vec2(data, compare) as its data operand;
+		 * the collect is built with src[4] first, then src[3]:
+		 */
+		struct ir3_instruction *pair[2];
+
+		pair[0] = get_src(ctx, &intr->src[4])[0];
+		pair[1] = data;
+		data = create_collect(ctx, pair, 2);
+		result = ir3_ATOMIC_CMPXCHG_G(block, slot, 0, data, 0,
+				coord_vec, 0, byte_off, 0);
+		break;
+	}
+	default:
+		unreachable("boo");
+	}
+
+	result->cat6.iim_val = 1;
+	result->cat6.d = ncoords;
+	result->cat6.type = get_image_type(image_var);
+	result->cat6.typed = true;
+	result->barrier_class = IR3_BARRIER_IMAGE_W;
+	result->barrier_conflict = IR3_BARRIER_IMAGE_R | IR3_BARRIER_IMAGE_W;
+
+	/* the instruction must survive DCE even when its result is unused: */
+	array_insert(block, block->keeps, result);
+
+	return result;
+}
+
+/* Emit a barrier or fence for one of the NIR barrier intrinsics.
+ *
+ * nir_intrinsic_barrier becomes a BAR instruction; every other handled
+ * intrinsic becomes a FENCE which differs only in whether the .l bit is
+ * set and in which barrier class / conflict mask it is tagged with.
+ */
+static void
+emit_intrinsic_barrier(struct ir3_context *ctx, nir_intrinsic_instr *intr)
+{
+	struct ir3_block *block = ctx->block;
+	struct ir3_instruction *instr;
+
+	if (intr->intrinsic == nir_intrinsic_barrier) {
+		instr = ir3_BAR(block);
+		instr->cat7.g = true;
+		instr->cat7.l = true;
+		instr->flags = IR3_INSTR_SS | IR3_INSTR_SY;
+		instr->barrier_class = IR3_BARRIER_EVERYTHING;
+	} else {
+		unsigned wclass = 0, conflicts = 0;
+		bool local = false;
+
+		/* pick the per-intrinsic settings first, emit afterwards: */
+		switch (intr->intrinsic) {
+		case nir_intrinsic_memory_barrier:
+			wclass = IR3_BARRIER_IMAGE_W | IR3_BARRIER_BUFFER_W;
+			conflicts =
+				IR3_BARRIER_IMAGE_R | IR3_BARRIER_IMAGE_W |
+				IR3_BARRIER_BUFFER_R | IR3_BARRIER_BUFFER_W;
+			break;
+		case nir_intrinsic_memory_barrier_atomic_counter:
+		case nir_intrinsic_memory_barrier_buffer:
+			wclass = IR3_BARRIER_BUFFER_W;
+			conflicts = IR3_BARRIER_BUFFER_R | IR3_BARRIER_BUFFER_W;
+			break;
+		case nir_intrinsic_memory_barrier_image:
+			// TODO double check if this should have .g set
+			wclass = IR3_BARRIER_IMAGE_W;
+			conflicts = IR3_BARRIER_IMAGE_R | IR3_BARRIER_IMAGE_W;
+			break;
+		case nir_intrinsic_memory_barrier_shared:
+			local = true;
+			wclass = IR3_BARRIER_SHARED_W;
+			conflicts = IR3_BARRIER_SHARED_R | IR3_BARRIER_SHARED_W;
+			break;
+		case nir_intrinsic_group_memory_barrier:
+			local = true;
+			wclass = IR3_BARRIER_SHARED_W |
+				IR3_BARRIER_IMAGE_W |
+				IR3_BARRIER_BUFFER_W;
+			conflicts =
+				IR3_BARRIER_SHARED_R | IR3_BARRIER_SHARED_W |
+				IR3_BARRIER_IMAGE_R | IR3_BARRIER_IMAGE_W |
+				IR3_BARRIER_BUFFER_R | IR3_BARRIER_BUFFER_W;
+			break;
+		default:
+			unreachable("boo");
+		}
+
+		instr = ir3_FENCE(block);
+		instr->cat7.g = true;
+		if (local)
+			instr->cat7.l = true;
+		instr->cat7.r = true;
+		instr->cat7.w = true;
+		instr->barrier_class = wclass;
+		instr->barrier_conflict = conflicts;
+	}
+
+	/* keep the barrier alive through DCE: */
+	array_insert(block, block->keeps, instr);
+}
+
+/* Append a system-value input to the shader variant and record 'instr'
+ * as the ir3 input that provides it.  The new entry takes the next free
+ * index in so->inputs[], is marked flat-interpolated, and uses the given
+ * component mask.
+ */
+static void add_sysval_input_compmask(struct ir3_context *ctx,
+		gl_system_value slot, unsigned compmask,
+		struct ir3_instruction *instr)
+{
+	struct ir3_shader_variant *variant = ctx->so;
+	struct ir3 *ir = ctx->ir;
+	const unsigned index = variant->inputs_count;
+	const unsigned reg = regid(index, 0);
+
+	variant->inputs_count++;
+	variant->total_in++;
+
+	variant->inputs[index].sysval = true;
+	variant->inputs[index].slot = slot;
+	variant->inputs[index].compmask = compmask;
+	variant->inputs[index].regid = reg;
+	variant->inputs[index].interpolate = INTERP_MODE_FLAT;
+
+	/* grow the input count so that it covers the new register: */
+	if (ir->ninputs < reg + 1)
+		ir->ninputs = reg + 1;
+	ir->inputs[reg] = instr;
+}
+
+/* Convenience wrapper around add_sysval_input_compmask() that passes a
+ * component mask with only the lowest bit set.
+ */
+static void add_sysval_input(struct ir3_context *ctx, gl_system_value slot,
+		struct ir3_instruction *instr)
+{
+	const unsigned lowest_comp_only = 0x1;
+
+	add_sysval_input_compmask(ctx, slot, lowest_comp_only, instr);
+}
+
+static void
+emit_intrinsic(struct ir3_context *ctx, nir_intrinsic_instr *intr)
+{	/* Translate one NIR intrinsic into ir3 instruction(s).
+	 *
+	 * If the intrinsic has a destination, the dst array is obtained from
+	 * get_dst() before the switch, filled in by the matching case, and
+	 * finished with put_dst() at the bottom.  Intrinsics that have no
+	 * case below are reported through compile_error().
+	 */
+	const nir_intrinsic_info *info = &nir_intrinsic_infos[intr->intrinsic];
+	struct ir3_instruction **dst;
+	struct ir3_instruction * const *src;
+	struct ir3_block *b = ctx->block;
+	nir_const_value *const_offset;
+	int idx, comp;
+
+	if (info->has_dest) {
+		unsigned n = nir_intrinsic_dest_components(intr);
+		dst = get_dst(ctx, &intr->dest, n);
+	} else {
+		dst = NULL;	/* nothing to fill in for this intrinsic */
+	}
+
+	switch (intr->intrinsic) {
+	case nir_intrinsic_load_uniform:	/* scalar index is idx * 4 + component */
+		idx = nir_intrinsic_base(intr);
+		const_offset = nir_src_as_const_value(intr->src[0]);
+		if (const_offset) {	/* constant offset: fold it into the base */
+			idx += const_offset->u32[0];
+			for (int i = 0; i < intr->num_components; i++) {
+				unsigned n = idx * 4 + i;
+				dst[i] = create_uniform(ctx, n);
+			}
+		} else {	/* non-constant offset: address comes from src[0] via get_addr() */
+			src = get_src(ctx, &intr->src[0]);
+			for (int i = 0; i < intr->num_components; i++) {
+				int n = idx * 4 + i;
+				dst[i] = create_uniform_indirect(ctx, n,
+						get_addr(ctx, src[0], 4));
+			}
+			/* NOTE: if relative addressing is used, we set
+			 * constlen in the compiler (to worst-case value)
+			 * since we don't know in the assembler what the max
+			 * addr reg value can be:
+			 */
+			ctx->so->constlen = ctx->s->num_uniforms;
+		}
+		break;
+	case nir_intrinsic_load_ubo:
+		emit_intrinsic_load_ubo(ctx, intr, dst);
+		break;
+	case nir_intrinsic_load_input:	/* scalar index is idx * 4 + i + comp */
+		idx = nir_intrinsic_base(intr);
+		comp = nir_intrinsic_component(intr);
+		const_offset = nir_src_as_const_value(intr->src[0]);
+		if (const_offset) {	/* constant offset: pick the inputs directly */
+			idx += const_offset->u32[0];
+			for (int i = 0; i < intr->num_components; i++) {
+				unsigned n = idx * 4 + i + comp;
+				dst[i] = ctx->ir->inputs[n];
+			}
+		} else {	/* non-constant offset: indirect load out of a collect of all inputs */
+			src = get_src(ctx, &intr->src[0]);
+			struct ir3_instruction *collect =
+					create_collect(ctx, ctx->ir->inputs, ctx->ir->ninputs);
+			struct ir3_instruction *addr = get_addr(ctx, src[0], 4);
+			for (int i = 0; i < intr->num_components; i++) {
+				unsigned n = idx * 4 + i + comp;
+				dst[i] = create_indirect_load(ctx, ctx->ir->ninputs,
+						n, addr, collect);
+			}
+		}
+		break;
+	case nir_intrinsic_load_ssbo:
+		emit_intrinsic_load_ssbo(ctx, intr, dst);
+		break;
+	case nir_intrinsic_store_ssbo:
+		emit_intrinsic_store_ssbo(ctx, intr);
+		break;
+	case nir_intrinsic_get_buffer_size:
+		emit_intrinsic_ssbo_size(ctx, intr, dst);
+		break;
+	case nir_intrinsic_ssbo_atomic_add:
+	case nir_intrinsic_ssbo_atomic_imin:
+	case nir_intrinsic_ssbo_atomic_umin:
+	case nir_intrinsic_ssbo_atomic_imax:
+	case nir_intrinsic_ssbo_atomic_umax:
+	case nir_intrinsic_ssbo_atomic_and:
+	case nir_intrinsic_ssbo_atomic_or:
+	case nir_intrinsic_ssbo_atomic_xor:
+	case nir_intrinsic_ssbo_atomic_exchange:
+	case nir_intrinsic_ssbo_atomic_comp_swap:
+		dst[0] = emit_intrinsic_atomic_ssbo(ctx, intr);	/* helper returns the one result instruction */
+		break;
+	case nir_intrinsic_load_shared:
+		emit_intrinsic_load_shared(ctx, intr, dst);
+		break;
+	case nir_intrinsic_store_shared:
+		emit_intrinsic_store_shared(ctx, intr);
+		break;
+	case nir_intrinsic_shared_atomic_add:
+	case nir_intrinsic_shared_atomic_imin:
+	case nir_intrinsic_shared_atomic_umin:
+	case nir_intrinsic_shared_atomic_imax:
+	case nir_intrinsic_shared_atomic_umax:
+	case nir_intrinsic_shared_atomic_and:
+	case nir_intrinsic_shared_atomic_or:
+	case nir_intrinsic_shared_atomic_xor:
+	case nir_intrinsic_shared_atomic_exchange:
+	case nir_intrinsic_shared_atomic_comp_swap:
+		dst[0] = emit_intrinsic_atomic_shared(ctx, intr);
+		break;
+	case nir_intrinsic_image_deref_load:
+		emit_intrinsic_load_image(ctx, intr, dst);
+		break;
+	case nir_intrinsic_image_deref_store:
+		emit_intrinsic_store_image(ctx, intr);
+		break;
+	case nir_intrinsic_image_deref_size:
+		emit_intrinsic_image_size(ctx, intr, dst);
+		break;
+	case nir_intrinsic_image_deref_atomic_add:
+	case nir_intrinsic_image_deref_atomic_min:
+	case nir_intrinsic_image_deref_atomic_max:
+	case nir_intrinsic_image_deref_atomic_and:
+	case nir_intrinsic_image_deref_atomic_or:
+	case nir_intrinsic_image_deref_atomic_xor:
+	case nir_intrinsic_image_deref_atomic_exchange:
+	case nir_intrinsic_image_deref_atomic_comp_swap:
+		dst[0] = emit_intrinsic_atomic_image(ctx, intr);
+		break;
+	case nir_intrinsic_barrier:
+	case nir_intrinsic_memory_barrier:
+	case nir_intrinsic_group_memory_barrier:
+	case nir_intrinsic_memory_barrier_atomic_counter:
+	case nir_intrinsic_memory_barrier_buffer:
+	case nir_intrinsic_memory_barrier_image:
+	case nir_intrinsic_memory_barrier_shared:
+		emit_intrinsic_barrier(ctx, intr);
+		/* note that blk ptr no longer valid, make that obvious: */
+		b = NULL;
+		break;
+	case nir_intrinsic_store_output:	/* src[0] is the value, src[1] the offset */
+		idx = nir_intrinsic_base(intr);
+		comp = nir_intrinsic_component(intr);
+		const_offset = nir_src_as_const_value(intr->src[1]);
+		compile_assert(ctx, const_offset != NULL);	/* only constant offsets are handled */
+		idx += const_offset->u32[0];
+
+		src = get_src(ctx, &intr->src[0]);
+		for (int i = 0; i < intr->num_components; i++) {
+			unsigned n = idx * 4 + i + comp;
+			ctx->ir->outputs[n] = src[i];
+		}
+		break;
+	case nir_intrinsic_load_base_vertex:
+	case nir_intrinsic_load_first_vertex:
+		if (!ctx->basevertex) {	/* created on first use, then shared by later loads */
+			ctx->basevertex = create_driver_param(ctx, IR3_DP_VTXID_BASE);
+			add_sysval_input(ctx, SYSTEM_VALUE_FIRST_VERTEX, ctx->basevertex);
+		}
+		dst[0] = ctx->basevertex;
+		break;
+	case nir_intrinsic_load_vertex_id_zero_base:
+	case nir_intrinsic_load_vertex_id:
+		if (!ctx->vertex_id) {	/* NOTE(review): the sysval slot recorded is that of
+					 * whichever of the two intrinsics is seen first; a
+					 * later load of the other one reuses the same input
+					 */
+			gl_system_value sv = (intr->intrinsic == nir_intrinsic_load_vertex_id) ?
+				SYSTEM_VALUE_VERTEX_ID : SYSTEM_VALUE_VERTEX_ID_ZERO_BASE;
+			ctx->vertex_id = create_input(ctx, 0);
+			add_sysval_input(ctx, sv, ctx->vertex_id);
+		}
+		dst[0] = ctx->vertex_id;
+		break;
+	case nir_intrinsic_load_instance_id:
+		if (!ctx->instance_id) {
+			ctx->instance_id = create_input(ctx, 0);
+			add_sysval_input(ctx, SYSTEM_VALUE_INSTANCE_ID,
+					ctx->instance_id);
+		}
+		dst[0] = ctx->instance_id;
+		break;
+	case nir_intrinsic_load_sample_id:
+	case nir_intrinsic_load_sample_id_no_per_sample:
+		if (!ctx->samp_id) {
+			ctx->samp_id = create_input(ctx, 0);
+			ctx->samp_id->regs[0]->flags |= IR3_REG_HALF;
+			add_sysval_input(ctx, SYSTEM_VALUE_SAMPLE_ID,
+					ctx->samp_id);
+		}
+		dst[0] = ir3_COV(b, ctx->samp_id, TYPE_U16, TYPE_U32);	/* half input, converted on every use */
+		break;
+	case nir_intrinsic_load_sample_mask_in:
+		if (!ctx->samp_mask_in) {
+			ctx->samp_mask_in = create_input(ctx, 0);
+			add_sysval_input(ctx, SYSTEM_VALUE_SAMPLE_MASK_IN,
+					ctx->samp_mask_in);
+		}
+		dst[0] = ctx->samp_mask_in;
+		break;
+	case nir_intrinsic_load_user_clip_plane:	/* four driver params per plane, starting at IR3_DP_UCP0_X */
+		idx = nir_intrinsic_ucp_id(intr);
+		for (int i = 0; i < intr->num_components; i++) {
+			unsigned n = idx * 4 + i;
+			dst[i] = create_driver_param(ctx, IR3_DP_UCP0_X + n);
+		}
+		break;
+	case nir_intrinsic_load_front_face:
+		if (!ctx->frag_face) {
+			ctx->so->frag_face = true;
+			ctx->frag_face = create_input(ctx, 0);
+			add_sysval_input(ctx, SYSTEM_VALUE_FRONT_FACE, ctx->frag_face);
+			ctx->frag_face->regs[0]->flags |= IR3_REG_HALF;
+		}
+		/* for fragface, we get -1 for back and 0 for front. However this is
+		 * the inverse of what nir expects (where ~0 is true).
+		 */
+		dst[0] = ir3_COV(b, ctx->frag_face, TYPE_S16, TYPE_S32);
+		dst[0] = ir3_NOT_B(b, dst[0], 0);
+		break;
+	case nir_intrinsic_load_local_invocation_id:
+		if (!ctx->local_invocation_id) {	/* one input with component mask 0x7 */
+			ctx->local_invocation_id = create_input_compmask(ctx, 0, 0x7);
+			add_sysval_input_compmask(ctx, SYSTEM_VALUE_LOCAL_INVOCATION_ID,
+					0x7, ctx->local_invocation_id);
+		}
+		split_dest(b, dst, ctx->local_invocation_id, 0, 3);
+		break;
+	case nir_intrinsic_load_work_group_id:
+		if (!ctx->work_group_id) {
+			ctx->work_group_id = create_input_compmask(ctx, 0, 0x7);
+			add_sysval_input_compmask(ctx, SYSTEM_VALUE_WORK_GROUP_ID,
+					0x7, ctx->work_group_id);
+			ctx->work_group_id->regs[0]->flags |= IR3_REG_HIGH;
+		}
+		split_dest(b, dst, ctx->work_group_id, 0, 3);
+		break;
+	case nir_intrinsic_load_num_work_groups:
+		for (int i = 0; i < intr->num_components; i++) {
+			dst[i] = create_driver_param(ctx, IR3_DP_NUM_WORK_GROUPS_X + i);
+		}
+		break;
+	case nir_intrinsic_load_local_group_size:
+		for (int i = 0; i < intr->num_components; i++) {
+			dst[i] = create_driver_param(ctx, IR3_DP_LOCAL_GROUP_SIZE_X + i);
+		}
+		break;
+	case nir_intrinsic_discard_if:
+	case nir_intrinsic_discard: {
+		struct ir3_instruction *cond, *kill;
+
+		if (intr->intrinsic == nir_intrinsic_discard_if) {
+			/* conditional discard: */
+			src = get_src(ctx, &intr->src[0]);
+			cond = ir3_b2n(b, src[0]);
+		} else {
+			/* unconditional discard: */
+			cond = create_immed(b, 1);
+		}
+
+		/* NOTE: only cmps.*.* can write p0.x: */
+		cond = ir3_CMPS_S(b, cond, 0, create_immed(b, 0), 0);
+		cond->cat2.condition = IR3_COND_NE;
+
+		/* condition always goes in predicate register: */
+		cond->regs[0]->num = regid(REG_P0, 0);
+
+		kill = ir3_KILL(b, cond, 0);
+		array_insert(ctx->ir, ctx->ir->predicates, kill);
+
+		array_insert(b, b->keeps, kill);	/* the kill has no result, so keep it explicitly */
+		ctx->so->has_kill = true;
+
+		break;
+	}
+	default:
+		compile_error(ctx, "Unhandled intrinsic type: %s\n",
+				nir_intrinsic_infos[intr->intrinsic].name);
+		break;
+	}
+
+	if (info->has_dest)
+		put_dst(ctx, &intr->dest);
+}
+
+/* Materialize a NIR constant as one immediate per component.  Constants
+ * narrower than 32 bits are emitted as TYPE_U16, everything else as
+ * TYPE_U32.
+ *
+ * NOTE(review): the value is always read through the u32[] view of the
+ * constant, also for the sub-32-bit case - confirm that is intended.
+ */
+static void
+emit_load_const(struct ir3_context *ctx, nir_load_const_instr *instr)
+{
+	struct ir3_instruction **dst = get_dst_ssa(ctx, &instr->def,
+			instr->def.num_components);
+	type_t type;
+
+	if (instr->def.bit_size < 32)
+		type = TYPE_U16;
+	else
+		type = TYPE_U32;
+
+	for (int c = 0; c < instr->def.num_components; c++) {
+		dst[c] = create_immed_typed(ctx->block,
+				instr->value.u32[c], type);
+	}
+}
+
+static void
+emit_undef(struct ir3_context *ctx, nir_ssa_undef_instr *undef)
+{
+	struct ir3_instruction **dst = get_dst_ssa(ctx, &undef->def,
+			undef->def.num_components);
+	type_t type = (undef->def.bit_size < 32) ? TYPE_U16 : TYPE_U32;
+
+	/* backend doesn't want undefined instructions, so just plug
+	 * in 0.0..
+	 */
+	for (int i = 0; i < undef->def.num_components; i++)
+		dst[i] = create_immed_typed(ctx->block, fui(0.0), type);
+}
+
+/*
+ * texture fetch/sample instructions:
+ */
+
+/* Derive the instruction flags and the number of coordinate components
+ * for a texture instruction.
+ *
+ * The count comes from the sampler dimension rather than from
+ * tex->coord_components (which would not work for txs), and it does not
+ * include the array index, since that goes after the shadow reference.
+ */
+static void
+tex_info(nir_tex_instr *tex, unsigned *flagsp, unsigned *coordsp)
+{
+	const bool is_lod_query = (tex->op == nir_texop_lod);
+	unsigned ncoords;
+	unsigned instr_flags = 0;
+
+	switch (tex->sampler_dim) {
+	case GLSL_SAMPLER_DIM_1D:
+	case GLSL_SAMPLER_DIM_BUF:
+		ncoords = 1;
+		break;
+	case GLSL_SAMPLER_DIM_2D:
+	case GLSL_SAMPLER_DIM_RECT:
+	case GLSL_SAMPLER_DIM_EXTERNAL:
+	case GLSL_SAMPLER_DIM_MS:
+		ncoords = 2;
+		break;
+	case GLSL_SAMPLER_DIM_3D:
+	case GLSL_SAMPLER_DIM_CUBE:
+		instr_flags |= IR3_INSTR_3D;
+		ncoords = 3;
+		break;
+	default:
+		unreachable("bad sampler_dim");
+	}
+
+	/* the shadow and array flags are left off for the lod query: */
+	if (!is_lod_query) {
+		if (tex->is_shadow)
+			instr_flags |= IR3_INSTR_S;
+		if (tex->is_array)
+			instr_flags |= IR3_INSTR_A;
+	}
+
+	*flagsp = instr_flags;
+	*coordsp = ncoords;
+}
+
+static void
+emit_tex(struct ir3_context *ctx, nir_tex_instr *tex)
+{	/* Emit the ir3 sample/fetch instruction for a NIR texture
+	 * instruction.  txs and query_levels never get here, emit_instr()
+	 * sends those to their own helpers.
+	 *
+	 * The NIR sources are gathered first, then packed into two collected
+	 * arguments (src0: coords, shadow reference, array index, projector
+	 * and derivatives; src1: offsets and lod/bias), and finally the four
+	 * result components are split out into dst.
+	 */
+	struct ir3_block *b = ctx->block;
+	struct ir3_instruction **dst, *sam, *src0[12], *src1[4];
+	struct ir3_instruction * const *coord, * const *off, * const *ddx, * const *ddy;
+	struct ir3_instruction *lod, *compare, *proj, *sample_index;
+	bool has_bias = false, has_lod = false, has_proj = false, has_off = false;
+	unsigned i, coords, flags;
+	unsigned nsrc0 = 0, nsrc1 = 0;
+	type_t type;
+	opc_t opc = 0;
+
+	coord = off = ddx = ddy = NULL;
+	lod = proj = compare = sample_index = NULL;
+
+	/* TODO: might just be one component for gathers? */
+	dst = get_dst(ctx, &tex->dest, 4);
+
+	for (unsigned i = 0; i < tex->num_srcs; i++) {	/* this 'i' shadows the function-scope one */
+		switch (tex->src[i].src_type) {
+		case nir_tex_src_coord:
+			coord = get_src(ctx, &tex->src[i].src);
+			break;
+		case nir_tex_src_bias:	/* bias and lod share the 'lod' variable; the flags tell them apart */
+			lod = get_src(ctx, &tex->src[i].src)[0];
+			has_bias = true;
+			break;
+		case nir_tex_src_lod:
+			lod = get_src(ctx, &tex->src[i].src)[0];
+			has_lod = true;
+			break;
+		case nir_tex_src_comparator: /* shadow comparator */
+			compare = get_src(ctx, &tex->src[i].src)[0];
+			break;
+		case nir_tex_src_projector:
+			proj = get_src(ctx, &tex->src[i].src)[0];
+			has_proj = true;
+			break;
+		case nir_tex_src_offset:
+			off = get_src(ctx, &tex->src[i].src);
+			has_off = true;
+			break;
+		case nir_tex_src_ddx:
+			ddx = get_src(ctx, &tex->src[i].src);
+			break;
+		case nir_tex_src_ddy:
+			ddy = get_src(ctx, &tex->src[i].src);
+			break;
+		case nir_tex_src_ms_index:
+			sample_index = get_src(ctx, &tex->src[i].src)[0];
+			break;
+		default:
+			compile_error(ctx, "Unhandled NIR tex src type: %d\n",
+					tex->src[i].src_type);
+			return;
+		}
+	}
+
+	switch (tex->op) {	/* pick the opcode; ops with no ir3 equivalent here are errors */
+	case nir_texop_tex:      opc = has_lod ? OPC_SAML : OPC_SAM; break;
+	case nir_texop_txb:      opc = OPC_SAMB;     break;
+	case nir_texop_txl:      opc = OPC_SAML;     break;
+	case nir_texop_txd:      opc = OPC_SAMGQ;    break;
+	case nir_texop_txf:      opc = OPC_ISAML;    break;
+	case nir_texop_lod:      opc = OPC_GETLOD;   break;
+	case nir_texop_tg4:
+		/* NOTE: a4xx might need to emulate gather w/ txf (this is
+		 * what blob does, seems gather  is broken?), and a3xx did
+		 * not support it (but probably could also emulate).
+		 */
+		switch (tex->component) {	/* NOTE(review): no default, opc stays 0 for any other value */
+		case 0:              opc = OPC_GATHER4R; break;
+		case 1:              opc = OPC_GATHER4G; break;
+		case 2:              opc = OPC_GATHER4B; break;
+		case 3:              opc = OPC_GATHER4A; break;
+		}
+		break;
+	case nir_texop_txf_ms:   opc = OPC_ISAMM;    break;
+	case nir_texop_txs:
+	case nir_texop_query_levels:
+	case nir_texop_texture_samples:
+	case nir_texop_samples_identical:
+	case nir_texop_txf_ms_mcs:
+		compile_error(ctx, "Unhandled NIR tex type: %d\n", tex->op);
+		return;
+	}
+
+	tex_info(tex, &flags, &coords);
+
+	/*
+	 * lay out the first argument in the proper order:
+	 *  - actual coordinates first
+	 *  - shadow reference
+	 *  - array index
+	 *  - projection w
+	 *  - starting at offset 4, dpdx.xy, dpdy.xy
+	 *
+	 * bias/lod go into the second arg
+	 */
+
+	/* insert tex coords:
+	 * NOTE(review): coord is dereferenced unconditionally, so this
+	 * assumes a nir_tex_src_coord source was present - confirm.
+	 */
+	for (i = 0; i < coords; i++)
+		src0[i] = coord[i];
+
+	nsrc0 = i;
+
+	/* NOTE a3xx (and possibly a4xx?) might be different, using isaml
+	 * with scaled x coord according to requested sample:
+	 */
+	if (tex->op == nir_texop_txf_ms) {
+		if (ctx->compiler->txf_ms_with_isaml) {
+			/* the samples are laid out in x dimension as
+			 *     0 1 2 3
+			 * x_ms = (x << ms) + sample_index;
+			 */
+			struct ir3_instruction *ms;
+			ms = create_immed(b, (ctx->samples >> (2 * tex->texture_index)) & 3);	/* two bits of ctx->samples per texture */
+
+			src0[0] = ir3_SHL_B(b, src0[0], 0, ms, 0);
+			src0[0] = ir3_ADD_U(b, src0[0], 0, sample_index, 0);
+
+			opc = OPC_ISAML;
+		} else {
+			src0[nsrc0++] = sample_index;
+		}
+	}
+
+	/* scale up integer coords for TXF based on the LOD */
+	if (ctx->compiler->unminify_coords && (opc == OPC_ISAML)) {
+		assert(has_lod);
+		for (i = 0; i < coords; i++)
+			src0[i] = ir3_SHL_B(b, src0[i], 0, lod, 0);
+	}
+
+	if (coords == 1) {
+		/* hw doesn't do 1d, so we treat it as 2d with
+		 * height of 1, and patch up the y coord.
+		 * TODO: y coord should be (int)0 in some cases..
+		 */
+		src0[nsrc0++] = create_immed(b, fui(0.5));
+	}
+
+	if (tex->is_shadow && tex->op != nir_texop_lod)
+		src0[nsrc0++] = compare;
+
+	if (tex->is_array && tex->op != nir_texop_lod) {
+		struct ir3_instruction *idx = coord[coords];	/* array index follows the regular coords */
+
+		/* the array coord for cube arrays needs 0.5 added to it */
+		if (ctx->compiler->array_index_add_half && (opc != OPC_ISAML))
+			idx = ir3_ADD_F(b, idx, 0, create_immed(b, fui(0.5)), 0);
+
+		src0[nsrc0++] = idx;
+	}
+
+	if (has_proj) {
+		src0[nsrc0++] = proj;
+		flags |= IR3_INSTR_P;
+	}
+
+	/* pad to 4, then ddx/ddy: */
+	if (tex->op == nir_texop_txd) {
+		while (nsrc0 < 4)
+			src0[nsrc0++] = create_immed(b, fui(0.0));
+		for (i = 0; i < coords; i++)
+			src0[nsrc0++] = ddx[i];
+		if (coords < 2)
+			src0[nsrc0++] = create_immed(b, fui(0.0));
+		for (i = 0; i < coords; i++)
+			src0[nsrc0++] = ddy[i];
+		if (coords < 2)
+			src0[nsrc0++] = create_immed(b, fui(0.0));
+	}
+
+	/*
+	 * second argument (if applicable):
+	 *  - offsets
+	 *  - lod
+	 *  - bias
+	 */
+	if (has_off | has_lod | has_bias) {	/* non-short-circuit '|' on bools, same result as '||' */
+		if (has_off) {
+			unsigned off_coords = coords;
+			if (tex->sampler_dim == GLSL_SAMPLER_DIM_CUBE)
+				off_coords--;	/* cubes take one offset component fewer than coords */
+			for (i = 0; i < off_coords; i++)
+				src1[nsrc1++] = off[i];
+			if (off_coords < 2)
+				src1[nsrc1++] = create_immed(b, fui(0.0));
+			flags |= IR3_INSTR_O;
+		}
+
+		if (has_lod | has_bias)
+			src1[nsrc1++] = lod;
+	}
+
+	switch (tex->dest_type) {
+	case nir_type_invalid:
+	case nir_type_float:
+		type = TYPE_F32;
+		break;
+	case nir_type_int:
+		type = TYPE_S32;
+		break;
+	case nir_type_uint:
+	case nir_type_bool:
+		type = TYPE_U32;
+		break;
+	default:
+		unreachable("bad dest_type");
+	}
+
+	if (opc == OPC_GETLOD)
+		type = TYPE_U32;	/* raw result, converted to float further down */
+
+	unsigned tex_idx = tex->texture_index;
+
+	ctx->max_texture_index = MAX2(ctx->max_texture_index, tex_idx);
+
+	struct ir3_instruction *col0 = create_collect(ctx, src0, nsrc0);
+	struct ir3_instruction *col1 = create_collect(ctx, src1, nsrc1);
+
+	sam = ir3_SAM(b, opc, type, 0b1111, flags,
+			tex_idx, tex_idx, col0, col1);
+
+	if ((ctx->astc_srgb & (1 << tex_idx)) && !nir_tex_instr_is_query(tex)) {
+		/* only need first 3 components: */
+		sam->regs[0]->wrmask = 0x7;
+		split_dest(b, dst, sam, 0, 3);
+
+		/* we need to sample the alpha separately with a non-ASTC
+		 * texture state:
+		 */
+		sam = ir3_SAM(b, opc, type, 0b1000, flags,
+				tex_idx, tex_idx, col0, col1);
+
+		array_insert(ctx->ir, ctx->ir->astc_srgb, sam);
+
+		/* fixup .w component: */
+		split_dest(b, &dst[3], sam, 3, 1);
+	} else {
+		/* normal (non-workaround) case: */
+		split_dest(b, dst, sam, 0, 4);
+	}
+
+	/* GETLOD returns results in 4.8 fixed point */
+	if (opc == OPC_GETLOD) {
+		struct ir3_instruction *factor = create_immed(b, fui(1.0 / 256));
+
+		compile_assert(ctx, tex->dest_type == nir_type_float);
+		for (i = 0; i < 2; i++) {	/* only the first two components are rescaled */
+			dst[i] = ir3_MUL_F(b, ir3_COV(b, dst[i], TYPE_U32, TYPE_F32), 0,
+							   factor, 0);
+		}
+	}
+
+	put_dst(ctx, &tex->dest);
+}
+
+/* Emit the query for the number of mip levels of a texture, using a
+ * GETINFO instruction with only the .z component (writemask 0b0100)
+ * enabled.
+ */
+static void
+emit_tex_query_levels(struct ir3_context *ctx, nir_tex_instr *tex)
+{
+	struct ir3_block *block = ctx->block;
+	struct ir3_instruction **dst = get_dst(ctx, &tex->dest, 1);
+	struct ir3_instruction *getinfo;
+
+	getinfo = ir3_SAM(block, OPC_GETINFO, TYPE_U32, 0b0100, 0,
+			tex->texture_index, tex->texture_index, NULL, NULL);
+
+	/* There is only a single component, but since it lives in .z
+	 * rather than .x a split_dest() is still required:
+	 */
+	split_dest(block, dst, getinfo, 0, 3);
+
+	/* getinfo.z holds the # of levels.  The value in TEX_CONST_0 is
+	 * zero-based, so one has to be added to it.
+	 */
+	if (ctx->compiler->levels_add_one) {
+		struct ir3_instruction *one = create_immed(block, 1);
+
+		dst[0] = ir3_ADD_U(block, dst[0], 0, one, 0);
+	}
+
+	put_dst(ctx, &tex->dest);
+}
+
+/* Emit a texture size query (txs) as a GETSIZE instruction.  The only
+ * source expected on the NIR instruction is the lod.
+ */
+static void
+emit_tex_txs(struct ir3_context *ctx, nir_tex_instr *tex)
+{
+	struct ir3_block *block = ctx->block;
+	struct ir3_instruction **dst, *getsize, *lod;
+	unsigned flags, ndims;
+
+	tex_info(tex, &flags, &ndims);
+
+	/* What is wanted here is the number of dimensions rather than
+	 * the number of coordinates; the two only differ for cubes.
+	 */
+	if (tex->sampler_dim == GLSL_SAMPLER_DIM_CUBE)
+		ndims = 2;
+
+	dst = get_dst(ctx, &tex->dest, 4);
+
+	compile_assert(ctx, tex->num_srcs == 1);
+	compile_assert(ctx, tex->src[0].src_type == nir_tex_src_lod);
+
+	lod = get_src(ctx, &tex->src[0].src)[0];
+
+	getsize = ir3_SAM(block, OPC_GETSIZE, TYPE_U32, 0b1111, flags,
+			tex->texture_index, tex->texture_index, lod, NULL);
+
+	split_dest(block, dst, getsize, 0, 4);
+
+	/* The array size is found in .w rather than .z.  At miplevel 0
+	 * this makes no difference, but for higher mips z is minified
+	 * while w is not.  What is returned is the TEX_CONST_3_DEPTH
+	 * value, so for arrays one has to be added to it.
+	 */
+	if (tex->is_array) {
+		struct ir3_instruction *layers;
+
+		if (ctx->compiler->levels_add_one) {
+			struct ir3_instruction *one = create_immed(block, 1);
+			layers = ir3_ADD_U(block, dst[3], 0, one, 0);
+		} else {
+			layers = ir3_MOV(block, dst[3], TYPE_U32);
+		}
+
+		dst[ndims] = layers;
+	}
+
+	put_dst(ctx, &tex->dest);
+}
+
+/* Handle a NIR jump.  break/continue/return emit nothing; any other jump
+ * type is reported as an error.
+ */
+static void
+emit_jump(struct ir3_context *ctx, nir_jump_instr *jump)
+{
+	/* I *think* these can simply be ignored, with the successor
+	 * block link being used to figure out where break/continue
+	 * need to jump to
+	 */
+	const bool ignored = (jump->type == nir_jump_break) ||
+			(jump->type == nir_jump_continue) ||
+			(jump->type == nir_jump_return);
+
+	if (ignored)
+		return;
+
+	compile_error(ctx, "Unhandled NIR jump type: %d\n", jump->type);
+}
+
+/* Dispatch one NIR instruction to the emit helper for its type. */
+static void
+emit_instr(struct ir3_context *ctx, nir_instr *instr)
+{
+	switch (instr->type) {
+	case nir_instr_type_deref:
+		/* nothing to do, derefs are dealt with by the intrinsic
+		 * that uses them as a src
+		 */
+		break;
+	case nir_instr_type_alu:
+		emit_alu(ctx, nir_instr_as_alu(instr));
+		break;
+	case nir_instr_type_intrinsic:
+		emit_intrinsic(ctx, nir_instr_as_intrinsic(instr));
+		break;
+	case nir_instr_type_load_const:
+		emit_load_const(ctx, nir_instr_as_load_const(instr));
+		break;
+	case nir_instr_type_ssa_undef:
+		emit_undef(ctx, nir_instr_as_ssa_undef(instr));
+		break;
+	case nir_instr_type_jump:
+		emit_jump(ctx, nir_instr_as_jump(instr));
+		break;
+	case nir_instr_type_tex: {
+		nir_tex_instr *tex = nir_instr_as_tex(instr);
+
+		/* txs and query_levels have dedicated helpers, all
+		 * other tex ops take the common path:
+		 */
+		if (tex->op == nir_texop_txs)
+			emit_tex_txs(ctx, tex);
+		else if (tex->op == nir_texop_query_levels)
+			emit_tex_query_levels(ctx, tex);
+		else
+			emit_tex(ctx, tex);
+		break;
+	}
+	case nir_instr_type_phi:
+		/* phi webs have already been converted to regs in NIR */
+		compile_error(ctx, "Unexpected NIR instruction type: %d\n", instr->type);
+		break;
+	case nir_instr_type_call:
+	case nir_instr_type_parallel_copy:
+		compile_error(ctx, "Unhandled NIR instruction type: %d\n", instr->type);
+		break;
+	}
+}
+
+/* Return the ir3 block that corresponds to a NIR block, creating it on
+ * first request.
+ *
+ * A new block is entered into ctx->block_ht before its predecessors are
+ * resolved, so the recursive get_block() calls below find it again when
+ * the predecessor chain leads back to this block.
+ */
+static struct ir3_block *
+get_block(struct ir3_context *ctx, const nir_block *nblock)
+{
+	struct hash_entry *cached =
+		_mesa_hash_table_search(ctx->block_ht, nblock);
+
+	if (cached)
+		return cached->data;
+
+	struct ir3_block *block = ir3_block_create(ctx->ir);
+	unsigned npreds = 0;
+
+	block->nblock = nblock;
+	_mesa_hash_table_insert(ctx->block_ht, nblock, block);
+
+	/* mirror the NIR predecessor set onto the ir3 block: */
+	block->predecessors_count = nblock->predecessors->entries;
+	block->predecessors = ralloc_array_size(block,
+		sizeof(block->predecessors[0]), block->predecessors_count);
+
+	set_foreach(nblock->predecessors, pred) {
+		block->predecessors[npreds] = get_block(ctx, pred->key);
+		npreds++;
+	}
+
+	return block;
+}
+
+/* Emit one NIR block: link up its successors, make it the current block,
+ * append it to the ir3 block list and translate its instructions.
+ * Translation stops as soon as ctx->error is set.
+ */
+static void
+emit_block(struct ir3_context *ctx, nir_block *nblock)
+{
+	struct ir3_block *block = get_block(ctx, nblock);
+	unsigned n;
+
+	for (n = 0; n < ARRAY_SIZE(block->successors); n++) {
+		if (!nblock->successors[n])
+			continue;
+		block->successors[n] = get_block(ctx, nblock->successors[n]);
+	}
+
+	ctx->block = block;
+	list_addtail(&block->node, &ctx->ir->block_list);
+
+	/* drop the address register tables, so that the addr register
+	 * gets re-emitted in each block if needed:
+	 */
+	for (n = 0; n < ARRAY_SIZE(ctx->addr_ht); n++) {
+		_mesa_hash_table_destroy(ctx->addr_ht[n], NULL);
+		ctx->addr_ht[n] = NULL;
+	}
+
+	nir_foreach_instr(ninstr, nblock) {
+		/* cur_instr is only set while the instruction is emitted: */
+		ctx->cur_instr = ninstr;
+		emit_instr(ctx, ninstr);
+		ctx->cur_instr = NULL;
+
+		if (ctx->error)
+			return;
+	}
+}
+
+static void emit_cf_list(struct ir3_context *ctx, struct exec_list *list);
+
+/* Emit a NIR if: the predicate is stored as the condition of the block
+ * that is current when the if is reached, then both branches are emitted.
+ */
+static void
+emit_if(struct ir3_context *ctx, nir_if *nif)
+{
+	struct ir3_instruction *cond = get_src(ctx, &nif->condition)[0];
+
+	/* the b2n conversion is created in the block that the condition
+	 * value itself belongs to:
+	 */
+	struct ir3_instruction *as_int = ir3_b2n(cond->block, cond);
+	struct ir3_instruction *pred = get_predicate(ctx, as_int);
+
+	ctx->block->condition = pred;
+
+	emit_cf_list(ctx, &nif->then_list);
+	emit_cf_list(ctx, &nif->else_list);
+}
+
+/* Emit a NIR loop, which amounts to emitting the control flow list that
+ * makes up its body.
+ */
+static void
+emit_loop(struct ir3_context *ctx, nir_loop *nloop)
+{
+	struct exec_list *body = &nloop->body;
+
+	emit_cf_list(ctx, body);
+}
+
+/* Walk a NIR control flow list and emit every node in it.  Blocks, ifs
+ * and loops go to their respective helpers; a nested function node is
+ * reported as an error.
+ */
+static void
+emit_cf_list(struct ir3_context *ctx, struct exec_list *list)
+{
+	foreach_list_typed(nir_cf_node, cf, node, list) {
+		if (cf->type == nir_cf_node_block)
+			emit_block(ctx, nir_cf_node_as_block(cf));
+		else if (cf->type == nir_cf_node_if)
+			emit_if(ctx, nir_cf_node_as_if(cf));
+		else if (cf->type == nir_cf_node_loop)
+			emit_loop(ctx, nir_cf_node_as_loop(cf));
+		else if (cf->type == nir_cf_node_function)
+			compile_error(ctx, "TODO\n");
+	}
+}
+
+/* emit stream-out code.  At this point, the current block is the original
+ * (nir) end block, and nir ensures that all flow control paths terminate
+ * into the end block.  We re-purpose the original end block to generate
+ * the 'if (vtxcnt < maxvtxcnt)' condition, then append the conditional
+ * block holding stream-out write instructions, followed by the new end
+ * block:
+ *
+ *   blockOrigEnd {
+ *      p0.x = (vtxcnt < maxvtxcnt)
+ *      // succs: blockStreamOut, blockNewEnd
+ *   }
+ *   blockStreamOut {
+ *      ... stream-out instructions ...
+ *      // succs: blockNewEnd
+ *   }
+ *   blockNewEnd {
+ *   }
+ *
+ * The stream-output layout is read from ctx->so->shader->stream_output.
+ * On return ctx->block points at the new end block, so whatever the
+ * caller emits next lands there.
+ */
+static void
+emit_stream_out(struct ir3_context *ctx)
+{
+	struct ir3_shader_variant *v = ctx->so;
+	struct ir3 *ir = ctx->ir;
+	struct ir3_stream_output_info *strmout =
+			&ctx->so->shader->stream_output;
+	struct ir3_block *orig_end_block, *stream_out_block, *new_end_block;
+	struct ir3_instruction *vtxcnt, *maxvtxcnt, *cond;
+	struct ir3_instruction *bases[IR3_MAX_SO_BUFFERS];
+
+	/* create vtxcnt input in input block at top of shader,
+	 * so that it is seen as live over the entire duration
+	 * of the shader:
+	 */
+	vtxcnt = create_input(ctx, 0);
+	add_sysval_input(ctx, SYSTEM_VALUE_VERTEX_CNT, vtxcnt);
+
+	maxvtxcnt = create_driver_param(ctx, IR3_DP_VTXCNT_MAX);
+
+	/* at this point, we are at the original 'end' block,
+	 * re-purpose this block to stream-out condition, then
+	 * append stream-out block and new-end block
+	 */
+	orig_end_block = ctx->block;
+
+// TODO these blocks need to update predecessors..
+// maybe w/ store_global intrinsic, we could do this
+// stuff in nir->nir pass
+
+	stream_out_block = ir3_block_create(ir);
+	list_addtail(&stream_out_block->node, &ir->block_list);
+
+	new_end_block = ir3_block_create(ir);
+	list_addtail(&new_end_block->node, &ir->block_list);
+
+	orig_end_block->successors[0] = stream_out_block;
+	orig_end_block->successors[1] = new_end_block;
+	stream_out_block->successors[0] = new_end_block;
+
+	/* setup 'if (vtxcnt < maxvtxcnt)' condition; the compare is still
+	 * emitted into the original end block (ctx->block has not been
+	 * switched yet) and its destination is forced to p0.x:
+	 */
+	cond = ir3_CMPS_S(ctx->block, vtxcnt, 0, maxvtxcnt, 0);
+	cond->regs[0]->num = regid(REG_P0, 0);
+	cond->cat2.condition = IR3_COND_LT;
+
+	/* condition goes on previous block to the conditional,
+	 * since it is used to pick which of the two successor
+	 * paths to take:
+	 */
+	orig_end_block->condition = cond;
+
+	/* switch to stream_out_block to generate the stream-out
+	 * instructions:
+	 */
+	ctx->block = stream_out_block;
+
+	/* Calculate base addresses based on vtxcnt.  Instructions
+	 * generated for bases not used in following loop will be
+	 * stripped out in the backend.
+	 *
+	 * Each base is the tfbo uniform for buffer i plus
+	 * vtxcnt * (stride * 4).
+	 */
+	for (unsigned i = 0; i < IR3_MAX_SO_BUFFERS; i++) {
+		unsigned stride = strmout->stride[i];
+		struct ir3_instruction *base, *off;
+
+		base = create_uniform(ctx, regid(v->constbase.tfbo, i));
+
+		/* 24-bit should be enough: */
+		off = ir3_MUL_U(ctx->block, vtxcnt, 0,
+				create_immed(ctx->block, stride * 4), 0);
+
+		bases[i] = ir3_ADD_S(ctx->block, off, 0, base, 0);
+	}
+
+	/* Generate the per-output store instructions, one STG per written
+	 * component.  The value stored is the shader output currently
+	 * recorded in ctx->ir->outputs[] for that register/component, and
+	 * each store is added to the block's keeps list:
+	 */
+	for (unsigned i = 0; i < strmout->num_outputs; i++) {
+		for (unsigned j = 0; j < strmout->output[i].num_components; j++) {
+			unsigned c = j + strmout->output[i].start_component;
+			struct ir3_instruction *base, *out, *stg;
+
+			base = bases[strmout->output[i].output_buffer];
+			out = ctx->ir->outputs[regid(strmout->output[i].register_index, c)];
+
+			stg = ir3_STG(ctx->block, base, 0, out, 0,
+					create_immed(ctx->block, 1), 0);
+			stg->cat6.type = TYPE_U32;
+			stg->cat6.dst_offset = (strmout->output[i].dst_offset + j) * 4;
+
+			array_insert(ctx->block, ctx->block->keeps, stg);
+		}
+	}
+
+	/* and finally switch to the new_end_block: */
+	ctx->block = new_end_block;
+}
+
+/* Emit the control-flow body of 'impl' followed by its end block, append
+ * the stream-out code when it is needed, and terminate the shader with
+ * the 'end' instruction.
+ */
+static void
+emit_function(struct ir3_context *ctx, nir_function_impl *impl)
+{
+	nir_metadata_require(impl, nir_metadata_block_index);
+
+	emit_cf_list(ctx, &impl->body);
+	emit_block(ctx, impl->end_block);
+
+	/* The block we are left in must still be empty; it is where the
+	 * 'end' instruction goes (possibly after stream-out re-purposes it).
+	 */
+	compile_assert(ctx, list_empty(&ctx->block->instr_list));
+
+	/* Stream-out (aka transform-feedback) is emitted here, after the NIR
+	 * end block rather than before it: NIR guarantees that every block
+	 * flows into end_block and that end_block has no successors, so by
+	 * turning end_block into the head of the stream-out code all exit
+	 * paths pass through the stream-out instructions.  emit_stream_out()
+	 * leaves us in a new empty block for the 'end' instruction.
+	 */
+	if (ctx->compiler->gpu_id < 500) {
+		if ((ctx->so->shader->stream_output.num_outputs > 0) &&
+				!ctx->so->binning_pass) {
+			debug_assert(ctx->so->type == MESA_SHADER_VERTEX);
+			emit_stream_out(ctx);
+		}
+	}
+
+	ir3_END(ctx->block);
+}
+
+/* Return the instruction producing component 'comp' of the fragment
+ * coordinate.  The 4-component frag_coord input is created on first use
+ * and cached in ctx->frag_coord.
+ */
+static struct ir3_instruction *
+create_frag_coord(struct ir3_context *ctx, unsigned comp)
+{
+	struct ir3_block *blk = ctx->block;
+	struct ir3_instruction *val;
+
+	/* created lazily; add_sysval_input() is deferred until after all
+	 * inputs have been created
+	 */
+	if (!ctx->frag_coord)
+		ctx->frag_coord = create_input_compmask(ctx, 0, 0xf);
+
+	split_dest(blk, &val, ctx->frag_coord, comp, 1);
+
+	/* .z, .w (and anything else): seems that we can use these as-is */
+	if (comp > 1)
+		return val;
+
+	/* .x and .y arrive as unsigned values.. subtract (integer) 8,
+	 * divide by 16 (right-shift by 4) and then convert to float:
+	 *
+	 *    sub.s tmp, src, 8
+	 *    shr.b tmp, tmp, 4
+	 *    mov.u32f32 dst, tmp
+	 */
+	val = ir3_SUB_S(blk, val, 0, create_immed(blk, 8), 0);
+	val = ir3_SHR_B(blk, val, 0, create_immed(blk, 4), 0);
+
+	return ir3_COV(blk, val, TYPE_U32, TYPE_F32);
+}
+
+/* Record a NIR input variable in the shader variant's input table and
+ * create the ir3 instruction feeding each of its (four) components.
+ */
+static void
+setup_input(struct ir3_context *ctx, nir_variable *in)
+{
+	struct ir3_shader_variant *so = ctx->so;
+	const unsigned n = in->data.driver_location;
+	const unsigned slot = in->data.location;
+	unsigned ncomp = glsl_get_components(in->type);
+
+	/* let's pretend things other than vec4 don't exist: */
+	ncomp = MAX2(ncomp, 4);
+
+	/* Anything wider than a vec4 here is an unread input (for example
+	 * an unsplit matrix that nothing reads), so silently skip it.
+	 */
+	if (ncomp > 4)
+		return;
+
+	compile_assert(ctx, ncomp == 4);
+
+	so->inputs[n].slot = slot;
+	so->inputs[n].compmask = (1 << ncomp) - 1;
+	so->inputs_count = MAX2(so->inputs_count, n + 1);
+	so->inputs[n].interpolate = in->data.interpolation;
+
+	switch (ctx->so->type) {
+	case MESA_SHADER_FRAGMENT:
+		for (unsigned i = 0; i < ncomp; i++) {
+			struct ir3_instruction *instr;
+			unsigned idx = (n * 4) + i;
+
+			if (slot == VARYING_SLOT_POS) {
+				so->inputs[n].bary = false;
+				so->frag_coord = true;
+				instr = create_frag_coord(ctx, i);
+			} else if (slot == VARYING_SLOT_PNTC) {
+				/* see for example st_nir_fixup_varying_slots().. this is
+				 * maybe a bit mesa/st specific.  But we need things to line
+				 * up for this in fdN_program:
+				 *    unsigned texmask = 1 << (slot - VARYING_SLOT_VAR0);
+				 *    if (emit->sprite_coord_enable & texmask) {
+				 *       ...
+				 *    }
+				 */
+				so->inputs[n].slot = VARYING_SLOT_VAR8;
+				so->inputs[n].bary = true;
+				instr = create_frag_input(ctx, false);
+			} else {
+				/* front/back colors without an explicit interpolation
+				 * mode are special: whether they are flat or smooth
+				 * shaded depends on rast state.
+				 */
+				const bool is_color =
+					(slot == VARYING_SLOT_COL0) ||
+					(slot == VARYING_SLOT_COL1) ||
+					(slot == VARYING_SLOT_BFC0) ||
+					(slot == VARYING_SLOT_BFC1);
+
+				if ((in->data.interpolation == INTERP_MODE_NONE) && is_color)
+					so->inputs[n].rasterflat = true;
+
+				const bool use_ldlv = ctx->compiler->flat_bypass &&
+					((so->inputs[n].interpolate == INTERP_MODE_FLAT) ||
+					 (so->inputs[n].rasterflat && ctx->so->key.rasterflat));
+
+				so->inputs[n].bary = true;
+
+				instr = create_frag_input(ctx, use_ldlv);
+			}
+
+			compile_assert(ctx, idx < ctx->ir->ninputs);
+
+			ctx->ir->inputs[idx] = instr;
+		}
+		break;
+	case MESA_SHADER_VERTEX:
+		for (unsigned i = 0; i < ncomp; i++) {
+			unsigned idx = (n * 4) + i;
+			compile_assert(ctx, idx < ctx->ir->ninputs);
+			ctx->ir->inputs[idx] = create_input(ctx, idx);
+		}
+		break;
+	default:
+		compile_error(ctx, "unknown shader type: %d\n", ctx->so->type);
+		break;
+	}
+
+	/* only bary-fetched inputs and vertex inputs count towards total_in */
+	if (so->inputs[n].bary || (ctx->so->type == MESA_SHADER_VERTEX))
+		so->total_in += ncomp;
+}
+
+/* Record a NIR output variable in the shader variant's output table and
+ * seed each of its (four) components in ir->outputs[] with a 0.0 immediate.
+ */
+static void
+setup_output(struct ir3_context *ctx, nir_variable *out)
+{
+	struct ir3_shader_variant *so = ctx->so;
+	const unsigned n = out->data.driver_location;
+	const unsigned slot = out->data.location;
+	unsigned ncomp = glsl_get_components(out->type);
+	unsigned first_comp = 0;
+
+	/* let's pretend things other than vec4 don't exist: */
+	ncomp = MAX2(ncomp, 4);
+	compile_assert(ctx, ncomp == 4);
+
+	switch (ctx->so->type) {
+	case MESA_SHADER_FRAGMENT:
+		if (slot == FRAG_RESULT_DEPTH) {
+			first_comp = 2;  /* tgsi will write to .z component */
+			so->writes_pos = true;
+		} else if (slot == FRAG_RESULT_COLOR) {
+			so->color0_mrt = 1;
+		} else if (slot < FRAG_RESULT_DATA0) {
+			compile_error(ctx, "unknown FS output name: %s\n",
+					gl_frag_result_name(slot));
+		}
+		break;
+	case MESA_SHADER_VERTEX:
+		if (slot == VARYING_SLOT_POS) {
+			so->writes_pos = true;
+		} else if (slot == VARYING_SLOT_PSIZ) {
+			so->writes_psize = true;
+		} else {
+			bool known;
+
+			switch (slot) {
+			case VARYING_SLOT_COL0:
+			case VARYING_SLOT_COL1:
+			case VARYING_SLOT_BFC0:
+			case VARYING_SLOT_BFC1:
+			case VARYING_SLOT_FOGC:
+			case VARYING_SLOT_CLIP_DIST0:
+			case VARYING_SLOT_CLIP_DIST1:
+			case VARYING_SLOT_CLIP_VERTEX:
+				known = true;
+				break;
+			default:
+				/* generic varyings and the legacy texcoords are fine too */
+				known = (slot >= VARYING_SLOT_VAR0) ||
+					((VARYING_SLOT_TEX0 <= slot) && (slot <= VARYING_SLOT_TEX7));
+				break;
+			}
+
+			if (!known) {
+				compile_error(ctx, "unknown VS output name: %s\n",
+						gl_varying_slot_name(slot));
+			}
+		}
+		break;
+	default:
+		compile_error(ctx, "unknown shader type: %d\n", ctx->so->type);
+		break;
+	}
+
+	compile_assert(ctx, n < ARRAY_SIZE(so->outputs));
+
+	so->outputs[n].slot = slot;
+	so->outputs[n].regid = regid(n, first_comp);
+	so->outputs_count = MAX2(so->outputs_count, n + 1);
+
+	/* give every component a defined (0.0) value to start with */
+	for (unsigned i = 0; i < ncomp; i++) {
+		unsigned idx = (n * 4) + i;
+		compile_assert(ctx, idx < ctx->ir->noutputs);
+		ctx->ir->outputs[idx] = create_immed(ctx->block, fui(0.0));
+	}
+}
+
+/* Return the largest driver_location among the variables in 'vars', or
+ * -1 when the list is empty.
+ */
+static int
+max_drvloc(struct exec_list *vars)
+{
+	int highest = -1;
+
+	nir_foreach_variable(var, vars) {
+		int loc = (int)var->data.driver_location;
+
+		if (loc > highest)
+			highest = loc;
+	}
+
+	return highest;
+}
+
+static const unsigned max_sysvals[] = {  /* extra input slots reserved for sysvals, indexed by shader stage */
+	[MESA_SHADER_FRAGMENT] = 24,  // TODO
+	[MESA_SHADER_VERTEX]  = 16,
+	[MESA_SHADER_COMPUTE] = 16, // TODO how many do we actually need?
+};
+
+/* Top-level emit: allocate the ir3, create the first block together with
+ * the shader inputs, set up outputs and array registers, and finally emit
+ * the body of the entrypoint.
+ */
+static void
+emit_instructions(struct ir3_context *ctx)
+{
+	nir_function_impl *entry = nir_shader_get_entrypoint(ctx->s);
+	const unsigned ndeclared_in = (max_drvloc(&ctx->s->inputs) + 1) * 4;
+	const unsigned noutputs = (max_drvloc(&ctx->s->outputs) + 1) * 4;
+	/* the inputs array additionally needs room for sysvals: */
+	const unsigned nsysvals = max_sysvals[ctx->so->type];
+	struct ir3_instruction *vcoord = NULL;
+
+	ctx->ir = ir3_create(ctx->compiler, ndeclared_in + nsysvals, noutputs);
+
+	/* Inputs are created in the first block: */
+	ctx->block = get_block(ctx, nir_start_block(entry));
+	ctx->in_block = ctx->block;
+	list_addtail(&ctx->block->node, &ctx->ir->block_list);
+
+	/* In a fragment shader the vcoord input register is what the
+	 * bary.f varying fetch instrs use as their base:
+	 */
+	if (ctx->so->type == MESA_SHADER_FRAGMENT) {
+		struct ir3_instruction *xy[2];
+
+		vcoord = create_input_compmask(ctx, 0, 0x3);
+		split_dest(ctx->block, xy, vcoord, 0, 2);
+
+		ctx->frag_vcoord = create_collect(ctx, xy, 2);
+	}
+
+	/* Setup inputs: */
+	nir_foreach_variable(in_var, &ctx->s->inputs) {
+		setup_input(ctx, in_var);
+	}
+
+	/* add_sysval_input() is only done now, after the varyings, since
+	 * sysvals have to be appended behind them:
+	 */
+	if (vcoord) {
+		add_sysval_input_compmask(ctx, SYSTEM_VALUE_VARYING_COORD,
+				0x3, vcoord);
+	}
+
+	if (ctx->frag_coord) {
+		add_sysval_input_compmask(ctx, SYSTEM_VALUE_FRAG_COORD,
+				0xf, ctx->frag_coord);
+	}
+
+	/* Setup outputs: */
+	nir_foreach_variable(out_var, &ctx->s->outputs) {
+		setup_output(ctx, out_var);
+	}
+
+	/* Setup registers (which should only be arrays), first the shader
+	 * level ones:
+	 */
+	nir_foreach_register(shader_reg, &ctx->s->registers) {
+		declare_array(ctx, shader_reg);
+	}
+
+	/* ..then the function level ones.
+	 * NOTE: need to do something more clever when we support >1 fxn
+	 */
+	nir_foreach_register(fxn_reg, &entry->registers) {
+		declare_array(ctx, fxn_reg);
+	}
+
+	/* And emit the body: */
+	ctx->impl = entry;
+	emit_function(ctx, entry);
+}
+
+/* NIR sees the varyings as inputs, but at the IR level a varying is just
+ * a bary.f/ldlv instruction.  The only real inputs are the sysvals, so
+ * trim the IR input array down to the sysval entries.
+ */
+static void
+fixup_frag_inputs(struct ir3_context *ctx)
+{
+	struct ir3_shader_variant *so = ctx->so;
+	unsigned first_sysval;
+
+	/* sysvals should appear at the end of the inputs; find where they
+	 * start so that everything before them can be dropped:
+	 */
+	for (first_sysval = 0; first_sysval < so->inputs_count; first_sysval++) {
+		if (so->inputs[first_sysval].sysval)
+			break;
+	}
+
+	/* at IR level, inputs are always blocks of 4 scalars: */
+	const unsigned skip = first_sysval * 4;
+
+	ctx->ir->inputs += skip;
+	ctx->ir->ninputs -= skip;
+}
+
+/* Assign tex state indexes for the astc/srgb workaround instructions.
+ * This can only be done once the max tex index is known, since the
+ * alternate states are numbered directly after it.
+ */
+static void
+fixup_astc_srgb(struct ir3_context *ctx)
+{
+	struct ir3_shader_variant *so = ctx->so;
+	/* Maps an original tex idx to the alternate (alpha) sampler state
+	 * tex idx assigned to it.  Zero means "not assigned yet"; it can
+	 * never be a valid assignment since there is at least one sampler
+	 * if we get here.
+	 */
+	unsigned alt_tex_state[16] = {0};
+	unsigned next_tex = ctx->max_texture_index + 1;
+	unsigned nremapped = 0;
+
+	so->astc_srgb.base = next_tex;
+
+	for (unsigned i = 0; i < ctx->ir->astc_srgb_count; i++) {
+		struct ir3_instruction *sam = ctx->ir->astc_srgb[i];
+
+		compile_assert(ctx, sam->cat5.tex < ARRAY_SIZE(alt_tex_state));
+
+		unsigned *alt = &alt_tex_state[sam->cat5.tex];
+
+		if (*alt == 0) {
+			/* first use of this texture: hand out a new slot */
+			*alt = next_tex++;
+			so->astc_srgb.orig_idx[nremapped++] = sam->cat5.tex;
+			so->astc_srgb.count++;
+		}
+
+		sam->cat5.tex = *alt;
+	}
+}
+
+/* For the binning pass, compact the output tables so that only the
+ * position and point-size outputs remain.
+ */
+static void
+fixup_binning_pass(struct ir3_context *ctx)
+{
+	struct ir3_shader_variant *so = ctx->so;
+	struct ir3 *ir = ctx->ir;
+	unsigned kept = 0;
+
+	for (unsigned i = 0; i < so->outputs_count; i++) {
+		const unsigned slot = so->outputs[i].slot;
+
+		/* throw away everything but position/psize */
+		if ((slot != VARYING_SLOT_POS) && (slot != VARYING_SLOT_PSIZ))
+			continue;
+
+		if (i != kept) {
+			/* slide the entry (and its 4 scalars) down: */
+			so->outputs[kept] = so->outputs[i];
+			for (unsigned c = 0; c < 4; c++)
+				ir->outputs[(kept * 4) + c] = ir->outputs[(i * 4) + c];
+		}
+
+		kept++;
+	}
+
+	so->outputs_count = kept;
+	ir->noutputs = kept * 4;
+}
+
+/**
+ * Compile the NIR shader of a variant down to ir3 and run the backend
+ * passes (cp, grouping, depth, sched, RA, legalize) over the result.
+ *
+ * On success the IR is left in so->ir and the variant's input/output
+ * tables are updated with the registers that were assigned.
+ *
+ * @param compiler  the compiler instance to compile with
+ * @param so        the shader variant to compile; so->ir must be NULL
+ * @return 0 on success, non-zero on failure.  On failure the IR is
+ *         destroyed and so->ir is left NULL.
+ */
+int
+ir3_compile_shader_nir(struct ir3_compiler *compiler,
+		struct ir3_shader_variant *so)
+{
+	struct ir3_context *ctx;
+	struct ir3 *ir;
+	struct ir3_instruction **inputs;
+	unsigned i, actual_in, inloc;
+	int ret = 0, max_bary;
+
+	assert(!so->ir);
+
+	ctx = compile_init(compiler, so);
+	if (!ctx) {
+		DBG("INIT failed!");
+		ret = -1;
+		goto out;
+	}
+
+	emit_instructions(ctx);
+
+	/* Hand the IR over to the variant *before* checking for errors.
+	 * The failure path below only destroys so->ir, so doing this after
+	 * the check would leak a partially built IR when emit fails.
+	 */
+	ir = so->ir = ctx->ir;
+
+	if (ctx->error) {
+		DBG("EMIT failed!");
+		ret = -1;
+		goto out;
+	}
+
+	/* keep track of the inputs from TGSI perspective.. */
+	inputs = ir->inputs;
+
+	/* but fixup actual inputs for frag shader: */
+	if (so->type == MESA_SHADER_FRAGMENT)
+		fixup_frag_inputs(ctx);
+
+	/* at this point, for binning pass, throw away unneeded outputs: */
+	if (so->binning_pass && (ctx->compiler->gpu_id < 600))
+		fixup_binning_pass(ctx);
+
+	/* if we want half-precision outputs, mark the output registers
+	 * as half:
+	 */
+	if (so->key.half_precision) {
+		for (i = 0; i < ir->noutputs; i++) {
+			struct ir3_instruction *out = ir->outputs[i];
+
+			if (!out)
+				continue;
+
+			/* if frag shader writes z, that needs to be full precision: */
+			if (so->outputs[i/4].slot == FRAG_RESULT_DEPTH)
+				continue;
+
+			out->regs[0]->flags |= IR3_REG_HALF;
+			/* output could be a fanout (ie. texture fetch output)
+			 * in which case we need to propagate the half-reg flag
+			 * up to the definer so that RA sees it:
+			 */
+			if (out->opc == OPC_META_FO) {
+				out = out->regs[1]->instr;
+				out->regs[0]->flags |= IR3_REG_HALF;
+			}
+
+			if (out->opc == OPC_MOV) {
+				out->cat1.dst_type = half_type(out->cat1.dst_type);
+			}
+		}
+	}
+
+	if (ir3_shader_debug & IR3_DBG_OPTMSGS) {
+		printf("BEFORE CP:\n");
+		ir3_print(ir);
+	}
+
+	ir3_cp(ir, so);
+
+	/* at this point, for binning pass, throw away unneeded outputs:
+	 * Note that for a6xx and later, we do this after ir3_cp to ensure
+	 * that the uniform/constant layout for BS and VS matches, so that
+	 * we can re-use same VS_CONST state group.
+	 */
+	if (so->binning_pass && (ctx->compiler->gpu_id >= 600))
+		fixup_binning_pass(ctx);
+
+	/* Insert mov if there's same instruction for each output.
+	 * eg. dEQP-GLES31.functional.shaders.opaque_type_indexing.sampler.const_expression.vertex.sampler2dshadow
+	 */
+	for (int i = ir->noutputs - 1; i >= 0; i--) {
+		if (!ir->outputs[i])
+			continue;
+		for (unsigned j = 0; j < i; j++) {
+			if (ir->outputs[i] == ir->outputs[j]) {
+				ir->outputs[i] =
+					ir3_MOV(ir->outputs[i]->block, ir->outputs[i], TYPE_F32);
+			}
+		}
+	}
+
+	if (ir3_shader_debug & IR3_DBG_OPTMSGS) {
+		printf("BEFORE GROUPING:\n");
+		ir3_print(ir);
+	}
+
+	ir3_sched_add_deps(ir);
+
+	/* Group left/right neighbors, inserting mov's where needed to
+	 * solve conflicts:
+	 */
+	ir3_group(ir);
+
+	if (ir3_shader_debug & IR3_DBG_OPTMSGS) {
+		printf("AFTER GROUPING:\n");
+		ir3_print(ir);
+	}
+
+	ir3_depth(ir);
+
+	if (ir3_shader_debug & IR3_DBG_OPTMSGS) {
+		printf("AFTER DEPTH:\n");
+		ir3_print(ir);
+	}
+
+	ret = ir3_sched(ir);
+	if (ret) {
+		DBG("SCHED failed!");
+		goto out;
+	}
+
+	if (ir3_shader_debug & IR3_DBG_OPTMSGS) {
+		printf("AFTER SCHED:\n");
+		ir3_print(ir);
+	}
+
+	ret = ir3_ra(ir, so->type, so->frag_coord, so->frag_face);
+	if (ret) {
+		DBG("RA failed!");
+		goto out;
+	}
+
+	if (ir3_shader_debug & IR3_DBG_OPTMSGS) {
+		printf("AFTER RA:\n");
+		ir3_print(ir);
+	}
+
+	/* fixup input/outputs: */
+	for (i = 0; i < so->outputs_count; i++) {
+		so->outputs[i].regid = ir->outputs[i*4]->regs[0]->num;
+	}
+
+	/* Note that some or all channels of an input may be unused: */
+	actual_in = 0;
+	inloc = 0;
+	for (i = 0; i < so->inputs_count; i++) {
+		unsigned j, reg = regid(63,0), compmask = 0, maxcomp = 0;
+		so->inputs[i].ncomp = 0;
+		so->inputs[i].inloc = inloc;
+		for (j = 0; j < 4; j++) {
+			struct ir3_instruction *in = inputs[(i*4) + j];
+			if (in && !(in->flags & IR3_INSTR_UNUSED)) {
+				compmask |= (1 << j);
+				reg = in->regs[0]->num - j;
+				actual_in++;
+				so->inputs[i].ncomp++;
+				if ((so->type == MESA_SHADER_FRAGMENT) && so->inputs[i].bary) {
+					/* assign inloc: */
+					assert(in->regs[1]->flags & IR3_REG_IMMED);
+					in->regs[1]->iim_val = inloc + j;
+					maxcomp = j + 1;
+				}
+			}
+		}
+		if ((so->type == MESA_SHADER_FRAGMENT) && compmask && so->inputs[i].bary) {
+			so->varying_in++;
+			so->inputs[i].compmask = (1 << maxcomp) - 1;
+			inloc += maxcomp;
+		} else if (!so->inputs[i].sysval) {
+			so->inputs[i].compmask = compmask;
+		}
+		so->inputs[i].regid = reg;
+	}
+
+	if (ctx->astc_srgb)
+		fixup_astc_srgb(ctx);
+
+	/* We need to do legalize after (for frag shader's) the "bary.f"
+	 * offsets (inloc) have been assigned.
+	 */
+	ir3_legalize(ir, &so->num_samp, &so->has_ssbo, &max_bary);
+
+	if (ir3_shader_debug & IR3_DBG_OPTMSGS) {
+		printf("AFTER LEGALIZE:\n");
+		ir3_print(ir);
+	}
+
+	/* Note that actual_in counts inputs that are not bary.f'd for FS: */
+	if (so->type == MESA_SHADER_VERTEX)
+		so->total_in = actual_in;
+	else
+		so->total_in = max_bary + 1;
+
+out:
+	/* on any failure, tear down whatever IR was handed to the variant */
+	if (ret) {
+		if (so->ir)
+			ir3_destroy(so->ir);
+		so->ir = NULL;
+	}
+	compile_free(ctx);
+
+	return ret;
+}
diff --git a/src/freedreno/ir3/ir3_cp.c b/src/freedreno/ir3/ir3_cp.c
new file mode 100644
index 00000000000..e8e8cc311e3
--- /dev/null
+++ b/src/freedreno/ir3/ir3_cp.c
@@ -0,0 +1,653 @@
+/*
+ * Copyright (C) 2014 Rob Clark <[email protected]>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Authors:
+ *    Rob Clark <[email protected]>
+ */
+
+#include <math.h>
+
+#include "ir3.h"
+#include "ir3_shader.h"
+
+/*
+ * Copy Propagate:
+ */
+
+struct ir3_cp_ctx {
+	struct ir3 *shader; /* passed to ir3_reg_clone() when lower_immed() clones a register */
+	struct ir3_shader_variant *so; /* variant whose immediates table lower_immed() grows and fills */
+	unsigned immediate_idx; /* number of scalar immediate slots handed out so far */
+};
+
+/* Is 'instr' a type preserving mov from an SSA source whose flags allow
+ * it to be collapsed?  When 'allow_flags' is false, a source carrying any
+ * abs/neg/not modifier is rejected as well.
+ */
+static bool is_eligible_mov(struct ir3_instruction *instr, bool allow_flags)
+{
+	const unsigned modifiers = IR3_REG_FABS | IR3_REG_FNEG |
+			IR3_REG_SABS | IR3_REG_SNEG | IR3_REG_BNOT;
+
+	if (!is_same_type_mov(instr))
+		return false;
+
+	struct ir3_register *dst = instr->regs[0];
+	struct ir3_register *src = instr->regs[1];
+	struct ir3_instruction *src_instr = ssa(src);
+
+	/* the mov src has to be SSA (not const/immed): */
+	if (!src_instr)
+		return false;
+
+	/* nothing indirect, on either side: */
+	if ((dst->flags & IR3_REG_RELATIV) || (src->flags & IR3_REG_RELATIV))
+		return false;
+
+	if (src->flags & IR3_REG_ARRAY)
+		return false;
+
+	if (!allow_flags && (src->flags & modifiers))
+		return false;
+
+	/* TODO: remove this hack: */
+	return src_instr->opc != OPC_META_FO;
+}
+
+/* Strip 'flags' down to the register flags that copy-propagation looks
+ * at (at least for now).
+ */
+static unsigned cp_flags(unsigned flags)
+{
+	const unsigned considered =
+			IR3_REG_CONST | IR3_REG_IMMED |
+			IR3_REG_FNEG | IR3_REG_FABS |
+			IR3_REG_SNEG | IR3_REG_SABS |
+			IR3_REG_BNOT | IR3_REG_RELATIV;
+
+	return flags & considered;
+}
+
+/* Would source 'n' of 'instr' still be encodable if its register carried
+ * 'flags'?  Only the flags selected by cp_flags() are looked at; the rules
+ * depend on the instruction category.
+ */
+static bool valid_flags(struct ir3_instruction *instr, unsigned n,
+		unsigned flags)
+{
+	const unsigned modifiers = IR3_REG_FABS | IR3_REG_FNEG |
+			IR3_REG_SABS | IR3_REG_SNEG | IR3_REG_BNOT;
+	unsigned allowed;
+
+	flags = cp_flags(flags);
+
+	/* If destination is indirect, then source cannot be.. at least
+	 * I don't think so..
+	 */
+	if ((instr->regs[0]->flags & IR3_REG_RELATIV) &&
+			(flags & IR3_REG_RELATIV))
+		return false;
+
+	/* TODO it seems to *mostly* work to cp RELATIV, except we get some
+	 * intermittent piglit variable-indexing fails.  Newer blob driver
+	 * doesn't seem to cp these.  Possibly this is hw workaround?  Not
+	 * sure, but until that is understood better, lets just switch off
+	 * cp for indirect src's:
+	 */
+	if (flags & IR3_REG_RELATIV)
+		return false;
+
+	switch (opc_cat(instr->opc)) {
+	case 1:
+		allowed = IR3_REG_IMMED | IR3_REG_CONST | IR3_REG_RELATIV;
+		if (flags & ~allowed)
+			return false;
+		break;
+	case 2:
+		allowed = ir3_cat2_absneg(instr->opc) |
+				IR3_REG_CONST | IR3_REG_RELATIV;
+
+		/* only the integer cat2's take an immediate: */
+		if (ir3_cat2_int(instr->opc))
+			allowed |= IR3_REG_IMMED;
+
+		if (flags & ~allowed)
+			return false;
+
+		if (flags & (IR3_REG_CONST | IR3_REG_IMMED)) {
+			/* index (into regs[]) of the *other* source: */
+			unsigned other = (n ^ 1) + 1;
+
+			/* cannot deal w/ const in both srcs:
+			 * (note that some cat2 actually only have a single src)
+			 */
+			if (other < instr->regs_count) {
+				unsigned other_flags = instr->regs[other]->flags;
+
+				if ((flags & IR3_REG_CONST) && (other_flags & IR3_REG_CONST))
+					return false;
+				if ((flags & IR3_REG_IMMED) && (other_flags & IR3_REG_IMMED))
+					return false;
+			}
+
+			/* cannot be const + ABS|NEG: */
+			if (flags & modifiers)
+				return false;
+		}
+		break;
+	case 3:
+		allowed = ir3_cat3_absneg(instr->opc) |
+				IR3_REG_CONST | IR3_REG_RELATIV;
+
+		if (flags & ~allowed)
+			return false;
+
+		/* cannot deal w/ const/relativ in 2nd src: */
+		if ((flags & (IR3_REG_CONST | IR3_REG_RELATIV)) && (n == 1))
+			return false;
+
+		/* cannot be const + ABS|NEG: */
+		if ((flags & IR3_REG_CONST) && (flags & modifiers))
+			return false;
+		break;
+	case 4:
+		/* seems like blob compiler avoids const as src.. */
+		/* TODO double check if this is still the case on a4xx */
+		if (flags & (IR3_REG_CONST | IR3_REG_IMMED |
+				IR3_REG_SABS | IR3_REG_SNEG))
+			return false;
+		break;
+	case 5:
+		/* no flags allowed */
+		if (flags)
+			return false;
+		break;
+	case 6:
+		allowed = IR3_REG_IMMED;
+		if (flags & ~allowed)
+			return false;
+
+		if (flags & IR3_REG_IMMED) {
+			/* doesn't seem like we can have immediate src for store
+			 * instructions:
+			 *
+			 * TODO this restriction could also apply to load instructions,
+			 * but for load instructions this arg is the address (and not
+			 * really sure any good way to test a hard-coded immed addr src)
+			 */
+			if (is_store(instr) && (n == 1))
+				return false;
+
+			if ((instr->opc == OPC_LDL) && (n != 1))
+				return false;
+
+			if ((instr->opc == OPC_STL) && (n != 2))
+				return false;
+
+			/* for atomics, disallow CP into anything but the SSBO slot
+			 * argument, and only for the ones with the (g) flag set:
+			 */
+			if (is_atomic(instr->opc) &&
+					((n != 0) || !(instr->flags & IR3_INSTR_G)))
+				return false;
+		}
+
+		break;
+	}
+
+	return true;
+}
+
+/* Fold the flags of the mov 'src's source register into '*dstflags', ie.
+ * into the flags of the register that currently consumes the mov.
+ * Negates need special handling so that they cancel each other out.
+ */
+static void combine_flags(unsigned *dstflags, struct ir3_instruction *src)
+{
+	const unsigned passthrough = IR3_REG_SSA | IR3_REG_CONST |
+			IR3_REG_IMMED | IR3_REG_RELATIV | IR3_REG_ARRAY;
+	unsigned srcflags = src->regs[1]->flags;
+
+	/* an (abs) on the consumer makes a (neg) on the mov src moot: */
+	if (*dstflags & IR3_REG_FABS)
+		srcflags &= ~IR3_REG_FNEG;
+	if (*dstflags & IR3_REG_SABS)
+		srcflags &= ~IR3_REG_SNEG;
+
+	/* (abs) accumulates.. */
+	if (srcflags & IR3_REG_FABS)
+		*dstflags |= IR3_REG_FABS;
+	if (srcflags & IR3_REG_SABS)
+		*dstflags |= IR3_REG_SABS;
+
+	/* ..while (neg)/(not) toggle, so that two of them cancel out: */
+	if (srcflags & IR3_REG_FNEG)
+		*dstflags ^= IR3_REG_FNEG;
+	if (srcflags & IR3_REG_SNEG)
+		*dstflags ^= IR3_REG_SNEG;
+	if (srcflags & IR3_REG_BNOT)
+		*dstflags ^= IR3_REG_BNOT;
+
+	/* SSA is taken over from the mov src, the rest is or'd in: */
+	*dstflags &= ~IR3_REG_SSA;
+	*dstflags |= srcflags & passthrough;
+
+	/* If the src of the src is boolean we can drop the (abs), since we
+	 * know the source value is already a positive integer.  This cleans
+	 * up the absnegs that get inserted when converting between nir and
+	 * native boolean (see ir3_b2n/n2b)
+	 */
+	struct ir3_instruction *srcsrc = ssa(src->regs[1]);
+
+	if (srcsrc && is_bool(srcsrc))
+		*dstflags &= ~IR3_REG_SABS;
+}
+
+/* Lower an immediate to a const register: returns a clone of 'reg' that
+ * refers to a slot in the variant's immediates table (re-using a slot
+ * that already holds the same value) with 'new_flags' applied.
+ */
+static struct ir3_register *
+lower_immed(struct ir3_cp_ctx *ctx, struct ir3_register *reg, unsigned new_flags)
+{
+	struct ir3_shader_variant *so = ctx->so;
+	unsigned slot;
+
+	reg = ir3_reg_clone(ctx->shader, reg);
+
+	/* in some cases, there are restrictions on (abs)/(neg) plus const..
+	 * so just evaluate those and clear the flags:
+	 */
+	if (new_flags & IR3_REG_SABS) {
+		reg->iim_val = abs(reg->iim_val);
+		new_flags &= ~IR3_REG_SABS;
+	}
+
+	if (new_flags & IR3_REG_FABS) {
+		reg->fim_val = fabs(reg->fim_val);
+		new_flags &= ~IR3_REG_FABS;
+	}
+
+	if (new_flags & IR3_REG_SNEG) {
+		reg->iim_val = -reg->iim_val;
+		new_flags &= ~IR3_REG_SNEG;
+	}
+
+	if (new_flags & IR3_REG_FNEG) {
+		reg->fim_val = -reg->fim_val;
+		new_flags &= ~IR3_REG_FNEG;
+	}
+
+	/* Grow the table by 4 more elements whenever it is full.
+	 * NOTE(review): the realloc() result is not checked, so an
+	 * allocation failure loses the old table and the stores below
+	 * would go through a NULL pointer.
+	 */
+	if (ctx->immediate_idx == so->immediates_size * 4) {
+		so->immediates_size += 4;
+		so->immediates = realloc(so->immediates,
+			so->immediates_size * sizeof(so->immediates[0]));
+	}
+
+	/* look for a slot that already holds this value: */
+	for (slot = 0; slot < ctx->immediate_idx; slot++) {
+		if (so->immediates[slot / 4].val[slot % 4] == reg->uim_val)
+			break;
+	}
+
+	if (slot == ctx->immediate_idx) {
+		/* none found, need to generate a new immediate: */
+		so->immediates[slot / 4].val[slot % 4] = reg->uim_val;
+		so->immediates_count = (slot / 4) + 1;
+		ctx->immediate_idx++;
+	}
+
+	/* the register now reads from the const file instead: */
+	reg->flags = (new_flags & ~IR3_REG_IMMED) | IR3_REG_CONST;
+	reg->num = slot + (4 * so->constbase.immediate);
+
+	return reg;
+}
+
+/* Drop one use of 'instr'.  Once the last use is gone its barrier class
+ * and conflict masks are cleared.
+ */
+static void
+unuse(struct ir3_instruction *instr)
+{
+	debug_assert(instr->use_count > 0);
+
+	instr->use_count--;
+	if (instr->use_count != 0)
+		return;
+
+	struct ir3_block *block = instr->block;
+
+	instr->barrier_class = 0;
+	instr->barrier_conflict = 0;
+
+	/* we don't want to remove anything in keeps (which could
+	 * be things like array store's)
+	 */
+	for (unsigned i = 0; i < block->keeps_count; i++) {
+		debug_assert(block->keeps[i] != instr);
+	}
+}
+
+/**
+ * Handle cp for a given src register.  This additionally handles
+ * the cases of collapsing immediate/const (which replace the src
+ * register with a non-ssa src) or collapsing mov's from relative
+ * src (which needs to also fixup the address src reference by the
+ * instruction).
+ *
+ * @ctx:   copy-propagate state, needed when an immediate is lowered
+ *         to a const
+ * @instr: the consuming instruction
+ * @reg:   the src register of @instr that is being examined
+ * @n:     index of the src; replacement registers are stored to
+ *         instr->regs[n + 1]
+ */
+static void
+reg_cp(struct ir3_cp_ctx *ctx, struct ir3_instruction *instr,
+		struct ir3_register *reg, unsigned n)
+{
+	struct ir3_instruction *src = ssa(reg);
+
+	if (is_eligible_mov(src, true)) {
+		/* simple case, no immed/const/relativ, only mov's w/ ssa src: */
+		struct ir3_register *src_reg = src->regs[1];
+		unsigned new_flags = reg->flags;
+
+		combine_flags(&new_flags, src);
+
+		if (valid_flags(instr, n, new_flags)) {
+			if (new_flags & IR3_REG_ARRAY) {
+				debug_assert(!(reg->flags & IR3_REG_ARRAY));
+				reg->array = src_reg->array;
+			}
+			reg->flags = new_flags;
+			reg->instr = ssa(src_reg);
+
+			instr->barrier_class |= src->barrier_class;
+			instr->barrier_conflict |= src->barrier_conflict;
+
+			unuse(src);
+			reg->instr->use_count++;
+		}
+
+	} else if (is_same_type_mov(src) &&
+			/* cannot collapse const/immed/etc into meta instrs: */
+			!is_meta(instr)) {
+		/* immed/const/etc cases, which require some special handling: */
+		struct ir3_register *src_reg = src->regs[1];
+		unsigned new_flags = reg->flags;
+
+		combine_flags(&new_flags, src);
+
+		if (!valid_flags(instr, n, new_flags)) {
+			/* See if lowering an immediate to const would help. */
+			if (valid_flags(instr, n, (new_flags & ~IR3_REG_IMMED) | IR3_REG_CONST)) {
+				debug_assert(new_flags & IR3_REG_IMMED);
+				instr->regs[n + 1] = lower_immed(ctx, src_reg, new_flags);
+				return;
+			}
+
+			/* special case for "normal" mad instructions, we can
+			 * try swapping the first two args if that fits better.
+			 *
+			 * the "plain" MAD's (ie. the ones that don't shift first
+			 * src prior to multiply) can swap their first two srcs if
+			 * src[0] is !CONST and src[1] is CONST:
+			 *
+			 * After the swap, n is reset to 0 so that the code below
+			 * operates on the src in its new position.
+			 */
+			if ((n == 1) && is_mad(instr->opc) &&
+					!(instr->regs[0 + 1]->flags & (IR3_REG_CONST | IR3_REG_RELATIV)) &&
+					valid_flags(instr, 0, new_flags & ~IR3_REG_IMMED)) {
+				/* swap src[0] and src[1]: */
+				struct ir3_register *tmp;
+				tmp = instr->regs[0 + 1];
+				instr->regs[0 + 1] = instr->regs[1 + 1];
+				instr->regs[1 + 1] = tmp;
+
+				n = 0;
+			} else {
+				return;
+			}
+		}
+
+		/* Here we handle the special case of mov from
+		 * CONST and/or RELATIV.  These need to be handled
+		 * specially, because in the case of move from CONST
+		 * there is no src ir3_instruction so we need to
+		 * replace the ir3_register.  And in the case of
+		 * RELATIV we need to handle the address register
+		 * dependency.
+		 */
+		if (src_reg->flags & IR3_REG_CONST) {
+			/* an instruction cannot reference two different
+			 * address registers:
+			 */
+			if ((src_reg->flags & IR3_REG_RELATIV) &&
+					conflicts(instr->address, reg->instr->address))
+				return;
+
+			/* This seems to be a hw bug, or something where the timings
+			 * just somehow don't work out.  This restriction may only
+			 * apply if the first src is also CONST.
+			 */
+			if ((opc_cat(instr->opc) == 3) && (n == 2) &&
+					(src_reg->flags & IR3_REG_RELATIV) &&
+					(src_reg->array.offset == 0))
+				return;
+
+			src_reg = ir3_reg_clone(instr->block->shader, src_reg);
+			src_reg->flags = new_flags;
+			instr->regs[n+1] = src_reg;
+
+			if (src_reg->flags & IR3_REG_RELATIV)
+				ir3_instr_set_address(instr, reg->instr->address);
+
+			return;
+		}
+
+		if ((src_reg->flags & IR3_REG_RELATIV) &&
+				!conflicts(instr->address, reg->instr->address)) {
+			src_reg = ir3_reg_clone(instr->block->shader, src_reg);
+			src_reg->flags = new_flags;
+			instr->regs[n+1] = src_reg;
+			ir3_instr_set_address(instr, reg->instr->address);
+
+			return;
+		}
+
+		/* NOTE: seems we can only do immed integers, so don't
+		 * need to care about float.  But we do need to handle
+		 * abs/neg *before* checking that the immediate requires
+		 * few enough bits to encode:
+		 *
+		 * TODO: do we need to do something to avoid accidentally
+		 * catching a float immed?
+		 */
+		if (src_reg->flags & IR3_REG_IMMED) {
+			int32_t iim_val = src_reg->iim_val;
+
+			debug_assert((opc_cat(instr->opc) == 1) ||
+					(opc_cat(instr->opc) == 6) ||
+					ir3_cat2_int(instr->opc) ||
+					(is_mad(instr->opc) && (n == 0)));
+
+			if (new_flags & IR3_REG_SABS)
+				iim_val = abs(iim_val);
+
+			if (new_flags & IR3_REG_SNEG)
+				iim_val = -iim_val;
+
+			if (new_flags & IR3_REG_BNOT)
+				iim_val = ~iim_val;
+
+			/* other than category 1 (mov) we can only encode up to 10 bits;
+			 * the check below accepts the value if either it or its
+			 * negation has no bits set above the low 10:
+			 */
+			if ((instr->opc == OPC_MOV) ||
+					!((iim_val & ~0x3ff) && (-iim_val & ~0x3ff))) {
+				new_flags &= ~(IR3_REG_SABS | IR3_REG_SNEG | IR3_REG_BNOT);
+				src_reg = ir3_reg_clone(instr->block->shader, src_reg);
+				src_reg->flags = new_flags;
+				src_reg->iim_val = iim_val;
+				instr->regs[n+1] = src_reg;
+			} else if (valid_flags(instr, n, (new_flags & ~IR3_REG_IMMED) | IR3_REG_CONST)) {
+				/* See if lowering an immediate to const would help. */
+				instr->regs[n+1] = lower_immed(ctx, src_reg, new_flags);
+			}
+
+			return;
+		}
+	}
+}
+
+/* Bypass a mov feeding something that is not a normal "consuming"
+ * instruction (a shader output, block condition, keep or address).  No
+ * consumer exists to absorb src flags, so the mov is only dropped when
+ * is_eligible_mov(instr, false) accepts it and it does not read an array.
+ */
+static struct ir3_instruction *
+eliminate_output_mov(struct ir3_instruction *instr)
+{
+	struct ir3_instruction *producer;
+
+	if (!is_eligible_mov(instr, false))
+		return instr;
+	if (instr->regs[1]->flags & IR3_REG_ARRAY)
+		return instr;
+	producer = ssa(instr->regs[1]);
+	debug_assert(producer);
+	return producer;
+}
+
+/**
+ * Copy-propagate into 'instr': recurse into each SSA src first, then find
+ * src's which are mov's that can be collapsed (mov dst replaced by mov src)
+ */
+static void
+instr_cp(struct ir3_cp_ctx *ctx, struct ir3_instruction *instr)
+{
+	struct ir3_register *reg;
+
+	if (instr->regs_count == 0)
+		return;
+
+	if (ir3_instr_check_mark(instr))   /* NOTE(review): presumably "already visited"; marks are cleared in ir3_cp() */
+		return;
+
+	/* walk down the graph from each src: */
+	foreach_src_n(reg, n, instr) {
+		struct ir3_instruction *src = ssa(reg);
+
+		if (!src)   /* no producing instr for this src */
+			continue;
+
+		instr_cp(ctx, src);   /* srcs are processed before 'instr' itself */
+
+		/* TODO non-indirect access we could figure out which register
+		 * we actually want and allow cp..
+		 */
+		if (reg->flags & IR3_REG_ARRAY)
+			continue;
+
+		/* Don't CP absneg into meta instructions, that won't end well: */
+		if (is_meta(instr) && (src->opc != OPC_MOV))
+			continue;
+
+		reg_cp(ctx, instr, reg, n);
+	}
+
+	if (instr->regs[0]->flags & IR3_REG_ARRAY) {   /* an array dst can link to an instr too */
+		struct ir3_instruction *src = ssa(instr->regs[0]);
+		if (src)
+			instr_cp(ctx, src);
+	}
+
+	if (instr->address) {   /* walk the address producer, then bypass an eligible mov in front of it */
+		instr_cp(ctx, instr->address);
+		ir3_instr_set_address(instr, eliminate_output_mov(instr->address));
+	}
+
+	/* we can end up with extra cmps.s from frontend, which uses a
+	 *
+	 *    cmps.s p0.x, cond, 0
+	 *
+	 * as a way to mov into the predicate register.  But frequently 'cond'
+	 * is itself a cmps.s/cmps.f/cmps.u.  So detect this special case and
+	 * just re-write the instruction writing predicate register to get rid
+	 * of the double cmps.
+	 */
+	if ((instr->opc == OPC_CMPS_S) &&
+			(instr->regs[0]->num == regid(REG_P0, 0)) &&
+			ssa(instr->regs[1]) &&
+			(instr->regs[2]->flags & IR3_REG_IMMED) &&
+			(instr->regs[2]->iim_val == 0)) {
+		struct ir3_instruction *cond = ssa(instr->regs[1]);
+		switch (cond->opc) {
+		case OPC_CMPS_S:
+		case OPC_CMPS_F:
+		case OPC_CMPS_U:
+			instr->opc   = cond->opc;   /* 'instr' takes over cond's compare.. */
+			instr->flags = cond->flags;
+			instr->cat2  = cond->cat2;
+			instr->address = cond->address;
+			instr->regs[1] = cond->regs[1];   /* ..including both of its srcs */
+			instr->regs[2] = cond->regs[2];
+			instr->barrier_class |= cond->barrier_class;
+			instr->barrier_conflict |= cond->barrier_conflict;
+			unuse(cond);   /* NOTE(review): unuse() is defined outside this chunk; presumably drops one use of cond */
+			break;
+		default:
+			break;
+		}
+	}
+}
+
+void
+ir3_cp(struct ir3 *ir, struct ir3_shader_variant *so)
+{
+	struct ir3_cp_ctx ctx = { .shader = ir, .so = so };
+	unsigned idx;
+
+	/* Pre-pass: count the consumers of every instruction.  This is a
+	 * bit annoying, and probably would not be needed if a reverse
+	 * link from producing instruction to consumer were tracked.
+	 * But the pass has to know when the last consumer of a mov has
+	 * been eliminated, hence the per-instruction use_count filled in
+	 * here.
+	 */
+	list_for_each_entry (struct ir3_block, blk, &ir->block_list, node) {
+		list_for_each_entry (struct ir3_instruction, user, &blk->instr_list, node) {
+			struct ir3_instruction *producer;
+
+			/* false-dep's are not accounted for, so the CP pass
+			 * should always run before false-dep's are inserted:
+			 */
+			debug_assert(user->deps_count == 0);
+
+			foreach_ssa_src(producer, user)
+				producer->use_count++;
+		}
+	}
+
+	ir3_clear_mark(ir);
+
+	/* walk from the shader outputs, bypassing an eligible final mov: */
+	for (idx = 0; idx < ir->noutputs; idx++) {
+		if (!ir->outputs[idx])
+			continue;
+		instr_cp(&ctx, ir->outputs[idx]);
+		ir->outputs[idx] = eliminate_output_mov(ir->outputs[idx]);
+	}
+
+	/* same for every block's if-condition and its keeps: */
+	list_for_each_entry (struct ir3_block, blk, &ir->block_list, node) {
+		if (blk->condition) {
+			instr_cp(&ctx, blk->condition);
+			blk->condition = eliminate_output_mov(blk->condition);
+		}
+
+		for (idx = 0; idx < blk->keeps_count; idx++) {
+			instr_cp(&ctx, blk->keeps[idx]);
+			blk->keeps[idx] = eliminate_output_mov(blk->keeps[idx]);
+		}
+	}
+}
diff --git a/src/freedreno/ir3/ir3_depth.c b/src/freedreno/ir3/ir3_depth.c
new file mode 100644
index 00000000000..73bf5e19926
--- /dev/null
+++ b/src/freedreno/ir3/ir3_depth.c
@@ -0,0 +1,245 @@
+/*
+ * Copyright (C) 2014 Rob Clark <[email protected]>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Authors:
+ *    Rob Clark <[email protected]>
+ */
+
+#include "util/u_math.h"
+
+#include "ir3.h"
+
+/*
+ * Instruction Depth:
+ *
+ * Calculates weighted instruction depth, ie. the sum of # of needed
+ * instructions plus delay slots back to original input (ie INPUT or
+ * CONST).  That is to say, an instruction's depth is:
+ *
+ *   depth(instr) {
+ *     d = 0;
+ *     // for each src register:
+ *     foreach (src in instr->regs[1..n])
+ *       d = max(d, delayslots(src->instr, n) + depth(src->instr));
+ *     return d + 1;
+ *   }
+ *
+ * After an instruction's depth is calculated, it is inserted into the
+ * block's depth-sorted list, which is used by the scheduling pass.
+ */
+
+/* False dependencies (this can just be something like a barrier or an
+ * SSBO store) are generally not counted.  The exception is an array
+ * dependency: the assigner is an array write and the consumer has a src
+ * reading the same array (same array.id).  Returns true to ignore.
+ */
+static bool
+ignore_dep(struct ir3_instruction *assigner,
+		struct ir3_instruction *consumer, unsigned n)
+{
+	struct ir3_register *written, *rd;
+
+	if (!__is_false_dep(consumer, n))
+		return false;
+	if (!(assigner->barrier_class & IR3_BARRIER_ARRAY_W))
+		return true;
+
+	written = assigner->regs[0];
+	debug_assert(written->flags & IR3_REG_ARRAY);
+
+	foreach_src(rd, consumer) {
+		if (!(rd->flags & IR3_REG_ARRAY))
+			continue;
+		if (rd->array.id == written->array.id)
+			return false;
+	}
+
+	return true;
+}
+
+/* Required # of delay slots between 'assigner' writing a value and
+ * 'consumer' reading it through src 'n'.
+ *
+ * Worst case is cat1-3 (alu) -> cat4/5 needing 6 cycles, normal
+ * alu -> alu needs 3 cycles; cat4 -> alu and texture fetch are
+ * handled with sync bits and so need no delay slots here.
+ */
+int ir3_delayslots(struct ir3_instruction *assigner,
+		struct ir3_instruction *consumer, unsigned n)
+{
+	bool is_cat3_mad;
+
+	/* ignorable (false) deps and meta assigners never add delay: */
+	if (ignore_dep(assigner, consumer, n) || is_meta(assigner))
+		return 0;
+
+	if (writes_addr(assigner))
+		return 6;
+
+	/* results of sfu/tex/mem are handled via sync flags: */
+	if (is_sfu(assigner) || is_tex(assigner) || is_mem(assigner))
+		return 0;
+
+	/* from here on the assigner must be alu: */
+	if (is_flow(consumer) || is_sfu(consumer))
+		return 6;
+	if (is_tex(consumer) || is_mem(consumer))
+		return 6;
+
+	/* special case, 3rd src to cat3 not required on first cycle: */
+	is_cat3_mad = is_mad(consumer->opc) || is_madsh(consumer->opc);
+	if (is_cat3_mad && (n == 3))
+		return 1;
+
+	return 3;
+}
+
+void
+ir3_insert_by_depth(struct ir3_instruction *instr, struct list_head *list)
+{
+	/* unlink from wherever the instruction currently sits: */
+	list_delinit(&instr->node);
+
+	/* re-link next to the first entry whose depth exceeds ours: */
+	list_for_each_entry (struct ir3_instruction, cur, list, node) {
+		if (cur->depth <= instr->depth)
+			continue;
+		list_add(&instr->node, &cur->node);
+		return;
+	}
+	/* nothing in the list is deeper, so append at the tail: */
+	list_addtail(&instr->node, list);
+}
+
+static void
+ir3_instr_depth(struct ir3_instruction *instr, unsigned boost, bool falsedep)
+{
+	struct ir3_instruction *child;
+
+	/* only a non-false dep marks the instr as used; apart from
+	 * that falsedep's are processed normally:
+	 */
+	if (!falsedep)
+		instr->flags &= ~IR3_INSTR_UNUSED;
+
+	if (ir3_instr_check_mark(instr))
+		return;
+
+	instr->depth = 0;
+
+	foreach_ssa_src_n(child, idx, instr) {
+		/* visit child to compute its depth: */
+		ir3_instr_depth(child, boost, __is_false_dep(instr, idx));
+
+		/* for array writes (idx 0), no need to delay on previous write: */
+		if (idx != 0) {
+			unsigned cost = ir3_delayslots(child, instr, idx) +
+					child->depth + boost;
+			instr->depth = MAX2(instr->depth, cost);
+		}
+	}
+
+	/* a meta instruction adds nothing to the depth itself: */
+	if (!is_meta(instr))
+		instr->depth++;
+
+	/* (re)insert into the block's depth sorted list: */
+	ir3_insert_by_depth(instr, &instr->block->instr_list);
+}
+
+/* Unlink every instruction still flagged unused; true if any was removed. */
+static bool
+remove_unused_by_block(struct ir3_block *block)
+{
+	bool removed = false;
+	list_for_each_entry_safe (struct ir3_instruction, cur, &block->instr_list, node) {
+		/* OPC_END is never removed, whatever its flags say: */
+		if ((cur->opc != OPC_END) && (cur->flags & IR3_INSTR_UNUSED)) {
+			list_delinit(&cur->node);
+			removed = true;
+		}
+	}
+	return removed;
+}
+
+static bool
+compute_depth_and_remove_unused(struct ir3 *ir)
+{
+	bool removed_any = false;
+	unsigned idx;
+
+	ir3_clear_mark(ir);
+
+	/* flag everything as unused up front; ir3_instr_depth() clears the
+	 * flag again on each instruction it reaches via a non-false dep:
+	 */
+	list_for_each_entry (struct ir3_block, blk, &ir->block_list, node) {
+		list_for_each_entry (struct ir3_instruction, cur, &blk->instr_list, node)
+			cur->flags |= IR3_INSTR_UNUSED;
+	}
+
+	for (idx = 0; idx < ir->noutputs; idx++) {
+		if (ir->outputs[idx])
+			ir3_instr_depth(ir->outputs[idx], 0, false);
+	}
+
+	list_for_each_entry (struct ir3_block, blk, &ir->block_list, node) {
+		for (idx = 0; idx < blk->keeps_count; idx++)
+			ir3_instr_depth(blk->keeps[idx], 0, false);
+
+		/* the if-condition counts too, walked with a boost of 6: */
+		if (blk->condition)
+			ir3_instr_depth(blk->condition, 6, false);
+	}
+
+	/* unlink whatever is still flagged unused: */
+	list_for_each_entry (struct ir3_block, blk, &ir->block_list, node) {
+		removed_any = remove_unused_by_block(blk) || removed_any;
+	}
+
+	/* note that we can end up with unused indirects, but we should
+	 * not end up with unused predicates.
+	 */
+	for (idx = 0; idx < ir->indirects_count; idx++) {
+		struct ir3_instruction *indirect = ir->indirects[idx];
+		if (indirect && (indirect->flags & IR3_INSTR_UNUSED))
+			ir->indirects[idx] = NULL;
+	}
+
+	/* likewise drop the references to inputs that ended up unused: */
+	for (idx = 0; idx < ir->ninputs; idx++) {
+		struct ir3_instruction *input = ir->inputs[idx];
+		if (input && (input->flags & IR3_INSTR_UNUSED))
+			ir->inputs[idx] = NULL;
+	}
+
+	return removed_any;
+}
+
+void
+ir3_depth(struct ir3 *ir)
+{
+	/* iterate to a fixed point: rerun the pass for as long as it
+	 * reports having removed something */
+	while (compute_depth_and_remove_unused(ir))
+		;
+}
diff --git a/src/freedreno/ir3/ir3_group.c b/src/freedreno/ir3/ir3_group.c
new file mode 100644
index 00000000000..570055973e8
--- /dev/null
+++ b/src/freedreno/ir3/ir3_group.c
@@ -0,0 +1,274 @@
+/*
+ * Copyright (C) 2014 Rob Clark <[email protected]>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Authors:
+ *    Rob Clark <[email protected]>
+ */
+
+#include "ir3.h"
+
+/*
+ * Find/group instruction neighbors:
+ */
+
+/* group_n() has to work both on inputs/outputs (a simple instr[] array)
+ * and on fanin nodes (an extra indirection via reg->instr), so access to
+ * the idx'th element of the opaque 'arr' goes through these callbacks.
+ */
+struct group_ops {
+	struct ir3_instruction *(*get)(void *arr, int idx);   /* fetch element idx (may be NULL) */
+	void (*insert_mov)(void *arr, int idx, struct ir3_instruction *instr);   /* called by group_n() when element idx ('instr') conflicts */
+};
+
+static struct ir3_instruction *arr_get(void *arr, int idx)
+{
+	return *((struct ir3_instruction **)arr + idx);   /* arr is a plain instr[] array */
+}
+static void arr_insert_mov_out(void *arr, int idx, struct ir3_instruction *instr)
+{
+	struct ir3_instruction **outs = arr;
+	outs[idx] = ir3_MOV(instr->block, instr, TYPE_F32);   /* slot now holds the mov */
+}
+static void arr_insert_mov_in(void *arr, int idx, struct ir3_instruction *instr)
+{
+	/* A mov cannot go in front of a meta:in, and the downstream instr
+	 * already points at 'instr'.  So cheat a bit: 'instr' is morphed
+	 * into the mov and a brand new meta:in is created to feed it.
+	 */
+	struct ir3_instruction **inputs = arr;
+	struct ir3_instruction *meta_in;
+	struct ir3_register *mov_src;
+
+	debug_assert(instr->regs_count == 1);
+	meta_in = ir3_instr_create(instr->block, OPC_META_INPUT);
+	meta_in->inout.block = instr->block;
+	ir3_reg_create(meta_in, instr->regs[0]->num, 0);
+
+	/* add a src reg reading the new meta:in, then retag as mov: */
+	mov_src = ir3_reg_create(instr, 0, IR3_REG_SSA);
+	mov_src->instr = meta_in;
+	instr->opc = OPC_MOV;
+	instr->cat1.src_type = TYPE_F32;
+	instr->cat1.dst_type = TYPE_F32;
+	inputs[idx] = meta_in;
+}
+static struct group_ops arr_ops_out = { .get = arr_get, .insert_mov = arr_insert_mov_out };
+static struct group_ops arr_ops_in = { .get = arr_get, .insert_mov = arr_insert_mov_in };
+
+static struct ir3_instruction *instr_get(void *arr, int idx)
+{
+	return ssa(((struct ir3_instruction *)arr)->regs[1 + idx]);   /* element idx is src idx, ie. regs[idx+1] */
+}
+static void
+instr_insert_mov(void *arr, int idx, struct ir3_instruction *instr)
+{
+	struct ir3_register *src = ((struct ir3_instruction *)arr)->regs[idx + 1];
+	src->instr = ir3_MOV(instr->block, instr, TYPE_F32);   /* the src now reads the new mov */
+}
+static struct group_ops instr_ops = { .get = instr_get, .insert_mov = instr_insert_mov };
+
+/* true if cur == instr, or cur is in instr's neighbor-list at an index != pos: */
+static bool
+in_neighbor_list(struct ir3_instruction *instr, struct ir3_instruction *cur, int pos)
+{
+	struct ir3_instruction *walk;
+	int slot = 0;
+
+	if (!instr)
+		return false;
+	if (instr == cur)
+		return true;
+
+	/* scan rightwards from the first neighbor, skipping slot 'pos': */
+	for (walk = ir3_neighbor_first(instr); walk; walk = walk->cp.right, slot++)
+		if ((walk == cur) && (slot != pos))
+			return true;
+	return false;
+}
+
+static void
+group_n(struct group_ops *ops, void *arr, unsigned n)
+{
+	unsigned i, j;
+
+	/* Make the n elements of 'arr' (read through 'ops') neighbors.
+	 * First pass: figure out what has conflicts and needs a mov
+	 * inserted.  Do this up front, before starting to setup
+	 * left/right neighbor pointers.  Doing it in a single pass could
+	 * leave us unable to even setup the mov's right neighbor ptr if
+	 * the next instr also needs a mov.
+	 */
+restart:
+	for (i = 0; i < n; i++) {
+		struct ir3_instruction *instr = ops->get(arr, i);
+		if (instr) {
+			struct ir3_instruction *left = (i > 0) ? ops->get(arr, i - 1) : NULL;
+			struct ir3_instruction *right = (i < (n-1)) ? ops->get(arr, i + 1) : NULL;
+			bool conflict;
+
+			/* check for left/right neighbor conflicts: */
+			conflict = conflicts(instr->cp.left, left) ||
+				conflicts(instr->cp.right, right);
+
+			/* Mixing array elements and higher register classes
+			 * (ie. groups) doesn't really work out in RA.  See:
+			 *
+			 * https://trello.com/c/DqeDkeVf/156-bug-with-stk-70frag
+			 */
+			if (instr->regs[0]->flags & IR3_REG_ARRAY)
+				conflict = true;
+
+			/* we also can't have an instr twice in the group: */
+			for (j = i + 1; (j < n) && !conflict; j++)
+				if (in_neighbor_list(ops->get(arr, j), instr, i))
+					conflict = true;
+
+			if (conflict) {
+				ops->insert_mov(arr, i, instr);
+				/* inserting the mov may have caused a conflict
+				 * against the previous, so rescan from element 0:
+				 */
+				goto restart;
+			}
+		}
+	}
+
+	/* second pass, now that we've inserted mov's, fixup left/right
+	 * neighbors.  This is guaranteed to succeed, since by definition
+	 * the newly inserted mov's cannot conflict with anything.
+	 */
+	for (i = 0; i < n; i++) {
+		struct ir3_instruction *instr = ops->get(arr, i);
+		if (instr) {
+			struct ir3_instruction *left = (i > 0) ? ops->get(arr, i - 1) : NULL;
+			struct ir3_instruction *right = (i < (n-1)) ? ops->get(arr, i + 1) : NULL;
+
+			debug_assert(!conflicts(instr->cp.left, left));
+			if (left) {
+				instr->cp.left_cnt++;   /* counted on every link made, not only the first */
+				instr->cp.left = left;
+			}
+
+			debug_assert(!conflicts(instr->cp.right, right));
+			if (right) {
+				instr->cp.right_cnt++;
+				instr->cp.right = right;
+			}
+		}
+	}
+}
+
+static void
+instr_find_neighbors(struct ir3_instruction *instr)
+{
+	struct ir3_instruction *child;
+
+	if (ir3_instr_check_mark(instr))
+		return;
+	/* group the srcs of a fanin (everything after regs[0]): */
+	if (OPC_META_FI == instr->opc)
+		group_n(&instr_ops, instr, instr->regs_count - 1);
+
+	foreach_ssa_src(child, instr)
+		instr_find_neighbors(child);
+}
+
+/* a bit of sadness.. from the PoV of register assignment inputs may not
+ * have "holes", they still need to be grouped together.  So holes below
+ * the highest used slot get a dummy/padding nop for the duration of the
+ * grouping, and are reset to NULL again before anyone notices.
+ */
+static void
+pad_and_group_input(struct ir3_instruction **input, unsigned n)
+{
+	struct ir3_block *blk = NULL;
+	int padded = 0;    /* bit i set: input[i] holds a padding nop */
+	int i;
+
+	for (i = n - 1; i >= 0; i--) {
+		if (input[i]) {
+			blk = input[i]->block;
+			continue;
+		}
+		if (!blk)
+			continue;
+		input[i] = ir3_NOP(blk);
+		ir3_reg_create(input[i], 0, IR3_REG_SSA);    /* dummy dst */
+		padded |= (1 << i);
+	}
+
+	group_n(&arr_ops_in, input, n);
+
+	for (i = 0; i < n; i++)
+		if ((padded >> i) & 1)
+			input[i] = NULL;
+}
+
+static void
+find_neighbors(struct ir3 *ir)
+{
+	unsigned idx;
+
+	/* The shader inputs/outputs themselves have to be contiguous too.
+	 *
+	 * NOTE: inputs are grouped first, because a mov is only ever
+	 * inserted *before* the conflicted instr (which would go badly
+	 * for an input).  Handling inputs first means they should never
+	 * see a conflict; any conflict gets resolved on the output
+	 * side instead, for stuff like:
+	 *
+	 *     MOV OUT[n], IN[m].wzyx
+	 *
+	 * NOTE: inputs/outputs are assumed to be grouped in vec4, so
+	 * both arrays are walked four slots at a time.  This logic won't
+	 * quite cut it if smaller things are not aligned on vec4 boundaries.
+	 */
+	for (idx = 0; idx < ir->ninputs; idx += 4)
+		pad_and_group_input(ir->inputs + idx, 4);
+	for (idx = 0; idx < ir->noutputs; idx += 4)
+		group_n(&arr_ops_out, ir->outputs + idx, 4);
+
+	/* then walk the graph from every root (outputs, keeps and
+	 * if-conditions), grouping the srcs of each fanin found:
+	 */
+	for (idx = 0; idx < ir->noutputs; idx++) {
+		if (!ir->outputs[idx])
+			continue;
+		instr_find_neighbors(ir->outputs[idx]);
+	}
+
+	list_for_each_entry (struct ir3_block, blk, &ir->block_list, node) {
+		for (idx = 0; idx < blk->keeps_count; idx++)
+			instr_find_neighbors(blk->keeps[idx]);
+
+		/* the if-condition is a root as well: */
+		if (blk->condition)
+			instr_find_neighbors(blk->condition);
+	}
+}
+
+/* Entry point of the grouping pass: find/group instruction neighbors. */
+void ir3_group(struct ir3 *ir)
+{
+	ir3_clear_mark(ir);     /* instr_find_neighbors() checks the marks */
+	find_neighbors(ir);
+}
diff --git a/src/freedreno/ir3/ir3_legalize.c b/src/freedreno/ir3/ir3_legalize.c
new file mode 100644
index 00000000000..ff4c644eab5
--- /dev/null
+++ b/src/freedreno/ir3/ir3_legalize.c
@@ -0,0 +1,496 @@
+/*
+ * Copyright (C) 2014 Rob Clark <[email protected]>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Authors:
+ *    Rob Clark <[email protected]>
+ */
+
+#include "util/ralloc.h"
+#include "util/u_math.h"
+
+#include "ir3.h"
+
+/*
+ * Legalize:
+ *
+ * We currently require that scheduling ensures that we have enough nop's
+ * in all the right places.  The legalize step mostly handles fixing up
+ * instruction flags ((ss)/(sy)/(ei)), and collapses sequences of nop's
+ * into fewer nop's w/ rpt flag.
+ */
+
+struct ir3_legalize_ctx {
+	int num_samp;     /* max of (cat5.samp + 1) over the tex instrs seen */
+	bool has_ssbo;    /* set when an ssbo op or a global (G) atomic is seen */
+	int max_bary;     /* largest inloc immediate among the input instrs seen */
+};
+
+struct ir3_legalize_state {
+	regmask_t needs_ss;           /* dst regs (sfu, resinfo, ldlv/ldl, non-G atomics): next access needs (ss) */
+	regmask_t needs_ss_war;       /* write after read: srcs of tex/sfu/mem, a later write needs (ss) */
+	regmask_t needs_sy;           /* dst regs (tex, other loads, G atomics): next access needs (sy) */
+};
+
+struct ir3_legalize_block_data {
+	bool valid;                        /* set once legalized; cleared when a predecessor's output state changes */
+	struct ir3_legalize_state state;   /* OR'd with predecessors' state on entry, output state of the block after */
+};
+
+/* Legalize one block: fix up the (ss)/(sy)/(ei)/(ul) flags and merge
+ * runs of nop's.  Returns false if the block was already valid (nothing
+ * done), true if it was (re)processed.
+ *
+ * Each block is evaluated from the position of all of its predecessor
+ * blocks, so that the flags set are the union of all possible program
+ * paths.  That requires the output state (needs_ss/ss_war/sy) of the
+ * predecessors, and because of loops we can't simply process every
+ * predecessor recursively before the current block.
+ *
+ * Instead the caller is expected to loop over all the blocks until the
+ * results converge: when a block's output state changes, its successors
+ * are marked invalid again since they are not yet fully legalized.
+ */
+
+static bool
+legalize_block(struct ir3_legalize_ctx *ctx, struct ir3_block *block)
+{
+	struct ir3_legalize_block_data *bd = block->data;
+
+	if (bd->valid)
+		return false;
+
+	struct ir3_instruction *last_input = NULL;   /* last is_input() instr kept, gets (ei) */
+	struct ir3_instruction *last_rel = NULL;     /* last instr with a RELATIV reg, gets (ul) */
+	struct ir3_instruction *last_n = NULL;       /* previous instr that was kept in the block */
+	struct list_head instr_list;
+	struct ir3_legalize_state prev_state = bd->state;   /* snapshot, to detect a changed output state */
+	struct ir3_legalize_state *state = &bd->state;
+
+	/* our input state is the OR of all predecessor blocks' state: */
+	for (unsigned i = 0; i < block->predecessors_count; i++) {
+		struct ir3_legalize_block_data *pbd = block->predecessors[i]->data;
+		struct ir3_legalize_state *pstate = &pbd->state;
+
+		/* Our input (ss)/(sy) state is based on OR'ing the output
+		 * state of all our predecessor blocks
+		 */
+		regmask_or(&state->needs_ss,
+				&state->needs_ss, &pstate->needs_ss);
+		regmask_or(&state->needs_ss_war,
+				&state->needs_ss_war, &pstate->needs_ss_war);
+		regmask_or(&state->needs_sy,
+				&state->needs_sy, &pstate->needs_sy);
+	}
+
+	/* remove all the instructions from the list, we'll be adding
+	 * them back in as we go
+	 */
+	list_replace(&block->instr_list, &instr_list);
+	list_inithead(&block->instr_list);
+
+	list_for_each_entry_safe (struct ir3_instruction, n, &instr_list, node) {
+		struct ir3_register *reg;
+		unsigned i;
+
+		n->flags &= ~(IR3_INSTR_SS | IR3_INSTR_SY);   /* sync flags are recomputed on every pass */
+
+		if (is_meta(n))   /* meta instrs are not added back to the block */
+			continue;
+
+		if (is_input(n)) {
+			struct ir3_register *inloc = n->regs[1];
+			assert(inloc->flags & IR3_REG_IMMED);
+			ctx->max_bary = MAX2(ctx->max_bary, inloc->iim_val);
+		}
+
+		if (last_n && is_barrier(last_n))   /* the instr following a barrier gets both sync flags */
+			n->flags |= IR3_INSTR_SS | IR3_INSTR_SY;
+
+		/* NOTE: consider dst register too.. it could happen that
+		 * texture sample instruction (for example) writes some
+		 * components which are unused.  A subsequent instruction
+		 * that writes the same register can race w/ the sam instr
+		 * resulting in undefined results:
+		 */
+		for (i = 0; i < n->regs_count; i++) {
+			reg = n->regs[i];
+
+			if (reg_gpr(reg)) {
+
+				/* TODO: we probably only need (ss) for alu
+				 * instr consuming sfu result.. need to make
+				 * some tests for both this and (sy)..
+				 */
+				if (regmask_get(&state->needs_ss, reg)) {
+					n->flags |= IR3_INSTR_SS;
+					regmask_init(&state->needs_ss_war);
+					regmask_init(&state->needs_ss);
+				}
+
+				if (regmask_get(&state->needs_sy, reg)) {
+					n->flags |= IR3_INSTR_SY;
+					regmask_init(&state->needs_sy);
+				}
+			}
+
+			/* TODO: is it valid to have address reg loaded from a
+			 * relative src (ie. mova a0, c<a0.x+4>)?  If so, the
+			 * last_rel check below should be moved ahead of this:
+			 */
+			if (reg->flags & IR3_REG_RELATIV)
+				last_rel = n;
+		}
+
+		if (n->regs_count > 0) {
+			reg = n->regs[0];
+			if (regmask_get(&state->needs_ss_war, reg)) {   /* dst overwrites a reg still pending as a src */
+				n->flags |= IR3_INSTR_SS;
+				regmask_init(&state->needs_ss_war);
+				regmask_init(&state->needs_ss);
+			}
+
+			if (last_rel && (reg->num == regid(REG_A0, 0))) {   /* a0 is rewritten: last relative user gets (ul) */
+				last_rel->flags |= IR3_INSTR_UL;
+				last_rel = NULL;
+			}
+		}
+
+		/* cat5+ does not have an (ss) bit, if needed we need to
+		 * insert a nop to carry the sync flag.  Would be kinda
+		 * clever if we were aware of this during scheduling, but
+		 * this should be a pretty rare case:
+		 */
+		if ((n->flags & IR3_INSTR_SS) && (opc_cat(n->opc) >= 5)) {
+			struct ir3_instruction *nop;
+			nop = ir3_NOP(block);
+			nop->flags |= IR3_INSTR_SS;
+			n->flags &= ~IR3_INSTR_SS;
+		}
+
+		/* need to be able to set (ss) on first instruction: */
+		if (list_empty(&block->instr_list) && (opc_cat(n->opc) >= 5))
+			ir3_NOP(block);
+
+		if (is_nop(n) && !list_empty(&block->instr_list)) {   /* fold a nop into a preceding nop via its repeat count */
+			struct ir3_instruction *last = list_last_entry(&block->instr_list,
+					struct ir3_instruction, node);
+			if (is_nop(last) && (last->repeat < 5)) {
+				last->repeat++;
+				last->flags |= n->flags;
+				continue;
+			}
+		}
+
+		list_addtail(&n->node, &block->instr_list);
+
+		if (is_sfu(n))
+			regmask_set(&state->needs_ss, n->regs[0]);
+
+		if (is_tex(n)) {
+			/* this ends up being the # of samp instructions.. but that
+			 * is ok, everything else only cares whether it is zero or
+			 * not.  We do this here, rather than when we encounter a
+			 * SAMP decl, because (especially in binning pass shader)
+			 * the samp instruction(s) could get eliminated if the
+			 * result is not used.
+			 */
+			ctx->num_samp = MAX2(ctx->num_samp, n->cat5.samp + 1);
+			regmask_set(&state->needs_sy, n->regs[0]);
+		} else if (n->opc == OPC_RESINFO) {
+			regmask_set(&state->needs_ss, n->regs[0]);
+			ir3_NOP(block)->flags |= IR3_INSTR_SS;
+		} else if (is_load(n)) {
+			/* seems like ldlv needs (ss) bit instead??  which is odd but
+			 * makes a bunch of flat-varying tests start working on a4xx.
+			 */
+			if ((n->opc == OPC_LDLV) || (n->opc == OPC_LDL))
+				regmask_set(&state->needs_ss, n->regs[0]);
+			else
+				regmask_set(&state->needs_sy, n->regs[0]);
+		} else if (is_atomic(n->opc)) {
+			if (n->flags & IR3_INSTR_G)
+				regmask_set(&state->needs_sy, n->regs[0]);
+			else
+				regmask_set(&state->needs_ss, n->regs[0]);
+		}
+
+		if (is_ssbo(n->opc) || (is_atomic(n->opc) && (n->flags & IR3_INSTR_G)))
+			ctx->has_ssbo = true;
+
+		/* both tex/sfu appear to not always immediately consume
+		 * their src register(s):
+		 */
+		if (is_tex(n) || is_sfu(n) || is_mem(n)) {
+			foreach_src(reg, n) {
+				if (reg_gpr(reg))
+					regmask_set(&state->needs_ss_war, reg);
+			}
+		}
+
+		if (is_input(n))
+			last_input = n;
+
+		last_n = n;
+	}
+
+	if (last_input) {
+		/* special hack.. if using ldlv to bypass interpolation,
+		 * we need to insert a dummy bary.f on which we can set
+		 * the (ei) flag:
+		 */
+		if (is_mem(last_input) && (last_input->opc == OPC_LDLV)) {
+			struct ir3_instruction *baryf;
+
+			/* (ss)bary.f (ei)r63.x, 0, r0.x */
+			baryf = ir3_instr_create(block, OPC_BARY_F);
+			baryf->flags |= IR3_INSTR_SS;
+			ir3_reg_create(baryf, regid(63, 0), 0);
+			ir3_reg_create(baryf, 0, IR3_REG_IMMED)->iim_val = 0;
+			ir3_reg_create(baryf, regid(0, 0), 0);
+
+			/* insert the dummy bary.f after last_input: */
+			list_delinit(&baryf->node);
+			list_add(&baryf->node, &last_input->node);
+
+			last_input = baryf;
+		}
+		last_input->regs[0]->flags |= IR3_REG_EI;
+	}
+
+	if (last_rel)   /* no later a0 write in this block: flag the last relative user anyway */
+		last_rel->flags |= IR3_INSTR_UL;
+
+	bd->valid = true;
+
+	if (memcmp(&prev_state, state, sizeof(*state))) {
+		/* our output state changed, this invalidates all of our
+		 * successors:
+		 */
+		for (unsigned i = 0; i < ARRAY_SIZE(block->successors); i++) {
+			if (!block->successors[i])
+				break;
+			struct ir3_legalize_block_data *pbd = block->successors[i]->data;
+			pbd->valid = false;
+		}
+	}
+
+	return true;
+}
+
+/* NOTE: branch instructions are always the last instruction(s)
+ * in the block.  We take advantage of this as we resolve the
+ * branches, since "if (foo) break;" constructs turn into
+ * something like:
+ *
+ *   block3 {
+ *   	...
+ *   	0029:021: mov.s32s32 r62.x, r1.y
+ *   	0082:022: br !p0.x, target=block5
+ *   	0083:023: br p0.x, target=block4
+ *   	// succs: if _[0029:021: mov.s32s32] block4; else block5;
+ *   }
+ *   block4 {
+ *   	0084:024: jump, target=block6
+ *   	// succs: block6;
+ *   }
+ *   block5 {
+ *   	0085:025: jump, target=block7
+ *   	// succs: block7;
+ *   }
+ *
+ * ie. only instruction in block4/block5 is a jump, so when
+ * resolving branches we can easily detect this by checking
+ * that the first instruction in the target block is itself
+ * a jump, and setup the br directly to the jump's target
+ * (and strip back out the now unreached jump)
+ *
+ * TODO sometimes we end up with things like:
+ *
+ *    br !p0.x, #2
+ *    br p0.x, #12
+ *    add.u r0.y, r0.y, 1
+ *
+ * If we swapped the order of the branches, we could drop one.
+ */
+static struct ir3_block *
+resolve_dest_block(struct ir3_block *block)
+{
+	struct ir3_block *succ = block->successors[0];
+
+	/* nothing to forward to for the last block (no successor), and
+	 * a block with two successors is never skipped over:
+	 */
+	if (!succ || block->successors[1])
+		return block;
+
+	/* The jump may or may not have been inserted into the block
+	 * yet, so both an empty block and a block whose only
+	 * instruction is a jump count as pass-through blocks; the
+	 * real destination is then the block's sole successor:
+	 */
+	if (list_empty(&block->instr_list))
+		return succ;
+	if (list_length(&block->instr_list) == 1) {
+		struct ir3_instruction *only = list_first_entry(
+				&block->instr_list, struct ir3_instruction, node);
+		if (only->opc == OPC_JUMP)
+			return succ;
+	}
+	return block;
+}
+
+/* Resolve the destination of a single flow-control instruction.
+ * Returns true if the IR was modified (branch retargeted past a
+ * pass-through block, or the branch itself dropped), after which
+ * the caller must recount; false once the offset is encoded.
+ */
+static bool
+resolve_jump(struct ir3_instruction *instr)
+{
+	struct ir3_block *orig_blk = instr->cat0.target;
+	struct ir3_block *dest_blk = resolve_dest_block(orig_blk);
+
+	if (dest_blk != orig_blk) {
+		/* skip over the pass-through block and unlink it: */
+		list_delinit(&orig_blk->node);
+		instr->cat0.target = dest_blk;
+		return true;
+	}
+
+	struct ir3_instruction *first = list_first_entry(
+			&dest_blk->instr_list, struct ir3_instruction, node);
+
+	/* TODO maybe a less fragile way to do this.  We expect
+	 * sched_block() to have emitted the pair:
+	 *
+	 *   br !p0.x, #else-block
+	 *   br p0.x, #if-block
+	 *
+	 * so a first (inverted) branch to +2, or a second branch
+	 * to +1, is redundant and can simply be dropped.
+	 */
+	unsigned fallthrough = (instr->cat0.inv == true) ? 2 : 1;
+
+	if (!first || (first->ip == (instr->ip + fallthrough))) {
+		list_delinit(&instr->node);
+		return true;
+	}
+
+	instr->cat0.immed = (int)first->ip - (int)instr->ip;
+	return false;
+}
+
+/* Resolve jumps, dropping jumps/branches to the immediately following
+ * instruction that earlier stages leave behind.  Removing one can
+ * invalidate the branch offsets of earlier instructions, so stop at the
+ * first change and return true; the caller repeats until false.
+ */
+static bool
+resolve_jumps(struct ir3 *ir)
+{
+	list_for_each_entry (struct ir3_block, block, &ir->block_list, node)
+		list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) {
+			if (!is_flow(instr) || !instr->cat0.target)
+				continue;
+			if (resolve_jump(instr))
+				return true;
+		}
+	return false;
+}
+
+/* we want to mark points where divergent flow control re-converges
+ * with (jp) flags.  For now, since we don't do any optimization for
+ * things that start out as a 'do {} while()', re-convergence points
+ * will always be a branch or jump target.  Note that this is overly
+ * conservative, since unconditional jump targets are not convergence
+ * points, we are just assuming that the other path to reach the jump
+ * target was divergent.  If we were clever enough to optimize the
+ * jump at end of a loop back to a conditional branch into a single
+ * conditional branch, ie. like:
+ *
+ *    add.f r1.w, r0.x, (neg)(r)c2.x   <= loop start
+ *    mul.f r1.z, r1.z, r0.x
+ *    mul.f r1.y, r1.y, r0.x
+ *    mul.f r0.z, r1.x, r0.x
+ *    mul.f r0.w, r0.y, r0.x
+ *    cmps.f.ge r0.x, (r)c2.y, (r)r1.w
+ *    add.s r0.x, (r)r0.x, (r)-1
+ *    sel.f32 r0.x, (r)c3.y, (r)r0.x, c3.x
+ *    cmps.f.eq p0.x, r0.x, c3.y
+ *    mov.f32f32 r0.x, r1.w
+ *    mov.f32f32 r0.y, r0.w
+ *    mov.f32f32 r1.x, r0.z
+ *    (rpt2)nop
+ *    br !p0.x, #-13
+ *    (jp)mul.f r0.x, c263.y, r1.y
+ *
+ * Then we'd have to be more clever, as the convergence point is no
+ * longer a branch or jump target.
+ */
+static void
+mark_convergence_points(struct ir3 *ir)
+{
+	list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
+		list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) {
+			if (!is_flow(instr) || !instr->cat0.target)
+				continue;
+			/* (jp) goes on the first instruction of the target block: */
+			struct ir3_instruction *first = list_first_entry(
+					&instr->cat0.target->instr_list, struct ir3_instruction, node);
+			first->flags |= IR3_INSTR_JP;
+		}
+	}
+}
+
+void
+ir3_legalize(struct ir3 *ir, int *num_samp, bool *has_ssbo, int *max_bary)
+{
+	struct ir3_legalize_ctx *ctx = rzalloc(ir, struct ir3_legalize_ctx);
+	bool progress = true;
+
+	ctx->max_bary = -1;
+
+	/* per-block data is parented to ctx, so freeing ctx releases it: */
+	list_for_each_entry (struct ir3_block, block, &ir->block_list, node)
+		block->data = rzalloc(ctx, struct ir3_legalize_block_data);
+
+	/* iterate to a fixed point, until no block reports progress: */
+	while (progress) {
+		progress = false;
+		list_for_each_entry (struct ir3_block, block, &ir->block_list, node)
+			progress |= legalize_block(ctx, block);
+	}
+
+	/* report the accumulated results back to the caller: */
+	*num_samp = ctx->num_samp;
+	*has_ssbo = ctx->has_ssbo;
+	*max_bary = ctx->max_bary;
+
+	/* recount instructions after each change made by resolve_jumps(): */
+	do {
+		ir3_count_instructions(ir);
+	} while (resolve_jumps(ir));
+
+	mark_convergence_points(ir);
+
+	ralloc_free(ctx);
+}
diff --git a/src/freedreno/ir3/ir3_nir.c b/src/freedreno/ir3/ir3_nir.c
new file mode 100644
index 00000000000..70c01ee0593
--- /dev/null
+++ b/src/freedreno/ir3/ir3_nir.c
@@ -0,0 +1,263 @@
+/*
+ * Copyright (C) 2015 Rob Clark <[email protected]>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Authors:
+ *    Rob Clark <[email protected]>
+ */
+
+
+#include "util/debug.h"
+
+#include "ir3_nir.h"
+#include "ir3_compiler.h"
+#include "ir3_shader.h"
+
+static const nir_shader_compiler_options options = {	/* the one NIR options table for ir3 */
+		.lower_fpow = true,
+		.lower_scmp = true,
+		.lower_flrp32 = true,
+		.lower_flrp64 = true,
+		.lower_ffract = true,
+		.lower_fmod32 = true,
+		.lower_fmod64 = true,
+		.lower_fdiv = true,
+		.lower_ldexp = true,
+		.fuse_ffma = true,
+		.native_integers = true,
+		.vertex_id_zero_based = true,
+		.lower_extract_byte = true,
+		.lower_extract_word = true,
+		.lower_all_io_to_temps = true,
+		.lower_helper_invocation = true,
+};
+
+const nir_shader_compiler_options *
+ir3_get_compiler_options(struct ir3_compiler *compiler)
+{
+	return &options;	/* compiler is not consulted; same table for all */
+}
+
+/* does the given shader key request any lowering that is done in nir? */
+bool
+ir3_key_lowers_nir(const struct ir3_shader_key *key)
+{
+	return key->fsaturate_s || key->fsaturate_t || key->fsaturate_r ||
+			key->vsaturate_s || key->vsaturate_t || key->vsaturate_r ||
+			key->ucp_enables || key->color_two_side ||
+			key->fclamp_color || key->vclamp_color;
+}
+
+#define OPT(nir, pass, ...) ({                             \
+   bool this_progress = false;                             \
+   NIR_PASS(this_progress, nir, pass, ##__VA_ARGS__);      \
+   this_progress;                                          \
+})
+/* OPT() above evaluates to the pass's progress flag; OPT_V() runs a pass without one: */
+#define OPT_V(nir, pass, ...) NIR_PASS_V(nir, pass, ##__VA_ARGS__)
+
+static void
+ir3_optimize_loop(nir_shader *s)
+{
+	bool progress;
+	do {
+		progress = false;
+		/* one round of the pass list; repeated while any tracked pass makes progress: */
+		OPT_V(s, nir_lower_vars_to_ssa);
+		progress |= OPT(s, nir_opt_copy_prop_vars);
+		progress |= OPT(s, nir_opt_dead_write_vars);
+		progress |= OPT(s, nir_lower_alu_to_scalar);
+		progress |= OPT(s, nir_lower_phis_to_scalar);
+
+		progress |= OPT(s, nir_copy_prop);
+		progress |= OPT(s, nir_opt_dce);
+		progress |= OPT(s, nir_opt_cse);
+		static int gcm = -1;	/* cached value of the GCM env var; -1 = not read yet */
+		if (gcm == -1)
+			gcm = env_var_as_unsigned("GCM", 0);
+		if (gcm == 1)	/* GCM=1 and GCM=2 enable nir_opt_gcm with its flag true/false */
+			progress |= OPT(s, nir_opt_gcm, true);
+		else if (gcm == 2)
+			progress |= OPT(s, nir_opt_gcm, false);
+		progress |= OPT(s, nir_opt_peephole_select, 16);
+		progress |= OPT(s, nir_opt_intrinsics);
+		progress |= OPT(s, nir_opt_algebraic);
+		progress |= OPT(s, nir_opt_constant_folding);
+		progress |= OPT(s, nir_opt_dead_cf);
+		if (OPT(s, nir_opt_trivial_continues)) {
+			progress |= true;
+			/* If nir_opt_trivial_continues makes progress, then we need to clean
+			 * things up if we want any hope of nir_opt_if or nir_opt_loop_unroll
+			 * to make progress.
+			 */
+			OPT(s, nir_copy_prop);	/* progress of these two cleanups is ignored */
+			OPT(s, nir_opt_dce);
+		}
+		progress |= OPT(s, nir_opt_if);
+		progress |= OPT(s, nir_opt_remove_phis);
+		progress |= OPT(s, nir_opt_undef);
+
+	} while (progress);
+}
+
+struct nir_shader *
+ir3_optimize_nir(struct ir3_shader *shader, nir_shader *s,
+		const struct ir3_shader_key *key)
+{
+	struct nir_lower_tex_options tex_options = {
+			.lower_rect = 0,
+	};
+	/* Lower and optimize s in place and return it; key may be NULL (see below). */
+	if (key) {	/* texcoord saturate flags come from the key, per stage */
+		switch (shader->type) {
+		case MESA_SHADER_FRAGMENT:
+			tex_options.saturate_s = key->fsaturate_s;
+			tex_options.saturate_t = key->fsaturate_t;
+			tex_options.saturate_r = key->fsaturate_r;
+			break;
+		case MESA_SHADER_VERTEX:
+			tex_options.saturate_s = key->vsaturate_s;
+			tex_options.saturate_t = key->vsaturate_t;
+			tex_options.saturate_r = key->vsaturate_r;
+			break;
+		default:
+			/* TODO */
+			break;
+		}
+	}
+
+	if (shader->compiler->gpu_id >= 400) {
+		/* a4xx seems to have *no* sam.p */
+		tex_options.lower_txp = ~0;  /* lower all txp */
+	} else {
+		/* a3xx just needs to avoid sam.p for 3d tex */
+		tex_options.lower_txp = (1 << GLSL_SAMPLER_DIM_3D);
+	}
+
+	if (ir3_shader_debug & IR3_DBG_DISASM) {	/* dump the shader before lowering */
+		debug_printf("----------------------\n");
+		nir_print_shader(s, stdout);
+		debug_printf("----------------------\n");
+	}
+
+	OPT_V(s, nir_opt_global_to_local);
+	OPT_V(s, nir_lower_regs_to_ssa);
+
+	if (key) {	/* lowering selected by the key: clip planes, color clamp, two-sided color */
+		if (s->info.stage == MESA_SHADER_VERTEX) {
+			OPT_V(s, nir_lower_clip_vs, key->ucp_enables, false);
+			if (key->vclamp_color)
+				OPT_V(s, nir_lower_clamp_color_outputs);
+		} else if (s->info.stage == MESA_SHADER_FRAGMENT) {
+			OPT_V(s, nir_lower_clip_fs, key->ucp_enables);
+			if (key->fclamp_color)
+				OPT_V(s, nir_lower_clamp_color_outputs);
+		}
+		if (key->color_two_side) {
+			OPT_V(s, nir_lower_two_sided_color);
+		}
+	} else {
+		/* only want to do this the first time (when key is null)
+		 * and not again on any potential 2nd variant lowering pass:
+		 */
+		OPT_V(s, ir3_nir_apply_trig_workarounds);
+	}
+
+	OPT_V(s, nir_lower_tex, &tex_options);
+	OPT_V(s, nir_lower_load_const_to_scalar);
+	if (shader->compiler->gpu_id < 500)
+		OPT_V(s, ir3_nir_lower_tg4_to_tex);
+
+	ir3_optimize_loop(s);
+
+	/* do idiv lowering after first opt loop to give a chance for
+	 * divide by immed power-of-two to be caught first:
+	 */
+	if (OPT(s, nir_lower_idiv))
+		ir3_optimize_loop(s);
+
+	OPT_V(s, nir_remove_dead_variables, nir_var_local);
+
+	OPT_V(s, nir_move_load_const);
+
+	if (ir3_shader_debug & IR3_DBG_DISASM) {	/* dump again after all passes have run */
+		debug_printf("----------------------\n");
+		nir_print_shader(s, stdout);
+		debug_printf("----------------------\n");
+	}
+
+	nir_sweep(s);
+
+	return s;
+}
+
+/* Reserve layout space for every distinct SSBO whose size is queried
+ * (one const each) and every distinct image referenced (three each).
+ */
+void
+ir3_nir_scan_driver_consts(nir_shader *shader,
+		struct ir3_driver_const_layout *layout)
+{
+	nir_foreach_function(function, shader) {
+		if (!function->impl)
+			continue;
+		nir_foreach_block(block, function->impl) {
+			nir_foreach_instr(instr, block) {
+				if (instr->type != nir_instr_type_intrinsic)
+					continue;
+
+				nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
+				unsigned idx;
+
+				switch (intr->intrinsic) {
+				case nir_intrinsic_get_buffer_size:
+					idx = nir_src_as_const_value(intr->src[0])->u32[0];
+					/* 1u: shifting a signed 1 into bit 31 is undefined */
+					if (layout->ssbo_size.mask & (1u << idx))
+						break;
+					layout->ssbo_size.mask |= (1u << idx);
+					layout->ssbo_size.off[idx] = layout->ssbo_size.count;
+					layout->ssbo_size.count += 1; /* one const per */
+					break;
+				case nir_intrinsic_image_deref_atomic_add:
+				case nir_intrinsic_image_deref_atomic_min:
+				case nir_intrinsic_image_deref_atomic_max:
+				case nir_intrinsic_image_deref_atomic_and:
+				case nir_intrinsic_image_deref_atomic_or:
+				case nir_intrinsic_image_deref_atomic_xor:
+				case nir_intrinsic_image_deref_atomic_exchange:
+				case nir_intrinsic_image_deref_atomic_comp_swap:
+				case nir_intrinsic_image_deref_store:
+				case nir_intrinsic_image_deref_size:
+					idx = nir_intrinsic_get_var(intr, 0)->data.driver_location;
+					if (layout->image_dims.mask & (1u << idx))
+						break;
+					layout->image_dims.mask |= (1u << idx);
+					layout->image_dims.off[idx] = layout->image_dims.count;
+					layout->image_dims.count += 3; /* three const per */
+					break;
+				default:
+					break;
+				}
+			}
+		}
+	}
+}
diff --git a/src/freedreno/ir3/ir3_nir.h b/src/freedreno/ir3/ir3_nir.h
new file mode 100644
index 00000000000..74201d34160
--- /dev/null
+++ b/src/freedreno/ir3/ir3_nir.h
@@ -0,0 +1,45 @@
+/*
+ * Copyright (C) 2015 Rob Clark <[email protected]>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Authors:
+ *    Rob Clark <[email protected]>
+ */
+
+#ifndef IR3_NIR_H_
+#define IR3_NIR_H_
+
+#include "compiler/nir/nir.h"
+#include "compiler/shader_enums.h"
+
+#include "ir3_shader.h"
+
+void ir3_nir_scan_driver_consts(nir_shader *shader, struct ir3_driver_const_layout *layout);
+/* ir3-specific NIR passes: */
+bool ir3_nir_apply_trig_workarounds(nir_shader *shader);
+bool ir3_nir_lower_tg4_to_tex(nir_shader *shader);
+/* NIR compiler options, shader-key query and the NIR lowering/optimization entry point: */
+const nir_shader_compiler_options * ir3_get_compiler_options(struct ir3_compiler *compiler);
+bool ir3_key_lowers_nir(const struct ir3_shader_key *key);
+struct nir_shader * ir3_optimize_nir(struct ir3_shader *shader, nir_shader *s,
+		const struct ir3_shader_key *key);
+
+#endif /* IR3_NIR_H_ */
diff --git a/src/freedreno/ir3/ir3_nir_lower_tg4_to_tex.c b/src/freedreno/ir3/ir3_nir_lower_tg4_to_tex.c
new file mode 100644
index 00000000000..37a3dcb26f8
--- /dev/null
+++ b/src/freedreno/ir3/ir3_nir_lower_tg4_to_tex.c
@@ -0,0 +1,138 @@
+/*
+ * Copyright © 2017 Ilia Mirkin
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "ir3_nir.h"
+#include "compiler/nir/nir_builder.h"
+
+/* A4XX has a broken GATHER4 operation. It performs the texture swizzle on the
+ * gather results, rather than before. As a result, it must be emulated with
+ * direct texture calls.
+ */
+
+static bool
+lower_tg4(nir_block *block, nir_builder *b, void *mem_ctx)
+{
+	bool progress = false;
+	/* offsets added for the first three fetches; the fourth adds none: */
+	static const int offsets[3][2] = { {0, 1}, {1, 1}, {1, 0} };
+
+	nir_foreach_instr_safe(instr, block) {
+		if (instr->type != nir_instr_type_tex)
+			continue;
+
+		nir_tex_instr *tg4 = (nir_tex_instr *)instr;
+
+		if (tg4->op != nir_texop_tg4)
+			continue;
+
+		b->cursor = nir_before_instr(&tg4->instr);
+		/* emit four txl fetches, taking one channel from each: */
+		nir_ssa_def *results[4];
+		int offset_index = nir_tex_instr_src_index(tg4, nir_tex_src_offset);
+		for (int i = 0; i < 4; i++) {
+			int num_srcs = tg4->num_srcs + 1 /* lod */;
+			if (offset_index < 0 && i < 3)
+				num_srcs++;	/* room for the offset src added below */
+
+			nir_tex_instr *tex = nir_tex_instr_create(b->shader, num_srcs);
+			tex->op = nir_texop_txl;
+			tex->sampler_dim = tg4->sampler_dim;
+			tex->coord_components = tg4->coord_components;
+			tex->is_array = tg4->is_array;
+			tex->is_shadow = tg4->is_shadow;
+			tex->is_new_style_shadow = tg4->is_new_style_shadow;
+			tex->texture_index = tg4->texture_index;
+			tex->sampler_index = tg4->sampler_index;
+			tex->dest_type = tg4->dest_type;
+
+			for (int j = 0; j < tg4->num_srcs; j++) {
+				nir_src_copy(&tex->src[j].src, &tg4->src[j].src, tex);
+				tex->src[j].src_type = tg4->src[j].src_type;
+			}
+			if (i != 3) {	/* the fourth fetch keeps the tg4's srcs unchanged */
+				nir_ssa_def *offset =
+					nir_vec2(b, nir_imm_int(b, offsets[i][0]),
+							 nir_imm_int(b, offsets[i][1]));
+				if (offset_index < 0) {
+					tex->src[tg4->num_srcs].src = nir_src_for_ssa(offset);
+					tex->src[tg4->num_srcs].src_type = nir_tex_src_offset;
+				} else {
+					assert(nir_tex_instr_src_size(tex, offset_index) == 2);
+					nir_ssa_def *orig = nir_ssa_for_src(
+							b, tex->src[offset_index].src, 2);
+					tex->src[offset_index].src =
+						nir_src_for_ssa(nir_iadd(b, orig, offset));
+				}
+			}
+			tex->src[num_srcs - 1].src = nir_src_for_ssa(nir_imm_float(b, 0));
+			tex->src[num_srcs - 1].src_type = nir_tex_src_lod;
+
+			nir_ssa_dest_init(&tex->instr, &tex->dest,
+							  nir_tex_instr_dest_size(tex), 32, NULL);
+			nir_builder_instr_insert(b, &tex->instr);
+
+			results[i] = nir_channel(b, &tex->dest.ssa, tg4->component);
+		}
+		/* the tg4 result is replaced by a vec4 of the four fetched channels: */
+		nir_ssa_def *result = nir_vec4(b, results[0], results[1], results[2], results[3]);
+		nir_ssa_def_rewrite_uses(&tg4->dest.ssa, nir_src_for_ssa(result));
+
+		nir_instr_remove(&tg4->instr);
+
+		progress = true;
+	}
+
+	return progress;
+}
+
+static bool
+lower_tg4_func(nir_function_impl *impl)
+{
+	bool progress = false;
+	void *mem_ctx = ralloc_parent(impl);
+	nir_builder b;
+
+	nir_builder_init(&b, impl);
+
+	nir_foreach_block_safe(block, impl)
+		progress = lower_tg4(block, &b, mem_ctx) || progress;
+
+	/* on progress, keep block-index and dominance metadata: */
+	if (progress)
+		nir_metadata_preserve(impl, nir_metadata_block_index |
+									nir_metadata_dominance);
+	return progress;
+}
+
+/* Replace every tg4 in the shader with four txl fetches; true on progress. */
+bool
+ir3_nir_lower_tg4_to_tex(nir_shader *shader)
+{
+	bool progress = false;
+	nir_foreach_function(function, shader) {
+		if (!function->impl)
+			continue;
+		progress |= lower_tg4_func(function->impl);
+	}
+	return progress;
+}
diff --git a/src/freedreno/ir3/ir3_nir_trig.py b/src/freedreno/ir3/ir3_nir_trig.py
new file mode 100644
index 00000000000..3968aea543c
--- /dev/null
+++ b/src/freedreno/ir3/ir3_nir_trig.py
@@ -0,0 +1,51 @@
+#
+# Copyright (C) 2016 Intel Corporation
+#
+# Permission is hereby granted, free of charge, to any person obtaining a
+# copy of this software and associated documentation files (the "Software"),
+# to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense,
+# and/or sell copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice (including the next
+# paragraph) shall be included in all copies or substantial portions of the
+# Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+# IN THE SOFTWARE.
+
+from __future__ import print_function
+
+import argparse
+import sys
+
+trig_workarounds = [  # wrap the fsin/fcos argument into [-pi, pi) before the op
+   (('fsin', 'x'), ('fsin', ('fsub', ('fmul', 6.283185, ('ffract', ('fadd', ('fmul', 0.159155, 'x'), 0.5))), 3.141593))),
+   (('fcos', 'x'), ('fcos', ('fsub', ('fmul', 6.283185, ('ffract', ('fadd', ('fmul', 0.159155, 'x'), 0.5))), 3.141593))),
+]
+
+
+def main():  # command-line entry point: requires -p/--import-path
+    parser = argparse.ArgumentParser()
+    parser.add_argument('-p', '--import-path', required=True)
+    args = parser.parse_args()
+    sys.path.insert(0, args.import_path)  # searched first when run() imports nir_algebraic
+    run()
+
+
+def run():  # prints the generated source for the trig workaround pass to stdout
+    import nir_algebraic  # pylint: disable=import-error
+
+    print('#include "ir3_nir.h"')
+    print(nir_algebraic.AlgebraicPass("ir3_nir_apply_trig_workarounds",
+                                      trig_workarounds).render())
+
+
+if __name__ == '__main__':
+    main()
diff --git a/src/freedreno/ir3/ir3_print.c b/src/freedreno/ir3/ir3_print.c
new file mode 100644
index 00000000000..b6ef6e4b5a7
--- /dev/null
+++ b/src/freedreno/ir3/ir3_print.c
@@ -0,0 +1,264 @@
+/*
+ * Copyright (C) 2014 Rob Clark <[email protected]>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Authors:
+ *    Rob Clark <[email protected]>
+ */
+
+#include <stdarg.h>
+#include <stdio.h>
+
+#include "ir3.h"
+
+#define PTRID(x) ((unsigned long)(x))	/* x cast to an integer (presumably a pointer used as an id) */
+
+/* Print the leading part of an instruction: its id fields, (sy)/(ss)
+ * flags and the opcode name with suffixes.  A NULL instr prints nothing.
+ */
+static void print_instr_name(struct ir3_instruction *instr)
+{
+	if (!instr)
+		return;
+#ifdef DEBUG
+	printf("%04u:", instr->serialno);
+#endif
+	printf("%04u:", instr->name);
+	printf("%04u:", instr->ip);
+	printf("%03u: ", instr->depth);
+
+	if (instr->flags & IR3_INSTR_SY)
+		printf("(sy)");
+	if (instr->flags & IR3_INSTR_SS)
+		printf("(ss)");
+
+	if (is_meta(instr)) {
+		switch (instr->opc) {
+		case OPC_META_INPUT:  printf("_meta:in");   break;
+		case OPC_META_FO:     printf("_meta:fo");   break;
+		case OPC_META_FI:     printf("_meta:fi");   break;
+		/* shouldn't hit here.. just for debugging: */
+		default: printf("_meta:%d", instr->opc);    break;
+		}
+		return;
+	}
+
+	if (instr->opc == OPC_MOV) {
+		static const char *type[] = {
+				[TYPE_F16] = "f16",
+				[TYPE_F32] = "f32",
+				[TYPE_U16] = "u16",
+				[TYPE_U32] = "u32",
+				[TYPE_S16] = "s16",
+				[TYPE_S32] = "s32",
+				[TYPE_U8]  = "u8",
+				[TYPE_S8]  = "s8",
+		};
+		/* same src/dst type is a plain mov, otherwise a conversion: */
+		const char *op =
+			(instr->cat1.src_type == instr->cat1.dst_type) ? "mov" : "cov";
+		printf("%s", op);
+		printf(".%s%s", type[instr->cat1.src_type], type[instr->cat1.dst_type]);
+		return;
+	}
+
+	printf("%s", ir3_instr_name(instr));
+	if (instr->flags & IR3_INSTR_3D)   printf(".3d");
+	if (instr->flags & IR3_INSTR_A)    printf(".a");
+	if (instr->flags & IR3_INSTR_O)    printf(".o");
+	if (instr->flags & IR3_INSTR_P)    printf(".p");
+	if (instr->flags & IR3_INSTR_S)    printf(".s");
+	if (instr->flags & IR3_INSTR_S2EN) printf(".s2en");
+}
+
+/* print a register operand: (abs)/(neg) modifiers first, then the register: */
+static void print_reg_name(struct ir3_register *reg)
+{
+	int has_abs = !!(reg->flags & (IR3_REG_FABS | IR3_REG_SABS));
+	int has_neg = !!(reg->flags &
+			(IR3_REG_FNEG | IR3_REG_SNEG | IR3_REG_BNOT));
+
+	if (has_abs && has_neg)
+		printf("(absneg)");
+	else if (has_neg)
+		printf("(neg)");
+	else if (has_abs)
+		printf("(abs)");
+
+	if (reg->flags & IR3_REG_IMMED) {
+		printf("imm[%f,%d,0x%x]", reg->fim_val, reg->iim_val, reg->iim_val);
+	} else if (reg->flags & IR3_REG_ARRAY) {
+		printf("arr[id=%u, offset=%d, size=%u", reg->array.id,
+				reg->array.offset, reg->size);
+		/* an ARRAY reg may have no src instr, eg. the first write: */
+		if (reg->instr) {
+			printf(", _[");
+			print_instr_name(reg->instr);
+			printf("]");
+		}
+		printf("]");
+	} else if (reg->flags & IR3_REG_SSA) {
+		printf("_[");
+		print_instr_name(reg->instr);
+		printf("]");
+	} else {
+		/* plain or a0.x-relative register; non-const ones print in red: */
+		if (reg->flags & IR3_REG_HALF)
+			printf("h");
+		if ((reg->flags & IR3_REG_RELATIV) && (reg->flags & IR3_REG_CONST))
+			printf("c<a0.x + %d>", reg->array.offset);
+		else if (reg->flags & IR3_REG_RELATIV)
+			printf("\x1b[0;31mr<a0.x + %d>\x1b[0m (%u)", reg->array.offset, reg->size);
+		else if (reg->flags & IR3_REG_CONST)
+			printf("c%u.%c", reg_num(reg), "xyzw"[reg_comp(reg)]);
+		else
+			printf("\x1b[0;31mr%u.%c\x1b[0m", reg_num(reg), "xyzw"[reg_comp(reg)]);
+	}
+}
+
+/* indent the output by lvl tab characters: */
+static void tab(int lvl)
+{
+	for (; lvl > 0; lvl--)
+		printf("\t");
+}
+
+/* print "<prefix>_[<name of instr>]", as used for instruction references: */
+static void
+print_instr_ref(const char *prefix, struct ir3_instruction *instr)
+{
+	printf("%s_[", prefix);
+	print_instr_name(instr);
+	printf("]");
+}
+
+/* Print a complete line for one instruction, indented by lvl tabs:
+ * its name, its registers, then any optional annotations, ending
+ * with a newline.
+ */
+static void
+print_instr(struct ir3_instruction *instr, int lvl)
+{
+	tab(lvl);
+
+	print_instr_name(instr);
+	/* a space separates the name from the first reg, commas the rest: */
+	for (unsigned n = 0; n < instr->regs_count; n++) {
+		printf(n ? ", " : " ");
+		print_reg_name(instr->regs[n]);
+	}
+
+	/* references to other instructions, when present: */
+	if (instr->address) {
+		print_instr_ref(", address=", instr->address);
+	}
+
+	if (instr->cp.left) {
+		print_instr_ref(", left=", instr->cp.left);
+	}
+
+	if (instr->cp.right) {
+		print_instr_ref(", right=", instr->cp.right);
+	}
+
+	if (instr->opc == OPC_META_FO) {
+		printf(", off=%d", instr->fo.off);
+	}
+
+	if (is_flow(instr) && instr->cat0.target) {
+		/* the predicate register src is implied: */
+		if (instr->opc == OPC_BR) {
+			printf(" %sp0.x", instr->cat0.inv ? "!" : "");
+		}
+		printf(", target=block%u", block_id(instr->cat0.target));
+	}
+
+	if (instr->deps_count) {
+		printf(", false-deps:");
+		/* comma-separated, with no separator before the first: */
+		for (unsigned n = 0; n < instr->deps_count; n++) {
+			print_instr_ref(n ? ", " : "", instr->deps[n]);
+		}
+	}
+
+	printf("\n");
+}
+
+void ir3_print_instr(struct ir3_instruction *instr)
+{
+	print_instr(instr, 0);	/* indent level 0: no leading tabs */
+}
+
+/* dump a block: predecessors, instructions, keeps list and successors: */
+static void
+print_block(struct ir3_block *block, int lvl)
+{
+	tab(lvl);
+	printf("block%u {\n", block_id(block));
+
+	if (block->predecessors_count > 0) {
+		tab(lvl+1);
+		printf("pred: ");
+		for (unsigned i = 0; i < block->predecessors_count; i++)
+			printf("%sblock%u", i ? ", " : "", block_id(block->predecessors[i]));
+		printf("\n");
+	}
+
+	list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node)
+		print_instr(instr, lvl+1);
+
+	tab(lvl+1);
+	printf("/* keeps:\n");
+	for (unsigned i = 0; i < block->keeps_count; i++)
+		print_instr(block->keeps[i], lvl+2);
+	tab(lvl+1);
+	printf(" */\n");
+
+	if (block->successors[1]) {
+		/* leading into if/else: */
+		tab(lvl+1);
+		printf("/* succs: if _[");
+		print_instr_name(block->condition);
+		printf("] block%u; else block%u; */\n",
+				block_id(block->successors[0]),
+				block_id(block->successors[1]));
+	} else if (block->successors[0]) {
+		tab(lvl+1);
+		printf("/* succs: block%u; */\n", block_id(block->successors[0]));
+	}
+
+	tab(lvl);
+	printf("}\n");
+}
+
+/* print all blocks, then each non-NULL output; i is unsigned, hence %u: */
+void ir3_print(struct ir3 *ir)
+{
+	list_for_each_entry (struct ir3_block, block, &ir->block_list, node)
+		print_block(block, 0);
+
+	for (unsigned i = 0; i < ir->noutputs; i++) {
+		if (!ir->outputs[i])
+			continue;
+		printf("out%u: ", i);
+		print_instr(ir->outputs[i], 0);
+	}
+}
diff --git a/src/freedreno/ir3/ir3_ra.c b/src/freedreno/ir3/ir3_ra.c
new file mode 100644
index 00000000000..ad09c4018d3
--- /dev/null
+++ b/src/freedreno/ir3/ir3_ra.c
@@ -0,0 +1,1124 @@
+/*
+ * Copyright (C) 2014 Rob Clark <[email protected]>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Authors:
+ *    Rob Clark <[email protected]>
+ */
+
+#include "util/u_math.h"
+#include "util/register_allocate.h"
+#include "util/ralloc.h"
+#include "util/bitset.h"
+
+#include "ir3.h"
+#include "ir3_compiler.h"
+
+/*
+ * Register Assignment:
+ *
+ * Uses the register_allocate util, which implements graph coloring
+ * algo with interference classes.  To handle the cases where we need
+ * consecutive registers (for example, texture sample instructions),
+ * we model these as larger (double/quad/etc) registers which conflict
+ * with the corresponding registers in other classes.
+ *
+ * Additionally we create additional classes for half-regs, which
+ * do not conflict with the full-reg classes.  We do need at least
+ * sizes 1-4 (to deal w/ texture sample instructions output to half-
+ * reg).  At the moment we don't create the higher order half-reg
+ * classes as half-reg frequently does not have enough precision
+ * for texture coords at higher resolutions.
+ *
+ * There are some additional cases that we need to handle specially,
+ * as the graph coloring algo doesn't understand "partial writes".
+ * For example, a sequence like:
+ *
+ *   add r0.z, ...
+ *   sam (f32)(xy)r0.x, ...
+ *   ...
+ *   sam (f32)(xyzw)r0.w, r0.x, ...  ; 3d texture, so r0.xyz are coord
+ *
+ * In this scenario, we treat r0.xyz as class size 3, which is written
+ * (from a use/def perspective) at the 'add' instruction and ignore the
+ * subsequent partial writes to r0.xy.  So the 'add r0.z, ...' is the
+ * defining instruction, as it is the first to partially write r0.xyz.
+ *
+ * Note i965 has a similar scenario, which they solve with a virtual
+ * LOAD_PAYLOAD instruction which gets turned into multiple MOV's after
+ * register assignment.  But for us that is horrible from a scheduling
+ * standpoint.  Instead what we do is use the idea of a 'definer' instruction.
+ * Ie. the first instruction (lowest ip) to write to the variable is the
+ * one we consider from use/def perspective when building interference
+ * graph.  (Other instructions which write other variable components
+ * just define the variable some more.)
+ *
+ * Arrays of arbitrary size are handled via pre-coloring a consecutive
+ * sequence of registers.  Additional scalar (single component) reg
+ * names are allocated starting at ctx->class_base[total_class_count]
+ * (see arr->base), which are pre-colored.  In the use/def graph direct
+ * access is treated as a single element use/def, and indirect access
+ * is treated as use or def of all array elements.  (Only the first
+ * def is tracked, in case of multiple indirect writes, etc.)
+ *
+ * TODO arrays that fit in one of the pre-defined class sizes should
+ * not need to be pre-colored, but instead could be given a normal
+ * vreg name.  (Ignoring this for now since it is a good way to work
+ * out the kinks with arbitrary sized arrays.)
+ *
+ * TODO might be easier for debugging to split this into two passes,
+ * the first assigning vreg names in a way that we could ir3_print()
+ * the result.
+ */
+
+/* Sizes (in consecutive scalar registers) of the full-precision register
+ * classes.  The index into this table is also the flat class index used
+ * by size_to_class() and gpr_to_ra_reg[].
+ */
+static const unsigned class_sizes[] = {
+	1, 2, 3, 4,
+	4 + 4, /* txd + 1d/2d */
+	4 + 6, /* txd + 3d */
+};
+#define class_count ARRAY_SIZE(class_sizes)
+
+/* Sizes of the half-precision register classes; their flat class index
+ * is HALF_OFFSET + table index.
+ */
+static const unsigned half_class_sizes[] = {
+	1, 2, 3, 4,
+};
+#define half_class_count  ARRAY_SIZE(half_class_sizes)
+
+/* seems to just be used for compute shaders?  Seems like vec1 and vec3
+ * are sufficient (for now?)
+ *
+ * Flat class index is HIGH_OFFSET + table index.
+ */
+static const unsigned high_class_sizes[] = {
+	1, 3,
+};
+#define high_class_count ARRAY_SIZE(high_class_sizes)
+
+/* Total number of register classes across the three groups.  The value
+ * total_class_count itself is used as an extra pseudo-class slot for
+ * arrays (see ir3_ra_ctx::class_base).
+ */
+#define total_class_count (class_count + half_class_count + high_class_count)
+
+/* Below a0.x are normal regs.  RA doesn't need to assign a0.x/p0.x. */
+#define NUM_REGS             (4 * 48)  /* r0 to r47 */
+#define NUM_HIGH_REGS        (4 * 8)   /* r48 to r55 */
+/* scalar register number of the first high register (added back onto the
+ * group-relative number in reg_assign()):
+ */
+#define FIRST_HIGH_REG       (4 * 48)
+/* Number of virtual regs in a given class: */
+#define CLASS_REGS(i)        (NUM_REGS - (class_sizes[i] - 1))
+#define HALF_CLASS_REGS(i)   (NUM_REGS - (half_class_sizes[i] - 1))
+#define HIGH_CLASS_REGS(i)   (NUM_HIGH_REGS - (high_class_sizes[i] - 1))
+
+/* Flat class index of the first half / first high class: */
+#define HALF_OFFSET          (class_count)
+#define HIGH_OFFSET          (class_count + half_class_count)
+
+/* register-set, created one time, used for all shaders: */
+struct ir3_ra_reg_set {
+	/* the register_allocate register set holding every virtual reg: */
+	struct ra_regs *regs;
+	/* class handles returned by ra_alloc_reg_class(), one per entry of
+	 * class_sizes[] / half_class_sizes[] / high_class_sizes[]:
+	 */
+	unsigned int classes[class_count];
+	unsigned int half_classes[half_class_count];
+	unsigned int high_classes[high_class_count];
+	/* maps flat virtual register space to base gpr: */
+	uint16_t *ra_reg_to_gpr;
+	/* maps cls,gpr to flat virtual register space (first index is the
+	 * flat class index, ie. full classes, then HALF_OFFSET + i, then
+	 * HIGH_OFFSET + i):
+	 */
+	uint16_t **gpr_to_ra_reg;
+};
+
+/* Populate the q_values rows for one group of register classes (full,
+ * half or high).  'sizes'/'count' describe the group and 'off' is the
+ * flat class index of the group's first class.  Each row is allocated
+ * zeroed with total_class_count entries, and only the entries pairing
+ * two classes of this same group are written.
+ */
+static void
+build_q_values(unsigned int **q_values, unsigned off,
+		const unsigned *sizes, unsigned count)
+{
+	for (unsigned b = 0; b < count; b++) {
+		unsigned *row = rzalloc_array(q_values, unsigned, total_class_count);
+
+		q_values[off + b] = row;
+
+		/* From register_allocate.c:
+		 *
+		 * q(B,C) (indexed by C, B is this register class) in
+		 * Runeson/Nyström paper.  This is "how many registers of B could
+		 * the worst choice register from C conflict with".
+		 *
+		 * If we just let the register allocation algorithm compute these
+		 * values, is extremely expensive.  However, since all of our
+		 * registers are laid out, we can very easily compute them
+		 * ourselves.  View the register from C as fixed starting at GRF n
+		 * somewhere in the middle, and the register from B as sliding back
+		 * and forth.  Then the first register to conflict from B is the
+		 * one starting at n - class_size[B] + 1 and the last register to
+		 * conflict will start at n + class_size[B] - 1.  Therefore, the
+		 * number of conflicts from B is class_size[B] + class_size[C] - 1.
+		 *
+		 *   +-+-+-+-+-+-+     +-+-+-+-+-+-+
+		 * B | | | | | |n| --> | | | | | | |
+		 *   +-+-+-+-+-+-+     +-+-+-+-+-+-+
+		 *             +-+-+-+-+-+
+		 * C           |n| | | | |
+		 *             +-+-+-+-+-+
+		 *
+		 * (Idea copied from brw_fs_reg_allocate.cpp)
+		 */
+		for (unsigned c = 0; c < count; c++)
+			row[off + c] = sizes[b] + sizes[c] - 1;
+	}
+}
+
+/* One-time setup of RA register-set, which describes all the possible
+ * "virtual" registers and their interferences.  Ie. double register
+ * occupies (and conflicts with) two single registers, and so forth.
+ * Since registers do not need to be aligned to their class size, they
+ * can conflict with other registers in the same class too.  Ie:
+ *
+ *    Single (base) |  Double
+ *    --------------+---------------
+ *       R0         |  D0
+ *       R1         |  D0 D1
+ *       R2         |     D1 D2
+ *       R3         |        D2
+ *           .. and so on..
+ *
+ * (NOTE the disassembler uses notation like r0.x/y/z/w but those are
+ * really just four scalar registers.  Don't let that confuse you.)
+ *
+ * The returned set is allocated with 'compiler' as its ralloc parent.
+ * compiler->gpu_id selects whether half and full precision registers
+ * are made to conflict (gpu_id >= 600).
+ */
+struct ir3_ra_reg_set *
+ir3_ra_alloc_reg_set(struct ir3_compiler *compiler)
+{
+	struct ir3_ra_reg_set *set = rzalloc(compiler, struct ir3_ra_reg_set);
+	unsigned ra_reg_count, reg, first_half_reg, first_high_reg, base;
+	unsigned int **q_values;
+
+	/* calculate # of regs across all classes: */
+	ra_reg_count = 0;
+	for (unsigned i = 0; i < class_count; i++)
+		ra_reg_count += CLASS_REGS(i);
+	for (unsigned i = 0; i < half_class_count; i++)
+		ra_reg_count += HALF_CLASS_REGS(i);
+	for (unsigned i = 0; i < high_class_count; i++)
+		ra_reg_count += HIGH_CLASS_REGS(i);
+
+	/* allocate and populate q_values: */
+	q_values = ralloc_array(set, unsigned *, total_class_count);
+
+	build_q_values(q_values, 0, class_sizes, class_count);
+	build_q_values(q_values, HALF_OFFSET, half_class_sizes, half_class_count);
+	build_q_values(q_values, HIGH_OFFSET, high_class_sizes, high_class_count);
+
+	/* allocate the reg-set.. */
+	set->regs = ra_alloc_reg_set(set, ra_reg_count, true);
+	set->ra_reg_to_gpr = ralloc_array(set, uint16_t, ra_reg_count);
+	set->gpr_to_ra_reg = ralloc_array(set, uint16_t *, total_class_count);
+
+	/* .. and classes */
+	/* 'reg' is the running flat virtual register number.  Class 0 has
+	 * size 1 and is laid out first, so flat regs [0, NUM_REGS) are the
+	 * full-precision scalars and 'br' below indexes them directly.
+	 */
+	reg = 0;
+	for (unsigned i = 0; i < class_count; i++) {
+		set->classes[i] = ra_alloc_reg_class(set->regs);
+
+		set->gpr_to_ra_reg[i] = ralloc_array(set, uint16_t, CLASS_REGS(i));
+
+		for (unsigned j = 0; j < CLASS_REGS(i); j++) {
+			ra_class_add_reg(set->regs, set->classes[i], reg);
+
+			set->ra_reg_to_gpr[reg] = j;
+			set->gpr_to_ra_reg[i][j] = reg;
+
+			/* conflict with each scalar this multi-reg covers: */
+			for (unsigned br = j; br < j + class_sizes[i]; br++)
+				ra_add_transitive_reg_conflict(set->regs, br, reg);
+
+			reg++;
+		}
+	}
+
+	/* half-precision classes: same construction, with conflicts
+	 * expressed relative to the first half scalar (first_half_reg):
+	 */
+	first_half_reg = reg;
+	base = HALF_OFFSET;
+
+	for (unsigned i = 0; i < half_class_count; i++) {
+		set->half_classes[i] = ra_alloc_reg_class(set->regs);
+
+		set->gpr_to_ra_reg[base + i] =
+				ralloc_array(set, uint16_t, HALF_CLASS_REGS(i));
+
+		for (unsigned j = 0; j < HALF_CLASS_REGS(i); j++) {
+			ra_class_add_reg(set->regs, set->half_classes[i], reg);
+
+			set->ra_reg_to_gpr[reg] = j;
+			set->gpr_to_ra_reg[base + i][j] = reg;
+
+			for (unsigned br = j; br < j + half_class_sizes[i]; br++)
+				ra_add_transitive_reg_conflict(set->regs, br + first_half_reg, reg);
+
+			reg++;
+		}
+	}
+
+	/* high classes: same again, relative to first_high_reg.  Note that
+	 * ra_reg_to_gpr[] stores the number relative to the start of the
+	 * group (FIRST_HIGH_REG is added back in reg_assign()):
+	 */
+	first_high_reg = reg;
+	base = HIGH_OFFSET;
+
+	for (unsigned i = 0; i < high_class_count; i++) {
+		set->high_classes[i] = ra_alloc_reg_class(set->regs);
+
+		set->gpr_to_ra_reg[base + i] =
+				ralloc_array(set, uint16_t, HIGH_CLASS_REGS(i));
+
+		for (unsigned j = 0; j < HIGH_CLASS_REGS(i); j++) {
+			ra_class_add_reg(set->regs, set->high_classes[i], reg);
+
+			set->ra_reg_to_gpr[reg] = j;
+			set->gpr_to_ra_reg[base + i][j] = reg;
+
+			for (unsigned br = j; br < j + high_class_sizes[i]; br++)
+				ra_add_transitive_reg_conflict(set->regs, br + first_high_reg, reg);
+
+			reg++;
+		}
+	}
+
+	/* starting a6xx, half precision regs conflict w/ full precision regs: */
+	if (compiler->gpu_id >= 600) {
+		/* because of transitivity, we can get away with just setting up
+		 * conflicts between the first class of full and half regs:
+		 */
+		for (unsigned j = 0; j < CLASS_REGS(0) / 2; j++) {
+			/* full scalar j is paired with half scalars 2j and 2j+1: */
+			unsigned freg  = set->gpr_to_ra_reg[0][j];
+			unsigned hreg0 = set->gpr_to_ra_reg[HALF_OFFSET][(j * 2) + 0];
+			unsigned hreg1 = set->gpr_to_ra_reg[HALF_OFFSET][(j * 2) + 1];
+
+			ra_add_transitive_reg_conflict(set->regs, freg, hreg0);
+			ra_add_transitive_reg_conflict(set->regs, freg, hreg1);
+		}
+
+		// TODO also need to update q_values, but for now:
+		ra_set_finalize(set->regs, NULL);
+	} else {
+		ra_set_finalize(set->regs, q_values);
+	}
+
+	/* q_values is only needed for ra_set_finalize() above: */
+	ralloc_free(q_values);
+
+	return set;
+}
+
+/* additional block-data (per-block)
+ *
+ * Each member is a bitset with one bit per vreg name (ctx->alloc_count
+ * bits), allocated in ra_block_compute_live_ranges() and hung off
+ * block->data.
+ */
+struct ir3_ra_block_data {
+	BITSET_WORD *def;        /* variables defined before used in block */
+	BITSET_WORD *use;        /* variables used before defined in block */
+	BITSET_WORD *livein;     /* which defs reach entry point of block */
+	BITSET_WORD *liveout;    /* which defs reach exit point of block */
+};
+
+/* additional instruction-data (per-instruction), indexed by instr->ip */
+struct ir3_ra_instr_data {
+	/* cached instruction 'definer' info: */
+	struct ir3_instruction *defn;
+	/* off: offset of this instruction's register within the definer's
+	 *      group (added to the group's base gpr in reg_assign())
+	 * sz:  size of the group, in scalar registers
+	 * cls: flat register class index; -1 for instructions writing the
+	 *      address/predicate register, total_class_count for array dsts
+	 */
+	int off, sz, cls;
+};
+
+/* register-assign context, per-shader */
+struct ir3_ra_ctx {
+	struct ir3 *ir;
+	gl_shader_stage type;
+	bool frag_face;
+
+	/* shared register set (taken from ir->compiler->set in ir3_ra()): */
+	struct ir3_ra_reg_set *set;
+	/* interference graph; also the ralloc parent of instrd/def/use: */
+	struct ra_graph *g;
+	/* total number of vreg names (graph nodes), arrays included: */
+	unsigned alloc_count;
+	/* one per class, plus one slot for arrays: */
+	unsigned class_alloc_count[total_class_count + 1];
+	/* first vreg name of each class; name = class_base[cls] + instr->name */
+	unsigned class_base[total_class_count + 1];
+	/* number of instructions visited by ra_block_name_instructions(): */
+	unsigned instr_cnt;
+	/* per-name ip of the first def / last use: */
+	unsigned *def, *use;     /* def/use table */
+	/* per-instruction data, indexed by instr->ip: */
+	struct ir3_ra_instr_data *instrd;
+};
+
+/* Do the half-open ranges [a_start, a_end) and [b_start, b_end) overlap?
+ * Ranges that merely touch (one ends where the other starts) do not.
+ */
+static inline bool
+intersects(unsigned a_start, unsigned a_end, unsigned b_start, unsigned b_end)
+{
+	const bool a_begins_before_b_ends = (a_start < b_end);
+	const bool b_begins_before_a_ends = (b_start < a_end);
+
+	return a_begins_before_b_ends && b_begins_before_a_ends;
+}
+
+/* Is the instruction's destination (regs[0]) flagged half precision? */
+static bool
+is_half(struct ir3_instruction *instr)
+{
+	const struct ir3_register *dst = instr->regs[0];
+
+	return (dst->flags & IR3_REG_HALF) != 0;
+}
+
+/* Is the instruction's destination (regs[0]) flagged as a high register? */
+static bool
+is_high(struct ir3_instruction *instr)
+{
+	const struct ir3_register *dst = instr->regs[0];
+
+	return (dst->flags & IR3_REG_HIGH) != 0;
+}
+
+/* Map a group size (in scalar registers) to the flat index of the
+ * smallest class in the requested group that can hold it.  'high' takes
+ * precedence over 'half'.  Asserts (debug builds) and returns -1 if no
+ * class in the group is large enough.
+ */
+static int
+size_to_class(unsigned sz, bool half, bool high)
+{
+	const unsigned *sizes;
+	unsigned count, first;
+
+	/* pick the size table and the flat index of its first class: */
+	if (high) {
+		sizes = high_class_sizes;
+		count = high_class_count;
+		first = HIGH_OFFSET;
+	} else if (half) {
+		sizes = half_class_sizes;
+		count = half_class_count;
+		first = HALF_OFFSET;
+	} else {
+		sizes = class_sizes;
+		count = class_count;
+		first = 0;
+	}
+
+	for (unsigned i = 0; i < count; i++) {
+		if (sizes[i] >= sz)
+			return i + first;
+	}
+
+	debug_assert(0);
+	return -1;
+}
+
+/* Does this instruction write a normal temp register?  Stores, const or
+ * immediate destinations, and writes to a0.x / p0.x do not count.
+ */
+static bool
+writes_gpr(struct ir3_instruction *instr)
+{
+	struct ir3_register *dst;
+
+	if (is_store(instr))
+		return false;
+
+	/* is dest a normal temp register: */
+	dst = instr->regs[0];
+
+	if (dst->flags & (IR3_REG_CONST | IR3_REG_IMMED))
+		return false;
+
+	return (dst->num != regid(REG_A0, 0)) &&
+			(dst->num != regid(REG_P0, 0));
+}
+
+/* True if 'a' has a lower ip than 'b'.  An instruction flagged unused is
+ * never considered to come before anything.
+ */
+static bool
+instr_before(struct ir3_instruction *a, struct ir3_instruction *b)
+{
+	const bool a_unused = (a->flags & IR3_INSTR_UNUSED) != 0;
+
+	return !a_unused && (a->ip < b->ip);
+}
+
+/* Find the 'definer' of the group of registers that 'instr' belongs to,
+ * ie. the instruction considered to define the whole group from a
+ * use/def perspective.  On return *sz is the size of the group and *off
+ * the offset of instr's register within it.  The result is cached in
+ * ctx->instrd[instr->ip], and returned straight from the cache on later
+ * calls.
+ */
+static struct ir3_instruction *
+get_definer(struct ir3_ra_ctx *ctx, struct ir3_instruction *instr,
+		int *sz, int *off)
+{
+	struct ir3_ra_instr_data *id = &ctx->instrd[instr->ip];
+	struct ir3_instruction *d = NULL;
+
+	/* already computed: */
+	if (id->defn) {
+		*sz = id->sz;
+		*off = id->off;
+		return id->defn;
+	}
+
+	if (instr->opc == OPC_META_FI) {
+		/* What about the case where collect is subset of array, we
+		 * need to find the distance between where actual array starts
+		 * and fanin..  that probably doesn't happen currently.
+		 */
+		struct ir3_register *src;
+		int dsz, doff;
+
+		/* note: don't use foreach_ssa_src as this gets called once
+		 * while assigning regs (which clears SSA flag)
+		 */
+		foreach_src_n(src, n, instr) {
+			struct ir3_instruction *dd;
+			if (!src->instr)
+				continue;
+
+			dd = get_definer(ctx, src->instr, &dsz, &doff);
+
+			/* keep the earliest definer among the fanin sources: */
+			if ((!d) || instr_before(dd, d)) {
+				d = dd;
+				*sz = dsz;
+				*off = doff - n;
+			}
+		}
+
+	} else if (instr->cp.right || instr->cp.left) {
+		/* covers also the meta:fo case, which ends up w/ single
+		 * scalar instructions for each component:
+		 */
+		struct ir3_instruction *f = ir3_neighbor_first(instr);
+
+		/* by definition, the entire sequence forms one linked list
+		 * of single scalar register nodes (even if some of them may
+		 * be fanouts from a texture sample (for example) instr.  We
+		 * just need to walk the list finding the first element of
+		 * the group defined (lowest ip)
+		 */
+		int cnt = 0;
+
+		/* need to skip over unused in the group: */
+		while (f && (f->flags & IR3_INSTR_UNUSED)) {
+			f = f->cp.right;
+			cnt++;
+		}
+
+		while (f) {
+			if ((!d) || instr_before(f, d))
+				d = f;
+			/* position of instr in the neighbor list is its offset: */
+			if (f == instr)
+				*off = cnt;
+			f = f->cp.right;
+			cnt++;
+		}
+
+		*sz = cnt;
+
+	} else {
+		/* second case is looking directly at the instruction which
+		 * produces multiple values (eg, texture sample), rather
+		 * than the fanout nodes that point back to that instruction.
+		 * This isn't quite right, because it may be part of a larger
+		 * group, such as:
+		 *
+		 *     sam (f32)(xyzw)r0.x, ...
+		 *     add r1.x, ...
+		 *     add r1.y, ...
+		 *     sam (f32)(xyzw)r2.x, r0.w  <-- (r0.w, r1.x, r1.y)
+		 *
+		 * need to come up with a better way to handle that case.
+		 */
+		if (instr->address) {
+			*sz = instr->regs[0]->size;
+		} else {
+			*sz = util_last_bit(instr->regs[0]->wrmask);
+		}
+		*off = 0;
+		d = instr;
+	}
+
+	/* if the definer found is itself a fanout, chase through to the
+	 * definer of the instruction it fans out from:
+	 */
+	if (d->opc == OPC_META_FO) {
+		struct ir3_instruction *dd;
+		int dsz, doff;
+
+		dd = get_definer(ctx, d->regs[1]->instr, &dsz, &doff);
+
+		/* by definition, should come before: */
+		debug_assert(instr_before(dd, d));
+
+		*sz = MAX2(*sz, dsz);
+
+		debug_assert(instr->opc == OPC_META_FO);
+		*off = MAX2(*off, instr->fo.off);
+
+		d = dd;
+	}
+
+	/* cache the result for later lookups: */
+	id->defn = d;
+	id->sz = *sz;
+	id->off = *off;
+
+	return d;
+}
+
+/* Classify the destination of every instruction in the block.  Writers
+ * of the address/predicate register get class -1, array destinations get
+ * the extra array slot (total_class_count), and everything else is
+ * resolved to its definer and mapped to a register class by group size.
+ */
+static void
+ra_block_find_definers(struct ir3_ra_ctx *ctx, struct ir3_block *block)
+{
+	list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) {
+		struct ir3_ra_instr_data *data;
+
+		if (instr->regs_count == 0)
+			continue;
+
+		data = &ctx->instrd[instr->ip];
+
+		/* couple special cases: */
+		if (writes_addr(instr) || writes_pred(instr)) {
+			data->cls = -1;
+			continue;
+		}
+
+		if (instr->regs[0]->flags & IR3_REG_ARRAY) {
+			data->cls = total_class_count;
+			continue;
+		}
+
+		/* normal case: */
+		data->defn = get_definer(ctx, instr, &data->sz, &data->off);
+		data->cls = size_to_class(data->sz,
+				is_half(data->defn), is_high(data->defn));
+	}
+}
+
+/* Walk the block counting instructions (ctx->instr_cnt), and hand out a
+ * per-class name to each instruction that is its own definer and writes
+ * a gpr, bumping the per-class and total name counts as we go.
+ */
+static void
+ra_block_name_instructions(struct ir3_ra_ctx *ctx, struct ir3_block *block)
+{
+	list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) {
+		struct ir3_ra_instr_data *data = &ctx->instrd[instr->ip];
+		bool in_named_class;
+
+#ifdef DEBUG
+		instr->name = ~0;
+#endif
+
+		ctx->instr_cnt++;
+
+		/* only definers that write a normal register get a name: */
+		if ((instr->regs_count == 0) || !writes_gpr(instr) ||
+				(data->defn != instr))
+			continue;
+
+		/* arrays which don't fit in one of the pre-defined class
+		 * sizes are pre-colored:
+		 */
+		in_named_class = (data->cls >= 0) && (data->cls < total_class_count);
+		if (!in_named_class)
+			continue;
+
+		instr->name = ctx->class_alloc_count[data->cls]++;
+		ctx->alloc_count++;
+	}
+}
+
+/* Per-shader setup: find definers, assign vreg names, lay out the name
+ * space (classes first, then array elements) and allocate the
+ * interference graph plus the def/use tables.
+ */
+static void
+ra_init(struct ir3_ra_ctx *ctx)
+{
+	unsigned n, base;
+
+	ir3_clear_mark(ctx->ir);
+	n = ir3_count_instructions(ctx->ir);
+
+	/* allocated without a parent for now, since the graph does not exist
+	 * yet; handed over to ctx->g below:
+	 */
+	ctx->instrd = rzalloc_array(NULL, struct ir3_ra_instr_data, n);
+
+	list_for_each_entry (struct ir3_block, block, &ctx->ir->block_list, node) {
+		ra_block_find_definers(ctx, block);
+	}
+
+	list_for_each_entry (struct ir3_block, block, &ctx->ir->block_list, node) {
+		ra_block_name_instructions(ctx, block);
+	}
+
+	/* figure out the base register name for each class.  The
+	 * actual ra name is class_base[cls] + instr->name;
+	 */
+	ctx->class_base[0] = 0;
+	for (unsigned i = 1; i <= total_class_count; i++) {
+		ctx->class_base[i] = ctx->class_base[i-1] +
+				ctx->class_alloc_count[i-1];
+	}
+
+	/* and vreg names for array elements: */
+	base = ctx->class_base[total_class_count];
+	list_for_each_entry (struct ir3_array, arr, &ctx->ir->array_list, node) {
+		arr->base = base;
+		ctx->class_alloc_count[total_class_count] += arr->length;
+		base += arr->length;
+	}
+	ctx->alloc_count += ctx->class_alloc_count[total_class_count];
+
+	ctx->g = ra_alloc_interference_graph(ctx->set->regs, ctx->alloc_count);
+	/* everything per-shader hangs off the graph, so that ra_destroy()
+	 * only has to free ctx->g:
+	 */
+	ralloc_steal(ctx->g, ctx->instrd);
+	ctx->def = rzalloc_array(ctx->g, unsigned, ctx->alloc_count);
+	ctx->use = rzalloc_array(ctx->g, unsigned, ctx->alloc_count);
+}
+
+/* Compute the flat vreg name (graph node index) for a definer in the
+ * given class: the class's base name plus the definer's per-class name.
+ */
+static unsigned
+__ra_name(struct ir3_ra_ctx *ctx, int cls, struct ir3_instruction *defn)
+{
+	debug_assert(cls >= 0);
+	debug_assert(cls < total_class_count);  /* we shouldn't get arrays here.. */
+
+	const unsigned flat = ctx->class_base[cls] + defn->name;
+
+	debug_assert(flat < ctx->alloc_count);
+
+	return flat;
+}
+
+/* Flat vreg name for the group described by per-instruction data 'id'. */
+static int
+ra_name(struct ir3_ra_ctx *ctx, struct ir3_ra_instr_data *id)
+{
+	/* TODO handle name mapping for arrays */
+	struct ir3_instruction *definer = id->defn;
+	const int cls = id->cls;
+
+	return __ra_name(ctx, cls, definer);
+}
+
+/* Tear down per-shader RA state.  The per-instruction data and def/use
+ * tables were allocated with (or stolen into) ctx->g as their ralloc
+ * parent in ra_init(), so only the graph is freed explicitly.
+ */
+static void
+ra_destroy(struct ir3_ra_ctx *ctx)
+{
+	struct ra_graph *graph = ctx->g;
+
+	ralloc_free(graph);
+}
+
+/* Compute block-local def/use information for one block.  Allocates the
+ * block's ir3_ra_block_data (def/use/livein/liveout bitsets, hung off
+ * block->data), records the first-def / last-use ip of every vreg name
+ * in ctx->def[] / ctx->use[], widens the [start_ip, end_ip] range of any
+ * array touched, and sets the register class of each graph node.
+ */
+static void
+ra_block_compute_live_ranges(struct ir3_ra_ctx *ctx, struct ir3_block *block)
+{
+	struct ir3_ra_block_data *bd;
+	unsigned bitset_words = BITSET_WORDS(ctx->alloc_count);
+
+/* Record a write of 'name' by 'instr'.  (No trailing ';' after the
+ * while(0), so that the macro behaves like a single statement.)
+ */
+#define def(name, instr) \
+		do { \
+			/* defined on first write: */ \
+			if (!ctx->def[(name)]) \
+				ctx->def[(name)] = (instr)->ip; \
+			ctx->use[(name)] = (instr)->ip; \
+			BITSET_SET(bd->def, (name)); \
+		} while(0)
+
+/* Record a read of 'name' by 'instr'; it only counts as a block-level
+ * 'use' if the block has not already defined the name.
+ */
+#define use(name, instr) \
+		do { \
+			ctx->use[(name)] = MAX2(ctx->use[(name)], (instr)->ip); \
+			if (!BITSET_TEST(bd->def, (name))) \
+				BITSET_SET(bd->use, (name)); \
+		} while(0)
+
+	bd = rzalloc(ctx->g, struct ir3_ra_block_data);
+
+	bd->def     = rzalloc_array(bd, BITSET_WORD, bitset_words);
+	bd->use     = rzalloc_array(bd, BITSET_WORD, bitset_words);
+	bd->livein  = rzalloc_array(bd, BITSET_WORD, bitset_words);
+	bd->liveout = rzalloc_array(bd, BITSET_WORD, bitset_words);
+
+	block->data = bd;
+
+	list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) {
+		struct ir3_instruction *src;
+		struct ir3_register *reg;
+
+		if (instr->regs_count == 0)
+			continue;
+
+		/* There are a couple special cases to deal with here:
+		 *
+		 * fanout: used to split values from a higher class to a lower
+		 *     class, for example split the results of a texture fetch
+		 *     into individual scalar values;  We skip over these from
+		 *     a 'def' perspective, and for a 'use' we walk the chain
+		 *     up to the defining instruction.
+		 *
+		 * fanin: used to collect values from lower class and assemble
+		 *     them together into a higher class, for example arguments
+		 *     to texture sample instructions;  We consider these to be
+		 *     defined at the earliest fanin source.
+		 *
+		 * Most of this is handled in the get_definer() helper.
+		 *
+		 * In either case, we trace the instruction back to the original
+		 * definer and consider that as the def/use ip.
+		 */
+
+		if (writes_gpr(instr)) {
+			struct ir3_ra_instr_data *id = &ctx->instrd[instr->ip];
+			struct ir3_register *dst = instr->regs[0];
+
+			if (dst->flags & IR3_REG_ARRAY) {
+				struct ir3_array *arr =
+					ir3_lookup_array(ctx->ir, dst->array.id);
+				unsigned i;
+
+				arr->start_ip = MIN2(arr->start_ip, instr->ip);
+				arr->end_ip = MAX2(arr->end_ip, instr->ip);
+
+				/* set the node class now.. in case we don't encounter
+				 * this array dst again.  From register_alloc algo's
+				 * perspective, these are all single/scalar regs:
+				 */
+				for (i = 0; i < arr->length; i++) {
+					unsigned name = arr->base + i;
+					ra_set_node_class(ctx->g, name, ctx->set->classes[0]);
+				}
+
+				/* indirect write is treated like a write to all array
+				 * elements, since we don't know which one is actually
+				 * written:
+				 */
+				if (dst->flags & IR3_REG_RELATIV) {
+					for (i = 0; i < arr->length; i++) {
+						unsigned name = arr->base + i;
+						def(name, instr);
+					}
+				} else {
+					unsigned name = arr->base + dst->array.offset;
+					def(name, instr);
+				}
+
+			} else if (id->defn == instr) {
+				unsigned name = ra_name(ctx, id);
+
+				/* since we are in SSA at this point: */
+				debug_assert(!BITSET_TEST(bd->use, name));
+
+				def(name, id->defn);
+
+				if (is_high(id->defn)) {
+					ra_set_node_class(ctx->g, name,
+							ctx->set->high_classes[id->cls - HIGH_OFFSET]);
+				} else if (is_half(id->defn)) {
+					ra_set_node_class(ctx->g, name,
+							ctx->set->half_classes[id->cls - HALF_OFFSET]);
+				} else {
+					ra_set_node_class(ctx->g, name,
+							ctx->set->classes[id->cls]);
+				}
+			}
+		}
+
+		foreach_src(reg, instr) {
+			if (reg->flags & IR3_REG_ARRAY) {
+				struct ir3_array *arr =
+					ir3_lookup_array(ctx->ir, reg->array.id);
+				arr->start_ip = MIN2(arr->start_ip, instr->ip);
+				arr->end_ip = MAX2(arr->end_ip, instr->ip);
+
+				/* indirect read is treated like a read from all array
+				 * elements, since we don't know which one is actually
+				 * read:
+				 */
+				if (reg->flags & IR3_REG_RELATIV) {
+					unsigned i;
+					for (i = 0; i < arr->length; i++) {
+						unsigned name = arr->base + i;
+						use(name, instr);
+					}
+				} else {
+					unsigned name = arr->base + reg->array.offset;
+					use(name, instr);
+					/* NOTE: arrays are not SSA so unconditionally
+					 * set use bit:
+					 */
+					BITSET_SET(bd->use, name);
+					debug_assert(reg->array.offset < arr->length);
+				}
+			} else if ((src = ssa(reg)) && writes_gpr(src)) {
+				unsigned name = ra_name(ctx, &ctx->instrd[src->ip]);
+				use(name, instr);
+			}
+		}
+	}
+}
+
+/* the helper macros are private to the function above: */
+#undef def
+#undef use
+
+/* One iteration of the livein/liveout dataflow over all blocks:
+ *
+ *   livein  |= use | (liveout & ~def)
+ *   liveout |= livein of each successor
+ *
+ * Returns true if any bit was added, ie. the caller should iterate again
+ * until a fixed point is reached.
+ */
+static bool
+ra_compute_livein_liveout(struct ir3_ra_ctx *ctx)
+{
+	const unsigned nwords = BITSET_WORDS(ctx->alloc_count);
+	bool changed = false;
+
+	list_for_each_entry (struct ir3_block, block, &ctx->ir->block_list, node) {
+		struct ir3_ra_block_data *bd = block->data;
+
+		/* update livein: */
+		for (unsigned w = 0; w < nwords; w++) {
+			const BITSET_WORD wanted =
+				(bd->use[w] | (bd->liveout[w] & ~bd->def[w]));
+			const BITSET_WORD added = wanted & ~bd->livein[w];
+
+			if (!added)
+				continue;
+
+			bd->livein[w] |= added;
+			changed = true;
+		}
+
+		/* update liveout: */
+		for (unsigned s = 0; s < ARRAY_SIZE(block->successors); s++) {
+			struct ir3_block *succ = block->successors[s];
+			struct ir3_ra_block_data *succ_bd;
+
+			if (!succ)
+				continue;
+
+			succ_bd = succ->data;
+
+			for (unsigned w = 0; w < nwords; w++) {
+				const BITSET_WORD added =
+					(succ_bd->livein[w] & ~bd->liveout[w]);
+
+				if (!added)
+					continue;
+
+				bd->liveout[w] |= added;
+				changed = true;
+			}
+		}
+	}
+
+	return changed;
+}
+
+/* Debug helper: print the indices of all set bits among the first 'cnt'
+ * bits of 'bs' as a comma separated list, prefixed by 'name'.
+ */
+static void
+print_bitset(const char *name, BITSET_WORD *bs, unsigned cnt)
+{
+	bool need_comma = false;
+
+	debug_printf("  %s:", name);
+
+	for (unsigned bit = 0; bit < cnt; bit++) {
+		if (!BITSET_TEST(bs, bit))
+			continue;
+
+		if (need_comma)
+			debug_printf(",");
+		debug_printf(" %04u", bit);
+		need_comma = true;
+	}
+
+	debug_printf("\n");
+}
+
+/* Build the interference graph: compute per-block def/use, iterate the
+ * livein/liveout dataflow to a fixed point, stretch each name's
+ * [def, use] range (and each array's [start_ip, end_ip] range) across
+ * blocks it is live into / out of, keep shader outputs live to the end,
+ * and finally add an interference edge between every pair of names
+ * whose ranges intersect.
+ */
+static void
+ra_add_interference(struct ir3_ra_ctx *ctx)
+{
+	struct ir3 *ir = ctx->ir;
+
+	/* initialize array live ranges: */
+	list_for_each_entry (struct ir3_array, arr, &ir->array_list, node) {
+		arr->start_ip = ~0;
+		arr->end_ip = 0;
+	}
+
+	/* compute live ranges (use/def) on a block level, also updating
+	 * block's def/use bitmasks (used below to calculate per-block
+	 * livein/liveout):
+	 */
+	list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
+		ra_block_compute_live_ranges(ctx, block);
+	}
+
+	/* update per-block livein/liveout: */
+	while (ra_compute_livein_liveout(ctx)) {}
+
+	if (ir3_shader_debug & IR3_DBG_OPTMSGS) {
+		debug_printf("AFTER LIVEIN/OUT:\n");
+		ir3_print(ir);
+		list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
+			struct ir3_ra_block_data *bd = block->data;
+			debug_printf("block%u:\n", block_id(block));
+			print_bitset("  def", bd->def, ctx->alloc_count);
+			print_bitset("  use", bd->use, ctx->alloc_count);
+			print_bitset("  l/i", bd->livein, ctx->alloc_count);
+			print_bitset("  l/o", bd->liveout, ctx->alloc_count);
+		}
+		list_for_each_entry (struct ir3_array, arr, &ir->array_list, node) {
+			debug_printf("array%u:\n", arr->id);
+			debug_printf("  length:   %u\n", arr->length);
+			debug_printf("  start_ip: %u\n", arr->start_ip);
+			debug_printf("  end_ip:   %u\n", arr->end_ip);
+		}
+	}
+
+	/* extend start/end ranges based on livein/liveout info from cfg: */
+	list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
+		struct ir3_ra_block_data *bd = block->data;
+
+		for (unsigned i = 0; i < ctx->alloc_count; i++) {
+			if (BITSET_TEST(bd->livein, i)) {
+				ctx->def[i] = MIN2(ctx->def[i], block->start_ip);
+				ctx->use[i] = MAX2(ctx->use[i], block->start_ip);
+			}
+
+			if (BITSET_TEST(bd->liveout, i)) {
+				ctx->def[i] = MIN2(ctx->def[i], block->end_ip);
+				ctx->use[i] = MAX2(ctx->use[i], block->end_ip);
+			}
+		}
+
+		list_for_each_entry (struct ir3_array, arr, &ctx->ir->array_list, node) {
+			for (unsigned i = 0; i < arr->length; i++) {
+				/* live into the block: range starts no later than
+				 * the start of the block:
+				 */
+				if (BITSET_TEST(bd->livein, i + arr->base)) {
+					arr->start_ip = MIN2(arr->start_ip, block->start_ip);
+				}
+				/* live out of the block: range extends at least to
+				 * the end of the block.  (This must test liveout,
+				 * not livein, mirroring the per-name loop above.)
+				 */
+				if (BITSET_TEST(bd->liveout, i + arr->base)) {
+					arr->end_ip = MAX2(arr->end_ip, block->end_ip);
+				}
+			}
+		}
+	}
+
+	/* need to fix things up to keep outputs live: */
+	for (unsigned i = 0; i < ir->noutputs; i++) {
+		struct ir3_instruction *instr = ir->outputs[i];
+		unsigned name;
+
+		/* output slots may be empty (ir3_print() skips them too): */
+		if (!instr)
+			continue;
+
+		name = ra_name(ctx, &ctx->instrd[instr->ip]);
+		ctx->use[name] = ctx->instr_cnt;
+	}
+
+	/* any two names whose [def, use) ranges overlap interfere: */
+	for (unsigned i = 0; i < ctx->alloc_count; i++) {
+		for (unsigned j = 0; j < ctx->alloc_count; j++) {
+			if (intersects(ctx->def[i], ctx->use[i],
+					ctx->def[j], ctx->use[j])) {
+				ra_add_node_interference(ctx->g, i, j);
+			}
+		}
+	}
+}
+
+/* Patch up an instruction whose destination ended up half precision:
+ * cat1 (mov) and cat5 get their type field converted with half_type(),
+ * and cat3 32-bit opcodes are swapped for their 16-bit counterparts.
+ * Other categories are left untouched.
+ */
+static void
+fixup_half_instr_dst(struct ir3_instruction *instr)
+{
+	switch (opc_cat(instr->opc)) {
+	case 1: /* move instructions */
+		instr->cat1.dst_type = half_type(instr->cat1.dst_type);
+		break;
+	case 5:
+		instr->cat5.type = half_type(instr->cat5.type);
+		break;
+	case 3:
+		switch (instr->opc) {
+		/* instructions may already be fixed up: */
+		case OPC_MAD_F16:
+		case OPC_SEL_B16:
+		case OPC_SEL_S16:
+		case OPC_SEL_F16:
+		case OPC_SAD_S16:
+			break;
+		/* 32-bit -> 16-bit variants: */
+		case OPC_MAD_F32:
+			instr->opc = OPC_MAD_F16;
+			break;
+		case OPC_SEL_B32:
+			instr->opc = OPC_SEL_B16;
+			break;
+		case OPC_SEL_S32:
+			instr->opc = OPC_SEL_S16;
+			break;
+		case OPC_SEL_F32:
+			instr->opc = OPC_SEL_F16;
+			break;
+		case OPC_SAD_S32:
+			instr->opc = OPC_SAD_S16;
+			break;
+		default:
+			assert(0);
+			break;
+		}
+		break;
+	default:
+		/* nothing to fix up for the other categories */
+		break;
+	}
+}
+/* Patch up an instruction that reads a half precision source; only mov
+ * needs it, where the source type is converted with half_type().
+ */
+static void
+fixup_half_instr_src(struct ir3_instruction *instr)
+{
+	if (instr->opc != OPC_MOV)
+		return;
+
+	instr->cat1.src_type = half_type(instr->cat1.src_type);
+}
+
+/* Write the register chosen by the allocator back into 'reg'.
+ *
+ * For array registers the graph node is looked up via the array's base
+ * name; otherwise via the definer recorded for 'instr', with the
+ * instruction's offset within the definer's group added on.
+ *
+ * NOTE: instr could be NULL for IR3_REG_ARRAY case, for the first
+ * array access(es) which do not have any previous access to depend
+ * on from scheduling point of view
+ */
+static void
+reg_assign(struct ir3_ra_ctx *ctx, struct ir3_register *reg,
+		struct ir3_instruction *instr)
+{
+	struct ir3_ra_instr_data *id;
+	unsigned name, num;
+
+	if (reg->flags & IR3_REG_ARRAY) {
+		struct ir3_array *arr =
+			ir3_lookup_array(ctx->ir, reg->array.id);
+
+		name = arr->base + reg->array.offset;
+		num = ctx->set->ra_reg_to_gpr[ra_get_node_reg(ctx->g, name)];
+
+		if (reg->flags & IR3_REG_RELATIV) {
+			reg->array.offset = num;
+		} else {
+			reg->num = num;
+			reg->flags &= ~IR3_REG_SSA;
+		}
+
+		reg->flags &= ~IR3_REG_ARRAY;
+		return;
+	}
+
+	/* not an array, so instr must be valid from here on: */
+	id = &ctx->instrd[instr->ip];
+	if (!id->defn)
+		return;
+
+	name = ra_name(ctx, id);
+	num = ctx->set->ra_reg_to_gpr[ra_get_node_reg(ctx->g, name)] + id->off;
+
+	debug_assert(!(reg->flags & IR3_REG_RELATIV));
+
+	/* ra_reg_to_gpr[] is relative to the start of the high group: */
+	if (is_high(id->defn))
+		num += FIRST_HIGH_REG;
+
+	reg->num = num;
+	reg->flags &= ~IR3_REG_SSA;
+
+	if (is_half(id->defn))
+		reg->flags |= IR3_REG_HALF;
+}
+
+/* Apply the allocation result to every instruction in the block: assign
+ * the destination (if it writes a gpr) and each source register, fixing
+ * up the instruction where a register turned out to be half precision.
+ */
+static void
+ra_block_alloc(struct ir3_ra_ctx *ctx, struct ir3_block *block)
+{
+	list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) {
+		struct ir3_register *reg;
+
+		if (instr->regs_count == 0)
+			continue;
+
+		if (writes_gpr(instr)) {
+			struct ir3_register *dst = instr->regs[0];
+
+			reg_assign(ctx, dst, instr);
+			if (dst->flags & IR3_REG_HALF)
+				fixup_half_instr_dst(instr);
+		}
+
+		foreach_src_n(reg, n, instr) {
+			struct ir3_instruction *src = reg->instr;
+			struct ir3_register *src_reg;
+
+			/* Note: reg->instr could be null for IR3_REG_ARRAY */
+			if (!src && !(reg->flags & IR3_REG_ARRAY))
+				continue;
+
+			src_reg = instr->regs[n+1];
+
+			reg_assign(ctx, src_reg, src);
+			if (src_reg->flags & IR3_REG_HALF)
+				fixup_half_instr_src(instr);
+		}
+	}
+}
+
+/* Run the allocation: pre-colour array elements with consecutive scalar
+ * registers, let the graph colouring allocator assign everything else,
+ * then write the results back into the instructions.
+ *
+ * Returns 0 on success, or -1 if allocation failed (either the graph
+ * could not be coloured, or an array could not be placed within the
+ * scalar register file).
+ */
+static int
+ra_alloc(struct ir3_ra_ctx *ctx)
+{
+	/* pre-assign array elements:
+	 */
+	list_for_each_entry (struct ir3_array, arr, &ctx->ir->array_list, node) {
+		unsigned base = 0;
+
+		/* end_ip is still zero if the array was never accessed: */
+		if (arr->end_ip == 0)
+			continue;
+
+		/* figure out what else we conflict with which has already
+		 * been assigned:
+		 */
+retry:
+		list_for_each_entry (struct ir3_array, arr2, &ctx->ir->array_list, node) {
+			/* only arrays earlier in the list have been placed: */
+			if (arr2 == arr)
+				break;
+			if (arr2->end_ip == 0)
+				continue;
+			/* if it intersects with liverange AND register range.. */
+			if (intersects(arr->start_ip, arr->end_ip,
+					arr2->start_ip, arr2->end_ip) &&
+				intersects(base, base + arr->length,
+					arr2->reg, arr2->reg + arr2->length)) {
+				/* move past the conflicting array and re-check: */
+				base = MAX2(base, arr2->reg + arr2->length);
+				goto retry;
+			}
+		}
+
+		/* gpr_to_ra_reg[0] only has CLASS_REGS(0) entries, so fail the
+		 * allocation rather than index past the end of the table:
+		 */
+		if ((base + arr->length) > CLASS_REGS(0))
+			return -1;
+
+		arr->reg = base;
+
+		for (unsigned i = 0; i < arr->length; i++) {
+			unsigned name, reg;
+
+			name = arr->base + i;
+			reg = ctx->set->gpr_to_ra_reg[0][base++];
+
+			ra_set_node_reg(ctx->g, name, reg);
+		}
+	}
+
+	if (!ra_allocate(ctx->g))
+		return -1;
+
+	list_for_each_entry (struct ir3_block, block, &ctx->ir->block_list, node) {
+		ra_block_alloc(ctx, block);
+	}
+
+	return 0;
+}
+
+/* Register allocation entry point for one shader.  Sets up the per-shader
+ * context, builds the interference graph, runs the allocation and tears
+ * the context down again.  Returns the result of ra_alloc() (0 on
+ * success, -1 on failure).  'frag_coord' is currently unused here.
+ */
+int
+ir3_ra(struct ir3 *ir, gl_shader_stage type,
+		bool frag_coord, bool frag_face)
+{
+	struct ir3_ra_ctx ctx = {0};
+	int result;
+
+	ctx.ir = ir;
+	ctx.type = type;
+	ctx.frag_face = frag_face;
+	ctx.set = ir->compiler->set;
+
+	ra_init(&ctx);
+	ra_add_interference(&ctx);
+	result = ra_alloc(&ctx);
+	ra_destroy(&ctx);
+
+	return result;
+}
diff --git a/src/freedreno/ir3/ir3_sched.c b/src/freedreno/ir3/ir3_sched.c
new file mode 100644
index 00000000000..6552980d90c
--- /dev/null
+++ b/src/freedreno/ir3/ir3_sched.c
@@ -0,0 +1,818 @@
+/*
+ * Copyright (C) 2014 Rob Clark <[email protected]>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Authors:
+ *    Rob Clark <[email protected]>
+ */
+
+
+#include "util/u_math.h"
+
+#include "ir3.h"
+
+/*
+ * Instruction Scheduling:
+ *
+ * A recursive depth based scheduling algo.  Recursively find an eligible
+ * instruction to schedule from the deepest instruction (recursing through
+ * its unscheduled src instructions).  Normally this would result in a
+ * lot of re-traversal of the same instructions, so we cache results in
+ * instr->data (and clear cached results that would be no longer valid
+ * after scheduling an instruction).
+ *
+ * There are a few special cases that need to be handled, since sched
+ * is currently independent of register allocation.  Usages of address
+ * register (a0.x) or predicate register (p0.x) must be serialized.  Ie.
+ * if you have two pairs of instructions that write the same special
+ * register and then read it, then those pairs cannot be interleaved.
+ * To solve this, when we are in such a scheduling "critical section",
+ * and we encounter a conflicting write to a special register, we try
+ * to schedule any remaining instructions that use that value first.
+ */
+
+struct ir3_sched_ctx {
+	struct ir3_block *block;           /* the current block */
+	struct list_head depth_list;       /* depth sorted unscheduled instrs */
+	struct ir3_instruction *scheduled; /* last scheduled instr XXX remove */
+	struct ir3_instruction *addr;      /* scheduled instr that wrote a0.x, if any */
+	struct ir3_instruction *pred;      /* scheduled instr that wrote p0.x, if any */
+	bool error;                        /* set by sched_block() when nothing can be scheduled */
+};
+
+static bool is_sfu_or_mem(struct ir3_instruction *instr)
+{	/* schedule() inserts a nop between two consecutive instrs of this kind */
+	return is_sfu(instr) || is_mem(instr);
+}
+
+#define NULL_INSTR ((void *)~0)	/* instr->data value that caches "nothing schedulable" */
+
+static void
+clear_cache(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr)
+{
+	/* instr == NULL flushes everything; else only entries naming instr/none: */
+	list_for_each_entry (struct ir3_instruction, cur, &ctx->depth_list, node)
+		if (!instr || (cur->data == NULL_INSTR) || (cur->data == instr))
+			cur->data = NULL;
+}
+
+static void
+schedule(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr)
+{
+	debug_assert(ctx->block == instr->block);
+
+	/* An sfu/mem instr directly after another sfu/mem instr gets a nop
+	 * stuffed in between.  Maybe there is a better way to handle this..
+	 * ideally we'd know about this constraint in the scheduling and
+	 * depth calculation..
+	 */
+	if (ctx->scheduled && is_sfu_or_mem(ctx->scheduled) && is_sfu_or_mem(instr))
+		ir3_NOP(ctx->block);
+
+	/* unlink from the list it is currently on (eg. the depth list):
+	 */
+	list_delinit(&instr->node);
+
+	if (writes_addr(instr)) {
+		debug_assert(ctx->addr == NULL);
+		ctx->addr = instr;
+	}
+
+	if (writes_pred(instr)) {
+		debug_assert(ctx->pred == NULL);
+		ctx->pred = instr;
+	}
+
+	instr->flags |= IR3_INSTR_MARK;	/* this is what is_scheduled() tests */
+
+	list_addtail(&instr->node, &instr->block->instr_list);
+	ctx->scheduled = instr;
+
+	if (writes_addr(instr) || writes_pred(instr) || is_input(instr)) {
+		clear_cache(ctx, NULL);	/* NULL: drop every cached result */
+	} else {
+		/* invalidate only the necessary entries.. */
+		clear_cache(ctx, instr);
+	}
+}
+
+static struct ir3_instruction *
+deepest(struct ir3_instruction **srcs, unsigned nsrcs)
+{
+	struct ir3_instruction *best = NULL;
+	unsigned best_idx = 0;
+
+	/* single pass: keep the first non-NULL entry with the greatest depth */
+	for (unsigned n = 0; n < nsrcs; n++) {
+		struct ir3_instruction *cur = srcs[n];
+
+		if (cur && (!best || (cur->depth > best->depth))) {
+			best = cur;
+			best_idx = n;
+		}
+	}
+
+	if (best)
+		srcs[best_idx] = NULL;   /* consume it so the next call skips it */
+	return best;
+}
+
+/**
+ * Count instruction slots between the end of @block and @instr.
+ *
+ * @block: the block to search in, starting from the end; in the first
+ *    pass this is the block being scheduled (it only contains already
+ *    scheduled instructions).  In the second pass (sched_intra_block)
+ *    it is one of the predecessor blocks.
+ * @instr: the instruction to search for
+ * @maxd:  max distance; bail after counting this many slots, since the
+ *    instruction we are looking for is then far enough away
+ * @pred:  if true, recursively search into predecessor blocks to
+ *    find the worst case (shortest) distance (only possible after
+ *    individual blocks are all scheduled)
+ */
+static unsigned
+distance(struct ir3_block *block, struct ir3_instruction *instr,
+		unsigned maxd, bool pred)
+{
+	unsigned d = 0;
+
+	list_for_each_entry_rev (struct ir3_instruction, n, &block->instr_list, node) {
+		if ((n == instr) || (d >= maxd))
+			return d;
+		/* NOTE: don't count branch/jump since we don't know yet if they will
+		 * be eliminated later in resolve_jumps().. really should do that
+		 * earlier so we don't have this constraint.
+		 */
+		if (is_alu(n) || (is_flow(n) && (n->opc != OPC_JUMP) && (n->opc != OPC_BR)))
+			d++;
+	}
+
+	/* not found in this block, ie. coming from a predecessor block; in the
+	 * first pass (!pred) assume it is far enough away.. we'll fix up later.
+	 */
+	if (!pred)
+		return maxd;
+
+	if (pred && (block->data != block)) {
+		/* Search into predecessor blocks, finding the one with the
+		 * shortest distance, since that will be the worst case
+		 */
+		unsigned min = maxd - d;
+
+		/* (ab)use block->data as an "already visiting" marker to prevent recursion: */
+		block->data = block;
+
+		for (unsigned i = 0; i < block->predecessors_count; i++) {
+			unsigned n;
+
+			n = distance(block->predecessors[i], instr, min, pred);
+
+			min = MIN2(min, n);
+		}
+
+		block->data = NULL;
+		d += min;
+	}
+
+	return d;
+}
+
+/* Delay slots still required between 'assigner' and src #srcn of
+ * 'consumer'; meta instructions defer to the worst of their own srcs.
+ */
+static unsigned
+delay_calc_srcn(struct ir3_block *block,
+		struct ir3_instruction *assigner,
+		struct ir3_instruction *consumer,
+		unsigned srcn, bool soft, bool pred)
+{
+	unsigned needed;
+
+	if (is_meta(assigner)) {
+		struct ir3_instruction *src;
+		unsigned worst = 0;
+
+		foreach_ssa_src(src, assigner) {
+			unsigned d = delay_calc_srcn(block, src, consumer, srcn, soft, pred);
+			worst = MAX2(worst, d);
+		}
+		return worst;
+	}
+
+	/* "soft" mode uses 4 slots for sfu; otherwise use ir3_delayslots(): */
+	if (soft && is_sfu(assigner))
+		needed = 4;
+	else
+		needed = ir3_delayslots(assigner, consumer, srcn);
+
+	/* subtract the slots already counted by distance(): */
+	return needed - distance(block, assigner, needed, pred);
+}
+
+/* Worst-case delay for an instruction, ie. the largest per-src delay: */
+static unsigned
+delay_calc(struct ir3_block *block, struct ir3_instruction *instr,
+		bool soft, bool pred)
+{
+	struct ir3_instruction *src;
+	unsigned worst = 0;
+
+	foreach_ssa_src_n(src, n, instr) {
+		const unsigned cur = delay_calc_srcn(block, src, instr, n, soft, pred);
+		if (cur > worst)
+			worst = cur;
+	}
+
+	return worst;
+}
+
+struct ir3_sched_notes {	/* reasons recorded by check_instr() when it rejects a candidate */
+	/* there is at least one kill which could be scheduled, except
+	 * for unscheduled bary.f's:
+	 */
+	bool blocked_kill;
+	/* there is at least one instruction that could be scheduled,
+	 * except for conflicting address/predicate register usage:
+	 */
+	bool addr_conflict, pred_conflict;
+};
+
+static bool is_scheduled(struct ir3_instruction *instr)
+{
+	return (instr->flags & IR3_INSTR_MARK) != 0;
+}
+
+/* Would 'instr' become schedulable once ssa src 'src' has been scheduled? */
+static bool
+could_sched(struct ir3_instruction *instr, struct ir3_instruction *src)
+{
+	struct ir3_instruction *dep;
+	foreach_ssa_src(dep, instr) {
+		/* 'src' itself is taken as done; any other pending dep blocks us: */
+		if ((dep == src) || is_scheduled(dep))
+			continue;
+		return false;
+	}
+	return true;
+}
+
+/* Check if instruction is ok to schedule; returns false if it is blocked
+ * (addr/pred conflicts and blocked kills are also recorded in 'notes').
+ */
+static bool
+check_instr(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
+		struct ir3_instruction *instr)
+{
+	/* For instructions that write address register we need to
+	 * make sure there is at least one instruction that uses the
+	 * addr value which is otherwise ready.
+	 *
+	 * TODO if any instructions use pred register and have other
+	 * src args, we would need to do the same for writes_pred()..
+	 */
+	if (writes_addr(instr)) {
+		struct ir3 *ir = instr->block->shader;
+		bool ready = false;
+		for (unsigned i = 0; (i < ir->indirects_count) && !ready; i++) {
+			struct ir3_instruction *indirect = ir->indirects[i];
+			if (!indirect)
+				continue;
+			if (indirect->address != instr)
+				continue;
+			ready = could_sched(indirect, instr);
+		}
+
+		/* no user of this addr value is ready yet, so keep looking: */
+		if (!ready)
+			return false;
+	}
+
+	/* if this is a write to address/predicate register, and that
+	 * register is currently in use, we need to defer until it is
+	 * free:
+	 */
+	if (writes_addr(instr) && ctx->addr) {
+		debug_assert(ctx->addr != instr);
+		notes->addr_conflict = true;
+		return false;
+	}
+
+	if (writes_pred(instr) && ctx->pred) {
+		debug_assert(ctx->pred != instr);
+		notes->pred_conflict = true;
+		return false;
+	}
+
+	/* if the instruction is a kill, we need to ensure *every*
+	 * bary.f is scheduled.  The hw seems unhappy if the thread
+	 * gets killed before the end-input (ei) flag is hit.
+	 *
+	 * We could do this by adding each bary.f instruction as
+	 * virtual ssa src for the kill instruction.  But we have
+	 * fixed length instr->regs[].
+	 *
+	 * TODO this wouldn't be quite right if we had multiple
+	 * basic blocks, if any block was conditional.  We'd need
+	 * to schedule the bary.f's outside of any block which
+	 * was conditional that contained a kill.. I think..
+	 */
+	if (is_kill(instr)) {
+		struct ir3 *ir = instr->block->shader;
+
+		for (unsigned i = 0; i < ir->baryfs_count; i++) {
+			struct ir3_instruction *baryf = ir->baryfs[i];
+			if (baryf->flags & IR3_INSTR_UNUSED)
+				continue;
+			if (!is_scheduled(baryf)) {
+				notes->blocked_kill = true;
+				return false;
+			}
+		}
+	}
+
+	return true;
+}
+
+/* Find the best instruction to schedule from specified instruction or
+ * recursively its ssa sources.  Returns NULL if nothing is schedulable.
+ */
+static struct ir3_instruction *
+find_instr_recursive(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
+		struct ir3_instruction *instr)
+{
+	struct ir3_instruction *srcs[__ssa_src_cnt(instr)];
+	struct ir3_instruction *src;
+	unsigned nsrcs = 0;
+
+	if (is_scheduled(instr))
+		return NULL;
+
+	/* use instr->data to cache the results of recursing up the
+	 * instr src's.  Otherwise the recursive algo can scale quite
+	 * badly w/ shader size.  But this takes some care to clear
+	 * the cache appropriately when instructions are scheduled.
+	 */
+	if (instr->data) {
+		if (instr->data == NULL_INSTR)	/* cached "nothing schedulable" */
+			return NULL;
+		return instr->data;
+	}
+
+	/* find unscheduled srcs: */
+	foreach_ssa_src(src, instr) {
+		if (!is_scheduled(src)) {
+			debug_assert(nsrcs < ARRAY_SIZE(srcs));
+			srcs[nsrcs++] = src;
+		}
+	}
+
+	/* if all our src's are already scheduled: */
+	if (nsrcs == 0) {
+		if (check_instr(ctx, notes, instr)) {
+			instr->data = instr;
+			return instr;
+		}
+		return NULL;
+	}
+
+	while ((src = deepest(srcs, nsrcs))) {	/* try srcs in order of decreasing depth */
+		struct ir3_instruction *candidate;
+
+		candidate = find_instr_recursive(ctx, notes, src);
+		if (!candidate)
+			continue;
+
+		if (check_instr(ctx, notes, candidate)) {
+			instr->data = candidate;
+			return candidate;
+		}
+	}
+
+	instr->data = NULL_INSTR;	/* cache the negative result */
+	return NULL;
+}
+
+/* find the schedulable candidate with the smallest delay (0 wins at once): */
+static struct ir3_instruction *
+find_eligible_instr(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
+		bool soft)
+{
+	struct ir3_instruction *winner = NULL;
+	unsigned lowest = ~0;
+
+	/* TODO we'd really rather use the list/array of block outputs.  But we
+	 * don't have such a thing.  Recursing *every* instruction in the list
+	 * will result in a lot of repeated traversal, since instructions will
+	 * get traversed both when they appear as ssa src to a later instruction
+	 * as well as where they appear in the depth_list.
+	 */
+	list_for_each_entry_rev (struct ir3_instruction, instr, &ctx->depth_list, node) {
+		struct ir3_instruction *cand = find_instr_recursive(ctx, notes, instr);
+		unsigned cost;
+
+		if (!cand)
+			continue;
+
+		cost = delay_calc(ctx->block, cand, soft, false);
+		if (cost >= lowest)
+			continue;
+
+		/* strictly better than anything seen so far: */
+		winner = cand;
+		lowest = cost;
+		if (lowest == 0)
+			break;
+	}
+
+	return winner;
+}
+
+/* Break an a0.x conflict: every not-yet-scheduled instruction that still
+ * uses the current address writer is pointed at a fresh (unscheduled)
+ * clone of that writer.  Returns the clone, or NULL if none was needed;
+ * ctx->addr is cleared either way.
+ */
+static struct ir3_instruction *
+split_addr(struct ir3_sched_ctx *ctx)
+{
+	struct ir3_instruction *clone = NULL;
+	struct ir3 *shader;
+
+	debug_assert(ctx->addr);
+
+	shader = ctx->addr->block->shader;
+
+	/* NOTE(review): the array is re-read each iteration; presumably
+	 * ir3_instr_set_address() may modify it -- confirm before caching. */
+	for (unsigned idx = 0; idx < shader->indirects_count; idx++) {
+		struct ir3_instruction *user = shader->indirects[idx];
+
+		/* nothing to do for empty slots or already scheduled users: */
+		if (!user || is_scheduled(user))
+			continue;
+
+		/* nor for users of some other address value: */
+		if (user->address != ctx->addr)
+			continue;
+
+		if (!clone) {
+			clone = ir3_instr_clone(ctx->addr);
+			/* original addr is scheduled, but new one isn't: */
+			clone->flags &= ~IR3_INSTR_MARK;
+		}
+
+		/* remap this user to the new addr: */
+		ir3_instr_set_address(user, clone);
+	}
+
+	/* all remaining indirects remapped to new addr: */
+	ctx->addr = NULL;
+
+	return clone;
+}
+
+/* Break a p0.x conflict: every not-yet-scheduled instruction that still
+ * reads the current predicate writer is pointed at a fresh (unscheduled)
+ * clone of that writer.  Returns the clone, or NULL if none was needed;
+ * ctx->pred is cleared either way.
+ */
+static struct ir3_instruction *
+split_pred(struct ir3_sched_ctx *ctx)
+{
+	struct ir3_instruction *clone = NULL;
+	struct ir3 *shader;
+
+	debug_assert(ctx->pred);
+
+	shader = ctx->pred->block->shader;
+
+	for (unsigned idx = 0; idx < shader->predicates_count; idx++) {
+		struct ir3_instruction *user = shader->predicates[idx];
+
+		/* skip instructions already scheduled: */
+		if (is_scheduled(user))
+			continue;
+
+		/* only users of the current pred are remapped.
+		 *
+		 * TODO is there ever a case when pred isn't first
+		 * (and only) src?
+		 */
+		if (ssa(user->regs[1]) != ctx->pred)
+			continue;
+
+		if (!clone) {
+			clone = ir3_instr_clone(ctx->pred);
+			/* original pred is scheduled, but new one isn't: */
+			clone->flags &= ~IR3_INSTR_MARK;
+		}
+		user->regs[1]->instr = clone;
+	}
+
+	/* all remaining predicated remapped to new pred: */
+	ctx->pred = NULL;
+
+	return clone;
+}
+
+static void
+sched_block(struct ir3_sched_ctx *ctx, struct ir3_block *block)
+{
+	struct list_head unscheduled_list;
+
+	ctx->block = block;
+
+	/* addr/pred writes are per-block: */
+	ctx->addr = NULL;
+	ctx->pred = NULL;
+
+	/* move all instructions to the unscheduled list, and
+	 * empty the block's instruction list (to which we will
+	 * be inserting).
+	 */
+	list_replace(&block->instr_list, &unscheduled_list);
+	list_inithead(&block->instr_list);
+	list_inithead(&ctx->depth_list);
+
+	/* first a pre-pass to schedule all meta:input instructions
+	 * (which need to appear first so that RA knows the register is
+	 * occupied), and move remaining to depth sorted list:
+	 */
+	list_for_each_entry_safe (struct ir3_instruction, instr, &unscheduled_list, node) {
+		if (instr->opc == OPC_META_INPUT) {
+			schedule(ctx, instr);
+		} else {
+			ir3_insert_by_depth(instr, &ctx->depth_list);
+		}
+	}
+
+	while (!list_empty(&ctx->depth_list)) {
+		struct ir3_sched_notes notes = {0};
+		struct ir3_instruction *instr;
+
+		instr = find_eligible_instr(ctx, &notes, true);	/* soft = true first.. */
+		if (!instr)
+			instr = find_eligible_instr(ctx, &notes, false);	/* ..then soft = false */
+
+		if (instr) {
+			unsigned delay = delay_calc(ctx->block, instr, false, false);
+
+			/* the chosen instruction may still need (hard) delay
+			 * slots filled, so it is time for nop's:
+			 */
+			debug_assert(delay <= 6);
+			while (delay > 0) {
+				ir3_NOP(block);
+				delay--;
+			}
+
+			schedule(ctx, instr);
+		} else {
+			struct ir3_instruction *new_instr = NULL;
+
+			/* nothing available to schedule.. if we are blocked on
+			 * address/predicate register conflict, then break the
+			 * deadlock by cloning the instruction that wrote that
+			 * reg:
+			 */
+			if (notes.addr_conflict) {
+				new_instr = split_addr(ctx);
+			} else if (notes.pred_conflict) {
+				new_instr = split_pred(ctx);
+			} else {
+				debug_assert(0);
+				ctx->error = true;	/* reported by ir3_sched() as -1 */
+				return;
+			}
+
+			if (new_instr) {
+				/* clearing current addr/pred can change what is
+				 * available to schedule, so clear cache..
+				 */
+				clear_cache(ctx, NULL);
+
+				ir3_insert_by_depth(new_instr, &ctx->depth_list);
+				/* the original instr that wrote addr/pred may have
+				 * originated from a different block:
+				 */
+				new_instr->block = block;
+			}
+		}
+	}
+
+	/* And lastly, insert branch/jump instructions to take us to
+	 * the next block.  Later we'll strip back out the branches
+	 * that simply jump to next instruction.
+	 */
+	if (block->successors[1]) {
+		/* if/else, conditional branches to "then" or "else": */
+		struct ir3_instruction *br;
+		unsigned delay = 6;
+
+		debug_assert(ctx->pred);
+		debug_assert(block->condition);
+
+		delay -= distance(ctx->block, ctx->pred, delay, false);
+
+		while (delay > 0) {
+			ir3_NOP(block);
+			delay--;
+		}
+
+		/* create "else" branch first (since "then" block should
+		 * frequently/always end up being a fall-thru):
+		 */
+		br = ir3_BR(block);
+		br->cat0.inv = true;
+		br->cat0.target = block->successors[1];
+
+		/* NOTE: we have to hard code delay of 6 above, since
+		 * we want to insert the nop's before constructing the
+		 * branch.  Throw in an assert so we notice if this
+		 * ever breaks on future generation:
+		 */
+		debug_assert(ir3_delayslots(ctx->pred, br, 0) == 6);
+
+		br = ir3_BR(block);
+		br->cat0.target = block->successors[0];
+
+	} else if (block->successors[0]) {
+		/* otherwise unconditional jump to next block: */
+		struct ir3_instruction *jmp;
+
+		jmp = ir3_JUMP(block);
+		jmp->cat0.target = block->successors[0];
+	}
+
+	/* NOTE: if we kept track of the predecessors, we could do a better
+	 * job w/ (jp) flags.. every node w/ > 1 predecessor is a join point.
+	 * Note that as we eliminate blocks which contain only an unconditional
+	 * jump we probably need to propagate (jp) flag..
+	 */
+}
+
+/* After scheduling individual blocks, we still could have cases where,
+ * on one (or more) paths into a block, a value produced by a previous
+ * block has too few delay slots to be legal.  We can't deal with this in
+ * the first pass because of loops (ie. we can't ensure all predecessor
+ * blocks are already scheduled in the first pass).  All we can really do
+ * at this point is stuff in extra nop's until things are legal.
+ */
+static void
+sched_intra_block(struct ir3_sched_ctx *ctx, struct ir3_block *block)
+{
+	unsigned n = 0;	/* # of instruction slots (incl. inserted nop's) seen so far */
+
+	ctx->block = block;
+
+	list_for_each_entry_safe (struct ir3_instruction, instr, &block->instr_list, node) {
+		unsigned delay = 0;
+
+		for (unsigned i = 0; i < block->predecessors_count; i++) {
+			unsigned d = delay_calc(block->predecessors[i], instr, false, true);
+			delay = MAX2(d, delay);
+		}
+
+		while (delay > n) {
+			struct ir3_instruction *nop = ir3_NOP(block);
+
+			/* move to before instr: */
+			list_delinit(&nop->node);
+			list_addtail(&nop->node, &instr->node);
+
+			n++;
+		}
+
+		/* we can bail once we hit worst case delay: */
+		if (++n > 6)
+			break;
+	}
+}
+
+int ir3_sched(struct ir3 *ir)
+{
+	struct ir3_sched_ctx sctx = {0};
+
+	/* reset instruction marks before scheduling: */
+	ir3_clear_mark(ir);
+
+	/* pass 1: schedule each block on its own */
+	list_for_each_entry (struct ir3_block, blk, &ir->block_list, node)
+		sched_block(&sctx, blk);
+
+	/* pass 2: pad with nop's for values arriving from predecessors */
+	list_for_each_entry (struct ir3_block, blk, &ir->block_list, node)
+		sched_intra_block(&sctx, blk);
+
+	/* sched_block() flags an unschedulable block via sctx.error: */
+	return sctx.error ? -1 : 0;
+}
+
+/* Must 'prior' be scheduled ahead of 'instr' due to barrier rules? */
+static bool
+depends_on(struct ir3_instruction *instr, struct ir3_instruction *prior)
+{
+	/* TODO dependencies tied to one specific object (SSBO/image/array)
+	 * could be relaxed so accesses to unrelated objects don't depend on
+	 * each other (at least as long as not declared coherent)
+	 */
+	if ((instr->barrier_class & IR3_BARRIER_EVERYTHING) && prior->barrier_class)
+		return true;
+	if ((prior->barrier_class & IR3_BARRIER_EVERYTHING) && instr->barrier_class)
+		return true;
+	return (instr->barrier_class & prior->barrier_conflict) != 0;
+}
+
+static void
+add_barrier_deps(struct ir3_block *block, struct ir3_instruction *instr)
+{
+	struct list_head *const head = &block->instr_list;
+	struct list_head *before = instr->node.prev;
+	struct list_head *after = instr->node.next;
+
+	/* Walk backwards: 'instr' must come after every earlier instruction
+	 * it conflicts with.  The walk stops at the first earlier instruction
+	 * with an identical barrier_class.
+	 */
+	while (before != head) {
+		struct ir3_instruction *earlier =
+			LIST_ENTRY(struct ir3_instruction, before, node);
+		bool same_class;
+
+		before = before->prev;
+
+		if (is_meta(earlier))
+			continue;
+
+		same_class = (instr->barrier_class == earlier->barrier_class);
+		if (same_class || depends_on(instr, earlier))
+			ir3_instr_add_dep(instr, earlier);
+		if (same_class)
+			break;
+	}
+
+	/* Walk forwards: every later instruction that conflicts with 'instr'
+	 * must come after it.  Again stop at the first identical barrier_class.
+	 */
+	while (after != head) {
+		struct ir3_instruction *later =
+			LIST_ENTRY(struct ir3_instruction, after, node);
+		bool same_class;
+
+		after = after->next;
+
+		if (is_meta(later))
+			continue;
+
+		same_class = (instr->barrier_class == later->barrier_class);
+		if (same_class || depends_on(later, instr))
+			ir3_instr_add_dep(later, instr);
+		if (same_class)
+			break;
+	}
+}
+
+/* Before a block is scheduled, add the false-dependencies needed so that:
+ *
+ *  (1) barriers are scheduled in the right order wrt the instructions
+ *      related to the barrier
+ *  (2) reads that come before a write actually get scheduled before the
+ *      write
+ *
+ * Only instructions with a non-zero barrier_class are considered.
+ */
+static void
+calculate_deps(struct ir3_block *block)
+{
+	list_for_each_entry (struct ir3_instruction, cur, &block->instr_list, node) {
+		if (!cur->barrier_class)
+			continue;
+		add_barrier_deps(block, cur);
+	}
+}
+
+void
+ir3_sched_add_deps(struct ir3 *ir)
+{
+	/* dependencies are computed independently for each block: */
+	list_for_each_entry (struct ir3_block, blk, &ir->block_list, node)
+		calculate_deps(blk);
+}
diff --git a/src/freedreno/ir3/ir3_shader.c b/src/freedreno/ir3/ir3_shader.c
new file mode 100644
index 00000000000..8b18e950cca
--- /dev/null
+++ b/src/freedreno/ir3/ir3_shader.c
@@ -0,0 +1,436 @@
+/*
+ * Copyright (C) 2014 Rob Clark <[email protected]>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Authors:
+ *    Rob Clark <[email protected]>
+ */
+
+#include "util/u_string.h"
+#include "util/u_memory.h"
+#include "util/u_format.h"
+
+#include "drm/freedreno_drmif.h"
+
+#include "ir3_shader.h"
+#include "ir3_compiler.h"
+#include "ir3_nir.h"
+
+int
+ir3_glsl_type_size(const struct glsl_type *type)
+{	/* passed to nir_lower_io by ir3_shader_from_nir() */
+	return glsl_count_attribute_slots(type, false);
+}
+
+static void
+delete_variant(struct ir3_shader_variant *v)
+{
+	/* NOTE(review): v->binning is not released here; confirm who owns it */
+	if (v->ir != NULL)
+		ir3_destroy(v->ir);
+	if (v->bo != NULL)
+		fd_bo_del(v->bo);
+	free(v->immediates);	/* free(NULL) is a no-op, no guard needed */
+	free(v);
+}
+
+/* For vertex shaders the inputs are loaded into registers before the
+ * shader executes, so max_reg as seen from the shader's instructions might
+ * not reflect the # of registers actually used, especially in the case of
+ * passthrough varyings.
+ *
+ * Likewise, a fragment shader can have regs which are passed input values
+ * but never touched by the resulting shader (ie. as a result of dead code
+ * elimination, or simply because we don't know how to turn the reg off).
+ * So grow v->info.max_reg to cover all input/output registers.
+ */
+static void
+fixup_regfootprint(struct ir3_shader_variant *v)
+{
+	for (unsigned in = 0; in < v->inputs_count; in++) {
+		/* Two kinds of input are ignored:
+		 *  - frag inputs fetched via bary.f, since their reg's are not
+		 *    written by gpu before shader starts (and in fact the
+		 *    regid's might not even be valid)
+		 *  - high regs that are global to all threads in a warp
+		 *    (they exist by default) (a5xx+)
+		 */
+		if (v->inputs[in].bary || (v->inputs[in].regid >= regid(48,0)))
+			continue;
+
+		/* an empty component mask contributes nothing: */
+		if (!v->inputs[in].compmask)
+			continue;
+
+		/* last enabled component decides the highest reg touched: */
+		unsigned last = util_last_bit(v->inputs[in].compmask) - 1;
+		int32_t top = (v->inputs[in].regid + last) >> 2;
+		if (top > v->info.max_reg)
+			v->info.max_reg = top;
+	}
+
+	for (unsigned out = 0; out < v->outputs_count; out++) {
+		int32_t top = (v->outputs[out].regid + 3) >> 2;
+		if (top > v->info.max_reg)
+			v->info.max_reg = top;
+	}
+}
+
+/* Wrapper for ir3_assemble() which also fixes up some variant info
+ * (instrlen, constlen, max_reg) based on shader state.  Returns the
+ * binary, or NULL if ir3_assemble() failed.  Non-static since used by
+ * ir3_cmdline too.
+ */
+void * ir3_shader_assemble(struct ir3_shader_variant *v, uint32_t gpu_id)
+{
+	/* dwords per instrlen unit: (2 * 16) on gpu_id >= 400, else (2 * 4) */
+	const int unit = (gpu_id >= 400) ? (2 * 16) : (2 * 4);
+	void *code = ir3_assemble(v->ir, &v->info, gpu_id);
+
+	if (code == NULL)
+		return NULL;
+
+	/* instruction length, in units of 'unit' dwords: */
+	v->instrlen = v->info.sizedwords / unit;
+
+	/* NOTE: if relative addressing is used, we set constlen in
+	 * the compiler (to worst-case value) since we don't know in
+	 * the assembler what the max addr reg value can be:
+	 */
+	v->constlen = MIN2(255, MAX2(v->constlen, v->info.max_const + 1));
+
+	fixup_regfootprint(v);
+
+	return code;
+}
+
+/* Assemble v->ir into a new v->bo.  On failure v->bo stays NULL and v->ir
+ * is kept, so that create_variant() can detect the error and clean up. */
+static void
+assemble_variant(struct ir3_shader_variant *v)
+{
+	struct ir3_compiler *compiler = v->shader->compiler;
+	uint32_t *bin = ir3_shader_assemble(v, compiler->gpu_id);
+	uint32_t sz = v->info.sizedwords * 4;
+
+	if (!bin)	/* ir3_shader_assemble() returns NULL on failure */
+		return;
+	v->bo = fd_bo_new(compiler->dev, sz,
+			DRM_FREEDRENO_GEM_CACHE_WCOMBINE | DRM_FREEDRENO_GEM_TYPE_KMEM);
+	if (!v->bo) {
+		free(bin);
+		return;
+	}
+	memcpy(fd_bo_map(v->bo), bin, sz);
+
+	if (ir3_shader_debug & IR3_DBG_DISASM) {
+		struct ir3_shader_key key = v->key;
+		printf("disassemble: type=%d, k={bp=%u,cts=%u,hp=%u}", v->type,
+			v->binning_pass, key.color_two_side, key.half_precision);
+		ir3_shader_disasm(v, bin, stdout);
+	}
+	if (shader_debug_enabled(v->shader->type)) {
+		fprintf(stderr, "Native code for unnamed %s shader %s:\n",
+			_mesa_shader_stage_to_string(v->shader->type),
+			v->shader->nir->info.name);
+		if (v->shader->type == MESA_SHADER_FRAGMENT)
+			fprintf(stderr, "SIMD0\n");
+		ir3_shader_disasm(v, bin, stderr);
+	}
+	free(bin);
+	/* no need to keep the ir around beyond this point: */
+	ir3_destroy(v->ir);
+	v->ir = NULL;
+}
+
+/* Allocate, compile and assemble a new variant of 'shader' for 'key'.
+ * Returns NULL if allocation, compile or assembly fails (the latter two log). */
+static struct ir3_shader_variant *
+create_variant(struct ir3_shader *shader, struct ir3_shader_key *key,
+		bool binning_pass)
+{
+	struct ir3_shader_variant *variant = CALLOC_STRUCT(ir3_shader_variant);
+
+	if (variant == NULL)
+		return NULL;
+
+	/* identify the variant before handing it to the compiler: */
+	variant->id = ++shader->variant_count;
+	variant->shader = shader;
+	variant->binning_pass = binning_pass;
+	variant->key = *key;
+	variant->type = shader->type;
+
+	if (ir3_compile_shader_nir(shader->compiler, variant) != 0) {
+		debug_error("compile failed!");
+		delete_variant(variant);
+		return NULL;
+	}
+
+	/* failure is detected by ->bo still being NULL afterwards: */
+	assemble_variant(variant);
+	if (variant->bo == NULL) {
+		debug_error("assemble failed!");
+		delete_variant(variant);
+		return NULL;
+	}
+
+	return variant;
+}
+
+/* find the cached variant matching 'key', compiling it on first use: */
+static inline struct ir3_shader_variant *
+shader_variant(struct ir3_shader *shader, struct ir3_shader_key *key,
+		bool *created)
+{
+	struct ir3_shader_variant *hit = shader->variants;
+
+	*created = false;
+	while (hit && !ir3_shader_key_equal(key, &hit->key))
+		hit = hit->next;
+	if (hit)
+		return hit;
+
+	/* not cached yet: compile it and push it on the list head */
+	hit = create_variant(shader, key, false);
+	if (!hit)
+		return NULL;
+	hit->next = shader->variants;
+	shader->variants = hit;
+	*created = true;
+	return hit;
+}
+
+struct ir3_shader_variant *
+ir3_shader_get_variant(struct ir3_shader *shader, struct ir3_shader_key *key,
+		bool binning_pass, bool *created)
+{
+	struct ir3_shader_variant *v = shader_variant(shader, key, created);
+
+	/* v is NULL when the variant could not be created; don't deref it: */
+	if (v && binning_pass) {
+		if (!v->binning)
+			v->binning = create_variant(shader, key, true);
+		return v->binning;
+	}
+
+	return v;
+}
+
+void
+ir3_shader_destroy(struct ir3_shader *shader)
+{
+	struct ir3_shader_variant *cur = shader->variants;
+	while (cur) {
+		struct ir3_shader_variant *following = cur->next;
+		delete_variant(cur);	/* frees 'cur', hence the saved link */
+		cur = following;
+	}
+	ralloc_free(shader->nir);
+	free(shader);
+}
+
+struct ir3_shader *
+ir3_shader_from_nir(struct ir3_compiler *compiler, nir_shader *nir)
+{
+	struct ir3_shader *so = CALLOC_STRUCT(ir3_shader);
+
+	/* NOTE(review): the allocation result is used unchecked */
+	so->type = nir->info.stage;
+	so->compiler = compiler;
+	so->id = ++compiler->shader_count;
+
+	NIR_PASS_V(nir, nir_lower_io, nir_var_all, ir3_glsl_type_size,
+			   (nir_lower_io_options)0);
+
+	/* first optimization pass is key-independent (key == NULL): */
+	so->nir = ir3_optimize_nir(so, nir, NULL);
+	if (ir3_shader_debug & IR3_DBG_DISASM) {
+		printf("dump nir%d: type=%d", so->id, so->type);
+		nir_print_shader(so->nir, stdout);
+	}
+	return so;
+}
+
+static void dump_reg(FILE *out, const char *name, uint32_t r)
+{
+	if (r != regid(63,0))	/* nothing is printed for regid(63,0) */
+		fprintf(out, "; %s: r%d.%c\n", name, r >> 2, "xyzw"[r & 0x3]);	/* r >> 2: reg #, r & 3: component */
+}
+
+static void dump_output(FILE *out, struct ir3_shader_variant *so,
+		unsigned slot, const char *name)
+{
+	/* look up the register assigned to 'slot' and print it via dump_reg(): */
+	const uint32_t out_reg = ir3_find_output_regid(so, slot);
+	dump_reg(out, name, out_reg);
+}
+
+void
+ir3_shader_disasm(struct ir3_shader_variant *so, uint32_t *bin, FILE *out)
+{	/* writes inputs, outputs, immediates, the disassembly of 'bin' and summary info to 'out' */
+	struct ir3 *ir = so->ir;	/* so->ir is dereferenced below, so it must still be set */
+	struct ir3_register *reg;
+	const char *type = ir3_shader_stage(so->shader);
+	uint8_t regid;
+	unsigned i;
+
+	for (i = 0; i < ir->ninputs; i++) {
+		if (!ir->inputs[i]) {
+			fprintf(out, "; in%d unused\n", i);
+			continue;
+		}
+		reg = ir->inputs[i]->regs[0];
+		regid = reg->num;
+		fprintf(out, "@in(%sr%d.%c)\tin%d\n",
+				(reg->flags & IR3_REG_HALF) ? "h" : "",
+				(regid >> 2), "xyzw"[regid & 0x3], i);
+	}
+
+	for (i = 0; i < ir->noutputs; i++) {
+		if (!ir->outputs[i]) {
+			fprintf(out, "; out%d unused\n", i);
+			continue;
+		}
+		/* kill shows up as a virtual output.. skip it! */
+		if (is_kill(ir->outputs[i]))
+			continue;
+		reg = ir->outputs[i]->regs[0];
+		regid = reg->num;
+		fprintf(out, "@out(%sr%d.%c)\tout%d\n",
+				(reg->flags & IR3_REG_HALF) ? "h" : "",
+				(regid >> 2), "xyzw"[regid & 0x3], i);
+	}
+
+	for (i = 0; i < so->immediates_count; i++) {
+		fprintf(out, "@const(c%d.x)\t", so->constbase.immediate + i);
+		fprintf(out, "0x%08x, 0x%08x, 0x%08x, 0x%08x\n",
+				so->immediates[i].val[0],
+				so->immediates[i].val[1],
+				so->immediates[i].val[2],
+				so->immediates[i].val[3]);
+	}
+
+	disasm_a3xx(bin, so->info.sizedwords, 0, out);
+
+	switch (so->type) {	/* per-stage input/output register listing */
+	case MESA_SHADER_VERTEX:
+		fprintf(out, "; %s: outputs:", type);
+		for (i = 0; i < so->outputs_count; i++) {
+			uint8_t regid = so->outputs[i].regid;
+			fprintf(out, " r%d.%c (%s)",
+					(regid >> 2), "xyzw"[regid & 0x3],
+					gl_varying_slot_name(so->outputs[i].slot));
+		}
+		fprintf(out, "\n");
+		fprintf(out, "; %s: inputs:", type);
+		for (i = 0; i < so->inputs_count; i++) {
+			uint8_t regid = so->inputs[i].regid;
+			fprintf(out, " r%d.%c (cm=%x,il=%u,b=%u)",
+					(regid >> 2), "xyzw"[regid & 0x3],
+					so->inputs[i].compmask,
+					so->inputs[i].inloc,
+					so->inputs[i].bary);
+		}
+		fprintf(out, "\n");
+		break;
+	case MESA_SHADER_FRAGMENT:
+		fprintf(out, "; %s: outputs:", type);
+		for (i = 0; i < so->outputs_count; i++) {
+			uint8_t regid = so->outputs[i].regid;
+			fprintf(out, " r%d.%c (%s)",
+					(regid >> 2), "xyzw"[regid & 0x3],
+					gl_frag_result_name(so->outputs[i].slot));
+		}
+		fprintf(out, "\n");
+		fprintf(out, "; %s: inputs:", type);
+		for (i = 0; i < so->inputs_count; i++) {
+			uint8_t regid = so->inputs[i].regid;
+			fprintf(out, " r%d.%c (%s,cm=%x,il=%u,b=%u)",
+					(regid >> 2), "xyzw"[regid & 0x3],
+					gl_varying_slot_name(so->inputs[i].slot),
+					so->inputs[i].compmask,
+					so->inputs[i].inloc,
+					so->inputs[i].bary);
+		}
+		fprintf(out, "\n");
+		break;
+	default:
+		/* TODO other shader stages are not listed yet */
+		break;
+	}
+
+	/* print generic shader info: */
+	fprintf(out, "; %s prog %d/%d: %u instructions, %d half, %d full\n",
+			type, so->shader->id, so->id,
+			so->info.instrs_count,
+			so->info.max_half_reg + 1,
+			so->info.max_reg + 1);
+
+	fprintf(out, "; %d const, %u constlen\n",
+			so->info.max_const + 1,
+			so->constlen);
+
+	fprintf(out, "; %u (ss), %u (sy)\n", so->info.ss, so->info.sy);
+
+	/* print shader type specific info: */
+	switch (so->type) {
+	case MESA_SHADER_VERTEX:
+		dump_output(out, so, VARYING_SLOT_POS, "pos");
+		dump_output(out, so, VARYING_SLOT_PSIZ, "psize");
+		break;
+	case MESA_SHADER_FRAGMENT:
+		dump_reg(out, "pos (bary)",
+			ir3_find_sysval_regid(so, SYSTEM_VALUE_VARYING_COORD));
+		dump_output(out, so, FRAG_RESULT_DEPTH, "posz");
+		if (so->color0_mrt) {
+			dump_output(out, so, FRAG_RESULT_COLOR, "color");
+		} else {
+			dump_output(out, so, FRAG_RESULT_DATA0, "data0");
+			dump_output(out, so, FRAG_RESULT_DATA1, "data1");
+			dump_output(out, so, FRAG_RESULT_DATA2, "data2");
+			dump_output(out, so, FRAG_RESULT_DATA3, "data3");
+			dump_output(out, so, FRAG_RESULT_DATA4, "data4");
+			dump_output(out, so, FRAG_RESULT_DATA5, "data5");
+			dump_output(out, so, FRAG_RESULT_DATA6, "data6");
+			dump_output(out, so, FRAG_RESULT_DATA7, "data7");
+		}
+		/* these two are hard-coded since we don't know how to
+		 * program them to anything but all 0's...
+		 */
+		if (so->frag_coord)
+			fprintf(out, "; fragcoord: r0.x\n");
+		if (so->frag_face)
+			fprintf(out, "; fragface: hr0.x\n");
+		break;
+	default:
+		/* TODO other shader stages have no specific info yet */
+		break;
+	}
+
+	fprintf(out, "\n");
+}
+
+uint64_t
+ir3_shader_outputs(const struct ir3_shader *so)
+{	/* value is read straight from the shader's NIR info */
+	return so->nir->info.outputs_written;
+}
diff --git a/src/freedreno/ir3/ir3_shader.h b/src/freedreno/ir3/ir3_shader.h
new file mode 100644
index 00000000000..bc47160d6ea
--- /dev/null
+++ b/src/freedreno/ir3/ir3_shader.h
@@ -0,0 +1,587 @@
+/*
+ * Copyright (C) 2014 Rob Clark <[email protected]>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Authors:
+ *    Rob Clark <[email protected]>
+ */
+
+#ifndef IR3_SHADER_H_
+#define IR3_SHADER_H_
+
+#include <stdio.h>
+
+#include "compiler/shader_enums.h"
+#include "compiler/nir/nir.h"
+#include "util/bitscan.h"
+
+#include "ir3.h"
+
+struct glsl_type;
+
+/* driver param indices.  Compute-shader and vertex-shader params are
+ * numbered independently, so values from the two groups overlap (eg.
+ * IR3_DP_NUM_WORK_GROUPS_X and IR3_DP_VTXID_BASE are both 0):
+ */
+enum ir3_driver_param {
+	/* compute shader driver params: */
+	IR3_DP_NUM_WORK_GROUPS_X = 0,
+	IR3_DP_NUM_WORK_GROUPS_Y = 1,
+	IR3_DP_NUM_WORK_GROUPS_Z = 2,
+	IR3_DP_LOCAL_GROUP_SIZE_X = 4,
+	IR3_DP_LOCAL_GROUP_SIZE_Y = 5,
+	IR3_DP_LOCAL_GROUP_SIZE_Z = 6,
+	/* NOTE: gl_NumWorkGroups should be vec4 aligned because
+	 * glDispatchComputeIndirect() needs to load these from
+	 * the info->indirect buffer.  Keep that in mind when/if
+	 * adding any additional CS driver params.
+	 */
+	IR3_DP_CS_COUNT   = 8,   /* must be aligned to vec4 */
+
+	/* vertex shader driver params: */
+	IR3_DP_VTXID_BASE = 0,
+	IR3_DP_VTXCNT_MAX = 1,
+	/* user-clip-plane components, up to 8x vec4's: */
+	IR3_DP_UCP0_X     = 4,
+	/* .... */
+	IR3_DP_UCP7_W     = 35,
+	IR3_DP_VS_COUNT   = 36   /* must be aligned to vec4 */
+};
+
+#define IR3_MAX_SHADER_BUFFERS   32
+#define IR3_MAX_SHADER_IMAGES    32
+#define IR3_MAX_SO_BUFFERS        4
+#define IR3_MAX_SO_OUTPUTS       64
+
+/**
+ * For consts needed to pass internal values to shader which may or may not
+ * be required, rather than allocating worst-case const space, we scan the
+ * shader and allocate consts as-needed:
+ *
+ *   + SSBO sizes: only needed if shader has a get_buffer_size intrinsic
+ *     for a given SSBO
+ *
+ *   + Image dimensions: needed to calculate pixel offset, but only for
+ *     images that have an image_store intrinsic
+ */
+struct ir3_driver_const_layout {
+	struct {
+		uint32_t mask;  /* bitmask of SSBOs that have get_buffer_size */
+		uint32_t count; /* number of consts allocated */
+		/* one const allocated per SSBO which has get_buffer_size,
+		 * ssbo_size.off[ssbo_id] is offset from start of ssbo_size
+		 * consts:
+		 */
+		uint32_t off[IR3_MAX_SHADER_BUFFERS];
+	} ssbo_size;
+
+	struct {
+		uint32_t mask;  /* bitmask of images that have image_store */
+		uint32_t count; /* number of consts allocated */
+		/* three consts allocated per image which has image_store:
+		 *  + cpp         (bytes per pixel)
+		 *  + pitch       (y pitch)
+		 *  + array_pitch (z pitch)
+		 */
+		uint32_t off[IR3_MAX_SHADER_IMAGES];
+	} image_dims;
+};
+
+/**
+ * A single output for vertex transform feedback.  All fields are
+ * bitfields; the per-field notes give the documented value range.
+ */
+struct ir3_stream_output {
+	unsigned register_index:6;  /**< 0 to 63 (OUT index) */
+	unsigned start_component:2; /**< 0 to 3 */
+	unsigned num_components:3;  /**< 1 to 4 */
+	unsigned output_buffer:3;   /**< 0 to PIPE_MAX_SO_BUFFERS (presumably IR3_MAX_SO_BUFFERS here) */
+	unsigned dst_offset:16;     /**< offset into the buffer in dwords */
+	unsigned stream:2;          /**< 0 to 3 */
+};
+
+/**
+ * Stream output for vertex transform feedback.
+ */
+struct ir3_stream_output_info {
+	/** presumably the number of valid entries in output[] -- TODO confirm */
+	unsigned num_outputs;
+	/** stride for an entire vertex for each buffer in dwords */
+	uint16_t stride[IR3_MAX_SO_BUFFERS];
+
+	/**
+	 * Array of stream outputs, in the order they are to be written in.
+	 * Selected components are tightly packed into the output buffer.
+	 */
+	struct ir3_stream_output output[IR3_MAX_SO_OUTPUTS];
+};
+
+/* Configuration key used to identify a shader variant.. different
+ * shader variants can be used to implement features not supported
+ * in hw (two sided color), binning-pass vertex shader, etc.
+ *
+ * The single-bit flags live in an anonymous struct that is overlaid
+ * with 'global', so all of them can be compared as one 32-bit word.
+ * The remaining per-sampler fields are only compared when
+ * has_per_samp is set in one of the keys (see ir3_shader_key_equal()).
+ */
+struct ir3_shader_key {
+	union {
+		struct {
+			/*
+			 * Combined Vertex/Fragment shader parameters:
+			 */
+			unsigned ucp_enables : 8;
+
+			/* do we need to check {v,f}saturate_{s,t,r}? */
+			unsigned has_per_samp : 1;
+
+			/*
+			 * Vertex shader variant parameters:
+			 */
+			unsigned vclamp_color : 1;
+
+			/*
+			 * Fragment shader variant parameters:
+			 */
+			unsigned color_two_side : 1;
+			unsigned half_precision : 1;
+			/* used when shader needs to handle flat varyings (a4xx)
+			 * for front/back color inputs to frag shader:
+			 */
+			unsigned rasterflat : 1;
+			unsigned fclamp_color : 1;
+		};
+		/* all of the flags above, viewed as a single word: */
+		uint32_t global;
+	};
+
+	/* bitmask of sampler which needs coords clamped for vertex
+	 * shader:
+	 */
+	uint16_t vsaturate_s, vsaturate_t, vsaturate_r;
+
+	/* bitmask of sampler which needs coords clamped for frag
+	 * shader:
+	 */
+	uint16_t fsaturate_s, fsaturate_t, fsaturate_r;
+
+	/* bitmask of ms shifts */
+	uint32_t vsamples, fsamples;
+
+	/* bitmask of samplers which need astc srgb workaround: */
+	uint16_t vastc_srgb, fastc_srgb;
+};
+
+/* Returns true if the two shader keys are identical.
+ *
+ * When neither key has has_per_samp set, only the packed 'global'
+ * flags word is compared; otherwise the whole struct is compared
+ * bytewise.  Neither key is written, so both are taken as const
+ * (callers passing non-const pointers are unaffected).
+ */
+static inline bool
+ir3_shader_key_equal(const struct ir3_shader_key *a, const struct ir3_shader_key *b)
+{
+	/* slow-path if we need to check {v,f}saturate_{s,t,r} */
+	if (a->has_per_samp || b->has_per_samp)
+		return memcmp(a, b, sizeof(*a)) == 0;
+	return a->global == b->global;
+}
+
+/* Reports whether going from last_key to key can produce different
+ * lowering for a fragment shader.  The frag-side per-sampler fields
+ * are only considered when either key has has_per_samp set.
+ */
+static inline bool
+ir3_shader_key_changes_fs(struct ir3_shader_key *key, struct ir3_shader_key *last_key)
+{
+	const bool per_samp = key->has_per_samp || last_key->has_per_samp;
+
+	if (per_samp) {
+		const bool samp_differs =
+			(key->fsaturate_s != last_key->fsaturate_s) ||
+			(key->fsaturate_t != last_key->fsaturate_t) ||
+			(key->fsaturate_r != last_key->fsaturate_r) ||
+			(key->fsamples != last_key->fsamples) ||
+			(key->fastc_srgb != last_key->fastc_srgb);
+
+		if (samp_differs)
+			return true;
+	}
+
+	/* remaining frag-relevant flags: */
+	return (key->fclamp_color != last_key->fclamp_color) ||
+		(key->color_two_side != last_key->color_two_side) ||
+		(key->half_precision != last_key->half_precision) ||
+		(key->rasterflat != last_key->rasterflat) ||
+		(key->ucp_enables != last_key->ucp_enables);
+}
+
+/* Reports whether going from last_key to key can produce different
+ * lowering for a vertex shader.  The vertex-side per-sampler fields
+ * are only considered when either key has has_per_samp set.
+ */
+static inline bool
+ir3_shader_key_changes_vs(struct ir3_shader_key *key, struct ir3_shader_key *last_key)
+{
+	const bool per_samp = key->has_per_samp || last_key->has_per_samp;
+
+	if (per_samp) {
+		const bool samp_differs =
+			(key->vsaturate_s != last_key->vsaturate_s) ||
+			(key->vsaturate_t != last_key->vsaturate_t) ||
+			(key->vsaturate_r != last_key->vsaturate_r) ||
+			(key->vsamples != last_key->vsamples) ||
+			(key->vastc_srgb != last_key->vastc_srgb);
+
+		if (samp_differs)
+			return true;
+	}
+
+	/* remaining vertex-relevant flags: */
+	return (key->vclamp_color != last_key->vclamp_color) ||
+		(key->ucp_enables != last_key->ucp_enables);
+}
+
+/* Zeroes the shader-key fields which have no meaning for the given
+ * shader stage.  Only the fragment and vertex stages are handled; a
+ * key for any other stage is left untouched.
+ */
+static inline void
+ir3_normalize_key(struct ir3_shader_key *key, gl_shader_stage type)
+{
+	if (type == MESA_SHADER_FRAGMENT) {
+		/* drop the vertex-side per-sampler state: */
+		if (key->has_per_samp) {
+			key->vsamples = 0;
+			key->vastc_srgb = 0;
+			key->vsaturate_r = 0;
+			key->vsaturate_t = 0;
+			key->vsaturate_s = 0;
+		}
+	} else if (type == MESA_SHADER_VERTEX) {
+		/* fragment-only variant flags: */
+		key->rasterflat = false;
+		key->half_precision = false;
+		key->color_two_side = false;
+
+		/* drop the fragment-side per-sampler state: */
+		if (key->has_per_samp) {
+			key->fsamples = 0;
+			key->fastc_srgb = 0;
+			key->fsaturate_r = 0;
+			key->fsaturate_t = 0;
+			key->fsaturate_s = 0;
+		}
+	}
+	/* TODO: other stages */
+}
+
+/* A single variant of a shader (see struct ir3_shader), identified by
+ * its key.  Variants of one shader are chained through 'next', and
+ * each points back at the owning shader through 'shader'.
+ */
+struct ir3_shader_variant {
+	struct fd_bo *bo;
+
+	/* variant id (for debug) */
+	uint32_t id;
+
+	struct ir3_shader_key key;
+
+	/* vertex shaders can have an extra version for hwbinning pass,
+	 * which is pointed to by so->binning:
+	 */
+	bool binning_pass;
+	struct ir3_shader_variant *binning;
+
+	struct ir3_driver_const_layout const_layout;
+	struct ir3_info info;
+	struct ir3 *ir;
+
+	/* the instructions length is in units of instruction groups
+	 * (4 instructions for a3xx, 16 instructions for a4xx.. each
+	 * instruction is 2 dwords):
+	 */
+	unsigned instrlen;
+
+	/* the constants length is in units of vec4's, and is the sum of
+	 * the uniforms and the built-in compiler constants
+	 */
+	unsigned constlen;
+
+	/* number of uniforms (in vec4), not including built-in compiler
+	 * constants, etc.
+	 */
+	unsigned num_uniforms;
+
+	unsigned num_ubos;
+
+	/* About Linkage:
+	 *   + Let the frag shader determine the position/compmask for the
+	 *     varyings, since it is the place where we know if the varying
+	 *     is actually used, and if so, which components are used.  So
+	 *     what the hw calls "outloc" is taken from the "inloc" of the
+	 *     frag shader.
+	 *   + From the vert shader, we only need the output regid
+	 */
+
+	bool frag_coord, frag_face, color0_mrt;
+
+	/* NOTE: for input/outputs, slot is:
+	 *   gl_vert_attrib  - for VS inputs
+	 *   gl_varying_slot - for VS output / FS input
+	 *   gl_frag_result  - for FS output
+	 */
+
+	/* varyings/outputs: */
+	unsigned outputs_count;
+	struct {
+		uint8_t slot;
+		uint8_t regid;
+	} outputs[16 + 2];  /* +POSITION +PSIZE */
+	bool writes_pos, writes_psize;
+
+	/* attributes (VS) / varyings (FS):
+	 * Note that sysval's should come *after* normal inputs.
+	 */
+	unsigned inputs_count;
+	struct {
+		uint8_t slot;
+		uint8_t regid;
+		uint8_t compmask;
+		uint8_t ncomp;
+		/* location of input (ie. offset passed to bary.f, etc).  This
+		 * matches the SP_VS_VPC_DST_REG.OUTLOCn value (a3xx and a4xx
+		 * have the OUTLOCn value offset by 8, presumably to account
+		 * for gl_Position/gl_PointSize)
+		 */
+		uint8_t inloc;
+		/* vertex shader specific: */
+		bool    sysval     : 1;   /* slot is a gl_system_value */
+		/* fragment shader specific: */
+		bool    bary       : 1;   /* fetched varying (vs one loaded into reg) */
+		bool    rasterflat : 1;   /* special handling for emit->rasterflat */
+		enum glsl_interp_mode interpolate;
+	} inputs[16 + 2];  /* +POSITION +FACE */
+
+	/* sum of input components (scalar).  For frag shaders, it only counts
+	 * the varying inputs:
+	 */
+	unsigned total_in;
+
+	/* For frag shaders, the total number of inputs (not scalar,
+	 * ie. SP_VS_PARAM_REG.TOTALVSOUTVAR)
+	 */
+	unsigned varying_in;
+
+	/* number of samplers/textures (which are currently 1:1): */
+	int num_samp;
+
+	/* do we have one or more SSBO instructions: */
+	bool has_ssbo;
+
+	/* do we have kill instructions: */
+	bool has_kill;
+
+	/* Layout of constant registers, each section (in vec4). Pointer size
+	 * is 32b (a3xx, a4xx), or 64b (a5xx+), which affects the size of the
+	 * UBO and stream-out consts.
+	 */
+	struct {
+		/* user const start at zero */
+		unsigned ubo;
+		/* NOTE that a3xx might need a section for SSBO addresses too */
+		unsigned ssbo_sizes;
+		unsigned image_dims;
+		unsigned driver_param;
+		unsigned tfbo;
+		unsigned immediate;
+	} constbase;
+
+	unsigned immediates_count;
+	unsigned immediates_size;
+	struct {
+		uint32_t val[4];
+	} *immediates;
+
+	/* for astc srgb workaround, the number/base of additional
+	 * alpha tex states we need, and index of original tex states
+	 */
+	struct {
+		unsigned base, count;
+		unsigned orig_idx[16];
+	} astc_srgb;
+
+	/* shader variants form a linked list: */
+	struct ir3_shader_variant *next;
+
+	/* replicated here to avoid passing extra ptrs everywhere: */
+	gl_shader_stage type;
+	struct ir3_shader *shader;
+};
+
+/* A shader together with the list of variants created from it (see
+ * struct ir3_shader_variant).
+ */
+struct ir3_shader {
+	gl_shader_stage type;
+
+	/* shader id (for debug): */
+	uint32_t id;
+	/* presumably the source of the per-variant debug ids -- TODO confirm */
+	uint32_t variant_count;
+
+	/* so we know when we can disable TGSI related hacks: */
+	bool from_tgsi;
+
+	struct ir3_compiler *compiler;
+
+	struct nir_shader *nir;
+	struct ir3_stream_output_info stream_output;
+
+	/* head of the variant list, chained through ir3_shader_variant::next: */
+	struct ir3_shader_variant *variants;
+};
+
+void * ir3_shader_assemble(struct ir3_shader_variant *v, uint32_t gpu_id);
+struct ir3_shader_variant * ir3_shader_get_variant(struct ir3_shader *shader,
+		struct ir3_shader_key *key, bool binning_pass, bool *created);
+struct ir3_shader * ir3_shader_from_nir(struct ir3_compiler *compiler, nir_shader *nir);
+void ir3_shader_destroy(struct ir3_shader *shader);
+void ir3_shader_disasm(struct ir3_shader_variant *so, uint32_t *bin, FILE *out);
+uint64_t ir3_shader_outputs(const struct ir3_shader *so);
+
+int
+ir3_glsl_type_size(const struct glsl_type *type);
+
+/* Maps the shader's stage to a short printable tag ("VERT", "FRAG"
+ * or "CL").  Any other stage is treated as unreachable.
+ */
+static inline const char *
+ir3_shader_stage(struct ir3_shader *shader)
+{
+	const char *tag = NULL;
+
+	switch (shader->type) {
+	case MESA_SHADER_VERTEX:
+		tag = "VERT";
+		break;
+	case MESA_SHADER_FRAGMENT:
+		tag = "FRAG";
+		break;
+	case MESA_SHADER_COMPUTE:
+		tag = "CL";
+		break;
+	default:
+		unreachable("invalid type");
+		break;
+	}
+
+	return tag;
+}
+
+/*
+ * Helper/util:
+ */
+
+/* Returns the index into so->outputs[] of the output matching 'slot'.
+ *
+ * Front/back color slots fall back to their counterpart when there is
+ * no exact match.  For any other unmatched slot, and when the fallback
+ * lookup fails too, index 0 is returned.
+ */
+static inline int
+ir3_find_output(const struct ir3_shader_variant *so, gl_varying_slot slot)
+{
+	int idx;
+
+	/* first try for an exact match: */
+	for (idx = 0; idx < so->outputs_count; idx++) {
+		if (so->outputs[idx].slot == slot)
+			return idx;
+	}
+
+	/* it seems optional to have a OUT.BCOLOR[n] for each OUT.COLOR[n]
+	 * in the vertex shader.. but the fragment shader doesn't know this
+	 * so it will always have both IN.COLOR[n] and IN.BCOLOR[n].  So
+	 * at link time if there is no matching OUT.BCOLOR[n], we must map
+	 * OUT.COLOR[n] to IN.BCOLOR[n].  And vice versa if there is only
+	 * a OUT.BCOLOR[n] but no matching OUT.COLOR[n]
+	 */
+	switch (slot) {
+	case VARYING_SLOT_BFC0:
+		slot = VARYING_SLOT_COL0;
+		break;
+	case VARYING_SLOT_BFC1:
+		slot = VARYING_SLOT_COL1;
+		break;
+	case VARYING_SLOT_COL0:
+		slot = VARYING_SLOT_BFC0;
+		break;
+	case VARYING_SLOT_COL1:
+		slot = VARYING_SLOT_BFC1;
+		break;
+	default:
+		return 0;
+	}
+
+	/* second try, with the swapped front/back color slot: */
+	for (idx = 0; idx < so->outputs_count; idx++) {
+		if (so->outputs[idx].slot == slot)
+			return idx;
+	}
+
+	debug_assert(0);
+
+	return 0;
+}
+
+/* Returns the index of the first input after 'i' which has a non-zero
+ * compmask and is flagged as bary.  If there is none, the returned
+ * index is not below so->inputs_count.
+ */
+static inline int
+ir3_next_varying(const struct ir3_shader_variant *so, int i)
+{
+	for (++i; i < so->inputs_count; ++i) {
+		if (so->inputs[i].compmask && so->inputs[i].bary)
+			break;
+	}
+
+	return i;
+}
+
+/* Table of linked varyings, filled in by ir3_link_add() and
+ * ir3_link_shaders().
+ */
+struct ir3_shader_linkage {
+	/* highest (loc + util_last_bit(compmask)) over all entries added: */
+	uint8_t max_loc;
+	/* number of entries used in var[]: */
+	uint8_t cnt;
+	struct {
+		uint8_t regid;
+		uint8_t compmask;
+		uint8_t loc;
+	} var[32];
+};
+
+/* Appends one (regid, compmask, loc) entry to the linkage table and
+ * raises l->max_loc to cover it.
+ *
+ * If the table is already full the entry is dropped and the table is
+ * left unchanged.  Previously the only protection was a debug_assert()
+ * made after l->cnt had already been bumped, so a build where that
+ * assert is compiled out (presumably non-debug builds) would write
+ * past the end of l->var[].
+ */
+static inline void
+ir3_link_add(struct ir3_shader_linkage *l, uint8_t regid, uint8_t compmask, uint8_t loc)
+{
+	unsigned i = l->cnt;
+
+	debug_assert(i < ARRAY_SIZE(l->var));
+
+	/* hard bounds check, independent of assert configuration: */
+	if (i >= ARRAY_SIZE(l->var))
+		return;
+
+	l->cnt++;
+
+	l->var[i].regid    = regid;
+	l->var[i].compmask = compmask;
+	l->var[i].loc      = loc;
+	l->max_loc = MAX2(l->max_loc, loc + util_last_bit(compmask));
+}
+
+/* Walks the varying inputs of the fragment shader 'fs' and, for each
+ * one whose inloc is below fs->total_in, adds a linkage entry using
+ * the regid of the output that ir3_find_output() picks in 'vs'.
+ *
+ * Stops when the fs inputs are exhausted or l->var[] is full.
+ */
+static inline void
+ir3_link_shaders(struct ir3_shader_linkage *l,
+		const struct ir3_shader_variant *vs,
+		const struct ir3_shader_variant *fs)
+{
+	int in_idx = -1;
+
+	while (l->cnt < ARRAY_SIZE(l->var)) {
+		in_idx = ir3_next_varying(fs, in_idx);
+		if (in_idx >= fs->inputs_count)
+			break;
+
+		/* only inputs located inside the counted range get linked: */
+		if (fs->inputs[in_idx].inloc < fs->total_in) {
+			const int out_idx = ir3_find_output(vs, fs->inputs[in_idx].slot);
+
+			ir3_link_add(l, vs->outputs[out_idx].regid,
+				fs->inputs[in_idx].compmask, fs->inputs[in_idx].inloc);
+		}
+	}
+}
+
+/* Returns the regid of the first output whose slot equals 'slot', or
+ * regid(63, 0) if the variant has no such output.
+ */
+static inline uint32_t
+ir3_find_output_regid(const struct ir3_shader_variant *so, unsigned slot)
+{
+	unsigned n;
+
+	for (n = 0; n < so->outputs_count; n++) {
+		if (so->outputs[n].slot != slot)
+			continue;
+		return so->outputs[n].regid;
+	}
+
+	return regid(63, 0);
+}
+
+/* Returns the regid of the first input that is flagged as a sysval and
+ * whose slot equals 'slot', or regid(63, 0) if there is none.
+ */
+static inline uint32_t
+ir3_find_sysval_regid(const struct ir3_shader_variant *so, unsigned slot)
+{
+	unsigned n;
+
+	for (n = 0; n < so->inputs_count; n++) {
+		if (!so->inputs[n].sysval)
+			continue;
+		if (so->inputs[n].slot == slot)
+			return so->inputs[n].regid;
+	}
+
+	return regid(63, 0);
+}
+
+/* Register footprint of the variant expressed in half-regs: every
+ * full reg counts as two half-regs, and the max_reg / max_half_reg
+ * values from v->info are each bumped by one before being summed.
+ */
+static inline uint32_t
+ir3_shader_halfregs(const struct ir3_shader_variant *v)
+{
+	const uint32_t full_regs = v->info.max_reg + 1;
+	const uint32_t half_regs = v->info.max_half_reg + 1;
+
+	return (2 * full_regs) + half_regs;
+}
+
+#endif /* IR3_SHADER_H_ */
diff --git a/src/freedreno/ir3/meson.build b/src/freedreno/ir3/meson.build
new file mode 100644
index 00000000000..07319dff595
--- /dev/null
+++ b/src/freedreno/ir3/meson.build
@@ -0,0 +1,64 @@
+# Copyright © 2018 Rob Clark
+
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+# Generate ir3_nir_trig.c by running ir3_nir_trig.py; the script's stdout
+# is captured as the output file.  '-p' hands the script the path of the
+# NIR sources, and the target is re-run when nir_algebraic_py changes.
+ir3_nir_trig_c = custom_target(
+  'ir3_nir_trig.c',
+  input : 'ir3_nir_trig.py',
+  output : 'ir3_nir_trig.c',
+  command : [
+    prog_python, '@INPUT@',
+    '-p', join_paths(meson.source_root(), 'src/compiler/nir/'),
+  ],
+  capture : true,
+  depend_files : nir_algebraic_py,
+)
+
+# Hand-written sources and headers of the ir3 compiler.
+libfreedreno_ir3_files = files(
+  'disasm-a3xx.c',
+  'instr-a3xx.h',
+  'ir3.c',
+  'ir3_compiler_nir.c',
+  'ir3_compiler.c',
+  'ir3_compiler.h',
+  'ir3_cp.c',
+  'ir3_depth.c',
+  'ir3_group.c',
+  'ir3.h',
+  'ir3_legalize.c',
+  'ir3_nir.c',
+  'ir3_nir.h',
+  'ir3_nir_lower_tg4_to_tex.c',
+  'ir3_print.c',
+  'ir3_ra.c',
+  'ir3_sched.c',
+  'ir3_shader.c',
+  'ir3_shader.h',
+)
+
+# Static library combining the sources above with the generated trig file.
+# build_by_default is false, so it is only built when another target
+# depends on it.
+libfreedreno_ir3 = static_library(
+  'freedreno_ir3',
+  [libfreedreno_ir3_files, ir3_nir_trig_c],
+  include_directories : [inc_freedreno, inc_common],
+  c_args : [c_vis_args, no_override_init_args],
+  cpp_args : [cpp_vis_args],
+  dependencies : idep_nir_headers,
+  build_by_default : false,
+)
+
diff --git a/src/freedreno/meson.build b/src/freedreno/meson.build
index bb2cb201c0d..26ee6213890 100644
--- a/src/freedreno/meson.build
+++ b/src/freedreno/meson.build
@@ -21,3 +21,4 @@
 inc_freedreno = include_directories('.')
 
 subdir('drm')
+subdir('ir3')
author	Rob Clark <[email protected]>	2018-11-10 12:05:59 -0500
committer	Rob Clark <[email protected]>	2018-11-27 15:44:02 -0500
commit	aa0fed10d3574aec8c129bace78018ae060484c0 (patch)
tree	2fee64028d47f6112f881903848a126da35eb5ea /src/freedreno
parent	556eec249d6d81be88389784ce5f2583712d85d5 (diff)