40 files changed, 37 insertions, 13732 deletions
diff --git a/src/gallium/drivers/freedreno/Automake.inc b/src/gallium/drivers/freedreno/Automake.inc
index 9b9b3d39fea..936c286f4c9 100644
--- a/src/gallium/drivers/freedreno/Automake.inc
+++ b/src/gallium/drivers/freedreno/Automake.inc
@@ -6,6 +6,7 @@ TARGET_LIB_DEPS += \
 	$(top_builddir)/src/gallium/winsys/freedreno/drm/libfreedrenodrm.la \
 	$(top_builddir)/src/gallium/drivers/freedreno/libfreedreno.la \
 	$(top_builddir)/src/freedreno/libfreedreno_drm.la \
+	$(top_builddir)/src/freedreno/libfreedreno_ir3.la \
 	$(FREEDRENO_LIBS) \
 	$(LIBDRM_LIBS)
 
diff --git a/src/gallium/drivers/freedreno/Makefile.am b/src/gallium/drivers/freedreno/Makefile.am
index 39887e13e37..32130ab94c5 100644
--- a/src/gallium/drivers/freedreno/Makefile.am
+++ b/src/gallium/drivers/freedreno/Makefile.am
@@ -9,11 +9,6 @@ AM_CFLAGS = \
 	-I$(top_srcdir)/src/compiler/nir \
 	$(GALLIUM_DRIVER_CFLAGS)
 
-MKDIR_GEN = $(AM_V_at)$(MKDIR_P) $(@D)
-ir3/ir3_nir_trig.c: ir3/ir3_nir_trig.py $(top_srcdir)/src/compiler/nir/nir_algebraic.py
-	$(MKDIR_GEN)
-	$(AM_V_GEN) $(PYTHON) $(PYTHON_FLAGS) $(srcdir)/ir3/ir3_nir_trig.py -p $(top_srcdir)/src/compiler/nir > $@ || ($(RM) $@; false)
-
 noinst_LTLIBRARIES = libfreedreno.la
 
 libfreedreno_la_SOURCES = \
@@ -23,28 +18,6 @@ libfreedreno_la_SOURCES = \
 	$(a4xx_SOURCES) \
 	$(a5xx_SOURCES) \
 	$(a6xx_SOURCES) \
-	$(ir3_SOURCES) \
-	$(ir3_GENERATED_FILES)
-
-BUILT_SOURCES := $(ir3_GENERATED_FILES)
-CLEANFILES := $(BUILT_SOURCES)
-EXTRA_DIST = ir3/ir3_nir_trig.py
-
-noinst_PROGRAMS = ir3_compiler
-
-# XXX: Required due to the C++ sources in libnir
-nodist_EXTRA_ir3_compiler_SOURCES = dummy.cpp
-ir3_compiler_SOURCES = \
-	ir3/ir3_cmdline.c
-
-ir3_compiler_LDADD = \
-	libfreedreno.la \
-	$(top_builddir)/src/gallium/auxiliary/libgallium.la \
-	$(top_builddir)/src/compiler/nir/libnir.la \
-	$(top_builddir)/src/compiler/glsl/libstandalone.la \
-	$(top_builddir)/src/util/libmesautil.la \
-	$(top_builddir)/src/mesa/libmesagallium.la \
-	$(top_builddir)/src/freedreno/libfreedreno_drm.la \
-	$(GALLIUM_COMMON_LIB_DEPS)
+	$(ir3_SOURCES)
 
-EXTRA_DIST += meson.build
+EXTRA_DIST = meson.build
diff --git a/src/gallium/drivers/freedreno/Makefile.sources b/src/gallium/drivers/freedreno/Makefile.sources
index bde217d80a2..039a8ca7af7 100644
--- a/src/gallium/drivers/freedreno/Makefile.sources
+++ b/src/gallium/drivers/freedreno/Makefile.sources
@@ -195,29 +195,8 @@ a6xx_SOURCES := \
 	a6xx/fd6_zsa.h
 
 ir3_SOURCES := \
-	ir3/disasm-a3xx.c \
-	ir3/instr-a3xx.h \
-	ir3/ir3.c \
 	ir3/ir3_cache.c \
 	ir3/ir3_cache.h \
-	ir3/ir3_compiler_nir.c \
-	ir3/ir3_compiler.c \
-	ir3/ir3_compiler.h \
-	ir3/ir3_cp.c \
-	ir3/ir3_depth.c \
 	ir3/ir3_gallium.c \
-	ir3/ir3_gallium.h \
-	ir3/ir3_group.c \
-	ir3/ir3.h \
-	ir3/ir3_legalize.c \
-	ir3/ir3_nir.c \
-	ir3/ir3_nir.h \
-	ir3/ir3_nir_lower_tg4_to_tex.c \
-	ir3/ir3_print.c \
-	ir3/ir3_ra.c \
-	ir3/ir3_sched.c \
-	ir3/ir3_shader.c \
-	ir3/ir3_shader.h
+	ir3/ir3_gallium.h
 
-ir3_GENERATED_FILES := \
-	ir3/ir3_nir_trig.c
diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_context.h b/src/gallium/drivers/freedreno/a3xx/fd3_context.h
index 4596aeee025..0c9412a7501 100644
--- a/src/gallium/drivers/freedreno/a3xx/fd3_context.h
+++ b/src/gallium/drivers/freedreno/a3xx/fd3_context.h
@@ -31,7 +31,7 @@
 
 #include "freedreno_context.h"
 
-#include "ir3_shader.h"
+#include "ir3/ir3_shader.h"
 
 
 struct fd3_context {
diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_program.h b/src/gallium/drivers/freedreno/a3xx/fd3_program.h
index 0551f1f8b91..533838a9a6d 100644
--- a/src/gallium/drivers/freedreno/a3xx/fd3_program.h
+++ b/src/gallium/drivers/freedreno/a3xx/fd3_program.h
@@ -29,7 +29,8 @@
 
 #include "pipe/p_context.h"
 #include "freedreno_context.h"
-#include "ir3_shader.h"
+
+#include "ir3/ir3_shader.h"
 
 struct fd3_emit;
 
diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_screen.c b/src/gallium/drivers/freedreno/a3xx/fd3_screen.c
index a010a4df9a1..7ed57d2de5a 100644
--- a/src/gallium/drivers/freedreno/a3xx/fd3_screen.c
+++ b/src/gallium/drivers/freedreno/a3xx/fd3_screen.c
@@ -30,7 +30,8 @@
 #include "fd3_screen.h"
 #include "fd3_context.h"
 #include "fd3_format.h"
-#include "ir3_compiler.h"
+
+#include "ir3/ir3_compiler.h"
 
 static boolean
 fd3_screen_is_format_supported(struct pipe_screen *pscreen,
diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_context.h b/src/gallium/drivers/freedreno/a4xx/fd4_context.h
index a4b84d400ef..a84e3a90f83 100644
--- a/src/gallium/drivers/freedreno/a4xx/fd4_context.h
+++ b/src/gallium/drivers/freedreno/a4xx/fd4_context.h
@@ -31,7 +31,7 @@
 
 #include "freedreno_context.h"
 
-#include "ir3_shader.h"
+#include "ir3/ir3_shader.h"
 
 struct fd4_context {
 	struct fd_context base;
diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_program.h b/src/gallium/drivers/freedreno/a4xx/fd4_program.h
index cc98bc9a4d6..a0a0bec264f 100644
--- a/src/gallium/drivers/freedreno/a4xx/fd4_program.h
+++ b/src/gallium/drivers/freedreno/a4xx/fd4_program.h
@@ -29,7 +29,8 @@
 
 #include "pipe/p_context.h"
 #include "freedreno_context.h"
-#include "ir3_shader.h"
+
+#include "ir3/ir3_shader.h"
 
 struct fd4_emit;
 
diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_screen.c b/src/gallium/drivers/freedreno/a4xx/fd4_screen.c
index 4e4e274cd10..961e907b779 100644
--- a/src/gallium/drivers/freedreno/a4xx/fd4_screen.c
+++ b/src/gallium/drivers/freedreno/a4xx/fd4_screen.c
@@ -30,7 +30,8 @@
 #include "fd4_screen.h"
 #include "fd4_context.h"
 #include "fd4_format.h"
-#include "ir3_compiler.h"
+
+#include "ir3/ir3_compiler.h"
 
 static boolean
 fd4_screen_is_format_supported(struct pipe_screen *pscreen,
diff --git a/src/gallium/drivers/freedreno/a5xx/fd5_context.h b/src/gallium/drivers/freedreno/a5xx/fd5_context.h
index 0cd252167b7..324878b4348 100644
--- a/src/gallium/drivers/freedreno/a5xx/fd5_context.h
+++ b/src/gallium/drivers/freedreno/a5xx/fd5_context.h
@@ -31,7 +31,7 @@
 
 #include "freedreno_context.h"
 
-#include "ir3_shader.h"
+#include "ir3/ir3_shader.h"
 
 struct fd5_context {
 	struct fd_context base;
diff --git a/src/gallium/drivers/freedreno/a5xx/fd5_program.h b/src/gallium/drivers/freedreno/a5xx/fd5_program.h
index 72cbf9a8b88..cdb31c62b63 100644
--- a/src/gallium/drivers/freedreno/a5xx/fd5_program.h
+++ b/src/gallium/drivers/freedreno/a5xx/fd5_program.h
@@ -29,7 +29,8 @@
 
 #include "pipe/p_context.h"
 #include "freedreno_context.h"
-#include "ir3_shader.h"
+
+#include "ir3/ir3_shader.h"
 
 struct fd5_emit;
 
diff --git a/src/gallium/drivers/freedreno/a5xx/fd5_screen.c b/src/gallium/drivers/freedreno/a5xx/fd5_screen.c
index 7d8d2b3e5b8..db961790879 100644
--- a/src/gallium/drivers/freedreno/a5xx/fd5_screen.c
+++ b/src/gallium/drivers/freedreno/a5xx/fd5_screen.c
@@ -33,7 +33,7 @@
 #include "fd5_format.h"
 #include "fd5_resource.h"
 
-#include "ir3_compiler.h"
+#include "ir3/ir3_compiler.h"
 
 static bool
 valid_sample_count(unsigned sample_count)
diff --git a/src/gallium/drivers/freedreno/a6xx/fd6_context.h b/src/gallium/drivers/freedreno/a6xx/fd6_context.h
index f3cdd44dec4..2493813fe1a 100644
--- a/src/gallium/drivers/freedreno/a6xx/fd6_context.h
+++ b/src/gallium/drivers/freedreno/a6xx/fd6_context.h
@@ -32,7 +32,7 @@
 
 #include "freedreno_context.h"
 
-#include "ir3_shader.h"
+#include "ir3/ir3_shader.h"
 
 #include "a6xx.xml.h"
 
diff --git a/src/gallium/drivers/freedreno/a6xx/fd6_program.h b/src/gallium/drivers/freedreno/a6xx/fd6_program.h
index 83c4688a243..3ed5426b50e 100644
--- a/src/gallium/drivers/freedreno/a6xx/fd6_program.h
+++ b/src/gallium/drivers/freedreno/a6xx/fd6_program.h
@@ -30,7 +30,8 @@
 
 #include "pipe/p_context.h"
 #include "freedreno_context.h"
-#include "ir3_shader.h"
+
+#include "ir3/ir3_shader.h"
 #include "ir3_cache.h"
 
 struct fd6_streamout_state {
diff --git a/src/gallium/drivers/freedreno/a6xx/fd6_screen.c b/src/gallium/drivers/freedreno/a6xx/fd6_screen.c
index 9e039bf87a9..a191ea696ba 100644
--- a/src/gallium/drivers/freedreno/a6xx/fd6_screen.c
+++ b/src/gallium/drivers/freedreno/a6xx/fd6_screen.c
@@ -33,7 +33,7 @@
 #include "fd6_format.h"
 #include "fd6_resource.h"
 
-#include "ir3_compiler.h"
+#include "ir3/ir3_compiler.h"
 
 static boolean
 fd6_screen_is_format_supported(struct pipe_screen *pscreen,
diff --git a/src/gallium/drivers/freedreno/ir3/disasm-a3xx.c b/src/gallium/drivers/freedreno/ir3/disasm-a3xx.c
deleted file mode 100644
index 4cf45ce9227..00000000000
--- a/src/gallium/drivers/freedreno/ir3/disasm-a3xx.c
+++ /dev/null
@@ -1,1038 +0,0 @@
-/*
- * Copyright (c) 2013 Rob Clark <robdclark@gmail.com>
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <stdint.h>
-#include <stdbool.h>
-#include <string.h>
-#include <assert.h>
-
-#include <util/u_debug.h>
-
-#include "instr-a3xx.h"
-
-/* bitmask of debug flags */
-enum debug_t {
-	PRINT_RAW      = 0x1,    /* dump raw hexdump */
-	PRINT_VERBOSE  = 0x2,
-};
-
-static enum debug_t debug;
-
-#define printf debug_printf
-
-static const char *levels[] = {
-		"",
-		"\t",
-		"\t\t",
-		"\t\t\t",
-		"\t\t\t\t",
-		"\t\t\t\t\t",
-		"\t\t\t\t\t\t",
-		"\t\t\t\t\t\t\t",
-		"\t\t\t\t\t\t\t\t",
-		"\t\t\t\t\t\t\t\t\t",
-		"x",
-		"x",
-		"x",
-		"x",
-		"x",
-		"x",
-};
-
-static const char *component = "xyzw";
-
-static const char *type[] = {
-		[TYPE_F16] = "f16",
-		[TYPE_F32] = "f32",
-		[TYPE_U16] = "u16",
-		[TYPE_U32] = "u32",
-		[TYPE_S16] = "s16",
-		[TYPE_S32] = "s32",
-		[TYPE_U8]  = "u8",
-		[TYPE_S8]  = "s8",
-};
-
-struct disasm_ctx {
-	FILE *out;
-	int level;
-
-	/* current instruction repeat flag: */
-	unsigned repeat;
-};
-
-static void print_reg(struct disasm_ctx *ctx, reg_t reg, bool full, bool r,
-		bool c, bool im, bool neg, bool abs, bool addr_rel)
-{
-	const char type = c ? 'c' : 'r';
-
-	// XXX I prefer - and || for neg/abs, but preserving format used
-	// by libllvm-a3xx for easy diffing..
-
-	if (abs && neg)
-		fprintf(ctx->out, "(absneg)");
-	else if (neg)
-		fprintf(ctx->out, "(neg)");
-	else if (abs)
-		fprintf(ctx->out, "(abs)");
-
-	if (r)
-		fprintf(ctx->out, "(r)");
-
-	if (im) {
-		fprintf(ctx->out, "%d", reg.iim_val);
-	} else if (addr_rel) {
-		/* I would just use %+d but trying to make it diff'able with
-		 * libllvm-a3xx...
-		 */
-		if (reg.iim_val < 0)
-			fprintf(ctx->out, "%s%c<a0.x - %d>", full ? "" : "h", type, -reg.iim_val);
-		else if (reg.iim_val > 0)
-			fprintf(ctx->out, "%s%c<a0.x + %d>", full ? "" : "h", type, reg.iim_val);
-		else
-			fprintf(ctx->out, "%s%c<a0.x>", full ? "" : "h", type);
-	} else if ((reg.num == REG_A0) && !c) {
-		fprintf(ctx->out, "a0.%c", component[reg.comp]);
-	} else if ((reg.num == REG_P0) && !c) {
-		fprintf(ctx->out, "p0.%c", component[reg.comp]);
-	} else {
-		fprintf(ctx->out, "%s%c%d.%c", full ? "" : "h", type, reg.num & 0x3f, component[reg.comp]);
-	}
-}
-
-
-static void print_reg_dst(struct disasm_ctx *ctx, reg_t reg, bool full, bool addr_rel)
-{
-	print_reg(ctx, reg, full, false, false, false, false, false, addr_rel);
-}
-
-static void print_reg_src(struct disasm_ctx *ctx, reg_t reg, bool full, bool r,
-		bool c, bool im, bool neg, bool abs, bool addr_rel)
-{
-	print_reg(ctx, reg, full, r, c, im, neg, abs, addr_rel);
-}
-
-/* TODO switch to using reginfo struct everywhere, since more readable
- * than passing a bunch of bools to print_reg_src
- */
-
-struct reginfo {
-	reg_t reg;
-	bool full;
-	bool r;
-	bool c;
-	bool im;
-	bool neg;
-	bool abs;
-	bool addr_rel;
-};
-
-static void print_src(struct disasm_ctx *ctx, struct reginfo *info)
-{
-	print_reg_src(ctx, info->reg, info->full, info->r, info->c, info->im,
-			info->neg, info->abs, info->addr_rel);
-}
-
-//static void print_dst(struct disasm_ctx *ctx, struct reginfo *info)
-//{
-//	print_reg_dst(ctx, info->reg, info->full, info->addr_rel);
-//}
-
-static void print_instr_cat0(struct disasm_ctx *ctx, instr_t *instr)
-{
-	instr_cat0_t *cat0 = &instr->cat0;
-
-	switch (cat0->opc) {
-	case OPC_KILL:
-		fprintf(ctx->out, " %sp0.%c", cat0->inv ? "!" : "",
-				component[cat0->comp]);
-		break;
-	case OPC_BR:
-		fprintf(ctx->out, " %sp0.%c, #%d", cat0->inv ? "!" : "",
-				component[cat0->comp], cat0->a3xx.immed);
-		break;
-	case OPC_JUMP:
-	case OPC_CALL:
-		fprintf(ctx->out, " #%d", cat0->a3xx.immed);
-		break;
-	}
-
-	if ((debug & PRINT_VERBOSE) && (cat0->dummy2|cat0->dummy3|cat0->dummy4))
-		fprintf(ctx->out, "\t{0: %x,%x,%x}", cat0->dummy2, cat0->dummy3, cat0->dummy4);
-}
-
-static void print_instr_cat1(struct disasm_ctx *ctx, instr_t *instr)
-{
-	instr_cat1_t *cat1 = &instr->cat1;
-
-	if (cat1->ul)
-		fprintf(ctx->out, "(ul)");
-
-	if (cat1->src_type == cat1->dst_type) {
-		if ((cat1->src_type == TYPE_S16) && (((reg_t)cat1->dst).num == REG_A0)) {
-			/* special case (nmemonic?): */
-			fprintf(ctx->out, "mova");
-		} else {
-			fprintf(ctx->out, "mov.%s%s", type[cat1->src_type], type[cat1->dst_type]);
-		}
-	} else {
-		fprintf(ctx->out, "cov.%s%s", type[cat1->src_type], type[cat1->dst_type]);
-	}
-
-	fprintf(ctx->out, " ");
-
-	if (cat1->even)
-		fprintf(ctx->out, "(even)");
-
-	if (cat1->pos_inf)
-		fprintf(ctx->out, "(pos_infinity)");
-
-	print_reg_dst(ctx, (reg_t)(cat1->dst), type_size(cat1->dst_type) == 32,
-			cat1->dst_rel);
-
-	fprintf(ctx->out, ", ");
-
-	/* ugg, have to special case this.. vs print_reg().. */
-	if (cat1->src_im) {
-		if (type_float(cat1->src_type))
-			fprintf(ctx->out, "(%f)", cat1->fim_val);
-		else if (type_uint(cat1->src_type))
-			fprintf(ctx->out, "0x%08x", cat1->uim_val);
-		else
-			fprintf(ctx->out, "%d", cat1->iim_val);
-	} else if (cat1->src_rel && !cat1->src_c) {
-		/* I would just use %+d but trying to make it diff'able with
-		 * libllvm-a3xx...
-		 */
-		char type = cat1->src_rel_c ? 'c' : 'r';
-		if (cat1->off < 0)
-			fprintf(ctx->out, "%c<a0.x - %d>", type, -cat1->off);
-		else if (cat1->off > 0)
-			fprintf(ctx->out, "%c<a0.x + %d>", type, cat1->off);
-		else
-			fprintf(ctx->out, "%c<a0.x>", type);
-	} else {
-		print_reg_src(ctx, (reg_t)(cat1->src), type_size(cat1->src_type) == 32,
-				cat1->src_r, cat1->src_c, cat1->src_im, false, false, false);
-	}
-
-	if ((debug & PRINT_VERBOSE) && (cat1->must_be_0))
-		fprintf(ctx->out, "\t{1: %x}", cat1->must_be_0);
-}
-
-static void print_instr_cat2(struct disasm_ctx *ctx, instr_t *instr)
-{
-	instr_cat2_t *cat2 = &instr->cat2;
-	static const char *cond[] = {
-			"lt",
-			"le",
-			"gt",
-			"ge",
-			"eq",
-			"ne",
-			"?6?",
-	};
-
-	switch (_OPC(2, cat2->opc)) {
-	case OPC_CMPS_F:
-	case OPC_CMPS_U:
-	case OPC_CMPS_S:
-	case OPC_CMPV_F:
-	case OPC_CMPV_U:
-	case OPC_CMPV_S:
-		fprintf(ctx->out, ".%s", cond[cat2->cond]);
-		break;
-	}
-
-	fprintf(ctx->out, " ");
-	if (cat2->ei)
-		fprintf(ctx->out, "(ei)");
-	print_reg_dst(ctx, (reg_t)(cat2->dst), cat2->full ^ cat2->dst_half, false);
-	fprintf(ctx->out, ", ");
-
-	if (cat2->c1.src1_c) {
-		print_reg_src(ctx, (reg_t)(cat2->c1.src1), cat2->full, cat2->src1_r,
-				cat2->c1.src1_c, cat2->src1_im, cat2->src1_neg,
-				cat2->src1_abs, false);
-	} else if (cat2->rel1.src1_rel) {
-		print_reg_src(ctx, (reg_t)(cat2->rel1.src1), cat2->full, cat2->src1_r,
-				cat2->rel1.src1_c, cat2->src1_im, cat2->src1_neg,
-				cat2->src1_abs, cat2->rel1.src1_rel);
-	} else {
-		print_reg_src(ctx, (reg_t)(cat2->src1), cat2->full, cat2->src1_r,
-				false, cat2->src1_im, cat2->src1_neg,
-				cat2->src1_abs, false);
-	}
-
-	switch (_OPC(2, cat2->opc)) {
-	case OPC_ABSNEG_F:
-	case OPC_ABSNEG_S:
-	case OPC_CLZ_B:
-	case OPC_CLZ_S:
-	case OPC_SIGN_F:
-	case OPC_FLOOR_F:
-	case OPC_CEIL_F:
-	case OPC_RNDNE_F:
-	case OPC_RNDAZ_F:
-	case OPC_TRUNC_F:
-	case OPC_NOT_B:
-	case OPC_BFREV_B:
-	case OPC_SETRM:
-	case OPC_CBITS_B:
-		/* these only have one src reg */
-		break;
-	default:
-		fprintf(ctx->out, ", ");
-		if (cat2->c2.src2_c) {
-			print_reg_src(ctx, (reg_t)(cat2->c2.src2), cat2->full, cat2->src2_r,
-					cat2->c2.src2_c, cat2->src2_im, cat2->src2_neg,
-					cat2->src2_abs, false);
-		} else if (cat2->rel2.src2_rel) {
-			print_reg_src(ctx, (reg_t)(cat2->rel2.src2), cat2->full, cat2->src2_r,
-					cat2->rel2.src2_c, cat2->src2_im, cat2->src2_neg,
-					cat2->src2_abs, cat2->rel2.src2_rel);
-		} else {
-			print_reg_src(ctx, (reg_t)(cat2->src2), cat2->full, cat2->src2_r,
-					false, cat2->src2_im, cat2->src2_neg,
-					cat2->src2_abs, false);
-		}
-		break;
-	}
-}
-
-static void print_instr_cat3(struct disasm_ctx *ctx, instr_t *instr)
-{
-	instr_cat3_t *cat3 = &instr->cat3;
-	bool full = instr_cat3_full(cat3);
-
-	fprintf(ctx->out, " ");
-	print_reg_dst(ctx, (reg_t)(cat3->dst), full ^ cat3->dst_half, false);
-	fprintf(ctx->out, ", ");
-	if (cat3->c1.src1_c) {
-		print_reg_src(ctx, (reg_t)(cat3->c1.src1), full,
-				cat3->src1_r, cat3->c1.src1_c, false, cat3->src1_neg,
-				false, false);
-	} else if (cat3->rel1.src1_rel) {
-		print_reg_src(ctx, (reg_t)(cat3->rel1.src1), full,
-				cat3->src1_r, cat3->rel1.src1_c, false, cat3->src1_neg,
-				false, cat3->rel1.src1_rel);
-	} else {
-		print_reg_src(ctx, (reg_t)(cat3->src1), full,
-				cat3->src1_r, false, false, cat3->src1_neg,
-				false, false);
-	}
-	fprintf(ctx->out, ", ");
-	print_reg_src(ctx, (reg_t)cat3->src2, full,
-			cat3->src2_r, cat3->src2_c, false, cat3->src2_neg,
-			false, false);
-	fprintf(ctx->out, ", ");
-	if (cat3->c2.src3_c) {
-		print_reg_src(ctx, (reg_t)(cat3->c2.src3), full,
-				cat3->src3_r, cat3->c2.src3_c, false, cat3->src3_neg,
-				false, false);
-	} else if (cat3->rel2.src3_rel) {
-		print_reg_src(ctx, (reg_t)(cat3->rel2.src3), full,
-				cat3->src3_r, cat3->rel2.src3_c, false, cat3->src3_neg,
-				false, cat3->rel2.src3_rel);
-	} else {
-		print_reg_src(ctx, (reg_t)(cat3->src3), full,
-				cat3->src3_r, false, false, cat3->src3_neg,
-				false, false);
-	}
-}
-
-static void print_instr_cat4(struct disasm_ctx *ctx, instr_t *instr)
-{
-	instr_cat4_t *cat4 = &instr->cat4;
-
-	fprintf(ctx->out, " ");
-	print_reg_dst(ctx, (reg_t)(cat4->dst), cat4->full ^ cat4->dst_half, false);
-	fprintf(ctx->out, ", ");
-
-	if (cat4->c.src_c) {
-		print_reg_src(ctx, (reg_t)(cat4->c.src), cat4->full,
-				cat4->src_r, cat4->c.src_c, cat4->src_im,
-				cat4->src_neg, cat4->src_abs, false);
-	} else if (cat4->rel.src_rel) {
-		print_reg_src(ctx, (reg_t)(cat4->rel.src), cat4->full,
-				cat4->src_r, cat4->rel.src_c, cat4->src_im,
-				cat4->src_neg, cat4->src_abs, cat4->rel.src_rel);
-	} else {
-		print_reg_src(ctx, (reg_t)(cat4->src), cat4->full,
-				cat4->src_r, false, cat4->src_im,
-				cat4->src_neg, cat4->src_abs, false);
-	}
-
-	if ((debug & PRINT_VERBOSE) && (cat4->dummy1|cat4->dummy2))
-		fprintf(ctx->out, "\t{4: %x,%x}", cat4->dummy1, cat4->dummy2);
-}
-
-static void print_instr_cat5(struct disasm_ctx *ctx, instr_t *instr)
-{
-	static const struct {
-		bool src1, src2, samp, tex;
-	} info[0x1f] = {
-			[opc_op(OPC_ISAM)]     = { true,  false, true,  true,  },
-			[opc_op(OPC_ISAML)]    = { true,  true,  true,  true,  },
-			[opc_op(OPC_ISAMM)]    = { true,  false, true,  true,  },
-			[opc_op(OPC_SAM)]      = { true,  false, true,  true,  },
-			[opc_op(OPC_SAMB)]     = { true,  true,  true,  true,  },
-			[opc_op(OPC_SAML)]     = { true,  true,  true,  true,  },
-			[opc_op(OPC_SAMGQ)]    = { true,  false, true,  true,  },
-			[opc_op(OPC_GETLOD)]   = { true,  false, true,  true,  },
-			[opc_op(OPC_CONV)]     = { true,  true,  true,  true,  },
-			[opc_op(OPC_CONVM)]    = { true,  true,  true,  true,  },
-			[opc_op(OPC_GETSIZE)]  = { true,  false, false, true,  },
-			[opc_op(OPC_GETBUF)]   = { false, false, false, true,  },
-			[opc_op(OPC_GETPOS)]   = { true,  false, false, true,  },
-			[opc_op(OPC_GETINFO)]  = { false, false, false, true,  },
-			[opc_op(OPC_DSX)]      = { true,  false, false, false, },
-			[opc_op(OPC_DSY)]      = { true,  false, false, false, },
-			[opc_op(OPC_GATHER4R)] = { true,  false, true,  true,  },
-			[opc_op(OPC_GATHER4G)] = { true,  false, true,  true,  },
-			[opc_op(OPC_GATHER4B)] = { true,  false, true,  true,  },
-			[opc_op(OPC_GATHER4A)] = { true,  false, true,  true,  },
-			[opc_op(OPC_SAMGP0)]   = { true,  false, true,  true,  },
-			[opc_op(OPC_SAMGP1)]   = { true,  false, true,  true,  },
-			[opc_op(OPC_SAMGP2)]   = { true,  false, true,  true,  },
-			[opc_op(OPC_SAMGP3)]   = { true,  false, true,  true,  },
-			[opc_op(OPC_DSXPP_1)]  = { true,  false, false, false, },
-			[opc_op(OPC_DSYPP_1)]  = { true,  false, false, false, },
-			[opc_op(OPC_RGETPOS)]  = { false, false, false, false, },
-			[opc_op(OPC_RGETINFO)] = { false, false, false, false, },
-	};
-	instr_cat5_t *cat5 = &instr->cat5;
-	int i;
-
-	if (cat5->is_3d)   fprintf(ctx->out, ".3d");
-	if (cat5->is_a)    fprintf(ctx->out, ".a");
-	if (cat5->is_o)    fprintf(ctx->out, ".o");
-	if (cat5->is_p)    fprintf(ctx->out, ".p");
-	if (cat5->is_s)    fprintf(ctx->out, ".s");
-	if (cat5->is_s2en) fprintf(ctx->out, ".s2en");
-
-	fprintf(ctx->out, " ");
-
-	switch (_OPC(5, cat5->opc)) {
-	case OPC_DSXPP_1:
-	case OPC_DSYPP_1:
-		break;
-	default:
-		fprintf(ctx->out, "(%s)", type[cat5->type]);
-		break;
-	}
-
-	fprintf(ctx->out, "(");
-	for (i = 0; i < 4; i++)
-		if (cat5->wrmask & (1 << i))
-			fprintf(ctx->out, "%c", "xyzw"[i]);
-	fprintf(ctx->out, ")");
-
-	print_reg_dst(ctx, (reg_t)(cat5->dst), type_size(cat5->type) == 32, false);
-
-	if (info[cat5->opc].src1) {
-		fprintf(ctx->out, ", ");
-		print_reg_src(ctx, (reg_t)(cat5->src1), cat5->full, false, false, false,
-				false, false, false);
-	}
-
-	if (cat5->is_s2en) {
-		fprintf(ctx->out, ", ");
-		print_reg_src(ctx, (reg_t)(cat5->s2en.src2), cat5->full, false, false, false,
-				false, false, false);
-		fprintf(ctx->out, ", ");
-		print_reg_src(ctx, (reg_t)(cat5->s2en.src3), false, false, false, false,
-				false, false, false);
-	} else {
-		if (cat5->is_o || info[cat5->opc].src2) {
-			fprintf(ctx->out, ", ");
-			print_reg_src(ctx, (reg_t)(cat5->norm.src2), cat5->full,
-					false, false, false, false, false, false);
-		}
-		if (info[cat5->opc].samp)
-			fprintf(ctx->out, ", s#%d", cat5->norm.samp);
-		if (info[cat5->opc].tex)
-			fprintf(ctx->out, ", t#%d", cat5->norm.tex);
-	}
-
-	if (debug & PRINT_VERBOSE) {
-		if (cat5->is_s2en) {
-			if ((debug & PRINT_VERBOSE) && (cat5->s2en.dummy1|cat5->s2en.dummy2|cat5->dummy2))
-				fprintf(ctx->out, "\t{5: %x,%x,%x}", cat5->s2en.dummy1, cat5->s2en.dummy2, cat5->dummy2);
-		} else {
-			if ((debug & PRINT_VERBOSE) && (cat5->norm.dummy1|cat5->dummy2))
-				fprintf(ctx->out, "\t{5: %x,%x}", cat5->norm.dummy1, cat5->dummy2);
-		}
-	}
-}
-
-static void print_instr_cat6(struct disasm_ctx *ctx, instr_t *instr)
-{
-	instr_cat6_t *cat6 = &instr->cat6;
-	char sd = 0, ss = 0;  /* dst/src address space */
-	bool nodst = false;
-	struct reginfo dst, src1, src2;
-	int src1off = 0, dstoff = 0;
-
-	memset(&dst, 0, sizeof(dst));
-	memset(&src1, 0, sizeof(src1));
-	memset(&src2, 0, sizeof(src2));
-
-	switch (_OPC(6, cat6->opc)) {
-	case OPC_RESINFO:
-	case OPC_RESFMT:
-		dst.full  = type_size(cat6->type) == 32;
-		src1.full = type_size(cat6->type) == 32;
-		src2.full = type_size(cat6->type) == 32;
-		break;
-	case OPC_L2G:
-	case OPC_G2L:
-		dst.full = true;
-		src1.full = true;
-		src2.full = true;
-		break;
-	case OPC_STG:
-	case OPC_STL:
-	case OPC_STP:
-	case OPC_STI:
-	case OPC_STLW:
-	case OPC_STIB:
-		dst.full  = true;
-		src1.full = type_size(cat6->type) == 32;
-		src2.full = type_size(cat6->type) == 32;
-		break;
-	default:
-		dst.full  = type_size(cat6->type) == 32;
-		src1.full = true;
-		src2.full = true;
-		break;
-	}
-
-	switch (_OPC(6, cat6->opc)) {
-	case OPC_PREFETCH:
-		break;
-	case OPC_RESINFO:
-		fprintf(ctx->out, ".%dd", cat6->ldgb.d + 1);
-		break;
-	case OPC_LDGB:
-		fprintf(ctx->out, ".%s", cat6->ldgb.typed ? "typed" : "untyped");
-		fprintf(ctx->out, ".%dd", cat6->ldgb.d + 1);
-		fprintf(ctx->out, ".%s", type[cat6->type]);
-		fprintf(ctx->out, ".%d", cat6->ldgb.type_size + 1);
-		break;
-	case OPC_STGB:
-	case OPC_STIB:
-		fprintf(ctx->out, ".%s", cat6->stgb.typed ? "typed" : "untyped");
-		fprintf(ctx->out, ".%dd", cat6->stgb.d + 1);
-		fprintf(ctx->out, ".%s", type[cat6->type]);
-		fprintf(ctx->out, ".%d", cat6->stgb.type_size + 1);
-		break;
-	case OPC_ATOMIC_ADD:
-	case OPC_ATOMIC_SUB:
-	case OPC_ATOMIC_XCHG:
-	case OPC_ATOMIC_INC:
-	case OPC_ATOMIC_DEC:
-	case OPC_ATOMIC_CMPXCHG:
-	case OPC_ATOMIC_MIN:
-	case OPC_ATOMIC_MAX:
-	case OPC_ATOMIC_AND:
-	case OPC_ATOMIC_OR:
-	case OPC_ATOMIC_XOR:
-		ss = cat6->g ? 'g' : 'l';
-		fprintf(ctx->out, ".%s", cat6->ldgb.typed ? "typed" : "untyped");
-		fprintf(ctx->out, ".%dd", cat6->ldgb.d + 1);
-		fprintf(ctx->out, ".%s", type[cat6->type]);
-		fprintf(ctx->out, ".%d", cat6->ldgb.type_size + 1);
-		fprintf(ctx->out, ".%c", ss);
-		break;
-	default:
-		dst.im = cat6->g && !cat6->dst_off;
-		fprintf(ctx->out, ".%s", type[cat6->type]);
-		break;
-	}
-	fprintf(ctx->out, " ");
-
-	switch (_OPC(6, cat6->opc)) {
-	case OPC_STG:
-		sd = 'g';
-		break;
-	case OPC_STP:
-		sd = 'p';
-		break;
-	case OPC_STL:
-	case OPC_STLW:
-		sd = 'l';
-		break;
-
-	case OPC_LDG:
-	case OPC_LDC:
-		ss = 'g';
-		break;
-	case OPC_LDP:
-		ss = 'p';
-		break;
-	case OPC_LDL:
-	case OPC_LDLW:
-	case OPC_LDLV:
-		ss = 'l';
-		break;
-
-	case OPC_L2G:
-		ss = 'l';
-		sd = 'g';
-		break;
-
-	case OPC_G2L:
-		ss = 'g';
-		sd = 'l';
-		break;
-
-	case OPC_PREFETCH:
-		ss = 'g';
-		nodst = true;
-		break;
-
-	case OPC_STI:
-		dst.full = false;  // XXX or inverts??
-		break;
-	}
-
-	if ((_OPC(6, cat6->opc) == OPC_STGB) || (_OPC(6, cat6->opc) == OPC_STIB)) {
-		struct reginfo src3;
-
-		memset(&src3, 0, sizeof(src3));
-
-		src1.reg = (reg_t)(cat6->stgb.src1);
-		src2.reg = (reg_t)(cat6->stgb.src2);
-		src2.im  = cat6->stgb.src2_im;
-		src3.reg = (reg_t)(cat6->stgb.src3);
-		src3.im  = cat6->stgb.src3_im;
-		src3.full = true;
-
-		fprintf(ctx->out, "g[%u], ", cat6->stgb.dst_ssbo);
-		print_src(ctx, &src1);
-		fprintf(ctx->out, ", ");
-		print_src(ctx, &src2);
-		fprintf(ctx->out, ", ");
-		print_src(ctx, &src3);
-
-		if (debug & PRINT_VERBOSE)
-			fprintf(ctx->out, " (pad0=%x, pad3=%x)", cat6->stgb.pad0, cat6->stgb.pad3);
-
-		return;
-	}
-
-	if (is_atomic(_OPC(6, cat6->opc))) {
-
-		src1.reg = (reg_t)(cat6->ldgb.src1);
-		src1.im  = cat6->ldgb.src1_im;
-		src2.reg = (reg_t)(cat6->ldgb.src2);
-		src2.im  = cat6->ldgb.src2_im;
-		dst.reg  = (reg_t)(cat6->ldgb.dst);
-
-		print_src(ctx, &dst);
-		fprintf(ctx->out, ", ");
-		if (ss == 'g') {
-			struct reginfo src3;
-			memset(&src3, 0, sizeof(src3));
-
-			src3.reg = (reg_t)(cat6->ldgb.src3);
-			src3.full = true;
-
-			/* For images, the ".typed" variant is used and src2 is
-			 * the ivecN coordinates, ie ivec2 for 2d.
-			 *
-			 * For SSBOs, the ".untyped" variant is used and src2 is
-			 * a simple dword offset..  src3 appears to be
-			 * uvec2(offset * 4, 0).  Not sure the point of that.
-			 */
-
-			fprintf(ctx->out, "g[%u], ", cat6->ldgb.src_ssbo);
-			print_src(ctx, &src1);  /* value */
-			fprintf(ctx->out, ", ");
-			print_src(ctx, &src2);  /* offset/coords */
-			fprintf(ctx->out, ", ");
-			print_src(ctx, &src3);  /* 64b byte offset.. */
-
-			if (debug & PRINT_VERBOSE) {
-				fprintf(ctx->out, " (pad0=%x, pad3=%x, mustbe0=%x)", cat6->ldgb.pad0,
-						cat6->ldgb.pad3, cat6->ldgb.mustbe0);
-			}
-		} else { /* ss == 'l' */
-			fprintf(ctx->out, "l[");
-			print_src(ctx, &src1);  /* simple byte offset */
-			fprintf(ctx->out, "], ");
-			print_src(ctx, &src2);  /* value */
-
-			if (debug & PRINT_VERBOSE) {
-				fprintf(ctx->out, " (src3=%x, pad0=%x, pad3=%x, mustbe0=%x)",
-						cat6->ldgb.src3, cat6->ldgb.pad0,
-						cat6->ldgb.pad3, cat6->ldgb.mustbe0);
-			}
-		}
-
-		return;
-	} else if (_OPC(6, cat6->opc) == OPC_RESINFO) {
-		dst.reg  = (reg_t)(cat6->ldgb.dst);
-
-		print_src(ctx, &dst);
-		fprintf(ctx->out, ", ");
-		fprintf(ctx->out, "g[%u]", cat6->ldgb.src_ssbo);
-
-		return;
-	} else if (_OPC(6, cat6->opc) == OPC_LDGB) {
-
-		src1.reg = (reg_t)(cat6->ldgb.src1);
-		src1.im  = cat6->ldgb.src1_im;
-		src2.reg = (reg_t)(cat6->ldgb.src2);
-		src2.im  = cat6->ldgb.src2_im;
-		dst.reg  = (reg_t)(cat6->ldgb.dst);
-
-		print_src(ctx, &dst);
-		fprintf(ctx->out, ", ");
-		fprintf(ctx->out, "g[%u], ", cat6->ldgb.src_ssbo);
-		print_src(ctx, &src1);
-		fprintf(ctx->out, ", ");
-		print_src(ctx, &src2);
-
-		if (debug & PRINT_VERBOSE)
-			fprintf(ctx->out, " (pad0=%x, pad3=%x, mustbe0=%x)", cat6->ldgb.pad0, cat6->ldgb.pad3, cat6->ldgb.mustbe0);
-
-		return;
-	}
-	if (cat6->dst_off) {
-		dst.reg = (reg_t)(cat6->c.dst);
-		dstoff  = cat6->c.off;
-	} else {
-		dst.reg = (reg_t)(cat6->d.dst);
-	}
-
-	if (cat6->src_off) {
-		src1.reg = (reg_t)(cat6->a.src1);
-		src1.im  = cat6->a.src1_im;
-		src2.reg = (reg_t)(cat6->a.src2);
-		src2.im  = cat6->a.src2_im;
-		src1off  = cat6->a.off;
-	} else {
-		src1.reg = (reg_t)(cat6->b.src1);
-		src1.im  = cat6->b.src1_im;
-		src2.reg = (reg_t)(cat6->b.src2);
-		src2.im  = cat6->b.src2_im;
-	}
-
-	if (!nodst) {
-		if (sd)
-			fprintf(ctx->out, "%c[", sd);
-		/* note: dst might actually be a src (ie. address to store to) */
-		print_src(ctx, &dst);
-		if (dstoff)
-			fprintf(ctx->out, "%+d", dstoff);
-		if (sd)
-			fprintf(ctx->out, "]");
-		fprintf(ctx->out, ", ");
-	}
-
-	if (ss)
-		fprintf(ctx->out, "%c[", ss);
-
-	/* can have a larger than normal immed, so hack: */
-	if (src1.im) {
-		fprintf(ctx->out, "%u", src1.reg.dummy13);
-	} else {
-		print_src(ctx, &src1);
-	}
-
-	if (src1off)
-		fprintf(ctx->out, "%+d", src1off);
-	if (ss)
-		fprintf(ctx->out, "]");
-
-	switch (_OPC(6, cat6->opc)) {
-	case OPC_RESINFO:
-	case OPC_RESFMT:
-		break;
-	default:
-		fprintf(ctx->out, ", ");
-		print_src(ctx, &src2);
-		break;
-	}
-}
-
-static void print_instr_cat7(struct disasm_ctx *ctx, instr_t *instr)
-{
-	instr_cat7_t *cat7 = &instr->cat7;
-
-	if (cat7->g)
-		fprintf(ctx->out, ".g");
-	if (cat7->l)
-		fprintf(ctx->out, ".l");
-
-	if (_OPC(7, cat7->opc) == OPC_FENCE) {
-		if (cat7->r)
-			fprintf(ctx->out, ".r");
-		if (cat7->w)
-			fprintf(ctx->out, ".w");
-	}
-}
-
-/* size of largest OPC field of all the instruction categories: */
-#define NOPC_BITS 6
-
-static const struct opc_info {
-	uint16_t cat;
-	uint16_t opc;
-	const char *name;
-	void (*print)(struct disasm_ctx *ctx, instr_t *instr);
-} opcs[1 << (3+NOPC_BITS)] = {
-#define OPC(cat, opc, name) [(opc)] = { (cat), (opc), #name, print_instr_cat##cat }
-	/* category 0: */
-	OPC(0, OPC_NOP,          nop),
-	OPC(0, OPC_BR,           br),
-	OPC(0, OPC_JUMP,         jump),
-	OPC(0, OPC_CALL,         call),
-	OPC(0, OPC_RET,          ret),
-	OPC(0, OPC_KILL,         kill),
-	OPC(0, OPC_END,          end),
-	OPC(0, OPC_EMIT,         emit),
-	OPC(0, OPC_CUT,          cut),
-	OPC(0, OPC_CHMASK,       chmask),
-	OPC(0, OPC_CHSH,         chsh),
-	OPC(0, OPC_FLOW_REV,     flow_rev),
-
-	/* category 1: */
-	OPC(1, OPC_MOV, ),
-
-	/* category 2: */
-	OPC(2, OPC_ADD_F,        add.f),
-	OPC(2, OPC_MIN_F,        min.f),
-	OPC(2, OPC_MAX_F,        max.f),
-	OPC(2, OPC_MUL_F,        mul.f),
-	OPC(2, OPC_SIGN_F,       sign.f),
-	OPC(2, OPC_CMPS_F,       cmps.f),
-	OPC(2, OPC_ABSNEG_F,     absneg.f),
-	OPC(2, OPC_CMPV_F,       cmpv.f),
-	OPC(2, OPC_FLOOR_F,      floor.f),
-	OPC(2, OPC_CEIL_F,       ceil.f),
-	OPC(2, OPC_RNDNE_F,      rndne.f),
-	OPC(2, OPC_RNDAZ_F,      rndaz.f),
-	OPC(2, OPC_TRUNC_F,      trunc.f),
-	OPC(2, OPC_ADD_U,        add.u),
-	OPC(2, OPC_ADD_S,        add.s),
-	OPC(2, OPC_SUB_U,        sub.u),
-	OPC(2, OPC_SUB_S,        sub.s),
-	OPC(2, OPC_CMPS_U,       cmps.u),
-	OPC(2, OPC_CMPS_S,       cmps.s),
-	OPC(2, OPC_MIN_U,        min.u),
-	OPC(2, OPC_MIN_S,        min.s),
-	OPC(2, OPC_MAX_U,        max.u),
-	OPC(2, OPC_MAX_S,        max.s),
-	OPC(2, OPC_ABSNEG_S,     absneg.s),
-	OPC(2, OPC_AND_B,        and.b),
-	OPC(2, OPC_OR_B,         or.b),
-	OPC(2, OPC_NOT_B,        not.b),
-	OPC(2, OPC_XOR_B,        xor.b),
-	OPC(2, OPC_CMPV_U,       cmpv.u),
-	OPC(2, OPC_CMPV_S,       cmpv.s),
-	OPC(2, OPC_MUL_U,        mul.u),
-	OPC(2, OPC_MUL_S,        mul.s),
-	OPC(2, OPC_MULL_U,       mull.u),
-	OPC(2, OPC_BFREV_B,      bfrev.b),
-	OPC(2, OPC_CLZ_S,        clz.s),
-	OPC(2, OPC_CLZ_B,        clz.b),
-	OPC(2, OPC_SHL_B,        shl.b),
-	OPC(2, OPC_SHR_B,        shr.b),
-	OPC(2, OPC_ASHR_B,       ashr.b),
-	OPC(2, OPC_BARY_F,       bary.f),
-	OPC(2, OPC_MGEN_B,       mgen.b),
-	OPC(2, OPC_GETBIT_B,     getbit.b),
-	OPC(2, OPC_SETRM,        setrm),
-	OPC(2, OPC_CBITS_B,      cbits.b),
-	OPC(2, OPC_SHB,          shb),
-	OPC(2, OPC_MSAD,         msad),
-
-	/* category 3: */
-	OPC(3, OPC_MAD_U16,      mad.u16),
-	OPC(3, OPC_MADSH_U16,    madsh.u16),
-	OPC(3, OPC_MAD_S16,      mad.s16),
-	OPC(3, OPC_MADSH_M16,    madsh.m16),
-	OPC(3, OPC_MAD_U24,      mad.u24),
-	OPC(3, OPC_MAD_S24,      mad.s24),
-	OPC(3, OPC_MAD_F16,      mad.f16),
-	OPC(3, OPC_MAD_F32,      mad.f32),
-	OPC(3, OPC_SEL_B16,      sel.b16),
-	OPC(3, OPC_SEL_B32,      sel.b32),
-	OPC(3, OPC_SEL_S16,      sel.s16),
-	OPC(3, OPC_SEL_S32,      sel.s32),
-	OPC(3, OPC_SEL_F16,      sel.f16),
-	OPC(3, OPC_SEL_F32,      sel.f32),
-	OPC(3, OPC_SAD_S16,      sad.s16),
-	OPC(3, OPC_SAD_S32,      sad.s32),
-
-	/* category 4: */
-	OPC(4, OPC_RCP,          rcp),
-	OPC(4, OPC_RSQ,          rsq),
-	OPC(4, OPC_LOG2,         log2),
-	OPC(4, OPC_EXP2,         exp2),
-	OPC(4, OPC_SIN,          sin),
-	OPC(4, OPC_COS,          cos),
-	OPC(4, OPC_SQRT,         sqrt),
-
-	/* category 5: */
-	OPC(5, OPC_ISAM,         isam),
-	OPC(5, OPC_ISAML,        isaml),
-	OPC(5, OPC_ISAMM,        isamm),
-	OPC(5, OPC_SAM,          sam),
-	OPC(5, OPC_SAMB,         samb),
-	OPC(5, OPC_SAML,         saml),
-	OPC(5, OPC_SAMGQ,        samgq),
-	OPC(5, OPC_GETLOD,       getlod),
-	OPC(5, OPC_CONV,         conv),
-	OPC(5, OPC_CONVM,        convm),
-	OPC(5, OPC_GETSIZE,      getsize),
-	OPC(5, OPC_GETBUF,       getbuf),
-	OPC(5, OPC_GETPOS,       getpos),
-	OPC(5, OPC_GETINFO,      getinfo),
-	OPC(5, OPC_DSX,          dsx),
-	OPC(5, OPC_DSY,          dsy),
-	OPC(5, OPC_GATHER4R,     gather4r),
-	OPC(5, OPC_GATHER4G,     gather4g),
-	OPC(5, OPC_GATHER4B,     gather4b),
-	OPC(5, OPC_GATHER4A,     gather4a),
-	OPC(5, OPC_SAMGP0,       samgp0),
-	OPC(5, OPC_SAMGP1,       samgp1),
-	OPC(5, OPC_SAMGP2,       samgp2),
-	OPC(5, OPC_SAMGP3,       samgp3),
-	OPC(5, OPC_DSXPP_1,      dsxpp.1),
-	OPC(5, OPC_DSYPP_1,      dsypp.1),
-	OPC(5, OPC_RGETPOS,      rgetpos),
-	OPC(5, OPC_RGETINFO,     rgetinfo),
-
-
-	/* category 6: */
-	OPC(6, OPC_LDG,          ldg),
-	OPC(6, OPC_LDL,          ldl),
-	OPC(6, OPC_LDP,          ldp),
-	OPC(6, OPC_STG,          stg),
-	OPC(6, OPC_STL,          stl),
-	OPC(6, OPC_STP,          stp),
-	OPC(6, OPC_STI,          sti),
-	OPC(6, OPC_G2L,          g2l),
-	OPC(6, OPC_L2G,          l2g),
-	OPC(6, OPC_PREFETCH,     prefetch),
-	OPC(6, OPC_LDLW,         ldlw),
-	OPC(6, OPC_STLW,         stlw),
-	OPC(6, OPC_RESFMT,       resfmt),
-	OPC(6, OPC_RESINFO,      resinfo),
-	OPC(6, OPC_ATOMIC_ADD,     atomic.add),
-	OPC(6, OPC_ATOMIC_SUB,     atomic.sub),
-	OPC(6, OPC_ATOMIC_XCHG,    atomic.xchg),
-	OPC(6, OPC_ATOMIC_INC,     atomic.inc),
-	OPC(6, OPC_ATOMIC_DEC,     atomic.dec),
-	OPC(6, OPC_ATOMIC_CMPXCHG, atomic.cmpxchg),
-	OPC(6, OPC_ATOMIC_MIN,     atomic.min),
-	OPC(6, OPC_ATOMIC_MAX,     atomic.max),
-	OPC(6, OPC_ATOMIC_AND,     atomic.and),
-	OPC(6, OPC_ATOMIC_OR,      atomic.or),
-	OPC(6, OPC_ATOMIC_XOR,     atomic.xor),
-	OPC(6, OPC_LDGB,         ldgb),
-	OPC(6, OPC_STGB,         stgb),
-	OPC(6, OPC_STIB,         stib),
-	OPC(6, OPC_LDC,          ldc),
-	OPC(6, OPC_LDLV,         ldlv),
-
-	OPC(7, OPC_BAR,          bar),
-	OPC(7, OPC_FENCE,        fence),
-
-#undef OPC
-};
-
-#define GETINFO(instr) (&(opcs[((instr)->opc_cat << NOPC_BITS) | instr_opc(instr)]))
-
-// XXX hack.. probably should move this table somewhere common:
-#include "ir3.h"
-const char *ir3_instr_name(struct ir3_instruction *instr)
-{
-	if (opc_cat(instr->opc) == -1) return "??meta??";
-	return opcs[instr->opc].name;
-}
-
-static bool print_instr(struct disasm_ctx *ctx, uint32_t *dwords, int n)
-{
-	instr_t *instr = (instr_t *)dwords;
-	uint32_t opc = instr_opc(instr);
-	const char *name;
-
-	if (debug & PRINT_VERBOSE)
-		fprintf(ctx->out, "%s%04d[%08xx_%08xx] ", levels[ctx->level], n, dwords[1], dwords[0]);
-
-	/* NOTE: order flags are printed is a bit fugly.. but for now I
-	 * try to match the order in llvm-a3xx disassembler for easy
-	 * diff'ing..
-	 */
-
-	ctx->repeat = instr_repeat(instr);
-
-	if (instr->sync)
-		fprintf(ctx->out, "(sy)");
-	if (instr->ss && ((instr->opc_cat <= 4) || (instr->opc_cat == 7)))
-		fprintf(ctx->out, "(ss)");
-	if (instr->jmp_tgt)
-		fprintf(ctx->out, "(jp)");
-	if (instr_sat(instr))
-		fprintf(ctx->out, "(sat)");
-	if (ctx->repeat)
-		fprintf(ctx->out, "(rpt%d)", ctx->repeat);
-	if (instr->ul && ((2 <= instr->opc_cat) && (instr->opc_cat <= 4)))
-		fprintf(ctx->out, "(ul)");
-
-	name = GETINFO(instr)->name;
-
-	if (name) {
-		fprintf(ctx->out, "%s", name);
-		GETINFO(instr)->print(ctx, instr);
-	} else {
-		fprintf(ctx->out, "unknown(%d,%d)", instr->opc_cat, opc);
-	}
-
-	fprintf(ctx->out, "\n");
-
-	return (instr->opc_cat == 0) && (opc == OPC_END);
-}
-
-int disasm_a3xx(uint32_t *dwords, int sizedwords, int level, FILE *out)
-{
-	struct disasm_ctx ctx;
-	int i;
-
-	assert((sizedwords % 2) == 0);
-
-	memset(&ctx, 0, sizeof(ctx));
-	ctx.out = out;
-	ctx.level = level;
-
-	for (i = 0; i < sizedwords; i += 2)
-		print_instr(&ctx, &dwords[i], i/2);
-
-	return 0;
-}
diff --git a/src/gallium/drivers/freedreno/ir3/instr-a3xx.h b/src/gallium/drivers/freedreno/ir3/instr-a3xx.h
deleted file mode 100644
index 7f60ee5fd4c..00000000000
--- a/src/gallium/drivers/freedreno/ir3/instr-a3xx.h
+++ /dev/null
@@ -1,872 +0,0 @@
-/*
- * Copyright (c) 2013 Rob Clark <[email protected]>
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef INSTR_A3XX_H_
-#define INSTR_A3XX_H_
-
-#define PACKED __attribute__((__packed__))
-
-#include <stdint.h>
-#include <stdio.h>
-#include <stdbool.h>
-#include <assert.h>
-
-/* size of largest OPC field of all the instruction categories: */
-#define NOPC_BITS 6
-
-#define _OPC(cat, opc)   (((cat) << NOPC_BITS) | opc)
-
-typedef enum {
-	/* category 0: */
-	OPC_NOP             = _OPC(0, 0),
-	OPC_BR              = _OPC(0, 1),
-	OPC_JUMP            = _OPC(0, 2),
-	OPC_CALL            = _OPC(0, 3),
-	OPC_RET             = _OPC(0, 4),
-	OPC_KILL            = _OPC(0, 5),
-	OPC_END             = _OPC(0, 6),
-	OPC_EMIT            = _OPC(0, 7),
-	OPC_CUT             = _OPC(0, 8),
-	OPC_CHMASK          = _OPC(0, 9),
-	OPC_CHSH            = _OPC(0, 10),
-	OPC_FLOW_REV        = _OPC(0, 11),
-
-	/* category 1: */
-	OPC_MOV             = _OPC(1, 0),
-
-	/* category 2: */
-	OPC_ADD_F           = _OPC(2, 0),
-	OPC_MIN_F           = _OPC(2, 1),
-	OPC_MAX_F           = _OPC(2, 2),
-	OPC_MUL_F           = _OPC(2, 3),
-	OPC_SIGN_F          = _OPC(2, 4),
-	OPC_CMPS_F          = _OPC(2, 5),
-	OPC_ABSNEG_F        = _OPC(2, 6),
-	OPC_CMPV_F          = _OPC(2, 7),
-	/* 8 - invalid */
-	OPC_FLOOR_F         = _OPC(2, 9),
-	OPC_CEIL_F          = _OPC(2, 10),
-	OPC_RNDNE_F         = _OPC(2, 11),
-	OPC_RNDAZ_F         = _OPC(2, 12),
-	OPC_TRUNC_F         = _OPC(2, 13),
-	/* 14-15 - invalid */
-	OPC_ADD_U           = _OPC(2, 16),
-	OPC_ADD_S           = _OPC(2, 17),
-	OPC_SUB_U           = _OPC(2, 18),
-	OPC_SUB_S           = _OPC(2, 19),
-	OPC_CMPS_U          = _OPC(2, 20),
-	OPC_CMPS_S          = _OPC(2, 21),
-	OPC_MIN_U           = _OPC(2, 22),
-	OPC_MIN_S           = _OPC(2, 23),
-	OPC_MAX_U           = _OPC(2, 24),
-	OPC_MAX_S           = _OPC(2, 25),
-	OPC_ABSNEG_S        = _OPC(2, 26),
-	/* 27 - invalid */
-	OPC_AND_B           = _OPC(2, 28),
-	OPC_OR_B            = _OPC(2, 29),
-	OPC_NOT_B           = _OPC(2, 30),
-	OPC_XOR_B           = _OPC(2, 31),
-	/* 32 - invalid */
-	OPC_CMPV_U          = _OPC(2, 33),
-	OPC_CMPV_S          = _OPC(2, 34),
-	/* 35-47 - invalid */
-	OPC_MUL_U           = _OPC(2, 48),
-	OPC_MUL_S           = _OPC(2, 49),
-	OPC_MULL_U          = _OPC(2, 50),
-	OPC_BFREV_B         = _OPC(2, 51),
-	OPC_CLZ_S           = _OPC(2, 52),
-	OPC_CLZ_B           = _OPC(2, 53),
-	OPC_SHL_B           = _OPC(2, 54),
-	OPC_SHR_B           = _OPC(2, 55),
-	OPC_ASHR_B          = _OPC(2, 56),
-	OPC_BARY_F          = _OPC(2, 57),
-	OPC_MGEN_B          = _OPC(2, 58),
-	OPC_GETBIT_B        = _OPC(2, 59),
-	OPC_SETRM           = _OPC(2, 60),
-	OPC_CBITS_B         = _OPC(2, 61),
-	OPC_SHB             = _OPC(2, 62),
-	OPC_MSAD            = _OPC(2, 63),
-
-	/* category 3: */
-	OPC_MAD_U16         = _OPC(3, 0),
-	OPC_MADSH_U16       = _OPC(3, 1),
-	OPC_MAD_S16         = _OPC(3, 2),
-	OPC_MADSH_M16       = _OPC(3, 3),   /* should this be .s16? */
-	OPC_MAD_U24         = _OPC(3, 4),
-	OPC_MAD_S24         = _OPC(3, 5),
-	OPC_MAD_F16         = _OPC(3, 6),
-	OPC_MAD_F32         = _OPC(3, 7),
-	OPC_SEL_B16         = _OPC(3, 8),
-	OPC_SEL_B32         = _OPC(3, 9),
-	OPC_SEL_S16         = _OPC(3, 10),
-	OPC_SEL_S32         = _OPC(3, 11),
-	OPC_SEL_F16         = _OPC(3, 12),
-	OPC_SEL_F32         = _OPC(3, 13),
-	OPC_SAD_S16         = _OPC(3, 14),
-	OPC_SAD_S32         = _OPC(3, 15),
-
-	/* category 4: */
-	OPC_RCP             = _OPC(4, 0),
-	OPC_RSQ             = _OPC(4, 1),
-	OPC_LOG2            = _OPC(4, 2),
-	OPC_EXP2            = _OPC(4, 3),
-	OPC_SIN             = _OPC(4, 4),
-	OPC_COS             = _OPC(4, 5),
-	OPC_SQRT            = _OPC(4, 6),
-	// 7-63 - invalid
-
-	/* category 5: */
-	OPC_ISAM            = _OPC(5, 0),
-	OPC_ISAML           = _OPC(5, 1),
-	OPC_ISAMM           = _OPC(5, 2),
-	OPC_SAM             = _OPC(5, 3),
-	OPC_SAMB            = _OPC(5, 4),
-	OPC_SAML            = _OPC(5, 5),
-	OPC_SAMGQ           = _OPC(5, 6),
-	OPC_GETLOD          = _OPC(5, 7),
-	OPC_CONV            = _OPC(5, 8),
-	OPC_CONVM           = _OPC(5, 9),
-	OPC_GETSIZE         = _OPC(5, 10),
-	OPC_GETBUF          = _OPC(5, 11),
-	OPC_GETPOS          = _OPC(5, 12),
-	OPC_GETINFO         = _OPC(5, 13),
-	OPC_DSX             = _OPC(5, 14),
-	OPC_DSY             = _OPC(5, 15),
-	OPC_GATHER4R        = _OPC(5, 16),
-	OPC_GATHER4G        = _OPC(5, 17),
-	OPC_GATHER4B        = _OPC(5, 18),
-	OPC_GATHER4A        = _OPC(5, 19),
-	OPC_SAMGP0          = _OPC(5, 20),
-	OPC_SAMGP1          = _OPC(5, 21),
-	OPC_SAMGP2          = _OPC(5, 22),
-	OPC_SAMGP3          = _OPC(5, 23),
-	OPC_DSXPP_1         = _OPC(5, 24),
-	OPC_DSYPP_1         = _OPC(5, 25),
-	OPC_RGETPOS         = _OPC(5, 26),
-	OPC_RGETINFO        = _OPC(5, 27),
-
-	/* category 6: */
-	OPC_LDG             = _OPC(6, 0),        /* load-global */
-	OPC_LDL             = _OPC(6, 1),
-	OPC_LDP             = _OPC(6, 2),
-	OPC_STG             = _OPC(6, 3),        /* store-global */
-	OPC_STL             = _OPC(6, 4),
-	OPC_STP             = _OPC(6, 5),
-	OPC_STI             = _OPC(6, 6),
-	OPC_G2L             = _OPC(6, 7),
-	OPC_L2G             = _OPC(6, 8),
-	OPC_PREFETCH        = _OPC(6, 9),
-	OPC_LDLW            = _OPC(6, 10),
-	OPC_STLW            = _OPC(6, 11),
-	OPC_RESFMT          = _OPC(6, 14),
-	OPC_RESINFO         = _OPC(6, 15),
-	OPC_ATOMIC_ADD      = _OPC(6, 16),
-	OPC_ATOMIC_SUB      = _OPC(6, 17),
-	OPC_ATOMIC_XCHG     = _OPC(6, 18),
-	OPC_ATOMIC_INC      = _OPC(6, 19),
-	OPC_ATOMIC_DEC      = _OPC(6, 20),
-	OPC_ATOMIC_CMPXCHG  = _OPC(6, 21),
-	OPC_ATOMIC_MIN      = _OPC(6, 22),
-	OPC_ATOMIC_MAX      = _OPC(6, 23),
-	OPC_ATOMIC_AND      = _OPC(6, 24),
-	OPC_ATOMIC_OR       = _OPC(6, 25),
-	OPC_ATOMIC_XOR      = _OPC(6, 26),
-	OPC_LDGB            = _OPC(6, 27),
-	OPC_STGB            = _OPC(6, 28),
-	OPC_STIB            = _OPC(6, 29),
-	OPC_LDC             = _OPC(6, 30),
-	OPC_LDLV            = _OPC(6, 31),
-
-	/* category 7: */
-	OPC_BAR             = _OPC(7, 0),
-	OPC_FENCE           = _OPC(7, 1),
-
-	/* meta instructions (category -1): */
-	/* placeholder instr to mark shader inputs: */
-	OPC_META_INPUT      = _OPC(-1, 0),
-	/* The "fan-in" and "fan-out" instructions are used for keeping
-	 * track of instructions that write to multiple dst registers
-	 * (fan-out) like texture sample instructions, or read multiple
-	 * consecutive scalar registers (fan-in) (bary.f, texture samp)
-	 */
-	OPC_META_FO         = _OPC(-1, 2),
-	OPC_META_FI         = _OPC(-1, 3),
-
-} opc_t;
-
-#define opc_cat(opc) ((int)((opc) >> NOPC_BITS))
-#define opc_op(opc)  ((unsigned)((opc) & ((1 << NOPC_BITS) - 1)))
-
-typedef enum {
-	TYPE_F16 = 0,
-	TYPE_F32 = 1,
-	TYPE_U16 = 2,
-	TYPE_U32 = 3,
-	TYPE_S16 = 4,
-	TYPE_S32 = 5,
-	TYPE_U8  = 6,
-	TYPE_S8  = 7,  // XXX I assume?
-} type_t;
-
-static inline uint32_t type_size(type_t type)
-{
-	switch (type) {
-	case TYPE_F32:
-	case TYPE_U32:
-	case TYPE_S32:
-		return 32;
-	case TYPE_F16:
-	case TYPE_U16:
-	case TYPE_S16:
-		return 16;
-	case TYPE_U8:
-	case TYPE_S8:
-		return 8;
-	default:
-		assert(0); /* invalid type */
-		return 0;
-	}
-}
-
-static inline int type_float(type_t type)
-{
-	return (type == TYPE_F32) || (type == TYPE_F16);
-}
-
-static inline int type_uint(type_t type)
-{
-	return (type == TYPE_U32) || (type == TYPE_U16) || (type == TYPE_U8);
-}
-
-static inline int type_sint(type_t type)
-{
-	return (type == TYPE_S32) || (type == TYPE_S16) || (type == TYPE_S8);
-}
-
-typedef union PACKED {
-	/* normal gpr or const src register: */
-	struct PACKED {
-		uint32_t comp  : 2;
-		uint32_t num   : 10;
-	};
-	/* for immediate val: */
-	int32_t  iim_val   : 11;
-	/* to make compiler happy: */
-	uint32_t dummy32;
-	uint32_t dummy10   : 10;
-	int32_t  idummy10  : 10;
-	uint32_t dummy11   : 11;
-	uint32_t dummy12   : 12;
-	uint32_t dummy13   : 13;
-	uint32_t dummy8    : 8;
-} reg_t;
-
-/* special registers: */
-#define REG_A0 61       /* address register */
-#define REG_P0 62       /* predicate register */
-
-static inline int reg_special(reg_t reg)
-{
-	return (reg.num == REG_A0) || (reg.num == REG_P0);
-}
-
-typedef struct PACKED {
-	/* dword0: */
-	union PACKED {
-		struct PACKED {
-			int16_t  immed    : 16;
-			uint32_t dummy1   : 16;
-		} a3xx;
-		struct PACKED {
-			int32_t  immed    : 20;
-			uint32_t dummy1   : 12;
-		} a4xx;
-		struct PACKED {
-			int32_t immed     : 32;
-		} a5xx;
-	};
-
-	/* dword1: */
-	uint32_t dummy2   : 8;
-	uint32_t repeat   : 3;
-	uint32_t dummy3   : 1;
-	uint32_t ss       : 1;
-	uint32_t dummy4   : 7;
-	uint32_t inv      : 1;
-	uint32_t comp     : 2;
-	uint32_t opc      : 4;
-	uint32_t jmp_tgt  : 1;
-	uint32_t sync     : 1;
-	uint32_t opc_cat  : 3;
-} instr_cat0_t;
-
-typedef struct PACKED {
-	/* dword0: */
-	union PACKED {
-		/* for normal src register: */
-		struct PACKED {
-			uint32_t src : 11;
-			/* at least low bit of pad must be zero or it will
-			 * look like a address relative src
-			 */
-			uint32_t pad : 21;
-		};
-		/* for address relative: */
-		struct PACKED {
-			int32_t  off : 10;
-			uint32_t src_rel_c : 1;
-			uint32_t src_rel : 1;
-			uint32_t unknown : 20;
-		};
-		/* for immediate: */
-		int32_t  iim_val;
-		uint32_t uim_val;
-		float    fim_val;
-	};
-
-	/* dword1: */
-	uint32_t dst        : 8;
-	uint32_t repeat     : 3;
-	uint32_t src_r      : 1;
-	uint32_t ss         : 1;
-	uint32_t ul         : 1;
-	uint32_t dst_type   : 3;
-	uint32_t dst_rel    : 1;
-	uint32_t src_type   : 3;
-	uint32_t src_c      : 1;
-	uint32_t src_im     : 1;
-	uint32_t even       : 1;
-	uint32_t pos_inf    : 1;
-	uint32_t must_be_0  : 2;
-	uint32_t jmp_tgt    : 1;
-	uint32_t sync       : 1;
-	uint32_t opc_cat    : 3;
-} instr_cat1_t;
-
-typedef struct PACKED {
-	/* dword0: */
-	union PACKED {
-		struct PACKED {
-			uint32_t src1         : 11;
-			uint32_t must_be_zero1: 2;
-			uint32_t src1_im      : 1;   /* immediate */
-			uint32_t src1_neg     : 1;   /* negate */
-			uint32_t src1_abs     : 1;   /* absolute value */
-		};
-		struct PACKED {
-			uint32_t src1         : 10;
-			uint32_t src1_c       : 1;   /* relative-const */
-			uint32_t src1_rel     : 1;   /* relative address */
-			uint32_t must_be_zero : 1;
-			uint32_t dummy        : 3;
-		} rel1;
-		struct PACKED {
-			uint32_t src1         : 12;
-			uint32_t src1_c       : 1;   /* const */
-			uint32_t dummy        : 3;
-		} c1;
-	};
-
-	union PACKED {
-		struct PACKED {
-			uint32_t src2         : 11;
-			uint32_t must_be_zero2: 2;
-			uint32_t src2_im      : 1;   /* immediate */
-			uint32_t src2_neg     : 1;   /* negate */
-			uint32_t src2_abs     : 1;   /* absolute value */
-		};
-		struct PACKED {
-			uint32_t src2         : 10;
-			uint32_t src2_c       : 1;   /* relative-const */
-			uint32_t src2_rel     : 1;   /* relative address */
-			uint32_t must_be_zero : 1;
-			uint32_t dummy        : 3;
-		} rel2;
-		struct PACKED {
-			uint32_t src2         : 12;
-			uint32_t src2_c       : 1;   /* const */
-			uint32_t dummy        : 3;
-		} c2;
-	};
-
-	/* dword1: */
-	uint32_t dst      : 8;
-	uint32_t repeat   : 2;
-	uint32_t sat      : 1;
-	uint32_t src1_r   : 1;
-	uint32_t ss       : 1;
-	uint32_t ul       : 1;   /* dunno */
-	uint32_t dst_half : 1;   /* or widen/narrow.. ie. dst hrN <-> rN */
-	uint32_t ei       : 1;
-	uint32_t cond     : 3;
-	uint32_t src2_r   : 1;
-	uint32_t full     : 1;   /* not half */
-	uint32_t opc      : 6;
-	uint32_t jmp_tgt  : 1;
-	uint32_t sync     : 1;
-	uint32_t opc_cat  : 3;
-} instr_cat2_t;
-
-typedef struct PACKED {
-	/* dword0: */
-	union PACKED {
-		struct PACKED {
-			uint32_t src1         : 11;
-			uint32_t must_be_zero1: 2;
-			uint32_t src2_c       : 1;
-			uint32_t src1_neg     : 1;
-			uint32_t src2_r       : 1;
-		};
-		struct PACKED {
-			uint32_t src1         : 10;
-			uint32_t src1_c       : 1;
-			uint32_t src1_rel     : 1;
-			uint32_t must_be_zero : 1;
-			uint32_t dummy        : 3;
-		} rel1;
-		struct PACKED {
-			uint32_t src1         : 12;
-			uint32_t src1_c       : 1;
-			uint32_t dummy        : 3;
-		} c1;
-	};
-
-	union PACKED {
-		struct PACKED {
-			uint32_t src3         : 11;
-			uint32_t must_be_zero2: 2;
-			uint32_t src3_r       : 1;
-			uint32_t src2_neg     : 1;
-			uint32_t src3_neg     : 1;
-		};
-		struct PACKED {
-			uint32_t src3         : 10;
-			uint32_t src3_c       : 1;
-			uint32_t src3_rel     : 1;
-			uint32_t must_be_zero : 1;
-			uint32_t dummy        : 3;
-		} rel2;
-		struct PACKED {
-			uint32_t src3         : 12;
-			uint32_t src3_c       : 1;
-			uint32_t dummy        : 3;
-		} c2;
-	};
-
-	/* dword1: */
-	uint32_t dst      : 8;
-	uint32_t repeat   : 2;
-	uint32_t sat      : 1;
-	uint32_t src1_r   : 1;
-	uint32_t ss       : 1;
-	uint32_t ul       : 1;
-	uint32_t dst_half : 1;   /* or widen/narrow.. ie. dst hrN <-> rN */
-	uint32_t src2     : 8;
-	uint32_t opc      : 4;
-	uint32_t jmp_tgt  : 1;
-	uint32_t sync     : 1;
-	uint32_t opc_cat  : 3;
-} instr_cat3_t;
-
-static inline bool instr_cat3_full(instr_cat3_t *cat3)
-{
-	switch (_OPC(3, cat3->opc)) {
-	case OPC_MAD_F16:
-	case OPC_MAD_U16:
-	case OPC_MAD_S16:
-	case OPC_SEL_B16:
-	case OPC_SEL_S16:
-	case OPC_SEL_F16:
-	case OPC_SAD_S16:
-	case OPC_SAD_S32:  // really??
-		return false;
-	default:
-		return true;
-	}
-}
-
-typedef struct PACKED {
-	/* dword0: */
-	union PACKED {
-		struct PACKED {
-			uint32_t src          : 11;
-			uint32_t must_be_zero1: 2;
-			uint32_t src_im       : 1;   /* immediate */
-			uint32_t src_neg      : 1;   /* negate */
-			uint32_t src_abs      : 1;   /* absolute value */
-		};
-		struct PACKED {
-			uint32_t src          : 10;
-			uint32_t src_c        : 1;   /* relative-const */
-			uint32_t src_rel      : 1;   /* relative address */
-			uint32_t must_be_zero : 1;
-			uint32_t dummy        : 3;
-		} rel;
-		struct PACKED {
-			uint32_t src          : 12;
-			uint32_t src_c        : 1;   /* const */
-			uint32_t dummy        : 3;
-		} c;
-	};
-	uint32_t dummy1   : 16;  /* seem to be ignored */
-
-	/* dword1: */
-	uint32_t dst      : 8;
-	uint32_t repeat   : 2;
-	uint32_t sat      : 1;
-	uint32_t src_r    : 1;
-	uint32_t ss       : 1;
-	uint32_t ul       : 1;
-	uint32_t dst_half : 1;   /* or widen/narrow.. ie. dst hrN <-> rN */
-	uint32_t dummy2   : 5;   /* seem to be ignored */
-	uint32_t full     : 1;   /* not half */
-	uint32_t opc      : 6;
-	uint32_t jmp_tgt  : 1;
-	uint32_t sync     : 1;
-	uint32_t opc_cat  : 3;
-} instr_cat4_t;
-
-typedef struct PACKED {
-	/* dword0: */
-	union PACKED {
-		/* normal case: */
-		struct PACKED {
-			uint32_t full     : 1;   /* not half */
-			uint32_t src1     : 8;
-			uint32_t src2     : 8;
-			uint32_t dummy1   : 4;   /* seem to be ignored */
-			uint32_t samp     : 4;
-			uint32_t tex      : 7;
-		} norm;
-		/* s2en case: */
-		struct PACKED {
-			uint32_t full     : 1;   /* not half */
-			uint32_t src1     : 8;
-			uint32_t src2     : 11;
-			uint32_t dummy1   : 1;
-			uint32_t src3     : 8;
-			uint32_t dummy2   : 3;
-		} s2en;
-		/* same in either case: */
-		// XXX I think, confirm this
-		struct PACKED {
-			uint32_t full     : 1;   /* not half */
-			uint32_t src1     : 8;
-			uint32_t pad      : 23;
-		};
-	};
-
-	/* dword1: */
-	uint32_t dst      : 8;
-	uint32_t wrmask   : 4;   /* write-mask */
-	uint32_t type     : 3;
-	uint32_t dummy2   : 1;   /* seems to be ignored */
-	uint32_t is_3d    : 1;
-
-	uint32_t is_a     : 1;
-	uint32_t is_s     : 1;
-	uint32_t is_s2en  : 1;
-	uint32_t is_o     : 1;
-	uint32_t is_p     : 1;
-
-	uint32_t opc      : 5;
-	uint32_t jmp_tgt  : 1;
-	uint32_t sync     : 1;
-	uint32_t opc_cat  : 3;
-} instr_cat5_t;
-
-/* dword0 encoding for src_off: [src1 + off], src2: */
-typedef struct PACKED {
-	/* dword0: */
-	uint32_t mustbe1  : 1;
-	int32_t  off      : 13;
-	uint32_t src1     : 8;
-	uint32_t src1_im  : 1;
-	uint32_t src2_im  : 1;
-	uint32_t src2     : 8;
-
-	/* dword1: */
-	uint32_t dword1;
-} instr_cat6a_t;
-
-/* dword0 encoding for !src_off: [src1], src2 */
-typedef struct PACKED {
-	/* dword0: */
-	uint32_t mustbe0  : 1;
-	uint32_t src1     : 13;
-	uint32_t ignore0  : 8;
-	uint32_t src1_im  : 1;
-	uint32_t src2_im  : 1;
-	uint32_t src2     : 8;
-
-	/* dword1: */
-	uint32_t dword1;
-} instr_cat6b_t;
-
-/* dword1 encoding for dst_off: */
-typedef struct PACKED {
-	/* dword0: */
-	uint32_t dword0;
-
-	/* note: there is some weird stuff going on where sometimes
-	 * cat6->a.off is involved.. but that seems like a bug in
-	 * the blob, since it is used even if !cat6->src_off
-	 * It would make sense for there to be some more bits to
-	 * bring us to 11 bits worth of offset, but not sure..
-	 */
-	int32_t off       : 8;
-	uint32_t mustbe1  : 1;
-	uint32_t dst      : 8;
-	uint32_t pad1     : 15;
-} instr_cat6c_t;
-
-/* dword1 encoding for !dst_off: */
-typedef struct PACKED {
-	/* dword0: */
-	uint32_t dword0;
-
-	uint32_t dst      : 8;
-	uint32_t mustbe0  : 1;
-	uint32_t idx      : 8;
-	uint32_t pad0     : 15;
-} instr_cat6d_t;
-
-/* ldgb and atomics..
- *
- * ldgb:      pad0=0, pad3=1
- * atomic .g: pad0=1, pad3=1
- *        .l: pad0=1, pad3=0
- */
-typedef struct PACKED {
-	/* dword0: */
-	uint32_t pad0     : 1;
-	uint32_t src3     : 8;
-	uint32_t d        : 2;
-	uint32_t typed    : 1;
-	uint32_t type_size : 2;
-	uint32_t src1     : 8;
-	uint32_t src1_im  : 1;
-	uint32_t src2_im  : 1;
-	uint32_t src2     : 8;
-
-	/* dword1: */
-	uint32_t dst      : 8;
-	uint32_t mustbe0  : 1;
-	uint32_t src_ssbo : 8;
-	uint32_t pad2     : 3;  // type
-	uint32_t g        : 1;
-	uint32_t pad3     : 1;
-	uint32_t pad4     : 10; // opc/jmp_tgt/sync/opc_cat
-} instr_cat6ldgb_t;
-
-/* stgb, pad0=0, pad3=2
- */
-typedef struct PACKED {
-	/* dword0: */
-	uint32_t mustbe1  : 1;  // ???
-	uint32_t src1     : 8;
-	uint32_t d        : 2;
-	uint32_t typed    : 1;
-	uint32_t type_size : 2;
-	uint32_t pad0     : 9;
-	uint32_t src2_im  : 1;
-	uint32_t src2     : 8;
-
-	/* dword1: */
-	uint32_t src3     : 8;
-	uint32_t src3_im  : 1;
-	uint32_t dst_ssbo : 8;
-	uint32_t pad2     : 3;  // type
-	uint32_t pad3     : 2;
-	uint32_t pad4     : 10; // opc/jmp_tgt/sync/opc_cat
-} instr_cat6stgb_t;
-
-typedef union PACKED {
-	instr_cat6a_t a;
-	instr_cat6b_t b;
-	instr_cat6c_t c;
-	instr_cat6d_t d;
-	instr_cat6ldgb_t ldgb;
-	instr_cat6stgb_t stgb;
-	struct PACKED {
-		/* dword0: */
-		uint32_t src_off  : 1;
-		uint32_t pad1     : 31;
-
-		/* dword1: */
-		uint32_t pad2     : 8;
-		uint32_t dst_off  : 1;
-		uint32_t pad3     : 8;
-		uint32_t type     : 3;
-		uint32_t g        : 1;  /* or in some cases it means dst immed */
-		uint32_t pad4     : 1;
-		uint32_t opc      : 5;
-		uint32_t jmp_tgt  : 1;
-		uint32_t sync     : 1;
-		uint32_t opc_cat  : 3;
-	};
-} instr_cat6_t;
-
-typedef struct PACKED {
-	/* dword0: */
-	uint32_t pad1     : 32;
-
-	/* dword1: */
-	uint32_t pad2     : 12;
-	uint32_t ss       : 1;  /* maybe in the encoding, but blob only uses (sy) */
-	uint32_t pad3     : 6;
-	uint32_t w        : 1;  /* write */
-	uint32_t r        : 1;  /* read */
-	uint32_t l        : 1;  /* local */
-	uint32_t g        : 1;  /* global */
-	uint32_t opc      : 4;  /* presumed, but only a couple known OPCs */
-	uint32_t jmp_tgt  : 1;  /* (jp) */
-	uint32_t sync     : 1;  /* (sy) */
-	uint32_t opc_cat  : 3;
-} instr_cat7_t;
-
-typedef union PACKED {
-	instr_cat0_t cat0;
-	instr_cat1_t cat1;
-	instr_cat2_t cat2;
-	instr_cat3_t cat3;
-	instr_cat4_t cat4;
-	instr_cat5_t cat5;
-	instr_cat6_t cat6;
-	instr_cat7_t cat7;
-	struct PACKED {
-		/* dword0: */
-		uint32_t pad1     : 32;
-
-		/* dword1: */
-		uint32_t pad2     : 12;
-		uint32_t ss       : 1;  /* cat1-cat4 (cat0??) and cat7 (?) */
-		uint32_t ul       : 1;  /* cat2-cat4 (and cat1 in blob.. which may be bug??) */
-		uint32_t pad3     : 13;
-		uint32_t jmp_tgt  : 1;
-		uint32_t sync     : 1;
-		uint32_t opc_cat  : 3;
-
-	};
-} instr_t;
-
-static inline uint32_t instr_repeat(instr_t *instr)
-{
-	switch (instr->opc_cat) {
-	case 0:  return instr->cat0.repeat;
-	case 1:  return instr->cat1.repeat;
-	case 2:  return instr->cat2.repeat;
-	case 3:  return instr->cat3.repeat;
-	case 4:  return instr->cat4.repeat;
-	default: return 0;
-	}
-}
-
-static inline bool instr_sat(instr_t *instr)
-{
-	switch (instr->opc_cat) {
-	case 2:  return instr->cat2.sat;
-	case 3:  return instr->cat3.sat;
-	case 4:  return instr->cat4.sat;
-	default: return false;
-	}
-}
-
-static inline uint32_t instr_opc(instr_t *instr)
-{
-	switch (instr->opc_cat) {
-	case 0:  return instr->cat0.opc;
-	case 1:  return 0;
-	case 2:  return instr->cat2.opc;
-	case 3:  return instr->cat3.opc;
-	case 4:  return instr->cat4.opc;
-	case 5:  return instr->cat5.opc;
-	case 6:  return instr->cat6.opc;
-	case 7:  return instr->cat7.opc;
-	default: return 0;
-	}
-}
-
-static inline bool is_mad(opc_t opc)
-{
-	switch (opc) {
-	case OPC_MAD_U16:
-	case OPC_MAD_S16:
-	case OPC_MAD_U24:
-	case OPC_MAD_S24:
-	case OPC_MAD_F16:
-	case OPC_MAD_F32:
-		return true;
-	default:
-		return false;
-	}
-}
-
-static inline bool is_madsh(opc_t opc)
-{
-	switch (opc) {
-	case OPC_MADSH_U16:
-	case OPC_MADSH_M16:
-		return true;
-	default:
-		return false;
-	}
-}
-
-static inline bool is_atomic(opc_t opc)
-{
-	switch (opc) {
-	case OPC_ATOMIC_ADD:
-	case OPC_ATOMIC_SUB:
-	case OPC_ATOMIC_XCHG:
-	case OPC_ATOMIC_INC:
-	case OPC_ATOMIC_DEC:
-	case OPC_ATOMIC_CMPXCHG:
-	case OPC_ATOMIC_MIN:
-	case OPC_ATOMIC_MAX:
-	case OPC_ATOMIC_AND:
-	case OPC_ATOMIC_OR:
-	case OPC_ATOMIC_XOR:
-		return true;
-	default:
-		return false;
-	}
-}
-
-static inline bool is_ssbo(opc_t opc)
-{
-	switch (opc) {
-	case OPC_RESFMT:
-	case OPC_RESINFO:
-	case OPC_LDGB:
-	case OPC_STGB:
-	case OPC_STIB:
-		return true;
-	default:
-		return false;
-	}
-}
-
-int disasm_a3xx(uint32_t *dwords, int sizedwords, int level, FILE *out);
-
-#endif /* INSTR_A3XX_H_ */
diff --git a/src/gallium/drivers/freedreno/ir3/ir3.c b/src/gallium/drivers/freedreno/ir3/ir3.c
deleted file mode 100644
index 3d1c4449b12..00000000000
--- a/src/gallium/drivers/freedreno/ir3/ir3.c
+++ /dev/null
@@ -1,941 +0,0 @@
-/*
- * Copyright (c) 2012 Rob Clark <[email protected]>
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "ir3.h"
-
-#include <stdlib.h>
-#include <stdio.h>
-#include <string.h>
-#include <assert.h>
-#include <stdbool.h>
-#include <errno.h>
-
-#include "util/bitscan.h"
-#include "util/ralloc.h"
-#include "util/u_math.h"
-
-#include "instr-a3xx.h"
-
-/* simple allocator to carve allocations out of an up-front allocated heap,
- * so that we can free everything easily in one shot.
- */
-void * ir3_alloc(struct ir3 *shader, int sz)
-{
-	return rzalloc_size(shader, sz); /* TODO: don't use rzalloc */
-}
-
-struct ir3 * ir3_create(struct ir3_compiler *compiler,
-		unsigned nin, unsigned nout)
-{
-	struct ir3 *shader = rzalloc(compiler, struct ir3);
-
-	shader->compiler = compiler;
-	shader->ninputs = nin;
-	shader->inputs = ir3_alloc(shader, sizeof(shader->inputs[0]) * nin);
-
-	shader->noutputs = nout;
-	shader->outputs = ir3_alloc(shader, sizeof(shader->outputs[0]) * nout);
-
-	list_inithead(&shader->block_list);
-	list_inithead(&shader->array_list);
-
-	return shader;
-}
-
-void ir3_destroy(struct ir3 *shader)
-{
-	ralloc_free(shader);
-}
-
-#define iassert(cond) do { \
-	if (!(cond)) { \
-		debug_assert(cond); \
-		return -1; \
-	} } while (0)
-
-#define iassert_type(reg, full) do { \
-	if ((full)) { \
-		iassert(!((reg)->flags & IR3_REG_HALF)); \
-	} else { \
-		iassert((reg)->flags & IR3_REG_HALF); \
-	} } while (0);
-
-static uint32_t reg(struct ir3_register *reg, struct ir3_info *info,
-		uint32_t repeat, uint32_t valid_flags)
-{
-	reg_t val = { .dummy32 = 0 };
-
-	if (reg->flags & ~valid_flags) {
-		debug_printf("INVALID FLAGS: %x vs %x\n",
-				reg->flags, valid_flags);
-	}
-
-	if (!(reg->flags & IR3_REG_R))
-		repeat = 0;
-
-	if (reg->flags & IR3_REG_IMMED) {
-		val.iim_val = reg->iim_val;
-	} else {
-		unsigned components;
-		int16_t max;
-
-		if (reg->flags & IR3_REG_RELATIV) {
-			components = reg->size;
-			val.idummy10 = reg->array.offset;
-			max = (reg->array.offset + repeat + components - 1) >> 2;
-		} else {
-			components = util_last_bit(reg->wrmask);
-			val.comp = reg->num & 0x3;
-			val.num  = reg->num >> 2;
-			max = (reg->num + repeat + components - 1) >> 2;
-		}
-
-		if (reg->flags & IR3_REG_CONST) {
-			info->max_const = MAX2(info->max_const, max);
-		} else if (val.num == 63) {
-			/* ignore writes to dummy register r63.x */
-		} else if (max < 48) {
-			if (reg->flags & IR3_REG_HALF) {
-				if (info->gpu_id >= 600) {
-					/* starting w/ a6xx, half regs conflict with full regs: */
-					info->max_reg = MAX2(info->max_reg, (max+1)/2);
-				} else {
-					info->max_half_reg = MAX2(info->max_half_reg, max);
-				}
-			} else {
-				info->max_reg = MAX2(info->max_reg, max);
-			}
-		}
-	}
-
-	return val.dummy32;
-}
-
-static int emit_cat0(struct ir3_instruction *instr, void *ptr,
-		struct ir3_info *info)
-{
-	instr_cat0_t *cat0 = ptr;
-
-	if (info->gpu_id >= 500) {
-		cat0->a5xx.immed = instr->cat0.immed;
-	} else if (info->gpu_id >= 400) {
-		cat0->a4xx.immed = instr->cat0.immed;
-	} else {
-		cat0->a3xx.immed = instr->cat0.immed;
-	}
-	cat0->repeat   = instr->repeat;
-	cat0->ss       = !!(instr->flags & IR3_INSTR_SS);
-	cat0->inv      = instr->cat0.inv;
-	cat0->comp     = instr->cat0.comp;
-	cat0->opc      = instr->opc;
-	cat0->jmp_tgt  = !!(instr->flags & IR3_INSTR_JP);
-	cat0->sync     = !!(instr->flags & IR3_INSTR_SY);
-	cat0->opc_cat  = 0;
-
-	return 0;
-}
-
-static int emit_cat1(struct ir3_instruction *instr, void *ptr,
-		struct ir3_info *info)
-{
-	struct ir3_register *dst = instr->regs[0];
-	struct ir3_register *src = instr->regs[1];
-	instr_cat1_t *cat1 = ptr;
-
-	iassert(instr->regs_count == 2);
-	iassert_type(dst, type_size(instr->cat1.dst_type) == 32);
-	if (!(src->flags & IR3_REG_IMMED))
-		iassert_type(src, type_size(instr->cat1.src_type) == 32);
-
-	if (src->flags & IR3_REG_IMMED) {
-		cat1->iim_val = src->iim_val;
-		cat1->src_im  = 1;
-	} else if (src->flags & IR3_REG_RELATIV) {
-		cat1->off       = reg(src, info, instr->repeat,
-				IR3_REG_R | IR3_REG_CONST | IR3_REG_HALF | IR3_REG_RELATIV);
-		cat1->src_rel   = 1;
-		cat1->src_rel_c = !!(src->flags & IR3_REG_CONST);
-	} else {
-		cat1->src  = reg(src, info, instr->repeat,
-				IR3_REG_R | IR3_REG_CONST | IR3_REG_HALF);
-		cat1->src_c     = !!(src->flags & IR3_REG_CONST);
-	}
-
-	cat1->dst      = reg(dst, info, instr->repeat,
-			IR3_REG_RELATIV | IR3_REG_EVEN |
-			IR3_REG_R | IR3_REG_POS_INF | IR3_REG_HALF);
-	cat1->repeat   = instr->repeat;
-	cat1->src_r    = !!(src->flags & IR3_REG_R);
-	cat1->ss       = !!(instr->flags & IR3_INSTR_SS);
-	cat1->ul       = !!(instr->flags & IR3_INSTR_UL);
-	cat1->dst_type = instr->cat1.dst_type;
-	cat1->dst_rel  = !!(dst->flags & IR3_REG_RELATIV);
-	cat1->src_type = instr->cat1.src_type;
-	cat1->even     = !!(dst->flags & IR3_REG_EVEN);
-	cat1->pos_inf  = !!(dst->flags & IR3_REG_POS_INF);
-	cat1->jmp_tgt  = !!(instr->flags & IR3_INSTR_JP);
-	cat1->sync     = !!(instr->flags & IR3_INSTR_SY);
-	cat1->opc_cat  = 1;
-
-	return 0;
-}
-
-static int emit_cat2(struct ir3_instruction *instr, void *ptr,
-		struct ir3_info *info)
-{
-	struct ir3_register *dst = instr->regs[0];
-	struct ir3_register *src1 = instr->regs[1];
-	struct ir3_register *src2 = instr->regs[2];
-	instr_cat2_t *cat2 = ptr;
-	unsigned absneg = ir3_cat2_absneg(instr->opc);
-
-	iassert((instr->regs_count == 2) || (instr->regs_count == 3));
-
-	if (src1->flags & IR3_REG_RELATIV) {
-		iassert(src1->array.offset < (1 << 10));
-		cat2->rel1.src1      = reg(src1, info, instr->repeat,
-				IR3_REG_RELATIV | IR3_REG_CONST | IR3_REG_R |
-				IR3_REG_HALF | absneg);
-		cat2->rel1.src1_c    = !!(src1->flags & IR3_REG_CONST);
-		cat2->rel1.src1_rel  = 1;
-	} else if (src1->flags & IR3_REG_CONST) {
-		iassert(src1->num < (1 << 12));
-		cat2->c1.src1   = reg(src1, info, instr->repeat,
-				IR3_REG_CONST | IR3_REG_R | IR3_REG_HALF);
-		cat2->c1.src1_c = 1;
-	} else {
-		iassert(src1->num < (1 << 11));
-		cat2->src1 = reg(src1, info, instr->repeat,
-				IR3_REG_IMMED | IR3_REG_R | IR3_REG_HALF |
-				absneg);
-	}
-	cat2->src1_im  = !!(src1->flags & IR3_REG_IMMED);
-	cat2->src1_neg = !!(src1->flags & (IR3_REG_FNEG | IR3_REG_SNEG | IR3_REG_BNOT));
-	cat2->src1_abs = !!(src1->flags & (IR3_REG_FABS | IR3_REG_SABS));
-	cat2->src1_r   = !!(src1->flags & IR3_REG_R);
-
-	if (src2) {
-		iassert((src2->flags & IR3_REG_IMMED) ||
-				!((src1->flags ^ src2->flags) & IR3_REG_HALF));
-
-		if (src2->flags & IR3_REG_RELATIV) {
-			iassert(src2->array.offset < (1 << 10));
-			cat2->rel2.src2      = reg(src2, info, instr->repeat,
-					IR3_REG_RELATIV | IR3_REG_CONST | IR3_REG_R |
-					IR3_REG_HALF | absneg);
-			cat2->rel2.src2_c    = !!(src2->flags & IR3_REG_CONST);
-			cat2->rel2.src2_rel  = 1;
-		} else if (src2->flags & IR3_REG_CONST) {
-			iassert(src2->num < (1 << 12));
-			cat2->c2.src2   = reg(src2, info, instr->repeat,
-					IR3_REG_CONST | IR3_REG_R | IR3_REG_HALF);
-			cat2->c2.src2_c = 1;
-		} else {
-			iassert(src2->num < (1 << 11));
-			cat2->src2 = reg(src2, info, instr->repeat,
-					IR3_REG_IMMED | IR3_REG_R | IR3_REG_HALF |
-					absneg);
-		}
-
-		cat2->src2_im  = !!(src2->flags & IR3_REG_IMMED);
-		cat2->src2_neg = !!(src2->flags & (IR3_REG_FNEG | IR3_REG_SNEG | IR3_REG_BNOT));
-		cat2->src2_abs = !!(src2->flags & (IR3_REG_FABS | IR3_REG_SABS));
-		cat2->src2_r   = !!(src2->flags & IR3_REG_R);
-	}
-
-	cat2->dst      = reg(dst, info, instr->repeat,
-			IR3_REG_R | IR3_REG_EI | IR3_REG_HALF);
-	cat2->repeat   = instr->repeat;
-	cat2->sat      = !!(instr->flags & IR3_INSTR_SAT);
-	cat2->ss       = !!(instr->flags & IR3_INSTR_SS);
-	cat2->ul       = !!(instr->flags & IR3_INSTR_UL);
-	cat2->dst_half = !!((src1->flags ^ dst->flags) & IR3_REG_HALF);
-	cat2->ei       = !!(dst->flags & IR3_REG_EI);
-	cat2->cond     = instr->cat2.condition;
-	cat2->full     = ! (src1->flags & IR3_REG_HALF);
-	cat2->opc      = instr->opc;
-	cat2->jmp_tgt  = !!(instr->flags & IR3_INSTR_JP);
-	cat2->sync     = !!(instr->flags & IR3_INSTR_SY);
-	cat2->opc_cat  = 2;
-
-	return 0;
-}
-
-static int emit_cat3(struct ir3_instruction *instr, void *ptr,
-		struct ir3_info *info)
-{
-	struct ir3_register *dst = instr->regs[0];
-	struct ir3_register *src1 = instr->regs[1];
-	struct ir3_register *src2 = instr->regs[2];
-	struct ir3_register *src3 = instr->regs[3];
-	unsigned absneg = ir3_cat3_absneg(instr->opc);
-	instr_cat3_t *cat3 = ptr;
-	uint32_t src_flags = 0;
-
-	switch (instr->opc) {
-	case OPC_MAD_F16:
-	case OPC_MAD_U16:
-	case OPC_MAD_S16:
-	case OPC_SEL_B16:
-	case OPC_SEL_S16:
-	case OPC_SEL_F16:
-	case OPC_SAD_S16:
-	case OPC_SAD_S32:  // really??
-		src_flags |= IR3_REG_HALF;
-		break;
-	default:
-		break;
-	}
-
-	iassert(instr->regs_count == 4);
-	iassert(!((src1->flags ^ src_flags) & IR3_REG_HALF));
-	iassert(!((src2->flags ^ src_flags) & IR3_REG_HALF));
-	iassert(!((src3->flags ^ src_flags) & IR3_REG_HALF));
-
-	if (src1->flags & IR3_REG_RELATIV) {
-		iassert(src1->array.offset < (1 << 10));
-		cat3->rel1.src1      = reg(src1, info, instr->repeat,
-				IR3_REG_RELATIV | IR3_REG_CONST | IR3_REG_R |
-				IR3_REG_HALF | absneg);
-		cat3->rel1.src1_c    = !!(src1->flags & IR3_REG_CONST);
-		cat3->rel1.src1_rel  = 1;
-	} else if (src1->flags & IR3_REG_CONST) {
-		iassert(src1->num < (1 << 12));
-		cat3->c1.src1   = reg(src1, info, instr->repeat,
-				IR3_REG_CONST | IR3_REG_R | IR3_REG_HALF);
-		cat3->c1.src1_c = 1;
-	} else {
-		iassert(src1->num < (1 << 11));
-		cat3->src1 = reg(src1, info, instr->repeat,
-				IR3_REG_R | IR3_REG_HALF | absneg);
-	}
-
-	cat3->src1_neg = !!(src1->flags & (IR3_REG_FNEG | IR3_REG_SNEG | IR3_REG_BNOT));
-	cat3->src1_r   = !!(src1->flags & IR3_REG_R);
-
-	cat3->src2     = reg(src2, info, instr->repeat,
-			IR3_REG_CONST | IR3_REG_R | IR3_REG_HALF | absneg);
-	cat3->src2_c   = !!(src2->flags & IR3_REG_CONST);
-	cat3->src2_neg = !!(src2->flags & (IR3_REG_FNEG | IR3_REG_SNEG | IR3_REG_BNOT));
-	cat3->src2_r   = !!(src2->flags & IR3_REG_R);
-
-
-	if (src3->flags & IR3_REG_RELATIV) {
-		iassert(src3->array.offset < (1 << 10));
-		cat3->rel2.src3      = reg(src3, info, instr->repeat,
-				IR3_REG_RELATIV | IR3_REG_CONST | IR3_REG_R |
-				IR3_REG_HALF | absneg);
-		cat3->rel2.src3_c    = !!(src3->flags & IR3_REG_CONST);
-		cat3->rel2.src3_rel  = 1;
-	} else if (src3->flags & IR3_REG_CONST) {
-		iassert(src3->num < (1 << 12));
-		cat3->c2.src3   = reg(src3, info, instr->repeat,
-				IR3_REG_CONST | IR3_REG_R | IR3_REG_HALF);
-		cat3->c2.src3_c = 1;
-	} else {
-		iassert(src3->num < (1 << 11));
-		cat3->src3 = reg(src3, info, instr->repeat,
-				IR3_REG_R | IR3_REG_HALF | absneg);
-	}
-
-	cat3->src3_neg = !!(src3->flags & (IR3_REG_FNEG | IR3_REG_SNEG | IR3_REG_BNOT));
-	cat3->src3_r   = !!(src3->flags & IR3_REG_R);
-
-	cat3->dst      = reg(dst, info, instr->repeat, IR3_REG_R | IR3_REG_HALF);
-	cat3->repeat   = instr->repeat;
-	cat3->sat      = !!(instr->flags & IR3_INSTR_SAT);
-	cat3->ss       = !!(instr->flags & IR3_INSTR_SS);
-	cat3->ul       = !!(instr->flags & IR3_INSTR_UL);
-	cat3->dst_half = !!((src_flags ^ dst->flags) & IR3_REG_HALF);
-	cat3->opc      = instr->opc;
-	cat3->jmp_tgt  = !!(instr->flags & IR3_INSTR_JP);
-	cat3->sync     = !!(instr->flags & IR3_INSTR_SY);
-	cat3->opc_cat  = 3;
-
-	return 0;
-}
-
-static int emit_cat4(struct ir3_instruction *instr, void *ptr,
-		struct ir3_info *info)
-{
-	struct ir3_register *dst = instr->regs[0];
-	struct ir3_register *src = instr->regs[1];
-	instr_cat4_t *cat4 = ptr;
-
-	iassert(instr->regs_count == 2);
-
-	if (src->flags & IR3_REG_RELATIV) {
-		iassert(src->array.offset < (1 << 10));
-		cat4->rel.src      = reg(src, info, instr->repeat,
-				IR3_REG_RELATIV | IR3_REG_CONST | IR3_REG_FNEG |
-				IR3_REG_FABS | IR3_REG_R | IR3_REG_HALF);
-		cat4->rel.src_c    = !!(src->flags & IR3_REG_CONST);
-		cat4->rel.src_rel  = 1;
-	} else if (src->flags & IR3_REG_CONST) {
-		iassert(src->num < (1 << 12));
-		cat4->c.src   = reg(src, info, instr->repeat,
-				IR3_REG_CONST | IR3_REG_FNEG | IR3_REG_FABS |
-				IR3_REG_R | IR3_REG_HALF);
-		cat4->c.src_c = 1;
-	} else {
-		iassert(src->num < (1 << 11));
-		cat4->src = reg(src, info, instr->repeat,
-				IR3_REG_IMMED | IR3_REG_FNEG | IR3_REG_FABS |
-				IR3_REG_R | IR3_REG_HALF);
-	}
-
-	cat4->src_im   = !!(src->flags & IR3_REG_IMMED);
-	cat4->src_neg  = !!(src->flags & IR3_REG_FNEG);
-	cat4->src_abs  = !!(src->flags & IR3_REG_FABS);
-	cat4->src_r    = !!(src->flags & IR3_REG_R);
-
-	cat4->dst      = reg(dst, info, instr->repeat, IR3_REG_R | IR3_REG_HALF);
-	cat4->repeat   = instr->repeat;
-	cat4->sat      = !!(instr->flags & IR3_INSTR_SAT);
-	cat4->ss       = !!(instr->flags & IR3_INSTR_SS);
-	cat4->ul       = !!(instr->flags & IR3_INSTR_UL);
-	cat4->dst_half = !!((src->flags ^ dst->flags) & IR3_REG_HALF);
-	cat4->full     = ! (src->flags & IR3_REG_HALF);
-	cat4->opc      = instr->opc;
-	cat4->jmp_tgt  = !!(instr->flags & IR3_INSTR_JP);
-	cat4->sync     = !!(instr->flags & IR3_INSTR_SY);
-	cat4->opc_cat  = 4;
-
-	return 0;
-}
-
-static int emit_cat5(struct ir3_instruction *instr, void *ptr,
-		struct ir3_info *info)
-{
-	struct ir3_register *dst = instr->regs[0];
-	struct ir3_register *src1 = instr->regs[1];
-	struct ir3_register *src2 = instr->regs[2];
-	struct ir3_register *src3 = instr->regs[3];
-	instr_cat5_t *cat5 = ptr;
-
-	iassert_type(dst, type_size(instr->cat5.type) == 32)
-
-	assume(src1 || !src2);
-	assume(src2 || !src3);
-
-	if (src1) {
-		cat5->full = ! (src1->flags & IR3_REG_HALF);
-		cat5->src1 = reg(src1, info, instr->repeat, IR3_REG_HALF);
-	}
-
-	if (instr->flags & IR3_INSTR_S2EN) {
-		if (src2) {
-			iassert(!((src1->flags ^ src2->flags) & IR3_REG_HALF));
-			cat5->s2en.src2 = reg(src2, info, instr->repeat, IR3_REG_HALF);
-		}
-		if (src3) {
-			iassert(src3->flags & IR3_REG_HALF);
-			cat5->s2en.src3 = reg(src3, info, instr->repeat, IR3_REG_HALF);
-		}
-		iassert(!(instr->cat5.samp | instr->cat5.tex));
-	} else {
-		iassert(!src3);
-		if (src2) {
-			iassert(!((src1->flags ^ src2->flags) & IR3_REG_HALF));
-			cat5->norm.src2 = reg(src2, info, instr->repeat, IR3_REG_HALF);
-		}
-		cat5->norm.samp = instr->cat5.samp;
-		cat5->norm.tex  = instr->cat5.tex;
-	}
-
-	cat5->dst      = reg(dst, info, instr->repeat, IR3_REG_R | IR3_REG_HALF);
-	cat5->wrmask   = dst->wrmask;
-	cat5->type     = instr->cat5.type;
-	cat5->is_3d    = !!(instr->flags & IR3_INSTR_3D);
-	cat5->is_a     = !!(instr->flags & IR3_INSTR_A);
-	cat5->is_s     = !!(instr->flags & IR3_INSTR_S);
-	cat5->is_s2en  = !!(instr->flags & IR3_INSTR_S2EN);
-	cat5->is_o     = !!(instr->flags & IR3_INSTR_O);
-	cat5->is_p     = !!(instr->flags & IR3_INSTR_P);
-	cat5->opc      = instr->opc;
-	cat5->jmp_tgt  = !!(instr->flags & IR3_INSTR_JP);
-	cat5->sync     = !!(instr->flags & IR3_INSTR_SY);
-	cat5->opc_cat  = 5;
-
-	return 0;
-}
-
-static int emit_cat6(struct ir3_instruction *instr, void *ptr,
-		struct ir3_info *info)
-{
-	struct ir3_register *dst, *src1, *src2;
-	instr_cat6_t *cat6 = ptr;
-	bool type_full = type_size(instr->cat6.type) == 32;
-
-	cat6->type     = instr->cat6.type;
-	cat6->opc      = instr->opc;
-	cat6->jmp_tgt  = !!(instr->flags & IR3_INSTR_JP);
-	cat6->sync     = !!(instr->flags & IR3_INSTR_SY);
-	cat6->g        = !!(instr->flags & IR3_INSTR_G);
-	cat6->opc_cat  = 6;
-
-	switch (instr->opc) {
-	case OPC_RESINFO:
-	case OPC_RESFMT:
-		iassert_type(instr->regs[0], type_full); /* dst */
-		iassert_type(instr->regs[1], type_full); /* src1 */
-		break;
-	case OPC_L2G:
-	case OPC_G2L:
-		iassert_type(instr->regs[0], true);      /* dst */
-		iassert_type(instr->regs[1], true);      /* src1 */
-		break;
-	case OPC_STG:
-	case OPC_STL:
-	case OPC_STP:
-	case OPC_STI:
-	case OPC_STLW:
-	case OPC_STIB:
-		/* no dst, so regs[0] is dummy */
-		iassert_type(instr->regs[1], true);      /* dst */
-		iassert_type(instr->regs[2], type_full); /* src1 */
-		iassert_type(instr->regs[3], true);      /* src2 */
-		break;
-	default:
-		iassert_type(instr->regs[0], type_full); /* dst */
-		iassert_type(instr->regs[1], true);      /* src1 */
-		if (instr->regs_count > 2)
-			iassert_type(instr->regs[2], true);  /* src1 */
-		break;
-	}
-
-	/* the "dst" for a store instruction is (from the perspective
-	 * of data flow in the shader, ie. register use/def, etc) in
-	 * fact a register that is read by the instruction, rather
-	 * than written:
-	 */
-	if (is_store(instr)) {
-		iassert(instr->regs_count >= 3);
-
-		dst  = instr->regs[1];
-		src1 = instr->regs[2];
-		src2 = (instr->regs_count >= 4) ? instr->regs[3] : NULL;
-	} else {
-		iassert(instr->regs_count >= 2);
-
-		dst  = instr->regs[0];
-		src1 = instr->regs[1];
-		src2 = (instr->regs_count >= 3) ? instr->regs[2] : NULL;
-	}
-
-	/* TODO we need a more comprehensive list about which instructions
-	 * can be encoded which way.  Or possibly use IR3_INSTR_0 flag to
-	 * indicate to use the src_off encoding even if offset is zero
-	 * (but then what to do about dst_off?)
-	 */
-	if (is_atomic(instr->opc)) {
-		instr_cat6ldgb_t *ldgb = ptr;
-
-		/* maybe these two bits both determine the instruction encoding? */
-		cat6->src_off = false;
-
-		ldgb->d = instr->cat6.d - 1;
-		ldgb->typed = instr->cat6.typed;
-		ldgb->type_size = instr->cat6.iim_val - 1;
-
-		ldgb->dst = reg(dst, info, instr->repeat, IR3_REG_R | IR3_REG_HALF);
-
-		if (ldgb->g) {
-			struct ir3_register *src3 = instr->regs[3];
-			struct ir3_register *src4 = instr->regs[4];
-
-			/* first src is src_ssbo: */
-			iassert(src1->flags & IR3_REG_IMMED);
-			ldgb->src_ssbo = src1->uim_val;
-
-			ldgb->src1 = reg(src2, info, instr->repeat, IR3_REG_IMMED);
-			ldgb->src1_im = !!(src2->flags & IR3_REG_IMMED);
-			ldgb->src2 = reg(src3, info, instr->repeat, IR3_REG_IMMED);
-			ldgb->src2_im = !!(src3->flags & IR3_REG_IMMED);
-
-			ldgb->src3 = reg(src4, info, instr->repeat, 0);
-			ldgb->pad0 = 0x1;
-			ldgb->pad3 = 0x1;
-		} else {
-			ldgb->src1 = reg(src1, info, instr->repeat, IR3_REG_IMMED);
-			ldgb->src1_im = !!(src1->flags & IR3_REG_IMMED);
-			ldgb->src2 = reg(src2, info, instr->repeat, IR3_REG_IMMED);
-			ldgb->src2_im = !!(src2->flags & IR3_REG_IMMED);
-			ldgb->pad0 = 0x1;
-			ldgb->pad3 = 0x0;
-		}
-
-		return 0;
-	} else if (instr->opc == OPC_LDGB) {
-		struct ir3_register *src3 = instr->regs[3];
-		instr_cat6ldgb_t *ldgb = ptr;
-
-		/* maybe these two bits both determine the instruction encoding? */
-		cat6->src_off = false;
-
-		ldgb->d = instr->cat6.d - 1;
-		ldgb->typed = instr->cat6.typed;
-		ldgb->type_size = instr->cat6.iim_val - 1;
-
-		ldgb->dst = reg(dst, info, instr->repeat, IR3_REG_R | IR3_REG_HALF);
-
-		/* first src is src_ssbo: */
-		iassert(src1->flags & IR3_REG_IMMED);
-		ldgb->src_ssbo = src1->uim_val;
-
-		/* then next two are src1/src2: */
-		ldgb->src1 = reg(src2, info, instr->repeat, IR3_REG_IMMED);
-		ldgb->src1_im = !!(src2->flags & IR3_REG_IMMED);
-		ldgb->src2 = reg(src3, info, instr->repeat, IR3_REG_IMMED);
-		ldgb->src2_im = !!(src3->flags & IR3_REG_IMMED);
-
-		ldgb->pad0 = 0x0;
-		ldgb->pad3 = 0x1;
-
-		return 0;
-	} else if (instr->opc == OPC_RESINFO) {
-		instr_cat6ldgb_t *ldgb = ptr;
-
-		ldgb->d = instr->cat6.d - 1;
-
-		ldgb->dst = reg(dst, info, instr->repeat, IR3_REG_R | IR3_REG_HALF);
-
-		/* first src is src_ssbo: */
-		iassert(src1->flags & IR3_REG_IMMED);
-		ldgb->src_ssbo = src1->uim_val;
-
-		return 0;
-	} else if ((instr->opc == OPC_STGB) || (instr->opc == OPC_STIB)) {
-		struct ir3_register *src3 = instr->regs[4];
-		instr_cat6stgb_t *stgb = ptr;
-
-		/* maybe these two bits both determine the instruction encoding? */
-		cat6->src_off = true;
-		stgb->pad3 = 0x2;
-
-		stgb->d = instr->cat6.d - 1;
-		stgb->typed = instr->cat6.typed;
-		stgb->type_size = instr->cat6.iim_val - 1;
-
-		/* first src is dst_ssbo: */
-		iassert(dst->flags & IR3_REG_IMMED);
-		stgb->dst_ssbo = dst->uim_val;
-
-		/* then src1/src2/src3: */
-		stgb->src1 = reg(src1, info, instr->repeat, 0);
-		stgb->src2 = reg(src2, info, instr->repeat, IR3_REG_IMMED);
-		stgb->src2_im = !!(src2->flags & IR3_REG_IMMED);
-		stgb->src3 = reg(src3, info, instr->repeat, IR3_REG_IMMED);
-		stgb->src3_im = !!(src3->flags & IR3_REG_IMMED);
-
-		return 0;
-	} else if (instr->cat6.src_offset || (instr->opc == OPC_LDG) ||
-			(instr->opc == OPC_LDL)) {
-		instr_cat6a_t *cat6a = ptr;
-
-		cat6->src_off = true;
-
-		cat6a->src1 = reg(src1, info, instr->repeat, IR3_REG_IMMED);
-		cat6a->src1_im = !!(src1->flags & IR3_REG_IMMED);
-		if (src2) {
-			cat6a->src2 = reg(src2, info, instr->repeat, IR3_REG_IMMED);
-			cat6a->src2_im = !!(src2->flags & IR3_REG_IMMED);
-		}
-		cat6a->off = instr->cat6.src_offset;
-	} else {
-		instr_cat6b_t *cat6b = ptr;
-
-		cat6->src_off = false;
-
-		cat6b->src1 = reg(src1, info, instr->repeat, IR3_REG_IMMED | IR3_REG_HALF);
-		cat6b->src1_im = !!(src1->flags & IR3_REG_IMMED);
-		if (src2) {
-			cat6b->src2 = reg(src2, info, instr->repeat, IR3_REG_IMMED);
-			cat6b->src2_im = !!(src2->flags & IR3_REG_IMMED);
-		}
-	}
-
-	if (instr->cat6.dst_offset || (instr->opc == OPC_STG) ||
-			(instr->opc == OPC_STL)) {
-		instr_cat6c_t *cat6c = ptr;
-		cat6->dst_off = true;
-		cat6c->dst = reg(dst, info, instr->repeat, IR3_REG_R | IR3_REG_HALF);
-		cat6c->off = instr->cat6.dst_offset;
-	} else {
-		instr_cat6d_t *cat6d = ptr;
-		cat6->dst_off = false;
-		cat6d->dst = reg(dst, info, instr->repeat, IR3_REG_R | IR3_REG_HALF);
-	}
-
-	return 0;
-}
-
-static int emit_cat7(struct ir3_instruction *instr, void *ptr,
-		struct ir3_info *info)
-{
-	instr_cat7_t *cat7 = ptr;
-
-	cat7->ss      = !!(instr->flags & IR3_INSTR_SS);
-	cat7->w       = instr->cat7.w;
-	cat7->r       = instr->cat7.r;
-	cat7->l       = instr->cat7.l;
-	cat7->g       = instr->cat7.g;
-	cat7->opc     = instr->opc;
-	cat7->jmp_tgt = !!(instr->flags & IR3_INSTR_JP);
-	cat7->sync    = !!(instr->flags & IR3_INSTR_SY);
-	cat7->opc_cat = 7;
-
-	return 0;
-}
-
-static int (*emit[])(struct ir3_instruction *instr, void *ptr,
-		struct ir3_info *info) = {
-	emit_cat0, emit_cat1, emit_cat2, emit_cat3, emit_cat4, emit_cat5, emit_cat6,
-	emit_cat7,
-};
-
-void * ir3_assemble(struct ir3 *shader, struct ir3_info *info,
-		uint32_t gpu_id)
-{
-	uint32_t *ptr, *dwords;
-
-	info->gpu_id        = gpu_id;
-	info->max_reg       = -1;
-	info->max_half_reg  = -1;
-	info->max_const     = -1;
-	info->instrs_count  = 0;
-	info->sizedwords    = 0;
-	info->ss = info->sy = 0;
-
-	list_for_each_entry (struct ir3_block, block, &shader->block_list, node) {
-		list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) {
-			info->sizedwords += 2;
-		}
-	}
-
-	/* need an integer number of instruction "groups" (sets of 16
-	 * instructions on a4xx or sets of 4 instructions on a3xx),
-	 * so pad out w/ NOPs if needed: (NOTE each instruction is 64bits)
-	 */
-	if (gpu_id >= 400) {
-		info->sizedwords = align(info->sizedwords, 16 * 2);
-	} else {
-		info->sizedwords = align(info->sizedwords, 4 * 2);
-	}
-
-	ptr = dwords = calloc(4, info->sizedwords);
-
-	list_for_each_entry (struct ir3_block, block, &shader->block_list, node) {
-		list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) {
-			int ret = emit[opc_cat(instr->opc)](instr, dwords, info);
-			if (ret)
-				goto fail;
-			info->instrs_count += 1 + instr->repeat;
-			dwords += 2;
-
-			if (instr->flags & IR3_INSTR_SS)
-				info->ss++;
-
-			if (instr->flags & IR3_INSTR_SY)
-				info->sy++;
-		}
-	}
-
-	return ptr;
-
-fail:
-	free(ptr);
-	return NULL;
-}
-
-static struct ir3_register * reg_create(struct ir3 *shader,
-		int num, int flags)
-{
-	struct ir3_register *reg =
-			ir3_alloc(shader, sizeof(struct ir3_register));
-	reg->wrmask = 1;
-	reg->flags = flags;
-	reg->num = num;
-	return reg;
-}
-
-static void insert_instr(struct ir3_block *block,
-		struct ir3_instruction *instr)
-{
-	struct ir3 *shader = block->shader;
-#ifdef DEBUG
-	instr->serialno = ++shader->instr_count;
-#endif
-	list_addtail(&instr->node, &block->instr_list);
-
-	if (is_input(instr))
-		array_insert(shader, shader->baryfs, instr);
-}
-
-struct ir3_block * ir3_block_create(struct ir3 *shader)
-{
-	struct ir3_block *block = ir3_alloc(shader, sizeof(*block));
-#ifdef DEBUG
-	block->serialno = ++shader->block_count;
-#endif
-	block->shader = shader;
-	list_inithead(&block->node);
-	list_inithead(&block->instr_list);
-	return block;
-}
-
-static struct ir3_instruction *instr_create(struct ir3_block *block, int nreg)
-{
-	struct ir3_instruction *instr;
-	unsigned sz = sizeof(*instr) + (nreg * sizeof(instr->regs[0]));
-	char *ptr = ir3_alloc(block->shader, sz);
-
-	instr = (struct ir3_instruction *)ptr;
-	ptr  += sizeof(*instr);
-	instr->regs = (struct ir3_register **)ptr;
-
-#ifdef DEBUG
-	instr->regs_max = nreg;
-#endif
-
-	return instr;
-}
-
-struct ir3_instruction * ir3_instr_create2(struct ir3_block *block,
-		opc_t opc, int nreg)
-{
-	struct ir3_instruction *instr = instr_create(block, nreg);
-	instr->block = block;
-	instr->opc = opc;
-	insert_instr(block, instr);
-	return instr;
-}
-
-struct ir3_instruction * ir3_instr_create(struct ir3_block *block, opc_t opc)
-{
-	/* NOTE: we could be slightly more clever, at least for non-meta,
-	 * and choose # of regs based on category.
-	 */
-	return ir3_instr_create2(block, opc, 4);
-}
-
-struct ir3_instruction * ir3_instr_clone(struct ir3_instruction *instr)
-{
-	struct ir3_instruction *new_instr = instr_create(instr->block,
-			instr->regs_count);
-	struct ir3_register **regs;
-	unsigned i;
-
-	regs = new_instr->regs;
-	*new_instr = *instr;
-	new_instr->regs = regs;
-
-	insert_instr(instr->block, new_instr);
-
-	/* clone registers: */
-	new_instr->regs_count = 0;
-	for (i = 0; i < instr->regs_count; i++) {
-		struct ir3_register *reg = instr->regs[i];
-		struct ir3_register *new_reg =
-				ir3_reg_create(new_instr, reg->num, reg->flags);
-		*new_reg = *reg;
-	}
-
-	return new_instr;
-}
-
-/* Add a false dependency to instruction, to ensure it is scheduled first: */
-void ir3_instr_add_dep(struct ir3_instruction *instr, struct ir3_instruction *dep)
-{
-	array_insert(instr, instr->deps, dep);
-}
-
-struct ir3_register * ir3_reg_create(struct ir3_instruction *instr,
-		int num, int flags)
-{
-	struct ir3 *shader = instr->block->shader;
-	struct ir3_register *reg = reg_create(shader, num, flags);
-#ifdef DEBUG
-	debug_assert(instr->regs_count < instr->regs_max);
-#endif
-	instr->regs[instr->regs_count++] = reg;
-	return reg;
-}
-
-struct ir3_register * ir3_reg_clone(struct ir3 *shader,
-		struct ir3_register *reg)
-{
-	struct ir3_register *new_reg = reg_create(shader, 0, 0);
-	*new_reg = *reg;
-	return new_reg;
-}
-
-void
-ir3_instr_set_address(struct ir3_instruction *instr,
-		struct ir3_instruction *addr)
-{
-	if (instr->address != addr) {
-		struct ir3 *ir = instr->block->shader;
-		instr->address = addr;
-		array_insert(ir, ir->indirects, instr);
-	}
-}
-
-void
-ir3_block_clear_mark(struct ir3_block *block)
-{
-	list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node)
-		instr->flags &= ~IR3_INSTR_MARK;
-}
-
-void
-ir3_clear_mark(struct ir3 *ir)
-{
-	list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
-		ir3_block_clear_mark(block);
-	}
-}
-
-/* note: this will destroy instr->depth, don't do it until after sched! */
-unsigned
-ir3_count_instructions(struct ir3 *ir)
-{
-	unsigned cnt = 0;
-	list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
-		list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) {
-			instr->ip = cnt++;
-		}
-		block->start_ip = list_first_entry(&block->instr_list, struct ir3_instruction, node)->ip;
-		block->end_ip = list_last_entry(&block->instr_list, struct ir3_instruction, node)->ip;
-	}
-	return cnt;
-}
-
-struct ir3_array *
-ir3_lookup_array(struct ir3 *ir, unsigned id)
-{
-	list_for_each_entry (struct ir3_array, arr, &ir->array_list, node)
-		if (arr->id == id)
-			return arr;
-	return NULL;
-}
diff --git a/src/gallium/drivers/freedreno/ir3/ir3.h b/src/gallium/drivers/freedreno/ir3/ir3.h
deleted file mode 100644
index ea3218828df..00000000000
--- a/src/gallium/drivers/freedreno/ir3/ir3.h
+++ /dev/null
@@ -1,1394 +0,0 @@
-/*
- * Copyright (c) 2013 Rob Clark <[email protected]>
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef IR3_H_
-#define IR3_H_
-
-#include <stdint.h>
-#include <stdbool.h>
-
-#include "compiler/shader_enums.h"
-
-#include "util/u_debug.h"
-#include "util/list.h"
-
-#include "instr-a3xx.h"
-
-/* low level intermediate representation of an adreno shader program */
-
-struct ir3_compiler;
-struct ir3;
-struct ir3_instruction;
-struct ir3_block;
-
-struct ir3_info {
-	uint32_t gpu_id;
-	uint16_t sizedwords;
-	uint16_t instrs_count;   /* expanded to account for rpt's */
-	/* NOTE: max_reg, etc, does not include registers not touched
-	 * by the shader (ie. vertex fetched via VFD_DECODE but not
-	 * touched by shader)
-	 */
-	int8_t   max_reg;   /* highest GPR # used by shader */
-	int8_t   max_half_reg;
-	int16_t  max_const;
-
-	/* number of sync bits: */
-	uint16_t ss, sy;
-};
-
-struct ir3_register {
-	enum {
-		IR3_REG_CONST  = 0x001,
-		IR3_REG_IMMED  = 0x002,
-		IR3_REG_HALF   = 0x004,
-		/* high registers are used for some things in compute shaders,
-		 * for example.  Seems to be for things that are global to all
-		 * threads in a wave, so possibly these are global/shared by
-		 * all the threads in the wave?
-		 */
-		IR3_REG_HIGH   = 0x008,
-		IR3_REG_RELATIV= 0x010,
-		IR3_REG_R      = 0x020,
-		/* Most instructions, it seems, can do float abs/neg but not
-		 * integer.  The CP pass needs to know what is intended (int or
-		 * float) in order to do the right thing.  For this reason the
-		 * abs/neg flags are split out into float and int variants.  In
-		 * addition, .b (bitwise) operations, the negate is actually a
-		 * bitwise not, so split that out into a new flag to make it
-		 * more clear.
-		 */
-		IR3_REG_FNEG   = 0x040,
-		IR3_REG_FABS   = 0x080,
-		IR3_REG_SNEG   = 0x100,
-		IR3_REG_SABS   = 0x200,
-		IR3_REG_BNOT   = 0x400,
-		IR3_REG_EVEN   = 0x800,
-		IR3_REG_POS_INF= 0x1000,
-		/* (ei) flag, end-input?  Set on last bary, presumably to signal
-		 * that the shader needs no more input:
-		 */
-		IR3_REG_EI     = 0x2000,
-		/* meta-flags, for intermediate stages of IR, ie.
-		 * before register assignment is done:
-		 */
-		IR3_REG_SSA    = 0x4000,   /* 'instr' is ptr to assigning instr */
-		IR3_REG_ARRAY  = 0x8000,
-
-	} flags;
-
-	/* normal registers:
-	 * the component is in the low two bits of the reg #, so
-	 * rN.x becomes: (N << 2) | x
-	 */
-	int   num;
-	union {
-		/* immediate: */
-		int32_t  iim_val;
-		uint32_t uim_val;
-		float    fim_val;
-		/* relative: */
-		struct {
-			uint16_t id;
-			int16_t offset;
-		} array;
-	};
-
-	/* For IR3_REG_SSA, src registers contain ptr back to assigning
-	 * instruction.
-	 *
-	 * For IR3_REG_ARRAY, the pointer is back to the last dependent
-	 * array access (although the net effect is the same, it points
-	 * back to a previous instruction that we depend on).
-	 */
-	struct ir3_instruction *instr;
-
-	union {
-		/* used for cat5 instructions, but also for internal/IR level
-		 * tracking of what registers are read/written by an instruction.
-		 * wrmask may be a bad name since it is used to represent both
-		 * src and dst that touch multiple adjacent registers.
-		 */
-		unsigned wrmask;
-		/* for relative addressing, 32bits for array size is too small,
-		 * but otoh we don't need to deal with disjoint sets, so instead
-		 * use a simple size field (number of scalar components).
-		 */
-		unsigned size;
-	};
-};
-
-/*
- * Stupid/simple growable array implementation:
- */
-#define DECLARE_ARRAY(type, name) \
-	unsigned name ## _count, name ## _sz; \
-	type * name;
-
-#define array_insert(ctx, arr, val) do { \
-		if (arr ## _count == arr ## _sz) { \
-			arr ## _sz = MAX2(2 * arr ## _sz, 16); \
-			arr = reralloc_size(ctx, arr, arr ## _sz * sizeof(arr[0])); \
-		} \
-		arr[arr ##_count++] = val; \
-	} while (0)
-
-struct ir3_instruction {
-	struct ir3_block *block;
-	opc_t opc;
-	enum {
-		/* (sy) flag is set on first instruction, and after sample
-		 * instructions (probably just on RAW hazard).
-		 */
-		IR3_INSTR_SY    = 0x001,
-		/* (ss) flag is set on first instruction, and first instruction
-		 * to depend on the result of "long" instructions (RAW hazard):
-		 *
-		 *   rcp, rsq, log2, exp2, sin, cos, sqrt
-		 *
-		 * It seems to synchronize until all in-flight instructions are
-		 * completed, for example:
-		 *
-		 *   rsq hr1.w, hr1.w
-		 *   add.f hr2.z, (neg)hr2.z, hc0.y
-		 *   mul.f hr2.w, (neg)hr2.y, (neg)hr2.y
-		 *   rsq hr2.x, hr2.x
-		 *   (rpt1)nop
-		 *   mad.f16 hr2.w, hr2.z, hr2.z, hr2.w
-		 *   nop
-		 *   mad.f16 hr2.w, (neg)hr0.w, (neg)hr0.w, hr2.w
-		 *   (ss)(rpt2)mul.f hr1.x, (r)hr1.x, hr1.w
-		 *   (rpt2)mul.f hr0.x, (neg)(r)hr0.x, hr2.x
-		 *
-		 * The last mul.f does not have (ss) set, presumably because the
-		 * (ss) on the previous instruction does the job.
-		 *
-		 * The blob driver also seems to set it on WAR hazards, although
-		 * not really clear if this is needed or just blob compiler being
-		 * sloppy.  So far I haven't found a case where removing the (ss)
-		 * causes problems for WAR hazard, but I could just be getting
-		 * lucky:
-		 *
-		 *   rcp r1.y, r3.y
-		 *   (ss)(rpt2)mad.f32 r3.y, (r)c9.x, r1.x, (r)r3.z
-		 *
-		 */
-		IR3_INSTR_SS    = 0x002,
-		/* (jp) flag is set on jump targets:
-		 */
-		IR3_INSTR_JP    = 0x004,
-		IR3_INSTR_UL    = 0x008,
-		IR3_INSTR_3D    = 0x010,
-		IR3_INSTR_A     = 0x020,
-		IR3_INSTR_O     = 0x040,
-		IR3_INSTR_P     = 0x080,
-		IR3_INSTR_S     = 0x100,
-		IR3_INSTR_S2EN  = 0x200,
-		IR3_INSTR_G     = 0x400,
-		IR3_INSTR_SAT   = 0x800,
-		/* meta-flags, for intermediate stages of IR, ie.
-		 * before register assignment is done:
-		 */
-		IR3_INSTR_MARK  = 0x1000,
-		IR3_INSTR_UNUSED= 0x2000,
-	} flags;
-	int repeat;
-#ifdef DEBUG
-	unsigned regs_max;
-#endif
-	unsigned regs_count;
-	struct ir3_register **regs;
-	union {
-		struct {
-			char inv;
-			char comp;
-			int  immed;
-			struct ir3_block *target;
-		} cat0;
-		struct {
-			type_t src_type, dst_type;
-		} cat1;
-		struct {
-			enum {
-				IR3_COND_LT = 0,
-				IR3_COND_LE = 1,
-				IR3_COND_GT = 2,
-				IR3_COND_GE = 3,
-				IR3_COND_EQ = 4,
-				IR3_COND_NE = 5,
-			} condition;
-		} cat2;
-		struct {
-			unsigned samp, tex;
-			type_t type;
-		} cat5;
-		struct {
-			type_t type;
-			int src_offset;
-			int dst_offset;
-			int iim_val : 3;      /* for ldgb/stgb, # of components */
-			int d : 3;
-			bool typed : 1;
-		} cat6;
-		struct {
-			unsigned w : 1;       /* write */
-			unsigned r : 1;       /* read */
-			unsigned l : 1;       /* local */
-			unsigned g : 1;       /* global */
-		} cat7;
-		/* for meta-instructions, just used to hold extra data
-		 * before instruction scheduling, etc
-		 */
-		struct {
-			int off;              /* component/offset */
-		} fo;
-		struct {
-			struct ir3_block *block;
-		} inout;
-	};
-
-	/* transient values used during various algorithms: */
-	union {
-		/* The instruction depth is the max dependency distance to output.
-		 *
-		 * You can also think of it as the "cost", if we did any sort of
-		 * optimization for register footprint.  Ie. a value that is  just
-		 * result of moving a const to a reg would have a low cost,  so to
-		 * it could make sense to duplicate the instruction at various
-		 * points where the result is needed to reduce register footprint.
-		 */
-		unsigned depth;
-		/* When we get to the RA stage, we no longer need depth, but
-		 * we do need instruction's position/name:
-		 */
-		struct {
-			uint16_t ip;
-			uint16_t name;
-		};
-	};
-
-	/* used for per-pass extra instruction data.
-	 */
-	void *data;
-
-	/* Used during CP and RA stages.  For fanin and shader inputs/
-	 * outputs where we need a sequence of consecutive registers,
-	 * keep track of each src instructions left (ie 'n-1') and right
-	 * (ie 'n+1') neighbor.  The front-end must insert enough mov's
-	 * to ensure that each instruction has at most one left and at
-	 * most one right neighbor.  During the copy-propagation pass,
-	 * we only remove mov's when we can preserve this constraint.
-	 * And during the RA stage, we use the neighbor information to
-	 * allocate a block of registers in one shot.
-	 *
-	 * TODO: maybe just add something like:
-	 *   struct ir3_instruction_ref {
-	 *       struct ir3_instruction *instr;
-	 *       unsigned cnt;
-	 *   }
-	 *
-	 * Or can we get away without the refcnt stuff?  It seems like
-	 * it should be overkill..  the problem is if, potentially after
-	 * already eliminating some mov's, if you have a single mov that
-	 * needs to be grouped with it's neighbors in two different
-	 * places (ex. shader output and a fanin).
-	 */
-	struct {
-		struct ir3_instruction *left, *right;
-		uint16_t left_cnt, right_cnt;
-	} cp;
-
-	/* an instruction can reference at most one address register amongst
-	 * it's src/dst registers.  Beyond that, you need to insert mov's.
-	 *
-	 * NOTE: do not write this directly, use ir3_instr_set_address()
-	 */
-	struct ir3_instruction *address;
-
-	/* Tracking for additional dependent instructions.  Used to handle
-	 * barriers, WAR hazards for arrays/SSBOs/etc.
-	 */
-	DECLARE_ARRAY(struct ir3_instruction *, deps);
-
-	/*
-	 * From PoV of instruction scheduling, not execution (ie. ignores global/
-	 * local distinction):
-	 *                            shared  image  atomic  SSBO  everything
-	 *   barrier()/            -   R/W     R/W    R/W     R/W       X
-	 *     groupMemoryBarrier()
-	 *   memoryBarrier()       -           R/W    R/W
-	 *     (but only images declared coherent?)
-	 *   memoryBarrierAtomic() -                  R/W
-	 *   memoryBarrierBuffer() -                          R/W
-	 *   memoryBarrierImage()  -           R/W
-	 *   memoryBarrierShared() -   R/W
-	 *
-	 * TODO I think for SSBO/image/shared, in cases where we can determine
-	 * which variable is accessed, we don't need to care about accesses to
-	 * different variables (unless declared coherent??)
-	 */
-	enum {
-		IR3_BARRIER_EVERYTHING = 1 << 0,
-		IR3_BARRIER_SHARED_R   = 1 << 1,
-		IR3_BARRIER_SHARED_W   = 1 << 2,
-		IR3_BARRIER_IMAGE_R    = 1 << 3,
-		IR3_BARRIER_IMAGE_W    = 1 << 4,
-		IR3_BARRIER_BUFFER_R   = 1 << 5,
-		IR3_BARRIER_BUFFER_W   = 1 << 6,
-		IR3_BARRIER_ARRAY_R    = 1 << 7,
-		IR3_BARRIER_ARRAY_W    = 1 << 8,
-	} barrier_class, barrier_conflict;
-
-	/* Entry in ir3_block's instruction list: */
-	struct list_head node;
-
-	int use_count;      /* currently just updated/used by cp */
-
-#ifdef DEBUG
-	uint32_t serialno;
-#endif
-};
-
-static inline struct ir3_instruction *
-ir3_neighbor_first(struct ir3_instruction *instr)
-{
-	int cnt = 0;
-	while (instr->cp.left) {
-		instr = instr->cp.left;
-		if (++cnt > 0xffff) {
-			debug_assert(0);
-			break;
-		}
-	}
-	return instr;
-}
-
-static inline int ir3_neighbor_count(struct ir3_instruction *instr)
-{
-	int num = 1;
-
-	debug_assert(!instr->cp.left);
-
-	while (instr->cp.right) {
-		num++;
-		instr = instr->cp.right;
-		if (num > 0xffff) {
-			debug_assert(0);
-			break;
-		}
-	}
-
-	return num;
-}
-
-struct ir3 {
-	struct ir3_compiler *compiler;
-
-	unsigned ninputs, noutputs;
-	struct ir3_instruction **inputs;
-	struct ir3_instruction **outputs;
-
-	/* Track bary.f (and ldlv) instructions.. this is needed in
-	 * scheduling to ensure that all varying fetches happen before
-	 * any potential kill instructions.  The hw gets grumpy if all
-	 * threads in a group are killed before the last bary.f gets
-	 * a chance to signal end of input (ei).
-	 */
-	DECLARE_ARRAY(struct ir3_instruction *, baryfs);
-
-	/* Track all indirect instructions (read and write).  To avoid
-	 * deadlock scenario where an address register gets scheduled,
-	 * but other dependent src instructions cannot be scheduled due
-	 * to dependency on a *different* address register value, the
-	 * scheduler needs to ensure that all dependencies other than
-	 * the instruction other than the address register are scheduled
-	 * before the one that writes the address register.  Having a
-	 * convenient list of instructions that reference some address
-	 * register simplifies this.
-	 */
-	DECLARE_ARRAY(struct ir3_instruction *, indirects);
-
-	/* and same for instructions that consume predicate register: */
-	DECLARE_ARRAY(struct ir3_instruction *, predicates);
-
-	/* Track texture sample instructions which need texture state
-	 * patched in (for astc-srgb workaround):
-	 */
-	DECLARE_ARRAY(struct ir3_instruction *, astc_srgb);
-
-	/* List of blocks: */
-	struct list_head block_list;
-
-	/* List of ir3_array's: */
-	struct list_head array_list;
-
-#ifdef DEBUG
-	unsigned block_count, instr_count;
-#endif
-};
-
-struct ir3_array {
-	struct list_head node;
-	unsigned length;
-	unsigned id;
-
-	struct nir_register *r;
-
-	/* To avoid array write's from getting DCE'd, keep track of the
-	 * most recent write.  Any array access depends on the most
-	 * recent write.  This way, nothing depends on writes after the
-	 * last read.  But all the writes that happen before that have
-	 * something depending on them
-	 */
-	struct ir3_instruction *last_write;
-
-	/* extra stuff used in RA pass: */
-	unsigned base;      /* base vreg name */
-	unsigned reg;       /* base physical reg */
-	uint16_t start_ip, end_ip;
-};
-
-struct ir3_array * ir3_lookup_array(struct ir3 *ir, unsigned id);
-
-struct ir3_block {
-	struct list_head node;
-	struct ir3 *shader;
-
-	const struct nir_block *nblock;
-
-	struct list_head instr_list;  /* list of ir3_instruction */
-
-	/* each block has either one or two successors.. in case of
-	 * two successors, 'condition' decides which one to follow.
-	 * A block preceding an if/else has two successors.
-	 */
-	struct ir3_instruction *condition;
-	struct ir3_block *successors[2];
-
-	unsigned predecessors_count;
-	struct ir3_block **predecessors;
-
-	uint16_t start_ip, end_ip;
-
-	/* Track instructions which do not write a register but other-
-	 * wise must not be discarded (such as kill, stg, etc)
-	 */
-	DECLARE_ARRAY(struct ir3_instruction *, keeps);
-
-	/* used for per-pass extra block data.  Mainly used right
-	 * now in RA step to track livein/liveout.
-	 */
-	void *data;
-
-#ifdef DEBUG
-	uint32_t serialno;
-#endif
-};
-
-static inline uint32_t
-block_id(struct ir3_block *block)
-{
-#ifdef DEBUG
-	return block->serialno;
-#else
-	return (uint32_t)(unsigned long)block;
-#endif
-}
-
-struct ir3 * ir3_create(struct ir3_compiler *compiler,
-		unsigned nin, unsigned nout);
-void ir3_destroy(struct ir3 *shader);
-void * ir3_assemble(struct ir3 *shader,
-		struct ir3_info *info, uint32_t gpu_id);
-void * ir3_alloc(struct ir3 *shader, int sz);
-
-struct ir3_block * ir3_block_create(struct ir3 *shader);
-
-struct ir3_instruction * ir3_instr_create(struct ir3_block *block, opc_t opc);
-struct ir3_instruction * ir3_instr_create2(struct ir3_block *block,
-		opc_t opc, int nreg);
-struct ir3_instruction * ir3_instr_clone(struct ir3_instruction *instr);
-void ir3_instr_add_dep(struct ir3_instruction *instr, struct ir3_instruction *dep);
-const char *ir3_instr_name(struct ir3_instruction *instr);
-
-struct ir3_register * ir3_reg_create(struct ir3_instruction *instr,
-		int num, int flags);
-struct ir3_register * ir3_reg_clone(struct ir3 *shader,
-		struct ir3_register *reg);
-
-void ir3_instr_set_address(struct ir3_instruction *instr,
-		struct ir3_instruction *addr);
-
-static inline bool ir3_instr_check_mark(struct ir3_instruction *instr)
-{
-	if (instr->flags & IR3_INSTR_MARK)
-		return true;  /* already visited */
-	instr->flags |= IR3_INSTR_MARK;
-	return false;
-}
-
-void ir3_block_clear_mark(struct ir3_block *block);
-void ir3_clear_mark(struct ir3 *shader);
-
-unsigned ir3_count_instructions(struct ir3 *ir);
-
-static inline int ir3_instr_regno(struct ir3_instruction *instr,
-		struct ir3_register *reg)
-{
-	unsigned i;
-	for (i = 0; i < instr->regs_count; i++)
-		if (reg == instr->regs[i])
-			return i;
-	return -1;
-}
-
-
-#define MAX_ARRAYS 16
-
-/* comp:
- *   0 - x
- *   1 - y
- *   2 - z
- *   3 - w
- */
-static inline uint32_t regid(int num, int comp)
-{
-	return (num << 2) | (comp & 0x3);
-}
-
-static inline uint32_t reg_num(struct ir3_register *reg)
-{
-	return reg->num >> 2;
-}
-
-static inline uint32_t reg_comp(struct ir3_register *reg)
-{
-	return reg->num & 0x3;
-}
-
-static inline bool is_flow(struct ir3_instruction *instr)
-{
-	return (opc_cat(instr->opc) == 0);
-}
-
-static inline bool is_kill(struct ir3_instruction *instr)
-{
-	return instr->opc == OPC_KILL;
-}
-
-static inline bool is_nop(struct ir3_instruction *instr)
-{
-	return instr->opc == OPC_NOP;
-}
-
-/* Is it a non-transformative (ie. not type changing) mov?  This can
- * also include absneg.s/absneg.f, which for the most part can be
- * treated as a mov (single src argument).
- */
-static inline bool is_same_type_mov(struct ir3_instruction *instr)
-{
-	struct ir3_register *dst;
-
-	switch (instr->opc) {
-	case OPC_MOV:
-		if (instr->cat1.src_type != instr->cat1.dst_type)
-			return false;
-		break;
-	case OPC_ABSNEG_F:
-	case OPC_ABSNEG_S:
-		if (instr->flags & IR3_INSTR_SAT)
-			return false;
-		break;
-	default:
-		return false;
-	}
-
-	dst = instr->regs[0];
-
-	/* mov's that write to a0.x or p0.x are special: */
-	if (dst->num == regid(REG_P0, 0))
-		return false;
-	if (dst->num == regid(REG_A0, 0))
-		return false;
-
-	if (dst->flags & (IR3_REG_RELATIV | IR3_REG_ARRAY))
-		return false;
-
-	return true;
-}
-
-static inline bool is_alu(struct ir3_instruction *instr)
-{
-	return (1 <= opc_cat(instr->opc)) && (opc_cat(instr->opc) <= 3);
-}
-
-static inline bool is_sfu(struct ir3_instruction *instr)
-{
-	return (opc_cat(instr->opc) == 4);
-}
-
-static inline bool is_tex(struct ir3_instruction *instr)
-{
-	return (opc_cat(instr->opc) == 5);
-}
-
-static inline bool is_mem(struct ir3_instruction *instr)
-{
-	return (opc_cat(instr->opc) == 6);
-}
-
-static inline bool is_barrier(struct ir3_instruction *instr)
-{
-	return (opc_cat(instr->opc) == 7);
-}
-
-static inline bool
-is_store(struct ir3_instruction *instr)
-{
-	/* these instructions, the "destination" register is
-	 * actually a source, the address to store to.
-	 */
-	switch (instr->opc) {
-	case OPC_STG:
-	case OPC_STGB:
-	case OPC_STIB:
-	case OPC_STP:
-	case OPC_STL:
-	case OPC_STLW:
-	case OPC_L2G:
-	case OPC_G2L:
-		return true;
-	default:
-		return false;
-	}
-}
-
-static inline bool is_load(struct ir3_instruction *instr)
-{
-	switch (instr->opc) {
-	case OPC_LDG:
-	case OPC_LDGB:
-	case OPC_LDL:
-	case OPC_LDP:
-	case OPC_L2G:
-	case OPC_LDLW:
-	case OPC_LDC:
-	case OPC_LDLV:
-		/* probably some others too.. */
-		return true;
-	default:
-		return false;
-	}
-}
-
-static inline bool is_input(struct ir3_instruction *instr)
-{
-	/* in some cases, ldlv is used to fetch varying without
-	 * interpolation.. fortunately inloc is the first src
-	 * register in either case
-	 */
-	switch (instr->opc) {
-	case OPC_LDLV:
-	case OPC_BARY_F:
-		return true;
-	default:
-		return false;
-	}
-}
-
-static inline bool is_bool(struct ir3_instruction *instr)
-{
-	switch (instr->opc) {
-	case OPC_CMPS_F:
-	case OPC_CMPS_S:
-	case OPC_CMPS_U:
-		return true;
-	default:
-		return false;
-	}
-}
-
-static inline bool is_meta(struct ir3_instruction *instr)
-{
-	/* TODO how should we count PHI (and maybe fan-in/out) which
-	 * might actually contribute some instructions to the final
-	 * result?
-	 */
-	return (opc_cat(instr->opc) == -1);
-}
-
-static inline bool writes_addr(struct ir3_instruction *instr)
-{
-	if (instr->regs_count > 0) {
-		struct ir3_register *dst = instr->regs[0];
-		return reg_num(dst) == REG_A0;
-	}
-	return false;
-}
-
-static inline bool writes_pred(struct ir3_instruction *instr)
-{
-	if (instr->regs_count > 0) {
-		struct ir3_register *dst = instr->regs[0];
-		return reg_num(dst) == REG_P0;
-	}
-	return false;
-}
-
-/* returns defining instruction for reg */
-/* TODO better name */
-static inline struct ir3_instruction *ssa(struct ir3_register *reg)
-{
-	if (reg->flags & (IR3_REG_SSA | IR3_REG_ARRAY)) {
-		return reg->instr;
-	}
-	return NULL;
-}
-
-static inline bool conflicts(struct ir3_instruction *a,
-		struct ir3_instruction *b)
-{
-	return (a && b) && (a != b);
-}
-
-static inline bool reg_gpr(struct ir3_register *r)
-{
-	if (r->flags & (IR3_REG_CONST | IR3_REG_IMMED))
-		return false;
-	if ((reg_num(r) == REG_A0) || (reg_num(r) == REG_P0))
-		return false;
-	return true;
-}
-
-static inline type_t half_type(type_t type)
-{
-	switch (type) {
-	case TYPE_F32: return TYPE_F16;
-	case TYPE_U32: return TYPE_U16;
-	case TYPE_S32: return TYPE_S16;
-	case TYPE_F16:
-	case TYPE_U16:
-	case TYPE_S16:
-		return type;
-	default:
-		assert(0);
-		return ~0;
-	}
-}
-
-/* some cat2 instructions (ie. those which are not float) can embed an
- * immediate:
- */
-static inline bool ir3_cat2_int(opc_t opc)
-{
-	switch (opc) {
-	case OPC_ADD_U:
-	case OPC_ADD_S:
-	case OPC_SUB_U:
-	case OPC_SUB_S:
-	case OPC_CMPS_U:
-	case OPC_CMPS_S:
-	case OPC_MIN_U:
-	case OPC_MIN_S:
-	case OPC_MAX_U:
-	case OPC_MAX_S:
-	case OPC_CMPV_U:
-	case OPC_CMPV_S:
-	case OPC_MUL_U:
-	case OPC_MUL_S:
-	case OPC_MULL_U:
-	case OPC_CLZ_S:
-	case OPC_ABSNEG_S:
-	case OPC_AND_B:
-	case OPC_OR_B:
-	case OPC_NOT_B:
-	case OPC_XOR_B:
-	case OPC_BFREV_B:
-	case OPC_CLZ_B:
-	case OPC_SHL_B:
-	case OPC_SHR_B:
-	case OPC_ASHR_B:
-	case OPC_MGEN_B:
-	case OPC_GETBIT_B:
-	case OPC_CBITS_B:
-	case OPC_BARY_F:
-		return true;
-
-	default:
-		return false;
-	}
-}
-
-
-/* map cat2 instruction to valid abs/neg flags: */
-static inline unsigned ir3_cat2_absneg(opc_t opc)
-{
-	switch (opc) {
-	case OPC_ADD_F:
-	case OPC_MIN_F:
-	case OPC_MAX_F:
-	case OPC_MUL_F:
-	case OPC_SIGN_F:
-	case OPC_CMPS_F:
-	case OPC_ABSNEG_F:
-	case OPC_CMPV_F:
-	case OPC_FLOOR_F:
-	case OPC_CEIL_F:
-	case OPC_RNDNE_F:
-	case OPC_RNDAZ_F:
-	case OPC_TRUNC_F:
-	case OPC_BARY_F:
-		return IR3_REG_FABS | IR3_REG_FNEG;
-
-	case OPC_ADD_U:
-	case OPC_ADD_S:
-	case OPC_SUB_U:
-	case OPC_SUB_S:
-	case OPC_CMPS_U:
-	case OPC_CMPS_S:
-	case OPC_MIN_U:
-	case OPC_MIN_S:
-	case OPC_MAX_U:
-	case OPC_MAX_S:
-	case OPC_CMPV_U:
-	case OPC_CMPV_S:
-	case OPC_MUL_U:
-	case OPC_MUL_S:
-	case OPC_MULL_U:
-	case OPC_CLZ_S:
-		return 0;
-
-	case OPC_ABSNEG_S:
-		return IR3_REG_SABS | IR3_REG_SNEG;
-
-	case OPC_AND_B:
-	case OPC_OR_B:
-	case OPC_NOT_B:
-	case OPC_XOR_B:
-	case OPC_BFREV_B:
-	case OPC_CLZ_B:
-	case OPC_SHL_B:
-	case OPC_SHR_B:
-	case OPC_ASHR_B:
-	case OPC_MGEN_B:
-	case OPC_GETBIT_B:
-	case OPC_CBITS_B:
-		return IR3_REG_BNOT;
-
-	default:
-		return 0;
-	}
-}
-
-/* map cat3 instructions to valid abs/neg flags: */
-static inline unsigned ir3_cat3_absneg(opc_t opc)
-{
-	switch (opc) {
-	case OPC_MAD_F16:
-	case OPC_MAD_F32:
-	case OPC_SEL_F16:
-	case OPC_SEL_F32:
-		return IR3_REG_FNEG;
-
-	case OPC_MAD_U16:
-	case OPC_MADSH_U16:
-	case OPC_MAD_S16:
-	case OPC_MADSH_M16:
-	case OPC_MAD_U24:
-	case OPC_MAD_S24:
-	case OPC_SEL_S16:
-	case OPC_SEL_S32:
-	case OPC_SAD_S16:
-	case OPC_SAD_S32:
-		/* neg *may* work on 3rd src.. */
-
-	case OPC_SEL_B16:
-	case OPC_SEL_B32:
-
-	default:
-		return 0;
-	}
-}
-
-#define MASK(n) ((1 << (n)) - 1)
-
-/* iterator for an instructions's sources (reg), also returns src #: */
-#define foreach_src_n(__srcreg, __n, __instr) \
-	if ((__instr)->regs_count) \
-		for (unsigned __cnt = (__instr)->regs_count - 1, __n = 0; __n < __cnt; __n++) \
-			if ((__srcreg = (__instr)->regs[__n + 1]))
-
-/* iterator for an instructions's sources (reg): */
-#define foreach_src(__srcreg, __instr) \
-	foreach_src_n(__srcreg, __i, __instr)
-
-static inline unsigned __ssa_src_cnt(struct ir3_instruction *instr)
-{
-	unsigned cnt = instr->regs_count + instr->deps_count;
-	if (instr->address)
-		cnt++;
-	return cnt;
-}
-
-static inline struct ir3_instruction * __ssa_src_n(struct ir3_instruction *instr, unsigned n)
-{
-	if (n == (instr->regs_count + instr->deps_count))
-		return instr->address;
-	if (n >= instr->regs_count)
-		return instr->deps[n - instr->regs_count];
-	return ssa(instr->regs[n]);
-}
-
-static inline bool __is_false_dep(struct ir3_instruction *instr, unsigned n)
-{
-	if (n == (instr->regs_count + instr->deps_count))
-		return false;
-	if (n >= instr->regs_count)
-		return true;
-	return false;
-}
-
-#define __src_cnt(__instr) ((__instr)->address ? (__instr)->regs_count : (__instr)->regs_count - 1)
-
-/* iterator for an instruction's SSA sources (instr), also returns src #: */
-#define foreach_ssa_src_n(__srcinst, __n, __instr) \
-	for (unsigned __cnt = __ssa_src_cnt(__instr), __n = 0; __n < __cnt; __n++) \
-		if ((__srcinst = __ssa_src_n(__instr, __n)))
-
-/* iterator for an instruction's SSA sources (instr): */
-#define foreach_ssa_src(__srcinst, __instr) \
-	foreach_ssa_src_n(__srcinst, __i, __instr)
-
-
-/* dump: */
-void ir3_print(struct ir3 *ir);
-void ir3_print_instr(struct ir3_instruction *instr);
-
-/* depth calculation: */
-int ir3_delayslots(struct ir3_instruction *assigner,
-		struct ir3_instruction *consumer, unsigned n);
-void ir3_insert_by_depth(struct ir3_instruction *instr, struct list_head *list);
-void ir3_depth(struct ir3 *ir);
-
-/* copy-propagate: */
-struct ir3_shader_variant;
-void ir3_cp(struct ir3 *ir, struct ir3_shader_variant *so);
-
-/* group neighbors and insert mov's to resolve conflicts: */
-void ir3_group(struct ir3 *ir);
-
-/* scheduling: */
-void ir3_sched_add_deps(struct ir3 *ir);
-int ir3_sched(struct ir3 *ir);
-
-/* register assignment: */
-struct ir3_ra_reg_set * ir3_ra_alloc_reg_set(struct ir3_compiler *compiler);
-int ir3_ra(struct ir3 *ir3, gl_shader_stage type,
-		bool frag_coord, bool frag_face);
-
-/* legalize: */
-void ir3_legalize(struct ir3 *ir, int *num_samp, bool *has_ssbo, int *max_bary);
-
-/* ************************************************************************* */
-/* instruction helpers */
-
-/* creates SSA src of correct type (ie. half vs full precision) */
-static inline struct ir3_register * __ssa_src(struct ir3_instruction *instr,
-		struct ir3_instruction *src, unsigned flags)
-{
-	struct ir3_register *reg;
-	if (src->regs[0]->flags & IR3_REG_HALF)
-		flags |= IR3_REG_HALF;
-	reg = ir3_reg_create(instr, 0, IR3_REG_SSA | flags);
-	reg->instr = src;
-	return reg;
-}
-
-static inline struct ir3_instruction *
-ir3_MOV(struct ir3_block *block, struct ir3_instruction *src, type_t type)
-{
-	struct ir3_instruction *instr = ir3_instr_create(block, OPC_MOV);
-	ir3_reg_create(instr, 0, 0);   /* dst */
-	if (src->regs[0]->flags & IR3_REG_ARRAY) {
-		struct ir3_register *src_reg = __ssa_src(instr, src, IR3_REG_ARRAY);
-		src_reg->array = src->regs[0]->array;
-	} else {
-		__ssa_src(instr, src, 0);
-	}
-	debug_assert(!(src->regs[0]->flags & IR3_REG_RELATIV));
-	instr->cat1.src_type = type;
-	instr->cat1.dst_type = type;
-	return instr;
-}
-
-static inline struct ir3_instruction *
-ir3_COV(struct ir3_block *block, struct ir3_instruction *src,
-		type_t src_type, type_t dst_type)
-{
-	struct ir3_instruction *instr = ir3_instr_create(block, OPC_MOV);
-	unsigned dst_flags = (type_size(dst_type) < 32) ? IR3_REG_HALF : 0;
-	unsigned src_flags = (type_size(src_type) < 32) ? IR3_REG_HALF : 0;
-
-	debug_assert((src->regs[0]->flags & IR3_REG_HALF) == src_flags);
-
-	ir3_reg_create(instr, 0, dst_flags);   /* dst */
-	__ssa_src(instr, src, 0);
-	instr->cat1.src_type = src_type;
-	instr->cat1.dst_type = dst_type;
-	debug_assert(!(src->regs[0]->flags & IR3_REG_ARRAY));
-	return instr;
-}
-
-static inline struct ir3_instruction *
-ir3_NOP(struct ir3_block *block)
-{
-	return ir3_instr_create(block, OPC_NOP);
-}
-
-#define INSTR0(name)                                                     \
-static inline struct ir3_instruction *                                   \
-ir3_##name(struct ir3_block *block)                                      \
-{                                                                        \
-	struct ir3_instruction *instr =                                      \
-		ir3_instr_create(block, OPC_##name);                             \
-	return instr;                                                        \
-}
-
-#define INSTR1(name)                                                     \
-static inline struct ir3_instruction *                                   \
-ir3_##name(struct ir3_block *block,                                      \
-		struct ir3_instruction *a, unsigned aflags)                      \
-{                                                                        \
-	struct ir3_instruction *instr =                                      \
-		ir3_instr_create(block, OPC_##name);                             \
-	ir3_reg_create(instr, 0, 0);   /* dst */                             \
-	__ssa_src(instr, a, aflags);                                         \
-	return instr;                                                        \
-}
-
-#define INSTR2(name)                                                     \
-static inline struct ir3_instruction *                                   \
-ir3_##name(struct ir3_block *block,                                      \
-		struct ir3_instruction *a, unsigned aflags,                      \
-		struct ir3_instruction *b, unsigned bflags)                      \
-{                                                                        \
-	struct ir3_instruction *instr =                                      \
-		ir3_instr_create(block, OPC_##name);                             \
-	ir3_reg_create(instr, 0, 0);   /* dst */                             \
-	__ssa_src(instr, a, aflags);                                         \
-	__ssa_src(instr, b, bflags);                                         \
-	return instr;                                                        \
-}
-
-#define INSTR3(name)                                                     \
-static inline struct ir3_instruction *                                   \
-ir3_##name(struct ir3_block *block,                                      \
-		struct ir3_instruction *a, unsigned aflags,                      \
-		struct ir3_instruction *b, unsigned bflags,                      \
-		struct ir3_instruction *c, unsigned cflags)                      \
-{                                                                        \
-	struct ir3_instruction *instr =                                      \
-		ir3_instr_create(block, OPC_##name);                             \
-	ir3_reg_create(instr, 0, 0);   /* dst */                             \
-	__ssa_src(instr, a, aflags);                                         \
-	__ssa_src(instr, b, bflags);                                         \
-	__ssa_src(instr, c, cflags);                                         \
-	return instr;                                                        \
-}
-
-#define INSTR4(name)                                                     \
-static inline struct ir3_instruction *                                   \
-ir3_##name(struct ir3_block *block,                                      \
-		struct ir3_instruction *a, unsigned aflags,                      \
-		struct ir3_instruction *b, unsigned bflags,                      \
-		struct ir3_instruction *c, unsigned cflags,                      \
-		struct ir3_instruction *d, unsigned dflags)                      \
-{                                                                        \
-	struct ir3_instruction *instr =                                      \
-		ir3_instr_create2(block, OPC_##name, 5);                         \
-	ir3_reg_create(instr, 0, 0);   /* dst */                             \
-	__ssa_src(instr, a, aflags);                                         \
-	__ssa_src(instr, b, bflags);                                         \
-	__ssa_src(instr, c, cflags);                                         \
-	__ssa_src(instr, d, dflags);                                         \
-	return instr;                                                        \
-}
-
-#define INSTR4F(f, name)                                                 \
-static inline struct ir3_instruction *                                   \
-ir3_##name##_##f(struct ir3_block *block,                                \
-		struct ir3_instruction *a, unsigned aflags,                      \
-		struct ir3_instruction *b, unsigned bflags,                      \
-		struct ir3_instruction *c, unsigned cflags,                      \
-		struct ir3_instruction *d, unsigned dflags)                      \
-{                                                                        \
-	struct ir3_instruction *instr =                                      \
-		ir3_instr_create2(block, OPC_##name, 5);                         \
-	ir3_reg_create(instr, 0, 0);   /* dst */                             \
-	__ssa_src(instr, a, aflags);                                         \
-	__ssa_src(instr, b, bflags);                                         \
-	__ssa_src(instr, c, cflags);                                         \
-	__ssa_src(instr, d, dflags);                                         \
-	instr->flags |= IR3_INSTR_##f;                                       \
-	return instr;                                                        \
-}
-
-/* cat0 instructions: */
-INSTR0(BR)
-INSTR0(JUMP)
-INSTR1(KILL)
-INSTR0(END)
-
-/* cat2 instructions, most 2 src but some 1 src: */
-INSTR2(ADD_F)
-INSTR2(MIN_F)
-INSTR2(MAX_F)
-INSTR2(MUL_F)
-INSTR1(SIGN_F)
-INSTR2(CMPS_F)
-INSTR1(ABSNEG_F)
-INSTR2(CMPV_F)
-INSTR1(FLOOR_F)
-INSTR1(CEIL_F)
-INSTR1(RNDNE_F)
-INSTR1(RNDAZ_F)
-INSTR1(TRUNC_F)
-INSTR2(ADD_U)
-INSTR2(ADD_S)
-INSTR2(SUB_U)
-INSTR2(SUB_S)
-INSTR2(CMPS_U)
-INSTR2(CMPS_S)
-INSTR2(MIN_U)
-INSTR2(MIN_S)
-INSTR2(MAX_U)
-INSTR2(MAX_S)
-INSTR1(ABSNEG_S)
-INSTR2(AND_B)
-INSTR2(OR_B)
-INSTR1(NOT_B)
-INSTR2(XOR_B)
-INSTR2(CMPV_U)
-INSTR2(CMPV_S)
-INSTR2(MUL_U)
-INSTR2(MUL_S)
-INSTR2(MULL_U)
-INSTR1(BFREV_B)
-INSTR1(CLZ_S)
-INSTR1(CLZ_B)
-INSTR2(SHL_B)
-INSTR2(SHR_B)
-INSTR2(ASHR_B)
-INSTR2(BARY_F)
-INSTR2(MGEN_B)
-INSTR2(GETBIT_B)
-INSTR1(SETRM)
-INSTR1(CBITS_B)
-INSTR2(SHB)
-INSTR2(MSAD)
-
-/* cat3 instructions: */
-INSTR3(MAD_U16)
-INSTR3(MADSH_U16)
-INSTR3(MAD_S16)
-INSTR3(MADSH_M16)
-INSTR3(MAD_U24)
-INSTR3(MAD_S24)
-INSTR3(MAD_F16)
-INSTR3(MAD_F32)
-INSTR3(SEL_B16)
-INSTR3(SEL_B32)
-INSTR3(SEL_S16)
-INSTR3(SEL_S32)
-INSTR3(SEL_F16)
-INSTR3(SEL_F32)
-INSTR3(SAD_S16)
-INSTR3(SAD_S32)
-
-/* cat4 instructions: */
-INSTR1(RCP)
-INSTR1(RSQ)
-INSTR1(LOG2)
-INSTR1(EXP2)
-INSTR1(SIN)
-INSTR1(COS)
-INSTR1(SQRT)
-
-/* cat5 instructions: */
-INSTR1(DSX)
-INSTR1(DSY)
-
-static inline struct ir3_instruction *
-ir3_SAM(struct ir3_block *block, opc_t opc, type_t type,
-		unsigned wrmask, unsigned flags, unsigned samp, unsigned tex,
-		struct ir3_instruction *src0, struct ir3_instruction *src1)
-{
-	struct ir3_instruction *sam;
-	struct ir3_register *reg;
-
-	sam = ir3_instr_create(block, opc);
-	sam->flags |= flags;
-	ir3_reg_create(sam, 0, 0)->wrmask = wrmask;
-	if (src0) {
-		reg = ir3_reg_create(sam, 0, IR3_REG_SSA);
-		reg->wrmask = (1 << (src0->regs_count - 1)) - 1;
-		reg->instr = src0;
-	}
-	if (src1) {
-		reg = ir3_reg_create(sam, 0, IR3_REG_SSA);
-		reg->instr = src1;
-		reg->wrmask = (1 << (src1->regs_count - 1)) - 1;
-	}
-	sam->cat5.samp = samp;
-	sam->cat5.tex  = tex;
-	sam->cat5.type  = type;
-
-	return sam;
-}
-
-/* cat6 instructions: */
-INSTR2(LDLV)
-INSTR2(LDG)
-INSTR2(LDL)
-INSTR3(STG)
-INSTR3(STL)
-INSTR3(LDGB)
-INSTR4(STGB)
-INSTR4(STIB)
-INSTR1(RESINFO)
-INSTR1(RESFMT)
-INSTR2(ATOMIC_ADD)
-INSTR2(ATOMIC_SUB)
-INSTR2(ATOMIC_XCHG)
-INSTR2(ATOMIC_INC)
-INSTR2(ATOMIC_DEC)
-INSTR2(ATOMIC_CMPXCHG)
-INSTR2(ATOMIC_MIN)
-INSTR2(ATOMIC_MAX)
-INSTR2(ATOMIC_AND)
-INSTR2(ATOMIC_OR)
-INSTR2(ATOMIC_XOR)
-INSTR4F(G, ATOMIC_ADD)
-INSTR4F(G, ATOMIC_SUB)
-INSTR4F(G, ATOMIC_XCHG)
-INSTR4F(G, ATOMIC_INC)
-INSTR4F(G, ATOMIC_DEC)
-INSTR4F(G, ATOMIC_CMPXCHG)
-INSTR4F(G, ATOMIC_MIN)
-INSTR4F(G, ATOMIC_MAX)
-INSTR4F(G, ATOMIC_AND)
-INSTR4F(G, ATOMIC_OR)
-INSTR4F(G, ATOMIC_XOR)
-
-/* cat7 instructions: */
-INSTR0(BAR)
-INSTR0(FENCE)
-
-/* ************************************************************************* */
-/* split this out or find some helper to use.. like main/bitset.h.. */
-
-#include <string.h>
-
-#define MAX_REG 256
-
-typedef uint8_t regmask_t[2 * MAX_REG / 8];
-
-static inline unsigned regmask_idx(struct ir3_register *reg)
-{
-	unsigned num = (reg->flags & IR3_REG_RELATIV) ? reg->array.offset : reg->num;
-	debug_assert(num < MAX_REG);
-	if (reg->flags & IR3_REG_HALF)
-		num += MAX_REG;
-	return num;
-}
-
-static inline void regmask_init(regmask_t *regmask)
-{
-	memset(regmask, 0, sizeof(*regmask));
-}
-
-static inline void regmask_set(regmask_t *regmask, struct ir3_register *reg)
-{
-	unsigned idx = regmask_idx(reg);
-	if (reg->flags & IR3_REG_RELATIV) {
-		unsigned i;
-		for (i = 0; i < reg->size; i++, idx++)
-			(*regmask)[idx / 8] |= 1 << (idx % 8);
-	} else {
-		unsigned mask;
-		for (mask = reg->wrmask; mask; mask >>= 1, idx++)
-			if (mask & 1)
-				(*regmask)[idx / 8] |= 1 << (idx % 8);
-	}
-}
-
-static inline void regmask_or(regmask_t *dst, regmask_t *a, regmask_t *b)
-{
-	unsigned i;
-	for (i = 0; i < ARRAY_SIZE(*dst); i++)
-		(*dst)[i] = (*a)[i] | (*b)[i];
-}
-
-/* set bits in a if not set in b, conceptually:
- *   a |= (reg & ~b)
- */
-static inline void regmask_set_if_not(regmask_t *a,
-		struct ir3_register *reg, regmask_t *b)
-{
-	unsigned idx = regmask_idx(reg);
-	if (reg->flags & IR3_REG_RELATIV) {
-		unsigned i;
-		for (i = 0; i < reg->size; i++, idx++)
-			if (!((*b)[idx / 8] & (1 << (idx % 8))))
-				(*a)[idx / 8] |= 1 << (idx % 8);
-	} else {
-		unsigned mask;
-		for (mask = reg->wrmask; mask; mask >>= 1, idx++)
-			if (mask & 1)
-				if (!((*b)[idx / 8] & (1 << (idx % 8))))
-					(*a)[idx / 8] |= 1 << (idx % 8);
-	}
-}
-
-static inline bool regmask_get(regmask_t *regmask,
-		struct ir3_register *reg)
-{
-	unsigned idx = regmask_idx(reg);
-	if (reg->flags & IR3_REG_RELATIV) {
-		unsigned i;
-		for (i = 0; i < reg->size; i++, idx++)
-			if ((*regmask)[idx / 8] & (1 << (idx % 8)))
-				return true;
-	} else {
-		unsigned mask;
-		for (mask = reg->wrmask; mask; mask >>= 1, idx++)
-			if (mask & 1)
-				if ((*regmask)[idx / 8] & (1 << (idx % 8)))
-					return true;
-	}
-	return false;
-}
-
-/* ************************************************************************* */
-
-#endif /* IR3_H_ */
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_cache.h b/src/gallium/drivers/freedreno/ir3/ir3_cache.h
index 3d3a7f8050d..73d555e92ce 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_cache.h
+++ b/src/gallium/drivers/freedreno/ir3/ir3_cache.h
@@ -27,7 +27,7 @@
 #ifndef IR3_CACHE_H_
 #define IR3_CACHE_H_
 
-#include "ir3_shader.h"
+#include "ir3/ir3_shader.h"
 
 /*
  * An in-memory cache for mapping shader state objects plus shader key to
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_cmdline.c b/src/gallium/drivers/freedreno/ir3/ir3_cmdline.c
index d12cdd353ab..47fd5dfd012 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_cmdline.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_cmdline.c
@@ -37,11 +37,11 @@
 #include "tgsi/tgsi_text.h"
 #include "tgsi/tgsi_dump.h"
 
-#include "ir3_compiler.h"
-#include "ir3_gallium.h"
-#include "ir3_nir.h"
-#include "instr-a3xx.h"
-#include "ir3.h"
+#include "ir3/ir3_compiler.h"
+#include "ir3/ir3_gallium.h"
+#include "ir3/ir3_nir.h"
+#include "ir3/instr-a3xx.h"
+#include "ir3/ir3.h"
 
 #include "compiler/glsl/standalone.h"
 #include "compiler/glsl/glsl_to_nir.h"
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_compiler.c b/src/gallium/drivers/freedreno/ir3/ir3_compiler.c
deleted file mode 100644
index f00daebabf5..00000000000
--- a/src/gallium/drivers/freedreno/ir3/ir3_compiler.c
+++ /dev/null
@@ -1,71 +0,0 @@
-/*
- * Copyright (C) 2015 Rob Clark <[email protected]>
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- *
- * Authors:
- *    Rob Clark <[email protected]>
- */
-
-#include "util/ralloc.h"
-
-#include "ir3_compiler.h"
-
-static const struct debug_named_value shader_debug_options[] = {
-		{"vs", IR3_DBG_SHADER_VS, "Print shader disasm for vertex shaders"},
-		{"fs", IR3_DBG_SHADER_FS, "Print shader disasm for fragment shaders"},
-		{"cs", IR3_DBG_SHADER_CS, "Print shader disasm for compute shaders"},
-		{"disasm",  IR3_DBG_DISASM, "Dump NIR and adreno shader disassembly"},
-		{"optmsgs", IR3_DBG_OPTMSGS,"Enable optimizer debug messages"},
-		DEBUG_NAMED_VALUE_END
-};
-
-DEBUG_GET_ONCE_FLAGS_OPTION(ir3_shader_debug, "IR3_SHADER_DEBUG", shader_debug_options, 0)
-
-enum ir3_shader_debug ir3_shader_debug = 0;
-
-struct ir3_compiler * ir3_compiler_create(struct fd_device *dev, uint32_t gpu_id)
-{
-	struct ir3_compiler *compiler = rzalloc(NULL, struct ir3_compiler);
-
-	ir3_shader_debug = debug_get_option_ir3_shader_debug();
-
-	compiler->dev = dev;
-	compiler->gpu_id = gpu_id;
-	compiler->set = ir3_ra_alloc_reg_set(compiler);
-
-	if (compiler->gpu_id >= 400) {
-		/* need special handling for "flat" */
-		compiler->flat_bypass = true;
-		compiler->levels_add_one = false;
-		compiler->unminify_coords = false;
-		compiler->txf_ms_with_isaml = false;
-		compiler->array_index_add_half = true;
-	} else {
-		/* no special handling for "flat" */
-		compiler->flat_bypass = false;
-		compiler->levels_add_one = true;
-		compiler->unminify_coords = true;
-		compiler->txf_ms_with_isaml = true;
-		compiler->array_index_add_half = false;
-	}
-
-	return compiler;
-}
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_compiler.h b/src/gallium/drivers/freedreno/ir3/ir3_compiler.h
deleted file mode 100644
index e2336062b29..00000000000
--- a/src/gallium/drivers/freedreno/ir3/ir3_compiler.h
+++ /dev/null
@@ -1,96 +0,0 @@
-/*
- * Copyright (C) 2013 Rob Clark <[email protected]>
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- *
- * Authors:
- *    Rob Clark <[email protected]>
- */
-
-#ifndef IR3_COMPILER_H_
-#define IR3_COMPILER_H_
-
-#include "ir3_shader.h"
-
-struct ir3_ra_reg_set;
-
-struct ir3_compiler {
-	struct fd_device *dev;
-	uint32_t gpu_id;
-	struct ir3_ra_reg_set *set;
-	uint32_t shader_count;
-
-	/*
-	 * Configuration options for things that are handled differently on
-	 * different generations:
-	 */
-
-	/* a4xx (and later) drops SP_FS_FLAT_SHAD_MODE_REG_* for flat-interpolate
-	 * so we need to use ldlv.u32 to load the varying directly:
-	 */
-	bool flat_bypass;
-
-	/* on a3xx, we need to add one to # of array levels:
-	 */
-	bool levels_add_one;
-
-	/* on a3xx, we need to scale up integer coords for isaml based
-	 * on LoD:
-	 */
-	bool unminify_coords;
-
-	/* on a3xx do txf_ms w/ isaml and scaled coords: */
-	bool txf_ms_with_isaml;
-
-	/* on a4xx, for array textures we need to add 0.5 to the array
-	 * index coordinate:
-	 */
-	bool array_index_add_half;
-};
-
-struct ir3_compiler * ir3_compiler_create(struct fd_device *dev, uint32_t gpu_id);
-
-int ir3_compile_shader_nir(struct ir3_compiler *compiler,
-		struct ir3_shader_variant *so);
-
-enum ir3_shader_debug {
-	IR3_DBG_SHADER_VS = 0x01,
-	IR3_DBG_SHADER_FS = 0x02,
-	IR3_DBG_SHADER_CS = 0x04,
-	IR3_DBG_DISASM    = 0x08,
-	IR3_DBG_OPTMSGS   = 0x10,
-};
-
-extern enum ir3_shader_debug ir3_shader_debug;
-
-static inline bool
-shader_debug_enabled(gl_shader_stage type)
-{
-	switch (type) {
-	case MESA_SHADER_VERTEX:      return !!(ir3_shader_debug & IR3_DBG_SHADER_VS);
-	case MESA_SHADER_FRAGMENT:    return !!(ir3_shader_debug & IR3_DBG_SHADER_FS);
-	case MESA_SHADER_COMPUTE:     return !!(ir3_shader_debug & IR3_DBG_SHADER_CS);
-	default:
-		debug_assert(0);
-		return false;
-	}
-}
-
-#endif /* IR3_COMPILER_H_ */
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c b/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c
deleted file mode 100644
index 445a2b291e9..00000000000
--- a/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c
+++ /dev/null
@@ -1,3818 +0,0 @@
-/*
- * Copyright (C) 2015 Rob Clark <[email protected]>
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- *
- * Authors:
- *    Rob Clark <[email protected]>
- */
-
-#include <stdarg.h>
-
-#include "util/u_string.h"
-#include "util/u_memory.h"
-#include "util/u_math.h"
-
-#include "ir3_compiler.h"
-#include "ir3_shader.h"
-#include "ir3_nir.h"
-
-#include "instr-a3xx.h"
-#include "ir3.h"
-
-/* for conditionally setting boolean flag(s): */
-#define COND(bool, val) ((bool) ? (val) : 0)
-
-#define DBG(fmt, ...) \
-		do { debug_printf("%s:%d: "fmt "\n", \
-				__FUNCTION__, __LINE__, ##__VA_ARGS__); } while (0)
-
-struct ir3_context {
-	struct ir3_compiler *compiler;
-
-	struct nir_shader *s;
-
-	struct nir_instr *cur_instr;  /* current instruction, just for debug */
-
-	struct ir3 *ir;
-	struct ir3_shader_variant *so;
-
-	struct ir3_block *block;      /* the current block */
-	struct ir3_block *in_block;   /* block created for shader inputs */
-
-	nir_function_impl *impl;
-
-	/* For fragment shaders, varyings are not actual shader inputs,
-	 * instead the hw passes a varying-coord which is used with
-	 * bary.f.
-	 *
-	 * But NIR doesn't know that, it still declares varyings as
-	 * inputs.  So we do all the input tracking normally and fix
-	 * things up after compile_instructions()
-	 *
-	 * NOTE that frag_vcoord is the hardware position (possibly it
-	 * is actually an index or tag or some such.. it is *not*
-	 * values that can be directly used for gl_FragCoord..)
-	 */
-	struct ir3_instruction *frag_vcoord;
-
-	/* for fragment shaders, for gl_FrontFacing and gl_FragCoord: */
-	struct ir3_instruction *frag_face, *frag_coord;
-
-	/* For vertex shaders, keep track of the system values sources */
-	struct ir3_instruction *vertex_id, *basevertex, *instance_id;
-
-	/* For fragment shaders: */
-	struct ir3_instruction *samp_id, *samp_mask_in;
-
-	/* Compute shader inputs: */
-	struct ir3_instruction *local_invocation_id, *work_group_id;
-
-	/* mapping from nir_register to defining instruction: */
-	struct hash_table *def_ht;
-
-	unsigned num_arrays;
-
-	/* a common pattern for indirect addressing is to request the
-	 * same address register multiple times.  To avoid generating
-	 * duplicate instruction sequences (which our backend does not
-	 * try to clean up, since that should be done as the NIR stage)
-	 * we cache the address value generated for a given src value:
-	 *
-	 * Note that we have to cache these per alignment, since same
-	 * src used for an array of vec1 cannot be also used for an
-	 * array of vec4.
-	 */
-	struct hash_table *addr_ht[4];
-
-	/* last dst array, for indirect we need to insert a var-store.
-	 */
-	struct ir3_instruction **last_dst;
-	unsigned last_dst_n;
-
-	/* maps nir_block to ir3_block, mostly for the purposes of
-	 * figuring out the blocks successors
-	 */
-	struct hash_table *block_ht;
-
-	/* on a4xx, bitmask of samplers which need astc+srgb workaround: */
-	unsigned astc_srgb;
-
-	unsigned samples;             /* bitmask of x,y sample shifts */
-
-	unsigned max_texture_index;
-
-	/* set if we encounter something we can't handle yet, so we
-	 * can bail cleanly and fallback to TGSI compiler f/e
-	 */
-	bool error;
-};
-
-/* gpu pointer size in units of 32bit registers/slots */
-static unsigned pointer_size(struct ir3_context *ctx)
-{
-	return (ctx->compiler->gpu_id >= 500) ? 2 : 1;
-}
-
-static struct ir3_instruction * create_immed(struct ir3_block *block, uint32_t val);
-static struct ir3_block * get_block(struct ir3_context *ctx, const nir_block *nblock);
-
-
-static struct ir3_context *
-compile_init(struct ir3_compiler *compiler,
-		struct ir3_shader_variant *so)
-{
-	struct ir3_context *ctx = rzalloc(NULL, struct ir3_context);
-
-	if (compiler->gpu_id >= 400) {
-		if (so->type == MESA_SHADER_VERTEX) {
-			ctx->astc_srgb = so->key.vastc_srgb;
-		} else if (so->type == MESA_SHADER_FRAGMENT) {
-			ctx->astc_srgb = so->key.fastc_srgb;
-		}
-
-	} else {
-		if (so->type == MESA_SHADER_VERTEX) {
-			ctx->samples = so->key.vsamples;
-		} else if (so->type == MESA_SHADER_FRAGMENT) {
-			ctx->samples = so->key.fsamples;
-		}
-	}
-
-	ctx->compiler = compiler;
-	ctx->so = so;
-	ctx->def_ht = _mesa_hash_table_create(ctx,
-			_mesa_hash_pointer, _mesa_key_pointer_equal);
-	ctx->block_ht = _mesa_hash_table_create(ctx,
-			_mesa_hash_pointer, _mesa_key_pointer_equal);
-
-	/* TODO: maybe generate some sort of bitmask of what key
-	 * lowers vs what shader has (ie. no need to lower
-	 * texture clamp lowering if no texture sample instrs)..
-	 * although should be done further up the stack to avoid
-	 * creating duplicate variants..
-	 */
-
-	if (ir3_key_lowers_nir(&so->key)) {
-		nir_shader *s = nir_shader_clone(ctx, so->shader->nir);
-		ctx->s = ir3_optimize_nir(so->shader, s, &so->key);
-	} else {
-		/* fast-path for shader key that lowers nothing in NIR: */
-		ctx->s = so->shader->nir;
-	}
-
-	/* this needs to be the last pass run, so do this here instead of
-	 * in ir3_optimize_nir():
-	 */
-	NIR_PASS_V(ctx->s, nir_lower_locals_to_regs);
-	NIR_PASS_V(ctx->s, nir_convert_from_ssa, true);
-
-	if (ir3_shader_debug & IR3_DBG_DISASM) {
-		printf("dump nir%dv%d: type=%d, k={cts=%u,hp=%u}",
-			so->shader->id, so->id, so->type,
-			so->key.color_two_side, so->key.half_precision);
-		nir_print_shader(ctx->s, stdout);
-	}
-
-	if (shader_debug_enabled(so->type)) {
-		fprintf(stderr, "NIR (final form) for %s shader:\n",
-			_mesa_shader_stage_to_string(so->type));
-		nir_print_shader(ctx->s, stderr);
-	}
-
-	ir3_nir_scan_driver_consts(ctx->s, &so->const_layout);
-
-	so->num_uniforms = ctx->s->num_uniforms;
-	so->num_ubos = ctx->s->info.num_ubos;
-
-	/* Layout of constant registers, each section aligned to vec4.  Note
-	 * that pointer size (ubo, etc) changes depending on generation.
-	 *
-	 *    user consts
-	 *    UBO addresses
-	 *    SSBO sizes
-	 *    if (vertex shader) {
-	 *        driver params (IR3_DP_*)
-	 *        if (stream_output.num_outputs > 0)
-	 *           stream-out addresses
-	 *    }
-	 *    immediates
-	 *
-	 * Immediates go last mostly because they are inserted in the CP pass
-	 * after the nir -> ir3 frontend.
-	 */
-	unsigned constoff = align(ctx->s->num_uniforms, 4);
-	unsigned ptrsz = pointer_size(ctx);
-
-	memset(&so->constbase, ~0, sizeof(so->constbase));
-
-	if (so->num_ubos > 0) {
-		so->constbase.ubo = constoff;
-		constoff += align(ctx->s->info.num_ubos * ptrsz, 4) / 4;
-	}
-
-	if (so->const_layout.ssbo_size.count > 0) {
-		unsigned cnt = so->const_layout.ssbo_size.count;
-		so->constbase.ssbo_sizes = constoff;
-		constoff += align(cnt, 4) / 4;
-	}
-
-	if (so->const_layout.image_dims.count > 0) {
-		unsigned cnt = so->const_layout.image_dims.count;
-		so->constbase.image_dims = constoff;
-		constoff += align(cnt, 4) / 4;
-	}
-
-	unsigned num_driver_params = 0;
-	if (so->type == MESA_SHADER_VERTEX) {
-		num_driver_params = IR3_DP_VS_COUNT;
-	} else if (so->type == MESA_SHADER_COMPUTE) {
-		num_driver_params = IR3_DP_CS_COUNT;
-	}
-
-	so->constbase.driver_param = constoff;
-	constoff += align(num_driver_params, 4) / 4;
-
-	if ((so->type == MESA_SHADER_VERTEX) &&
-			(compiler->gpu_id < 500) &&
-			so->shader->stream_output.num_outputs > 0) {
-		so->constbase.tfbo = constoff;
-		constoff += align(IR3_MAX_SO_BUFFERS * ptrsz, 4) / 4;
-	}
-
-	so->constbase.immediate = constoff;
-
-	return ctx;
-}
-
-static void
-compile_error(struct ir3_context *ctx, const char *format, ...)
-{
-	struct hash_table *errors = NULL;
-	va_list ap;
-	va_start(ap, format);
-	if (ctx->cur_instr) {
-		errors = _mesa_hash_table_create(NULL,
-				_mesa_hash_pointer,
-				_mesa_key_pointer_equal);
-		char *msg = ralloc_vasprintf(errors, format, ap);
-		_mesa_hash_table_insert(errors, ctx->cur_instr, msg);
-	} else {
-		_debug_vprintf(format, ap);
-	}
-	va_end(ap);
-	nir_print_shader_annotated(ctx->s, stdout, errors);
-	ralloc_free(errors);
-	ctx->error = true;
-	debug_assert(0);
-}
-
-#define compile_assert(ctx, cond) do { \
-		if (!(cond)) compile_error((ctx), "failed assert: "#cond"\n"); \
-	} while (0)
-
-static void
-compile_free(struct ir3_context *ctx)
-{
-	ralloc_free(ctx);
-}
-
-static void
-declare_array(struct ir3_context *ctx, nir_register *reg)
-{
-	struct ir3_array *arr = rzalloc(ctx, struct ir3_array);
-	arr->id = ++ctx->num_arrays;
-	/* NOTE: sometimes we get non array regs, for example for arrays of
-	 * length 1.  See fs-const-array-of-struct-of-array.shader_test.  So
-	 * treat a non-array as if it was an array of length 1.
-	 *
-	 * It would be nice if there was a nir pass to convert arrays of
-	 * length 1 to ssa.
-	 */
-	arr->length = reg->num_components * MAX2(1, reg->num_array_elems);
-	compile_assert(ctx, arr->length > 0);
-	arr->r = reg;
-	list_addtail(&arr->node, &ctx->ir->array_list);
-}
-
-static struct ir3_array *
-get_array(struct ir3_context *ctx, nir_register *reg)
-{
-	list_for_each_entry (struct ir3_array, arr, &ctx->ir->array_list, node) {
-		if (arr->r == reg)
-			return arr;
-	}
-	compile_error(ctx, "bogus reg: %s\n", reg->name);
-	return NULL;
-}
-
-/* relative (indirect) if address!=NULL */
-static struct ir3_instruction *
-create_array_load(struct ir3_context *ctx, struct ir3_array *arr, int n,
-		struct ir3_instruction *address)
-{
-	struct ir3_block *block = ctx->block;
-	struct ir3_instruction *mov;
-	struct ir3_register *src;
-
-	mov = ir3_instr_create(block, OPC_MOV);
-	mov->cat1.src_type = TYPE_U32;
-	mov->cat1.dst_type = TYPE_U32;
-	mov->barrier_class = IR3_BARRIER_ARRAY_R;
-	mov->barrier_conflict = IR3_BARRIER_ARRAY_W;
-	ir3_reg_create(mov, 0, 0);
-	src = ir3_reg_create(mov, 0, IR3_REG_ARRAY |
-			COND(address, IR3_REG_RELATIV));
-	src->instr = arr->last_write;
-	src->size  = arr->length;
-	src->array.id = arr->id;
-	src->array.offset = n;
-
-	if (address)
-		ir3_instr_set_address(mov, address);
-
-	return mov;
-}
-
-/* relative (indirect) if address!=NULL */
-static void
-create_array_store(struct ir3_context *ctx, struct ir3_array *arr, int n,
-		struct ir3_instruction *src, struct ir3_instruction *address)
-{
-	struct ir3_block *block = ctx->block;
-	struct ir3_instruction *mov;
-	struct ir3_register *dst;
-
-	/* if not relative store, don't create an extra mov, since that
-	 * ends up being difficult for cp to remove.
-	 */
-	if (!address) {
-		dst = src->regs[0];
-
-		src->barrier_class |= IR3_BARRIER_ARRAY_W;
-		src->barrier_conflict |= IR3_BARRIER_ARRAY_R | IR3_BARRIER_ARRAY_W;
-
-		dst->flags |= IR3_REG_ARRAY;
-		dst->instr = arr->last_write;
-		dst->size = arr->length;
-		dst->array.id = arr->id;
-		dst->array.offset = n;
-
-		arr->last_write = src;
-
-		array_insert(block, block->keeps, src);
-
-		return;
-	}
-
-	mov = ir3_instr_create(block, OPC_MOV);
-	mov->cat1.src_type = TYPE_U32;
-	mov->cat1.dst_type = TYPE_U32;
-	mov->barrier_class = IR3_BARRIER_ARRAY_W;
-	mov->barrier_conflict = IR3_BARRIER_ARRAY_R | IR3_BARRIER_ARRAY_W;
-	dst = ir3_reg_create(mov, 0, IR3_REG_ARRAY |
-			COND(address, IR3_REG_RELATIV));
-	dst->instr = arr->last_write;
-	dst->size  = arr->length;
-	dst->array.id = arr->id;
-	dst->array.offset = n;
-	ir3_reg_create(mov, 0, IR3_REG_SSA)->instr = src;
-
-	if (address)
-		ir3_instr_set_address(mov, address);
-
-	arr->last_write = mov;
-
-	/* the array store may only matter to something in an earlier
-	 * block (ie. loops), but since arrays are not in SSA, depth
-	 * pass won't know this.. so keep all array stores:
-	 */
-	array_insert(block, block->keeps, mov);
-}
-
-static inline type_t utype_for_size(unsigned bit_size)
-{
-	switch (bit_size) {
-	case 32: return TYPE_U32;
-	case 16: return TYPE_U16;
-	case  8: return TYPE_U8;
-	default: unreachable("bad bitsize"); return ~0;
-	}
-}
-
-static inline type_t utype_src(nir_src src)
-{ return utype_for_size(nir_src_bit_size(src)); }
-
-static inline type_t utype_dst(nir_dest dst)
-{ return utype_for_size(nir_dest_bit_size(dst)); }
-
-/* allocate a n element value array (to be populated by caller) and
- * insert in def_ht
- */
-static struct ir3_instruction **
-get_dst_ssa(struct ir3_context *ctx, nir_ssa_def *dst, unsigned n)
-{
-	struct ir3_instruction **value =
-		ralloc_array(ctx->def_ht, struct ir3_instruction *, n);
-	_mesa_hash_table_insert(ctx->def_ht, dst, value);
-	return value;
-}
-
-static struct ir3_instruction **
-get_dst(struct ir3_context *ctx, nir_dest *dst, unsigned n)
-{
-	struct ir3_instruction **value;
-
-	if (dst->is_ssa) {
-		value = get_dst_ssa(ctx, &dst->ssa, n);
-	} else {
-		value = ralloc_array(ctx, struct ir3_instruction *, n);
-	}
-
-	/* NOTE: in non-ssa case, we don't really need to store last_dst
-	 * but this helps us catch cases where put_dst() call is forgotten
-	 */
-	compile_assert(ctx, !ctx->last_dst);
-	ctx->last_dst = value;
-	ctx->last_dst_n = n;
-
-	return value;
-}
-
-static struct ir3_instruction * get_addr(struct ir3_context *ctx, struct ir3_instruction *src, int align);
-
-static struct ir3_instruction * const *
-get_src(struct ir3_context *ctx, nir_src *src)
-{
-	if (src->is_ssa) {
-		struct hash_entry *entry;
-		entry = _mesa_hash_table_search(ctx->def_ht, src->ssa);
-		compile_assert(ctx, entry);
-		return entry->data;
-	} else {
-		nir_register *reg = src->reg.reg;
-		struct ir3_array *arr = get_array(ctx, reg);
-		unsigned num_components = arr->r->num_components;
-		struct ir3_instruction *addr = NULL;
-		struct ir3_instruction **value =
-			ralloc_array(ctx, struct ir3_instruction *, num_components);
-
-		if (src->reg.indirect)
-			addr = get_addr(ctx, get_src(ctx, src->reg.indirect)[0],
-					reg->num_components);
-
-		for (unsigned i = 0; i < num_components; i++) {
-			unsigned n = src->reg.base_offset * reg->num_components + i;
-			compile_assert(ctx, n < arr->length);
-			value[i] = create_array_load(ctx, arr, n, addr);
-		}
-
-		return value;
-	}
-}
-
-static void
-put_dst(struct ir3_context *ctx, nir_dest *dst)
-{
-	unsigned bit_size = nir_dest_bit_size(*dst);
-
-	if (bit_size < 32) {
-		for (unsigned i = 0; i < ctx->last_dst_n; i++) {
-			struct ir3_instruction *dst = ctx->last_dst[i];
-			dst->regs[0]->flags |= IR3_REG_HALF;
-			if (ctx->last_dst[i]->opc == OPC_META_FO)
-				dst->regs[1]->instr->regs[0]->flags |= IR3_REG_HALF;
-		}
-	}
-
-	if (!dst->is_ssa) {
-		nir_register *reg = dst->reg.reg;
-		struct ir3_array *arr = get_array(ctx, reg);
-		unsigned num_components = ctx->last_dst_n;
-		struct ir3_instruction *addr = NULL;
-
-		if (dst->reg.indirect)
-			addr = get_addr(ctx, get_src(ctx, dst->reg.indirect)[0],
-					reg->num_components);
-
-		for (unsigned i = 0; i < num_components; i++) {
-			unsigned n = dst->reg.base_offset * reg->num_components + i;
-			compile_assert(ctx, n < arr->length);
-			if (!ctx->last_dst[i])
-				continue;
-			create_array_store(ctx, arr, n, ctx->last_dst[i], addr);
-		}
-
-		ralloc_free(ctx->last_dst);
-	}
-	ctx->last_dst = NULL;
-	ctx->last_dst_n = 0;
-}
-
-static struct ir3_instruction *
-create_immed_typed(struct ir3_block *block, uint32_t val, type_t type)
-{
-	struct ir3_instruction *mov;
-	unsigned flags = (type_size(type) < 32) ? IR3_REG_HALF : 0;
-
-	mov = ir3_instr_create(block, OPC_MOV);
-	mov->cat1.src_type = type;
-	mov->cat1.dst_type = type;
-	ir3_reg_create(mov, 0, flags);
-	ir3_reg_create(mov, 0, IR3_REG_IMMED)->uim_val = val;
-
-	return mov;
-}
-
-static struct ir3_instruction *
-create_immed(struct ir3_block *block, uint32_t val)
-{
-	return create_immed_typed(block, val, TYPE_U32);
-}
-
-static struct ir3_instruction *
-create_addr(struct ir3_block *block, struct ir3_instruction *src, int align)
-{
-	struct ir3_instruction *instr, *immed;
-
-	/* TODO in at least some cases, the backend could probably be
-	 * made clever enough to propagate IR3_REG_HALF..
-	 */
-	instr = ir3_COV(block, src, TYPE_U32, TYPE_S16);
-	instr->regs[0]->flags |= IR3_REG_HALF;
-
-	switch(align){
-	case 1:
-		/* src *= 1: */
-		break;
-	case 2:
-		/* src *= 2	=> src <<= 1: */
-		immed = create_immed(block, 1);
-		immed->regs[0]->flags |= IR3_REG_HALF;
-
-		instr = ir3_SHL_B(block, instr, 0, immed, 0);
-		instr->regs[0]->flags |= IR3_REG_HALF;
-		instr->regs[1]->flags |= IR3_REG_HALF;
-		break;
-	case 3:
-		/* src *= 3: */
-		immed = create_immed(block, 3);
-		immed->regs[0]->flags |= IR3_REG_HALF;
-
-		instr = ir3_MULL_U(block, instr, 0, immed, 0);
-		instr->regs[0]->flags |= IR3_REG_HALF;
-		instr->regs[1]->flags |= IR3_REG_HALF;
-		break;
-	case 4:
-		/* src *= 4 => src <<= 2: */
-		immed = create_immed(block, 2);
-		immed->regs[0]->flags |= IR3_REG_HALF;
-
-		instr = ir3_SHL_B(block, instr, 0, immed, 0);
-		instr->regs[0]->flags |= IR3_REG_HALF;
-		instr->regs[1]->flags |= IR3_REG_HALF;
-		break;
-	default:
-		unreachable("bad align");
-		return NULL;
-	}
-
-	instr = ir3_MOV(block, instr, TYPE_S16);
-	instr->regs[0]->num = regid(REG_A0, 0);
-	instr->regs[0]->flags |= IR3_REG_HALF;
-	instr->regs[1]->flags |= IR3_REG_HALF;
-
-	return instr;
-}
-
-/* caches addr values to avoid generating multiple cov/shl/mova
- * sequences for each use of a given NIR level src as address
- */
-static struct ir3_instruction *
-get_addr(struct ir3_context *ctx, struct ir3_instruction *src, int align)
-{
-	struct ir3_instruction *addr;
-	unsigned idx = align - 1;
-
-	compile_assert(ctx, idx < ARRAY_SIZE(ctx->addr_ht));
-
-	if (!ctx->addr_ht[idx]) {
-		ctx->addr_ht[idx] = _mesa_hash_table_create(ctx,
-				_mesa_hash_pointer, _mesa_key_pointer_equal);
-	} else {
-		struct hash_entry *entry;
-		entry = _mesa_hash_table_search(ctx->addr_ht[idx], src);
-		if (entry)
-			return entry->data;
-	}
-
-	addr = create_addr(ctx->block, src, align);
-	_mesa_hash_table_insert(ctx->addr_ht[idx], src, addr);
-
-	return addr;
-}
-
-static struct ir3_instruction *
-get_predicate(struct ir3_context *ctx, struct ir3_instruction *src)
-{
-	struct ir3_block *b = ctx->block;
-	struct ir3_instruction *cond;
-
-	/* NOTE: only cmps.*.* can write p0.x: */
-	cond = ir3_CMPS_S(b, src, 0, create_immed(b, 0), 0);
-	cond->cat2.condition = IR3_COND_NE;
-
-	/* condition always goes in predicate register: */
-	cond->regs[0]->num = regid(REG_P0, 0);
-
-	return cond;
-}
-
-static struct ir3_instruction *
-create_uniform(struct ir3_context *ctx, unsigned n)
-{
-	struct ir3_instruction *mov;
-
-	mov = ir3_instr_create(ctx->block, OPC_MOV);
-	/* TODO get types right? */
-	mov->cat1.src_type = TYPE_F32;
-	mov->cat1.dst_type = TYPE_F32;
-	ir3_reg_create(mov, 0, 0);
-	ir3_reg_create(mov, n, IR3_REG_CONST);
-
-	return mov;
-}
-
-static struct ir3_instruction *
-create_uniform_indirect(struct ir3_context *ctx, int n,
-		struct ir3_instruction *address)
-{
-	struct ir3_instruction *mov;
-
-	mov = ir3_instr_create(ctx->block, OPC_MOV);
-	mov->cat1.src_type = TYPE_U32;
-	mov->cat1.dst_type = TYPE_U32;
-	ir3_reg_create(mov, 0, 0);
-	ir3_reg_create(mov, 0, IR3_REG_CONST | IR3_REG_RELATIV)->array.offset = n;
-
-	ir3_instr_set_address(mov, address);
-
-	return mov;
-}
-
-static struct ir3_instruction *
-create_collect(struct ir3_context *ctx, struct ir3_instruction *const *arr,
-		unsigned arrsz)
-{
-	struct ir3_block *block = ctx->block;
-	struct ir3_instruction *collect;
-
-	if (arrsz == 0)
-		return NULL;
-
-	unsigned flags = arr[0]->regs[0]->flags & IR3_REG_HALF;
-
-	collect = ir3_instr_create2(block, OPC_META_FI, 1 + arrsz);
-	ir3_reg_create(collect, 0, flags);     /* dst */
-	for (unsigned i = 0; i < arrsz; i++) {
-		struct ir3_instruction *elem = arr[i];
-
-		/* Since arrays are pre-colored in RA, we can't assume that
-		 * things will end up in the right place.  (Ie. if a collect
-		 * joins elements from two different arrays.)  So insert an
-		 * extra mov.
-		 *
-		 * We could possibly skip this if all the collected elements
-		 * are contiguous elements in a single array.. not sure how
-		 * likely that is to happen.
-		 *
-		 * Fixes a problem with glamor shaders, that in effect do
-		 * something like:
-		 *
-		 *   if (foo)
-		 *     texcoord = ..
-		 *   else
-		 *     texcoord = ..
-		 *   color = texture2D(tex, texcoord);
-		 *
-		 * In this case, texcoord will end up as nir registers (which
-		 * translate to ir3 array's of length 1.  And we can't assume
-		 * the two (or more) arrays will get allocated in consecutive
-		 * scalar registers.
-		 *
-		 */
-		if (elem->regs[0]->flags & IR3_REG_ARRAY) {
-			type_t type = (flags & IR3_REG_HALF) ? TYPE_U16 : TYPE_U32;
-			elem = ir3_MOV(block, elem, type);
-		}
-
-		compile_assert(ctx, (elem->regs[0]->flags & IR3_REG_HALF) == flags);
-		ir3_reg_create(collect, 0, IR3_REG_SSA | flags)->instr = elem;
-	}
-
-	return collect;
-}
-
-static struct ir3_instruction *
-create_indirect_load(struct ir3_context *ctx, unsigned arrsz, int n,
-		struct ir3_instruction *address, struct ir3_instruction *collect)
-{
-	struct ir3_block *block = ctx->block;
-	struct ir3_instruction *mov;
-	struct ir3_register *src;
-
-	mov = ir3_instr_create(block, OPC_MOV);
-	mov->cat1.src_type = TYPE_U32;
-	mov->cat1.dst_type = TYPE_U32;
-	ir3_reg_create(mov, 0, 0);
-	src = ir3_reg_create(mov, 0, IR3_REG_SSA | IR3_REG_RELATIV);
-	src->instr = collect;
-	src->size  = arrsz;
-	src->array.offset = n;
-
-	ir3_instr_set_address(mov, address);
-
-	return mov;
-}
-
-static struct ir3_instruction *
-create_input_compmask(struct ir3_context *ctx, unsigned n, unsigned compmask)
-{
-	struct ir3_instruction *in;
-
-	in = ir3_instr_create(ctx->in_block, OPC_META_INPUT);
-	in->inout.block = ctx->in_block;
-	ir3_reg_create(in, n, 0);
-
-	in->regs[0]->wrmask = compmask;
-
-	return in;
-}
-
-static struct ir3_instruction *
-create_input(struct ir3_context *ctx, unsigned n)
-{
-	return create_input_compmask(ctx, n, 0x1);
-}
-
-static struct ir3_instruction *
-create_frag_input(struct ir3_context *ctx, bool use_ldlv)
-{
-	struct ir3_block *block = ctx->block;
-	struct ir3_instruction *instr;
-	/* actual inloc is assigned and fixed up later: */
-	struct ir3_instruction *inloc = create_immed(block, 0);
-
-	if (use_ldlv) {
-		instr = ir3_LDLV(block, inloc, 0, create_immed(block, 1), 0);
-		instr->cat6.type = TYPE_U32;
-		instr->cat6.iim_val = 1;
-	} else {
-		instr = ir3_BARY_F(block, inloc, 0, ctx->frag_vcoord, 0);
-		instr->regs[2]->wrmask = 0x3;
-	}
-
-	return instr;
-}
-
-static struct ir3_instruction *
-create_driver_param(struct ir3_context *ctx, enum ir3_driver_param dp)
-{
-	/* first four vec4 sysval's reserved for UBOs: */
-	/* NOTE: dp is in scalar, but there can be >4 dp components: */
-	unsigned n = ctx->so->constbase.driver_param;
-	unsigned r = regid(n + dp / 4, dp % 4);
-	return create_uniform(ctx, r);
-}
-
-/* helper for instructions that produce multiple consecutive scalar
- * outputs which need to have a split/fanout meta instruction inserted
- */
-static void
-split_dest(struct ir3_block *block, struct ir3_instruction **dst,
-		struct ir3_instruction *src, unsigned base, unsigned n)
-{
-	struct ir3_instruction *prev = NULL;
-
-	if ((n == 1) && (src->regs[0]->wrmask == 0x1)) {
-		dst[0] = src;
-		return;
-	}
-
-	for (int i = 0, j = 0; i < n; i++) {
-		struct ir3_instruction *split = ir3_instr_create(block, OPC_META_FO);
-		ir3_reg_create(split, 0, IR3_REG_SSA);
-		ir3_reg_create(split, 0, IR3_REG_SSA)->instr = src;
-		split->fo.off = i + base;
-
-		if (prev) {
-			split->cp.left = prev;
-			split->cp.left_cnt++;
-			prev->cp.right = split;
-			prev->cp.right_cnt++;
-		}
-		prev = split;
-
-		if (src->regs[0]->wrmask & (1 << (i + base)))
-			dst[j++] = split;
-	}
-}
-
-/*
- * Adreno uses uint rather than having dedicated bool type,
- * which (potentially) requires some conversion, in particular
- * when using output of an bool instr to int input, or visa
- * versa.
- *
- *         | Adreno  |  NIR  |
- *  -------+---------+-------+-
- *   true  |    1    |  ~0   |
- *   false |    0    |   0   |
- *
- * To convert from an adreno bool (uint) to nir, use:
- *
- *    absneg.s dst, (neg)src
- *
- * To convert back in the other direction:
- *
- *    absneg.s dst, (abs)arc
- *
- * The CP step can clean up the absneg.s that cancel each other
- * out, and with a slight bit of extra cleverness (to recognize
- * the instructions which produce either a 0 or 1) can eliminate
- * the absneg.s's completely when an instruction that wants
- * 0/1 consumes the result.  For example, when a nir 'bcsel'
- * consumes the result of 'feq'.  So we should be able to get by
- * without a boolean resolve step, and without incuring any
- * extra penalty in instruction count.
- */
-
-/* NIR bool -> native (adreno): */
-static struct ir3_instruction *
-ir3_b2n(struct ir3_block *block, struct ir3_instruction *instr)
-{
-	return ir3_ABSNEG_S(block, instr, IR3_REG_SABS);
-}
-
-/* native (adreno) -> NIR bool: */
-static struct ir3_instruction *
-ir3_n2b(struct ir3_block *block, struct ir3_instruction *instr)
-{
-	return ir3_ABSNEG_S(block, instr, IR3_REG_SNEG);
-}
-
-/*
- * alu/sfu instructions:
- */
-
-static struct ir3_instruction *
-create_cov(struct ir3_context *ctx, struct ir3_instruction *src,
-		unsigned src_bitsize, nir_op op)
-{
-	type_t src_type, dst_type;
-
-	switch (op) {
-	case nir_op_f2f32:
-	case nir_op_f2f16_rtne:
-	case nir_op_f2f16_rtz:
-	case nir_op_f2f16:
-	case nir_op_f2i32:
-	case nir_op_f2i16:
-	case nir_op_f2i8:
-	case nir_op_f2u32:
-	case nir_op_f2u16:
-	case nir_op_f2u8:
-		switch (src_bitsize) {
-		case 32:
-			src_type = TYPE_F32;
-			break;
-		case 16:
-			src_type = TYPE_F16;
-			break;
-		default:
-			compile_error(ctx, "invalid src bit size: %u", src_bitsize);
-		}
-		break;
-
-	case nir_op_i2f32:
-	case nir_op_i2f16:
-	case nir_op_i2i32:
-	case nir_op_i2i16:
-	case nir_op_i2i8:
-		switch (src_bitsize) {
-		case 32:
-			src_type = TYPE_S32;
-			break;
-		case 16:
-			src_type = TYPE_S16;
-			break;
-		case 8:
-			src_type = TYPE_S8;
-			break;
-		default:
-			compile_error(ctx, "invalid src bit size: %u", src_bitsize);
-		}
-		break;
-
-	case nir_op_u2f32:
-	case nir_op_u2f16:
-	case nir_op_u2u32:
-	case nir_op_u2u16:
-	case nir_op_u2u8:
-		switch (src_bitsize) {
-		case 32:
-			src_type = TYPE_U32;
-			break;
-		case 16:
-			src_type = TYPE_U16;
-			break;
-		case 8:
-			src_type = TYPE_U8;
-			break;
-		default:
-			compile_error(ctx, "invalid src bit size: %u", src_bitsize);
-		}
-		break;
-
-	default:
-		compile_error(ctx, "invalid conversion op: %u", op);
-	}
-
-	switch (op) {
-	case nir_op_f2f32:
-	case nir_op_i2f32:
-	case nir_op_u2f32:
-		dst_type = TYPE_F32;
-		break;
-
-	case nir_op_f2f16_rtne:
-	case nir_op_f2f16_rtz:
-	case nir_op_f2f16:
-		/* TODO how to handle rounding mode? */
-	case nir_op_i2f16:
-	case nir_op_u2f16:
-		dst_type = TYPE_F16;
-		break;
-
-	case nir_op_f2i32:
-	case nir_op_i2i32:
-		dst_type = TYPE_S32;
-		break;
-
-	case nir_op_f2i16:
-	case nir_op_i2i16:
-		dst_type = TYPE_S16;
-		break;
-
-	case nir_op_f2i8:
-	case nir_op_i2i8:
-		dst_type = TYPE_S8;
-		break;
-
-	case nir_op_f2u32:
-	case nir_op_u2u32:
-		dst_type = TYPE_U32;
-		break;
-
-	case nir_op_f2u16:
-	case nir_op_u2u16:
-		dst_type = TYPE_U16;
-		break;
-
-	case nir_op_f2u8:
-	case nir_op_u2u8:
-		dst_type = TYPE_U8;
-		break;
-
-	default:
-		compile_error(ctx, "invalid conversion op: %u", op);
-	}
-
-	return ir3_COV(ctx->block, src, src_type, dst_type);
-}
-
-static void
-emit_alu(struct ir3_context *ctx, nir_alu_instr *alu)
-{
-	const nir_op_info *info = &nir_op_infos[alu->op];
-	struct ir3_instruction **dst, *src[info->num_inputs];
-	unsigned bs[info->num_inputs];     /* bit size */
-	struct ir3_block *b = ctx->block;
-	unsigned dst_sz, wrmask;
-
-	if (alu->dest.dest.is_ssa) {
-		dst_sz = alu->dest.dest.ssa.num_components;
-		wrmask = (1 << dst_sz) - 1;
-	} else {
-		dst_sz = alu->dest.dest.reg.reg->num_components;
-		wrmask = alu->dest.write_mask;
-	}
-
-	dst = get_dst(ctx, &alu->dest.dest, dst_sz);
-
-	/* Vectors are special in that they have non-scalarized writemasks,
-	 * and just take the first swizzle channel for each argument in
-	 * order into each writemask channel.
-	 */
-	if ((alu->op == nir_op_vec2) ||
-			(alu->op == nir_op_vec3) ||
-			(alu->op == nir_op_vec4)) {
-
-		for (int i = 0; i < info->num_inputs; i++) {
-			nir_alu_src *asrc = &alu->src[i];
-
-			compile_assert(ctx, !asrc->abs);
-			compile_assert(ctx, !asrc->negate);
-
-			src[i] = get_src(ctx, &asrc->src)[asrc->swizzle[0]];
-			if (!src[i])
-				src[i] = create_immed(ctx->block, 0);
-			dst[i] = ir3_MOV(b, src[i], TYPE_U32);
-		}
-
-		put_dst(ctx, &alu->dest.dest);
-		return;
-	}
-
-	/* We also get mov's with more than one component for mov's so
-	 * handle those specially:
-	 */
-	if ((alu->op == nir_op_imov) || (alu->op == nir_op_fmov)) {
-		type_t type = (alu->op == nir_op_imov) ? TYPE_U32 : TYPE_F32;
-		nir_alu_src *asrc = &alu->src[0];
-		struct ir3_instruction *const *src0 = get_src(ctx, &asrc->src);
-
-		for (unsigned i = 0; i < dst_sz; i++) {
-			if (wrmask & (1 << i)) {
-				dst[i] = ir3_MOV(b, src0[asrc->swizzle[i]], type);
-			} else {
-				dst[i] = NULL;
-			}
-		}
-
-		put_dst(ctx, &alu->dest.dest);
-		return;
-	}
-
-	/* General case: We can just grab the one used channel per src. */
-	for (int i = 0; i < info->num_inputs; i++) {
-		unsigned chan = ffs(alu->dest.write_mask) - 1;
-		nir_alu_src *asrc = &alu->src[i];
-
-		compile_assert(ctx, !asrc->abs);
-		compile_assert(ctx, !asrc->negate);
-
-		src[i] = get_src(ctx, &asrc->src)[asrc->swizzle[chan]];
-		bs[i] = nir_src_bit_size(asrc->src);
-
-		compile_assert(ctx, src[i]);
-	}
-
-	switch (alu->op) {
-	case nir_op_f2f32:
-	case nir_op_f2f16_rtne:
-	case nir_op_f2f16_rtz:
-	case nir_op_f2f16:
-	case nir_op_f2i32:
-	case nir_op_f2i16:
-	case nir_op_f2i8:
-	case nir_op_f2u32:
-	case nir_op_f2u16:
-	case nir_op_f2u8:
-	case nir_op_i2f32:
-	case nir_op_i2f16:
-	case nir_op_i2i32:
-	case nir_op_i2i16:
-	case nir_op_i2i8:
-	case nir_op_u2f32:
-	case nir_op_u2f16:
-	case nir_op_u2u32:
-	case nir_op_u2u16:
-	case nir_op_u2u8:
-		dst[0] = create_cov(ctx, src[0], bs[0], alu->op);
-		break;
-	case nir_op_f2b:
-		dst[0] = ir3_CMPS_F(b, src[0], 0, create_immed(b, fui(0.0)), 0);
-		dst[0]->cat2.condition = IR3_COND_NE;
-		dst[0] = ir3_n2b(b, dst[0]);
-		break;
-	case nir_op_b2f:
-		dst[0] = ir3_COV(b, ir3_b2n(b, src[0]), TYPE_U32, TYPE_F32);
-		break;
-	case nir_op_b2i:
-		dst[0] = ir3_b2n(b, src[0]);
-		break;
-	case nir_op_i2b:
-		dst[0] = ir3_CMPS_S(b, src[0], 0, create_immed(b, 0), 0);
-		dst[0]->cat2.condition = IR3_COND_NE;
-		dst[0] = ir3_n2b(b, dst[0]);
-		break;
-
-	case nir_op_fneg:
-		dst[0] = ir3_ABSNEG_F(b, src[0], IR3_REG_FNEG);
-		break;
-	case nir_op_fabs:
-		dst[0] = ir3_ABSNEG_F(b, src[0], IR3_REG_FABS);
-		break;
-	case nir_op_fmax:
-		dst[0] = ir3_MAX_F(b, src[0], 0, src[1], 0);
-		break;
-	case nir_op_fmin:
-		dst[0] = ir3_MIN_F(b, src[0], 0, src[1], 0);
-		break;
-	case nir_op_fsat:
-		/* if there is just a single use of the src, and it supports
-		 * (sat) bit, we can just fold the (sat) flag back to the
-		 * src instruction and create a mov.  This is easier for cp
-		 * to eliminate.
-		 *
-		 * TODO probably opc_cat==4 is ok too
-		 */
-		if (alu->src[0].src.is_ssa &&
-				(list_length(&alu->src[0].src.ssa->uses) == 1) &&
-				((opc_cat(src[0]->opc) == 2) || (opc_cat(src[0]->opc) == 3))) {
-			src[0]->flags |= IR3_INSTR_SAT;
-			dst[0] = ir3_MOV(b, src[0], TYPE_U32);
-		} else {
-			/* otherwise generate a max.f that saturates.. blob does
-			 * similar (generating a cat2 mov using max.f)
-			 */
-			dst[0] = ir3_MAX_F(b, src[0], 0, src[0], 0);
-			dst[0]->flags |= IR3_INSTR_SAT;
-		}
-		break;
-	case nir_op_fmul:
-		dst[0] = ir3_MUL_F(b, src[0], 0, src[1], 0);
-		break;
-	case nir_op_fadd:
-		dst[0] = ir3_ADD_F(b, src[0], 0, src[1], 0);
-		break;
-	case nir_op_fsub:
-		dst[0] = ir3_ADD_F(b, src[0], 0, src[1], IR3_REG_FNEG);
-		break;
-	case nir_op_ffma:
-		dst[0] = ir3_MAD_F32(b, src[0], 0, src[1], 0, src[2], 0);
-		break;
-	case nir_op_fddx:
-		dst[0] = ir3_DSX(b, src[0], 0);
-		dst[0]->cat5.type = TYPE_F32;
-		break;
-	case nir_op_fddy:
-		dst[0] = ir3_DSY(b, src[0], 0);
-		dst[0]->cat5.type = TYPE_F32;
-		break;
-		break;
-	case nir_op_flt:
-		dst[0] = ir3_CMPS_F(b, src[0], 0, src[1], 0);
-		dst[0]->cat2.condition = IR3_COND_LT;
-		dst[0] = ir3_n2b(b, dst[0]);
-		break;
-	case nir_op_fge:
-		dst[0] = ir3_CMPS_F(b, src[0], 0, src[1], 0);
-		dst[0]->cat2.condition = IR3_COND_GE;
-		dst[0] = ir3_n2b(b, dst[0]);
-		break;
-	case nir_op_feq:
-		dst[0] = ir3_CMPS_F(b, src[0], 0, src[1], 0);
-		dst[0]->cat2.condition = IR3_COND_EQ;
-		dst[0] = ir3_n2b(b, dst[0]);
-		break;
-	case nir_op_fne:
-		dst[0] = ir3_CMPS_F(b, src[0], 0, src[1], 0);
-		dst[0]->cat2.condition = IR3_COND_NE;
-		dst[0] = ir3_n2b(b, dst[0]);
-		break;
-	case nir_op_fceil:
-		dst[0] = ir3_CEIL_F(b, src[0], 0);
-		break;
-	case nir_op_ffloor:
-		dst[0] = ir3_FLOOR_F(b, src[0], 0);
-		break;
-	case nir_op_ftrunc:
-		dst[0] = ir3_TRUNC_F(b, src[0], 0);
-		break;
-	case nir_op_fround_even:
-		dst[0] = ir3_RNDNE_F(b, src[0], 0);
-		break;
-	case nir_op_fsign:
-		dst[0] = ir3_SIGN_F(b, src[0], 0);
-		break;
-
-	case nir_op_fsin:
-		dst[0] = ir3_SIN(b, src[0], 0);
-		break;
-	case nir_op_fcos:
-		dst[0] = ir3_COS(b, src[0], 0);
-		break;
-	case nir_op_frsq:
-		dst[0] = ir3_RSQ(b, src[0], 0);
-		break;
-	case nir_op_frcp:
-		dst[0] = ir3_RCP(b, src[0], 0);
-		break;
-	case nir_op_flog2:
-		dst[0] = ir3_LOG2(b, src[0], 0);
-		break;
-	case nir_op_fexp2:
-		dst[0] = ir3_EXP2(b, src[0], 0);
-		break;
-	case nir_op_fsqrt:
-		dst[0] = ir3_SQRT(b, src[0], 0);
-		break;
-
-	case nir_op_iabs:
-		dst[0] = ir3_ABSNEG_S(b, src[0], IR3_REG_SABS);
-		break;
-	case nir_op_iadd:
-		dst[0] = ir3_ADD_U(b, src[0], 0, src[1], 0);
-		break;
-	case nir_op_iand:
-		dst[0] = ir3_AND_B(b, src[0], 0, src[1], 0);
-		break;
-	case nir_op_imax:
-		dst[0] = ir3_MAX_S(b, src[0], 0, src[1], 0);
-		break;
-	case nir_op_umax:
-		dst[0] = ir3_MAX_U(b, src[0], 0, src[1], 0);
-		break;
-	case nir_op_imin:
-		dst[0] = ir3_MIN_S(b, src[0], 0, src[1], 0);
-		break;
-	case nir_op_umin:
-		dst[0] = ir3_MIN_U(b, src[0], 0, src[1], 0);
-		break;
-	case nir_op_imul:
-		/*
-		 * dst = (al * bl) + (ah * bl << 16) + (al * bh << 16)
-		 *   mull.u tmp0, a, b           ; mul low, i.e. al * bl
-		 *   madsh.m16 tmp1, a, b, tmp0  ; mul-add shift high mix, i.e. ah * bl << 16
-		 *   madsh.m16 dst, b, a, tmp1   ; i.e. al * bh << 16
-		 */
-		dst[0] = ir3_MADSH_M16(b, src[1], 0, src[0], 0,
-					ir3_MADSH_M16(b, src[0], 0, src[1], 0,
-						ir3_MULL_U(b, src[0], 0, src[1], 0), 0), 0);
-		break;
-	case nir_op_ineg:
-		dst[0] = ir3_ABSNEG_S(b, src[0], IR3_REG_SNEG);
-		break;
-	case nir_op_inot:
-		dst[0] = ir3_NOT_B(b, src[0], 0);
-		break;
-	case nir_op_ior:
-		dst[0] = ir3_OR_B(b, src[0], 0, src[1], 0);
-		break;
-	case nir_op_ishl:
-		dst[0] = ir3_SHL_B(b, src[0], 0, src[1], 0);
-		break;
-	case nir_op_ishr:
-		dst[0] = ir3_ASHR_B(b, src[0], 0, src[1], 0);
-		break;
-	case nir_op_isign: {
-		/* maybe this would be sane to lower in nir.. */
-		struct ir3_instruction *neg, *pos;
-
-		neg = ir3_CMPS_S(b, src[0], 0, create_immed(b, 0), 0);
-		neg->cat2.condition = IR3_COND_LT;
-
-		pos = ir3_CMPS_S(b, src[0], 0, create_immed(b, 0), 0);
-		pos->cat2.condition = IR3_COND_GT;
-
-		dst[0] = ir3_SUB_U(b, pos, 0, neg, 0);
-
-		break;
-	}
-	case nir_op_isub:
-		dst[0] = ir3_SUB_U(b, src[0], 0, src[1], 0);
-		break;
-	case nir_op_ixor:
-		dst[0] = ir3_XOR_B(b, src[0], 0, src[1], 0);
-		break;
-	case nir_op_ushr:
-		dst[0] = ir3_SHR_B(b, src[0], 0, src[1], 0);
-		break;
-	case nir_op_ilt:
-		dst[0] = ir3_CMPS_S(b, src[0], 0, src[1], 0);
-		dst[0]->cat2.condition = IR3_COND_LT;
-		dst[0] = ir3_n2b(b, dst[0]);
-		break;
-	case nir_op_ige:
-		dst[0] = ir3_CMPS_S(b, src[0], 0, src[1], 0);
-		dst[0]->cat2.condition = IR3_COND_GE;
-		dst[0] = ir3_n2b(b, dst[0]);
-		break;
-	case nir_op_ieq:
-		dst[0] = ir3_CMPS_S(b, src[0], 0, src[1], 0);
-		dst[0]->cat2.condition = IR3_COND_EQ;
-		dst[0] = ir3_n2b(b, dst[0]);
-		break;
-	case nir_op_ine:
-		dst[0] = ir3_CMPS_S(b, src[0], 0, src[1], 0);
-		dst[0]->cat2.condition = IR3_COND_NE;
-		dst[0] = ir3_n2b(b, dst[0]);
-		break;
-	case nir_op_ult:
-		dst[0] = ir3_CMPS_U(b, src[0], 0, src[1], 0);
-		dst[0]->cat2.condition = IR3_COND_LT;
-		dst[0] = ir3_n2b(b, dst[0]);
-		break;
-	case nir_op_uge:
-		dst[0] = ir3_CMPS_U(b, src[0], 0, src[1], 0);
-		dst[0]->cat2.condition = IR3_COND_GE;
-		dst[0] = ir3_n2b(b, dst[0]);
-		break;
-
-	case nir_op_bcsel: {
-		struct ir3_instruction *cond = ir3_b2n(b, src[0]);
-		compile_assert(ctx, bs[1] == bs[2]);
-		/* the boolean condition is 32b even if src[1] and src[2] are
-		 * half-precision, but sel.b16 wants all three src's to be the
-		 * same type.
-		 */
-		if (bs[1] < 32)
-			cond = ir3_COV(b, cond, TYPE_U32, TYPE_U16);
-		dst[0] = ir3_SEL_B32(b, src[1], 0, cond, 0, src[2], 0);
-		break;
-	}
-	case nir_op_bit_count:
-		dst[0] = ir3_CBITS_B(b, src[0], 0);
-		break;
-	case nir_op_ifind_msb: {
-		struct ir3_instruction *cmp;
-		dst[0] = ir3_CLZ_S(b, src[0], 0);
-		cmp = ir3_CMPS_S(b, dst[0], 0, create_immed(b, 0), 0);
-		cmp->cat2.condition = IR3_COND_GE;
-		dst[0] = ir3_SEL_B32(b,
-				ir3_SUB_U(b, create_immed(b, 31), 0, dst[0], 0), 0,
-				cmp, 0, dst[0], 0);
-		break;
-	}
-	case nir_op_ufind_msb:
-		dst[0] = ir3_CLZ_B(b, src[0], 0);
-		dst[0] = ir3_SEL_B32(b,
-				ir3_SUB_U(b, create_immed(b, 31), 0, dst[0], 0), 0,
-				src[0], 0, dst[0], 0);
-		break;
-	case nir_op_find_lsb:
-		dst[0] = ir3_BFREV_B(b, src[0], 0);
-		dst[0] = ir3_CLZ_B(b, dst[0], 0);
-		break;
-	case nir_op_bitfield_reverse:
-		dst[0] = ir3_BFREV_B(b, src[0], 0);
-		break;
-
-	default:
-		compile_error(ctx, "Unhandled ALU op: %s\n",
-				nir_op_infos[alu->op].name);
-		break;
-	}
-
-	put_dst(ctx, &alu->dest.dest);
-}
-
-/* handles direct/indirect UBO reads: */
-static void
-emit_intrinsic_load_ubo(struct ir3_context *ctx, nir_intrinsic_instr *intr,
-		struct ir3_instruction **dst)
-{
-	struct ir3_block *b = ctx->block;
-	struct ir3_instruction *base_lo, *base_hi, *addr, *src0, *src1;
-	nir_const_value *const_offset;
-	/* UBO addresses are the first driver params: */
-	unsigned ubo = regid(ctx->so->constbase.ubo, 0);
-	const unsigned ptrsz = pointer_size(ctx);
-
-	int off = 0;
-
-	/* First src is ubo index, which could either be an immed or not: */
-	src0 = get_src(ctx, &intr->src[0])[0];
-	if (is_same_type_mov(src0) &&
-			(src0->regs[1]->flags & IR3_REG_IMMED)) {
-		base_lo = create_uniform(ctx, ubo + (src0->regs[1]->iim_val * ptrsz));
-		base_hi = create_uniform(ctx, ubo + (src0->regs[1]->iim_val * ptrsz) + 1);
-	} else {
-		base_lo = create_uniform_indirect(ctx, ubo, get_addr(ctx, src0, 4));
-		base_hi = create_uniform_indirect(ctx, ubo + 1, get_addr(ctx, src0, 4));
-	}
-
-	/* note: on 32bit gpu's base_hi is ignored and DCE'd */
-	addr = base_lo;
-
-	const_offset = nir_src_as_const_value(intr->src[1]);
-	if (const_offset) {
-		off += const_offset->u32[0];
-	} else {
-		/* For load_ubo_indirect, second src is indirect offset: */
-		src1 = get_src(ctx, &intr->src[1])[0];
-
-		/* and add offset to addr: */
-		addr = ir3_ADD_S(b, addr, 0, src1, 0);
-	}
-
-	/* if offset is to large to encode in the ldg, split it out: */
-	if ((off + (intr->num_components * 4)) > 1024) {
-		/* split out the minimal amount to improve the odds that
-		 * cp can fit the immediate in the add.s instruction:
-		 */
-		unsigned off2 = off + (intr->num_components * 4) - 1024;
-		addr = ir3_ADD_S(b, addr, 0, create_immed(b, off2), 0);
-		off -= off2;
-	}
-
-	if (ptrsz == 2) {
-		struct ir3_instruction *carry;
-
-		/* handle 32b rollover, ie:
-		 *   if (addr < base_lo)
-		 *      base_hi++
-		 */
-		carry = ir3_CMPS_U(b, addr, 0, base_lo, 0);
-		carry->cat2.condition = IR3_COND_LT;
-		base_hi = ir3_ADD_S(b, base_hi, 0, carry, 0);
-
-		addr = create_collect(ctx, (struct ir3_instruction*[]){ addr, base_hi }, 2);
-	}
-
-	for (int i = 0; i < intr->num_components; i++) {
-		struct ir3_instruction *load =
-				ir3_LDG(b, addr, 0, create_immed(b, 1), 0);
-		load->cat6.type = TYPE_U32;
-		load->cat6.src_offset = off + i * 4;     /* byte offset */
-		dst[i] = load;
-	}
-}
-
-/* src[] = { buffer_index, offset }. No const_index */
-static void
-emit_intrinsic_load_ssbo(struct ir3_context *ctx, nir_intrinsic_instr *intr,
-		struct ir3_instruction **dst)
-{
-	struct ir3_block *b = ctx->block;
-	struct ir3_instruction *ldgb, *src0, *src1, *offset;
-	nir_const_value *const_offset;
-
-	/* can this be non-const buffer_index?  how do we handle that? */
-	const_offset = nir_src_as_const_value(intr->src[0]);
-	compile_assert(ctx, const_offset);
-
-	offset = get_src(ctx, &intr->src[1])[0];
-
-	/* src0 is uvec2(offset*4, 0), src1 is offset.. nir already *= 4: */
-	src0 = create_collect(ctx, (struct ir3_instruction*[]){
-		offset,
-		create_immed(b, 0),
-	}, 2);
-	src1 = ir3_SHR_B(b, offset, 0, create_immed(b, 2), 0);
-
-	ldgb = ir3_LDGB(b, create_immed(b, const_offset->u32[0]), 0,
-			src0, 0, src1, 0);
-	ldgb->regs[0]->wrmask = MASK(intr->num_components);
-	ldgb->cat6.iim_val = intr->num_components;
-	ldgb->cat6.d = 4;
-	ldgb->cat6.type = TYPE_U32;
-	ldgb->barrier_class = IR3_BARRIER_BUFFER_R;
-	ldgb->barrier_conflict = IR3_BARRIER_BUFFER_W;
-
-	split_dest(b, dst, ldgb, 0, intr->num_components);
-}
-
-/* src[] = { value, block_index, offset }. const_index[] = { write_mask } */
-static void
-emit_intrinsic_store_ssbo(struct ir3_context *ctx, nir_intrinsic_instr *intr)
-{
-	struct ir3_block *b = ctx->block;
-	struct ir3_instruction *stgb, *src0, *src1, *src2, *offset;
-	nir_const_value *const_offset;
-	/* TODO handle wrmask properly, see _store_shared().. but I think
-	 * it is more a PITA than that, since blob ends up loading the
-	 * masked components and writing them back out.
-	 */
-	unsigned wrmask = intr->const_index[0];
-	unsigned ncomp = ffs(~wrmask) - 1;
-
-	/* can this be non-const buffer_index?  how do we handle that? */
-	const_offset = nir_src_as_const_value(intr->src[1]);
-	compile_assert(ctx, const_offset);
-
-	offset = get_src(ctx, &intr->src[2])[0];
-
-	/* src0 is value, src1 is offset, src2 is uvec2(offset*4, 0)..
-	 * nir already *= 4:
-	 */
-	src0 = create_collect(ctx, get_src(ctx, &intr->src[0]), ncomp);
-	src1 = ir3_SHR_B(b, offset, 0, create_immed(b, 2), 0);
-	src2 = create_collect(ctx, (struct ir3_instruction*[]){
-		offset,
-		create_immed(b, 0),
-	}, 2);
-
-	stgb = ir3_STGB(b, create_immed(b, const_offset->u32[0]), 0,
-			src0, 0, src1, 0, src2, 0);
-	stgb->cat6.iim_val = ncomp;
-	stgb->cat6.d = 4;
-	stgb->cat6.type = TYPE_U32;
-	stgb->barrier_class = IR3_BARRIER_BUFFER_W;
-	stgb->barrier_conflict = IR3_BARRIER_BUFFER_R | IR3_BARRIER_BUFFER_W;
-
-	array_insert(b, b->keeps, stgb);
-}
-
-/* src[] = { block_index } */
-static void
-emit_intrinsic_ssbo_size(struct ir3_context *ctx, nir_intrinsic_instr *intr,
-		struct ir3_instruction **dst)
-{
-	/* SSBO size stored as a const starting at ssbo_sizes: */
-	unsigned blk_idx = nir_src_as_const_value(intr->src[0])->u32[0];
-	unsigned idx = regid(ctx->so->constbase.ssbo_sizes, 0) +
-		ctx->so->const_layout.ssbo_size.off[blk_idx];
-
-	debug_assert(ctx->so->const_layout.ssbo_size.mask & (1 << blk_idx));
-
-	dst[0] = create_uniform(ctx, idx);
-}
-
-/*
- * SSBO atomic intrinsics
- *
- * All of the SSBO atomic memory operations read a value from memory,
- * compute a new value using one of the operations below, write the new
- * value to memory, and return the original value read.
- *
- * All operations take 3 sources except CompSwap that takes 4. These
- * sources represent:
- *
- * 0: The SSBO buffer index.
- * 1: The offset into the SSBO buffer of the variable that the atomic
- *    operation will operate on.
- * 2: The data parameter to the atomic function (i.e. the value to add
- *    in ssbo_atomic_add, etc).
- * 3: For CompSwap only: the second data parameter.
- */
-static struct ir3_instruction *
-emit_intrinsic_atomic_ssbo(struct ir3_context *ctx, nir_intrinsic_instr *intr)
-{
-	struct ir3_block *b = ctx->block;
-	struct ir3_instruction *atomic, *ssbo, *src0, *src1, *src2, *offset;
-	nir_const_value *const_offset;
-	type_t type = TYPE_U32;
-
-	/* can this be non-const buffer_index?  how do we handle that? */
-	const_offset = nir_src_as_const_value(intr->src[0]);
-	compile_assert(ctx, const_offset);
-	ssbo = create_immed(b, const_offset->u32[0]);
-
-	offset = get_src(ctx, &intr->src[1])[0];
-
-	/* src0 is data (or uvec2(data, compare))
-	 * src1 is offset
-	 * src2 is uvec2(offset*4, 0) (appears to be 64b byte offset)
-	 *
-	 * Note that nir already multiplies the offset by four
-	 */
-	src0 = get_src(ctx, &intr->src[2])[0];
-	src1 = ir3_SHR_B(b, offset, 0, create_immed(b, 2), 0);
-	src2 = create_collect(ctx, (struct ir3_instruction*[]){
-		offset,
-		create_immed(b, 0),
-	}, 2);
-
-	switch (intr->intrinsic) {
-	case nir_intrinsic_ssbo_atomic_add:
-		atomic = ir3_ATOMIC_ADD_G(b, ssbo, 0, src0, 0, src1, 0, src2, 0);
-		break;
-	case nir_intrinsic_ssbo_atomic_imin:
-		atomic = ir3_ATOMIC_MIN_G(b, ssbo, 0, src0, 0, src1, 0, src2, 0);
-		type = TYPE_S32;
-		break;
-	case nir_intrinsic_ssbo_atomic_umin:
-		atomic = ir3_ATOMIC_MIN_G(b, ssbo, 0, src0, 0, src1, 0, src2, 0);
-		break;
-	case nir_intrinsic_ssbo_atomic_imax:
-		atomic = ir3_ATOMIC_MAX_G(b, ssbo, 0, src0, 0, src1, 0, src2, 0);
-		type = TYPE_S32;
-		break;
-	case nir_intrinsic_ssbo_atomic_umax:
-		atomic = ir3_ATOMIC_MAX_G(b, ssbo, 0, src0, 0, src1, 0, src2, 0);
-		break;
-	case nir_intrinsic_ssbo_atomic_and:
-		atomic = ir3_ATOMIC_AND_G(b, ssbo, 0, src0, 0, src1, 0, src2, 0);
-		break;
-	case nir_intrinsic_ssbo_atomic_or:
-		atomic = ir3_ATOMIC_OR_G(b, ssbo, 0, src0, 0, src1, 0, src2, 0);
-		break;
-	case nir_intrinsic_ssbo_atomic_xor:
-		atomic = ir3_ATOMIC_XOR_G(b, ssbo, 0, src0, 0, src1, 0, src2, 0);
-		break;
-	case nir_intrinsic_ssbo_atomic_exchange:
-		atomic = ir3_ATOMIC_XCHG_G(b, ssbo, 0, src0, 0, src1, 0, src2, 0);
-		break;
-	case nir_intrinsic_ssbo_atomic_comp_swap:
-		/* for cmpxchg, src0 is [ui]vec2(data, compare): */
-		src0 = create_collect(ctx, (struct ir3_instruction*[]){
-			get_src(ctx, &intr->src[3])[0],
-			src0,
-		}, 2);
-		atomic = ir3_ATOMIC_CMPXCHG_G(b, ssbo, 0, src0, 0, src1, 0, src2, 0);
-		break;
-	default:
-		unreachable("boo");
-	}
-
-	atomic->cat6.iim_val = 1;
-	atomic->cat6.d = 4;
-	atomic->cat6.type = type;
-	atomic->barrier_class = IR3_BARRIER_BUFFER_W;
-	atomic->barrier_conflict = IR3_BARRIER_BUFFER_R | IR3_BARRIER_BUFFER_W;
-
-	/* even if nothing consume the result, we can't DCE the instruction: */
-	array_insert(b, b->keeps, atomic);
-
-	return atomic;
-}
-
-/* src[] = { offset }. const_index[] = { base } */
-static void
-emit_intrinsic_load_shared(struct ir3_context *ctx, nir_intrinsic_instr *intr,
-		struct ir3_instruction **dst)
-{
-	struct ir3_block *b = ctx->block;
-	struct ir3_instruction *ldl, *offset;
-	unsigned base;
-
-	offset = get_src(ctx, &intr->src[0])[0];
-	base   = nir_intrinsic_base(intr);
-
-	ldl = ir3_LDL(b, offset, 0, create_immed(b, intr->num_components), 0);
-	ldl->cat6.src_offset = base;
-	ldl->cat6.type = utype_dst(intr->dest);
-	ldl->regs[0]->wrmask = MASK(intr->num_components);
-
-	ldl->barrier_class = IR3_BARRIER_SHARED_R;
-	ldl->barrier_conflict = IR3_BARRIER_SHARED_W;
-
-	split_dest(b, dst, ldl, 0, intr->num_components);
-}
-
-/* src[] = { value, offset }. const_index[] = { base, write_mask } */
-static void
-emit_intrinsic_store_shared(struct ir3_context *ctx, nir_intrinsic_instr *intr)
-{
-	struct ir3_block *b = ctx->block;
-	struct ir3_instruction *stl, *offset;
-	struct ir3_instruction * const *value;
-	unsigned base, wrmask;
-
-	value  = get_src(ctx, &intr->src[0]);
-	offset = get_src(ctx, &intr->src[1])[0];
-
-	base   = nir_intrinsic_base(intr);
-	wrmask = nir_intrinsic_write_mask(intr);
-
-	/* Combine groups of consecutive enabled channels in one write
-	 * message. We use ffs to find the first enabled channel and then ffs on
-	 * the bit-inverse, down-shifted writemask to determine the length of
-	 * the block of enabled bits.
-	 *
-	 * (trick stolen from i965's fs_visitor::nir_emit_cs_intrinsic())
-	 */
-	while (wrmask) {
-		unsigned first_component = ffs(wrmask) - 1;
-		unsigned length = ffs(~(wrmask >> first_component)) - 1;
-
-		stl = ir3_STL(b, offset, 0,
-			create_collect(ctx, &value[first_component], length), 0,
-			create_immed(b, length), 0);
-		stl->cat6.dst_offset = first_component + base;
-		stl->cat6.type = utype_src(intr->src[0]);
-		stl->barrier_class = IR3_BARRIER_SHARED_W;
-		stl->barrier_conflict = IR3_BARRIER_SHARED_R | IR3_BARRIER_SHARED_W;
-
-		array_insert(b, b->keeps, stl);
-
-		/* Clear the bits in the writemask that we just wrote, then try
-		 * again to see if more channels are left.
-		 */
-		wrmask &= (15 << (first_component + length));
-	}
-}
-
-/*
- * CS shared variable atomic intrinsics
- *
- * All of the shared variable atomic memory operations read a value from
- * memory, compute a new value using one of the operations below, write the
- * new value to memory, and return the original value read.
- *
- * All operations take 2 sources except CompSwap that takes 3. These
- * sources represent:
- *
- * 0: The offset into the shared variable storage region that the atomic
- *    operation will operate on.
- * 1: The data parameter to the atomic function (i.e. the value to add
- *    in shared_atomic_add, etc).
- * 2: For CompSwap only: the second data parameter.
- */
-static struct ir3_instruction *
-emit_intrinsic_atomic_shared(struct ir3_context *ctx, nir_intrinsic_instr *intr)
-{
-	struct ir3_block *b = ctx->block;
-	struct ir3_instruction *atomic, *src0, *src1;
-	type_t type = TYPE_U32;
-
-	src0 = get_src(ctx, &intr->src[0])[0];   /* offset */
-	src1 = get_src(ctx, &intr->src[1])[0];   /* value */
-
-	switch (intr->intrinsic) {
-	case nir_intrinsic_shared_atomic_add:
-		atomic = ir3_ATOMIC_ADD(b, src0, 0, src1, 0);
-		break;
-	case nir_intrinsic_shared_atomic_imin:
-		atomic = ir3_ATOMIC_MIN(b, src0, 0, src1, 0);
-		type = TYPE_S32;
-		break;
-	case nir_intrinsic_shared_atomic_umin:
-		atomic = ir3_ATOMIC_MIN(b, src0, 0, src1, 0);
-		break;
-	case nir_intrinsic_shared_atomic_imax:
-		atomic = ir3_ATOMIC_MAX(b, src0, 0, src1, 0);
-		type = TYPE_S32;
-		break;
-	case nir_intrinsic_shared_atomic_umax:
-		atomic = ir3_ATOMIC_MAX(b, src0, 0, src1, 0);
-		break;
-	case nir_intrinsic_shared_atomic_and:
-		atomic = ir3_ATOMIC_AND(b, src0, 0, src1, 0);
-		break;
-	case nir_intrinsic_shared_atomic_or:
-		atomic = ir3_ATOMIC_OR(b, src0, 0, src1, 0);
-		break;
-	case nir_intrinsic_shared_atomic_xor:
-		atomic = ir3_ATOMIC_XOR(b, src0, 0, src1, 0);
-		break;
-	case nir_intrinsic_shared_atomic_exchange:
-		atomic = ir3_ATOMIC_XCHG(b, src0, 0, src1, 0);
-		break;
-	case nir_intrinsic_shared_atomic_comp_swap:
-		/* for cmpxchg, src1 is [ui]vec2(data, compare): */
-		src1 = create_collect(ctx, (struct ir3_instruction*[]){
-			get_src(ctx, &intr->src[2])[0],
-			src1,
-		}, 2);
-		atomic = ir3_ATOMIC_CMPXCHG(b, src0, 0, src1, 0);
-		break;
-	default:
-		unreachable("boo");
-	}
-
-	atomic->cat6.iim_val = 1;
-	atomic->cat6.d = 1;
-	atomic->cat6.type = type;
-	atomic->barrier_class = IR3_BARRIER_SHARED_W;
-	atomic->barrier_conflict = IR3_BARRIER_SHARED_R | IR3_BARRIER_SHARED_W;
-
-	/* even if nothing consume the result, we can't DCE the instruction: */
-	array_insert(b, b->keeps, atomic);
-
-	return atomic;
-}
-
-/* Images get mapped into SSBO/image state (for store/atomic) and texture
- * state block (for load).  To simplify things, invert the image id and
- * map it from end of state block, ie. image 0 becomes num-1, image 1
- * becomes num-2, etc.  This potentially avoids needing to re-emit texture
- * state when switching shaders.
- *
- * TODO is max # of samplers and SSBOs the same.  This shouldn't be hard-
- * coded.  Also, since all the gl shader stages (ie. everything but CS)
- * share the same SSBO/image state block, this might require some more
- * logic if we supported images in anything other than FS..
- */
-static unsigned
-get_image_slot(struct ir3_context *ctx, nir_deref_instr *deref)
-{
-	unsigned int loc = 0;
-	unsigned inner_size = 1;
-
-	while (deref->deref_type != nir_deref_type_var) {
-		assert(deref->deref_type == nir_deref_type_array);
-		nir_const_value *const_index = nir_src_as_const_value(deref->arr.index);
-		assert(const_index);
-
-		/* Go to the next instruction */
-		deref = nir_deref_instr_parent(deref);
-
-		assert(glsl_type_is_array(deref->type));
-		const unsigned array_len = glsl_get_length(deref->type);
-		loc += MIN2(const_index->u32[0], array_len - 1) * inner_size;
-
-		/* Update the inner size */
-		inner_size *= array_len;
-	}
-
-	loc += deref->var->data.driver_location;
-
-	/* TODO figure out real limit per generation, and don't hardcode: */
-	const unsigned max_samplers = 16;
-	return max_samplers - loc - 1;
-}
-
-/* see tex_info() for equiv logic for texture instructions.. it would be
- * nice if this could be better unified..
- */
-static unsigned
-get_image_coords(const nir_variable *var, unsigned *flagsp)
-{
-	const struct glsl_type *type = glsl_without_array(var->type);
-	unsigned coords, flags = 0;
-
-	switch (glsl_get_sampler_dim(type)) {
-	case GLSL_SAMPLER_DIM_1D:
-	case GLSL_SAMPLER_DIM_BUF:
-		coords = 1;
-		break;
-	case GLSL_SAMPLER_DIM_2D:
-	case GLSL_SAMPLER_DIM_RECT:
-	case GLSL_SAMPLER_DIM_EXTERNAL:
-	case GLSL_SAMPLER_DIM_MS:
-		coords = 2;
-		break;
-	case GLSL_SAMPLER_DIM_3D:
-	case GLSL_SAMPLER_DIM_CUBE:
-		flags |= IR3_INSTR_3D;
-		coords = 3;
-		break;
-	default:
-		unreachable("bad sampler dim");
-		return 0;
-	}
-
-	if (glsl_sampler_type_is_array(type)) {
-		/* note: unlike tex_info(), adjust # of coords to include array idx: */
-		coords++;
-		flags |= IR3_INSTR_A;
-	}
-
-	if (flagsp)
-		*flagsp = flags;
-
-	return coords;
-}
-
-static type_t
-get_image_type(const nir_variable *var)
-{
-	switch (glsl_get_sampler_result_type(glsl_without_array(var->type))) {
-	case GLSL_TYPE_UINT:
-		return TYPE_U32;
-	case GLSL_TYPE_INT:
-		return TYPE_S32;
-	case GLSL_TYPE_FLOAT:
-		return TYPE_F32;
-	default:
-		unreachable("bad sampler type.");
-		return 0;
-	}
-}
-
-static struct ir3_instruction *
-get_image_offset(struct ir3_context *ctx, const nir_variable *var,
-		struct ir3_instruction * const *coords, bool byteoff)
-{
-	struct ir3_block *b = ctx->block;
-	struct ir3_instruction *offset;
-	unsigned ncoords = get_image_coords(var, NULL);
-
-	/* to calculate the byte offset (yes, uggg) we need (up to) three
-	 * const values to know the bytes per pixel, and y and z stride:
-	 */
-	unsigned cb = regid(ctx->so->constbase.image_dims, 0) +
-		ctx->so->const_layout.image_dims.off[var->data.driver_location];
-
-	debug_assert(ctx->so->const_layout.image_dims.mask &
-			(1 << var->data.driver_location));
-
-	/* offset = coords.x * bytes_per_pixel: */
-	offset = ir3_MUL_S(b, coords[0], 0, create_uniform(ctx, cb + 0), 0);
-	if (ncoords > 1) {
-		/* offset += coords.y * y_pitch: */
-		offset = ir3_MAD_S24(b, create_uniform(ctx, cb + 1), 0,
-				coords[1], 0, offset, 0);
-	}
-	if (ncoords > 2) {
-		/* offset += coords.z * z_pitch: */
-		offset = ir3_MAD_S24(b, create_uniform(ctx, cb + 2), 0,
-				coords[2], 0, offset, 0);
-	}
-
-	if (!byteoff) {
-		/* Some cases, like atomics, seem to use dword offset instead
-		 * of byte offsets.. blob just puts an extra shr.b in there
-		 * in those cases:
-		 */
-		offset = ir3_SHR_B(b, offset, 0, create_immed(b, 2), 0);
-	}
-
-	return create_collect(ctx, (struct ir3_instruction*[]){
-		offset,
-		create_immed(b, 0),
-	}, 2);
-}
-
-/* src[] = { deref, coord, sample_index }. const_index[] = {} */
-static void
-emit_intrinsic_load_image(struct ir3_context *ctx, nir_intrinsic_instr *intr,
-		struct ir3_instruction **dst)
-{
-	struct ir3_block *b = ctx->block;
-	const nir_variable *var = nir_intrinsic_get_var(intr, 0);
-	struct ir3_instruction *sam;
-	struct ir3_instruction * const *src0 = get_src(ctx, &intr->src[1]);
-	struct ir3_instruction *coords[4];
-	unsigned flags, ncoords = get_image_coords(var, &flags);
-	unsigned tex_idx = get_image_slot(ctx, nir_src_as_deref(intr->src[0]));
-	type_t type = get_image_type(var);
-
-	/* hmm, this seems a bit odd, but it is what blob does and (at least
-	 * a5xx) just faults on bogus addresses otherwise:
-	 */
-	if (flags & IR3_INSTR_3D) {
-		flags &= ~IR3_INSTR_3D;
-		flags |= IR3_INSTR_A;
-	}
-
-	for (unsigned i = 0; i < ncoords; i++)
-		coords[i] = src0[i];
-
-	if (ncoords == 1)
-		coords[ncoords++] = create_immed(b, 0);
-
-	sam = ir3_SAM(b, OPC_ISAM, type, 0b1111, flags,
-			tex_idx, tex_idx, create_collect(ctx, coords, ncoords), NULL);
-
-	sam->barrier_class = IR3_BARRIER_IMAGE_R;
-	sam->barrier_conflict = IR3_BARRIER_IMAGE_W;
-
-	split_dest(b, dst, sam, 0, 4);
-}
-
-/* src[] = { deref, coord, sample_index, value }. const_index[] = {} */
-static void
-emit_intrinsic_store_image(struct ir3_context *ctx, nir_intrinsic_instr *intr)
-{
-	struct ir3_block *b = ctx->block;
-	const nir_variable *var = nir_intrinsic_get_var(intr, 0);
-	struct ir3_instruction *stib, *offset;
-	struct ir3_instruction * const *value = get_src(ctx, &intr->src[3]);
-	struct ir3_instruction * const *coords = get_src(ctx, &intr->src[1]);
-	unsigned ncoords = get_image_coords(var, NULL);
-	unsigned tex_idx = get_image_slot(ctx, nir_src_as_deref(intr->src[0]));
-
-	/* src0 is value
-	 * src1 is coords
-	 * src2 is 64b byte offset
-	 */
-
-	offset = get_image_offset(ctx, var, coords, true);
-
-	/* NOTE: stib seems to take byte offset, but stgb.typed can be used
-	 * too and takes a dword offset.. not quite sure yet why blob uses
-	 * one over the other in various cases.
-	 */
-
-	stib = ir3_STIB(b, create_immed(b, tex_idx), 0,
-			create_collect(ctx, value, 4), 0,
-			create_collect(ctx, coords, ncoords), 0,
-			offset, 0);
-	stib->cat6.iim_val = 4;
-	stib->cat6.d = ncoords;
-	stib->cat6.type = get_image_type(var);
-	stib->cat6.typed = true;
-	stib->barrier_class = IR3_BARRIER_IMAGE_W;
-	stib->barrier_conflict = IR3_BARRIER_IMAGE_R | IR3_BARRIER_IMAGE_W;
-
-	array_insert(b, b->keeps, stib);
-}
-
-static void
-emit_intrinsic_image_size(struct ir3_context *ctx, nir_intrinsic_instr *intr,
-		struct ir3_instruction **dst)
-{
-	struct ir3_block *b = ctx->block;
-	const nir_variable *var = nir_intrinsic_get_var(intr, 0);
-	unsigned tex_idx = get_image_slot(ctx, nir_src_as_deref(intr->src[0]));
-	struct ir3_instruction *sam, *lod;
-	unsigned flags, ncoords = get_image_coords(var, &flags);
-
-	lod = create_immed(b, 0);
-	sam = ir3_SAM(b, OPC_GETSIZE, TYPE_U32, 0b1111, flags,
-			tex_idx, tex_idx, lod, NULL);
-
-	/* Array size actually ends up in .w rather than .z. This doesn't
-	 * matter for miplevel 0, but for higher mips the value in z is
-	 * minified whereas w stays. Also, the value in TEX_CONST_3_DEPTH is
-	 * returned, which means that we have to add 1 to it for arrays for
-	 * a3xx.
-	 *
-	 * Note use a temporary dst and then copy, since the size of the dst
-	 * array that is passed in is based on nir's understanding of the
-	 * result size, not the hardware's
-	 */
-	struct ir3_instruction *tmp[4];
-
-	split_dest(b, tmp, sam, 0, 4);
-
-	/* get_size instruction returns size in bytes instead of texels
-	 * for imageBuffer, so we need to divide it by the pixel size
-	 * of the image format.
-	 *
-	 * TODO: This is at least true on a5xx. Check other gens.
-	 */
-	enum glsl_sampler_dim dim =
-		glsl_get_sampler_dim(glsl_without_array(var->type));
-	if (dim == GLSL_SAMPLER_DIM_BUF) {
-		/* Since all the possible values the divisor can take are
-		 * power-of-two (4, 8, or 16), the division is implemented
-		 * as a shift-right.
-		 * During shader setup, the log2 of the image format's
-		 * bytes-per-pixel should have been emitted in 2nd slot of
-		 * image_dims. See ir3_shader::emit_image_dims().
-		 */
-		unsigned cb = regid(ctx->so->constbase.image_dims, 0) +
-			ctx->so->const_layout.image_dims.off[var->data.driver_location];
-		struct ir3_instruction *aux = create_uniform(ctx, cb + 1);
-
-		tmp[0] = ir3_SHR_B(b, tmp[0], 0, aux, 0);
-	}
-
-	for (unsigned i = 0; i < ncoords; i++)
-		dst[i] = tmp[i];
-
-	if (flags & IR3_INSTR_A) {
-		if (ctx->compiler->levels_add_one) {
-			dst[ncoords-1] = ir3_ADD_U(b, tmp[3], 0, create_immed(b, 1), 0);
-		} else {
-			dst[ncoords-1] = ir3_MOV(b, tmp[3], TYPE_U32);
-		}
-	}
-}
-
-/* src[] = { deref, coord, sample_index, value, compare }. const_index[] = {} */
-static struct ir3_instruction *
-emit_intrinsic_atomic_image(struct ir3_context *ctx, nir_intrinsic_instr *intr)
-{
-	struct ir3_block *b = ctx->block;
-	const nir_variable *var = nir_intrinsic_get_var(intr, 0);
-	struct ir3_instruction *atomic, *image, *src0, *src1, *src2;
-	struct ir3_instruction * const *coords = get_src(ctx, &intr->src[1]);
-	unsigned ncoords = get_image_coords(var, NULL);
-
-	image = create_immed(b, get_image_slot(ctx, nir_src_as_deref(intr->src[0])));
-
-	/* src0 is value (or uvec2(value, compare))
-	 * src1 is coords
-	 * src2 is 64b byte offset
-	 */
-	src0 = get_src(ctx, &intr->src[3])[0];
-	src1 = create_collect(ctx, coords, ncoords);
-	src2 = get_image_offset(ctx, var, coords, false);
-
-	switch (intr->intrinsic) {
-	case nir_intrinsic_image_deref_atomic_add:
-		atomic = ir3_ATOMIC_ADD_G(b, image, 0, src0, 0, src1, 0, src2, 0);
-		break;
-	case nir_intrinsic_image_deref_atomic_min:
-		atomic = ir3_ATOMIC_MIN_G(b, image, 0, src0, 0, src1, 0, src2, 0);
-		break;
-	case nir_intrinsic_image_deref_atomic_max:
-		atomic = ir3_ATOMIC_MAX_G(b, image, 0, src0, 0, src1, 0, src2, 0);
-		break;
-	case nir_intrinsic_image_deref_atomic_and:
-		atomic = ir3_ATOMIC_AND_G(b, image, 0, src0, 0, src1, 0, src2, 0);
-		break;
-	case nir_intrinsic_image_deref_atomic_or:
-		atomic = ir3_ATOMIC_OR_G(b, image, 0, src0, 0, src1, 0, src2, 0);
-		break;
-	case nir_intrinsic_image_deref_atomic_xor:
-		atomic = ir3_ATOMIC_XOR_G(b, image, 0, src0, 0, src1, 0, src2, 0);
-		break;
-	case nir_intrinsic_image_deref_atomic_exchange:
-		atomic = ir3_ATOMIC_XCHG_G(b, image, 0, src0, 0, src1, 0, src2, 0);
-		break;
-	case nir_intrinsic_image_deref_atomic_comp_swap:
-		/* for cmpxchg, src0 is [ui]vec2(data, compare): */
-		src0 = create_collect(ctx, (struct ir3_instruction*[]){
-			get_src(ctx, &intr->src[4])[0],
-			src0,
-		}, 2);
-		atomic = ir3_ATOMIC_CMPXCHG_G(b, image, 0, src0, 0, src1, 0, src2, 0);
-		break;
-	default:
-		unreachable("boo");
-	}
-
-	atomic->cat6.iim_val = 1;
-	atomic->cat6.d = ncoords;
-	atomic->cat6.type = get_image_type(var);
-	atomic->cat6.typed = true;
-	atomic->barrier_class = IR3_BARRIER_IMAGE_W;
-	atomic->barrier_conflict = IR3_BARRIER_IMAGE_R | IR3_BARRIER_IMAGE_W;
-
-	/* even if nothing consume the result, we can't DCE the instruction: */
-	array_insert(b, b->keeps, atomic);
-
-	return atomic;
-}
-
-static void
-emit_intrinsic_barrier(struct ir3_context *ctx, nir_intrinsic_instr *intr)
-{
-	struct ir3_block *b = ctx->block;
-	struct ir3_instruction *barrier;
-
-	switch (intr->intrinsic) {
-	case nir_intrinsic_barrier:
-		barrier = ir3_BAR(b);
-		barrier->cat7.g = true;
-		barrier->cat7.l = true;
-		barrier->flags = IR3_INSTR_SS | IR3_INSTR_SY;
-		barrier->barrier_class = IR3_BARRIER_EVERYTHING;
-		break;
-	case nir_intrinsic_memory_barrier:
-		barrier = ir3_FENCE(b);
-		barrier->cat7.g = true;
-		barrier->cat7.r = true;
-		barrier->cat7.w = true;
-		barrier->barrier_class = IR3_BARRIER_IMAGE_W |
-				IR3_BARRIER_BUFFER_W;
-		barrier->barrier_conflict =
-				IR3_BARRIER_IMAGE_R | IR3_BARRIER_IMAGE_W |
-				IR3_BARRIER_BUFFER_R | IR3_BARRIER_BUFFER_W;
-		break;
-	case nir_intrinsic_memory_barrier_atomic_counter:
-	case nir_intrinsic_memory_barrier_buffer:
-		barrier = ir3_FENCE(b);
-		barrier->cat7.g = true;
-		barrier->cat7.r = true;
-		barrier->cat7.w = true;
-		barrier->barrier_class = IR3_BARRIER_BUFFER_W;
-		barrier->barrier_conflict = IR3_BARRIER_BUFFER_R |
-				IR3_BARRIER_BUFFER_W;
-		break;
-	case nir_intrinsic_memory_barrier_image:
-		// TODO double check if this should have .g set
-		barrier = ir3_FENCE(b);
-		barrier->cat7.g = true;
-		barrier->cat7.r = true;
-		barrier->cat7.w = true;
-		barrier->barrier_class = IR3_BARRIER_IMAGE_W;
-		barrier->barrier_conflict = IR3_BARRIER_IMAGE_R |
-				IR3_BARRIER_IMAGE_W;
-		break;
-	case nir_intrinsic_memory_barrier_shared:
-		barrier = ir3_FENCE(b);
-		barrier->cat7.g = true;
-		barrier->cat7.l = true;
-		barrier->cat7.r = true;
-		barrier->cat7.w = true;
-		barrier->barrier_class = IR3_BARRIER_SHARED_W;
-		barrier->barrier_conflict = IR3_BARRIER_SHARED_R |
-				IR3_BARRIER_SHARED_W;
-		break;
-	case nir_intrinsic_group_memory_barrier:
-		barrier = ir3_FENCE(b);
-		barrier->cat7.g = true;
-		barrier->cat7.l = true;
-		barrier->cat7.r = true;
-		barrier->cat7.w = true;
-		barrier->barrier_class = IR3_BARRIER_SHARED_W |
-				IR3_BARRIER_IMAGE_W |
-				IR3_BARRIER_BUFFER_W;
-		barrier->barrier_conflict =
-				IR3_BARRIER_SHARED_R | IR3_BARRIER_SHARED_W |
-				IR3_BARRIER_IMAGE_R | IR3_BARRIER_IMAGE_W |
-				IR3_BARRIER_BUFFER_R | IR3_BARRIER_BUFFER_W;
-		break;
-	default:
-		unreachable("boo");
-	}
-
-	/* make sure barrier doesn't get DCE'd */
-	array_insert(b, b->keeps, barrier);
-}
-
-static void add_sysval_input_compmask(struct ir3_context *ctx,
-		gl_system_value slot, unsigned compmask,
-		struct ir3_instruction *instr)
-{
-	struct ir3_shader_variant *so = ctx->so;
-	unsigned r = regid(so->inputs_count, 0);
-	unsigned n = so->inputs_count++;
-
-	so->inputs[n].sysval = true;
-	so->inputs[n].slot = slot;
-	so->inputs[n].compmask = compmask;
-	so->inputs[n].regid = r;
-	so->inputs[n].interpolate = INTERP_MODE_FLAT;
-	so->total_in++;
-
-	ctx->ir->ninputs = MAX2(ctx->ir->ninputs, r + 1);
-	ctx->ir->inputs[r] = instr;
-}
-
-static void add_sysval_input(struct ir3_context *ctx, gl_system_value slot,
-		struct ir3_instruction *instr)
-{
-	add_sysval_input_compmask(ctx, slot, 0x1, instr);
-}
-
-static void
-emit_intrinsic(struct ir3_context *ctx, nir_intrinsic_instr *intr)
-{
-	const nir_intrinsic_info *info = &nir_intrinsic_infos[intr->intrinsic];
-	struct ir3_instruction **dst;
-	struct ir3_instruction * const *src;
-	struct ir3_block *b = ctx->block;
-	nir_const_value *const_offset;
-	int idx, comp;
-
-	if (info->has_dest) {
-		unsigned n = nir_intrinsic_dest_components(intr);
-		dst = get_dst(ctx, &intr->dest, n);
-	} else {
-		dst = NULL;
-	}
-
-	switch (intr->intrinsic) {
-	case nir_intrinsic_load_uniform:
-		idx = nir_intrinsic_base(intr);
-		const_offset = nir_src_as_const_value(intr->src[0]);
-		if (const_offset) {
-			idx += const_offset->u32[0];
-			for (int i = 0; i < intr->num_components; i++) {
-				unsigned n = idx * 4 + i;
-				dst[i] = create_uniform(ctx, n);
-			}
-		} else {
-			src = get_src(ctx, &intr->src[0]);
-			for (int i = 0; i < intr->num_components; i++) {
-				int n = idx * 4 + i;
-				dst[i] = create_uniform_indirect(ctx, n,
-						get_addr(ctx, src[0], 4));
-			}
-			/* NOTE: if relative addressing is used, we set
-			 * constlen in the compiler (to worst-case value)
-			 * since we don't know in the assembler what the max
-			 * addr reg value can be:
-			 */
-			ctx->so->constlen = ctx->s->num_uniforms;
-		}
-		break;
-	case nir_intrinsic_load_ubo:
-		emit_intrinsic_load_ubo(ctx, intr, dst);
-		break;
-	case nir_intrinsic_load_input:
-		idx = nir_intrinsic_base(intr);
-		comp = nir_intrinsic_component(intr);
-		const_offset = nir_src_as_const_value(intr->src[0]);
-		if (const_offset) {
-			idx += const_offset->u32[0];
-			for (int i = 0; i < intr->num_components; i++) {
-				unsigned n = idx * 4 + i + comp;
-				dst[i] = ctx->ir->inputs[n];
-			}
-		} else {
-			src = get_src(ctx, &intr->src[0]);
-			struct ir3_instruction *collect =
-					create_collect(ctx, ctx->ir->inputs, ctx->ir->ninputs);
-			struct ir3_instruction *addr = get_addr(ctx, src[0], 4);
-			for (int i = 0; i < intr->num_components; i++) {
-				unsigned n = idx * 4 + i + comp;
-				dst[i] = create_indirect_load(ctx, ctx->ir->ninputs,
-						n, addr, collect);
-			}
-		}
-		break;
-	case nir_intrinsic_load_ssbo:
-		emit_intrinsic_load_ssbo(ctx, intr, dst);
-		break;
-	case nir_intrinsic_store_ssbo:
-		emit_intrinsic_store_ssbo(ctx, intr);
-		break;
-	case nir_intrinsic_get_buffer_size:
-		emit_intrinsic_ssbo_size(ctx, intr, dst);
-		break;
-	case nir_intrinsic_ssbo_atomic_add:
-	case nir_intrinsic_ssbo_atomic_imin:
-	case nir_intrinsic_ssbo_atomic_umin:
-	case nir_intrinsic_ssbo_atomic_imax:
-	case nir_intrinsic_ssbo_atomic_umax:
-	case nir_intrinsic_ssbo_atomic_and:
-	case nir_intrinsic_ssbo_atomic_or:
-	case nir_intrinsic_ssbo_atomic_xor:
-	case nir_intrinsic_ssbo_atomic_exchange:
-	case nir_intrinsic_ssbo_atomic_comp_swap:
-		dst[0] = emit_intrinsic_atomic_ssbo(ctx, intr);
-		break;
-	case nir_intrinsic_load_shared:
-		emit_intrinsic_load_shared(ctx, intr, dst);
-		break;
-	case nir_intrinsic_store_shared:
-		emit_intrinsic_store_shared(ctx, intr);
-		break;
-	case nir_intrinsic_shared_atomic_add:
-	case nir_intrinsic_shared_atomic_imin:
-	case nir_intrinsic_shared_atomic_umin:
-	case nir_intrinsic_shared_atomic_imax:
-	case nir_intrinsic_shared_atomic_umax:
-	case nir_intrinsic_shared_atomic_and:
-	case nir_intrinsic_shared_atomic_or:
-	case nir_intrinsic_shared_atomic_xor:
-	case nir_intrinsic_shared_atomic_exchange:
-	case nir_intrinsic_shared_atomic_comp_swap:
-		dst[0] = emit_intrinsic_atomic_shared(ctx, intr);
-		break;
-	case nir_intrinsic_image_deref_load:
-		emit_intrinsic_load_image(ctx, intr, dst);
-		break;
-	case nir_intrinsic_image_deref_store:
-		emit_intrinsic_store_image(ctx, intr);
-		break;
-	case nir_intrinsic_image_deref_size:
-		emit_intrinsic_image_size(ctx, intr, dst);
-		break;
-	case nir_intrinsic_image_deref_atomic_add:
-	case nir_intrinsic_image_deref_atomic_min:
-	case nir_intrinsic_image_deref_atomic_max:
-	case nir_intrinsic_image_deref_atomic_and:
-	case nir_intrinsic_image_deref_atomic_or:
-	case nir_intrinsic_image_deref_atomic_xor:
-	case nir_intrinsic_image_deref_atomic_exchange:
-	case nir_intrinsic_image_deref_atomic_comp_swap:
-		dst[0] = emit_intrinsic_atomic_image(ctx, intr);
-		break;
-	case nir_intrinsic_barrier:
-	case nir_intrinsic_memory_barrier:
-	case nir_intrinsic_group_memory_barrier:
-	case nir_intrinsic_memory_barrier_atomic_counter:
-	case nir_intrinsic_memory_barrier_buffer:
-	case nir_intrinsic_memory_barrier_image:
-	case nir_intrinsic_memory_barrier_shared:
-		emit_intrinsic_barrier(ctx, intr);
-		/* note that blk ptr no longer valid, make that obvious: */
-		b = NULL;
-		break;
-	case nir_intrinsic_store_output:
-		idx = nir_intrinsic_base(intr);
-		comp = nir_intrinsic_component(intr);
-		const_offset = nir_src_as_const_value(intr->src[1]);
-		compile_assert(ctx, const_offset != NULL);
-		idx += const_offset->u32[0];
-
-		src = get_src(ctx, &intr->src[0]);
-		for (int i = 0; i < intr->num_components; i++) {
-			unsigned n = idx * 4 + i + comp;
-			ctx->ir->outputs[n] = src[i];
-		}
-		break;
-	case nir_intrinsic_load_base_vertex:
-	case nir_intrinsic_load_first_vertex:
-		if (!ctx->basevertex) {
-			ctx->basevertex = create_driver_param(ctx, IR3_DP_VTXID_BASE);
-			add_sysval_input(ctx, SYSTEM_VALUE_FIRST_VERTEX, ctx->basevertex);
-		}
-		dst[0] = ctx->basevertex;
-		break;
-	case nir_intrinsic_load_vertex_id_zero_base:
-	case nir_intrinsic_load_vertex_id:
-		if (!ctx->vertex_id) {
-			gl_system_value sv = (intr->intrinsic == nir_intrinsic_load_vertex_id) ?
-				SYSTEM_VALUE_VERTEX_ID : SYSTEM_VALUE_VERTEX_ID_ZERO_BASE;
-			ctx->vertex_id = create_input(ctx, 0);
-			add_sysval_input(ctx, sv, ctx->vertex_id);
-		}
-		dst[0] = ctx->vertex_id;
-		break;
-	case nir_intrinsic_load_instance_id:
-		if (!ctx->instance_id) {
-			ctx->instance_id = create_input(ctx, 0);
-			add_sysval_input(ctx, SYSTEM_VALUE_INSTANCE_ID,
-					ctx->instance_id);
-		}
-		dst[0] = ctx->instance_id;
-		break;
-	case nir_intrinsic_load_sample_id:
-	case nir_intrinsic_load_sample_id_no_per_sample:
-		if (!ctx->samp_id) {
-			ctx->samp_id = create_input(ctx, 0);
-			ctx->samp_id->regs[0]->flags |= IR3_REG_HALF;
-			add_sysval_input(ctx, SYSTEM_VALUE_SAMPLE_ID,
-					ctx->samp_id);
-		}
-		dst[0] = ir3_COV(b, ctx->samp_id, TYPE_U16, TYPE_U32);
-		break;
-	case nir_intrinsic_load_sample_mask_in:
-		if (!ctx->samp_mask_in) {
-			ctx->samp_mask_in = create_input(ctx, 0);
-			add_sysval_input(ctx, SYSTEM_VALUE_SAMPLE_MASK_IN,
-					ctx->samp_mask_in);
-		}
-		dst[0] = ctx->samp_mask_in;
-		break;
-	case nir_intrinsic_load_user_clip_plane:
-		idx = nir_intrinsic_ucp_id(intr);
-		for (int i = 0; i < intr->num_components; i++) {
-			unsigned n = idx * 4 + i;
-			dst[i] = create_driver_param(ctx, IR3_DP_UCP0_X + n);
-		}
-		break;
-	case nir_intrinsic_load_front_face:
-		if (!ctx->frag_face) {
-			ctx->so->frag_face = true;
-			ctx->frag_face = create_input(ctx, 0);
-			add_sysval_input(ctx, SYSTEM_VALUE_FRONT_FACE, ctx->frag_face);
-			ctx->frag_face->regs[0]->flags |= IR3_REG_HALF;
-		}
-		/* for fragface, we get -1 for back and 0 for front. However this is
-		 * the inverse of what nir expects (where ~0 is true).
-		 */
-		dst[0] = ir3_COV(b, ctx->frag_face, TYPE_S16, TYPE_S32);
-		dst[0] = ir3_NOT_B(b, dst[0], 0);
-		break;
-	case nir_intrinsic_load_local_invocation_id:
-		if (!ctx->local_invocation_id) {
-			ctx->local_invocation_id = create_input_compmask(ctx, 0, 0x7);
-			add_sysval_input_compmask(ctx, SYSTEM_VALUE_LOCAL_INVOCATION_ID,
-					0x7, ctx->local_invocation_id);
-		}
-		split_dest(b, dst, ctx->local_invocation_id, 0, 3);
-		break;
-	case nir_intrinsic_load_work_group_id:
-		if (!ctx->work_group_id) {
-			ctx->work_group_id = create_input_compmask(ctx, 0, 0x7);
-			add_sysval_input_compmask(ctx, SYSTEM_VALUE_WORK_GROUP_ID,
-					0x7, ctx->work_group_id);
-			ctx->work_group_id->regs[0]->flags |= IR3_REG_HIGH;
-		}
-		split_dest(b, dst, ctx->work_group_id, 0, 3);
-		break;
-	case nir_intrinsic_load_num_work_groups:
-		for (int i = 0; i < intr->num_components; i++) {
-			dst[i] = create_driver_param(ctx, IR3_DP_NUM_WORK_GROUPS_X + i);
-		}
-		break;
-	case nir_intrinsic_load_local_group_size:
-		for (int i = 0; i < intr->num_components; i++) {
-			dst[i] = create_driver_param(ctx, IR3_DP_LOCAL_GROUP_SIZE_X + i);
-		}
-		break;
-	case nir_intrinsic_discard_if:
-	case nir_intrinsic_discard: {
-		struct ir3_instruction *cond, *kill;
-
-		if (intr->intrinsic == nir_intrinsic_discard_if) {
-			/* conditional discard: */
-			src = get_src(ctx, &intr->src[0]);
-			cond = ir3_b2n(b, src[0]);
-		} else {
-			/* unconditional discard: */
-			cond = create_immed(b, 1);
-		}
-
-		/* NOTE: only cmps.*.* can write p0.x: */
-		cond = ir3_CMPS_S(b, cond, 0, create_immed(b, 0), 0);
-		cond->cat2.condition = IR3_COND_NE;
-
-		/* condition always goes in predicate register: */
-		cond->regs[0]->num = regid(REG_P0, 0);
-
-		kill = ir3_KILL(b, cond, 0);
-		array_insert(ctx->ir, ctx->ir->predicates, kill);
-
-		array_insert(b, b->keeps, kill);
-		ctx->so->has_kill = true;
-
-		break;
-	}
-	default:
-		compile_error(ctx, "Unhandled intrinsic type: %s\n",
-				nir_intrinsic_infos[intr->intrinsic].name);
-		break;
-	}
-
-	if (info->has_dest)
-		put_dst(ctx, &intr->dest);
-}
-
-static void
-emit_load_const(struct ir3_context *ctx, nir_load_const_instr *instr)
-{
-	struct ir3_instruction **dst = get_dst_ssa(ctx, &instr->def,
-			instr->def.num_components);
-	type_t type = (instr->def.bit_size < 32) ? TYPE_U16 : TYPE_U32;
-
-	for (int i = 0; i < instr->def.num_components; i++)
-		dst[i] = create_immed_typed(ctx->block, instr->value.u32[i], type);
-}
-
-static void
-emit_undef(struct ir3_context *ctx, nir_ssa_undef_instr *undef)
-{
-	struct ir3_instruction **dst = get_dst_ssa(ctx, &undef->def,
-			undef->def.num_components);
-	type_t type = (undef->def.bit_size < 32) ? TYPE_U16 : TYPE_U32;
-
-	/* backend doesn't want undefined instructions, so just plug
-	 * in 0.0..
-	 */
-	for (int i = 0; i < undef->def.num_components; i++)
-		dst[i] = create_immed_typed(ctx->block, fui(0.0), type);
-}
-
-/*
- * texture fetch/sample instructions:
- */
-
-static void
-tex_info(nir_tex_instr *tex, unsigned *flagsp, unsigned *coordsp)
-{
-	unsigned coords, flags = 0;
-
-	/* note: would use tex->coord_components.. except txs.. also,
-	 * since array index goes after shadow ref, we don't want to
-	 * count it:
-	 */
-	switch (tex->sampler_dim) {
-	case GLSL_SAMPLER_DIM_1D:
-	case GLSL_SAMPLER_DIM_BUF:
-		coords = 1;
-		break;
-	case GLSL_SAMPLER_DIM_2D:
-	case GLSL_SAMPLER_DIM_RECT:
-	case GLSL_SAMPLER_DIM_EXTERNAL:
-	case GLSL_SAMPLER_DIM_MS:
-		coords = 2;
-		break;
-	case GLSL_SAMPLER_DIM_3D:
-	case GLSL_SAMPLER_DIM_CUBE:
-		coords = 3;
-		flags |= IR3_INSTR_3D;
-		break;
-	default:
-		unreachable("bad sampler_dim");
-	}
-
-	if (tex->is_shadow && tex->op != nir_texop_lod)
-		flags |= IR3_INSTR_S;
-
-	if (tex->is_array && tex->op != nir_texop_lod)
-		flags |= IR3_INSTR_A;
-
-	*flagsp = flags;
-	*coordsp = coords;
-}
-
-static void
-emit_tex(struct ir3_context *ctx, nir_tex_instr *tex)
-{
-	struct ir3_block *b = ctx->block;
-	struct ir3_instruction **dst, *sam, *src0[12], *src1[4];
-	struct ir3_instruction * const *coord, * const *off, * const *ddx, * const *ddy;
-	struct ir3_instruction *lod, *compare, *proj, *sample_index;
-	bool has_bias = false, has_lod = false, has_proj = false, has_off = false;
-	unsigned i, coords, flags;
-	unsigned nsrc0 = 0, nsrc1 = 0;
-	type_t type;
-	opc_t opc = 0;
-
-	coord = off = ddx = ddy = NULL;
-	lod = proj = compare = sample_index = NULL;
-
-	/* TODO: might just be one component for gathers? */
-	dst = get_dst(ctx, &tex->dest, 4);
-
-	for (unsigned i = 0; i < tex->num_srcs; i++) {
-		switch (tex->src[i].src_type) {
-		case nir_tex_src_coord:
-			coord = get_src(ctx, &tex->src[i].src);
-			break;
-		case nir_tex_src_bias:
-			lod = get_src(ctx, &tex->src[i].src)[0];
-			has_bias = true;
-			break;
-		case nir_tex_src_lod:
-			lod = get_src(ctx, &tex->src[i].src)[0];
-			has_lod = true;
-			break;
-		case nir_tex_src_comparator: /* shadow comparator */
-			compare = get_src(ctx, &tex->src[i].src)[0];
-			break;
-		case nir_tex_src_projector:
-			proj = get_src(ctx, &tex->src[i].src)[0];
-			has_proj = true;
-			break;
-		case nir_tex_src_offset:
-			off = get_src(ctx, &tex->src[i].src);
-			has_off = true;
-			break;
-		case nir_tex_src_ddx:
-			ddx = get_src(ctx, &tex->src[i].src);
-			break;
-		case nir_tex_src_ddy:
-			ddy = get_src(ctx, &tex->src[i].src);
-			break;
-		case nir_tex_src_ms_index:
-			sample_index = get_src(ctx, &tex->src[i].src)[0];
-			break;
-		default:
-			compile_error(ctx, "Unhandled NIR tex src type: %d\n",
-					tex->src[i].src_type);
-			return;
-		}
-	}
-
-	switch (tex->op) {
-	case nir_texop_tex:      opc = has_lod ? OPC_SAML : OPC_SAM; break;
-	case nir_texop_txb:      opc = OPC_SAMB;     break;
-	case nir_texop_txl:      opc = OPC_SAML;     break;
-	case nir_texop_txd:      opc = OPC_SAMGQ;    break;
-	case nir_texop_txf:      opc = OPC_ISAML;    break;
-	case nir_texop_lod:      opc = OPC_GETLOD;   break;
-	case nir_texop_tg4:
-		/* NOTE: a4xx might need to emulate gather w/ txf (this is
-		 * what blob does, seems gather  is broken?), and a3xx did
-		 * not support it (but probably could also emulate).
-		 */
-		switch (tex->component) {
-		case 0:              opc = OPC_GATHER4R; break;
-		case 1:              opc = OPC_GATHER4G; break;
-		case 2:              opc = OPC_GATHER4B; break;
-		case 3:              opc = OPC_GATHER4A; break;
-		}
-		break;
-	case nir_texop_txf_ms:   opc = OPC_ISAMM;    break;
-	case nir_texop_txs:
-	case nir_texop_query_levels:
-	case nir_texop_texture_samples:
-	case nir_texop_samples_identical:
-	case nir_texop_txf_ms_mcs:
-		compile_error(ctx, "Unhandled NIR tex type: %d\n", tex->op);
-		return;
-	}
-
-	tex_info(tex, &flags, &coords);
-
-	/*
-	 * lay out the first argument in the proper order:
-	 *  - actual coordinates first
-	 *  - shadow reference
-	 *  - array index
-	 *  - projection w
-	 *  - starting at offset 4, dpdx.xy, dpdy.xy
-	 *
-	 * bias/lod go into the second arg
-	 */
-
-	/* insert tex coords: */
-	for (i = 0; i < coords; i++)
-		src0[i] = coord[i];
-
-	nsrc0 = i;
-
-	/* NOTE a3xx (and possibly a4xx?) might be different, using isaml
-	 * with scaled x coord according to requested sample:
-	 */
-	if (tex->op == nir_texop_txf_ms) {
-		if (ctx->compiler->txf_ms_with_isaml) {
-			/* the samples are laid out in x dimension as
-			 *     0 1 2 3
-			 * x_ms = (x << ms) + sample_index;
-			 */
-			struct ir3_instruction *ms;
-			ms = create_immed(b, (ctx->samples >> (2 * tex->texture_index)) & 3);
-
-			src0[0] = ir3_SHL_B(b, src0[0], 0, ms, 0);
-			src0[0] = ir3_ADD_U(b, src0[0], 0, sample_index, 0);
-
-			opc = OPC_ISAML;
-		} else {
-			src0[nsrc0++] = sample_index;
-		}
-	}
-
-	/* scale up integer coords for TXF based on the LOD */
-	if (ctx->compiler->unminify_coords && (opc == OPC_ISAML)) {
-		assert(has_lod);
-		for (i = 0; i < coords; i++)
-			src0[i] = ir3_SHL_B(b, src0[i], 0, lod, 0);
-	}
-
-	if (coords == 1) {
-		/* hw doesn't do 1d, so we treat it as 2d with
-		 * height of 1, and patch up the y coord.
-		 * TODO: y coord should be (int)0 in some cases..
-		 */
-		src0[nsrc0++] = create_immed(b, fui(0.5));
-	}
-
-	if (tex->is_shadow && tex->op != nir_texop_lod)
-		src0[nsrc0++] = compare;
-
-	if (tex->is_array && tex->op != nir_texop_lod) {
-		struct ir3_instruction *idx = coord[coords];
-
-		/* the array coord for cube arrays needs 0.5 added to it */
-		if (ctx->compiler->array_index_add_half && (opc != OPC_ISAML))
-			idx = ir3_ADD_F(b, idx, 0, create_immed(b, fui(0.5)), 0);
-
-		src0[nsrc0++] = idx;
-	}
-
-	if (has_proj) {
-		src0[nsrc0++] = proj;
-		flags |= IR3_INSTR_P;
-	}
-
-	/* pad to 4, then ddx/ddy: */
-	if (tex->op == nir_texop_txd) {
-		while (nsrc0 < 4)
-			src0[nsrc0++] = create_immed(b, fui(0.0));
-		for (i = 0; i < coords; i++)
-			src0[nsrc0++] = ddx[i];
-		if (coords < 2)
-			src0[nsrc0++] = create_immed(b, fui(0.0));
-		for (i = 0; i < coords; i++)
-			src0[nsrc0++] = ddy[i];
-		if (coords < 2)
-			src0[nsrc0++] = create_immed(b, fui(0.0));
-	}
-
-	/*
-	 * second argument (if applicable):
-	 *  - offsets
-	 *  - lod
-	 *  - bias
-	 */
-	if (has_off | has_lod | has_bias) {
-		if (has_off) {
-			unsigned off_coords = coords;
-			if (tex->sampler_dim == GLSL_SAMPLER_DIM_CUBE)
-				off_coords--;
-			for (i = 0; i < off_coords; i++)
-				src1[nsrc1++] = off[i];
-			if (off_coords < 2)
-				src1[nsrc1++] = create_immed(b, fui(0.0));
-			flags |= IR3_INSTR_O;
-		}
-
-		if (has_lod | has_bias)
-			src1[nsrc1++] = lod;
-	}
-
-	switch (tex->dest_type) {
-	case nir_type_invalid:
-	case nir_type_float:
-		type = TYPE_F32;
-		break;
-	case nir_type_int:
-		type = TYPE_S32;
-		break;
-	case nir_type_uint:
-	case nir_type_bool:
-		type = TYPE_U32;
-		break;
-	default:
-		unreachable("bad dest_type");
-	}
-
-	if (opc == OPC_GETLOD)
-		type = TYPE_U32;
-
-	unsigned tex_idx = tex->texture_index;
-
-	ctx->max_texture_index = MAX2(ctx->max_texture_index, tex_idx);
-
-	struct ir3_instruction *col0 = create_collect(ctx, src0, nsrc0);
-	struct ir3_instruction *col1 = create_collect(ctx, src1, nsrc1);
-
-	sam = ir3_SAM(b, opc, type, 0b1111, flags,
-			tex_idx, tex_idx, col0, col1);
-
-	if ((ctx->astc_srgb & (1 << tex_idx)) && !nir_tex_instr_is_query(tex)) {
-		/* only need first 3 components: */
-		sam->regs[0]->wrmask = 0x7;
-		split_dest(b, dst, sam, 0, 3);
-
-		/* we need to sample the alpha separately with a non-ASTC
-		 * texture state:
-		 */
-		sam = ir3_SAM(b, opc, type, 0b1000, flags,
-				tex_idx, tex_idx, col0, col1);
-
-		array_insert(ctx->ir, ctx->ir->astc_srgb, sam);
-
-		/* fixup .w component: */
-		split_dest(b, &dst[3], sam, 3, 1);
-	} else {
-		/* normal (non-workaround) case: */
-		split_dest(b, dst, sam, 0, 4);
-	}
-
-	/* GETLOD returns results in 4.8 fixed point */
-	if (opc == OPC_GETLOD) {
-		struct ir3_instruction *factor = create_immed(b, fui(1.0 / 256));
-
-		compile_assert(ctx, tex->dest_type == nir_type_float);
-		for (i = 0; i < 2; i++) {
-			dst[i] = ir3_MUL_F(b, ir3_COV(b, dst[i], TYPE_U32, TYPE_F32), 0,
-							   factor, 0);
-		}
-	}
-
-	put_dst(ctx, &tex->dest);
-}
-
-static void
-emit_tex_query_levels(struct ir3_context *ctx, nir_tex_instr *tex)
-{
-	struct ir3_block *b = ctx->block;
-	struct ir3_instruction **dst, *sam;
-
-	dst = get_dst(ctx, &tex->dest, 1);
-
-	sam = ir3_SAM(b, OPC_GETINFO, TYPE_U32, 0b0100, 0,
-			tex->texture_index, tex->texture_index, NULL, NULL);
-
-	/* even though there is only one component, since it ends
-	 * up in .z rather than .x, we need a split_dest()
-	 */
-	split_dest(b, dst, sam, 0, 3);
-
-	/* The # of levels comes from getinfo.z. We need to add 1 to it, since
-	 * the value in TEX_CONST_0 is zero-based.
-	 */
-	if (ctx->compiler->levels_add_one)
-		dst[0] = ir3_ADD_U(b, dst[0], 0, create_immed(b, 1), 0);
-
-	put_dst(ctx, &tex->dest);
-}
-
-static void
-emit_tex_txs(struct ir3_context *ctx, nir_tex_instr *tex)
-{
-	struct ir3_block *b = ctx->block;
-	struct ir3_instruction **dst, *sam;
-	struct ir3_instruction *lod;
-	unsigned flags, coords;
-
-	tex_info(tex, &flags, &coords);
-
-	/* Actually we want the number of dimensions, not coordinates. This
-	 * distinction only matters for cubes.
-	 */
-	if (tex->sampler_dim == GLSL_SAMPLER_DIM_CUBE)
-		coords = 2;
-
-	dst = get_dst(ctx, &tex->dest, 4);
-
-	compile_assert(ctx, tex->num_srcs == 1);
-	compile_assert(ctx, tex->src[0].src_type == nir_tex_src_lod);
-
-	lod = get_src(ctx, &tex->src[0].src)[0];
-
-	sam = ir3_SAM(b, OPC_GETSIZE, TYPE_U32, 0b1111, flags,
-			tex->texture_index, tex->texture_index, lod, NULL);
-
-	split_dest(b, dst, sam, 0, 4);
-
-	/* Array size actually ends up in .w rather than .z. This doesn't
-	 * matter for miplevel 0, but for higher mips the value in z is
-	 * minified whereas w stays. Also, the value in TEX_CONST_3_DEPTH is
-	 * returned, which means that we have to add 1 to it for arrays.
-	 */
-	if (tex->is_array) {
-		if (ctx->compiler->levels_add_one) {
-			dst[coords] = ir3_ADD_U(b, dst[3], 0, create_immed(b, 1), 0);
-		} else {
-			dst[coords] = ir3_MOV(b, dst[3], TYPE_U32);
-		}
-	}
-
-	put_dst(ctx, &tex->dest);
-}
-
-static void
-emit_jump(struct ir3_context *ctx, nir_jump_instr *jump)
-{
-	switch (jump->type) {
-	case nir_jump_break:
-	case nir_jump_continue:
-	case nir_jump_return:
-		/* I *think* we can simply just ignore this, and use the
-		 * successor block link to figure out where we need to
-		 * jump to for break/continue
-		 */
-		break;
-	default:
-		compile_error(ctx, "Unhandled NIR jump type: %d\n", jump->type);
-		break;
-	}
-}
-
-static void
-emit_instr(struct ir3_context *ctx, nir_instr *instr)
-{
-	switch (instr->type) {
-	case nir_instr_type_alu:
-		emit_alu(ctx, nir_instr_as_alu(instr));
-		break;
-	case nir_instr_type_deref:
-		/* ignored, handled as part of the intrinsic they are src to */
-		break;
-	case nir_instr_type_intrinsic:
-		emit_intrinsic(ctx, nir_instr_as_intrinsic(instr));
-		break;
-	case nir_instr_type_load_const:
-		emit_load_const(ctx, nir_instr_as_load_const(instr));
-		break;
-	case nir_instr_type_ssa_undef:
-		emit_undef(ctx, nir_instr_as_ssa_undef(instr));
-		break;
-	case nir_instr_type_tex: {
-		nir_tex_instr *tex = nir_instr_as_tex(instr);
-		/* couple tex instructions get special-cased:
-		 */
-		switch (tex->op) {
-		case nir_texop_txs:
-			emit_tex_txs(ctx, tex);
-			break;
-		case nir_texop_query_levels:
-			emit_tex_query_levels(ctx, tex);
-			break;
-		default:
-			emit_tex(ctx, tex);
-			break;
-		}
-		break;
-	}
-	case nir_instr_type_jump:
-		emit_jump(ctx, nir_instr_as_jump(instr));
-		break;
-	case nir_instr_type_phi:
-		/* we have converted phi webs to regs in NIR by now */
-		compile_error(ctx, "Unexpected NIR instruction type: %d\n", instr->type);
-		break;
-	case nir_instr_type_call:
-	case nir_instr_type_parallel_copy:
-		compile_error(ctx, "Unhandled NIR instruction type: %d\n", instr->type);
-		break;
-	}
-}
-
-static struct ir3_block *
-get_block(struct ir3_context *ctx, const nir_block *nblock)
-{
-	struct ir3_block *block;
-	struct hash_entry *hentry;
-	unsigned i;
-
-	hentry = _mesa_hash_table_search(ctx->block_ht, nblock);
-	if (hentry)
-		return hentry->data;
-
-	block = ir3_block_create(ctx->ir);
-	block->nblock = nblock;
-	_mesa_hash_table_insert(ctx->block_ht, nblock, block);
-
-	block->predecessors_count = nblock->predecessors->entries;
-	block->predecessors = ralloc_array_size(block,
-		sizeof(block->predecessors[0]), block->predecessors_count);
-	i = 0;
-	set_foreach(nblock->predecessors, sentry) {
-		block->predecessors[i++] = get_block(ctx, sentry->key);
-	}
-
-	return block;
-}
-
-static void
-emit_block(struct ir3_context *ctx, nir_block *nblock)
-{
-	struct ir3_block *block = get_block(ctx, nblock);
-
-	for (int i = 0; i < ARRAY_SIZE(block->successors); i++) {
-		if (nblock->successors[i]) {
-			block->successors[i] =
-				get_block(ctx, nblock->successors[i]);
-		}
-	}
-
-	ctx->block = block;
-	list_addtail(&block->node, &ctx->ir->block_list);
-
-	/* re-emit addr register in each block if needed: */
-	for (int i = 0; i < ARRAY_SIZE(ctx->addr_ht); i++) {
-		_mesa_hash_table_destroy(ctx->addr_ht[i], NULL);
-		ctx->addr_ht[i] = NULL;
-	}
-
-	nir_foreach_instr(instr, nblock) {
-		ctx->cur_instr = instr;
-		emit_instr(ctx, instr);
-		ctx->cur_instr = NULL;
-		if (ctx->error)
-			return;
-	}
-}
-
-static void emit_cf_list(struct ir3_context *ctx, struct exec_list *list);
-
-static void
-emit_if(struct ir3_context *ctx, nir_if *nif)
-{
-	struct ir3_instruction *condition = get_src(ctx, &nif->condition)[0];
-
-	ctx->block->condition =
-		get_predicate(ctx, ir3_b2n(condition->block, condition));
-
-	emit_cf_list(ctx, &nif->then_list);
-	emit_cf_list(ctx, &nif->else_list);
-}
-
-static void
-emit_loop(struct ir3_context *ctx, nir_loop *nloop)
-{
-	emit_cf_list(ctx, &nloop->body);
-}
-
-static void
-emit_cf_list(struct ir3_context *ctx, struct exec_list *list)
-{
-	foreach_list_typed(nir_cf_node, node, node, list) {
-		switch (node->type) {
-		case nir_cf_node_block:
-			emit_block(ctx, nir_cf_node_as_block(node));
-			break;
-		case nir_cf_node_if:
-			emit_if(ctx, nir_cf_node_as_if(node));
-			break;
-		case nir_cf_node_loop:
-			emit_loop(ctx, nir_cf_node_as_loop(node));
-			break;
-		case nir_cf_node_function:
-			compile_error(ctx, "TODO\n");
-			break;
-		}
-	}
-}
-
-/* emit stream-out code.  At this point, the current block is the original
- * (nir) end block, and nir ensures that all flow control paths terminate
- * into the end block.  We re-purpose the original end block to generate
- * the 'if (vtxcnt < maxvtxcnt)' condition, then append the conditional
- * block holding stream-out write instructions, followed by the new end
- * block:
- *
- *   blockOrigEnd {
- *      p0.x = (vtxcnt < maxvtxcnt)
- *      // succs: blockStreamOut, blockNewEnd
- *   }
- *   blockStreamOut {
- *      ... stream-out instructions ...
- *      // succs: blockNewEnd
- *   }
- *   blockNewEnd {
- *   }
- */
-static void
-emit_stream_out(struct ir3_context *ctx)
-{
-	struct ir3_shader_variant *v = ctx->so;
-	struct ir3 *ir = ctx->ir;
-	struct ir3_stream_output_info *strmout =
-			&ctx->so->shader->stream_output;
-	struct ir3_block *orig_end_block, *stream_out_block, *new_end_block;
-	struct ir3_instruction *vtxcnt, *maxvtxcnt, *cond;
-	struct ir3_instruction *bases[IR3_MAX_SO_BUFFERS];
-
-	/* create vtxcnt input in input block at top of shader,
-	 * so that it is seen as live over the entire duration
-	 * of the shader:
-	 */
-	vtxcnt = create_input(ctx, 0);
-	add_sysval_input(ctx, SYSTEM_VALUE_VERTEX_CNT, vtxcnt);
-
-	maxvtxcnt = create_driver_param(ctx, IR3_DP_VTXCNT_MAX);
-
-	/* at this point, we are at the original 'end' block,
-	 * re-purpose this block to stream-out condition, then
-	 * append stream-out block and new-end block
-	 */
-	orig_end_block = ctx->block;
-
-// TODO these blocks need to update predecessors..
-// maybe w/ store_global intrinsic, we could do this
-// stuff in nir->nir pass
-
-	stream_out_block = ir3_block_create(ir);
-	list_addtail(&stream_out_block->node, &ir->block_list);
-
-	new_end_block = ir3_block_create(ir);
-	list_addtail(&new_end_block->node, &ir->block_list);
-
-	orig_end_block->successors[0] = stream_out_block;
-	orig_end_block->successors[1] = new_end_block;
-	stream_out_block->successors[0] = new_end_block;
-
-	/* setup 'if (vtxcnt < maxvtxcnt)' condition: */
-	cond = ir3_CMPS_S(ctx->block, vtxcnt, 0, maxvtxcnt, 0);
-	cond->regs[0]->num = regid(REG_P0, 0);
-	cond->cat2.condition = IR3_COND_LT;
-
-	/* condition goes on previous block to the conditional,
-	 * since it is used to pick which of the two successor
-	 * paths to take:
-	 */
-	orig_end_block->condition = cond;
-
-	/* switch to stream_out_block to generate the stream-out
-	 * instructions:
-	 */
-	ctx->block = stream_out_block;
-
-	/* Calculate base addresses based on vtxcnt.  Instructions
-	 * generated for bases not used in following loop will be
-	 * stripped out in the backend.
-	 */
-	for (unsigned i = 0; i < IR3_MAX_SO_BUFFERS; i++) {
-		unsigned stride = strmout->stride[i];
-		struct ir3_instruction *base, *off;
-
-		base = create_uniform(ctx, regid(v->constbase.tfbo, i));
-
-		/* 24-bit should be enough: */
-		off = ir3_MUL_U(ctx->block, vtxcnt, 0,
-				create_immed(ctx->block, stride * 4), 0);
-
-		bases[i] = ir3_ADD_S(ctx->block, off, 0, base, 0);
-	}
-
-	/* Generate the per-output store instructions: */
-	for (unsigned i = 0; i < strmout->num_outputs; i++) {
-		for (unsigned j = 0; j < strmout->output[i].num_components; j++) {
-			unsigned c = j + strmout->output[i].start_component;
-			struct ir3_instruction *base, *out, *stg;
-
-			base = bases[strmout->output[i].output_buffer];
-			out = ctx->ir->outputs[regid(strmout->output[i].register_index, c)];
-
-			stg = ir3_STG(ctx->block, base, 0, out, 0,
-					create_immed(ctx->block, 1), 0);
-			stg->cat6.type = TYPE_U32;
-			stg->cat6.dst_offset = (strmout->output[i].dst_offset + j) * 4;
-
-			array_insert(ctx->block, ctx->block->keeps, stg);
-		}
-	}
-
-	/* and finally switch to the new_end_block: */
-	ctx->block = new_end_block;
-}
-
-static void
-emit_function(struct ir3_context *ctx, nir_function_impl *impl)
-{
-	nir_metadata_require(impl, nir_metadata_block_index);
-
-	emit_cf_list(ctx, &impl->body);
-	emit_block(ctx, impl->end_block);
-
-	/* at this point, we should have a single empty block,
-	 * into which we emit the 'end' instruction.
-	 */
-	compile_assert(ctx, list_empty(&ctx->block->instr_list));
-
-	/* If stream-out (aka transform-feedback) enabled, emit the
-	 * stream-out instructions, followed by a new empty block (into
-	 * which the 'end' instruction lands).
-	 *
-	 * NOTE: it is done in this order, rather than inserting before
-	 * we emit end_block, because NIR guarantees that all blocks
-	 * flow into end_block, and that end_block has no successors.
-	 * So by re-purposing end_block as the first block of stream-
-	 * out, we guarantee that all exit paths flow into the stream-
-	 * out instructions.
-	 */
-	if ((ctx->compiler->gpu_id < 500) &&
-			(ctx->so->shader->stream_output.num_outputs > 0) &&
-			!ctx->so->binning_pass) {
-		debug_assert(ctx->so->type == MESA_SHADER_VERTEX);
-		emit_stream_out(ctx);
-	}
-
-	ir3_END(ctx->block);
-}
-
-static struct ir3_instruction *
-create_frag_coord(struct ir3_context *ctx, unsigned comp)
-{
-	struct ir3_block *block = ctx->block;
-	struct ir3_instruction *instr;
-
-	if (!ctx->frag_coord) {
-		ctx->frag_coord = create_input_compmask(ctx, 0, 0xf);
-		/* defer add_sysval_input() until after all inputs created */
-	}
-
-	split_dest(block, &instr, ctx->frag_coord, comp, 1);
-
-	switch (comp) {
-	case 0: /* .x */
-	case 1: /* .y */
-		/* for frag_coord, we get unsigned values.. we need
-		 * to subtract (integer) 8 and divide by 16 (right-
-		 * shift by 4) then convert to float:
-		 *
-		 *    sub.s tmp, src, 8
-		 *    shr.b tmp, tmp, 4
-		 *    mov.u32f32 dst, tmp
-		 *
-		 */
-		instr = ir3_SUB_S(block, instr, 0,
-				create_immed(block, 8), 0);
-		instr = ir3_SHR_B(block, instr, 0,
-				create_immed(block, 4), 0);
-		instr = ir3_COV(block, instr, TYPE_U32, TYPE_F32);
-
-		return instr;
-	case 2: /* .z */
-	case 3: /* .w */
-	default:
-		/* seems that we can use these as-is: */
-		return instr;
-	}
-}
-
-static void
-setup_input(struct ir3_context *ctx, nir_variable *in)
-{
-	struct ir3_shader_variant *so = ctx->so;
-	unsigned ncomp = glsl_get_components(in->type);
-	unsigned n = in->data.driver_location;
-	unsigned slot = in->data.location;
-
-	/* let's pretend things other than vec4 don't exist: */
-	ncomp = MAX2(ncomp, 4);
-
-	/* skip unread inputs, we could end up with (for example), unsplit
-	 * matrix/etc inputs in the case they are not read, so just silently
-	 * skip these.
-	 */
-	if (ncomp > 4)
-		return;
-
-	compile_assert(ctx, ncomp == 4);
-
-	so->inputs[n].slot = slot;
-	so->inputs[n].compmask = (1 << ncomp) - 1;
-	so->inputs_count = MAX2(so->inputs_count, n + 1);
-	so->inputs[n].interpolate = in->data.interpolation;
-
-	if (ctx->so->type == MESA_SHADER_FRAGMENT) {
-		for (int i = 0; i < ncomp; i++) {
-			struct ir3_instruction *instr = NULL;
-			unsigned idx = (n * 4) + i;
-
-			if (slot == VARYING_SLOT_POS) {
-				so->inputs[n].bary = false;
-				so->frag_coord = true;
-				instr = create_frag_coord(ctx, i);
-			} else if (slot == VARYING_SLOT_PNTC) {
-				/* see for example st_nir_fixup_varying_slots().. this is
-				 * maybe a bit mesa/st specific.  But we need things to line
-				 * up for this in fdN_program:
-				 *    unsigned texmask = 1 << (slot - VARYING_SLOT_VAR0);
-				 *    if (emit->sprite_coord_enable & texmask) {
-				 *       ...
-				 *    }
-				 */
-				so->inputs[n].slot = VARYING_SLOT_VAR8;
-				so->inputs[n].bary = true;
-				instr = create_frag_input(ctx, false);
-			} else {
-				bool use_ldlv = false;
-
-				/* detect the special case for front/back colors where
-				 * we need to do flat vs smooth shading depending on
-				 * rast state:
-				 */
-				if (in->data.interpolation == INTERP_MODE_NONE) {
-					switch (slot) {
-					case VARYING_SLOT_COL0:
-					case VARYING_SLOT_COL1:
-					case VARYING_SLOT_BFC0:
-					case VARYING_SLOT_BFC1:
-						so->inputs[n].rasterflat = true;
-						break;
-					default:
-						break;
-					}
-				}
-
-				if (ctx->compiler->flat_bypass) {
-					if ((so->inputs[n].interpolate == INTERP_MODE_FLAT) ||
-							(so->inputs[n].rasterflat && ctx->so->key.rasterflat))
-						use_ldlv = true;
-				}
-
-				so->inputs[n].bary = true;
-
-				instr = create_frag_input(ctx, use_ldlv);
-			}
-
-			compile_assert(ctx, idx < ctx->ir->ninputs);
-
-			ctx->ir->inputs[idx] = instr;
-		}
-	} else if (ctx->so->type == MESA_SHADER_VERTEX) {
-		for (int i = 0; i < ncomp; i++) {
-			unsigned idx = (n * 4) + i;
-			compile_assert(ctx, idx < ctx->ir->ninputs);
-			ctx->ir->inputs[idx] = create_input(ctx, idx);
-		}
-	} else {
-		compile_error(ctx, "unknown shader type: %d\n", ctx->so->type);
-	}
-
-	if (so->inputs[n].bary || (ctx->so->type == MESA_SHADER_VERTEX)) {
-		so->total_in += ncomp;
-	}
-}
-
-static void
-setup_output(struct ir3_context *ctx, nir_variable *out)
-{
-	struct ir3_shader_variant *so = ctx->so;
-	unsigned ncomp = glsl_get_components(out->type);
-	unsigned n = out->data.driver_location;
-	unsigned slot = out->data.location;
-	unsigned comp = 0;
-
-	/* let's pretend things other than vec4 don't exist: */
-	ncomp = MAX2(ncomp, 4);
-	compile_assert(ctx, ncomp == 4);
-
-	if (ctx->so->type == MESA_SHADER_FRAGMENT) {
-		switch (slot) {
-		case FRAG_RESULT_DEPTH:
-			comp = 2;  /* tgsi will write to .z component */
-			so->writes_pos = true;
-			break;
-		case FRAG_RESULT_COLOR:
-			so->color0_mrt = 1;
-			break;
-		default:
-			if (slot >= FRAG_RESULT_DATA0)
-				break;
-			compile_error(ctx, "unknown FS output name: %s\n",
-					gl_frag_result_name(slot));
-		}
-	} else if (ctx->so->type == MESA_SHADER_VERTEX) {
-		switch (slot) {
-		case VARYING_SLOT_POS:
-			so->writes_pos = true;
-			break;
-		case VARYING_SLOT_PSIZ:
-			so->writes_psize = true;
-			break;
-		case VARYING_SLOT_COL0:
-		case VARYING_SLOT_COL1:
-		case VARYING_SLOT_BFC0:
-		case VARYING_SLOT_BFC1:
-		case VARYING_SLOT_FOGC:
-		case VARYING_SLOT_CLIP_DIST0:
-		case VARYING_SLOT_CLIP_DIST1:
-		case VARYING_SLOT_CLIP_VERTEX:
-			break;
-		default:
-			if (slot >= VARYING_SLOT_VAR0)
-				break;
-			if ((VARYING_SLOT_TEX0 <= slot) && (slot <= VARYING_SLOT_TEX7))
-				break;
-			compile_error(ctx, "unknown VS output name: %s\n",
-					gl_varying_slot_name(slot));
-		}
-	} else {
-		compile_error(ctx, "unknown shader type: %d\n", ctx->so->type);
-	}
-
-	compile_assert(ctx, n < ARRAY_SIZE(so->outputs));
-
-	so->outputs[n].slot = slot;
-	so->outputs[n].regid = regid(n, comp);
-	so->outputs_count = MAX2(so->outputs_count, n + 1);
-
-	for (int i = 0; i < ncomp; i++) {
-		unsigned idx = (n * 4) + i;
-		compile_assert(ctx, idx < ctx->ir->noutputs);
-		ctx->ir->outputs[idx] = create_immed(ctx->block, fui(0.0));
-	}
-}
-
-static int
-max_drvloc(struct exec_list *vars)
-{
-	int drvloc = -1;
-	nir_foreach_variable(var, vars) {
-		drvloc = MAX2(drvloc, (int)var->data.driver_location);
-	}
-	return drvloc;
-}
-
-static const unsigned max_sysvals[] = {
-	[MESA_SHADER_FRAGMENT] = 24,  // TODO
-	[MESA_SHADER_VERTEX]  = 16,
-	[MESA_SHADER_COMPUTE] = 16, // TODO how many do we actually need?
-};
-
-static void
-emit_instructions(struct ir3_context *ctx)
-{
-	unsigned ninputs, noutputs;
-	nir_function_impl *fxn = nir_shader_get_entrypoint(ctx->s);
-
-	ninputs  = (max_drvloc(&ctx->s->inputs) + 1) * 4;
-	noutputs = (max_drvloc(&ctx->s->outputs) + 1) * 4;
-
-	/* we need to leave room for sysvals:
-	 */
-	ninputs += max_sysvals[ctx->so->type];
-
-	ctx->ir = ir3_create(ctx->compiler, ninputs, noutputs);
-
-	/* Create inputs in first block: */
-	ctx->block = get_block(ctx, nir_start_block(fxn));
-	ctx->in_block = ctx->block;
-	list_addtail(&ctx->block->node, &ctx->ir->block_list);
-
-	ninputs -= max_sysvals[ctx->so->type];
-
-	/* for fragment shader, the vcoord input register is used as the
-	 * base for bary.f varying fetch instrs:
-	 */
-	struct ir3_instruction *vcoord = NULL;
-	if (ctx->so->type == MESA_SHADER_FRAGMENT) {
-		struct ir3_instruction *xy[2];
-
-		vcoord = create_input_compmask(ctx, 0, 0x3);
-		split_dest(ctx->block, xy, vcoord, 0, 2);
-
-		ctx->frag_vcoord = create_collect(ctx, xy, 2);
-	}
-
-	/* Setup inputs: */
-	nir_foreach_variable(var, &ctx->s->inputs) {
-		setup_input(ctx, var);
-	}
-
-	/* Defer add_sysval_input() stuff until after setup_inputs(),
-	 * because sysvals need to be appended after varyings:
-	 */
-	if (vcoord) {
-		add_sysval_input_compmask(ctx, SYSTEM_VALUE_VARYING_COORD,
-				0x3, vcoord);
-	}
-
-	if (ctx->frag_coord) {
-		add_sysval_input_compmask(ctx, SYSTEM_VALUE_FRAG_COORD,
-				0xf, ctx->frag_coord);
-	}
-
-	/* Setup outputs: */
-	nir_foreach_variable(var, &ctx->s->outputs) {
-		setup_output(ctx, var);
-	}
-
-	/* Setup registers (which should only be arrays): */
-	nir_foreach_register(reg, &ctx->s->registers) {
-		declare_array(ctx, reg);
-	}
-
-	/* NOTE: need to do something more clever when we support >1 fxn */
-	nir_foreach_register(reg, &fxn->registers) {
-		declare_array(ctx, reg);
-	}
-	/* And emit the body: */
-	ctx->impl = fxn;
-	emit_function(ctx, fxn);
-}
-
-/* from NIR perspective, we actually have varying inputs.  But the varying
- * inputs, from an IR standpoint, are just bary.f/ldlv instructions.  The
- * only actual inputs are the sysvals.
- */
-static void
-fixup_frag_inputs(struct ir3_context *ctx)
-{
-	struct ir3_shader_variant *so = ctx->so;
-	struct ir3 *ir = ctx->ir;
-	unsigned i = 0;
-
-	/* sysvals should appear at the end of the inputs, drop everything else: */
-	while ((i < so->inputs_count) && !so->inputs[i].sysval)
-		i++;
-
-	/* at IR level, inputs are always blocks of 4 scalars: */
-	i *= 4;
-
-	ir->inputs = &ir->inputs[i];
-	ir->ninputs -= i;
-}
-
-/* Fixup tex sampler state for astc/srgb workaround instructions.  We
- * need to assign the tex state indexes for these after we know the
- * max tex index.
- */
-static void
-fixup_astc_srgb(struct ir3_context *ctx)
-{
-	struct ir3_shader_variant *so = ctx->so;
-	/* indexed by original tex idx, value is newly assigned alpha sampler
-	 * state tex idx.  Zero is invalid since there is at least one sampler
-	 * if we get here.
-	 */
-	unsigned alt_tex_state[16] = {0};
-	unsigned tex_idx = ctx->max_texture_index + 1;
-	unsigned idx = 0;
-
-	so->astc_srgb.base = tex_idx;
-
-	for (unsigned i = 0; i < ctx->ir->astc_srgb_count; i++) {
-		struct ir3_instruction *sam = ctx->ir->astc_srgb[i];
-
-		compile_assert(ctx, sam->cat5.tex < ARRAY_SIZE(alt_tex_state));
-
-		if (alt_tex_state[sam->cat5.tex] == 0) {
-			/* assign new alternate/alpha tex state slot: */
-			alt_tex_state[sam->cat5.tex] = tex_idx++;
-			so->astc_srgb.orig_idx[idx++] = sam->cat5.tex;
-			so->astc_srgb.count++;
-		}
-
-		sam->cat5.tex = alt_tex_state[sam->cat5.tex];
-	}
-}
-
-static void
-fixup_binning_pass(struct ir3_context *ctx)
-{
-	struct ir3_shader_variant *so = ctx->so;
-	struct ir3 *ir = ctx->ir;
-	unsigned i, j;
-
-	for (i = 0, j = 0; i < so->outputs_count; i++) {
-		unsigned slot = so->outputs[i].slot;
-
-		/* throw away everything but first position/psize */
-		if ((slot == VARYING_SLOT_POS) || (slot == VARYING_SLOT_PSIZ)) {
-			if (i != j) {
-				so->outputs[j] = so->outputs[i];
-				ir->outputs[(j*4)+0] = ir->outputs[(i*4)+0];
-				ir->outputs[(j*4)+1] = ir->outputs[(i*4)+1];
-				ir->outputs[(j*4)+2] = ir->outputs[(i*4)+2];
-				ir->outputs[(j*4)+3] = ir->outputs[(i*4)+3];
-			}
-			j++;
-		}
-	}
-	so->outputs_count = j;
-	ir->noutputs = j * 4;
-}
-
-int
-ir3_compile_shader_nir(struct ir3_compiler *compiler,
-		struct ir3_shader_variant *so)
-{
-	struct ir3_context *ctx;
-	struct ir3 *ir;
-	struct ir3_instruction **inputs;
-	unsigned i, actual_in, inloc;
-	int ret = 0, max_bary;
-
-	assert(!so->ir);
-
-	ctx = compile_init(compiler, so);
-	if (!ctx) {
-		DBG("INIT failed!");
-		ret = -1;
-		goto out;
-	}
-
-	emit_instructions(ctx);
-
-	if (ctx->error) {
-		DBG("EMIT failed!");
-		ret = -1;
-		goto out;
-	}
-
-	ir = so->ir = ctx->ir;
-
-	/* keep track of the inputs from TGSI perspective.. */
-	inputs = ir->inputs;
-
-	/* but fixup actual inputs for frag shader: */
-	if (so->type == MESA_SHADER_FRAGMENT)
-		fixup_frag_inputs(ctx);
-
-	/* at this point, for binning pass, throw away unneeded outputs: */
-	if (so->binning_pass && (ctx->compiler->gpu_id < 600))
-		fixup_binning_pass(ctx);
-
-	/* if we want half-precision outputs, mark the output registers
-	 * as half:
-	 */
-	if (so->key.half_precision) {
-		for (i = 0; i < ir->noutputs; i++) {
-			struct ir3_instruction *out = ir->outputs[i];
-
-			if (!out)
-				continue;
-
-			/* if frag shader writes z, that needs to be full precision: */
-			if (so->outputs[i/4].slot == FRAG_RESULT_DEPTH)
-				continue;
-
-			out->regs[0]->flags |= IR3_REG_HALF;
-			/* output could be a fanout (ie. texture fetch output)
-			 * in which case we need to propagate the half-reg flag
-			 * up to the definer so that RA sees it:
-			 */
-			if (out->opc == OPC_META_FO) {
-				out = out->regs[1]->instr;
-				out->regs[0]->flags |= IR3_REG_HALF;
-			}
-
-			if (out->opc == OPC_MOV) {
-				out->cat1.dst_type = half_type(out->cat1.dst_type);
-			}
-		}
-	}
-
-	if (ir3_shader_debug & IR3_DBG_OPTMSGS) {
-		printf("BEFORE CP:\n");
-		ir3_print(ir);
-	}
-
-	ir3_cp(ir, so);
-
-	/* at this point, for binning pass, throw away unneeded outputs:
-	 * Note that for a6xx and later, we do this after ir3_cp to ensure
-	 * that the uniform/constant layout for BS and VS matches, so that
-	 * we can re-use same VS_CONST state group.
-	 */
-	if (so->binning_pass && (ctx->compiler->gpu_id >= 600))
-		fixup_binning_pass(ctx);
-
-	/* Insert mov if there's same instruction for each output.
-	 * eg. dEQP-GLES31.functional.shaders.opaque_type_indexing.sampler.const_expression.vertex.sampler2dshadow
-	 */
-	for (int i = ir->noutputs - 1; i >= 0; i--) {
-		if (!ir->outputs[i])
-			continue;
-		for (unsigned j = 0; j < i; j++) {
-			if (ir->outputs[i] == ir->outputs[j]) {
-				ir->outputs[i] =
-					ir3_MOV(ir->outputs[i]->block, ir->outputs[i], TYPE_F32);
-			}
-		}
-	}
-
-	if (ir3_shader_debug & IR3_DBG_OPTMSGS) {
-		printf("BEFORE GROUPING:\n");
-		ir3_print(ir);
-	}
-
-	ir3_sched_add_deps(ir);
-
-	/* Group left/right neighbors, inserting mov's where needed to
-	 * solve conflicts:
-	 */
-	ir3_group(ir);
-
-	if (ir3_shader_debug & IR3_DBG_OPTMSGS) {
-		printf("AFTER GROUPING:\n");
-		ir3_print(ir);
-	}
-
-	ir3_depth(ir);
-
-	if (ir3_shader_debug & IR3_DBG_OPTMSGS) {
-		printf("AFTER DEPTH:\n");
-		ir3_print(ir);
-	}
-
-	ret = ir3_sched(ir);
-	if (ret) {
-		DBG("SCHED failed!");
-		goto out;
-	}
-
-	if (ir3_shader_debug & IR3_DBG_OPTMSGS) {
-		printf("AFTER SCHED:\n");
-		ir3_print(ir);
-	}
-
-	ret = ir3_ra(ir, so->type, so->frag_coord, so->frag_face);
-	if (ret) {
-		DBG("RA failed!");
-		goto out;
-	}
-
-	if (ir3_shader_debug & IR3_DBG_OPTMSGS) {
-		printf("AFTER RA:\n");
-		ir3_print(ir);
-	}
-
-	/* fixup input/outputs: */
-	for (i = 0; i < so->outputs_count; i++) {
-		so->outputs[i].regid = ir->outputs[i*4]->regs[0]->num;
-	}
-
-	/* Note that some or all channels of an input may be unused: */
-	actual_in = 0;
-	inloc = 0;
-	for (i = 0; i < so->inputs_count; i++) {
-		unsigned j, reg = regid(63,0), compmask = 0, maxcomp = 0;
-		so->inputs[i].ncomp = 0;
-		so->inputs[i].inloc = inloc;
-		for (j = 0; j < 4; j++) {
-			struct ir3_instruction *in = inputs[(i*4) + j];
-			if (in && !(in->flags & IR3_INSTR_UNUSED)) {
-				compmask |= (1 << j);
-				reg = in->regs[0]->num - j;
-				actual_in++;
-				so->inputs[i].ncomp++;
-				if ((so->type == MESA_SHADER_FRAGMENT) && so->inputs[i].bary) {
-					/* assign inloc: */
-					assert(in->regs[1]->flags & IR3_REG_IMMED);
-					in->regs[1]->iim_val = inloc + j;
-					maxcomp = j + 1;
-				}
-			}
-		}
-		if ((so->type == MESA_SHADER_FRAGMENT) && compmask && so->inputs[i].bary) {
-			so->varying_in++;
-			so->inputs[i].compmask = (1 << maxcomp) - 1;
-			inloc += maxcomp;
-		} else if (!so->inputs[i].sysval) {
-			so->inputs[i].compmask = compmask;
-		}
-		so->inputs[i].regid = reg;
-	}
-
-	if (ctx->astc_srgb)
-		fixup_astc_srgb(ctx);
-
-	/* We need to do legalize after (for frag shader's) the "bary.f"
-	 * offsets (inloc) have been assigned.
-	 */
-	ir3_legalize(ir, &so->num_samp, &so->has_ssbo, &max_bary);
-
-	if (ir3_shader_debug & IR3_DBG_OPTMSGS) {
-		printf("AFTER LEGALIZE:\n");
-		ir3_print(ir);
-	}
-
-	/* Note that actual_in counts inputs that are not bary.f'd for FS: */
-	if (so->type == MESA_SHADER_VERTEX)
-		so->total_in = actual_in;
-	else
-		so->total_in = max_bary + 1;
-
-out:
-	if (ret) {
-		if (so->ir)
-			ir3_destroy(so->ir);
-		so->ir = NULL;
-	}
-	compile_free(ctx);
-
-	return ret;
-}
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_cp.c b/src/gallium/drivers/freedreno/ir3/ir3_cp.c
deleted file mode 100644
index e8e8cc311e3..00000000000
--- a/src/gallium/drivers/freedreno/ir3/ir3_cp.c
+++ /dev/null
@@ -1,653 +0,0 @@
-/*
- * Copyright (C) 2014 Rob Clark <[email protected]>
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- *
- * Authors:
- *    Rob Clark <[email protected]>
- */
-
-#include <math.h>
-
-#include "ir3.h"
-#include "ir3_shader.h"
-
-/*
- * Copy Propagate:
- */
-
-struct ir3_cp_ctx {
-	struct ir3 *shader;
-	struct ir3_shader_variant *so;
-	unsigned immediate_idx;
-};
-
-/* is it a type preserving mov, with ok flags? */
-static bool is_eligible_mov(struct ir3_instruction *instr, bool allow_flags)
-{
-	if (is_same_type_mov(instr)) {
-		struct ir3_register *dst = instr->regs[0];
-		struct ir3_register *src = instr->regs[1];
-		struct ir3_instruction *src_instr = ssa(src);
-
-		/* only if mov src is SSA (not const/immed): */
-		if (!src_instr)
-			return false;
-
-		/* no indirect: */
-		if (dst->flags & IR3_REG_RELATIV)
-			return false;
-		if (src->flags & IR3_REG_RELATIV)
-			return false;
-
-		if (src->flags & IR3_REG_ARRAY)
-			return false;
-
-		if (!allow_flags)
-			if (src->flags & (IR3_REG_FABS | IR3_REG_FNEG |
-					IR3_REG_SABS | IR3_REG_SNEG | IR3_REG_BNOT))
-				return false;
-
-		/* TODO: remove this hack: */
-		if (src_instr->opc == OPC_META_FO)
-			return false;
-
-		return true;
-	}
-	return false;
-}
-
-static unsigned cp_flags(unsigned flags)
-{
-	/* only considering these flags (at least for now): */
-	flags &= (IR3_REG_CONST | IR3_REG_IMMED |
-			IR3_REG_FNEG | IR3_REG_FABS |
-			IR3_REG_SNEG | IR3_REG_SABS |
-			IR3_REG_BNOT | IR3_REG_RELATIV);
-	return flags;
-}
-
-static bool valid_flags(struct ir3_instruction *instr, unsigned n,
-		unsigned flags)
-{
-	unsigned valid_flags;
-	flags = cp_flags(flags);
-
-	/* If destination is indirect, then source cannot be.. at least
-	 * I don't think so..
-	 */
-	if ((instr->regs[0]->flags & IR3_REG_RELATIV) &&
-			(flags & IR3_REG_RELATIV))
-		return false;
-
-	/* TODO it seems to *mostly* work to cp RELATIV, except we get some
-	 * intermittent piglit variable-indexing fails.  Newer blob driver
-	 * doesn't seem to cp these.  Possibly this is hw workaround?  Not
-	 * sure, but until that is understood better, lets just switch off
-	 * cp for indirect src's:
-	 */
-	if (flags & IR3_REG_RELATIV)
-		return false;
-
-	switch (opc_cat(instr->opc)) {
-	case 1:
-		valid_flags = IR3_REG_IMMED | IR3_REG_CONST | IR3_REG_RELATIV;
-		if (flags & ~valid_flags)
-			return false;
-		break;
-	case 2:
-		valid_flags = ir3_cat2_absneg(instr->opc) |
-				IR3_REG_CONST | IR3_REG_RELATIV;
-
-		if (ir3_cat2_int(instr->opc))
-			valid_flags |= IR3_REG_IMMED;
-
-		if (flags & ~valid_flags)
-			return false;
-
-		if (flags & (IR3_REG_CONST | IR3_REG_IMMED)) {
-			unsigned m = (n ^ 1) + 1;
-			/* cannot deal w/ const in both srcs:
-			 * (note that some cat2 actually only have a single src)
-			 */
-			if (m < instr->regs_count) {
-				struct ir3_register *reg = instr->regs[m];
-				if ((flags & IR3_REG_CONST) && (reg->flags & IR3_REG_CONST))
-					return false;
-				if ((flags & IR3_REG_IMMED) && (reg->flags & IR3_REG_IMMED))
-					return false;
-			}
-			/* cannot be const + ABS|NEG: */
-			if (flags & (IR3_REG_FABS | IR3_REG_FNEG |
-					IR3_REG_SABS | IR3_REG_SNEG | IR3_REG_BNOT))
-				return false;
-		}
-		break;
-	case 3:
-		valid_flags = ir3_cat3_absneg(instr->opc) |
-				IR3_REG_CONST | IR3_REG_RELATIV;
-
-		if (flags & ~valid_flags)
-			return false;
-
-		if (flags & (IR3_REG_CONST | IR3_REG_RELATIV)) {
-			/* cannot deal w/ const/relativ in 2nd src: */
-			if (n == 1)
-				return false;
-		}
-
-		if (flags & IR3_REG_CONST) {
-			/* cannot be const + ABS|NEG: */
-			if (flags & (IR3_REG_FABS | IR3_REG_FNEG |
-					IR3_REG_SABS | IR3_REG_SNEG | IR3_REG_BNOT))
-				return false;
-		}
-		break;
-	case 4:
-		/* seems like blob compiler avoids const as src.. */
-		/* TODO double check if this is still the case on a4xx */
-		if (flags & (IR3_REG_CONST | IR3_REG_IMMED))
-			return false;
-		if (flags & (IR3_REG_SABS | IR3_REG_SNEG))
-			return false;
-		break;
-	case 5:
-		/* no flags allowed */
-		if (flags)
-			return false;
-		break;
-	case 6:
-		valid_flags = IR3_REG_IMMED;
-		if (flags & ~valid_flags)
-			return false;
-
-		if (flags & IR3_REG_IMMED) {
-			/* doesn't seem like we can have immediate src for store
-			 * instructions:
-			 *
-			 * TODO this restriction could also apply to load instructions,
-			 * but for load instructions this arg is the address (and not
-			 * really sure any good way to test a hard-coded immed addr src)
-			 */
-			if (is_store(instr) && (n == 1))
-				return false;
-
-			if ((instr->opc == OPC_LDL) && (n != 1))
-				return false;
-
-			if ((instr->opc == OPC_STL) && (n != 2))
-				return false;
-
-			/* disallow CP into anything but the SSBO slot argument for
-			 * atomics:
-			 */
-			if (is_atomic(instr->opc) && (n != 0))
-				return false;
-
-			if (is_atomic(instr->opc) && !(instr->flags & IR3_INSTR_G))
-				return false;
-		}
-
-		break;
-	}
-
-	return true;
-}
-
-/* propagate register flags from src to dst.. negates need special
- * handling to cancel each other out.
- */
-static void combine_flags(unsigned *dstflags, struct ir3_instruction *src)
-{
-	unsigned srcflags = src->regs[1]->flags;
-
-	/* if what we are combining into already has (abs) flags,
-	 * we can drop (neg) from src:
-	 */
-	if (*dstflags & IR3_REG_FABS)
-		srcflags &= ~IR3_REG_FNEG;
-	if (*dstflags & IR3_REG_SABS)
-		srcflags &= ~IR3_REG_SNEG;
-
-	if (srcflags & IR3_REG_FABS)
-		*dstflags |= IR3_REG_FABS;
-	if (srcflags & IR3_REG_SABS)
-		*dstflags |= IR3_REG_SABS;
-	if (srcflags & IR3_REG_FNEG)
-		*dstflags ^= IR3_REG_FNEG;
-	if (srcflags & IR3_REG_SNEG)
-		*dstflags ^= IR3_REG_SNEG;
-	if (srcflags & IR3_REG_BNOT)
-		*dstflags ^= IR3_REG_BNOT;
-
-	*dstflags &= ~IR3_REG_SSA;
-	*dstflags |= srcflags & IR3_REG_SSA;
-	*dstflags |= srcflags & IR3_REG_CONST;
-	*dstflags |= srcflags & IR3_REG_IMMED;
-	*dstflags |= srcflags & IR3_REG_RELATIV;
-	*dstflags |= srcflags & IR3_REG_ARRAY;
-
-	/* if src of the src is boolean we can drop the (abs) since we know
-	 * the source value is already a postitive integer.  This cleans
-	 * up the absnegs that get inserted when converting between nir and
-	 * native boolean (see ir3_b2n/n2b)
-	 */
-	struct ir3_instruction *srcsrc = ssa(src->regs[1]);
-	if (srcsrc && is_bool(srcsrc))
-		*dstflags &= ~IR3_REG_SABS;
-}
-
-static struct ir3_register *
-lower_immed(struct ir3_cp_ctx *ctx, struct ir3_register *reg, unsigned new_flags)
-{
-	unsigned swiz, idx, i;
-
-	reg = ir3_reg_clone(ctx->shader, reg);
-
-	/* in some cases, there are restrictions on (abs)/(neg) plus const..
-	 * so just evaluate those and clear the flags:
-	 */
-	if (new_flags & IR3_REG_SABS) {
-		reg->iim_val = abs(reg->iim_val);
-		new_flags &= ~IR3_REG_SABS;
-	}
-
-	if (new_flags & IR3_REG_FABS) {
-		reg->fim_val = fabs(reg->fim_val);
-		new_flags &= ~IR3_REG_FABS;
-	}
-
-	if (new_flags & IR3_REG_SNEG) {
-		reg->iim_val = -reg->iim_val;
-		new_flags &= ~IR3_REG_SNEG;
-	}
-
-	if (new_flags & IR3_REG_FNEG) {
-		reg->fim_val = -reg->fim_val;
-		new_flags &= ~IR3_REG_FNEG;
-	}
-
-	/* Reallocate for 4 more elements whenever it's necessary */
-	if (ctx->immediate_idx == ctx->so->immediates_size * 4) {
-		ctx->so->immediates_size += 4;
-		ctx->so->immediates = realloc (ctx->so->immediates,
-			ctx->so->immediates_size * sizeof (ctx->so->immediates[0]));
-	}
-
-	for (i = 0; i < ctx->immediate_idx; i++) {
-		swiz = i % 4;
-		idx  = i / 4;
-
-		if (ctx->so->immediates[idx].val[swiz] == reg->uim_val) {
-			break;
-		}
-	}
-
-	if (i == ctx->immediate_idx) {
-		/* need to generate a new immediate: */
-		swiz = i % 4;
-		idx  = i / 4;
-		ctx->so->immediates[idx].val[swiz] = reg->uim_val;
-		ctx->so->immediates_count = idx + 1;
-		ctx->immediate_idx++;
-	}
-
-	new_flags &= ~IR3_REG_IMMED;
-	new_flags |= IR3_REG_CONST;
-	reg->flags = new_flags;
-	reg->num = i + (4 * ctx->so->constbase.immediate);
-
-	return reg;
-}
-
-static void
-unuse(struct ir3_instruction *instr)
-{
-	debug_assert(instr->use_count > 0);
-
-	if (--instr->use_count == 0) {
-		struct ir3_block *block = instr->block;
-
-		instr->barrier_class = 0;
-		instr->barrier_conflict = 0;
-
-		/* we don't want to remove anything in keeps (which could
-		 * be things like array store's)
-		 */
-		for (unsigned i = 0; i < block->keeps_count; i++) {
-			debug_assert(block->keeps[i] != instr);
-		}
-	}
-}
-
-/**
- * Handle cp for a given src register.  This additionally handles
- * the cases of collapsing immedate/const (which replace the src
- * register with a non-ssa src) or collapsing mov's from relative
- * src (which needs to also fixup the address src reference by the
- * instruction).
- */
-static void
-reg_cp(struct ir3_cp_ctx *ctx, struct ir3_instruction *instr,
-		struct ir3_register *reg, unsigned n)
-{
-	struct ir3_instruction *src = ssa(reg);
-
-	if (is_eligible_mov(src, true)) {
-		/* simple case, no immed/const/relativ, only mov's w/ ssa src: */
-		struct ir3_register *src_reg = src->regs[1];
-		unsigned new_flags = reg->flags;
-
-		combine_flags(&new_flags, src);
-
-		if (valid_flags(instr, n, new_flags)) {
-			if (new_flags & IR3_REG_ARRAY) {
-				debug_assert(!(reg->flags & IR3_REG_ARRAY));
-				reg->array = src_reg->array;
-			}
-			reg->flags = new_flags;
-			reg->instr = ssa(src_reg);
-
-			instr->barrier_class |= src->barrier_class;
-			instr->barrier_conflict |= src->barrier_conflict;
-
-			unuse(src);
-			reg->instr->use_count++;
-		}
-
-	} else if (is_same_type_mov(src) &&
-			/* cannot collapse const/immed/etc into meta instrs: */
-			!is_meta(instr)) {
-		/* immed/const/etc cases, which require some special handling: */
-		struct ir3_register *src_reg = src->regs[1];
-		unsigned new_flags = reg->flags;
-
-		combine_flags(&new_flags, src);
-
-		if (!valid_flags(instr, n, new_flags)) {
-			/* See if lowering an immediate to const would help. */
-			if (valid_flags(instr, n, (new_flags & ~IR3_REG_IMMED) | IR3_REG_CONST)) {
-				debug_assert(new_flags & IR3_REG_IMMED);
-				instr->regs[n + 1] = lower_immed(ctx, src_reg, new_flags);
-				return;
-			}
-
-			/* special case for "normal" mad instructions, we can
-			 * try swapping the first two args if that fits better.
-			 *
-			 * the "plain" MAD's (ie. the ones that don't shift first
-			 * src prior to multiply) can swap their first two srcs if
-			 * src[0] is !CONST and src[1] is CONST:
-			 */
-			if ((n == 1) && is_mad(instr->opc) &&
-					!(instr->regs[0 + 1]->flags & (IR3_REG_CONST | IR3_REG_RELATIV)) &&
-					valid_flags(instr, 0, new_flags & ~IR3_REG_IMMED)) {
-				/* swap src[0] and src[1]: */
-				struct ir3_register *tmp;
-				tmp = instr->regs[0 + 1];
-				instr->regs[0 + 1] = instr->regs[1 + 1];
-				instr->regs[1 + 1] = tmp;
-
-				n = 0;
-			} else {
-				return;
-			}
-		}
-
-		/* Here we handle the special case of mov from
-		 * CONST and/or RELATIV.  These need to be handled
-		 * specially, because in the case of move from CONST
-		 * there is no src ir3_instruction so we need to
-		 * replace the ir3_register.  And in the case of
-		 * RELATIV we need to handle the address register
-		 * dependency.
-		 */
-		if (src_reg->flags & IR3_REG_CONST) {
-			/* an instruction cannot reference two different
-			 * address registers:
-			 */
-			if ((src_reg->flags & IR3_REG_RELATIV) &&
-					conflicts(instr->address, reg->instr->address))
-				return;
-
-			/* This seems to be a hw bug, or something where the timings
-			 * just somehow don't work out.  This restriction may only
-			 * apply if the first src is also CONST.
-			 */
-			if ((opc_cat(instr->opc) == 3) && (n == 2) &&
-					(src_reg->flags & IR3_REG_RELATIV) &&
-					(src_reg->array.offset == 0))
-				return;
-
-			src_reg = ir3_reg_clone(instr->block->shader, src_reg);
-			src_reg->flags = new_flags;
-			instr->regs[n+1] = src_reg;
-
-			if (src_reg->flags & IR3_REG_RELATIV)
-				ir3_instr_set_address(instr, reg->instr->address);
-
-			return;
-		}
-
-		if ((src_reg->flags & IR3_REG_RELATIV) &&
-				!conflicts(instr->address, reg->instr->address)) {
-			src_reg = ir3_reg_clone(instr->block->shader, src_reg);
-			src_reg->flags = new_flags;
-			instr->regs[n+1] = src_reg;
-			ir3_instr_set_address(instr, reg->instr->address);
-
-			return;
-		}
-
-		/* NOTE: seems we can only do immed integers, so don't
-		 * need to care about float.  But we do need to handle
-		 * abs/neg *before* checking that the immediate requires
-		 * few enough bits to encode:
-		 *
-		 * TODO: do we need to do something to avoid accidentally
-		 * catching a float immed?
-		 */
-		if (src_reg->flags & IR3_REG_IMMED) {
-			int32_t iim_val = src_reg->iim_val;
-
-			debug_assert((opc_cat(instr->opc) == 1) ||
-					(opc_cat(instr->opc) == 6) ||
-					ir3_cat2_int(instr->opc) ||
-					(is_mad(instr->opc) && (n == 0)));
-
-			if (new_flags & IR3_REG_SABS)
-				iim_val = abs(iim_val);
-
-			if (new_flags & IR3_REG_SNEG)
-				iim_val = -iim_val;
-
-			if (new_flags & IR3_REG_BNOT)
-				iim_val = ~iim_val;
-
-			/* other than category 1 (mov) we can only encode up to 10 bits: */
-			if ((instr->opc == OPC_MOV) ||
-					!((iim_val & ~0x3ff) && (-iim_val & ~0x3ff))) {
-				new_flags &= ~(IR3_REG_SABS | IR3_REG_SNEG | IR3_REG_BNOT);
-				src_reg = ir3_reg_clone(instr->block->shader, src_reg);
-				src_reg->flags = new_flags;
-				src_reg->iim_val = iim_val;
-				instr->regs[n+1] = src_reg;
-			} else if (valid_flags(instr, n, (new_flags & ~IR3_REG_IMMED) | IR3_REG_CONST)) {
-				/* See if lowering an immediate to const would help. */
-				instr->regs[n+1] = lower_immed(ctx, src_reg, new_flags);
-			}
-
-			return;
-		}
-	}
-}
-
-/* Handle special case of eliminating output mov, and similar cases where
- * there isn't a normal "consuming" instruction.  In this case we cannot
- * collapse flags (ie. output mov from const, or w/ abs/neg flags, cannot
- * be eliminated)
- */
-static struct ir3_instruction *
-eliminate_output_mov(struct ir3_instruction *instr)
-{
-	if (is_eligible_mov(instr, false)) {
-		struct ir3_register *reg = instr->regs[1];
-		if (!(reg->flags & IR3_REG_ARRAY)) {
-			struct ir3_instruction *src_instr = ssa(reg);
-			debug_assert(src_instr);
-			return src_instr;
-		}
-	}
-	return instr;
-}
-
-/**
- * Find instruction src's which are mov's that can be collapsed, replacing
- * the mov dst with the mov src
- */
-static void
-instr_cp(struct ir3_cp_ctx *ctx, struct ir3_instruction *instr)
-{
-	struct ir3_register *reg;
-
-	if (instr->regs_count == 0)
-		return;
-
-	if (ir3_instr_check_mark(instr))
-		return;
-
-	/* walk down the graph from each src: */
-	foreach_src_n(reg, n, instr) {
-		struct ir3_instruction *src = ssa(reg);
-
-		if (!src)
-			continue;
-
-		instr_cp(ctx, src);
-
-		/* TODO non-indirect access we could figure out which register
-		 * we actually want and allow cp..
-		 */
-		if (reg->flags & IR3_REG_ARRAY)
-			continue;
-
-		/* Don't CP absneg into meta instructions, that won't end well: */
-		if (is_meta(instr) && (src->opc != OPC_MOV))
-			continue;
-
-		reg_cp(ctx, instr, reg, n);
-	}
-
-	if (instr->regs[0]->flags & IR3_REG_ARRAY) {
-		struct ir3_instruction *src = ssa(instr->regs[0]);
-		if (src)
-			instr_cp(ctx, src);
-	}
-
-	if (instr->address) {
-		instr_cp(ctx, instr->address);
-		ir3_instr_set_address(instr, eliminate_output_mov(instr->address));
-	}
-
-	/* we can end up with extra cmps.s from frontend, which uses a
-	 *
-	 *    cmps.s p0.x, cond, 0
-	 *
-	 * as a way to mov into the predicate register.  But frequently 'cond'
-	 * is itself a cmps.s/cmps.f/cmps.u.  So detect this special case and
-	 * just re-write the instruction writing predicate register to get rid
-	 * of the double cmps.
-	 */
-	if ((instr->opc == OPC_CMPS_S) &&
-			(instr->regs[0]->num == regid(REG_P0, 0)) &&
-			ssa(instr->regs[1]) &&
-			(instr->regs[2]->flags & IR3_REG_IMMED) &&
-			(instr->regs[2]->iim_val == 0)) {
-		struct ir3_instruction *cond = ssa(instr->regs[1]);
-		switch (cond->opc) {
-		case OPC_CMPS_S:
-		case OPC_CMPS_F:
-		case OPC_CMPS_U:
-			instr->opc   = cond->opc;
-			instr->flags = cond->flags;
-			instr->cat2  = cond->cat2;
-			instr->address = cond->address;
-			instr->regs[1] = cond->regs[1];
-			instr->regs[2] = cond->regs[2];
-			instr->barrier_class |= cond->barrier_class;
-			instr->barrier_conflict |= cond->barrier_conflict;
-			unuse(cond);
-			break;
-		default:
-			break;
-		}
-	}
-}
-
-void
-ir3_cp(struct ir3 *ir, struct ir3_shader_variant *so)
-{
-	struct ir3_cp_ctx ctx = {
-			.shader = ir,
-			.so = so,
-	};
-
-	/* This is a bit annoying, and probably wouldn't be necessary if we
-	 * tracked a reverse link from producing instruction to consumer.
-	 * But we need to know when we've eliminated the last consumer of
-	 * a mov, so we need to do a pass to first count consumers of a
-	 * mov.
-	 */
-	list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
-		list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) {
-			struct ir3_instruction *src;
-
-			/* by the way, we don't account for false-dep's, so the CP
-			 * pass should always happen before false-dep's are inserted
-			 */
-			debug_assert(instr->deps_count == 0);
-
-			foreach_ssa_src(src, instr) {
-				src->use_count++;
-			}
-		}
-	}
-
-	ir3_clear_mark(ir);
-
-	for (unsigned i = 0; i < ir->noutputs; i++) {
-		if (ir->outputs[i]) {
-			instr_cp(&ctx, ir->outputs[i]);
-			ir->outputs[i] = eliminate_output_mov(ir->outputs[i]);
-		}
-	}
-
-	list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
-		if (block->condition) {
-			instr_cp(&ctx, block->condition);
-			block->condition = eliminate_output_mov(block->condition);
-		}
-
-		for (unsigned i = 0; i < block->keeps_count; i++) {
-			instr_cp(&ctx, block->keeps[i]);
-			block->keeps[i] = eliminate_output_mov(block->keeps[i]);
-		}
-	}
-}
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_depth.c b/src/gallium/drivers/freedreno/ir3/ir3_depth.c
deleted file mode 100644
index 73bf5e19926..00000000000
--- a/src/gallium/drivers/freedreno/ir3/ir3_depth.c
+++ /dev/null
@@ -1,245 +0,0 @@
-/*
- * Copyright (C) 2014 Rob Clark <[email protected]>
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- *
- * Authors:
- *    Rob Clark <[email protected]>
- */
-
-#include "util/u_math.h"
-
-#include "ir3.h"
-
-/*
- * Instruction Depth:
- *
- * Calculates weighted instruction depth, ie. the sum of # of needed
- * instructions plus delay slots back to original input (ie INPUT or
- * CONST).  That is to say, an instructions depth is:
- *
- *   depth(instr) {
- *     d = 0;
- *     // for each src register:
- *     foreach (src in instr->regs[1..n])
- *       d = max(d, delayslots(src->instr, n) + depth(src->instr));
- *     return d + 1;
- *   }
- *
- * After an instruction's depth is calculated, it is inserted into the
- * blocks depth sorted list, which is used by the scheduling pass.
- */
-
-/* generally don't count false dependencies, since this can just be
- * something like a barrier, or SSBO store.  The exception is array
- * dependencies if the assigner is an array write and the consumer
- * reads the same array.
- */
-static bool
-ignore_dep(struct ir3_instruction *assigner,
-		struct ir3_instruction *consumer, unsigned n)
-{
-	if (!__is_false_dep(consumer, n))
-		return false;
-
-	if (assigner->barrier_class & IR3_BARRIER_ARRAY_W) {
-		struct ir3_register *dst = assigner->regs[0];
-		struct ir3_register *src;
-
-		debug_assert(dst->flags & IR3_REG_ARRAY);
-
-		foreach_src(src, consumer) {
-			if ((src->flags & IR3_REG_ARRAY) &&
-					(dst->array.id == src->array.id)) {
-				return false;
-			}
-		}
-	}
-
-	return true;
-}
-
-/* calculate required # of delay slots between the instruction that
- * assigns a value and the one that consumes
- */
-int ir3_delayslots(struct ir3_instruction *assigner,
-		struct ir3_instruction *consumer, unsigned n)
-{
-	if (ignore_dep(assigner, consumer, n))
-		return 0;
-
-	/* worst case is cat1-3 (alu) -> cat4/5 needing 6 cycles, normal
-	 * alu -> alu needs 3 cycles, cat4 -> alu and texture fetch
-	 * handled with sync bits
-	 */
-
-	if (is_meta(assigner))
-		return 0;
-
-	if (writes_addr(assigner))
-		return 6;
-
-	/* handled via sync flags: */
-	if (is_sfu(assigner) || is_tex(assigner) || is_mem(assigner))
-		return 0;
-
-	/* assigner must be alu: */
-	if (is_flow(consumer) || is_sfu(consumer) || is_tex(consumer) ||
-			is_mem(consumer)) {
-		return 6;
-	} else if ((is_mad(consumer->opc) || is_madsh(consumer->opc)) &&
-			(n == 3)) {
-		/* special case, 3rd src to cat3 not required on first cycle */
-		return 1;
-	} else {
-		return 3;
-	}
-}
-
-void
-ir3_insert_by_depth(struct ir3_instruction *instr, struct list_head *list)
-{
-	/* remove from existing spot in list: */
-	list_delinit(&instr->node);
-
-	/* find where to re-insert instruction: */
-	list_for_each_entry (struct ir3_instruction, pos, list, node) {
-		if (pos->depth > instr->depth) {
-			list_add(&instr->node, &pos->node);
-			return;
-		}
-	}
-	/* if we get here, we didn't find an insertion spot: */
-	list_addtail(&instr->node, list);
-}
-
-static void
-ir3_instr_depth(struct ir3_instruction *instr, unsigned boost, bool falsedep)
-{
-	struct ir3_instruction *src;
-
-	/* don't mark falsedep's as used, but otherwise process them normally: */
-	if (!falsedep)
-		instr->flags &= ~IR3_INSTR_UNUSED;
-
-	if (ir3_instr_check_mark(instr))
-		return;
-
-	instr->depth = 0;
-
-	foreach_ssa_src_n(src, i, instr) {
-		unsigned sd;
-
-		/* visit child to compute it's depth: */
-		ir3_instr_depth(src, boost, __is_false_dep(instr, i));
-
-		/* for array writes, no need to delay on previous write: */
-		if (i == 0)
-			continue;
-
-		sd = ir3_delayslots(src, instr, i) + src->depth;
-		sd += boost;
-
-		instr->depth = MAX2(instr->depth, sd);
-	}
-
-	if (!is_meta(instr))
-		instr->depth++;
-
-	ir3_insert_by_depth(instr, &instr->block->instr_list);
-}
-
-static bool
-remove_unused_by_block(struct ir3_block *block)
-{
-	bool progress = false;
-	list_for_each_entry_safe (struct ir3_instruction, instr, &block->instr_list, node) {
-		if (instr->opc == OPC_END)
-			continue;
-		if (instr->flags & IR3_INSTR_UNUSED) {
-			list_delinit(&instr->node);
-			progress = true;
-		}
-	}
-	return progress;
-}
-
-static bool
-compute_depth_and_remove_unused(struct ir3 *ir)
-{
-	unsigned i;
-	bool progress = false;
-
-	ir3_clear_mark(ir);
-
-	/* initially mark everything as unused, we'll clear the flag as we
-	 * visit the instructions:
-	 */
-	list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
-		list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) {
-			instr->flags |= IR3_INSTR_UNUSED;
-		}
-	}
-
-	for (i = 0; i < ir->noutputs; i++)
-		if (ir->outputs[i])
-			ir3_instr_depth(ir->outputs[i], 0, false);
-
-	list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
-		for (i = 0; i < block->keeps_count; i++)
-			ir3_instr_depth(block->keeps[i], 0, false);
-
-		/* We also need to account for if-condition: */
-		if (block->condition)
-			ir3_instr_depth(block->condition, 6, false);
-	}
-
-	/* mark un-used instructions: */
-	list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
-		progress |= remove_unused_by_block(block);
-	}
-
-	/* note that we can end up with unused indirects, but we should
-	 * not end up with unused predicates.
-	 */
-	for (i = 0; i < ir->indirects_count; i++) {
-		struct ir3_instruction *instr = ir->indirects[i];
-		if (instr && (instr->flags & IR3_INSTR_UNUSED))
-			ir->indirects[i] = NULL;
-	}
-
-	/* cleanup unused inputs: */
-	for (i = 0; i < ir->ninputs; i++) {
-		struct ir3_instruction *in = ir->inputs[i];
-		if (in && (in->flags & IR3_INSTR_UNUSED))
-			ir->inputs[i] = NULL;
-	}
-
-	return progress;
-}
-
-void
-ir3_depth(struct ir3 *ir)
-{
-	bool progress;
-	do {
-		progress = compute_depth_and_remove_unused(ir);
-	} while (progress);
-}
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_gallium.c b/src/gallium/drivers/freedreno/ir3/ir3_gallium.c
index 3a1b857e010..cc6efa1ca17 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_gallium.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_gallium.c
@@ -37,10 +37,10 @@
 #include "freedreno_context.h"
 #include "freedreno_util.h"
 
-#include "ir3_shader.h"
-#include "ir3_gallium.h"
-#include "ir3_compiler.h"
-#include "ir3_nir.h"
+#include "ir3/ir3_shader.h"
+#include "ir3/ir3_gallium.h"
+#include "ir3/ir3_compiler.h"
+#include "ir3/ir3_nir.h"
 
 static void
 dump_shader_info(struct ir3_shader_variant *v, struct pipe_debug_callback *debug)
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_gallium.h b/src/gallium/drivers/freedreno/ir3/ir3_gallium.h
index cf1d48d97ba..5fb74596781 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_gallium.h
+++ b/src/gallium/drivers/freedreno/ir3/ir3_gallium.h
@@ -28,7 +28,7 @@
 #define IR3_GALLIUM_H_
 
 #include "pipe/p_state.h"
-#include "ir3_shader.h"
+#include "ir3/ir3_shader.h"
 
 struct ir3_shader * ir3_shader_create(struct ir3_compiler *compiler,
 		const struct pipe_shader_state *cso, gl_shader_stage type,
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_group.c b/src/gallium/drivers/freedreno/ir3/ir3_group.c
deleted file mode 100644
index 570055973e8..00000000000
--- a/src/gallium/drivers/freedreno/ir3/ir3_group.c
+++ /dev/null
@@ -1,274 +0,0 @@
-/*
- * Copyright (C) 2014 Rob Clark <[email protected]>
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- *
- * Authors:
- *    Rob Clark <[email protected]>
- */
-
-#include "ir3.h"
-
-/*
- * Find/group instruction neighbors:
- */
-
-/* bleh.. we need to do the same group_n() thing for both inputs/outputs
- * (where we have a simple instr[] array), and fanin nodes (where we have
- * an extra indirection via reg->instr).
- */
-struct group_ops {
-	struct ir3_instruction *(*get)(void *arr, int idx);
-	void (*insert_mov)(void *arr, int idx, struct ir3_instruction *instr);
-};
-
-static struct ir3_instruction *arr_get(void *arr, int idx)
-{
-	return ((struct ir3_instruction **)arr)[idx];
-}
-static void arr_insert_mov_out(void *arr, int idx, struct ir3_instruction *instr)
-{
-	((struct ir3_instruction **)arr)[idx] =
-			ir3_MOV(instr->block, instr, TYPE_F32);
-}
-static void arr_insert_mov_in(void *arr, int idx, struct ir3_instruction *instr)
-{
-	/* so, we can't insert a mov in front of a meta:in.. and the downstream
-	 * instruction already has a pointer to 'instr'.  So we cheat a bit and
-	 * morph the meta:in instruction into a mov and insert a new meta:in
-	 * in front.
-	 */
-	struct ir3_instruction *in;
-
-	debug_assert(instr->regs_count == 1);
-
-	in = ir3_instr_create(instr->block, OPC_META_INPUT);
-	in->inout.block = instr->block;
-	ir3_reg_create(in, instr->regs[0]->num, 0);
-
-	/* create src reg for meta:in and fixup to now be a mov: */
-	ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = in;
-	instr->opc = OPC_MOV;
-	instr->cat1.src_type = TYPE_F32;
-	instr->cat1.dst_type = TYPE_F32;
-
-	((struct ir3_instruction **)arr)[idx] = in;
-}
-static struct group_ops arr_ops_out = { arr_get, arr_insert_mov_out };
-static struct group_ops arr_ops_in = { arr_get, arr_insert_mov_in };
-
-static struct ir3_instruction *instr_get(void *arr, int idx)
-{
-	return ssa(((struct ir3_instruction *)arr)->regs[idx+1]);
-}
-static void
-instr_insert_mov(void *arr, int idx, struct ir3_instruction *instr)
-{
-	((struct ir3_instruction *)arr)->regs[idx+1]->instr =
-			ir3_MOV(instr->block, instr, TYPE_F32);
-}
-static struct group_ops instr_ops = { instr_get, instr_insert_mov };
-
-/* verify that cur != instr, but cur is also not in instr's neighbor-list: */
-static bool
-in_neighbor_list(struct ir3_instruction *instr, struct ir3_instruction *cur, int pos)
-{
-	int idx = 0;
-
-	if (!instr)
-		return false;
-
-	if (instr == cur)
-		return true;
-
-	for (instr = ir3_neighbor_first(instr); instr; instr = instr->cp.right)
-		if ((idx++ != pos) && (instr == cur))
-			return true;
-
-	return false;
-}
-
-static void
-group_n(struct group_ops *ops, void *arr, unsigned n)
-{
-	unsigned i, j;
-
-	/* first pass, figure out what has conflicts and needs a mov
-	 * inserted.  Do this up front, before starting to setup
-	 * left/right neighbor pointers.  Trying to do it in a single
-	 * pass could result in a situation where we can't even setup
-	 * the mov's right neighbor ptr if the next instr also needs
-	 * a mov.
-	 */
-restart:
-	for (i = 0; i < n; i++) {
-		struct ir3_instruction *instr = ops->get(arr, i);
-		if (instr) {
-			struct ir3_instruction *left = (i > 0) ? ops->get(arr, i - 1) : NULL;
-			struct ir3_instruction *right = (i < (n-1)) ? ops->get(arr, i + 1) : NULL;
-			bool conflict;
-
-			/* check for left/right neighbor conflicts: */
-			conflict = conflicts(instr->cp.left, left) ||
-				conflicts(instr->cp.right, right);
-
-			/* Mixing array elements and higher register classes
-			 * (ie. groups) doesn't really work out in RA.  See:
-			 *
-			 * https://trello.com/c/DqeDkeVf/156-bug-with-stk-70frag
-			 */
-			if (instr->regs[0]->flags & IR3_REG_ARRAY)
-				conflict = true;
-
-			/* we also can't have an instr twice in the group: */
-			for (j = i + 1; (j < n) && !conflict; j++)
-				if (in_neighbor_list(ops->get(arr, j), instr, i))
-					conflict = true;
-
-			if (conflict) {
-				ops->insert_mov(arr, i, instr);
-				/* inserting the mov may have caused a conflict
-				 * against the previous:
-				 */
-				goto restart;
-			}
-		}
-	}
-
-	/* second pass, now that we've inserted mov's, fixup left/right
-	 * neighbors.  This is guaranteed to succeed, since by definition
-	 * the newly inserted mov's cannot conflict with anything.
-	 */
-	for (i = 0; i < n; i++) {
-		struct ir3_instruction *instr = ops->get(arr, i);
-		if (instr) {
-			struct ir3_instruction *left = (i > 0) ? ops->get(arr, i - 1) : NULL;
-			struct ir3_instruction *right = (i < (n-1)) ? ops->get(arr, i + 1) : NULL;
-
-			debug_assert(!conflicts(instr->cp.left, left));
-			if (left) {
-				instr->cp.left_cnt++;
-				instr->cp.left = left;
-			}
-
-			debug_assert(!conflicts(instr->cp.right, right));
-			if (right) {
-				instr->cp.right_cnt++;
-				instr->cp.right = right;
-			}
-		}
-	}
-}
-
-static void
-instr_find_neighbors(struct ir3_instruction *instr)
-{
-	struct ir3_instruction *src;
-
-	if (ir3_instr_check_mark(instr))
-		return;
-
-	if (instr->opc == OPC_META_FI)
-		group_n(&instr_ops, instr, instr->regs_count - 1);
-
-	foreach_ssa_src(src, instr)
-		instr_find_neighbors(src);
-}
-
-/* a bit of sadness.. we can't have "holes" in inputs from PoV of
- * register assignment, they still need to be grouped together.  So
- * we need to insert dummy/padding instruction for grouping, and
- * then take it back out again before anyone notices.
- */
-static void
-pad_and_group_input(struct ir3_instruction **input, unsigned n)
-{
-	int i, mask = 0;
-	struct ir3_block *block = NULL;
-
-	for (i = n - 1; i >= 0; i--) {
-		struct ir3_instruction *instr = input[i];
-		if (instr) {
-			block = instr->block;
-		} else if (block) {
-			instr = ir3_NOP(block);
-			ir3_reg_create(instr, 0, IR3_REG_SSA);    /* dummy dst */
-			input[i] = instr;
-			mask |= (1 << i);
-		}
-	}
-
-	group_n(&arr_ops_in, input, n);
-
-	for (i = 0; i < n; i++) {
-		if (mask & (1 << i))
-			input[i] = NULL;
-	}
-}
-
-static void
-find_neighbors(struct ir3 *ir)
-{
-	unsigned i;
-
-	/* shader inputs/outputs themselves must be contiguous as well:
-	 *
-	 * NOTE: group inputs first, since we only insert mov's
-	 * *before* the conflicted instr (and that would go badly
-	 * for inputs).  By doing inputs first, we should never
-	 * have a conflict on inputs.. pushing any conflict to
-	 * resolve to the outputs, for stuff like:
-	 *
-	 *     MOV OUT[n], IN[m].wzyx
-	 *
-	 * NOTE: we assume here inputs/outputs are grouped in vec4.
-	 * This logic won't quite cut it if we don't align smaller
-	 * on vec4 boundaries
-	 */
-	for (i = 0; i < ir->ninputs; i += 4)
-		pad_and_group_input(&ir->inputs[i], 4);
-	for (i = 0; i < ir->noutputs; i += 4)
-		group_n(&arr_ops_out, &ir->outputs[i], 4);
-
-	for (i = 0; i < ir->noutputs; i++) {
-		if (ir->outputs[i]) {
-			struct ir3_instruction *instr = ir->outputs[i];
-			instr_find_neighbors(instr);
-		}
-	}
-
-	list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
-		for (i = 0; i < block->keeps_count; i++) {
-			struct ir3_instruction *instr = block->keeps[i];
-			instr_find_neighbors(instr);
-		}
-
-		/* We also need to account for if-condition: */
-		if (block->condition)
-			instr_find_neighbors(block->condition);
-	}
-}
-
-void
-ir3_group(struct ir3 *ir)
-{
-	ir3_clear_mark(ir);
-	find_neighbors(ir);
-}
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_legalize.c b/src/gallium/drivers/freedreno/ir3/ir3_legalize.c
deleted file mode 100644
index ff4c644eab5..00000000000
--- a/src/gallium/drivers/freedreno/ir3/ir3_legalize.c
+++ /dev/null
@@ -1,496 +0,0 @@
-/*
- * Copyright (C) 2014 Rob Clark <[email protected]>
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- *
- * Authors:
- *    Rob Clark <[email protected]>
- */
-
-#include "util/ralloc.h"
-#include "util/u_math.h"
-
-#include "ir3.h"
-
-/*
- * Legalize:
- *
- * We currently require that scheduling ensures that we have enough nop's
- * in all the right places.  The legalize step mostly handles fixing up
- * instruction flags ((ss)/(sy)/(ei)), and collapses sequences of nop's
- * into fewer nop's w/ rpt flag.
- */
-
-struct ir3_legalize_ctx {
-	int num_samp;
-	bool has_ssbo;
-	int max_bary;
-};
-
-struct ir3_legalize_state {
-	regmask_t needs_ss;
-	regmask_t needs_ss_war;       /* write after read */
-	regmask_t needs_sy;
-};
-
-struct ir3_legalize_block_data {
-	bool valid;
-	struct ir3_legalize_state state;
-};
-
-/* We want to evaluate each block from the position of any other
- * predecessor block, in order that the flags set are the union of
- * all possible program paths.
- *
- * To do this, we need to know the output state (needs_ss/ss_war/sy)
- * of all predecessor blocks.  The tricky thing is loops, which mean
- * that we can't simply recursively process each predecessor block
- * before legalizing the current block.
- *
- * How we handle that is by looping over all the blocks until the
- * results converge.  If the output state of a given block changes
- * in a given pass, this means that all successor blocks are not
- * yet fully legalized.
- */
-
-static bool
-legalize_block(struct ir3_legalize_ctx *ctx, struct ir3_block *block)
-{
-	struct ir3_legalize_block_data *bd = block->data;
-
-	if (bd->valid)
-		return false;
-
-	struct ir3_instruction *last_input = NULL;
-	struct ir3_instruction *last_rel = NULL;
-	struct ir3_instruction *last_n = NULL;
-	struct list_head instr_list;
-	struct ir3_legalize_state prev_state = bd->state;
-	struct ir3_legalize_state *state = &bd->state;
-
-	/* our input state is the OR of all predecessor blocks' state: */
-	for (unsigned i = 0; i < block->predecessors_count; i++) {
-		struct ir3_legalize_block_data *pbd = block->predecessors[i]->data;
-		struct ir3_legalize_state *pstate = &pbd->state;
-
-		/* Our input (ss)/(sy) state is based on OR'ing the output
-		 * state of all our predecessor blocks
-		 */
-		regmask_or(&state->needs_ss,
-				&state->needs_ss, &pstate->needs_ss);
-		regmask_or(&state->needs_ss_war,
-				&state->needs_ss_war, &pstate->needs_ss_war);
-		regmask_or(&state->needs_sy,
-				&state->needs_sy, &pstate->needs_sy);
-	}
-
-	/* remove all the instructions from the list, we'll be adding
-	 * them back in as we go
-	 */
-	list_replace(&block->instr_list, &instr_list);
-	list_inithead(&block->instr_list);
-
-	list_for_each_entry_safe (struct ir3_instruction, n, &instr_list, node) {
-		struct ir3_register *reg;
-		unsigned i;
-
-		n->flags &= ~(IR3_INSTR_SS | IR3_INSTR_SY);
-
-		if (is_meta(n))
-			continue;
-
-		if (is_input(n)) {
-			struct ir3_register *inloc = n->regs[1];
-			assert(inloc->flags & IR3_REG_IMMED);
-			ctx->max_bary = MAX2(ctx->max_bary, inloc->iim_val);
-		}
-
-		if (last_n && is_barrier(last_n))
-			n->flags |= IR3_INSTR_SS | IR3_INSTR_SY;
-
-		/* NOTE: consider dst register too.. it could happen that
-		 * texture sample instruction (for example) writes some
-		 * components which are unused.  A subsequent instruction
-		 * that writes the same register can race w/ the sam instr
-		 * resulting in undefined results:
-		 */
-		for (i = 0; i < n->regs_count; i++) {
-			reg = n->regs[i];
-
-			if (reg_gpr(reg)) {
-
-				/* TODO: we probably only need (ss) for alu
-				 * instr consuming sfu result.. need to make
-				 * some tests for both this and (sy)..
-				 */
-				if (regmask_get(&state->needs_ss, reg)) {
-					n->flags |= IR3_INSTR_SS;
-					regmask_init(&state->needs_ss_war);
-					regmask_init(&state->needs_ss);
-				}
-
-				if (regmask_get(&state->needs_sy, reg)) {
-					n->flags |= IR3_INSTR_SY;
-					regmask_init(&state->needs_sy);
-				}
-			}
-
-			/* TODO: is it valid to have address reg loaded from a
-			 * relative src (ie. mova a0, c<a0.x+4>)?  If so, the
-			 * last_rel check below should be moved ahead of this:
-			 */
-			if (reg->flags & IR3_REG_RELATIV)
-				last_rel = n;
-		}
-
-		if (n->regs_count > 0) {
-			reg = n->regs[0];
-			if (regmask_get(&state->needs_ss_war, reg)) {
-				n->flags |= IR3_INSTR_SS;
-				regmask_init(&state->needs_ss_war);
-				regmask_init(&state->needs_ss);
-			}
-
-			if (last_rel && (reg->num == regid(REG_A0, 0))) {
-				last_rel->flags |= IR3_INSTR_UL;
-				last_rel = NULL;
-			}
-		}
-
-		/* cat5+ does not have an (ss) bit, if needed we need to
-		 * insert a nop to carry the sync flag.  Would be kinda
-		 * clever if we were aware of this during scheduling, but
-		 * this should be a pretty rare case:
-		 */
-		if ((n->flags & IR3_INSTR_SS) && (opc_cat(n->opc) >= 5)) {
-			struct ir3_instruction *nop;
-			nop = ir3_NOP(block);
-			nop->flags |= IR3_INSTR_SS;
-			n->flags &= ~IR3_INSTR_SS;
-		}
-
-		/* need to be able to set (ss) on first instruction: */
-		if (list_empty(&block->instr_list) && (opc_cat(n->opc) >= 5))
-			ir3_NOP(block);
-
-		if (is_nop(n) && !list_empty(&block->instr_list)) {
-			struct ir3_instruction *last = list_last_entry(&block->instr_list,
-					struct ir3_instruction, node);
-			if (is_nop(last) && (last->repeat < 5)) {
-				last->repeat++;
-				last->flags |= n->flags;
-				continue;
-			}
-		}
-
-		list_addtail(&n->node, &block->instr_list);
-
-		if (is_sfu(n))
-			regmask_set(&state->needs_ss, n->regs[0]);
-
-		if (is_tex(n)) {
-			/* this ends up being the # of samp instructions.. but that
-			 * is ok, everything else only cares whether it is zero or
-			 * not.  We do this here, rather than when we encounter a
-			 * SAMP decl, because (especially in binning pass shader)
-			 * the samp instruction(s) could get eliminated if the
-			 * result is not used.
-			 */
-			ctx->num_samp = MAX2(ctx->num_samp, n->cat5.samp + 1);
-			regmask_set(&state->needs_sy, n->regs[0]);
-		} else if (n->opc == OPC_RESINFO) {
-			regmask_set(&state->needs_ss, n->regs[0]);
-			ir3_NOP(block)->flags |= IR3_INSTR_SS;
-		} else if (is_load(n)) {
-			/* seems like ldlv needs (ss) bit instead??  which is odd but
-			 * makes a bunch of flat-varying tests start working on a4xx.
-			 */
-			if ((n->opc == OPC_LDLV) || (n->opc == OPC_LDL))
-				regmask_set(&state->needs_ss, n->regs[0]);
-			else
-				regmask_set(&state->needs_sy, n->regs[0]);
-		} else if (is_atomic(n->opc)) {
-			if (n->flags & IR3_INSTR_G)
-				regmask_set(&state->needs_sy, n->regs[0]);
-			else
-				regmask_set(&state->needs_ss, n->regs[0]);
-		}
-
-		if (is_ssbo(n->opc) || (is_atomic(n->opc) && (n->flags & IR3_INSTR_G)))
-			ctx->has_ssbo = true;
-
-		/* both tex/sfu appear to not always immediately consume
-		 * their src register(s):
-		 */
-		if (is_tex(n) || is_sfu(n) || is_mem(n)) {
-			foreach_src(reg, n) {
-				if (reg_gpr(reg))
-					regmask_set(&state->needs_ss_war, reg);
-			}
-		}
-
-		if (is_input(n))
-			last_input = n;
-
-		last_n = n;
-	}
-
-	if (last_input) {
-		/* special hack.. if using ldlv to bypass interpolation,
-		 * we need to insert a dummy bary.f on which we can set
-		 * the (ei) flag:
-		 */
-		if (is_mem(last_input) && (last_input->opc == OPC_LDLV)) {
-			struct ir3_instruction *baryf;
-
-			/* (ss)bary.f (ei)r63.x, 0, r0.x */
-			baryf = ir3_instr_create(block, OPC_BARY_F);
-			baryf->flags |= IR3_INSTR_SS;
-			ir3_reg_create(baryf, regid(63, 0), 0);
-			ir3_reg_create(baryf, 0, IR3_REG_IMMED)->iim_val = 0;
-			ir3_reg_create(baryf, regid(0, 0), 0);
-
-			/* insert the dummy bary.f after last_input: */
-			list_delinit(&baryf->node);
-			list_add(&baryf->node, &last_input->node);
-
-			last_input = baryf;
-		}
-		last_input->regs[0]->flags |= IR3_REG_EI;
-	}
-
-	if (last_rel)
-		last_rel->flags |= IR3_INSTR_UL;
-
-	bd->valid = true;
-
-	if (memcmp(&prev_state, state, sizeof(*state))) {
-		/* our output state changed, this invalidates all of our
-		 * successors:
-		 */
-		for (unsigned i = 0; i < ARRAY_SIZE(block->successors); i++) {
-			if (!block->successors[i])
-				break;
-			struct ir3_legalize_block_data *pbd = block->successors[i]->data;
-			pbd->valid = false;
-		}
-	}
-
-	return true;
-}
-
-/* NOTE: branch instructions are always the last instruction(s)
- * in the block.  We take advantage of this as we resolve the
- * branches, since "if (foo) break;" constructs turn into
- * something like:
- *
- *   block3 {
- *   	...
- *   	0029:021: mov.s32s32 r62.x, r1.y
- *   	0082:022: br !p0.x, target=block5
- *   	0083:023: br p0.x, target=block4
- *   	// succs: if _[0029:021: mov.s32s32] block4; else block5;
- *   }
- *   block4 {
- *   	0084:024: jump, target=block6
- *   	// succs: block6;
- *   }
- *   block5 {
- *   	0085:025: jump, target=block7
- *   	// succs: block7;
- *   }
- *
- * ie. only instruction in block4/block5 is a jump, so when
- * resolving branches we can easily detect this by checking
- * that the first instruction in the target block is itself
- * a jump, and setup the br directly to the jump's target
- * (and strip back out the now unreached jump)
- *
- * TODO sometimes we end up with things like:
- *
- *    br !p0.x, #2
- *    br p0.x, #12
- *    add.u r0.y, r0.y, 1
- *
- * If we swapped the order of the branches, we could drop one.
- */
-static struct ir3_block *
-resolve_dest_block(struct ir3_block *block)
-{
-	/* special case for last block: */
-	if (!block->successors[0])
-		return block;
-
-	/* NOTE that we may or may not have inserted the jump
-	 * in the target block yet, so conditions to resolve
-	 * the dest to the dest block's successor are:
-	 *
-	 *   (1) successor[1] == NULL &&
-	 *   (2) (block-is-empty || only-instr-is-jump)
-	 */
-	if (block->successors[1] == NULL) {
-		if (list_empty(&block->instr_list)) {
-			return block->successors[0];
-		} else if (list_length(&block->instr_list) == 1) {
-			struct ir3_instruction *instr = list_first_entry(
-					&block->instr_list, struct ir3_instruction, node);
-			if (instr->opc == OPC_JUMP)
-				return block->successors[0];
-		}
-	}
-	return block;
-}
-
-static bool
-resolve_jump(struct ir3_instruction *instr)
-{
-	struct ir3_block *tblock =
-		resolve_dest_block(instr->cat0.target);
-	struct ir3_instruction *target;
-
-	if (tblock != instr->cat0.target) {
-		list_delinit(&instr->cat0.target->node);
-		instr->cat0.target = tblock;
-		return true;
-	}
-
-	target = list_first_entry(&tblock->instr_list,
-				struct ir3_instruction, node);
-
-	/* TODO maybe a less fragile way to do this.  But we are expecting
-	 * a pattern from sched_block() that looks like:
-	 *
-	 *   br !p0.x, #else-block
-	 *   br p0.x, #if-block
-	 *
-	 * if the first branch target is +2, or if 2nd branch target is +1
-	 * then we can just drop the jump.
-	 */
-	unsigned next_block;
-	if (instr->cat0.inv == true)
-		next_block = 2;
-	else
-		next_block = 1;
-
-	if ((!target) || (target->ip == (instr->ip + next_block))) {
-		list_delinit(&instr->node);
-		return true;
-	} else {
-		instr->cat0.immed =
-			(int)target->ip - (int)instr->ip;
-	}
-	return false;
-}
-
-/* resolve jumps, removing jumps/branches to immediately following
- * instruction which we end up with from earlier stages.  Since
- * removing an instruction can invalidate earlier instruction's
- * branch offsets, we need to do this iteratively until no more
- * branches are removed.
- */
-static bool
-resolve_jumps(struct ir3 *ir)
-{
-	list_for_each_entry (struct ir3_block, block, &ir->block_list, node)
-		list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node)
-			if (is_flow(instr) && instr->cat0.target)
-				if (resolve_jump(instr))
-					return true;
-
-	return false;
-}
-
-/* we want to mark points where divergent flow control re-converges
- * with (jp) flags.  For now, since we don't do any optimization for
- * things that start out as a 'do {} while()', re-convergence points
- * will always be a branch or jump target.  Note that this is overly
- * conservative, since unconditional jump targets are not convergence
- * points, we are just assuming that the other path to reach the jump
- * target was divergent.  If we were clever enough to optimize the
- * jump at end of a loop back to a conditional branch into a single
- * conditional branch, ie. like:
- *
- *    add.f r1.w, r0.x, (neg)(r)c2.x   <= loop start
- *    mul.f r1.z, r1.z, r0.x
- *    mul.f r1.y, r1.y, r0.x
- *    mul.f r0.z, r1.x, r0.x
- *    mul.f r0.w, r0.y, r0.x
- *    cmps.f.ge r0.x, (r)c2.y, (r)r1.w
- *    add.s r0.x, (r)r0.x, (r)-1
- *    sel.f32 r0.x, (r)c3.y, (r)r0.x, c3.x
- *    cmps.f.eq p0.x, r0.x, c3.y
- *    mov.f32f32 r0.x, r1.w
- *    mov.f32f32 r0.y, r0.w
- *    mov.f32f32 r1.x, r0.z
- *    (rpt2)nop
- *    br !p0.x, #-13
- *    (jp)mul.f r0.x, c263.y, r1.y
- *
- * Then we'd have to be more clever, as the convergence point is no
- * longer a branch or jump target.
- */
-static void
-mark_convergence_points(struct ir3 *ir)
-{
-	list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
-		list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) {
-			if (is_flow(instr) && instr->cat0.target) {
-				struct ir3_instruction *target =
-					list_first_entry(&instr->cat0.target->instr_list,
-							struct ir3_instruction, node);
-				target->flags |= IR3_INSTR_JP;
-			}
-		}
-	}
-}
-
-void
-ir3_legalize(struct ir3 *ir, int *num_samp, bool *has_ssbo, int *max_bary)
-{
-	struct ir3_legalize_ctx *ctx = rzalloc(ir, struct ir3_legalize_ctx);
-	bool progress;
-
-	ctx->max_bary = -1;
-
-	/* allocate per-block data: */
-	list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
-		block->data = rzalloc(ctx, struct ir3_legalize_block_data);
-	}
-
-	/* process each block: */
-	do {
-		progress = false;
-		list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
-			progress |= legalize_block(ctx, block);
-		}
-	} while (progress);
-
-	*num_samp = ctx->num_samp;
-	*has_ssbo = ctx->has_ssbo;
-	*max_bary = ctx->max_bary;
-
-	do {
-		ir3_count_instructions(ir);
-	} while(resolve_jumps(ir));
-
-	mark_convergence_points(ir);
-
-	ralloc_free(ctx);
-}
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_nir.c b/src/gallium/drivers/freedreno/ir3/ir3_nir.c
deleted file mode 100644
index 70c01ee0593..00000000000
--- a/src/gallium/drivers/freedreno/ir3/ir3_nir.c
+++ /dev/null
@@ -1,263 +0,0 @@
-/*
- * Copyright (C) 2015 Rob Clark <[email protected]>
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- *
- * Authors:
- *    Rob Clark <[email protected]>
- */
-
-
-#include "util/debug.h"
-
-#include "ir3_nir.h"
-#include "ir3_compiler.h"
-#include "ir3_shader.h"
-
-static const nir_shader_compiler_options options = {
-		.lower_fpow = true,
-		.lower_scmp = true,
-		.lower_flrp32 = true,
-		.lower_flrp64 = true,
-		.lower_ffract = true,
-		.lower_fmod32 = true,
-		.lower_fmod64 = true,
-		.lower_fdiv = true,
-		.lower_ldexp = true,
-		.fuse_ffma = true,
-		.native_integers = true,
-		.vertex_id_zero_based = true,
-		.lower_extract_byte = true,
-		.lower_extract_word = true,
-		.lower_all_io_to_temps = true,
-		.lower_helper_invocation = true,
-};
-
-const nir_shader_compiler_options *
-ir3_get_compiler_options(struct ir3_compiler *compiler)
-{
-	return &options;
-}
-
-/* for given shader key, are any steps handled in nir? */
-bool
-ir3_key_lowers_nir(const struct ir3_shader_key *key)
-{
-	return key->fsaturate_s | key->fsaturate_t | key->fsaturate_r |
-			key->vsaturate_s | key->vsaturate_t | key->vsaturate_r |
-			key->ucp_enables | key->color_two_side |
-			key->fclamp_color | key->vclamp_color;
-}
-
-#define OPT(nir, pass, ...) ({                             \
-   bool this_progress = false;                             \
-   NIR_PASS(this_progress, nir, pass, ##__VA_ARGS__);      \
-   this_progress;                                          \
-})
-
-#define OPT_V(nir, pass, ...) NIR_PASS_V(nir, pass, ##__VA_ARGS__)
-
-static void
-ir3_optimize_loop(nir_shader *s)
-{
-	bool progress;
-	do {
-		progress = false;
-
-		OPT_V(s, nir_lower_vars_to_ssa);
-		progress |= OPT(s, nir_opt_copy_prop_vars);
-		progress |= OPT(s, nir_opt_dead_write_vars);
-		progress |= OPT(s, nir_lower_alu_to_scalar);
-		progress |= OPT(s, nir_lower_phis_to_scalar);
-
-		progress |= OPT(s, nir_copy_prop);
-		progress |= OPT(s, nir_opt_dce);
-		progress |= OPT(s, nir_opt_cse);
-		static int gcm = -1;
-		if (gcm == -1)
-			gcm = env_var_as_unsigned("GCM", 0);
-		if (gcm == 1)
-			progress |= OPT(s, nir_opt_gcm, true);
-		else if (gcm == 2)
-			progress |= OPT(s, nir_opt_gcm, false);
-		progress |= OPT(s, nir_opt_peephole_select, 16);
-		progress |= OPT(s, nir_opt_intrinsics);
-		progress |= OPT(s, nir_opt_algebraic);
-		progress |= OPT(s, nir_opt_constant_folding);
-		progress |= OPT(s, nir_opt_dead_cf);
-		if (OPT(s, nir_opt_trivial_continues)) {
-			progress |= true;
-			/* If nir_opt_trivial_continues makes progress, then we need to clean
-			 * things up if we want any hope of nir_opt_if or nir_opt_loop_unroll
-			 * to make progress.
-			 */
-			OPT(s, nir_copy_prop);
-			OPT(s, nir_opt_dce);
-		}
-		progress |= OPT(s, nir_opt_if);
-		progress |= OPT(s, nir_opt_remove_phis);
-		progress |= OPT(s, nir_opt_undef);
-
-	} while (progress);
-}
-
-struct nir_shader *
-ir3_optimize_nir(struct ir3_shader *shader, nir_shader *s,
-		const struct ir3_shader_key *key)
-{
-	struct nir_lower_tex_options tex_options = {
-			.lower_rect = 0,
-	};
-
-	if (key) {
-		switch (shader->type) {
-		case MESA_SHADER_FRAGMENT:
-			tex_options.saturate_s = key->fsaturate_s;
-			tex_options.saturate_t = key->fsaturate_t;
-			tex_options.saturate_r = key->fsaturate_r;
-			break;
-		case MESA_SHADER_VERTEX:
-			tex_options.saturate_s = key->vsaturate_s;
-			tex_options.saturate_t = key->vsaturate_t;
-			tex_options.saturate_r = key->vsaturate_r;
-			break;
-		default:
-			/* TODO */
-			break;
-		}
-	}
-
-	if (shader->compiler->gpu_id >= 400) {
-		/* a4xx seems to have *no* sam.p */
-		tex_options.lower_txp = ~0;  /* lower all txp */
-	} else {
-		/* a3xx just needs to avoid sam.p for 3d tex */
-		tex_options.lower_txp = (1 << GLSL_SAMPLER_DIM_3D);
-	}
-
-	if (ir3_shader_debug & IR3_DBG_DISASM) {
-		debug_printf("----------------------\n");
-		nir_print_shader(s, stdout);
-		debug_printf("----------------------\n");
-	}
-
-	OPT_V(s, nir_opt_global_to_local);
-	OPT_V(s, nir_lower_regs_to_ssa);
-
-	if (key) {
-		if (s->info.stage == MESA_SHADER_VERTEX) {
-			OPT_V(s, nir_lower_clip_vs, key->ucp_enables, false);
-			if (key->vclamp_color)
-				OPT_V(s, nir_lower_clamp_color_outputs);
-		} else if (s->info.stage == MESA_SHADER_FRAGMENT) {
-			OPT_V(s, nir_lower_clip_fs, key->ucp_enables);
-			if (key->fclamp_color)
-				OPT_V(s, nir_lower_clamp_color_outputs);
-		}
-		if (key->color_two_side) {
-			OPT_V(s, nir_lower_two_sided_color);
-		}
-	} else {
-		/* only want to do this the first time (when key is null)
-		 * and not again on any potential 2nd variant lowering pass:
-		 */
-		OPT_V(s, ir3_nir_apply_trig_workarounds);
-	}
-
-	OPT_V(s, nir_lower_tex, &tex_options);
-	OPT_V(s, nir_lower_load_const_to_scalar);
-	if (shader->compiler->gpu_id < 500)
-		OPT_V(s, ir3_nir_lower_tg4_to_tex);
-
-	ir3_optimize_loop(s);
-
-	/* do idiv lowering after first opt loop to give a chance for
-	 * divide by immed power-of-two to be caught first:
-	 */
-	if (OPT(s, nir_lower_idiv))
-		ir3_optimize_loop(s);
-
-	OPT_V(s, nir_remove_dead_variables, nir_var_local);
-
-	OPT_V(s, nir_move_load_const);
-
-	if (ir3_shader_debug & IR3_DBG_DISASM) {
-		debug_printf("----------------------\n");
-		nir_print_shader(s, stdout);
-		debug_printf("----------------------\n");
-	}
-
-	nir_sweep(s);
-
-	return s;
-}
-
-void
-ir3_nir_scan_driver_consts(nir_shader *shader,
-		struct ir3_driver_const_layout *layout)
-{
-	nir_foreach_function(function, shader) {
-		if (!function->impl)
-			continue;
-
-		nir_foreach_block(block, function->impl) {
-			nir_foreach_instr(instr, block) {
-				if (instr->type != nir_instr_type_intrinsic)
-					continue;
-
-				nir_intrinsic_instr *intr =
-					nir_instr_as_intrinsic(instr);
-				unsigned idx;
-
-				switch (intr->intrinsic) {
-				case nir_intrinsic_get_buffer_size:
-					idx = nir_src_as_const_value(intr->src[0])->u32[0];
-					if (layout->ssbo_size.mask & (1 << idx))
-						break;
-					layout->ssbo_size.mask |= (1 << idx);
-					layout->ssbo_size.off[idx] =
-						layout->ssbo_size.count;
-					layout->ssbo_size.count += 1; /* one const per */
-					break;
-				case nir_intrinsic_image_deref_atomic_add:
-				case nir_intrinsic_image_deref_atomic_min:
-				case nir_intrinsic_image_deref_atomic_max:
-				case nir_intrinsic_image_deref_atomic_and:
-				case nir_intrinsic_image_deref_atomic_or:
-				case nir_intrinsic_image_deref_atomic_xor:
-				case nir_intrinsic_image_deref_atomic_exchange:
-				case nir_intrinsic_image_deref_atomic_comp_swap:
-				case nir_intrinsic_image_deref_store:
-				case nir_intrinsic_image_deref_size:
-					idx = nir_intrinsic_get_var(intr, 0)->data.driver_location;
-					if (layout->image_dims.mask & (1 << idx))
-						break;
-					layout->image_dims.mask |= (1 << idx);
-					layout->image_dims.off[idx] =
-						layout->image_dims.count;
-					layout->image_dims.count += 3; /* three const per */
-					break;
-				default:
-					break;
-				}
-			}
-		}
-	}
-}
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_nir.h b/src/gallium/drivers/freedreno/ir3/ir3_nir.h
deleted file mode 100644
index 74201d34160..00000000000
--- a/src/gallium/drivers/freedreno/ir3/ir3_nir.h
+++ /dev/null
@@ -1,45 +0,0 @@
-/*
- * Copyright (C) 2015 Rob Clark <[email protected]>
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- *
- * Authors:
- *    Rob Clark <[email protected]>
- */
-
-#ifndef IR3_NIR_H_
-#define IR3_NIR_H_
-
-#include "compiler/nir/nir.h"
-#include "compiler/shader_enums.h"
-
-#include "ir3_shader.h"
-
-void ir3_nir_scan_driver_consts(nir_shader *shader, struct ir3_driver_const_layout *layout);
-
-bool ir3_nir_apply_trig_workarounds(nir_shader *shader);
-bool ir3_nir_lower_tg4_to_tex(nir_shader *shader);
-
-const nir_shader_compiler_options * ir3_get_compiler_options(struct ir3_compiler *compiler);
-bool ir3_key_lowers_nir(const struct ir3_shader_key *key);
-struct nir_shader * ir3_optimize_nir(struct ir3_shader *shader, nir_shader *s,
-		const struct ir3_shader_key *key);
-
-#endif /* IR3_NIR_H_ */
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_nir_lower_tg4_to_tex.c b/src/gallium/drivers/freedreno/ir3/ir3_nir_lower_tg4_to_tex.c
deleted file mode 100644
index 37a3dcb26f8..00000000000
--- a/src/gallium/drivers/freedreno/ir3/ir3_nir_lower_tg4_to_tex.c
+++ /dev/null
@@ -1,138 +0,0 @@
-/*
- * Copyright © 2017 Ilia Mirkin
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- */
-
-#include "ir3_nir.h"
-#include "compiler/nir/nir_builder.h"
-
-/* A4XX has a broken GATHER4 operation. It performs the texture swizzle on the
- * gather results, rather than before. As a result, it must be emulated with
- * direct texture calls.
- */
-
-static bool
-lower_tg4(nir_block *block, nir_builder *b, void *mem_ctx)
-{
-	bool progress = false;
-
-	static const int offsets[3][2] = { {0, 1}, {1, 1}, {1, 0} };
-
-	nir_foreach_instr_safe(instr, block) {
-		if (instr->type != nir_instr_type_tex)
-			continue;
-
-		nir_tex_instr *tg4 = (nir_tex_instr *)instr;
-
-		if (tg4->op != nir_texop_tg4)
-			continue;
-
-		b->cursor = nir_before_instr(&tg4->instr);
-
-		nir_ssa_def *results[4];
-		int offset_index = nir_tex_instr_src_index(tg4, nir_tex_src_offset);
-		for (int i = 0; i < 4; i++) {
-			int num_srcs = tg4->num_srcs + 1 /* lod */;
-			if (offset_index < 0 && i < 3)
-				num_srcs++;
-
-			nir_tex_instr *tex = nir_tex_instr_create(b->shader, num_srcs);
-			tex->op = nir_texop_txl;
-			tex->sampler_dim = tg4->sampler_dim;
-			tex->coord_components = tg4->coord_components;
-			tex->is_array = tg4->is_array;
-			tex->is_shadow = tg4->is_shadow;
-			tex->is_new_style_shadow = tg4->is_new_style_shadow;
-			tex->texture_index = tg4->texture_index;
-			tex->sampler_index = tg4->sampler_index;
-			tex->dest_type = tg4->dest_type;
-
-			for (int j = 0; j < tg4->num_srcs; j++) {
-				nir_src_copy(&tex->src[j].src, &tg4->src[j].src, tex);
-				tex->src[j].src_type = tg4->src[j].src_type;
-			}
-			if (i != 3) {
-				nir_ssa_def *offset =
-					nir_vec2(b, nir_imm_int(b, offsets[i][0]),
-							 nir_imm_int(b, offsets[i][1]));
-				if (offset_index < 0) {
-					tex->src[tg4->num_srcs].src = nir_src_for_ssa(offset);
-					tex->src[tg4->num_srcs].src_type = nir_tex_src_offset;
-				} else {
-					assert(nir_tex_instr_src_size(tex, offset_index) == 2);
-					nir_ssa_def *orig = nir_ssa_for_src(
-							b, tex->src[offset_index].src, 2);
-					tex->src[offset_index].src =
-						nir_src_for_ssa(nir_iadd(b, orig, offset));
-				}
-			}
-			tex->src[num_srcs - 1].src = nir_src_for_ssa(nir_imm_float(b, 0));
-			tex->src[num_srcs - 1].src_type = nir_tex_src_lod;
-
-			nir_ssa_dest_init(&tex->instr, &tex->dest,
-							  nir_tex_instr_dest_size(tex), 32, NULL);
-			nir_builder_instr_insert(b, &tex->instr);
-
-			results[i] = nir_channel(b, &tex->dest.ssa, tg4->component);
-		}
-
-		nir_ssa_def *result = nir_vec4(b, results[0], results[1], results[2], results[3]);
-		nir_ssa_def_rewrite_uses(&tg4->dest.ssa, nir_src_for_ssa(result));
-
-		nir_instr_remove(&tg4->instr);
-
-		progress = true;
-	}
-
-	return progress;
-}
-
-static bool
-lower_tg4_func(nir_function_impl *impl)
-{
-	void *mem_ctx = ralloc_parent(impl);
-	nir_builder b;
-	nir_builder_init(&b, impl);
-
-	bool progress = false;
-	nir_foreach_block_safe(block, impl) {
-		progress |= lower_tg4(block, &b, mem_ctx);
-	}
-
-	if (progress)
-		nir_metadata_preserve(impl, nir_metadata_block_index |
-									nir_metadata_dominance);
-
-	return progress;
-}
-
-bool
-ir3_nir_lower_tg4_to_tex(nir_shader *shader)
-{
-	bool progress = false;
-
-	nir_foreach_function(function, shader) {
-		if (function->impl)
-			progress |= lower_tg4_func(function->impl);
-	}
-
-	return progress;
-}
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_nir_trig.py b/src/gallium/drivers/freedreno/ir3/ir3_nir_trig.py
deleted file mode 100644
index 3968aea543c..00000000000
--- a/src/gallium/drivers/freedreno/ir3/ir3_nir_trig.py
+++ /dev/null
@@ -1,51 +0,0 @@
-#
-# Copyright (C) 2016 Intel Corporation
-#
-# Permission is hereby granted, free of charge, to any person obtaining a
-# copy of this software and associated documentation files (the "Software"),
-# to deal in the Software without restriction, including without limitation
-# the rights to use, copy, modify, merge, publish, distribute, sublicense,
-# and/or sell copies of the Software, and to permit persons to whom the
-# Software is furnished to do so, subject to the following conditions:
-#
-# The above copyright notice and this permission notice (including the next
-# paragraph) shall be included in all copies or substantial portions of the
-# Software.
-#
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
-# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
-# IN THE SOFTWARE.
-
-from __future__ import print_function
-
-import argparse
-import sys
-
-trig_workarounds = [
-   (('fsin', 'x'), ('fsin', ('fsub', ('fmul', 6.283185, ('ffract', ('fadd', ('fmul', 0.159155, 'x'), 0.5))), 3.141593))),
-   (('fcos', 'x'), ('fcos', ('fsub', ('fmul', 6.283185, ('ffract', ('fadd', ('fmul', 0.159155, 'x'), 0.5))), 3.141593))),
-]
-
-
-def main():
-    parser = argparse.ArgumentParser()
-    parser.add_argument('-p', '--import-path', required=True)
-    args = parser.parse_args()
-    sys.path.insert(0, args.import_path)
-    run()
-
-
-def run():
-    import nir_algebraic  # pylint: disable=import-error
-
-    print('#include "ir3_nir.h"')
-    print(nir_algebraic.AlgebraicPass("ir3_nir_apply_trig_workarounds",
-                                      trig_workarounds).render())
-
-
-if __name__ == '__main__':
-    main()
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_print.c b/src/gallium/drivers/freedreno/ir3/ir3_print.c
deleted file mode 100644
index b6ef6e4b5a7..00000000000
--- a/src/gallium/drivers/freedreno/ir3/ir3_print.c
+++ /dev/null
@@ -1,264 +0,0 @@
-/*
- * Copyright (C) 2014 Rob Clark <[email protected]>
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- *
- * Authors:
- *    Rob Clark <[email protected]>
- */
-
-#include <stdarg.h>
-#include <stdio.h>
-
-#include "ir3.h"
-
-#define PTRID(x) ((unsigned long)(x))
-
-static void print_instr_name(struct ir3_instruction *instr)
-{
-	if (!instr)
-		return;
-#ifdef DEBUG
-	printf("%04u:", instr->serialno);
-#endif
-	printf("%04u:", instr->name);
-	printf("%04u:", instr->ip);
-	printf("%03u: ", instr->depth);
-
-	if (instr->flags & IR3_INSTR_SY)
-		printf("(sy)");
-	if (instr->flags & IR3_INSTR_SS)
-		printf("(ss)");
-
-	if (is_meta(instr)) {
-		switch (instr->opc) {
-		case OPC_META_INPUT:  printf("_meta:in");   break;
-		case OPC_META_FO:     printf("_meta:fo");   break;
-		case OPC_META_FI:     printf("_meta:fi");   break;
-
-		/* shouldn't hit here.. just for debugging: */
-		default: printf("_meta:%d", instr->opc);    break;
-		}
-	} else if (instr->opc == OPC_MOV) {
-		static const char *type[] = {
-				[TYPE_F16] = "f16",
-				[TYPE_F32] = "f32",
-				[TYPE_U16] = "u16",
-				[TYPE_U32] = "u32",
-				[TYPE_S16] = "s16",
-				[TYPE_S32] = "s32",
-				[TYPE_U8]  = "u8",
-				[TYPE_S8]  = "s8",
-		};
-		if (instr->cat1.src_type == instr->cat1.dst_type)
-			printf("mov");
-		else
-			printf("cov");
-		printf(".%s%s", type[instr->cat1.src_type], type[instr->cat1.dst_type]);
-	} else {
-		printf("%s", ir3_instr_name(instr));
-		if (instr->flags & IR3_INSTR_3D)
-			printf(".3d");
-		if (instr->flags & IR3_INSTR_A)
-			printf(".a");
-		if (instr->flags & IR3_INSTR_O)
-			printf(".o");
-		if (instr->flags & IR3_INSTR_P)
-			printf(".p");
-		if (instr->flags & IR3_INSTR_S)
-			printf(".s");
-		if (instr->flags & IR3_INSTR_S2EN)
-			printf(".s2en");
-	}
-}
-
-static void print_reg_name(struct ir3_register *reg)
-{
-	if ((reg->flags & (IR3_REG_FABS | IR3_REG_SABS)) &&
-			(reg->flags & (IR3_REG_FNEG | IR3_REG_SNEG | IR3_REG_BNOT)))
-		printf("(absneg)");
-	else if (reg->flags & (IR3_REG_FNEG | IR3_REG_SNEG | IR3_REG_BNOT))
-		printf("(neg)");
-	else if (reg->flags & (IR3_REG_FABS | IR3_REG_SABS))
-		printf("(abs)");
-
-	if (reg->flags & IR3_REG_IMMED) {
-		printf("imm[%f,%d,0x%x]", reg->fim_val, reg->iim_val, reg->iim_val);
-	} else if (reg->flags & IR3_REG_ARRAY) {
-		printf("arr[id=%u, offset=%d, size=%u", reg->array.id,
-				reg->array.offset, reg->size);
-		/* for ARRAY we could have null src, for example first write
-		 * instruction..
-		 */
-		if (reg->instr) {
-			printf(", _[");
-			print_instr_name(reg->instr);
-			printf("]");
-		}
-		printf("]");
-	} else if (reg->flags & IR3_REG_SSA) {
-		printf("_[");
-		print_instr_name(reg->instr);
-		printf("]");
-	} else if (reg->flags & IR3_REG_RELATIV) {
-		if (reg->flags & IR3_REG_HALF)
-			printf("h");
-		if (reg->flags & IR3_REG_CONST)
-			printf("c<a0.x + %d>", reg->array.offset);
-		else
-			printf("\x1b[0;31mr<a0.x + %d>\x1b[0m (%u)", reg->array.offset, reg->size);
-	} else {
-		if (reg->flags & IR3_REG_HALF)
-			printf("h");
-		if (reg->flags & IR3_REG_CONST)
-			printf("c%u.%c", reg_num(reg), "xyzw"[reg_comp(reg)]);
-		else
-			printf("\x1b[0;31mr%u.%c\x1b[0m", reg_num(reg), "xyzw"[reg_comp(reg)]);
-	}
-}
-
-static void
-tab(int lvl)
-{
-	for (int i = 0; i < lvl; i++)
-		printf("\t");
-}
-
-static void
-print_instr(struct ir3_instruction *instr, int lvl)
-{
-	unsigned i;
-
-	tab(lvl);
-
-	print_instr_name(instr);
-	for (i = 0; i < instr->regs_count; i++) {
-		struct ir3_register *reg = instr->regs[i];
-		printf(i ? ", " : " ");
-		print_reg_name(reg);
-	}
-
-	if (instr->address) {
-		printf(", address=_");
-		printf("[");
-		print_instr_name(instr->address);
-		printf("]");
-	}
-
-	if (instr->cp.left) {
-		printf(", left=_");
-		printf("[");
-		print_instr_name(instr->cp.left);
-		printf("]");
-	}
-
-	if (instr->cp.right) {
-		printf(", right=_");
-		printf("[");
-		print_instr_name(instr->cp.right);
-		printf("]");
-	}
-
-	if (instr->opc == OPC_META_FO) {
-		printf(", off=%d", instr->fo.off);
-	}
-
-	if (is_flow(instr) && instr->cat0.target) {
-		/* the predicate register src is implied: */
-		if (instr->opc == OPC_BR) {
-			printf(" %sp0.x", instr->cat0.inv ? "!" : "");
-		}
-		printf(", target=block%u", block_id(instr->cat0.target));
-	}
-
-	if (instr->deps_count) {
-		printf(", false-deps:");
-		for (unsigned i = 0; i < instr->deps_count; i++) {
-			if (i > 0)
-				printf(", ");
-			printf("_[");
-			print_instr_name(instr->deps[i]);
-			printf("]");
-		}
-	}
-
-	printf("\n");
-}
-
-void ir3_print_instr(struct ir3_instruction *instr)
-{
-	print_instr(instr, 0);
-}
-
-static void
-print_block(struct ir3_block *block, int lvl)
-{
-	tab(lvl); printf("block%u {\n", block_id(block));
-
-	if (block->predecessors_count > 0) {
-		tab(lvl+1);
-		printf("pred: ");
-		for (unsigned i = 0; i < block->predecessors_count; i++) {
-			if (i)
-				printf(", ");
-			printf("block%u", block_id(block->predecessors[i]));
-		}
-		printf("\n");
-	}
-
-	list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) {
-		print_instr(instr, lvl+1);
-	}
-
-	tab(lvl+1); printf("/* keeps:\n");
-	for (unsigned i = 0; i < block->keeps_count; i++) {
-		print_instr(block->keeps[i], lvl+2);
-	}
-	tab(lvl+1); printf(" */\n");
-
-	if (block->successors[1]) {
-		/* leading into if/else: */
-		tab(lvl+1);
-		printf("/* succs: if _[");
-		print_instr_name(block->condition);
-		printf("] block%u; else block%u; */\n",
-				block_id(block->successors[0]),
-				block_id(block->successors[1]));
-	} else if (block->successors[0]) {
-		tab(lvl+1);
-		printf("/* succs: block%u; */\n",
-				block_id(block->successors[0]));
-	}
-	tab(lvl); printf("}\n");
-}
-
-void
-ir3_print(struct ir3 *ir)
-{
-	list_for_each_entry (struct ir3_block, block, &ir->block_list, node)
-		print_block(block, 0);
-
-	for (unsigned i = 0; i < ir->noutputs; i++) {
-		if (!ir->outputs[i])
-			continue;
-		printf("out%d: ", i);
-		print_instr(ir->outputs[i], 0);
-	}
-}
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_ra.c b/src/gallium/drivers/freedreno/ir3/ir3_ra.c
deleted file mode 100644
index ad09c4018d3..00000000000
--- a/src/gallium/drivers/freedreno/ir3/ir3_ra.c
+++ /dev/null
@@ -1,1124 +0,0 @@
-/*
- * Copyright (C) 2014 Rob Clark <[email protected]>
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- *
- * Authors:
- *    Rob Clark <[email protected]>
- */
-
-#include "util/u_math.h"
-#include "util/register_allocate.h"
-#include "util/ralloc.h"
-#include "util/bitset.h"
-
-#include "ir3.h"
-#include "ir3_compiler.h"
-
-/*
- * Register Assignment:
- *
- * Uses the register_allocate util, which implements graph coloring
- * algo with interference classes.  To handle the cases where we need
- * consecutive registers (for example, texture sample instructions),
- * we model these as larger (double/quad/etc) registers which conflict
- * with the corresponding registers in other classes.
- *
- * Additionally we create additional classes for half-regs, which
- * do not conflict with the full-reg classes.  We do need at least
- * sizes 1-4 (to deal w/ texture sample instructions output to half-
- * reg).  At the moment we don't create the higher order half-reg
- * classes as half-reg frequently does not have enough precision
- * for texture coords at higher resolutions.
- *
- * There are some additional cases that we need to handle specially,
- * as the graph coloring algo doesn't understand "partial writes".
- * For example, a sequence like:
- *
- *   add r0.z, ...
- *   sam (f32)(xy)r0.x, ...
- *   ...
- *   sam (f32)(xyzw)r0.w, r0.x, ...  ; 3d texture, so r0.xyz are coord
- *
- * In this scenario, we treat r0.xyz as class size 3, which is written
- * (from a use/def perspective) at the 'add' instruction and ignore the
- * subsequent partial writes to r0.xy.  So the 'add r0.z, ...' is the
- * defining instruction, as it is the first to partially write r0.xyz.
- *
- * Note i965 has a similar scenario, which they solve with a virtual
- * LOAD_PAYLOAD instruction which gets turned into multiple MOV's after
- * register assignment.  But for us that is horrible from a scheduling
- * standpoint.  Instead what we do is use idea of 'definer' instruction.
- * Ie. the first instruction (lowest ip) to write to the variable is the
- * one we consider from use/def perspective when building interference
- * graph.  (Other instructions which write other variable components
- * just define the variable some more.)
- *
- * Arrays of arbitrary size are handled via pre-coloring a consecutive
- * sequence of registers.  Additional scalar (single component) reg
- * names are allocated starting at ctx->class_base[total_class_count]
- * (see arr->base), which are pre-colored.  In the use/def graph direct
- * access is treated as a single element use/def, and indirect access
- * is treated as use or def of all array elements.  (Only the first
- * def is tracked, in case of multiple indirect writes, etc.)
- *
- * TODO arrays that fit in one of the pre-defined class sizes should
- * not need to be pre-colored, but instead could be given a normal
- * vreg name.  (Ignoring this for now since it is a good way to work
- * out the kinks with arbitrary sized arrays.)
- *
- * TODO might be easier for debugging to split this into two passes,
- * the first assigning vreg names in a way that we could ir3_print()
- * the result.
- */
-
-static const unsigned class_sizes[] = {
-	1, 2, 3, 4,
-	4 + 4, /* txd + 1d/2d */
-	4 + 6, /* txd + 3d */
-};
-#define class_count ARRAY_SIZE(class_sizes)
-
-static const unsigned half_class_sizes[] = {
-	1, 2, 3, 4,
-};
-#define half_class_count  ARRAY_SIZE(half_class_sizes)
-
-/* seems to just be used for compute shaders?  Seems like vec1 and vec3
- * are sufficient (for now?)
- */
-static const unsigned high_class_sizes[] = {
-	1, 3,
-};
-#define high_class_count ARRAY_SIZE(high_class_sizes)
-
-#define total_class_count (class_count + half_class_count + high_class_count)
-
-/* Below a0.x are normal regs.  RA doesn't need to assign a0.x/p0.x. */
-#define NUM_REGS             (4 * 48)  /* r0 to r47 */
-#define NUM_HIGH_REGS        (4 * 8)   /* r48 to r55 */
-#define FIRST_HIGH_REG       (4 * 48)
-/* Number of virtual regs in a given class: */
-#define CLASS_REGS(i)        (NUM_REGS - (class_sizes[i] - 1))
-#define HALF_CLASS_REGS(i)   (NUM_REGS - (half_class_sizes[i] - 1))
-#define HIGH_CLASS_REGS(i)   (NUM_HIGH_REGS - (high_class_sizes[i] - 1))
-
-#define HALF_OFFSET          (class_count)
-#define HIGH_OFFSET          (class_count + half_class_count)
-
-/* register-set, created one time, used for all shaders: */
-struct ir3_ra_reg_set {
-	struct ra_regs *regs;
-	unsigned int classes[class_count];
-	unsigned int half_classes[half_class_count];
-	unsigned int high_classes[high_class_count];
-	/* maps flat virtual register space to base gpr: */
-	uint16_t *ra_reg_to_gpr;
-	/* maps cls,gpr to flat virtual register space: */
-	uint16_t **gpr_to_ra_reg;
-};
-
-static void
-build_q_values(unsigned int **q_values, unsigned off,
-		const unsigned *sizes, unsigned count)
-{
-	for (unsigned i = 0; i < count; i++) {
-		q_values[i + off] = rzalloc_array(q_values, unsigned, total_class_count);
-
-		/* From register_allocate.c:
-		 *
-		 * q(B,C) (indexed by C, B is this register class) in
-		 * Runeson/Nyström paper.  This is "how many registers of B could
-		 * the worst choice register from C conflict with".
-		 *
-		 * If we just let the register allocation algorithm compute these
-		 * values, is extremely expensive.  However, since all of our
-		 * registers are laid out, we can very easily compute them
-		 * ourselves.  View the register from C as fixed starting at GRF n
-		 * somewhere in the middle, and the register from B as sliding back
-		 * and forth.  Then the first register to conflict from B is the
-		 * one starting at n - class_size[B] + 1 and the last register to
-		 * conflict will start at n + class_size[B] - 1.  Therefore, the
-		 * number of conflicts from B is class_size[B] + class_size[C] - 1.
-		 *
-		 *   +-+-+-+-+-+-+     +-+-+-+-+-+-+
-		 * B | | | | | |n| --> | | | | | | |
-		 *   +-+-+-+-+-+-+     +-+-+-+-+-+-+
-		 *             +-+-+-+-+-+
-		 * C           |n| | | | |
-		 *             +-+-+-+-+-+
-		 *
-		 * (Idea copied from brw_fs_reg_allocate.cpp)
-		 */
-		for (unsigned j = 0; j < count; j++)
-			q_values[i + off][j + off] = sizes[i] + sizes[j] - 1;
-	}
-}
-
-/* One-time setup of RA register-set, which describes all the possible
- * "virtual" registers and their interferences.  Ie. double register
- * occupies (and conflicts with) two single registers, and so forth.
- * Since registers do not need to be aligned to their class size, they
- * can conflict with other registers in the same class too.  Ie:
- *
- *    Single (base) |  Double
- *    --------------+---------------
- *       R0         |  D0
- *       R1         |  D0 D1
- *       R2         |     D1 D2
- *       R3         |        D2
- *           .. and so on..
- *
- * (NOTE the disassembler uses notation like r0.x/y/z/w but those are
- * really just four scalar registers.  Don't let that confuse you.)
- */
-struct ir3_ra_reg_set *
-ir3_ra_alloc_reg_set(struct ir3_compiler *compiler)
-{
-	struct ir3_ra_reg_set *set = rzalloc(compiler, struct ir3_ra_reg_set);
-	unsigned ra_reg_count, reg, first_half_reg, first_high_reg, base;
-	unsigned int **q_values;
-
-	/* calculate # of regs across all classes: */
-	ra_reg_count = 0;
-	for (unsigned i = 0; i < class_count; i++)
-		ra_reg_count += CLASS_REGS(i);
-	for (unsigned i = 0; i < half_class_count; i++)
-		ra_reg_count += HALF_CLASS_REGS(i);
-	for (unsigned i = 0; i < high_class_count; i++)
-		ra_reg_count += HIGH_CLASS_REGS(i);
-
-	/* allocate and populate q_values: */
-	q_values = ralloc_array(set, unsigned *, total_class_count);
-
-	build_q_values(q_values, 0, class_sizes, class_count);
-	build_q_values(q_values, HALF_OFFSET, half_class_sizes, half_class_count);
-	build_q_values(q_values, HIGH_OFFSET, high_class_sizes, high_class_count);
-
-	/* allocate the reg-set.. */
-	set->regs = ra_alloc_reg_set(set, ra_reg_count, true);
-	set->ra_reg_to_gpr = ralloc_array(set, uint16_t, ra_reg_count);
-	set->gpr_to_ra_reg = ralloc_array(set, uint16_t *, total_class_count);
-
-	/* .. and classes */
-	reg = 0;
-	for (unsigned i = 0; i < class_count; i++) {
-		set->classes[i] = ra_alloc_reg_class(set->regs);
-
-		set->gpr_to_ra_reg[i] = ralloc_array(set, uint16_t, CLASS_REGS(i));
-
-		for (unsigned j = 0; j < CLASS_REGS(i); j++) {
-			ra_class_add_reg(set->regs, set->classes[i], reg);
-
-			set->ra_reg_to_gpr[reg] = j;
-			set->gpr_to_ra_reg[i][j] = reg;
-
-			for (unsigned br = j; br < j + class_sizes[i]; br++)
-				ra_add_transitive_reg_conflict(set->regs, br, reg);
-
-			reg++;
-		}
-	}
-
-	first_half_reg = reg;
-	base = HALF_OFFSET;
-
-	for (unsigned i = 0; i < half_class_count; i++) {
-		set->half_classes[i] = ra_alloc_reg_class(set->regs);
-
-		set->gpr_to_ra_reg[base + i] =
-				ralloc_array(set, uint16_t, HALF_CLASS_REGS(i));
-
-		for (unsigned j = 0; j < HALF_CLASS_REGS(i); j++) {
-			ra_class_add_reg(set->regs, set->half_classes[i], reg);
-
-			set->ra_reg_to_gpr[reg] = j;
-			set->gpr_to_ra_reg[base + i][j] = reg;
-
-			for (unsigned br = j; br < j + half_class_sizes[i]; br++)
-				ra_add_transitive_reg_conflict(set->regs, br + first_half_reg, reg);
-
-			reg++;
-		}
-	}
-
-	first_high_reg = reg;
-	base = HIGH_OFFSET;
-
-	for (unsigned i = 0; i < high_class_count; i++) {
-		set->high_classes[i] = ra_alloc_reg_class(set->regs);
-
-		set->gpr_to_ra_reg[base + i] =
-				ralloc_array(set, uint16_t, HIGH_CLASS_REGS(i));
-
-		for (unsigned j = 0; j < HIGH_CLASS_REGS(i); j++) {
-			ra_class_add_reg(set->regs, set->high_classes[i], reg);
-
-			set->ra_reg_to_gpr[reg] = j;
-			set->gpr_to_ra_reg[base + i][j] = reg;
-
-			for (unsigned br = j; br < j + high_class_sizes[i]; br++)
-				ra_add_transitive_reg_conflict(set->regs, br + first_high_reg, reg);
-
-			reg++;
-		}
-	}
-
-	/* starting a6xx, half precision regs conflict w/ full precision regs: */
-	if (compiler->gpu_id >= 600) {
-		/* because of transitivity, we can get away with just setting up
-		 * conflicts between the first class of full and half regs:
-		 */
-		for (unsigned j = 0; j < CLASS_REGS(0) / 2; j++) {
-			unsigned freg  = set->gpr_to_ra_reg[0][j];
-			unsigned hreg0 = set->gpr_to_ra_reg[HALF_OFFSET][(j * 2) + 0];
-			unsigned hreg1 = set->gpr_to_ra_reg[HALF_OFFSET][(j * 2) + 1];
-
-			ra_add_transitive_reg_conflict(set->regs, freg, hreg0);
-			ra_add_transitive_reg_conflict(set->regs, freg, hreg1);
-		}
-
-		// TODO also need to update q_values, but for now:
-		ra_set_finalize(set->regs, NULL);
-	} else {
-		ra_set_finalize(set->regs, q_values);
-	}
-
-	ralloc_free(q_values);
-
-	return set;
-}
-
-/* additional block-data (per-block) */
-struct ir3_ra_block_data {
-	BITSET_WORD *def;        /* variables defined before used in block */
-	BITSET_WORD *use;        /* variables used before defined in block */
-	BITSET_WORD *livein;     /* which defs reach entry point of block */
-	BITSET_WORD *liveout;    /* which defs reach exit point of block */
-};
-
-/* additional instruction-data (per-instruction) */
-struct ir3_ra_instr_data {
-	/* cached instruction 'definer' info: */
-	struct ir3_instruction *defn;
-	int off, sz, cls;
-};
-
-/* register-assign context, per-shader */
-struct ir3_ra_ctx {
-	struct ir3 *ir;
-	gl_shader_stage type;
-	bool frag_face;
-
-	struct ir3_ra_reg_set *set;
-	struct ra_graph *g;
-	unsigned alloc_count;
-	/* one per class, plus one slot for arrays: */
-	unsigned class_alloc_count[total_class_count + 1];
-	unsigned class_base[total_class_count + 1];
-	unsigned instr_cnt;
-	unsigned *def, *use;     /* def/use table */
-	struct ir3_ra_instr_data *instrd;
-};
-
-/* does it conflict? */
-static inline bool
-intersects(unsigned a_start, unsigned a_end, unsigned b_start, unsigned b_end)
-{
-	return !((a_start >= b_end) || (b_start >= a_end));
-}
-
-static bool
-is_half(struct ir3_instruction *instr)
-{
-	return !!(instr->regs[0]->flags & IR3_REG_HALF);
-}
-
-static bool
-is_high(struct ir3_instruction *instr)
-{
-	return !!(instr->regs[0]->flags & IR3_REG_HIGH);
-}
-
-static int
-size_to_class(unsigned sz, bool half, bool high)
-{
-	if (high) {
-		for (unsigned i = 0; i < high_class_count; i++)
-			if (high_class_sizes[i] >= sz)
-				return i + HIGH_OFFSET;
-	} else if (half) {
-		for (unsigned i = 0; i < half_class_count; i++)
-			if (half_class_sizes[i] >= sz)
-				return i + HALF_OFFSET;
-	} else {
-		for (unsigned i = 0; i < class_count; i++)
-			if (class_sizes[i] >= sz)
-				return i;
-	}
-	debug_assert(0);
-	return -1;
-}
-
-static bool
-writes_gpr(struct ir3_instruction *instr)
-{
-	if (is_store(instr))
-		return false;
-	/* is dest a normal temp register: */
-	struct ir3_register *reg = instr->regs[0];
-	if (reg->flags & (IR3_REG_CONST | IR3_REG_IMMED))
-		return false;
-	if ((reg->num == regid(REG_A0, 0)) ||
-			(reg->num == regid(REG_P0, 0)))
-		return false;
-	return true;
-}
-
-static bool
-instr_before(struct ir3_instruction *a, struct ir3_instruction *b)
-{
-	if (a->flags & IR3_INSTR_UNUSED)
-		return false;
-	return (a->ip < b->ip);
-}
-
-/* Find the instruction that "defines" the group of registers which
- * instr belongs to, and report the size of that group (*sz) and the
- * offset of instr within it (*off).
- *
- * Results are cached in ctx->instrd[instr->ip]; a cached definer is
- * returned immediately together with its stored sz/off.
- *
- * Three cases are handled:
- *  - meta:fi: recurse into each src that has an instr and keep the
- *    definer with the lowest ip;
- *  - instructions linked through cp.left/cp.right: walk the neighbor
- *    list to find the first (lowest ip) member;
- *  - anything else: the instruction is its own definer.
- *
- * If the definer found is a meta:fo, the lookup continues through the
- * instruction the fanout reads from (regs[1]->instr).
- */
-static struct ir3_instruction *
-get_definer(struct ir3_ra_ctx *ctx, struct ir3_instruction *instr,
-		int *sz, int *off)
-{
-	struct ir3_ra_instr_data *id = &ctx->instrd[instr->ip];
-	struct ir3_instruction *d = NULL;
-
-	/* cached result from an earlier call: */
-	if (id->defn) {
-		*sz = id->sz;
-		*off = id->off;
-		return id->defn;
-	}
-
-	if (instr->opc == OPC_META_FI) {
-		/* What about the case where collect is subset of array, we
-		 * need to find the distance between where actual array starts
-		 * and fanin..  that probably doesn't happen currently.
-		 */
-		struct ir3_register *src;
-		int dsz, doff;
-
-		/* note: don't use foreach_ssa_src as this gets called once
-		 * while assigning regs (which clears SSA flag)
-		 */
-		foreach_src_n(src, n, instr) {
-			struct ir3_instruction *dd;
-			if (!src->instr)
-				continue;
-
-			dd = get_definer(ctx, src->instr, &dsz, &doff);
-
-			/* keep the earliest definer; its offset is adjusted by
-			 * the position (n) of this src within the fanin:
-			 */
-			if ((!d) || instr_before(dd, d)) {
-				d = dd;
-				*sz = dsz;
-				*off = doff - n;
-			}
-		}
-
-		/* NOTE(review): if no src has an instr, d is still NULL here
-		 * and the d->opc check below would dereference it -- confirm
-		 * that a fanin always has at least one src with an instr.
-		 */
-
-	} else if (instr->cp.right || instr->cp.left) {
-		/* covers also the meta:fo case, which ends up w/ single
-		 * scalar instructions for each component:
-		 */
-		struct ir3_instruction *f = ir3_neighbor_first(instr);
-
-		/* by definition, the entire sequence forms one linked list
-		 * of single scalar register nodes (even if some of them may
-		 * be fanouts from a texture sample (for example) instr.  We
-		 * just need to walk the list finding the first element of
-		 * the group defined (lowest ip)
-		 */
-		int cnt = 0;
-
-		/* need to skip over unused in the group: */
-		while (f && (f->flags & IR3_INSTR_UNUSED)) {
-			f = f->cp.right;
-			cnt++;
-		}
-
-		/* cnt keeps counting across skipped entries, so *off and *sz
-		 * are positions within the whole neighbor list:
-		 */
-		while (f) {
-			if ((!d) || instr_before(f, d))
-				d = f;
-			if (f == instr)
-				*off = cnt;
-			f = f->cp.right;
-			cnt++;
-		}
-
-		*sz = cnt;
-
-	} else {
-		/* second case is looking directly at the instruction which
-		 * produces multiple values (eg, texture sample), rather
-		 * than the fanout nodes that point back to that instruction.
-		 * This isn't quite right, because it may be part of a larger
-		 * group, such as:
-		 *
-		 *     sam (f32)(xyzw)r0.x, ...
-		 *     add r1.x, ...
-		 *     add r1.y, ...
-		 *     sam (f32)(xyzw)r2.x, r0.w  <-- (r0.w, r1.x, r1.y)
-		 *
-		 * need to come up with a better way to handle that case.
-		 */
-		if (instr->address) {
-			*sz = instr->regs[0]->size;
-		} else {
-			/* size is the position of the highest bit set in the
-			 * dst writemask:
-			 */
-			*sz = util_last_bit(instr->regs[0]->wrmask);
-		}
-		*off = 0;
-		d = instr;
-	}
-
-	/* a fanout is never the real definer; chase the instruction it
-	 * splits, keeping the larger of the two sizes/offsets:
-	 */
-	if (d->opc == OPC_META_FO) {
-		struct ir3_instruction *dd;
-		int dsz, doff;
-
-		dd = get_definer(ctx, d->regs[1]->instr, &dsz, &doff);
-
-		/* by definition, should come before: */
-		debug_assert(instr_before(dd, d));
-
-		*sz = MAX2(*sz, dsz);
-
-		debug_assert(instr->opc == OPC_META_FO);
-		*off = MAX2(*off, instr->fo.off);
-
-		d = dd;
-	}
-
-	/* remember the answer for subsequent lookups: */
-	id->defn = d;
-	id->sz = *sz;
-	id->off = *off;
-
-	return d;
-}
-
-/* For each instruction in the block with at least one register, record
- * its register class in ctx->instrd[]:  -1 for a0.x/p0.x writers,
- * total_class_count for array destinations, otherwise the class derived
- * from the definer's group size and half/high-ness.
- */
-static void
-ra_block_find_definers(struct ir3_ra_ctx *ctx, struct ir3_block *block)
-{
-	list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) {
-		struct ir3_ra_instr_data *data;
-
-		if (instr->regs_count == 0)
-			continue;
-
-		data = &ctx->instrd[instr->ip];
-
-		/* special case: address/predicate writers have no class */
-		if (writes_addr(instr) || writes_pred(instr)) {
-			data->cls = -1;
-			continue;
-		}
-
-		/* special case: arrays get the one-past-the-end class */
-		if (instr->regs[0]->flags & IR3_REG_ARRAY) {
-			data->cls = total_class_count;
-			continue;
-		}
-
-		data->defn = get_definer(ctx, instr, &data->sz, &data->off);
-		data->cls = size_to_class(data->sz, is_half(data->defn), is_high(data->defn));
-	}
-}
-
-/* Hand out a per-class name to every instruction that is the definer
- * of its own group, counting the names used per class and in total.
- * Every instruction visited is also counted in ctx->instr_cnt.
- */
-static void
-ra_block_name_instructions(struct ir3_ra_ctx *ctx, struct ir3_block *block)
-{
-	list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) {
-		struct ir3_ra_instr_data *data = &ctx->instrd[instr->ip];
-
-#ifdef DEBUG
-		instr->name = ~0;
-#endif
-
-		ctx->instr_cnt++;
-
-		/* nothing to name unless there is a gpr destination: */
-		if ((instr->regs_count == 0) || !writes_gpr(instr))
-			continue;
-
-		/* only the definer of a group receives a name: */
-		if (data->defn != instr)
-			continue;
-
-		/* arrays which don't fit in one of the pre-defined class
-		 * sizes are pre-colored, so they get no name here:
-		 */
-		if ((data->cls < 0) || !(data->cls < total_class_count))
-			continue;
-
-		instr->name = ctx->class_alloc_count[data->cls]++;
-		ctx->alloc_count++;
-	}
-}
-
-/* Build the per-run register allocation state in ctx:  the per-instr
- * data table, the names for each class, the base name of each class and
- * of each array, and finally the interference graph plus the def/use
- * tables, which are sized by the total number of names.
- */
-static void
-ra_init(struct ir3_ra_ctx *ctx)
-{
-	unsigned n, base;
-
-	ir3_clear_mark(ctx->ir);
-	n = ir3_count_instructions(ctx->ir);
-
-	/* one entry per instruction, indexed by instr->ip (presumably
-	 * assigned by ir3_count_instructions() -- confirm):
-	 */
-	ctx->instrd = rzalloc_array(NULL, struct ir3_ra_instr_data, n);
-
-	/* definers/classes must be known for all blocks before any names
-	 * are handed out, hence two separate passes:
-	 */
-	list_for_each_entry (struct ir3_block, block, &ctx->ir->block_list, node) {
-		ra_block_find_definers(ctx, block);
-	}
-
-	list_for_each_entry (struct ir3_block, block, &ctx->ir->block_list, node) {
-		ra_block_name_instructions(ctx, block);
-	}
-
-	/* figure out the base register name for each class.  The
-	 * actual ra name is class_base[cls] + instr->name;
-	 */
-	ctx->class_base[0] = 0;
-	for (unsigned i = 1; i <= total_class_count; i++) {
-		ctx->class_base[i] = ctx->class_base[i-1] +
-				ctx->class_alloc_count[i-1];
-	}
-
-	/* and vreg names for array elements: */
-	base = ctx->class_base[total_class_count];
-	list_for_each_entry (struct ir3_array, arr, &ctx->ir->array_list, node) {
-		arr->base = base;
-		ctx->class_alloc_count[total_class_count] += arr->length;
-		base += arr->length;
-	}
-	ctx->alloc_count += ctx->class_alloc_count[total_class_count];
-
-	ctx->g = ra_alloc_interference_graph(ctx->set->regs, ctx->alloc_count);
-	/* reparent instrd under the graph; ra_destroy() only frees ctx->g: */
-	ralloc_steal(ctx->g, ctx->instrd);
-	ctx->def = rzalloc_array(ctx->g, unsigned, ctx->alloc_count);
-	ctx->use = rzalloc_array(ctx->g, unsigned, ctx->alloc_count);
-}
-
-/* Map (class, definer) to the global ra name:  the base name of the
- * class plus the definer's per-class name.
- */
-static unsigned
-__ra_name(struct ir3_ra_ctx *ctx, int cls, struct ir3_instruction *defn)
-{
-	debug_assert(cls >= 0);
-	debug_assert(cls < total_class_count);  /* we shouldn't get arrays here.. */
-
-	const unsigned ra_idx = ctx->class_base[cls] + defn->name;
-
-	debug_assert(ra_idx < ctx->alloc_count);
-	return ra_idx;
-}
-
-/* Convenience wrapper:  the ra name for the definer recorded in the
- * given per-instruction data.
- */
-static int
-ra_name(struct ir3_ra_ctx *ctx, struct ir3_ra_instr_data *id)
-{
-	struct ir3_instruction *definer = id->defn;
-
-	/* TODO handle name mapping for arrays */
-	return __ra_name(ctx, id->cls, definer);
-}
-
-/* Release the interference graph.  ra_init() and the live-range pass
- * allocate their tables with ctx->g as the ralloc parent (or steal them
- * onto it), so this single free covers those as well.
- */
-static void
-ra_destroy(struct ir3_ra_ctx *ctx)
-{
-	ralloc_free(ctx->g);
-}
-
-/* Walk the instructions of one block and record, for every ra name,
- * the ip of its first write (ctx->def[]) and of its last access
- * (ctx->use[]).  Also allocates the block's ir3_ra_block_data (stored
- * in block->data) and fills in its def/use bitsets, which feed the
- * livein/liveout computation.  Array live ranges (start_ip/end_ip) are
- * widened by every instruction touching the array, and node classes
- * are set on the interference graph for each name defined here.
- */
-static void
-ra_block_compute_live_ranges(struct ir3_ra_ctx *ctx, struct ir3_block *block)
-{
-	struct ir3_ra_block_data *bd;
-	unsigned bitset_words = BITSET_WORDS(ctx->alloc_count);
-
-/* Note: no trailing semicolon after while(0), so that each macro use
- * behaves as exactly one statement at the call site.
- */
-#define def(name, instr) \
-		do { \
-			/* defined on first write: */ \
-			if (!ctx->def[name]) \
-				ctx->def[name] = (instr)->ip; \
-			ctx->use[name] = (instr)->ip; \
-			BITSET_SET(bd->def, name); \
-		} while(0)
-
-#define use(name, instr) \
-		do { \
-			ctx->use[name] = MAX2(ctx->use[name], (instr)->ip); \
-			/* only upward-exposed reads count as block 'use': */ \
-			if (!BITSET_TEST(bd->def, name)) \
-				BITSET_SET(bd->use, name); \
-		} while(0)
-
-	bd = rzalloc(ctx->g, struct ir3_ra_block_data);
-
-	bd->def     = rzalloc_array(bd, BITSET_WORD, bitset_words);
-	bd->use     = rzalloc_array(bd, BITSET_WORD, bitset_words);
-	bd->livein  = rzalloc_array(bd, BITSET_WORD, bitset_words);
-	bd->liveout = rzalloc_array(bd, BITSET_WORD, bitset_words);
-
-	block->data = bd;
-
-	list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) {
-		struct ir3_instruction *src;
-		struct ir3_register *reg;
-
-		if (instr->regs_count == 0)
-			continue;
-
-		/* There are a couple special cases to deal with here:
-		 *
-		 * fanout: used to split values from a higher class to a lower
-		 *     class, for example split the results of a texture fetch
-		 *     into individual scalar values;  We skip over these from
-		 *     a 'def' perspective, and for a 'use' we walk the chain
-		 *     up to the defining instruction.
-		 *
-		 * fanin: used to collect values from lower class and assemble
-		 *     them together into a higher class, for example arguments
-		 *     to texture sample instructions;  We consider these to be
-		 *     defined at the earliest fanin source.
-		 *
-		 * Most of this is handled in the get_definer() helper.
-		 *
-		 * In either case, we trace the instruction back to the original
-		 * definer and consider that as the def/use ip.
-		 */
-
-		if (writes_gpr(instr)) {
-			struct ir3_ra_instr_data *id = &ctx->instrd[instr->ip];
-			struct ir3_register *dst = instr->regs[0];
-
-			if (dst->flags & IR3_REG_ARRAY) {
-				struct ir3_array *arr =
-					ir3_lookup_array(ctx->ir, dst->array.id);
-				unsigned i;
-
-				arr->start_ip = MIN2(arr->start_ip, instr->ip);
-				arr->end_ip = MAX2(arr->end_ip, instr->ip);
-
-				/* set the node class now.. in case we don't encounter
-				 * this array dst again.  From register_alloc algo's
-				 * perspective, these are all single/scalar regs:
-				 */
-				for (i = 0; i < arr->length; i++) {
-					unsigned name = arr->base + i;
-					ra_set_node_class(ctx->g, name, ctx->set->classes[0]);
-				}
-
-				/* indirect write is treated like a write to all array
-				 * elements, since we don't know which one is actually
-				 * written:
-				 */
-				if (dst->flags & IR3_REG_RELATIV) {
-					for (i = 0; i < arr->length; i++) {
-						unsigned name = arr->base + i;
-						def(name, instr);
-					}
-				} else {
-					unsigned name = arr->base + dst->array.offset;
-					def(name, instr);
-				}
-
-			} else if (id->defn == instr) {
-				unsigned name = ra_name(ctx, id);
-
-				/* since we are in SSA at this point: */
-				debug_assert(!BITSET_TEST(bd->use, name));
-
-				def(name, id->defn);
-
-				if (is_high(id->defn)) {
-					ra_set_node_class(ctx->g, name,
-							ctx->set->high_classes[id->cls - HIGH_OFFSET]);
-				} else if (is_half(id->defn)) {
-					ra_set_node_class(ctx->g, name,
-							ctx->set->half_classes[id->cls - HALF_OFFSET]);
-				} else {
-					ra_set_node_class(ctx->g, name,
-							ctx->set->classes[id->cls]);
-				}
-			}
-		}
-
-		foreach_src(reg, instr) {
-			if (reg->flags & IR3_REG_ARRAY) {
-				struct ir3_array *arr =
-					ir3_lookup_array(ctx->ir, reg->array.id);
-				arr->start_ip = MIN2(arr->start_ip, instr->ip);
-				arr->end_ip = MAX2(arr->end_ip, instr->ip);
-
-				/* indirect read is treated like a read from all array
-				 * elements, since we don't know which one is actually
-				 * read:
-				 */
-				if (reg->flags & IR3_REG_RELATIV) {
-					unsigned i;
-					for (i = 0; i < arr->length; i++) {
-						unsigned name = arr->base + i;
-						use(name, instr);
-					}
-				} else {
-					unsigned name = arr->base + reg->array.offset;
-					use(name, instr);
-					/* NOTE: arrays are not SSA so unconditionally
-					 * set use bit:
-					 */
-					BITSET_SET(bd->use, name);
-					debug_assert(reg->array.offset < arr->length);
-				}
-			} else if ((src = ssa(reg)) && writes_gpr(src)) {
-				unsigned name = ra_name(ctx, &ctx->instrd[src->ip]);
-				use(name, instr);
-			}
-		}
-	}
-
-/* the helpers rely on the locals 'ctx' and 'bd'; keep them private to
- * this function:
- */
-#undef def
-#undef use
-}
-
-/* One iteration of the livein/liveout dataflow over all blocks.
- * Returns true if any bit was added, in which case the caller should
- * iterate again.
- */
-static bool
-ra_compute_livein_liveout(struct ir3_ra_ctx *ctx)
-{
-	const unsigned nwords = BITSET_WORDS(ctx->alloc_count);
-	bool changed = false;
-
-	list_for_each_entry (struct ir3_block, block, &ctx->ir->block_list, node) {
-		struct ir3_ra_block_data *cur = block->data;
-		unsigned w, s;
-
-		/* livein = use | (liveout & ~def), bits are only ever added: */
-		for (w = 0; w < nwords; w++) {
-			BITSET_WORD in = cur->use[w] | (cur->liveout[w] & ~cur->def[w]);
-
-			if ((in & ~cur->livein[w]) == 0)
-				continue;
-
-			cur->livein[w] |= in;
-			changed = true;
-		}
-
-		/* liveout accumulates the livein of each successor: */
-		for (s = 0; s < ARRAY_SIZE(block->successors); s++) {
-			struct ir3_block *next = block->successors[s];
-			struct ir3_ra_block_data *next_bd;
-
-			if (!next)
-				continue;
-
-			next_bd = next->data;
-
-			for (w = 0; w < nwords; w++) {
-				BITSET_WORD added = next_bd->livein[w] & ~cur->liveout[w];
-
-				if (!added)
-					continue;
-
-				cur->liveout[w] |= added;
-				changed = true;
-			}
-		}
-	}
-
-	return changed;
-}
-
-/* Debug helper:  print the indices of all set bits among the first
- * 'cnt' bits of 'bs' as a comma separated list, prefixed with 'name'.
- */
-static void
-print_bitset(const char *name, BITSET_WORD *bs, unsigned cnt)
-{
-	unsigned printed = 0;
-
-	debug_printf("  %s:", name);
-	for (unsigned bit = 0; bit < cnt; bit++) {
-		if (!BITSET_TEST(bs, bit))
-			continue;
-		/* separator goes before every entry except the first: */
-		if (printed++)
-			debug_printf(",");
-		debug_printf(" %04u", bit);
-	}
-	debug_printf("\n");
-}
-
-/* Compute live ranges for every ra name and for every array, then add
- * an interference edge between each pair of names whose [def, use]
- * ranges intersect.
- *
- * Steps:  reset array live ranges, collect per-block def/use, iterate
- * livein/liveout to a fixed point, widen the ranges to block boundaries
- * according to livein/liveout, pin shader outputs live to the end, and
- * finally populate the interference graph.
- */
-static void
-ra_add_interference(struct ir3_ra_ctx *ctx)
-{
-	struct ir3 *ir = ctx->ir;
-
-	/* initialize array live ranges: */
-	list_for_each_entry (struct ir3_array, arr, &ir->array_list, node) {
-		arr->start_ip = ~0;
-		arr->end_ip = 0;
-	}
-
-	/* compute live ranges (use/def) on a block level, also updating
-	 * block's def/use bitmasks (used below to calculate per-block
-	 * livein/liveout):
-	 */
-	list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
-		ra_block_compute_live_ranges(ctx, block);
-	}
-
-	/* update per-block livein/liveout: */
-	while (ra_compute_livein_liveout(ctx)) {}
-
-	if (ir3_shader_debug & IR3_DBG_OPTMSGS) {
-		debug_printf("AFTER LIVEIN/OUT:\n");
-		ir3_print(ir);
-		list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
-			struct ir3_ra_block_data *bd = block->data;
-			debug_printf("block%u:\n", block_id(block));
-			print_bitset("  def", bd->def, ctx->alloc_count);
-			print_bitset("  use", bd->use, ctx->alloc_count);
-			print_bitset("  l/i", bd->livein, ctx->alloc_count);
-			print_bitset("  l/o", bd->liveout, ctx->alloc_count);
-		}
-		list_for_each_entry (struct ir3_array, arr, &ir->array_list, node) {
-			debug_printf("array%u:\n", arr->id);
-			debug_printf("  length:   %u\n", arr->length);
-			debug_printf("  start_ip: %u\n", arr->start_ip);
-			debug_printf("  end_ip:   %u\n", arr->end_ip);
-		}
-	}
-
-	/* extend start/end ranges based on livein/liveout info from cfg: */
-	list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
-		struct ir3_ra_block_data *bd = block->data;
-
-		for (unsigned i = 0; i < ctx->alloc_count; i++) {
-			if (BITSET_TEST(bd->livein, i)) {
-				ctx->def[i] = MIN2(ctx->def[i], block->start_ip);
-				ctx->use[i] = MAX2(ctx->use[i], block->start_ip);
-			}
-
-			if (BITSET_TEST(bd->liveout, i)) {
-				ctx->def[i] = MIN2(ctx->def[i], block->end_ip);
-				ctx->use[i] = MAX2(ctx->use[i], block->end_ip);
-			}
-		}
-
-		list_for_each_entry (struct ir3_array, arr, &ctx->ir->array_list, node) {
-			for (unsigned i = 0; i < arr->length; i++) {
-				/* live on entry: range must reach back to block start */
-				if (BITSET_TEST(bd->livein, i + arr->base)) {
-					arr->start_ip = MIN2(arr->start_ip, block->start_ip);
-				}
-				/* live on exit: range must reach the block end.  This
-				 * has to test liveout (not livein), matching the
-				 * per-name handling above; otherwise an element
-				 * written in this block and read in a successor would
-				 * not keep the array live through the block end.
-				 */
-				if (BITSET_TEST(bd->liveout, i + arr->base)) {
-					arr->end_ip = MAX2(arr->end_ip, block->end_ip);
-				}
-			}
-		}
-	}
-
-	/* need to fix things up to keep outputs live: */
-	for (unsigned i = 0; i < ir->noutputs; i++) {
-		struct ir3_instruction *instr = ir->outputs[i];
-		unsigned name = ra_name(ctx, &ctx->instrd[instr->ip]);
-		ctx->use[name] = ctx->instr_cnt;
-	}
-
-	/* pairwise check of all names; overlapping ranges interfere: */
-	for (unsigned i = 0; i < ctx->alloc_count; i++) {
-		for (unsigned j = 0; j < ctx->alloc_count; j++) {
-			if (intersects(ctx->def[i], ctx->use[i],
-					ctx->def[j], ctx->use[j])) {
-				ra_add_node_interference(ctx->g, i, j);
-			}
-		}
-	}
-}
-
-/* When the destination register ends up half precision, some
- * instructions have to be rewritten to match:  cat1 and cat5 get their
- * type narrowed, cat3 opcodes are swapped for the 16 bit variant.
- * Other categories are left untouched.
- */
-static void fixup_half_instr_dst(struct ir3_instruction *instr)
-{
-	const int cat = opc_cat(instr->opc);
-
-	if (cat == 1) {
-		/* move instructions */
-		instr->cat1.dst_type = half_type(instr->cat1.dst_type);
-		return;
-	}
-
-	if (cat == 5) {
-		instr->cat5.type = half_type(instr->cat5.type);
-		return;
-	}
-
-	if (cat != 3)
-		return;
-
-	switch (instr->opc) {
-	case OPC_MAD_F32:
-		instr->opc = OPC_MAD_F16;
-		return;
-	case OPC_SEL_B32:
-		instr->opc = OPC_SEL_B16;
-		return;
-	case OPC_SEL_S32:
-		instr->opc = OPC_SEL_S16;
-		return;
-	case OPC_SEL_F32:
-		instr->opc = OPC_SEL_F16;
-		return;
-	case OPC_SAD_S32:
-		instr->opc = OPC_SAD_S16;
-		return;
-	/* instructions may already be fixed up: */
-	case OPC_MAD_F16:
-	case OPC_SEL_B16:
-	case OPC_SEL_S16:
-	case OPC_SEL_F16:
-	case OPC_SAD_S16:
-		return;
-	default:
-		/* any other cat3 opcode is unexpected here: */
-		assert(0);
-		return;
-	}
-}
-/* When a source register is half precision, a mov needs its src type
- * narrowed to match; no other opcode is adjusted.
- */
-static void fixup_half_instr_src(struct ir3_instruction *instr)
-{
-	if (instr->opc == OPC_MOV)
-		instr->cat1.src_type = half_type(instr->cat1.src_type);
-}
-
-/* Rewrite 'reg' with the physical register chosen by the allocator.
- *
- * For array registers the name comes from the array base plus offset;
- * for everything else it comes from the definer recorded for 'instr'
- * (the instruction producing the value held in reg).
- *
- * NOTE: instr could be NULL for IR3_REG_ARRAY case, for the first
- * array access(es) which do not have any previous access to depend
- * on from scheduling point of view
- */
-static void
-reg_assign(struct ir3_ra_ctx *ctx, struct ir3_register *reg,
-		struct ir3_instruction *instr)
-{
-	struct ir3_ra_instr_data *id;
-
-	if (reg->flags & IR3_REG_ARRAY) {
-		struct ir3_array *arr =
-			ir3_lookup_array(ctx->ir, reg->array.id);
-		unsigned name = arr->base + reg->array.offset;
-		unsigned r = ra_get_node_reg(ctx->g, name);
-		unsigned num = ctx->set->ra_reg_to_gpr[r];
-
-		/* relative access keeps its array form and only has the
-		 * offset replaced by the assigned register number; direct
-		 * access becomes a plain (non-SSA) register:
-		 */
-		if (reg->flags & IR3_REG_RELATIV) {
-			reg->array.offset = num;
-		} else {
-			reg->num = num;
-			reg->flags &= ~IR3_REG_SSA;
-		}
-
-		reg->flags &= ~IR3_REG_ARRAY;
-	} else if ((id = &ctx->instrd[instr->ip]) && id->defn) {
-		/* the address-of above is always non-NULL; the effective
-		 * test is whether a definer was recorded for instr.
-		 */
-		unsigned name = ra_name(ctx, id);
-		unsigned r = ra_get_node_reg(ctx->g, name);
-		/* id->off selects this value's slot within the group: */
-		unsigned num = ctx->set->ra_reg_to_gpr[r] + id->off;
-
-		debug_assert(!(reg->flags & IR3_REG_RELATIV));
-
-		if (is_high(id->defn))
-			num += FIRST_HIGH_REG;
-
-		reg->num = num;
-		reg->flags &= ~IR3_REG_SSA;
-
-		if (is_half(id->defn))
-			reg->flags |= IR3_REG_HALF;
-	}
-}
-
-/* Apply the allocation result to every instruction in the block:
- * assign the destination (if it is a gpr write) and each source that
- * either has a producing instruction or is an array access, fixing up
- * the instruction where a half register ends up being used.
- */
-static void
-ra_block_alloc(struct ir3_ra_ctx *ctx, struct ir3_block *block)
-{
-	list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) {
-		struct ir3_register *reg;
-
-		if (instr->regs_count == 0)
-			continue;
-
-		if (writes_gpr(instr)) {
-			struct ir3_register *dst = instr->regs[0];
-
-			reg_assign(ctx, dst, instr);
-			if (dst->flags & IR3_REG_HALF)
-				fixup_half_instr_dst(instr);
-		}
-
-		foreach_src_n(reg, n, instr) {
-			struct ir3_instruction *producer = reg->instr;
-			bool is_array = !!(reg->flags & IR3_REG_ARRAY);
-
-			/* Note: reg->instr could be null for IR3_REG_ARRAY */
-			if (!producer && !is_array)
-				continue;
-
-			reg_assign(ctx, instr->regs[n+1], producer);
-			if (instr->regs[n+1]->flags & IR3_REG_HALF)
-				fixup_half_instr_src(instr);
-		}
-	}
-}
-
-/* Run the allocation:  first pin every live array to a fixed run of
- * registers, then let the graph allocator assign everything else, and
- * finally write the results back into the instructions.
- *
- * Returns 0 on success, -1 if ra_allocate() fails.
- */
-static int
-ra_alloc(struct ir3_ra_ctx *ctx)
-{
-	/* pre-assign array elements:
-	 */
-	list_for_each_entry (struct ir3_array, arr, &ctx->ir->array_list, node) {
-		unsigned base = 0;
-
-		/* end_ip is still 0 when nothing touched the array: */
-		if (arr->end_ip == 0)
-			continue;
-
-		/* figure out what else we conflict with which has already
-		 * been assigned:
-		 */
-retry:
-		list_for_each_entry (struct ir3_array, arr2, &ctx->ir->array_list, node) {
-			/* only arrays earlier in the list have been placed: */
-			if (arr2 == arr)
-				break;
-			if (arr2->end_ip == 0)
-				continue;
-			/* if it intersects with liverange AND register range.. */
-			if (intersects(arr->start_ip, arr->end_ip,
-					arr2->start_ip, arr2->end_ip) &&
-				intersects(base, base + arr->length,
-					arr2->reg, arr2->reg + arr2->length)) {
-				/* move past the conflicting array and re-check
-				 * against all earlier arrays from the start:
-				 */
-				base = MAX2(base, arr2->reg + arr2->length);
-				goto retry;
-			}
-		}
-
-		arr->reg = base;
-
-		/* pre-color each element with consecutive registers: */
-		for (unsigned i = 0; i < arr->length; i++) {
-			unsigned name, reg;
-
-			name = arr->base + i;
-			reg = ctx->set->gpr_to_ra_reg[0][base++];
-
-			ra_set_node_reg(ctx->g, name, reg);
-		}
-	}
-
-	if (!ra_allocate(ctx->g))
-		return -1;
-
-	list_for_each_entry (struct ir3_block, block, &ctx->ir->block_list, node) {
-		ra_block_alloc(ctx, block);
-	}
-
-	return 0;
-}
-
-/* Register allocation entry point for a shader.
- *
- * Sets up the allocation context, builds the interference graph, runs
- * the allocation, and tears the context down again.  Returns the
- * result of ra_alloc():  0 on success, -1 on failure.
- *
- * NOTE(review): frag_coord is not referenced in this function.
- */
-int ir3_ra(struct ir3 *ir, gl_shader_stage type,
-		bool frag_coord, bool frag_face)
-{
-	struct ir3_ra_ctx ctx = {
-			.ir = ir,
-			.type = type,
-			.frag_face = frag_face,
-			.set = ir->compiler->set,
-	};
-	int ret;
-
-	ra_init(&ctx);
-	ra_add_interference(&ctx);
-	ret = ra_alloc(&ctx);
-	/* graph (and everything hanging off it) is freed even on failure: */
-	ra_destroy(&ctx);
-
-	return ret;
-}
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_sched.c b/src/gallium/drivers/freedreno/ir3/ir3_sched.c
deleted file mode 100644
index 6552980d90c..00000000000
--- a/src/gallium/drivers/freedreno/ir3/ir3_sched.c
+++ /dev/null
@@ -1,818 +0,0 @@
-/*
- * Copyright (C) 2014 Rob Clark <[email protected]>
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- *
- * Authors:
- *    Rob Clark <[email protected]>
- */
-
-
-#include "util/u_math.h"
-
-#include "ir3.h"
-
-/*
- * Instruction Scheduling:
- *
- * A recursive depth based scheduling algo.  Recursively find an eligible
- * instruction to schedule from the deepest instruction (recursing through
- * its unscheduled src instructions).  Normally this would result in a
- * lot of re-traversal of the same instructions, so we cache results in
- * instr->data (and clear cached results that would be no longer valid
- * after scheduling an instruction).
- *
- * There are a few special cases that need to be handled, since sched
- * is currently independent of register allocation.  Usages of address
- * register (a0.x) or predicate register (p0.x) must be serialized.  Ie.
- * if you have two pairs of instructions that write the same special
- * register and then read it, then those pairs cannot be interleaved.
- * To solve this, when we are in such a scheduling "critical section",
- * and we encounter a conflicting write to a special register, we try
- * to schedule any remaining instructions that use that value first.
- */
-
-/* Scheduler state shared by the helpers in this file. */
-struct ir3_sched_ctx {
-	struct ir3_block *block;           /* the current block */
-	struct list_head depth_list;       /* depth sorted unscheduled instrs */
-	struct ir3_instruction *scheduled; /* last scheduled instr XXX remove*/
-	struct ir3_instruction *addr;      /* current a0.x user, if any */
-	struct ir3_instruction *pred;      /* current p0.x user, if any */
-	bool error;
-};
-
-/* True for instructions that are either sfu or mem. */
-static bool is_sfu_or_mem(struct ir3_instruction *instr)
-{
-	if (is_sfu(instr))
-		return true;
-	return is_mem(instr);
-}
-
-/* Sentinel stored in instr->data to cache a negative result ("nothing
- * schedulable found"), as opposed to NULL which means "not cached":
- */
-#define NULL_INSTR ((void *)~0)
-
-/* Invalidate cached scheduling candidates (instr->data) on the depth
- * list.  With instr == NULL every entry is cleared; otherwise only the
- * entries that cached 'instr' itself or the negative NULL_INSTR result
- * are cleared.
- */
-static void
-clear_cache(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr)
-{
-	list_for_each_entry (struct ir3_instruction, instr2, &ctx->depth_list, node) {
-		if ((instr2->data == instr) || (instr2->data == NULL_INSTR) || !instr)
-			instr2->data = NULL;
-	}
-}
-
-/* Commit 'instr' as the next instruction of the current block:  move
- * it from the depth list to the tail of the block's instruction list,
- * mark it scheduled, track it as the current a0.x/p0.x writer where
- * applicable, and invalidate cached candidates that may now be stale.
- */
-static void
-schedule(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr)
-{
-	debug_assert(ctx->block == instr->block);
-
-	/* maybe there is a better way to handle this than just stuffing
-	 * a nop.. ideally we'd know about this constraint in the
-	 * scheduling and depth calculation..
-	 */
-	if (ctx->scheduled && is_sfu_or_mem(ctx->scheduled) && is_sfu_or_mem(instr))
-		ir3_NOP(ctx->block);
-
-	/* remove from depth list:
-	 */
-	list_delinit(&instr->node);
-
-	/* only one a0.x / p0.x writer may be outstanding at a time: */
-	if (writes_addr(instr)) {
-		debug_assert(ctx->addr == NULL);
-		ctx->addr = instr;
-	}
-
-	if (writes_pred(instr)) {
-		debug_assert(ctx->pred == NULL);
-		ctx->pred = instr;
-	}
-
-	/* IR3_INSTR_MARK is what is_scheduled() tests: */
-	instr->flags |= IR3_INSTR_MARK;
-
-	list_addtail(&instr->node, &instr->block->instr_list);
-	ctx->scheduled = instr;
-
-	/* addr/pred writers and inputs flush the whole cache: */
-	if (writes_addr(instr) || writes_pred(instr) || is_input(instr)) {
-		clear_cache(ctx, NULL);
-	} else {
-		/* invalidate only the necessary entries.. */
-		clear_cache(ctx, instr);
-	}
-}
-
-/* Pick the entry of srcs[] with the greatest depth, remove it from the
- * array (its slot is set to NULL), and return it.  NULL slots are
- * skipped; returns NULL when no entry is left.
- */
-static struct ir3_instruction *
-deepest(struct ir3_instruction **srcs, unsigned nsrcs)
-{
-	struct ir3_instruction *best = NULL;
-	unsigned best_idx = 0;
-
-	for (unsigned i = 0; i < nsrcs; i++) {
-		struct ir3_instruction *cur = srcs[i];
-
-		if (!cur)
-			continue;
-
-		/* strictly-greater keeps the earliest entry on ties: */
-		if (!best || (cur->depth > best->depth)) {
-			best = cur;
-			best_idx = i;
-		}
-	}
-
-	if (!best)
-		return NULL;
-
-	/* consume the slot so a later call yields the next candidate: */
-	srcs[best_idx] = NULL;
-
-	return best;
-}
-
-/**
- * Count the instruction slots between the end of @block and @instr,
- * searching backwards.  The result is capped at @maxd.
- *
- * @block: the block to search in, starting from end; in first pass,
- *    this will be the block the instruction would be inserted into
- *    (but has not yet, ie. it only contains already scheduled
- *    instructions).  For intra-block scheduling (second pass), this
- *    would be one of the predecessor blocks.
- * @instr: the instruction to search for
- * @maxd:  max distance, bail after searching this # of instruction
- *    slots, since it means the instruction we are looking for is
- *    far enough away
- * @pred:  if true, recursively search into predecessor blocks to
- *    find the worst case (shortest) distance (only possible after
- *    individual blocks are all scheduled)
- */
-static unsigned
-distance(struct ir3_block *block, struct ir3_instruction *instr,
-		unsigned maxd, bool pred)
-{
-	unsigned d = 0;
-
-	list_for_each_entry_rev (struct ir3_instruction, n, &block->instr_list, node) {
-		if ((n == instr) || (d >= maxd))
-			return d;
-		/* NOTE: don't count branch/jump since we don't know yet if they will
-		 * be eliminated later in resolve_jumps().. really should do that
-		 * earlier so we don't have this constraint.
-		 */
-		if (is_alu(n) || (is_flow(n) && (n->opc != OPC_JUMP) && (n->opc != OPC_BR)))
-			d++;
-	}
-
-	/* if coming from a predecessor block, assume it is assigned far
-	 * enough away.. we'll fix up later.
-	 */
-	if (!pred)
-		return maxd;
-
-	/* pred is always true from here on; block->data == block marks a
-	 * block that is already being visited further up the call chain:
-	 */
-	if (pred && (block->data != block)) {
-		/* Search into predecessor blocks, finding the one with the
-		 * shortest distance, since that will be the worst case
-		 */
-		unsigned min = maxd - d;
-
-		/* (ab)use block->data to prevent recursion: */
-		block->data = block;
-
-		for (unsigned i = 0; i < block->predecessors_count; i++) {
-			unsigned n;
-
-			n = distance(block->predecessors[i], instr, min, pred);
-
-			min = MIN2(min, n);
-		}
-
-		block->data = NULL;
-		d += min;
-	}
-
-	return d;
-}
-
-/* Number of delay slots still required between 'assigner' and
- * 'consumer' (for the consumer's src number srcn), after subtracting
- * the slots already separating them.  Meta instructions are looked
- * through:  the result is the maximum over their ssa sources.  In
- * 'soft' mode an sfu assigner is given a fixed delay of 4.
- */
-static unsigned
-delay_calc_srcn(struct ir3_block *block,
-		struct ir3_instruction *assigner,
-		struct ir3_instruction *consumer,
-		unsigned srcn, bool soft, bool pred)
-{
-	struct ir3_instruction *src;
-	unsigned worst = 0;
-
-	if (!is_meta(assigner)) {
-		unsigned needed;
-
-		if (soft && is_sfu(assigner))
-			needed = 4;
-		else
-			needed = ir3_delayslots(assigner, consumer, srcn);
-
-		/* distance() is capped at 'needed', so this cannot wrap: */
-		return needed - distance(block, assigner, needed, pred);
-	}
-
-	foreach_ssa_src(src, assigner) {
-		unsigned cur = delay_calc_srcn(block, src, consumer, srcn, soft, pred);
-
-		if (cur > worst)
-			worst = cur;
-	}
-
-	return worst;
-}
-
-/* Delay required before 'instr' can issue:  the largest delay over
- * all of its ssa sources.
- */
-static unsigned
-delay_calc(struct ir3_block *block, struct ir3_instruction *instr,
-		bool soft, bool pred)
-{
-	struct ir3_instruction *src;
-	unsigned worst = 0;
-
-	foreach_ssa_src_n(src, i, instr) {
-		unsigned cur = delay_calc_srcn(block, src, instr, i, soft, pred);
-
-		if (cur > worst)
-			worst = cur;
-	}
-
-	return worst;
-}
-
-/* Reasons collected by check_instr() for why an otherwise ready
- * instruction could not be scheduled.
- */
-struct ir3_sched_notes {
-	/* there is at least one kill which could be scheduled, except
-	 * for unscheduled bary.f's:
-	 */
-	bool blocked_kill;
-	/* there is at least one instruction that could be scheduled,
-	 * except for conflicting address/predicate register usage:
-	 */
-	bool addr_conflict, pred_conflict;
-};
-
-/* An instruction counts as scheduled once IR3_INSTR_MARK is set. */
-static bool is_scheduled(struct ir3_instruction *instr)
-{
-	return (instr->flags & IR3_INSTR_MARK) != 0;
-}
-
-/* Would 'instr' be ready once 'src' is scheduled?  That is the case
- * when every other ssa source of instr is already scheduled.
- */
-static bool
-could_sched(struct ir3_instruction *instr, struct ir3_instruction *src)
-{
-	struct ir3_instruction *other_src;
-
-	foreach_ssa_src(other_src, instr) {
-		/* the src being asked about is assumed to be handled: */
-		if (other_src == src)
-			continue;
-		/* any other unscheduled dependency means not ready yet: */
-		if (!is_scheduled(other_src))
-			return false;
-	}
-
-	return true;
-}
-
-/* Check if instruction is ok to schedule.  Make sure it is not blocked
- * by use of addr/predicate register, etc.
- *
- * Returns true if 'instr' may be scheduled now.  When it is rejected
- * because of an a0.x/p0.x conflict or a blocked kill, the matching
- * flag in 'notes' is set.
- */
-static bool
-check_instr(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
-		struct ir3_instruction *instr)
-{
-	/* For instructions that write address register we need to
-	 * make sure there is at least one instruction that uses the
-	 * addr value which is otherwise ready.
-	 *
-	 * TODO if any instructions use pred register and have other
-	 * src args, we would need to do the same for writes_pred()..
-	 */
-	if (writes_addr(instr)) {
-		struct ir3 *ir = instr->block->shader;
-		bool ready = false;
-		/* stop at the first user that could be scheduled: */
-		for (unsigned i = 0; (i < ir->indirects_count) && !ready; i++) {
-			struct ir3_instruction *indirect = ir->indirects[i];
-			if (!indirect)
-				continue;
-			if (indirect->address != instr)
-				continue;
-			ready = could_sched(indirect, instr);
-		}
-
-		/* nothing could be scheduled, so keep looking: */
-		if (!ready)
-			return false;
-	}
-
-	/* if this is a write to address/predicate register, and that
-	 * register is currently in use, we need to defer until it is
-	 * free:
-	 */
-	if (writes_addr(instr) && ctx->addr) {
-		debug_assert(ctx->addr != instr);
-		notes->addr_conflict = true;
-		return false;
-	}
-
-	if (writes_pred(instr) && ctx->pred) {
-		debug_assert(ctx->pred != instr);
-		notes->pred_conflict = true;
-		return false;
-	}
-
-	/* if the instruction is a kill, we need to ensure *every*
-	 * bary.f is scheduled.  The hw seems unhappy if the thread
-	 * gets killed before the end-input (ei) flag is hit.
-	 *
-	 * We could do this by adding each bary.f instruction as
-	 * virtual ssa src for the kill instruction.  But we have
-	 * fixed length instr->regs[].
-	 *
-	 * TODO this wouldn't be quite right if we had multiple
-	 * basic blocks, if any block was conditional.  We'd need
-	 * to schedule the bary.f's outside of any block which
-	 * was conditional that contained a kill.. I think..
-	 */
-	if (is_kill(instr)) {
-		struct ir3 *ir = instr->block->shader;
-
-		for (unsigned i = 0; i < ir->baryfs_count; i++) {
-			struct ir3_instruction *baryf = ir->baryfs[i];
-			/* unused bary.f's never get scheduled, so skip them: */
-			if (baryf->flags & IR3_INSTR_UNUSED)
-				continue;
-			if (!is_scheduled(baryf)) {
-				notes->blocked_kill = true;
-				return false;
-			}
-		}
-	}
-
-	return true;
-}
-
-/* Find the best instruction to schedule from specified instruction or
- * recursively its ssa sources.
- *
- * Returns the candidate, or NULL if instr is already scheduled or
- * nothing beneath it passes check_instr().  Results (including the
- * negative one, as NULL_INSTR) are cached in instr->data.
- */
-static struct ir3_instruction *
-find_instr_recursive(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
-		struct ir3_instruction *instr)
-{
-	struct ir3_instruction *srcs[__ssa_src_cnt(instr)];
-	struct ir3_instruction *src;
-	unsigned nsrcs = 0;
-
-	if (is_scheduled(instr))
-		return NULL;
-
-	/* use instr->data to cache the results of recursing up the
-	 * instr src's.  Otherwise the recursive algo can scale quite
-	 * badly w/ shader size.  But this takes some care to clear
-	 * the cache appropriately when instructions are scheduled.
-	 */
-	if (instr->data) {
-		if (instr->data == NULL_INSTR)
-			return NULL;
-		return instr->data;
-	}
-
-	/* find unscheduled srcs: */
-	foreach_ssa_src(src, instr) {
-		if (!is_scheduled(src)) {
-			debug_assert(nsrcs < ARRAY_SIZE(srcs));
-			srcs[nsrcs++] = src;
-		}
-	}
-
-	/* if all our src's are already scheduled: */
-	if (nsrcs == 0) {
-		if (check_instr(ctx, notes, instr)) {
-			instr->data = instr;
-			return instr;
-		}
-		/* note: this rejection is not cached in instr->data */
-		return NULL;
-	}
-
-	/* try the unscheduled srcs from deepest to shallowest; deepest()
-	 * removes each one from srcs[] as it is returned:
-	 */
-	while ((src = deepest(srcs, nsrcs))) {
-		struct ir3_instruction *candidate;
-
-		candidate = find_instr_recursive(ctx, notes, src);
-		if (!candidate)
-			continue;
-
-		if (check_instr(ctx, notes, candidate)) {
-			instr->data = candidate;
-			return candidate;
-		}
-	}
-
-	instr->data = NULL_INSTR;
-	return NULL;
-}
-
-/* find instruction to schedule:
- *
- * Walks the depth list in reverse, asks find_instr_recursive() for a
- * candidate under each entry, and returns the candidate with the
- * smallest delay (stopping early at a delay of zero).  Returns NULL
- * if no candidate was found.
- */
-static struct ir3_instruction *
-find_eligible_instr(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
-		bool soft)
-{
-	struct ir3_instruction *best_instr = NULL;
-	unsigned min_delay = ~0;
-
-	/* TODO we'd really rather use the list/array of block outputs.  But we
-	 * don't have such a thing.  Recursing *every* instruction in the list
-	 * will result in a lot of repeated traversal, since instructions will
-	 * get traversed both when they appear as ssa src to a later instruction
-	 * as well as where they appear in the depth_list.
-	 */
-	list_for_each_entry_rev (struct ir3_instruction, instr, &ctx->depth_list, node) {
-		struct ir3_instruction *candidate;
-		unsigned delay;
-
-		candidate = find_instr_recursive(ctx, notes, instr);
-		if (!candidate)
-			continue;
-
-		/* delay within the current block only (pred == false): */
-		delay = delay_calc(ctx->block, candidate, soft, false);
-		if (delay < min_delay) {
-			best_instr = candidate;
-			min_delay = delay;
-		}
-
-		/* cannot do better than no delay at all: */
-		if (min_delay == 0)
-			break;
-	}
-
-	return best_instr;
-}
-
-/* "spill" the address register by remapping any unscheduled
- * instructions which depend on the current address register
- * to a clone of the instruction which wrote the address reg.
- *
- * Returns the clone, or NULL if no unscheduled instruction was using
- * the current address register.  ctx->addr is cleared either way.
- */
-static struct ir3_instruction *
-split_addr(struct ir3_sched_ctx *ctx)
-{
-	struct ir3 *ir;
-	struct ir3_instruction *new_addr = NULL;
-	unsigned i;
-
-	debug_assert(ctx->addr);
-
-	ir = ctx->addr->block->shader;
-
-	for (i = 0; i < ir->indirects_count; i++) {
-		struct ir3_instruction *indirect = ir->indirects[i];
-
-		if (!indirect)
-			continue;
-
-		/* skip instructions already scheduled: */
-		if (is_scheduled(indirect))
-			continue;
-
-		/* remap remaining instructions using current addr
-		 * to new addr:
-		 */
-		if (indirect->address == ctx->addr) {
-			/* the clone is created lazily, on the first user found: */
-			if (!new_addr) {
-				new_addr = ir3_instr_clone(ctx->addr);
-				/* original addr is scheduled, but new one isn't: */
-				new_addr->flags &= ~IR3_INSTR_MARK;
-			}
-			ir3_instr_set_address(indirect, new_addr);
-		}
-	}
-
-	/* all remaining indirects remapped to new addr: */
-	ctx->addr = NULL;
-
-	return new_addr;
-}
-
-/* "spill" the predicate register by remapping any unscheduled
- * instructions which depend on the current predicate register
- * to a clone of the instruction which wrote the address reg.
- */
-static struct ir3_instruction *
-split_pred(struct ir3_sched_ctx *ctx)
-{
-	struct ir3 *ir;
-	struct ir3_instruction *new_pred = NULL;
-	unsigned i;
-
-	debug_assert(ctx->pred);
-
-	ir = ctx->pred->block->shader;
-
-	for (i = 0; i < ir->predicates_count; i++) {
-		struct ir3_instruction *predicated = ir->predicates[i];
-
-		/* skip instructions already scheduled: */
-		if (is_scheduled(predicated))
-			continue;
-
-		/* remap remaining instructions using current pred
-		 * to new pred:
-		 *
-		 * TODO is there ever a case when pred isn't first
-		 * (and only) src?
-		 */
-		if (ssa(predicated->regs[1]) == ctx->pred) {
-			if (!new_pred) {
-				new_pred = ir3_instr_clone(ctx->pred);
-				/* original pred is scheduled, but new one isn't: */
-				new_pred->flags &= ~IR3_INSTR_MARK;
-			}
-			predicated->regs[1]->instr = new_pred;
-		}
-	}
-
-	/* all remaining predicated remapped to new pred: */
-	ctx->pred = NULL;
-
-	return new_pred;
-}
-
-static void
-sched_block(struct ir3_sched_ctx *ctx, struct ir3_block *block)
-{
-	struct list_head unscheduled_list;
-
-	ctx->block = block;
-
-	/* addr/pred writes are per-block: */
-	ctx->addr = NULL;
-	ctx->pred = NULL;
-
-	/* move all instructions to the unscheduled list, and
-	 * empty the block's instruction list (to which we will
-	 * be inserting).
-	 */
-	list_replace(&block->instr_list, &unscheduled_list);
-	list_inithead(&block->instr_list);
-	list_inithead(&ctx->depth_list);
-
-	/* first a pre-pass to schedule all meta:input instructions
-	 * (which need to appear first so that RA knows the register is
-	 * occupied), and move remaining to depth sorted list:
-	 */
-	list_for_each_entry_safe (struct ir3_instruction, instr, &unscheduled_list, node) {
-		if (instr->opc == OPC_META_INPUT) {
-			schedule(ctx, instr);
-		} else {
-			ir3_insert_by_depth(instr, &ctx->depth_list);
-		}
-	}
-
-	while (!list_empty(&ctx->depth_list)) {
-		struct ir3_sched_notes notes = {0};
-		struct ir3_instruction *instr;
-
-		instr = find_eligible_instr(ctx, &notes, true);
-		if (!instr)
-			instr = find_eligible_instr(ctx, &notes, false);
-
-		if (instr) {
-			unsigned delay = delay_calc(ctx->block, instr, false, false);
-
-			/* and if we run out of instructions that can be scheduled,
-			 * then it is time for nop's:
-			 */
-			debug_assert(delay <= 6);
-			while (delay > 0) {
-				ir3_NOP(block);
-				delay--;
-			}
-
-			schedule(ctx, instr);
-		} else {
-			struct ir3_instruction *new_instr = NULL;
-
-			/* nothing available to schedule.. if we are blocked on
-			 * address/predicate register conflict, then break the
-			 * deadlock by cloning the instruction that wrote that
-			 * reg:
-			 */
-			if (notes.addr_conflict) {
-				new_instr = split_addr(ctx);
-			} else if (notes.pred_conflict) {
-				new_instr = split_pred(ctx);
-			} else {
-				debug_assert(0);
-				ctx->error = true;
-				return;
-			}
-
-			if (new_instr) {
-				/* clearing current addr/pred can change what is
-				 * available to schedule, so clear cache..
-				 */
-				clear_cache(ctx, NULL);
-
-				ir3_insert_by_depth(new_instr, &ctx->depth_list);
-				/* the original instr that wrote addr/pred may have
-				 * originated from a different block:
-				 */
-				new_instr->block = block;
-			}
-		}
-	}
-
-	/* And lastly, insert branch/jump instructions to take us to
-	 * the next block.  Later we'll strip back out the branches
-	 * that simply jump to next instruction.
-	 */
-	if (block->successors[1]) {
-		/* if/else, conditional branches to "then" or "else": */
-		struct ir3_instruction *br;
-		unsigned delay = 6;
-
-		debug_assert(ctx->pred);
-		debug_assert(block->condition);
-
-		delay -= distance(ctx->block, ctx->pred, delay, false);
-
-		while (delay > 0) {
-			ir3_NOP(block);
-			delay--;
-		}
-
-		/* create "else" branch first (since "then" block should
-		 * frequently/always end up being a fall-thru):
-		 */
-		br = ir3_BR(block);
-		br->cat0.inv = true;
-		br->cat0.target = block->successors[1];
-
-		/* NOTE: we have to hard code delay of 6 above, since
-		 * we want to insert the nop's before constructing the
-		 * branch.  Throw in an assert so we notice if this
-		 * ever breaks on future generation:
-		 */
-		debug_assert(ir3_delayslots(ctx->pred, br, 0) == 6);
-
-		br = ir3_BR(block);
-		br->cat0.target = block->successors[0];
-
-	} else if (block->successors[0]) {
-		/* otherwise unconditional jump to next block: */
-		struct ir3_instruction *jmp;
-
-		jmp = ir3_JUMP(block);
-		jmp->cat0.target = block->successors[0];
-	}
-
-	/* NOTE: if we kept track of the predecessors, we could do a better
-	 * job w/ (jp) flags.. every node w/ > predecessor is a join point.
-	 * Note that as we eliminate blocks which contain only an unconditional
-	 * jump we probably need to propagate (jp) flag..
-	 */
-}
-
-/* After scheduling individual blocks, we still could have cases where
- * one (or more) paths into a block, a value produced by a previous
- * has too few delay slots to be legal.  We can't deal with this in the
- * first pass, because loops (ie. we can't ensure all predecessor blocks
- * are already scheduled in the first pass).  All we can really do at
- * this point is stuff in extra nop's until things are legal.
- */
-static void
-sched_intra_block(struct ir3_sched_ctx *ctx, struct ir3_block *block)
-{
-	unsigned n = 0;
-
-	ctx->block = block;
-
-	list_for_each_entry_safe (struct ir3_instruction, instr, &block->instr_list, node) {
-		unsigned delay = 0;
-
-		for (unsigned i = 0; i < block->predecessors_count; i++) {
-			unsigned d = delay_calc(block->predecessors[i], instr, false, true);
-			delay = MAX2(d, delay);
-		}
-
-		while (delay > n) {
-			struct ir3_instruction *nop = ir3_NOP(block);
-
-			/* move to before instr: */
-			list_delinit(&nop->node);
-			list_addtail(&nop->node, &instr->node);
-
-			n++;
-		}
-
-		/* we can bail once we hit worst case delay: */
-		if (++n > 6)
-			break;
-	}
-}
-
-int ir3_sched(struct ir3 *ir)
-{
-	struct ir3_sched_ctx ctx = {0};
-
-	ir3_clear_mark(ir);
-
-	list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
-		sched_block(&ctx, block);
-	}
-
-	list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
-		sched_intra_block(&ctx, block);
-	}
-
-	if (ctx.error)
-		return -1;
-	return 0;
-}
-
-/* does instruction 'prior' need to be scheduled before 'instr'? */
-static bool
-depends_on(struct ir3_instruction *instr, struct ir3_instruction *prior)
-{
-	/* TODO for dependencies that are related to a specific object, ie
-	 * a specific SSBO/image/array, we could relax this constraint to
-	 * make accesses to unrelated objects not depend on each other (at
-	 * least as long as not declared coherent)
-	 */
-	if (((instr->barrier_class & IR3_BARRIER_EVERYTHING) && prior->barrier_class) ||
-			((prior->barrier_class & IR3_BARRIER_EVERYTHING) && instr->barrier_class))
-		return true;
-	return !!(instr->barrier_class & prior->barrier_conflict);
-}
-
-static void
-add_barrier_deps(struct ir3_block *block, struct ir3_instruction *instr)
-{
-	struct list_head *prev = instr->node.prev;
-	struct list_head *next = instr->node.next;
-
-	/* add dependencies on previous instructions that must be scheduled
-	 * prior to the current instruction
-	 */
-	while (prev != &block->instr_list) {
-		struct ir3_instruction *pi =
-			LIST_ENTRY(struct ir3_instruction, prev, node);
-
-		prev = prev->prev;
-
-		if (is_meta(pi))
-			continue;
-
-		if (instr->barrier_class == pi->barrier_class) {
-			ir3_instr_add_dep(instr, pi);
-			break;
-		}
-
-		if (depends_on(instr, pi))
-			ir3_instr_add_dep(instr, pi);
-	}
-
-	/* add dependencies on this instruction to following instructions
-	 * that must be scheduled after the current instruction:
-	 */
-	while (next != &block->instr_list) {
-		struct ir3_instruction *ni =
-			LIST_ENTRY(struct ir3_instruction, next, node);
-
-		next = next->next;
-
-		if (is_meta(ni))
-			continue;
-
-		if (instr->barrier_class == ni->barrier_class) {
-			ir3_instr_add_dep(ni, instr);
-			break;
-		}
-
-		if (depends_on(ni, instr))
-			ir3_instr_add_dep(ni, instr);
-	}
-}
-
-/* before scheduling a block, we need to add any necessary false-dependencies
- * to ensure that:
- *
- *  (1) barriers are scheduled in the right order wrt instructions related
- *      to the barrier
- *
- *  (2) reads that come before a write actually get scheduled before the
- *      write
- */
-static void
-calculate_deps(struct ir3_block *block)
-{
-	list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) {
-		if (instr->barrier_class) {
-			add_barrier_deps(block, instr);
-		}
-	}
-}
-
-void
-ir3_sched_add_deps(struct ir3 *ir)
-{
-	list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
-		calculate_deps(block);
-	}
-}
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_shader.c b/src/gallium/drivers/freedreno/ir3/ir3_shader.c
deleted file mode 100644
index b58a204c6b9..00000000000
--- a/src/gallium/drivers/freedreno/ir3/ir3_shader.c
+++ /dev/null
@@ -1,436 +0,0 @@
-/*
- * Copyright (C) 2014 Rob Clark <[email protected]>
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- *
- * Authors:
- *    Rob Clark <[email protected]>
- */
-
-#include "util/u_string.h"
-#include "util/u_memory.h"
-#include "util/u_format.h"
-
-#include "freedreno_util.h"
-
-#include "ir3_shader.h"
-#include "ir3_compiler.h"
-#include "ir3_nir.h"
-
-int
-ir3_glsl_type_size(const struct glsl_type *type)
-{
-	return glsl_count_attribute_slots(type, false);
-}
-
-static void
-delete_variant(struct ir3_shader_variant *v)
-{
-	if (v->ir)
-		ir3_destroy(v->ir);
-	if (v->bo)
-		fd_bo_del(v->bo);
-	if (v->immediates)
-		free(v->immediates);
-	free(v);
-}
-
-/* for vertex shader, the inputs are loaded into registers before the shader
- * is executed, so max_regs from the shader instructions might not properly
- * reflect the # of registers actually used, especially in case passthrough
- * varyings.
- *
- * Likewise, for fragment shader, we can have some regs which are passed
- * input values but never touched by the resulting shader (ie. as result
- * of dead code elimination or simply because we don't know how to turn
- * the reg off.
- */
-static void
-fixup_regfootprint(struct ir3_shader_variant *v)
-{
-	unsigned i;
-
-	for (i = 0; i < v->inputs_count; i++) {
-		/* skip frag inputs fetch via bary.f since their reg's are
-		 * not written by gpu before shader starts (and in fact the
-		 * regid's might not even be valid)
-		 */
-		if (v->inputs[i].bary)
-			continue;
-
-		/* ignore high regs that are global to all threads in a warp
-		 * (they exist by default) (a5xx+)
-		 */
-		if (v->inputs[i].regid >= regid(48,0))
-			continue;
-
-		if (v->inputs[i].compmask) {
-			unsigned n = util_last_bit(v->inputs[i].compmask) - 1;
-			int32_t regid = (v->inputs[i].regid + n) >> 2;
-			v->info.max_reg = MAX2(v->info.max_reg, regid);
-		}
-	}
-
-	for (i = 0; i < v->outputs_count; i++) {
-		int32_t regid = (v->outputs[i].regid + 3) >> 2;
-		v->info.max_reg = MAX2(v->info.max_reg, regid);
-	}
-}
-
-/* wrapper for ir3_assemble() which does some info fixup based on
- * shader state.  Non-static since used by ir3_cmdline too.
- */
-void * ir3_shader_assemble(struct ir3_shader_variant *v, uint32_t gpu_id)
-{
-	void *bin;
-
-	bin = ir3_assemble(v->ir, &v->info, gpu_id);
-	if (!bin)
-		return NULL;
-
-	if (gpu_id >= 400) {
-		v->instrlen = v->info.sizedwords / (2 * 16);
-	} else {
-		v->instrlen = v->info.sizedwords / (2 * 4);
-	}
-
-	/* NOTE: if relative addressing is used, we set constlen in
-	 * the compiler (to worst-case value) since we don't know in
-	 * the assembler what the max addr reg value can be:
-	 */
-	v->constlen = MIN2(255, MAX2(v->constlen, v->info.max_const + 1));
-
-	fixup_regfootprint(v);
-
-	return bin;
-}
-
-static void
-assemble_variant(struct ir3_shader_variant *v)
-{
-	struct ir3_compiler *compiler = v->shader->compiler;
-	uint32_t gpu_id = compiler->gpu_id;
-	uint32_t sz, *bin;
-
-	bin = ir3_shader_assemble(v, gpu_id);
-	sz = v->info.sizedwords * 4;
-
-	v->bo = fd_bo_new(compiler->dev, sz,
-			DRM_FREEDRENO_GEM_CACHE_WCOMBINE |
-			DRM_FREEDRENO_GEM_TYPE_KMEM);
-
-	memcpy(fd_bo_map(v->bo), bin, sz);
-
-	if (ir3_shader_debug & IR3_DBG_DISASM) {
-		struct ir3_shader_key key = v->key;
-		printf("disassemble: type=%d, k={bp=%u,cts=%u,hp=%u}", v->type,
-			v->binning_pass, key.color_two_side, key.half_precision);
-		ir3_shader_disasm(v, bin, stdout);
-	}
-
-	if (shader_debug_enabled(v->shader->type)) {
-		fprintf(stderr, "Native code for unnamed %s shader %s:\n",
-			_mesa_shader_stage_to_string(v->shader->type),
-			v->shader->nir->info.name);
-		if (v->shader->type == MESA_SHADER_FRAGMENT)
-			fprintf(stderr, "SIMD0\n");
-		ir3_shader_disasm(v, bin, stderr);
-	}
-
-	free(bin);
-
-	/* no need to keep the ir around beyond this point: */
-	ir3_destroy(v->ir);
-	v->ir = NULL;
-}
-
-static struct ir3_shader_variant *
-create_variant(struct ir3_shader *shader, struct ir3_shader_key *key,
-		bool binning_pass)
-{
-	struct ir3_shader_variant *v = CALLOC_STRUCT(ir3_shader_variant);
-	int ret;
-
-	if (!v)
-		return NULL;
-
-	v->id = ++shader->variant_count;
-	v->shader = shader;
-	v->binning_pass = binning_pass;
-	v->key = *key;
-	v->type = shader->type;
-
-	ret = ir3_compile_shader_nir(shader->compiler, v);
-	if (ret) {
-		debug_error("compile failed!");
-		goto fail;
-	}
-
-	assemble_variant(v);
-	if (!v->bo) {
-		debug_error("assemble failed!");
-		goto fail;
-	}
-
-	return v;
-
-fail:
-	delete_variant(v);
-	return NULL;
-}
-
-static inline struct ir3_shader_variant *
-shader_variant(struct ir3_shader *shader, struct ir3_shader_key *key,
-		bool *created)
-{
-	struct ir3_shader_variant *v;
-
-	*created = false;
-
-	for (v = shader->variants; v; v = v->next)
-		if (ir3_shader_key_equal(key, &v->key))
-			return v;
-
-	/* compile new variant if it doesn't exist already: */
-	v = create_variant(shader, key, false);
-	if (v) {
-		v->next = shader->variants;
-		shader->variants = v;
-		*created = true;
-	}
-
-	return v;
-}
-
-struct ir3_shader_variant *
-ir3_shader_get_variant(struct ir3_shader *shader, struct ir3_shader_key *key,
-		bool binning_pass, bool *created)
-{
-	struct ir3_shader_variant *v =
-			shader_variant(shader, key, created);
-
-	if (binning_pass) {
-		if (!v->binning)
-			v->binning = create_variant(shader, key, true);
-		return v->binning;
-	}
-
-	return v;
-}
-
-void
-ir3_shader_destroy(struct ir3_shader *shader)
-{
-	struct ir3_shader_variant *v, *t;
-	for (v = shader->variants; v; ) {
-		t = v;
-		v = v->next;
-		delete_variant(t);
-	}
-	ralloc_free(shader->nir);
-	free(shader);
-}
-
-struct ir3_shader *
-ir3_shader_from_nir(struct ir3_compiler *compiler, nir_shader *nir)
-{
-	struct ir3_shader *shader = CALLOC_STRUCT(ir3_shader);
-
-	shader->compiler = compiler;
-	shader->id = ++shader->compiler->shader_count;
-	shader->type = nir->info.stage;
-
-	NIR_PASS_V(nir, nir_lower_io, nir_var_all, ir3_glsl_type_size,
-			   (nir_lower_io_options)0);
-
-	/* do first pass optimization, ignoring the key: */
-	shader->nir = ir3_optimize_nir(shader, nir, NULL);
-	if (ir3_shader_debug & IR3_DBG_DISASM) {
-		printf("dump nir%d: type=%d", shader->id, shader->type);
-		nir_print_shader(shader->nir, stdout);
-	}
-
-	return shader;
-}
-
-static void dump_reg(FILE *out, const char *name, uint32_t r)
-{
-	if (r != regid(63,0))
-		fprintf(out, "; %s: r%d.%c\n", name, r >> 2, "xyzw"[r & 0x3]);
-}
-
-static void dump_output(FILE *out, struct ir3_shader_variant *so,
-		unsigned slot, const char *name)
-{
-	uint32_t regid;
-	regid = ir3_find_output_regid(so, slot);
-	dump_reg(out, name, regid);
-}
-
-void
-ir3_shader_disasm(struct ir3_shader_variant *so, uint32_t *bin, FILE *out)
-{
-	struct ir3 *ir = so->ir;
-	struct ir3_register *reg;
-	const char *type = ir3_shader_stage(so->shader);
-	uint8_t regid;
-	unsigned i;
-
-	for (i = 0; i < ir->ninputs; i++) {
-		if (!ir->inputs[i]) {
-			fprintf(out, "; in%d unused\n", i);
-			continue;
-		}
-		reg = ir->inputs[i]->regs[0];
-		regid = reg->num;
-		fprintf(out, "@in(%sr%d.%c)\tin%d\n",
-				(reg->flags & IR3_REG_HALF) ? "h" : "",
-				(regid >> 2), "xyzw"[regid & 0x3], i);
-	}
-
-	for (i = 0; i < ir->noutputs; i++) {
-		if (!ir->outputs[i]) {
-			fprintf(out, "; out%d unused\n", i);
-			continue;
-		}
-		/* kill shows up as a virtual output.. skip it! */
-		if (is_kill(ir->outputs[i]))
-			continue;
-		reg = ir->outputs[i]->regs[0];
-		regid = reg->num;
-		fprintf(out, "@out(%sr%d.%c)\tout%d\n",
-				(reg->flags & IR3_REG_HALF) ? "h" : "",
-				(regid >> 2), "xyzw"[regid & 0x3], i);
-	}
-
-	for (i = 0; i < so->immediates_count; i++) {
-		fprintf(out, "@const(c%d.x)\t", so->constbase.immediate + i);
-		fprintf(out, "0x%08x, 0x%08x, 0x%08x, 0x%08x\n",
-				so->immediates[i].val[0],
-				so->immediates[i].val[1],
-				so->immediates[i].val[2],
-				so->immediates[i].val[3]);
-	}
-
-	disasm_a3xx(bin, so->info.sizedwords, 0, out);
-
-	switch (so->type) {
-	case MESA_SHADER_VERTEX:
-		fprintf(out, "; %s: outputs:", type);
-		for (i = 0; i < so->outputs_count; i++) {
-			uint8_t regid = so->outputs[i].regid;
-			fprintf(out, " r%d.%c (%s)",
-					(regid >> 2), "xyzw"[regid & 0x3],
-					gl_varying_slot_name(so->outputs[i].slot));
-		}
-		fprintf(out, "\n");
-		fprintf(out, "; %s: inputs:", type);
-		for (i = 0; i < so->inputs_count; i++) {
-			uint8_t regid = so->inputs[i].regid;
-			fprintf(out, " r%d.%c (cm=%x,il=%u,b=%u)",
-					(regid >> 2), "xyzw"[regid & 0x3],
-					so->inputs[i].compmask,
-					so->inputs[i].inloc,
-					so->inputs[i].bary);
-		}
-		fprintf(out, "\n");
-		break;
-	case MESA_SHADER_FRAGMENT:
-		fprintf(out, "; %s: outputs:", type);
-		for (i = 0; i < so->outputs_count; i++) {
-			uint8_t regid = so->outputs[i].regid;
-			fprintf(out, " r%d.%c (%s)",
-					(regid >> 2), "xyzw"[regid & 0x3],
-					gl_frag_result_name(so->outputs[i].slot));
-		}
-		fprintf(out, "\n");
-		fprintf(out, "; %s: inputs:", type);
-		for (i = 0; i < so->inputs_count; i++) {
-			uint8_t regid = so->inputs[i].regid;
-			fprintf(out, " r%d.%c (%s,cm=%x,il=%u,b=%u)",
-					(regid >> 2), "xyzw"[regid & 0x3],
-					gl_varying_slot_name(so->inputs[i].slot),
-					so->inputs[i].compmask,
-					so->inputs[i].inloc,
-					so->inputs[i].bary);
-		}
-		fprintf(out, "\n");
-		break;
-	default:
-		/* TODO */
-		break;
-	}
-
-	/* print generic shader info: */
-	fprintf(out, "; %s prog %d/%d: %u instructions, %d half, %d full\n",
-			type, so->shader->id, so->id,
-			so->info.instrs_count,
-			so->info.max_half_reg + 1,
-			so->info.max_reg + 1);
-
-	fprintf(out, "; %d const, %u constlen\n",
-			so->info.max_const + 1,
-			so->constlen);
-
-	fprintf(out, "; %u (ss), %u (sy)\n", so->info.ss, so->info.sy);
-
-	/* print shader type specific info: */
-	switch (so->type) {
-	case MESA_SHADER_VERTEX:
-		dump_output(out, so, VARYING_SLOT_POS, "pos");
-		dump_output(out, so, VARYING_SLOT_PSIZ, "psize");
-		break;
-	case MESA_SHADER_FRAGMENT:
-		dump_reg(out, "pos (bary)",
-			ir3_find_sysval_regid(so, SYSTEM_VALUE_VARYING_COORD));
-		dump_output(out, so, FRAG_RESULT_DEPTH, "posz");
-		if (so->color0_mrt) {
-			dump_output(out, so, FRAG_RESULT_COLOR, "color");
-		} else {
-			dump_output(out, so, FRAG_RESULT_DATA0, "data0");
-			dump_output(out, so, FRAG_RESULT_DATA1, "data1");
-			dump_output(out, so, FRAG_RESULT_DATA2, "data2");
-			dump_output(out, so, FRAG_RESULT_DATA3, "data3");
-			dump_output(out, so, FRAG_RESULT_DATA4, "data4");
-			dump_output(out, so, FRAG_RESULT_DATA5, "data5");
-			dump_output(out, so, FRAG_RESULT_DATA6, "data6");
-			dump_output(out, so, FRAG_RESULT_DATA7, "data7");
-		}
-		/* these two are hard-coded since we don't know how to
-		 * program them to anything but all 0's...
-		 */
-		if (so->frag_coord)
-			fprintf(out, "; fragcoord: r0.x\n");
-		if (so->frag_face)
-			fprintf(out, "; fragface: hr0.x\n");
-		break;
-	default:
-		/* TODO */
-		break;
-	}
-
-	fprintf(out, "\n");
-}
-
-uint64_t
-ir3_shader_outputs(const struct ir3_shader *so)
-{
-	return so->nir->info.outputs_written;
-}
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_shader.h b/src/gallium/drivers/freedreno/ir3/ir3_shader.h
deleted file mode 100644
index bc47160d6ea..00000000000
--- a/src/gallium/drivers/freedreno/ir3/ir3_shader.h
+++ /dev/null
@@ -1,587 +0,0 @@
-/*
- * Copyright (C) 2014 Rob Clark <[email protected]>
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- *
- * Authors:
- *    Rob Clark <[email protected]>
- */
-
-#ifndef IR3_SHADER_H_
-#define IR3_SHADER_H_
-
-#include <stdio.h>
-
-#include "compiler/shader_enums.h"
-#include "compiler/nir/nir.h"
-#include "util/bitscan.h"
-
-#include "ir3.h"
-
-struct glsl_type;
-
-/* driver param indices: */
-enum ir3_driver_param {
-	/* compute shader driver params: */
-	IR3_DP_NUM_WORK_GROUPS_X = 0,
-	IR3_DP_NUM_WORK_GROUPS_Y = 1,
-	IR3_DP_NUM_WORK_GROUPS_Z = 2,
-	IR3_DP_LOCAL_GROUP_SIZE_X = 4,
-	IR3_DP_LOCAL_GROUP_SIZE_Y = 5,
-	IR3_DP_LOCAL_GROUP_SIZE_Z = 6,
-	/* NOTE: gl_NumWorkGroups should be vec4 aligned because
-	 * glDispatchComputeIndirect() needs to load these from
-	 * the info->indirect buffer.  Keep that in mind when/if
-	 * adding any addition CS driver params.
-	 */
-	IR3_DP_CS_COUNT   = 8,   /* must be aligned to vec4 */
-
-	/* vertex shader driver params: */
-	IR3_DP_VTXID_BASE = 0,
-	IR3_DP_VTXCNT_MAX = 1,
-	/* user-clip-plane components, up to 8x vec4's: */
-	IR3_DP_UCP0_X     = 4,
-	/* .... */
-	IR3_DP_UCP7_W     = 35,
-	IR3_DP_VS_COUNT   = 36   /* must be aligned to vec4 */
-};
-
-#define IR3_MAX_SHADER_BUFFERS   32
-#define IR3_MAX_SHADER_IMAGES    32
-#define IR3_MAX_SO_BUFFERS        4
-#define IR3_MAX_SO_OUTPUTS       64
-
-/**
- * For consts needed to pass internal values to shader which may or may not
- * be required, rather than allocating worst-case const space, we scan the
- * shader and allocate consts as-needed:
- *
- *   + SSBO sizes: only needed if shader has a get_buffer_size intrinsic
- *     for a given SSBO
- *
- *   + Image dimensions: needed to calculate pixel offset, but only for
- *     images that have a image_store intrinsic
- */
-struct ir3_driver_const_layout {
-	struct {
-		uint32_t mask;  /* bitmask of SSBOs that have get_buffer_size */
-		uint32_t count; /* number of consts allocated */
-		/* one const allocated per SSBO which has get_buffer_size,
-		 * ssbo_sizes.off[ssbo_id] is offset from start of ssbo_sizes
-		 * consts:
-		 */
-		uint32_t off[IR3_MAX_SHADER_BUFFERS];
-	} ssbo_size;
-
-	struct {
-		uint32_t mask;  /* bitmask of images that have image_store */
-		uint32_t count; /* number of consts allocated */
-		/* three const allocated per image which has image_store:
-		 *  + cpp         (bytes per pixel)
-		 *  + pitch       (y pitch)
-		 *  + array_pitch (z pitch)
-		 */
-		uint32_t off[IR3_MAX_SHADER_IMAGES];
-	} image_dims;
-};
-
-/**
- * A single output for vertex transform feedback.
- */
-struct ir3_stream_output {
-	unsigned register_index:6;  /**< 0 to 63 (OUT index) */
-	unsigned start_component:2; /** 0 to 3 */
-	unsigned num_components:3;  /** 1 to 4 */
-	unsigned output_buffer:3;   /**< 0 to PIPE_MAX_SO_BUFFERS */
-	unsigned dst_offset:16;     /**< offset into the buffer in dwords */
-	unsigned stream:2;          /**< 0 to 3 */
-};
-
-/**
- * Stream output for vertex transform feedback.
- */
-struct ir3_stream_output_info {
-	unsigned num_outputs;
-	/** stride for an entire vertex for each buffer in dwords */
-	uint16_t stride[IR3_MAX_SO_BUFFERS];
-
-	/**
-	 * Array of stream outputs, in the order they are to be written in.
-	 * Selected components are tightly packed into the output buffer.
-	 */
-	struct ir3_stream_output output[IR3_MAX_SO_OUTPUTS];
-};
-
-/* Configuration key used to identify a shader variant.. different
- * shader variants can be used to implement features not supported
- * in hw (two sided color), binning-pass vertex shader, etc.
- */
-struct ir3_shader_key {
-	union {
-		struct {
-			/*
-			 * Combined Vertex/Fragment shader parameters:
-			 */
-			unsigned ucp_enables : 8;
-
-			/* do we need to check {v,f}saturate_{s,t,r}? */
-			unsigned has_per_samp : 1;
-
-			/*
-			 * Vertex shader variant parameters:
-			 */
-			unsigned vclamp_color : 1;
-
-			/*
-			 * Fragment shader variant parameters:
-			 */
-			unsigned color_two_side : 1;
-			unsigned half_precision : 1;
-			/* used when shader needs to handle flat varyings (a4xx)
-			 * for front/back color inputs to frag shader:
-			 */
-			unsigned rasterflat : 1;
-			unsigned fclamp_color : 1;
-		};
-		uint32_t global;
-	};
-
-	/* bitmask of sampler which needs coords clamped for vertex
-	 * shader:
-	 */
-	uint16_t vsaturate_s, vsaturate_t, vsaturate_r;
-
-	/* bitmask of sampler which needs coords clamped for frag
-	 * shader:
-	 */
-	uint16_t fsaturate_s, fsaturate_t, fsaturate_r;
-
-	/* bitmask of ms shifts */
-	uint32_t vsamples, fsamples;
-
-	/* bitmask of samplers which need astc srgb workaround: */
-	uint16_t vastc_srgb, fastc_srgb;
-};
-
-static inline bool
-ir3_shader_key_equal(struct ir3_shader_key *a, struct ir3_shader_key *b)
-{
-	/* slow-path if we need to check {v,f}saturate_{s,t,r} */
-	if (a->has_per_samp || b->has_per_samp)
-		return memcmp(a, b, sizeof(struct ir3_shader_key)) == 0;
-	return a->global == b->global;
-}
-
-/* will the two keys produce different lowering for a fragment shader? */
-static inline bool
-ir3_shader_key_changes_fs(struct ir3_shader_key *key, struct ir3_shader_key *last_key)
-{
-	if (last_key->has_per_samp || key->has_per_samp) {
-		if ((last_key->fsaturate_s != key->fsaturate_s) ||
-				(last_key->fsaturate_t != key->fsaturate_t) ||
-				(last_key->fsaturate_r != key->fsaturate_r) ||
-				(last_key->fsamples != key->fsamples) ||
-				(last_key->fastc_srgb != key->fastc_srgb))
-			return true;
-	}
-
-	if (last_key->fclamp_color != key->fclamp_color)
-		return true;
-
-	if (last_key->color_two_side != key->color_two_side)
-		return true;
-
-	if (last_key->half_precision != key->half_precision)
-		return true;
-
-	if (last_key->rasterflat != key->rasterflat)
-		return true;
-
-	if (last_key->ucp_enables != key->ucp_enables)
-		return true;
-
-	return false;
-}
-
-/* will the two keys produce different lowering for a vertex shader? */
-static inline bool
-ir3_shader_key_changes_vs(struct ir3_shader_key *key, struct ir3_shader_key *last_key)
-{
-	if (last_key->has_per_samp || key->has_per_samp) {
-		if ((last_key->vsaturate_s != key->vsaturate_s) ||
-				(last_key->vsaturate_t != key->vsaturate_t) ||
-				(last_key->vsaturate_r != key->vsaturate_r) ||
-				(last_key->vsamples != key->vsamples) ||
-				(last_key->vastc_srgb != key->vastc_srgb))
-			return true;
-	}
-
-	if (last_key->vclamp_color != key->vclamp_color)
-		return true;
-
-	if (last_key->ucp_enables != key->ucp_enables)
-		return true;
-
-	return false;
-}
-
-/* clears shader-key flags which don't apply to the given shader
- * stage
- */
-static inline void
-ir3_normalize_key(struct ir3_shader_key *key, gl_shader_stage type)
-{
-	switch (type) {
-	case MESA_SHADER_FRAGMENT:
-		if (key->has_per_samp) {
-			key->vsaturate_s = 0;
-			key->vsaturate_t = 0;
-			key->vsaturate_r = 0;
-			key->vastc_srgb = 0;
-			key->vsamples = 0;
-		}
-		break;
-	case MESA_SHADER_VERTEX:
-		key->color_two_side = false;
-		key->half_precision = false;
-		key->rasterflat = false;
-		if (key->has_per_samp) {
-			key->fsaturate_s = 0;
-			key->fsaturate_t = 0;
-			key->fsaturate_r = 0;
-			key->fastc_srgb = 0;
-			key->fsamples = 0;
-		}
-		break;
-	default:
-		/* TODO */
-		break;
-	}
-
-}
-
-struct ir3_shader_variant {
-	struct fd_bo *bo;
-
-	/* variant id (for debug) */
-	uint32_t id;
-
-	struct ir3_shader_key key;
-
-	/* vertex shaders can have an extra version for hwbinning pass,
-	 * which is pointed to by so->binning:
-	 */
-	bool binning_pass;
-	struct ir3_shader_variant *binning;
-
-	struct ir3_driver_const_layout const_layout;
-	struct ir3_info info;
-	struct ir3 *ir;
-
-	/* the instructions length is in units of instruction groups
-	 * (4 instructions for a3xx, 16 instructions for a4xx.. each
-	 * instruction is 2 dwords):
-	 */
-	unsigned instrlen;
-
-	/* the constants length is in units of vec4's, and is the sum of
-	 * the uniforms and the built-in compiler constants
-	 */
-	unsigned constlen;
-
-	/* number of uniforms (in vec4), not including built-in compiler
-	 * constants, etc.
-	 */
-	unsigned num_uniforms;
-
-	unsigned num_ubos;
-
-	/* About Linkage:
-	 *   + Let the frag shader determine the position/compmask for the
-	 *     varyings, since it is the place where we know if the varying
-	 *     is actually used, and if so, which components are used.  So
-	 *     what the hw calls "outloc" is taken from the "inloc" of the
-	 *     frag shader.
-	 *   + From the vert shader, we only need the output regid
-	 */
-
-	bool frag_coord, frag_face, color0_mrt;
-
-	/* NOTE: for input/outputs, slot is:
-	 *   gl_vert_attrib  - for VS inputs
-	 *   gl_varying_slot - for VS output / FS input
-	 *   gl_frag_result  - for FS output
-	 */
-
-	/* varyings/outputs: */
-	unsigned outputs_count;
-	struct {
-		uint8_t slot;
-		uint8_t regid;
-	} outputs[16 + 2];  /* +POSITION +PSIZE */
-	bool writes_pos, writes_psize;
-
-	/* attributes (VS) / varyings (FS):
-	 * Note that sysval's should come *after* normal inputs.
-	 */
-	unsigned inputs_count;
-	struct {
-		uint8_t slot;
-		uint8_t regid;
-		uint8_t compmask;
-		uint8_t ncomp;
-		/* location of input (ie. offset passed to bary.f, etc).  This
-		 * matches the SP_VS_VPC_DST_REG.OUTLOCn value (a3xx and a4xx
-		 * have the OUTLOCn value offset by 8, presumably to account
-		 * for gl_Position/gl_PointSize)
-		 */
-		uint8_t inloc;
-		/* vertex shader specific: */
-		bool    sysval     : 1;   /* slot is a gl_system_value */
-		/* fragment shader specific: */
-		bool    bary       : 1;   /* fetched varying (vs one loaded into reg) */
-		bool    rasterflat : 1;   /* special handling for emit->rasterflat */
-		enum glsl_interp_mode interpolate;
-	} inputs[16 + 2];  /* +POSITION +FACE */
-
-	/* sum of input components (scalar).  For frag shaders, it only counts
-	 * the varying inputs:
-	 */
-	unsigned total_in;
-
-	/* For frag shaders, the total number of inputs (not scalar,
-	 * ie. SP_VS_PARAM_REG.TOTALVSOUTVAR)
-	 */
-	unsigned varying_in;
-
-	/* number of samplers/textures (which are currently 1:1): */
-	int num_samp;
-
-	/* do we have one or more SSBO instructions: */
-	bool has_ssbo;
-
-	/* do we have kill instructions: */
-	bool has_kill;
-
-	/* Layout of constant registers, each section (in vec4). Pointer size
-	 * is 32b (a3xx, a4xx), or 64b (a5xx+), which effects the size of the
-	 * UBO and stream-out consts.
-	 */
-	struct {
-		/* user const start at zero */
-		unsigned ubo;
-		/* NOTE that a3xx might need a section for SSBO addresses too */
-		unsigned ssbo_sizes;
-		unsigned image_dims;
-		unsigned driver_param;
-		unsigned tfbo;
-		unsigned immediate;
-	} constbase;
-
-	unsigned immediates_count;
-	unsigned immediates_size;
-	struct {
-		uint32_t val[4];
-	} *immediates;
-
-	/* for astc srgb workaround, the number/base of additional
-	 * alpha tex states we need, and index of original tex states
-	 */
-	struct {
-		unsigned base, count;
-		unsigned orig_idx[16];
-	} astc_srgb;
-
-	/* shader variants form a linked list: */
-	struct ir3_shader_variant *next;
-
-	/* replicated here to avoid passing extra ptrs everywhere: */
-	gl_shader_stage type;
-	struct ir3_shader *shader;
-};
-
-struct ir3_shader {
-	gl_shader_stage type;
-
-	/* shader id (for debug): */
-	uint32_t id;
-	uint32_t variant_count;
-
-	/* so we know when we can disable TGSI related hacks: */
-	bool from_tgsi;
-
-	struct ir3_compiler *compiler;
-
-	struct nir_shader *nir;
-	struct ir3_stream_output_info stream_output;
-
-	struct ir3_shader_variant *variants;
-};
-
-void * ir3_shader_assemble(struct ir3_shader_variant *v, uint32_t gpu_id);
-struct ir3_shader_variant * ir3_shader_get_variant(struct ir3_shader *shader,
-		struct ir3_shader_key *key, bool binning_pass, bool *created);
-struct ir3_shader * ir3_shader_from_nir(struct ir3_compiler *compiler, nir_shader *nir);
-void ir3_shader_destroy(struct ir3_shader *shader);
-void ir3_shader_disasm(struct ir3_shader_variant *so, uint32_t *bin, FILE *out);
-uint64_t ir3_shader_outputs(const struct ir3_shader *so);
-
-int
-ir3_glsl_type_size(const struct glsl_type *type);
-
-static inline const char *
-ir3_shader_stage(struct ir3_shader *shader)
-{
-	switch (shader->type) {
-	case MESA_SHADER_VERTEX:     return "VERT";
-	case MESA_SHADER_FRAGMENT:   return "FRAG";
-	case MESA_SHADER_COMPUTE:    return "CL";
-	default:
-		unreachable("invalid type");
-		return NULL;
-	}
-}
-
-/*
- * Helper/util:
- */
-
-static inline int
-ir3_find_output(const struct ir3_shader_variant *so, gl_varying_slot slot)
-{
-	int j;
-
-	for (j = 0; j < so->outputs_count; j++)
-		if (so->outputs[j].slot == slot)
-			return j;
-
-	/* it seems optional to have a OUT.BCOLOR[n] for each OUT.COLOR[n]
-	 * in the vertex shader.. but the fragment shader doesn't know this
-	 * so  it will always have both IN.COLOR[n] and IN.BCOLOR[n].  So
-	 * at link time if there is no matching OUT.BCOLOR[n], we must map
-	 * OUT.COLOR[n] to IN.BCOLOR[n].  And visa versa if there is only
-	 * a OUT.BCOLOR[n] but no matching OUT.COLOR[n]
-	 */
-	if (slot == VARYING_SLOT_BFC0) {
-		slot = VARYING_SLOT_COL0;
-	} else if (slot == VARYING_SLOT_BFC1) {
-		slot = VARYING_SLOT_COL1;
-	} else if (slot == VARYING_SLOT_COL0) {
-		slot = VARYING_SLOT_BFC0;
-	} else if (slot == VARYING_SLOT_COL1) {
-		slot = VARYING_SLOT_BFC1;
-	} else {
-		return 0;
-	}
-
-	for (j = 0; j < so->outputs_count; j++)
-		if (so->outputs[j].slot == slot)
-			return j;
-
-	debug_assert(0);
-
-	return 0;
-}
-
-static inline int
-ir3_next_varying(const struct ir3_shader_variant *so, int i)
-{
-	while (++i < so->inputs_count)
-		if (so->inputs[i].compmask && so->inputs[i].bary)
-			break;
-	return i;
-}
-
-struct ir3_shader_linkage {
-	uint8_t max_loc;
-	uint8_t cnt;
-	struct {
-		uint8_t regid;
-		uint8_t compmask;
-		uint8_t loc;
-	} var[32];
-};
-
-static inline void
-ir3_link_add(struct ir3_shader_linkage *l, uint8_t regid, uint8_t compmask, uint8_t loc)
-{
-	int i = l->cnt++;
-
-	debug_assert(i < ARRAY_SIZE(l->var));
-
-	l->var[i].regid    = regid;
-	l->var[i].compmask = compmask;
-	l->var[i].loc      = loc;
-	l->max_loc = MAX2(l->max_loc, loc + util_last_bit(compmask));
-}
-
-static inline void
-ir3_link_shaders(struct ir3_shader_linkage *l,
-		const struct ir3_shader_variant *vs,
-		const struct ir3_shader_variant *fs)
-{
-	int j = -1, k;
-
-	while (l->cnt < ARRAY_SIZE(l->var)) {
-		j = ir3_next_varying(fs, j);
-
-		if (j >= fs->inputs_count)
-			break;
-
-		if (fs->inputs[j].inloc >= fs->total_in)
-			continue;
-
-		k = ir3_find_output(vs, fs->inputs[j].slot);
-
-		ir3_link_add(l, vs->outputs[k].regid,
-			fs->inputs[j].compmask, fs->inputs[j].inloc);
-	}
-}
-
-static inline uint32_t
-ir3_find_output_regid(const struct ir3_shader_variant *so, unsigned slot)
-{
-	int j;
-	for (j = 0; j < so->outputs_count; j++)
-		if (so->outputs[j].slot == slot)
-			return so->outputs[j].regid;
-	return regid(63, 0);
-}
-
-static inline uint32_t
-ir3_find_sysval_regid(const struct ir3_shader_variant *so, unsigned slot)
-{
-	int j;
-	for (j = 0; j < so->inputs_count; j++)
-		if (so->inputs[j].sysval && (so->inputs[j].slot == slot))
-			return so->inputs[j].regid;
-	return regid(63, 0);
-}
-
-/* calculate register footprint in terms of half-regs (ie. one full
- * reg counts as two half-regs).
- */
-static inline uint32_t
-ir3_shader_halfregs(const struct ir3_shader_variant *v)
-{
-	return (2 * (v->info.max_reg + 1)) + (v->info.max_half_reg + 1);
-}
-
-#endif /* IR3_SHADER_H_ */
diff --git a/src/gallium/drivers/freedreno/meson.build b/src/gallium/drivers/freedreno/meson.build
index 797ba081758..f996126e386 100644
--- a/src/gallium/drivers/freedreno/meson.build
+++ b/src/gallium/drivers/freedreno/meson.build
@@ -18,18 +18,6 @@
 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 # SOFTWARE.
 
-ir3_nir_trig_c = custom_target(
-  'ir3_nir_trig.c',
-  input : 'ir3/ir3_nir_trig.py',
-  output : 'ir3_nir_trig.c',
-  command : [
-    prog_python, '@INPUT@',
-    '-p', join_paths(meson.source_root(), 'src/compiler/nir/'),
-  ],
-  capture : true,
-  depend_files : nir_algebraic_py,
-)
-
 files_libfreedreno = files(
   'adreno_common.xml.h',
   'adreno_pm4.xml.h',
@@ -215,35 +203,15 @@ files_libfreedreno = files(
   'a6xx/fd6_texture.h',
   'a6xx/fd6_zsa.c',
   'a6xx/fd6_zsa.h',
-  'ir3/disasm-a3xx.c',
-  'ir3/instr-a3xx.h',
-  'ir3/ir3.c',
   'ir3/ir3_cache.c',
   'ir3/ir3_cache.h',
-  'ir3/ir3_compiler_nir.c',
-  'ir3/ir3_compiler.c',
-  'ir3/ir3_compiler.h',
-  'ir3/ir3_cp.c',
-  'ir3/ir3_depth.c',
   'ir3/ir3_gallium.c',
   'ir3/ir3_gallium.h',
-  'ir3/ir3_group.c',
-  'ir3/ir3.h',
-  'ir3/ir3_legalize.c',
-  'ir3/ir3_nir.c',
-  'ir3/ir3_nir.h',
-  'ir3/ir3_nir_lower_tg4_to_tex.c',
-  'ir3/ir3_print.c',
-  'ir3/ir3_ra.c',
-  'ir3/ir3_sched.c',
-  'ir3/ir3_shader.c',
-  'ir3/ir3_shader.h',
 )
 
 freedreno_includes = [
   inc_src, inc_include, inc_gallium, inc_gallium_aux,
-  inc_freedreno,
-  include_directories('ir3')
+  inc_freedreno, include_directories('ir3'),
 ]
 
 freedreno_c_args = []
@@ -258,7 +226,7 @@ endif
 
 libfreedreno = static_library(
   'freedreno',
-  [files_libfreedreno, ir3_nir_trig_c],
+  [files_libfreedreno],
   include_directories : freedreno_includes,
   c_args : [freedreno_c_args, c_vis_args],
   cpp_args : [freedreno_cpp_args, cpp_vis_args],
@@ -273,6 +241,7 @@ driver_freedreno = declare_dependency(
     libfreedrenowinsys,
     libfreedreno,
     libfreedreno_drm,
+    libfreedreno_ir3,
   ],
   dependencies : idep_nir,
 )
@@ -288,6 +257,7 @@ ir3_compiler = executable(
   link_with : [
     libfreedreno,
     libfreedreno_drm,
+    libfreedreno_ir3,
     libgallium,
     libglsl_standalone,
     libmesa_util,