Diffstat (limited to 'src/panfrost/midgard')
21 files changed, 7988 insertions, 0 deletions
diff --git a/src/panfrost/midgard/compiler.h b/src/panfrost/midgard/compiler.h new file mode 100644 index 00000000000..79fe7dfc78a --- /dev/null +++ b/src/panfrost/midgard/compiler.h @@ -0,0 +1,456 @@ +/* + * Copyright (C) 2019 Alyssa Rosenzweig <[email protected]> + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef _MDG_COMPILER_H +#define _MDG_COMPILER_H + +#include "midgard.h" +#include "helpers.h" +#include "midgard_compile.h" + +#include "util/hash_table.h" +#include "util/u_dynarray.h" +#include "util/set.h" +#include "util/list.h" + +#include "main/mtypes.h" +#include "compiler/nir_types.h" +#include "compiler/nir/nir.h" + +/* Forward declare */ +struct midgard_block; + +/* Target types. Defaults to TARGET_GOTO (the type corresponding directly to + * the hardware), hence why that must be zero. TARGET_DISCARD signals this + * instruction is actually a discard op. */ + +#define TARGET_GOTO 0 +#define TARGET_BREAK 1 +#define TARGET_CONTINUE 2 +#define TARGET_DISCARD 3 + +typedef struct midgard_branch { + /* If conditional, the condition is specified in r31.w */ + bool conditional; + + /* For conditionals, if this is true, we branch on FALSE. If false, we branch on TRUE. */ + bool invert_conditional; + + /* Branch targets: the start of a block, the start of a loop (continue), the end of a loop (break). Value is one of TARGET_ */ + unsigned target_type; + + /* The actual target */ + union { + int target_block; + int target_break; + int target_continue; + }; +} midgard_branch; + +/* Instruction arguments represented as block-local SSA indices, rather than + * registers. Negative values mean unused. */ + +typedef struct { + int src0; + int src1; + int dest; + + /* src1 is -not- SSA but instead a 16-bit inline constant to be smudged + * in. Only valid for ALU ops. */ + bool inline_constant; +} ssa_args; + +/* Generic in-memory data type repesenting a single logical instruction, rather + * than a single instruction group. This is the preferred form for code gen. + * Multiple midgard_insturctions will later be combined during scheduling, + * though this is not represented in this structure. Its format bridges + * the low-level binary representation with the higher level semantic meaning. + * + * Notably, it allows registers to be specified as block local SSA, for code + * emitted before the register allocation pass. 
+ */ + +typedef struct midgard_instruction { + /* Must be first for casting */ + struct list_head link; + + unsigned type; /* ALU, load/store, texture */ + + /* If the register allocator has not run yet... */ + ssa_args ssa_args; + + /* Special fields for an ALU instruction */ + midgard_reg_info registers; + + /* I.e. (1 << alu_bit) */ + int unit; + + /* When emitting bundle, should this instruction have a break forced + * before it? Used for r31 writes which are valid only within a single + * bundle and *need* to happen as early as possible... this is a hack, + * TODO remove when we have a scheduler */ + bool precede_break; + + bool has_constants; + float constants[4]; + uint16_t inline_constant; + bool has_blend_constant; + + bool compact_branch; + bool writeout; + bool prepacked_branch; + + /* Masks in a saneish format. One bit per channel, not packed fancy. + * Use this instead of the op specific ones, and switch over at emit + * time */ + uint16_t mask; + + union { + midgard_load_store_word load_store; + midgard_vector_alu alu; + midgard_texture_word texture; + midgard_branch_extended branch_extended; + uint16_t br_compact; + + /* General branch, rather than packed br_compact. Higher level + * than the other components */ + midgard_branch branch; + }; +} midgard_instruction; + +typedef struct midgard_block { + /* Link to next block. Must be first for mir_get_block */ + struct list_head link; + + /* List of midgard_instructions emitted for the current block */ + struct list_head instructions; + + bool is_scheduled; + + /* List of midgard_bundles emitted (after the scheduler has run) */ + struct util_dynarray bundles; + + /* Number of quadwords _actually_ emitted, as determined after scheduling */ + unsigned quadword_count; + + /* Successors: always one forward (the block after us), maybe + * one backwards (for a backward branch). No need for a second + * forward, since graph traversal would get there eventually + * anyway */ + struct midgard_block *successors[2]; + unsigned nr_successors; + + /* The successors pointer form a graph, and in the case of + * complex control flow, this graph has a cycles. To aid + * traversal during liveness analysis, we have a visited? + * boolean for passes to use as they see fit, provided they + * clean up later */ + bool visited; +} midgard_block; + +typedef struct midgard_bundle { + /* Tag for the overall bundle */ + int tag; + + /* Instructions contained by the bundle */ + int instruction_count; + midgard_instruction *instructions[5]; + + /* Bundle-wide ALU configuration */ + int padding; + int control; + bool has_embedded_constants; + float constants[4]; + bool has_blend_constant; +} midgard_bundle; + +typedef struct compiler_context { + nir_shader *nir; + gl_shader_stage stage; + + /* Is internally a blend shader? 
Depends on stage == FRAGMENT */ + bool is_blend; + + /* Tracking for blend constant patching */ + int blend_constant_offset; + + /* Current NIR function */ + nir_function *func; + + /* Unordered list of midgard_blocks */ + int block_count; + struct list_head blocks; + + midgard_block *initial_block; + midgard_block *previous_source_block; + midgard_block *final_block; + + /* List of midgard_instructions emitted for the current block */ + midgard_block *current_block; + + /* The current "depth" of the loop, for disambiguating breaks/continues + * when using nested loops */ + int current_loop_depth; + + /* Total number of loops for shader-db */ + unsigned loop_count; + + /* Constants which have been loaded, for later inlining */ + struct hash_table_u64 *ssa_constants; + + /* SSA values / registers which have been aliased. Naively, these + * demand a fmov output; instead, we alias them in a later pass to + * avoid the wasted op. + * + * A note on encoding: to avoid dynamic memory management here, rather + * than ampping to a pointer, we map to the source index; the key + * itself is just the destination index. */ + + struct hash_table_u64 *ssa_to_alias; + struct set *leftover_ssa_to_alias; + + /* Actual SSA-to-register for RA */ + struct hash_table_u64 *ssa_to_register; + + /* Mapping of hashes computed from NIR indices to the sequential temp indices ultimately used in MIR */ + struct hash_table_u64 *hash_to_temp; + int temp_count; + int max_hash; + + /* Just the count of the max register used. Higher count => higher + * register pressure */ + int work_registers; + + /* Used for cont/last hinting. Increase when a tex op is added. + * Decrease when a tex op is removed. */ + int texture_op_count; + + /* Mapping of texture register -> SSA index for unaliasing */ + int texture_index[2]; + + /* If any path hits a discard instruction */ + bool can_discard; + + /* The number of uniforms allowable for the fast path */ + int uniform_cutoff; + + /* Count of instructions emitted from NIR overall, across all blocks */ + int instruction_count; + + /* Alpha ref value passed in */ + float alpha_ref; + + /* The index corresponding to the fragment output */ + unsigned fragment_output; + + /* The mapping of sysvals to uniforms, the count, and the off-by-one inverse */ + unsigned sysvals[MAX_SYSVAL_COUNT]; + unsigned sysval_count; + struct hash_table_u64 *sysval_to_id; +} compiler_context; + +/* Helpers for manipulating the above structures (forming the driver IR) */ + +/* Append instruction to end of current block */ + +static inline midgard_instruction * +mir_upload_ins(struct midgard_instruction ins) +{ + midgard_instruction *heap = malloc(sizeof(ins)); + memcpy(heap, &ins, sizeof(ins)); + return heap; +} + +static inline void +emit_mir_instruction(struct compiler_context *ctx, struct midgard_instruction ins) +{ + list_addtail(&(mir_upload_ins(ins))->link, &ctx->current_block->instructions); +} + +static inline void +mir_insert_instruction_before(struct midgard_instruction *tag, struct midgard_instruction ins) +{ + list_addtail(&(mir_upload_ins(ins))->link, &tag->link); +} + +static inline void +mir_remove_instruction(struct midgard_instruction *ins) +{ + list_del(&ins->link); +} + +static inline midgard_instruction* +mir_prev_op(struct midgard_instruction *ins) +{ + return list_last_entry(&(ins->link), midgard_instruction, link); +} + +static inline midgard_instruction* +mir_next_op(struct midgard_instruction *ins) +{ + return list_first_entry(&(ins->link), midgard_instruction, link); +} + +#define 
mir_foreach_block(ctx, v) \ + list_for_each_entry(struct midgard_block, v, &ctx->blocks, link) + +#define mir_foreach_block_from(ctx, from, v) \ + list_for_each_entry_from(struct midgard_block, v, from, &ctx->blocks, link) + +#define mir_foreach_instr(ctx, v) \ + list_for_each_entry(struct midgard_instruction, v, &ctx->current_block->instructions, link) + +#define mir_foreach_instr_safe(ctx, v) \ + list_for_each_entry_safe(struct midgard_instruction, v, &ctx->current_block->instructions, link) + +#define mir_foreach_instr_in_block(block, v) \ + list_for_each_entry(struct midgard_instruction, v, &block->instructions, link) + +#define mir_foreach_instr_in_block_safe(block, v) \ + list_for_each_entry_safe(struct midgard_instruction, v, &block->instructions, link) + +#define mir_foreach_instr_in_block_safe_rev(block, v) \ + list_for_each_entry_safe_rev(struct midgard_instruction, v, &block->instructions, link) + +#define mir_foreach_instr_in_block_from(block, v, from) \ + list_for_each_entry_from(struct midgard_instruction, v, from, &block->instructions, link) + +#define mir_foreach_instr_in_block_from_rev(block, v, from) \ + list_for_each_entry_from_rev(struct midgard_instruction, v, from, &block->instructions, link) + +#define mir_foreach_bundle_in_block(block, v) \ + util_dynarray_foreach(&block->bundles, midgard_bundle, v) + +#define mir_foreach_instr_global(ctx, v) \ + mir_foreach_block(ctx, v_block) \ + mir_foreach_instr_in_block(v_block, v) + + +static inline midgard_instruction * +mir_last_in_block(struct midgard_block *block) +{ + return list_last_entry(&block->instructions, struct midgard_instruction, link); +} + +static inline midgard_block * +mir_get_block(compiler_context *ctx, int idx) +{ + struct list_head *lst = &ctx->blocks; + + while ((idx--) + 1) + lst = lst->next; + + return (struct midgard_block *) lst; +} + +static inline bool +mir_is_alu_bundle(midgard_bundle *bundle) +{ + return IS_ALU(bundle->tag); +} + +/* MIR manipulation */ + +void mir_rewrite_index(compiler_context *ctx, unsigned old, unsigned new); +void mir_rewrite_index_src(compiler_context *ctx, unsigned old, unsigned new); +void mir_rewrite_index_dst(compiler_context *ctx, unsigned old, unsigned new); + +/* MIR printing */ + +void mir_print_instruction(midgard_instruction *ins); +void mir_print_bundle(midgard_bundle *ctx); +void mir_print_block(midgard_block *block); +void mir_print_shader(compiler_context *ctx); + +/* MIR goodies */ + +static const midgard_vector_alu_src blank_alu_src = { + .swizzle = SWIZZLE(COMPONENT_X, COMPONENT_Y, COMPONENT_Z, COMPONENT_W), +}; + +static const midgard_vector_alu_src blank_alu_src_xxxx = { + .swizzle = SWIZZLE(COMPONENT_X, COMPONENT_X, COMPONENT_X, COMPONENT_X), +}; + +static const midgard_scalar_alu_src blank_scalar_alu_src = { + .full = true +}; + +/* Used for encoding the unused source of 1-op instructions */ +static const midgard_vector_alu_src zero_alu_src = { 0 }; + +/* 'Intrinsic' move for aliasing */ + +static inline midgard_instruction +v_mov(unsigned src, midgard_vector_alu_src mod, unsigned dest) +{ + midgard_instruction ins = { + .type = TAG_ALU_4, + .mask = 0xF, + .ssa_args = { + .src0 = SSA_UNUSED_1, + .src1 = src, + .dest = dest, + }, + .alu = { + .op = midgard_alu_op_imov, + .reg_mode = midgard_reg_mode_32, + .dest_override = midgard_dest_override_none, + .outmod = midgard_outmod_int_wrap, + .src1 = vector_alu_srco_unsigned(zero_alu_src), + .src2 = vector_alu_srco_unsigned(mod) + }, + }; + + return ins; +} + +/* Scheduling */ + +void 
schedule_program(compiler_context *ctx); + +/* Register allocation */ + +struct ra_graph; + +struct ra_graph* allocate_registers(compiler_context *ctx); +void install_registers(compiler_context *ctx, struct ra_graph *g); +bool mir_is_live_after(compiler_context *ctx, midgard_block *block, midgard_instruction *start, int src); +bool mir_has_multiple_writes(compiler_context *ctx, int src); + +void mir_create_pipeline_registers(compiler_context *ctx); + +/* Final emission */ + +void emit_binary_bundle( + compiler_context *ctx, + midgard_bundle *bundle, + struct util_dynarray *emission, + int next_tag); + +/* NIR stuff */ + +bool +nir_undef_to_zero(nir_shader *shader); + +#endif diff --git a/src/panfrost/midgard/cppwrap.cpp b/src/panfrost/midgard/cppwrap.cpp new file mode 100644 index 00000000000..cf2ca3b7a11 --- /dev/null +++ b/src/panfrost/midgard/cppwrap.cpp @@ -0,0 +1,9 @@ +struct exec_list; + +bool do_mat_op_to_vec(struct exec_list *instructions); + +extern "C" { + bool c_do_mat_op_to_vec(struct exec_list *instructions) { + return do_mat_op_to_vec(instructions); + } +}; diff --git a/src/panfrost/midgard/disassemble.c b/src/panfrost/midgard/disassemble.c new file mode 100644 index 00000000000..bed803162f3 --- /dev/null +++ b/src/panfrost/midgard/disassemble.c @@ -0,0 +1,1317 @@ +/* Author(s): + * Connor Abbott + * Alyssa Rosenzweig + * + * Copyright (c) 2013 Connor Abbott ([email protected]) + * Copyright (c) 2018 Alyssa Rosenzweig ([email protected]) + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include <stdio.h> +#include <stdint.h> +#include <assert.h> +#include <inttypes.h> +#include <ctype.h> +#include <string.h> +#include "midgard.h" +#include "midgard-parse.h" +#include "midgard_ops.h" +#include "disassemble.h" +#include "helpers.h" +#include "util/half_float.h" +#include "util/u_math.h" + +#define DEFINE_CASE(define, str) case define: { printf(str); break; } + +static bool is_instruction_int = false; + +/* Prints a short form of the tag for branching, the minimum needed to be + * legible and unambiguous */ + +static void +print_tag_short(unsigned tag) +{ + switch (midgard_word_types[tag]) { + case midgard_word_type_texture: + printf("tex/%X", tag); + break; + + case midgard_word_type_load_store: + printf("ldst"); + break; + + case midgard_word_type_alu: + printf("alu%d/%X", midgard_word_size[tag], tag); + break; + + default: + printf("%s%X", (tag > 0) ? 
"" : "unk", tag); + break; + } +} + +static void +print_alu_opcode(midgard_alu_op op) +{ + bool int_op = false; + + if (alu_opcode_props[op].name) { + printf("%s", alu_opcode_props[op].name); + + int_op = midgard_is_integer_op(op); + } else + printf("alu_op_%02X", op); + + /* For constant analysis */ + is_instruction_int = int_op; +} + +static void +print_ld_st_opcode(midgard_load_store_op op) +{ + if (load_store_opcode_names[op]) + printf("%s", load_store_opcode_names[op]); + else + printf("ldst_op_%02X", op); +} + +static bool is_embedded_constant_half = false; +static bool is_embedded_constant_int = false; + +static char +prefix_for_bits(unsigned bits) +{ + switch (bits) { + case 8: + return 'q'; + case 16: + return 'h'; + case 64: + return 'd'; + default: + return 0; + } +} + +static void +print_reg(unsigned reg, unsigned bits) +{ + /* Perform basic static analysis for expanding constants correctly */ + + if (reg == 26) { + is_embedded_constant_int = is_instruction_int; + is_embedded_constant_half = (bits < 32); + } + + char prefix = prefix_for_bits(bits); + + if (prefix) + putchar(prefix); + + printf("r%u", reg); +} + +static char *outmod_names_float[4] = { + "", + ".pos", + ".unk2", + ".sat" +}; + +static char *outmod_names_int[4] = { + ".isat", + ".usat", + "", + ".hi" +}; + +static char *srcmod_names_int[4] = { + "sext(", + "zext(", + "", + "(" +}; + +static void +print_outmod(unsigned outmod, bool is_int) +{ + printf("%s", is_int ? outmod_names_int[outmod] : + outmod_names_float[outmod]); +} + +static void +print_quad_word(uint32_t *words, unsigned tabs) +{ + unsigned i; + + for (i = 0; i < 4; i++) + printf("0x%08X%s ", words[i], i == 3 ? "" : ","); + + printf("\n"); +} + +static const char components[16] = "xyzwefghijklmnop"; + +/* Helper to print 4 chars of a swizzle */ +static void +print_swizzle_helper(unsigned swizzle, bool upper) +{ + for (unsigned i = 0; i < 4; ++i) { + unsigned c = (swizzle >> (i * 2)) & 3; + c += upper*4; + printf("%c", components[c]); + } +} + +/* Helper to print 8 chars of a swizzle, duplicating over */ +static void +print_swizzle_helper_8(unsigned swizzle, bool upper) +{ + for (unsigned i = 0; i < 4; ++i) { + unsigned c = (swizzle >> (i * 2)) & 3; + c *= 2; + c += upper*8; + printf("%c%c", components[c], components[c+1]); + } +} + +static void +print_swizzle_vec16(unsigned swizzle, bool rep_high, bool rep_low, + midgard_dest_override override) +{ + printf("."); + + if (override == midgard_dest_override_upper) { + if (rep_high) + printf(" /* rep_high */ "); + if (rep_low) + printf(" /* rep_low */ "); + + if (!rep_high && rep_low) + print_swizzle_helper_8(swizzle, true); + else + print_swizzle_helper_8(swizzle, false); + } else { + print_swizzle_helper_8(swizzle, rep_high & 1); + print_swizzle_helper_8(swizzle, !rep_low & 1); + } +} + +static void +print_swizzle_vec8(unsigned swizzle, bool rep_high, bool rep_low) +{ + printf("."); + + print_swizzle_helper(swizzle, rep_high & 1); + print_swizzle_helper(swizzle, !rep_low & 1); +} + +static void +print_swizzle_vec4(unsigned swizzle, bool rep_high, bool rep_low) +{ + if (rep_high) + printf(" /* rep_high */ "); + if (rep_low) + printf(" /* rep_low */ "); + + if (swizzle == 0xE4) return; /* xyzw */ + + printf("."); + print_swizzle_helper(swizzle, 0); +} +static void +print_swizzle_vec2(unsigned swizzle, bool rep_high, bool rep_low) +{ + if (rep_high) + printf(" /* rep_high */ "); + if (rep_low) + printf(" /* rep_low */ "); + + if (swizzle == 0xE4) return; /* XY */ + + printf("."); + + for (unsigned i = 0; i < 
4; i += 2) { + unsigned a = (swizzle >> (i * 2)) & 3; + unsigned b = (swizzle >> ((i+1) * 2)) & 3; + + /* Normally we're adjacent, but if there's an issue, don't make + * it ambiguous */ + + if (a & 0x1) + printf("[%c%c]", components[a], components[b]); + else if (a == b) + printf("%c", components[a >> 1]); + else if (b == (a + 1)) + printf("%c", "XY"[a >> 1]); + else + printf("[%c%c]", components[a], components[b]); + } +} + +static int +bits_for_mode(midgard_reg_mode mode) +{ + switch (mode) { + case midgard_reg_mode_8: + return 8; + case midgard_reg_mode_16: + return 16; + case midgard_reg_mode_32: + return 32; + case midgard_reg_mode_64: + return 64; + default: + return 0; + } +} + +static int +bits_for_mode_halved(midgard_reg_mode mode, bool half) +{ + unsigned bits = bits_for_mode(mode); + + if (half) + bits >>= 1; + + return bits; +} + +static void +print_vector_src(unsigned src_binary, + midgard_reg_mode mode, unsigned reg, + midgard_dest_override override, bool is_int) +{ + midgard_vector_alu_src *src = (midgard_vector_alu_src *)&src_binary; + + /* Modifiers change meaning depending on the op's context */ + + midgard_int_mod int_mod = src->mod; + + if (is_int) { + printf("%s", srcmod_names_int[int_mod]); + } else { + if (src->mod & MIDGARD_FLOAT_MOD_NEG) + printf("-"); + + if (src->mod & MIDGARD_FLOAT_MOD_ABS) + printf("abs("); + } + + //register + unsigned bits = bits_for_mode_halved(mode, src->half); + print_reg(reg, bits); + + //swizzle + if (bits == 16) + print_swizzle_vec8(src->swizzle, src->rep_high, src->rep_low); + else if (bits == 8) + print_swizzle_vec16(src->swizzle, src->rep_high, src->rep_low, override); + else if (bits == 32) + print_swizzle_vec4(src->swizzle, src->rep_high, src->rep_low); + else if (bits == 64) + print_swizzle_vec2(src->swizzle, src->rep_high, src->rep_low); + + /* Since we wrapped with a function-looking thing */ + + if (is_int && int_mod == midgard_int_shift) + printf(") << %d", bits); + else if ((is_int && (int_mod != midgard_int_normal)) + || (!is_int && src->mod & MIDGARD_FLOAT_MOD_ABS)) + printf(")"); +} + +static uint16_t +decode_vector_imm(unsigned src2_reg, unsigned imm) +{ + uint16_t ret; + ret = src2_reg << 11; + ret |= (imm & 0x7) << 8; + ret |= (imm >> 3) & 0xFF; + return ret; +} + +static void +print_immediate(uint16_t imm) +{ + if (is_instruction_int) + printf("#%d", imm); + else + printf("#%g", _mesa_half_to_float(imm)); +} + +static unsigned +print_dest(unsigned reg, midgard_reg_mode mode, midgard_dest_override override) +{ + /* Depending on the mode and override, we determine the type of + * destination addressed. Absent an override, we address just the + * type of the operation itself */ + + unsigned bits = bits_for_mode(mode); + + if (override != midgard_dest_override_none) + bits /= 2; + + print_reg(reg, bits); + + return bits; +} + +static void +print_mask_vec16(uint8_t mask, midgard_dest_override override) +{ + printf("."); + + if (override == midgard_dest_override_none) { + for (unsigned i = 0; i < 8; i++) { + if (mask & (1 << i)) + printf("%c%c", + components[i*2 + 0], + components[i*2 + 1]); + } + } else { + bool upper = (override == midgard_dest_override_upper); + + for (unsigned i = 0; i < 8; i++) { + if (mask & (1 << i)) + printf("%c", components[i + (upper ? 8 : 0)]); + } + } +} + +/* For 16-bit+ masks, we read off from the 8-bit mask field. For 16-bit (vec8), + * it's just one bit per channel, easy peasy. For 32-bit (vec4), it's one bit + * per channel with one duplicate bit in the middle. 
For 64-bit (vec2), it's + * one-bit per channel with _3_ duplicate bits in the middle. Basically, just + * subdividing the 128-bit word in 16-bit increments. For 64-bit, we uppercase + * the mask to make it obvious what happened */ + +static void +print_mask(uint8_t mask, unsigned bits, midgard_dest_override override) +{ + if (bits == 8) { + print_mask_vec16(mask, override); + return; + } + + /* Skip 'complete' masks */ + + if (bits >= 32 && mask == 0xFF) return; + + if (bits == 16) { + if (mask == 0x0F) + return; + else if (mask == 0xF0) { + printf("'"); + return; + } + } + + printf("."); + + unsigned skip = (bits / 16); + bool uppercase = bits > 32; + bool tripped = false; + + for (unsigned i = 0; i < 8; i += skip) { + bool a = (mask & (1 << i)) != 0; + + for (unsigned j = 1; j < skip; ++j) { + bool dupe = (mask & (1 << (i + j))) != 0; + tripped |= (dupe != a); + } + + if (a) { + char c = components[i / skip]; + + if (uppercase) + c = toupper(c); + + printf("%c", c); + } + } + + if (tripped) + printf(" /* %X */", mask); +} + +/* Prints the 4-bit masks found in texture and load/store ops, as opposed to + * the 8-bit masks found in (vector) ALU ops */ + +static void +print_mask_4(unsigned mask) +{ + if (mask == 0xF) return; + + printf("."); + + for (unsigned i = 0; i < 4; ++i) { + bool a = (mask & (1 << i)) != 0; + if (a) + printf("%c", components[i]); + } +} + +static void +print_vector_field(const char *name, uint16_t *words, uint16_t reg_word, + unsigned tabs) +{ + midgard_reg_info *reg_info = (midgard_reg_info *)®_word; + midgard_vector_alu *alu_field = (midgard_vector_alu *) words; + midgard_reg_mode mode = alu_field->reg_mode; + unsigned override = alu_field->dest_override; + + /* For now, prefix instruction names with their unit, until we + * understand how this works on a deeper level */ + printf("%s.", name); + + print_alu_opcode(alu_field->op); + + /* Postfix with the size to disambiguate if necessary */ + char postfix = prefix_for_bits(bits_for_mode(mode)); + bool size_ambiguous = override != midgard_dest_override_none; + + if (size_ambiguous) + printf("%c", postfix ? 
postfix : 'r'); + + /* Print the outmod, if there is one */ + print_outmod(alu_field->outmod, + midgard_is_integer_out_op(alu_field->op)); + + printf(" "); + + /* Mask denoting status of 8-lanes */ + uint8_t mask = alu_field->mask; + + /* First, print the destination */ + unsigned dest_size = + print_dest(reg_info->out_reg, mode, alu_field->dest_override); + + /* Apply the destination override to the mask */ + + if (mode == midgard_reg_mode_32 || mode == midgard_reg_mode_64) { + if (override == midgard_dest_override_lower) + mask &= 0x0F; + else if (override == midgard_dest_override_upper) + mask &= 0xF0; + } else if (mode == midgard_reg_mode_16 + && override == midgard_dest_override_lower) { + /* stub */ + } + + if (override != midgard_dest_override_none) { + bool modeable = (mode != midgard_reg_mode_8); + bool known = override != 0x3; /* Unused value */ + + if (!(modeable && known)) + printf("/* do%d */ ", override); + } + + print_mask(mask, dest_size, override); + + printf(", "); + + bool is_int = midgard_is_integer_op(alu_field->op); + print_vector_src(alu_field->src1, mode, reg_info->src1_reg, override, is_int); + + printf(", "); + + if (reg_info->src2_imm) { + uint16_t imm = decode_vector_imm(reg_info->src2_reg, alu_field->src2 >> 2); + print_immediate(imm); + } else { + print_vector_src(alu_field->src2, mode, + reg_info->src2_reg, override, is_int); + } + + printf("\n"); +} + +static void +print_scalar_src(unsigned src_binary, unsigned reg) +{ + midgard_scalar_alu_src *src = (midgard_scalar_alu_src *)&src_binary; + + if (src->negate) + printf("-"); + + if (src->abs) + printf("abs("); + + print_reg(reg, src->full ? 32 : 16); + + unsigned c = src->component; + + if (src->full) { + assert((c & 1) == 0); + c >>= 1; + } + + printf(".%c", components[c]); + + if (src->abs) + printf(")"); + +} + +static uint16_t +decode_scalar_imm(unsigned src2_reg, unsigned imm) +{ + uint16_t ret; + ret = src2_reg << 11; + ret |= (imm & 3) << 9; + ret |= (imm & 4) << 6; + ret |= (imm & 0x38) << 2; + ret |= imm >> 6; + return ret; +} + +static void +print_scalar_field(const char *name, uint16_t *words, uint16_t reg_word, + unsigned tabs) +{ + midgard_reg_info *reg_info = (midgard_reg_info *)®_word; + midgard_scalar_alu *alu_field = (midgard_scalar_alu *) words; + + if (alu_field->unknown) + printf("scalar ALU unknown bit set\n"); + + printf("%s.", name); + print_alu_opcode(alu_field->op); + print_outmod(alu_field->outmod, + midgard_is_integer_out_op(alu_field->op)); + printf(" "); + + bool full = alu_field->output_full; + print_reg(reg_info->out_reg, full ? 
32 : 16); + unsigned c = alu_field->output_component; + + if (full) { + assert((c & 1) == 0); + c >>= 1; + } + + printf(".%c, ", components[c]); + + print_scalar_src(alu_field->src1, reg_info->src1_reg); + + printf(", "); + + if (reg_info->src2_imm) { + uint16_t imm = decode_scalar_imm(reg_info->src2_reg, + alu_field->src2); + print_immediate(imm); + } else + print_scalar_src(alu_field->src2, reg_info->src2_reg); + + printf("\n"); +} + +static void +print_branch_op(int op) +{ + switch (op) { + case midgard_jmp_writeout_op_branch_uncond: + printf("uncond."); + break; + + case midgard_jmp_writeout_op_branch_cond: + printf("cond."); + break; + + case midgard_jmp_writeout_op_writeout: + printf("write."); + break; + + case midgard_jmp_writeout_op_tilebuffer_pending: + printf("tilebuffer."); + break; + + case midgard_jmp_writeout_op_discard: + printf("discard."); + break; + + default: + printf("unk%d.", op); + break; + } +} + +static void +print_branch_cond(int cond) +{ + switch (cond) { + case midgard_condition_write0: + printf("write0"); + break; + + case midgard_condition_false: + printf("false"); + break; + + case midgard_condition_true: + printf("true"); + break; + + case midgard_condition_always: + printf("always"); + break; + + default: + printf("unk%X", cond); + break; + } +} + +static void +print_compact_branch_writeout_field(uint16_t word) +{ + midgard_jmp_writeout_op op = word & 0x7; + + switch (op) { + case midgard_jmp_writeout_op_branch_uncond: { + midgard_branch_uncond br_uncond; + memcpy((char *) &br_uncond, (char *) &word, sizeof(br_uncond)); + printf("br.uncond "); + + if (br_uncond.unknown != 1) + printf("unknown:%d, ", br_uncond.unknown); + + if (br_uncond.offset >= 0) + printf("+"); + + printf("%d -> ", br_uncond.offset); + print_tag_short(br_uncond.dest_tag); + printf("\n"); + + break; + } + + case midgard_jmp_writeout_op_branch_cond: + case midgard_jmp_writeout_op_writeout: + case midgard_jmp_writeout_op_discard: + default: { + midgard_branch_cond br_cond; + memcpy((char *) &br_cond, (char *) &word, sizeof(br_cond)); + + printf("br."); + + print_branch_op(br_cond.op); + print_branch_cond(br_cond.cond); + + printf(" "); + + if (br_cond.offset >= 0) + printf("+"); + + printf("%d -> ", br_cond.offset); + print_tag_short(br_cond.dest_tag); + printf("\n"); + + break; + } + } +} + +static void +print_extended_branch_writeout_field(uint8_t *words) +{ + midgard_branch_extended br; + memcpy((char *) &br, (char *) words, sizeof(br)); + + printf("brx."); + + print_branch_op(br.op); + + /* Condition repeated 8 times in all known cases. Check this. 
*/ + + unsigned cond = br.cond & 0x3; + + for (unsigned i = 0; i < 16; i += 2) { + assert(((br.cond >> i) & 0x3) == cond); + } + + print_branch_cond(cond); + + if (br.unknown) + printf(".unknown%d", br.unknown); + + printf(" "); + + if (br.offset >= 0) + printf("+"); + + printf("%d -> ", br.offset); + print_tag_short(br.dest_tag); + printf("\n"); +} + +static unsigned +num_alu_fields_enabled(uint32_t control_word) +{ + unsigned ret = 0; + + if ((control_word >> 17) & 1) + ret++; + + if ((control_word >> 19) & 1) + ret++; + + if ((control_word >> 21) & 1) + ret++; + + if ((control_word >> 23) & 1) + ret++; + + if ((control_word >> 25) & 1) + ret++; + + return ret; +} + +static float +float_bitcast(uint32_t integer) +{ + union { + uint32_t i; + float f; + } v; + + v.i = integer; + return v.f; +} + +static void +print_alu_word(uint32_t *words, unsigned num_quad_words, + unsigned tabs) +{ + uint32_t control_word = words[0]; + uint16_t *beginning_ptr = (uint16_t *)(words + 1); + unsigned num_fields = num_alu_fields_enabled(control_word); + uint16_t *word_ptr = beginning_ptr + num_fields; + unsigned num_words = 2 + num_fields; + + if ((control_word >> 16) & 1) + printf("unknown bit 16 enabled\n"); + + if ((control_word >> 17) & 1) { + print_vector_field("vmul", word_ptr, *beginning_ptr, tabs); + beginning_ptr += 1; + word_ptr += 3; + num_words += 3; + } + + if ((control_word >> 18) & 1) + printf("unknown bit 18 enabled\n"); + + if ((control_word >> 19) & 1) { + print_scalar_field("sadd", word_ptr, *beginning_ptr, tabs); + beginning_ptr += 1; + word_ptr += 2; + num_words += 2; + } + + if ((control_word >> 20) & 1) + printf("unknown bit 20 enabled\n"); + + if ((control_word >> 21) & 1) { + print_vector_field("vadd", word_ptr, *beginning_ptr, tabs); + beginning_ptr += 1; + word_ptr += 3; + num_words += 3; + } + + if ((control_word >> 22) & 1) + printf("unknown bit 22 enabled\n"); + + if ((control_word >> 23) & 1) { + print_scalar_field("smul", word_ptr, *beginning_ptr, tabs); + beginning_ptr += 1; + word_ptr += 2; + num_words += 2; + } + + if ((control_word >> 24) & 1) + printf("unknown bit 24 enabled\n"); + + if ((control_word >> 25) & 1) { + print_vector_field("lut", word_ptr, *beginning_ptr, tabs); + beginning_ptr += 1; + word_ptr += 3; + num_words += 3; + } + + if ((control_word >> 26) & 1) { + print_compact_branch_writeout_field(*word_ptr); + word_ptr += 1; + num_words += 1; + } + + if ((control_word >> 27) & 1) { + print_extended_branch_writeout_field((uint8_t *) word_ptr); + word_ptr += 3; + num_words += 3; + } + + if (num_quad_words > (num_words + 7) / 8) { + assert(num_quad_words == (num_words + 15) / 8); + //Assume that the extra quadword is constants + void *consts = words + (4 * num_quad_words - 4); + + if (is_embedded_constant_int) { + if (is_embedded_constant_half) { + int16_t *sconsts = (int16_t *) consts; + printf("sconstants %d, %d, %d, %d\n", + sconsts[0], + sconsts[1], + sconsts[2], + sconsts[3]); + } else { + int32_t *iconsts = (int32_t *) consts; + printf("iconstants %d, %d, %d, %d\n", + iconsts[0], + iconsts[1], + iconsts[2], + iconsts[3]); + } + } else { + if (is_embedded_constant_half) { + uint16_t *hconsts = (uint16_t *) consts; + printf("hconstants %g, %g, %g, %g\n", + _mesa_half_to_float(hconsts[0]), + _mesa_half_to_float(hconsts[1]), + _mesa_half_to_float(hconsts[2]), + _mesa_half_to_float(hconsts[3])); + } else { + uint32_t *fconsts = (uint32_t *) consts; + printf("fconstants %g, %g, %g, %g\n", + float_bitcast(fconsts[0]), + float_bitcast(fconsts[1]), + 
float_bitcast(fconsts[2]), + float_bitcast(fconsts[3])); + } + + } + } +} + +static void +print_varying_parameters(midgard_load_store_word *word) +{ + midgard_varying_parameter param; + unsigned v = word->varying_parameters; + memcpy(¶m, &v, sizeof(param)); + + if (param.is_varying) { + /* If a varying, there are qualifiers */ + if (param.flat) + printf(".flat"); + + if (param.interpolation != midgard_interp_default) { + if (param.interpolation == midgard_interp_centroid) + printf(".centroid"); + else + printf(".interp%d", param.interpolation); + } + + if (param.modifier != midgard_varying_mod_none) { + if (param.modifier == midgard_varying_mod_perspective_w) + printf(".perspectivew"); + else if (param.modifier == midgard_varying_mod_perspective_z) + printf(".perspectivez"); + else + printf(".mod%d", param.modifier); + } + } else if (param.flat || param.interpolation || param.modifier) { + printf(" /* is_varying not set but varying metadata attached */"); + } + + if (param.zero0 || param.zero1 || param.zero2) + printf(" /* zero tripped, %d %d %d */ ", param.zero0, param.zero1, param.zero2); +} + +static bool +is_op_varying(unsigned op) +{ + switch (op) { + case midgard_op_st_vary_16: + case midgard_op_st_vary_32: + case midgard_op_ld_vary_16: + case midgard_op_ld_vary_32: + return true; + } + + return false; +} + +static void +print_load_store_instr(uint64_t data, + unsigned tabs) +{ + midgard_load_store_word *word = (midgard_load_store_word *) &data; + + print_ld_st_opcode(word->op); + + if (is_op_varying(word->op)) + print_varying_parameters(word); + + printf(" r%d", word->reg); + print_mask_4(word->mask); + + int address = word->address; + + if (word->op == midgard_op_ld_uniform_32) { + /* Uniforms use their own addressing scheme */ + + int lo = word->varying_parameters >> 7; + int hi = word->address; + + /* TODO: Combine fields logically */ + address = (hi << 3) | lo; + } + + printf(", %d", address); + + print_swizzle_vec4(word->swizzle, false, false); + + printf(", 0x%X /* %X */\n", word->unknown, word->varying_parameters); +} + +static void +print_load_store_word(uint32_t *word, unsigned tabs) +{ + midgard_load_store *load_store = (midgard_load_store *) word; + + if (load_store->word1 != 3) { + print_load_store_instr(load_store->word1, tabs); + } + + if (load_store->word2 != 3) { + print_load_store_instr(load_store->word2, tabs); + } +} + +static void +print_texture_reg(bool full, bool select, bool upper) +{ + if (full) + printf("r%d", REG_TEX_BASE + select); + else + printf("hr%d", (REG_TEX_BASE + select) * 2 + upper); + + if (full && upper) + printf("// error: out full / upper mutually exclusive\n"); + +} + +static void +print_texture_reg_triple(unsigned triple) +{ + bool full = triple & 1; + bool select = triple & 2; + bool upper = triple & 4; + + print_texture_reg(full, select, upper); +} + +static void +print_texture_format(int format) +{ + /* Act like a modifier */ + printf("."); + + switch (format) { + DEFINE_CASE(MALI_TEX_1D, "1d"); + DEFINE_CASE(MALI_TEX_2D, "2d"); + DEFINE_CASE(MALI_TEX_3D, "3d"); + DEFINE_CASE(MALI_TEX_CUBE, "cube"); + + default: + unreachable("Bad format"); + } +} + +static void +print_texture_op(unsigned op, bool gather) +{ + /* Act like a bare name, like ESSL functions */ + + if (gather) { + printf("textureGather"); + + unsigned component = op >> 4; + unsigned bottom = op & 0xF; + + if (bottom != 0x2) + printf("_unk%d", bottom); + + printf(".%c", components[component]); + return; + } + + switch (op) { + DEFINE_CASE(TEXTURE_OP_NORMAL, "texture"); + 
DEFINE_CASE(TEXTURE_OP_LOD, "textureLod"); + DEFINE_CASE(TEXTURE_OP_TEXEL_FETCH, "texelFetch"); + + default: + printf("tex_%d", op); + break; + } +} + +static bool +texture_op_takes_bias(unsigned op) +{ + return op == TEXTURE_OP_NORMAL; +} + +static char +sampler_type_name(enum mali_sampler_type t) +{ + switch (t) { + case MALI_SAMPLER_FLOAT: + return 'f'; + case MALI_SAMPLER_UNSIGNED: + return 'u'; + case MALI_SAMPLER_SIGNED: + return 'i'; + default: + return '?'; + } + +} + +#undef DEFINE_CASE + +static void +print_texture_word(uint32_t *word, unsigned tabs) +{ + midgard_texture_word *texture = (midgard_texture_word *) word; + + /* Broad category of texture operation in question */ + print_texture_op(texture->op, texture->is_gather); + + /* Specific format in question */ + print_texture_format(texture->format); + + assert(texture->zero == 0); + + /* Instruction "modifiers" parallel the ALU instructions. */ + + if (texture->shadow) + printf(".shadow"); + + if (texture->cont) + printf(".cont"); + + if (texture->last) + printf(".last"); + + printf(" "); + + print_texture_reg(texture->out_full, texture->out_reg_select, texture->out_upper); + print_mask_4(texture->mask); + printf(", "); + + printf("texture%d, ", texture->texture_handle); + + /* Print the type, GL style */ + printf("%c", sampler_type_name(texture->sampler_type)); + printf("sampler%d", texture->sampler_handle); + print_swizzle_vec4(texture->swizzle, false, false); + printf(", "); + + print_texture_reg(texture->in_reg_full, texture->in_reg_select, texture->in_reg_upper); + print_swizzle_vec4(texture->in_reg_swizzle, false, false); + + /* There is *always* an offset attached. Of + * course, that offset is just immediate #0 for a + * GLES call that doesn't take an offset. If there + * is a non-negative non-zero offset, this is + * specified in immediate offset mode, with the + * values in the offset_* fields as immediates. If + * this is a negative offset, we instead switch to + * a register offset mode, where the offset_* + * fields become register triplets */ + + if (texture->offset_register) { + printf(" + "); + print_texture_reg_triple(texture->offset_x); + + /* The less questions you ask, the better. */ + + unsigned swizzle_lo, swizzle_hi; + unsigned orig_y = texture->offset_y; + unsigned orig_z = texture->offset_z; + + memcpy(&swizzle_lo, &orig_y, sizeof(unsigned)); + memcpy(&swizzle_hi, &orig_z, sizeof(unsigned)); + + /* Duplicate hi swizzle over */ + assert(swizzle_hi < 4); + swizzle_hi = (swizzle_hi << 2) | swizzle_hi; + + unsigned swiz = (swizzle_lo << 4) | swizzle_hi; + unsigned reversed = util_bitreverse(swiz) >> 24; + print_swizzle_vec4(reversed, false, false); + + printf(", "); + } else if (texture->offset_x || texture->offset_y || texture->offset_z) { + /* Only select ops allow negative immediate offsets, verify */ + + bool neg_x = texture->offset_x < 0; + bool neg_y = texture->offset_y < 0; + bool neg_z = texture->offset_z < 0; + bool any_neg = neg_x || neg_y || neg_z; + + if (any_neg && texture->op != TEXTURE_OP_TEXEL_FETCH) + printf("/* invalid negative */ "); + + /* Regardless, just print the immediate offset */ + + printf(" + <%d, %d, %d>, ", + texture->offset_x, + texture->offset_y, + texture->offset_z); + } else { + printf(", "); + } + + char lod_operand = texture_op_takes_bias(texture->op) ? 
'+' : '='; + + if (texture->lod_register) { + midgard_tex_register_select sel; + uint8_t raw = texture->bias; + memcpy(&sel, &raw, sizeof(raw)); + + unsigned c = (sel.component_hi << 1) | sel.component_lo; + + printf("lod %c ", lod_operand); + print_texture_reg(sel.full, sel.select, sel.upper); + printf(".%c, ", components[c]); + + if (!sel.component_hi) + printf(" /* gradient? */"); + + if (texture->bias_int) + printf(" /* bias_int = 0x%X */", texture->bias_int); + + if (sel.zero) + printf(" /* sel.zero = 0x%X */", sel.zero); + } else if (texture->op == TEXTURE_OP_TEXEL_FETCH) { + /* For texel fetch, the int LOD is in the fractional place and + * there is no fraction / possibility of bias. We *always* have + * an explicit LOD, even if it's zero. */ + + if (texture->bias_int) + printf(" /* bias_int = 0x%X */ ", texture->bias_int); + + printf("lod = %d, ", texture->bias); + } else if (texture->bias || texture->bias_int) { + signed bias_int = texture->bias_int; + float bias_frac = texture->bias / 256.0f; + float bias = bias_int + bias_frac; + + bool is_bias = texture_op_takes_bias(texture->op); + char sign = (bias >= 0.0) ? '+' : '-'; + char operand = is_bias ? sign : '='; + + printf("lod %c %f, ", operand, fabsf(bias)); + } + + printf("\n"); + + /* While not zero in general, for these simple instructions the + * following unknowns are zero, so we don't include them */ + + if (texture->unknown2 || + texture->unknown4 || + texture->unknownA || + texture->unknown8) { + printf("// unknown2 = 0x%x\n", texture->unknown2); + printf("// unknown4 = 0x%x\n", texture->unknown4); + printf("// unknownA = 0x%x\n", texture->unknownA); + printf("// unknown8 = 0x%x\n", texture->unknown8); + } +} + +void +disassemble_midgard(uint8_t *code, size_t size) +{ + uint32_t *words = (uint32_t *) code; + unsigned num_words = size / 4; + int tabs = 0; + + bool prefetch_flag = false; + + unsigned i = 0; + + while (i < num_words) { + unsigned tag = words[i] & 0xF; + unsigned num_quad_words = midgard_word_size[tag]; + + switch (midgard_word_types[tag]) { + case midgard_word_type_texture: + print_texture_word(&words[i], tabs); + break; + + case midgard_word_type_load_store: + print_load_store_word(&words[i], tabs); + break; + + case midgard_word_type_alu: + print_alu_word(&words[i], num_quad_words, tabs); + + if (prefetch_flag) + return; + + /* Reset word static analysis state */ + is_embedded_constant_half = false; + is_embedded_constant_int = false; + + break; + + default: + printf("Unknown word type %u:\n", words[i] & 0xF); + num_quad_words = 1; + print_quad_word(&words[i], tabs); + printf("\n"); + break; + } + + printf("\n"); + + unsigned next = (words[i] & 0xF0) >> 4; + + i += 4 * num_quad_words; + + /* Break based on instruction prefetch flag */ + + if (i < num_words && next == 1) { + prefetch_flag = true; + + if (midgard_word_types[words[i] & 0xF] != midgard_word_type_alu) + return; + } + } + + return; +} diff --git a/src/panfrost/midgard/disassemble.h b/src/panfrost/midgard/disassemble.h new file mode 100644 index 00000000000..ab1837c201e --- /dev/null +++ b/src/panfrost/midgard/disassemble.h @@ -0,0 +1,2 @@ +#include <stddef.h> +void disassemble_midgard(uint8_t *code, size_t size); diff --git a/src/panfrost/midgard/helpers.h b/src/panfrost/midgard/helpers.h new file mode 100644 index 00000000000..ef854dc60c1 --- /dev/null +++ b/src/panfrost/midgard/helpers.h @@ -0,0 +1,282 @@ +/* Copyright (c) 2018-2019 Alyssa Rosenzweig ([email protected]) + * + * Permission is hereby granted, free of charge, to any person 
obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#ifndef __MDG_HELPERS_H +#define __MDG_HELPERS_H + +#include "util/macros.h" +#include <string.h> + +#define OP_IS_STORE_VARY(op) (\ + op == midgard_op_st_vary_16 || \ + op == midgard_op_st_vary_32 \ + ) + +#define OP_IS_STORE(op) (\ + OP_IS_STORE_VARY(op) || \ + op == midgard_op_st_cubemap_coords \ + ) + +#define OP_IS_MOVE(op) ( \ + op == midgard_alu_op_fmov || \ + op == midgard_alu_op_imov \ + ) + +/* ALU control words are single bit fields with a lot of space */ + +#define ALU_ENAB_VEC_MUL (1 << 17) +#define ALU_ENAB_SCAL_ADD (1 << 19) +#define ALU_ENAB_VEC_ADD (1 << 21) +#define ALU_ENAB_SCAL_MUL (1 << 23) +#define ALU_ENAB_VEC_LUT (1 << 25) +#define ALU_ENAB_BR_COMPACT (1 << 26) +#define ALU_ENAB_BRANCH (1 << 27) + +/* Other opcode properties that don't conflict with the ALU_ENABs, non-ISA */ + +/* Denotes an opcode that takes a vector input with a fixed-number of + * channels, but outputs to only a single output channel, like dot products. + * For these, to determine the effective mask, this quirk can be set. We have + * an intentional off-by-one (a la MALI_POSITIVE), since 0-channel makes no + * sense but we need to fit 4 channels in 2-bits. Similarly, 1-channel doesn't + * make sense (since then why are we quirked?), so that corresponds to "no + * count set" */ + +#define OP_CHANNEL_COUNT(c) ((c - 1) << 0) +#define GET_CHANNEL_COUNT(c) ((c & (0x3 << 0)) ? ((c & (0x3 << 0)) + 1) : 0) + +/* For instructions that take a single argument, normally the first argument + * slot is used for the argument and the second slot is a dummy #0 constant. + * However, there are exceptions: instructions like fmov store their argument + * in the _second_ slot and store a dummy r24 in the first slot, designated by + * QUIRK_FLIPPED_R24 */ + +#define QUIRK_FLIPPED_R24 (1 << 2) + +/* Is the op commutative? */ +#define OP_COMMUTES (1 << 3) + +/* Does the op convert types between int- and float- space (i2f/f2u/etc) */ +#define OP_TYPE_CONVERT (1 << 4) + +/* Vector-independant shorthands for the above; these numbers are arbitrary and + * not from the ISA. 
Convert to the above with unit_enum_to_midgard */ + +#define UNIT_MUL 0 +#define UNIT_ADD 1 +#define UNIT_LUT 2 + +/* 4-bit type tags */ + +#define TAG_TEXTURE_4_VTX 0x2 +#define TAG_TEXTURE_4 0x3 +#define TAG_LOAD_STORE_4 0x5 +#define TAG_ALU_4 0x8 +#define TAG_ALU_8 0x9 +#define TAG_ALU_12 0xA +#define TAG_ALU_16 0xB + +static inline int +quadword_size(int tag) +{ + switch (tag) { + case TAG_ALU_4: + case TAG_LOAD_STORE_4: + case TAG_TEXTURE_4: + case TAG_TEXTURE_4_VTX: + return 1; + case TAG_ALU_8: + return 2; + case TAG_ALU_12: + return 3; + case TAG_ALU_16: + return 4; + default: + unreachable("Unknown tag"); + } +} + +#define IS_ALU(tag) (tag == TAG_ALU_4 || tag == TAG_ALU_8 || \ + tag == TAG_ALU_12 || tag == TAG_ALU_16) + +/* Special register aliases */ + +#define MAX_WORK_REGISTERS 16 + +/* Uniforms are begin at (REGISTER_UNIFORMS - uniform_count) */ +#define REGISTER_UNIFORMS 24 + +#define REGISTER_UNUSED 24 +#define REGISTER_CONSTANT 26 +#define REGISTER_VARYING_BASE 26 +#define REGISTER_OFFSET 27 +#define REGISTER_TEXTURE_BASE 28 +#define REGISTER_SELECT 31 + +/* SSA helper aliases to mimic the registers. UNUSED_0 encoded as an inline + * constant. UNUSED_1 encoded as REGISTER_UNUSED */ + +#define SSA_UNUSED_0 0 +#define SSA_UNUSED_1 -2 + +#define SSA_FIXED_SHIFT 24 +#define SSA_FIXED_REGISTER(reg) ((1 + reg) << SSA_FIXED_SHIFT) +#define SSA_REG_FROM_FIXED(reg) ((reg >> SSA_FIXED_SHIFT) - 1) +#define SSA_FIXED_MINIMUM SSA_FIXED_REGISTER(0) + +/* Swizzle support */ + +#define SWIZZLE(A, B, C, D) ((D << 6) | (C << 4) | (B << 2) | (A << 0)) +#define SWIZZLE_FROM_ARRAY(r) SWIZZLE(r[0], r[1], r[2], r[3]) +#define COMPONENT_X 0x0 +#define COMPONENT_Y 0x1 +#define COMPONENT_Z 0x2 +#define COMPONENT_W 0x3 + +#define SWIZZLE_XXXX SWIZZLE(COMPONENT_X, COMPONENT_X, COMPONENT_X, COMPONENT_X) +#define SWIZZLE_XYXX SWIZZLE(COMPONENT_X, COMPONENT_Y, COMPONENT_X, COMPONENT_X) +#define SWIZZLE_XYZX SWIZZLE(COMPONENT_X, COMPONENT_Y, COMPONENT_Z, COMPONENT_X) +#define SWIZZLE_XYZW SWIZZLE(COMPONENT_X, COMPONENT_Y, COMPONENT_Z, COMPONENT_W) +#define SWIZZLE_XYXZ SWIZZLE(COMPONENT_X, COMPONENT_Y, COMPONENT_X, COMPONENT_Z) +#define SWIZZLE_XYZZ SWIZZLE(COMPONENT_X, COMPONENT_Y, COMPONENT_Z, COMPONENT_Z) +#define SWIZZLE_WWWW SWIZZLE(COMPONENT_W, COMPONENT_W, COMPONENT_W, COMPONENT_W) + +static inline unsigned +swizzle_of(unsigned comp) +{ + switch (comp) { + case 1: + return SWIZZLE_XXXX; + case 2: + return SWIZZLE_XYXX; + case 3: + return SWIZZLE_XYZX; + case 4: + return SWIZZLE_XYZW; + default: + unreachable("Invalid component count"); + } +} + +static inline unsigned +mask_of(unsigned nr_comp) +{ + return (1 << nr_comp) - 1; +} + + +/* See ISA notes */ + +#define LDST_NOP (3) + +/* There are five ALU units: VMUL, VADD, SMUL, SADD, LUT. A given opcode is + * implemented on some subset of these units (or occassionally all of them). + * This table encodes a bit mask of valid units for each opcode, so the + * scheduler can figure where to plonk the instruction. 
*/ + +/* Shorthands for each unit */ +#define UNIT_VMUL ALU_ENAB_VEC_MUL +#define UNIT_SADD ALU_ENAB_SCAL_ADD +#define UNIT_VADD ALU_ENAB_VEC_ADD +#define UNIT_SMUL ALU_ENAB_SCAL_MUL +#define UNIT_VLUT ALU_ENAB_VEC_LUT + +/* Shorthands for usual combinations of units */ + +#define UNITS_MUL (UNIT_VMUL | UNIT_SMUL) +#define UNITS_ADD (UNIT_VADD | UNIT_SADD) +#define UNITS_MOST (UNITS_MUL | UNITS_ADD) +#define UNITS_ALL (UNITS_MOST | UNIT_VLUT) +#define UNITS_SCALAR (UNIT_SADD | UNIT_SMUL) +#define UNITS_VECTOR (UNIT_VMUL | UNIT_VADD) +#define UNITS_ANY_VECTOR (UNITS_VECTOR | UNIT_VLUT) + +struct mir_op_props { + const char *name; + unsigned props; +}; + +/* This file is common, so don't define the tables themselves. #include + * midgard_op.h if you need that, or edit midgard_ops.c directly */ + +/* Duplicate bits to convert a 4-bit writemask to duplicated 8-bit format, + * which is used for 32-bit vector units */ + +static inline unsigned +expand_writemask_32(unsigned mask) +{ + unsigned o = 0; + + for (int i = 0; i < 4; ++i) + if (mask & (1 << i)) + o |= (3 << (2 * i)); + + return o; +} + +/* Coerce structs to integer */ + +static inline unsigned +vector_alu_srco_unsigned(midgard_vector_alu_src src) +{ + unsigned u; + memcpy(&u, &src, sizeof(src)); + return u; +} + +static inline midgard_vector_alu_src +vector_alu_from_unsigned(unsigned u) +{ + midgard_vector_alu_src s; + memcpy(&s, &u, sizeof(s)); + return s; +} + +/* Composes two swizzles */ +static inline unsigned +pan_compose_swizzle(unsigned left, unsigned right) +{ + unsigned out = 0; + + for (unsigned c = 0; c < 4; ++c) { + unsigned s = (left >> (2*c)) & 0x3; + unsigned q = (right >> (2*s)) & 0x3; + + out |= (q << (2*c)); + } + + return out; +} + +/* Applies a swizzle to an ALU source */ + +static inline unsigned +vector_alu_apply_swizzle(unsigned src, unsigned swizzle) +{ + midgard_vector_alu_src s = + vector_alu_from_unsigned(src); + + s.swizzle = pan_compose_swizzle(s.swizzle, swizzle); + + return vector_alu_srco_unsigned(s); +} + +#endif diff --git a/src/panfrost/midgard/meson.build b/src/panfrost/midgard/meson.build new file mode 100644 index 00000000000..cbe26004e2d --- /dev/null +++ b/src/panfrost/midgard/meson.build @@ -0,0 +1,63 @@ +# Copyright © 2018 Rob Clark +# Copyright © 2019 Collabora + +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: + +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. + +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+ +libpanfrost_midgard_files = files( + 'midgard_compile.c', + 'mir.c', + 'midgard_print.c', + 'midgard_schedule.c', + 'midgard_emit.c', + 'midgard_ra.c', + 'midgard_ra_pipeline.c', + 'midgard_liveness.c', + 'midgard_ops.c', + 'cppwrap.cpp', + 'disassemble.c', +) + +midgard_nir_algebraic_c = custom_target( + 'midgard_nir_algebraic.c', + input : 'midgard_nir_algebraic.py', + output : 'midgard_nir_algebraic.c', + command : [ + prog_python, '@INPUT@', + '-p', join_paths(meson.source_root(), 'src/compiler/nir/'), + ], + capture : true, + depend_files : nir_algebraic_py, +) + +libpanfrost_midgard = static_library( + 'panfrost_midgard', + [libpanfrost_midgard_files, midgard_nir_algebraic_c], + include_directories : [ + inc_common, + inc_include, + inc_src, + inc_panfrost_hw, + ], + dependencies: [ + idep_nir + ], + c_args : [c_vis_args, no_override_init_args], + cpp_args : [cpp_vis_args], + build_by_default : false, +) diff --git a/src/panfrost/midgard/midgard-parse.h b/src/panfrost/midgard/midgard-parse.h new file mode 100644 index 00000000000..5d134839406 --- /dev/null +++ b/src/panfrost/midgard/midgard-parse.h @@ -0,0 +1,70 @@ +/* Author(s): + * Connor Abbott + * Alyssa Rosenzweig + * + * Copyright (c) 2013 Connor Abbott ([email protected]) + * Copyright (c) 2018 Alyssa Rosenzweig ([email protected]) + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ + +#ifndef __midgard_parse_h__ +#define __midgard_parse_h__ + +/* Additional metadata for parsing Midgard binaries, not needed for compilation */ + +static midgard_word_type midgard_word_types[16] = { + midgard_word_type_unknown, /* 0x0 */ + midgard_word_type_unknown, /* 0x1 */ + midgard_word_type_texture, /* 0x2 */ + midgard_word_type_texture, /* 0x3 */ + midgard_word_type_unknown, /* 0x4 */ + midgard_word_type_load_store, /* 0x5 */ + midgard_word_type_unknown, /* 0x6 */ + midgard_word_type_unknown, /* 0x7 */ + midgard_word_type_alu, /* 0x8 */ + midgard_word_type_alu, /* 0x9 */ + midgard_word_type_alu, /* 0xA */ + midgard_word_type_alu, /* 0xB */ + midgard_word_type_alu, /* 0xC */ + midgard_word_type_alu, /* 0xD */ + midgard_word_type_alu, /* 0xE */ + midgard_word_type_alu, /* 0xF */ +}; + +static unsigned midgard_word_size[16] = { + 0, /* 0x0 */ + 0, /* 0x1 */ + 1, /* 0x2 */ + 1, /* 0x3 */ + 0, /* 0x4 */ + 1, /* 0x5 */ + 0, /* 0x6 */ + 0, /* 0x7 */ + 1, /* 0x8 */ + 2, /* 0x9 */ + 3, /* 0xA */ + 4, /* 0xB */ + 1, /* 0xC */ + 2, /* 0xD */ + 3, /* 0xE */ + 4, /* 0xF */ +}; + +#endif diff --git a/src/panfrost/midgard/midgard.h b/src/panfrost/midgard/midgard.h new file mode 100644 index 00000000000..5953214c599 --- /dev/null +++ b/src/panfrost/midgard/midgard.h @@ -0,0 +1,646 @@ +/* Author(s): + * Connor Abbott + * Alyssa Rosenzweig + * + * Copyright (c) 2013 Connor Abbott ([email protected]) + * Copyright (c) 2018 Alyssa Rosenzweig ([email protected]) + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ + +#ifndef __midgard_h__ +#define __midgard_h__ + +#include <stdint.h> +#include <stdbool.h> +#include "panfrost-job.h" + +#define MIDGARD_DBG_MSGS 0x0001 +#define MIDGARD_DBG_SHADERS 0x0002 +#define MIDGARD_DBG_SHADERDB 0x0004 + +extern int midgard_debug; + +typedef enum { + midgard_word_type_alu, + midgard_word_type_load_store, + midgard_word_type_texture, + midgard_word_type_unknown +} midgard_word_type; + +typedef enum { + midgard_alu_vmul, + midgard_alu_sadd, + midgard_alu_smul, + midgard_alu_vadd, + midgard_alu_lut +} midgard_alu; + +/* + * ALU words + */ + +typedef enum { + midgard_alu_op_fadd = 0x10, + midgard_alu_op_fmul = 0x14, + + midgard_alu_op_fmin = 0x28, + midgard_alu_op_fmax = 0x2C, + + midgard_alu_op_fmov = 0x30, /* fmov_rte */ + midgard_alu_op_fmov_rtz = 0x31, + midgard_alu_op_fmov_rtn = 0x32, + midgard_alu_op_fmov_rtp = 0x33, + midgard_alu_op_froundeven = 0x34, + midgard_alu_op_ftrunc = 0x35, + midgard_alu_op_ffloor = 0x36, + midgard_alu_op_fceil = 0x37, + midgard_alu_op_ffma = 0x38, + midgard_alu_op_fdot3 = 0x3C, + midgard_alu_op_fdot3r = 0x3D, + midgard_alu_op_fdot4 = 0x3E, + midgard_alu_op_freduce = 0x3F, + + midgard_alu_op_iadd = 0x40, + midgard_alu_op_ishladd = 0x41, + midgard_alu_op_isub = 0x46, + midgard_alu_op_iaddsat = 0x48, + midgard_alu_op_uaddsat = 0x49, + midgard_alu_op_isubsat = 0x4E, + midgard_alu_op_usubsat = 0x4F, + + midgard_alu_op_imul = 0x58, + + midgard_alu_op_imin = 0x60, + midgard_alu_op_umin = 0x61, + midgard_alu_op_imax = 0x62, + midgard_alu_op_umax = 0x63, + midgard_alu_op_ihadd = 0x64, + midgard_alu_op_uhadd = 0x65, + midgard_alu_op_irhadd = 0x66, + midgard_alu_op_urhadd = 0x67, + midgard_alu_op_iasr = 0x68, + midgard_alu_op_ilsr = 0x69, + midgard_alu_op_ishl = 0x6E, + + midgard_alu_op_iand = 0x70, + midgard_alu_op_ior = 0x71, + midgard_alu_op_inand = 0x72, /* ~(a & b), for inot let a = b */ + midgard_alu_op_inor = 0x73, /* ~(a | b) */ + midgard_alu_op_iandnot = 0x74, /* (a & ~b), used for not/b2f */ + midgard_alu_op_iornot = 0x75, /* (a | ~b) */ + midgard_alu_op_ixor = 0x76, + midgard_alu_op_inxor = 0x77, /* ~(a & b) */ + midgard_alu_op_iclz = 0x78, /* Number of zeroes on left */ + midgard_alu_op_ibitcount8 = 0x7A, /* Counts bits in 8-bit increments */ + midgard_alu_op_imov = 0x7B, + midgard_alu_op_iabsdiff = 0x7C, + midgard_alu_op_uabsdiff = 0x7D, + midgard_alu_op_ichoose = 0x7E, /* vector, component number - dupe for shuffle() */ + + midgard_alu_op_feq = 0x80, + midgard_alu_op_fne = 0x81, + midgard_alu_op_flt = 0x82, + midgard_alu_op_fle = 0x83, + midgard_alu_op_fball_eq = 0x88, + midgard_alu_op_bball_eq = 0x89, + midgard_alu_op_fball_lt = 0x8A, /* all(lessThan(.., ..)) */ + midgard_alu_op_fball_lte = 0x8B, /* all(lessThanEqual(.., ..)) */ + + midgard_alu_op_bbany_neq = 0x90, /* used for bvec4(1) */ + midgard_alu_op_fbany_neq = 0x91, /* bvec4(0) also */ + midgard_alu_op_fbany_lt = 0x92, /* any(lessThan(.., ..)) */ + midgard_alu_op_fbany_lte = 0x93, /* any(lessThanEqual(.., ..)) */ + + midgard_alu_op_f2i_rte = 0x98, + midgard_alu_op_f2i_rtz = 0x99, + midgard_alu_op_f2i_rtn = 0x9A, + midgard_alu_op_f2i_rtp = 0x9B, + midgard_alu_op_f2u_rte = 0x9C, + midgard_alu_op_f2u_rtz = 0x9D, + midgard_alu_op_f2u_rtn = 0x9E, + midgard_alu_op_f2u_rtp = 0x9F, + + midgard_alu_op_ieq = 0xA0, + midgard_alu_op_ine = 0xA1, + midgard_alu_op_ult = 0xA2, + midgard_alu_op_ule = 0xA3, + midgard_alu_op_ilt = 0xA4, + midgard_alu_op_ile = 0xA5, + midgard_alu_op_iball_eq = 0xA8, + midgard_alu_op_iball_neq = 0xA9, + midgard_alu_op_uball_lt = 0xAA, + 
midgard_alu_op_uball_lte = 0xAB, + midgard_alu_op_iball_lt = 0xAC, + midgard_alu_op_iball_lte = 0xAD, + + midgard_alu_op_ibany_eq = 0xB0, + midgard_alu_op_ibany_neq = 0xB1, + midgard_alu_op_ubany_lt = 0xB2, + midgard_alu_op_ubany_lte = 0xB3, + midgard_alu_op_ibany_lt = 0xB4, /* any(lessThan(.., ..)) */ + midgard_alu_op_ibany_lte = 0xB5, /* any(lessThanEqual(.., ..)) */ + midgard_alu_op_i2f_rte = 0xB8, + midgard_alu_op_i2f_rtz = 0xB9, + midgard_alu_op_i2f_rtn = 0xBA, + midgard_alu_op_i2f_rtp = 0xBB, + midgard_alu_op_u2f_rte = 0xBC, + midgard_alu_op_u2f_rtz = 0xBD, + midgard_alu_op_u2f_rtn = 0xBE, + midgard_alu_op_u2f_rtp = 0xBF, + + midgard_alu_op_icsel_v = 0xC0, /* condition code r31 */ + midgard_alu_op_icsel = 0xC1, /* condition code r31.w */ + midgard_alu_op_fcsel_v = 0xC4, + midgard_alu_op_fcsel = 0xC5, + midgard_alu_op_fround = 0xC6, + + midgard_alu_op_fatan_pt2 = 0xE8, + midgard_alu_op_fpow_pt1 = 0xEC, + midgard_alu_op_fpown_pt1 = 0xED, + midgard_alu_op_fpowr_pt1 = 0xEE, + + midgard_alu_op_frcp = 0xF0, + midgard_alu_op_frsqrt = 0xF2, + midgard_alu_op_fsqrt = 0xF3, + midgard_alu_op_fexp2 = 0xF4, + midgard_alu_op_flog2 = 0xF5, + midgard_alu_op_fsin = 0xF6, + midgard_alu_op_fcos = 0xF7, + midgard_alu_op_fatan2_pt1 = 0xF9, +} midgard_alu_op; + +typedef enum { + midgard_outmod_none = 0, + midgard_outmod_pos = 1, + /* 0x2 unknown */ + midgard_outmod_sat = 3 +} midgard_outmod_float; + +typedef enum { + midgard_outmod_int_saturate = 0, + midgard_outmod_uint_saturate = 1, + midgard_outmod_int_wrap = 2, + midgard_outmod_int_high = 3, /* Overflowed portion */ +} midgard_outmod_int; + +typedef enum { + midgard_reg_mode_8 = 0, + midgard_reg_mode_16 = 1, + midgard_reg_mode_32 = 2, + midgard_reg_mode_64 = 3 +} midgard_reg_mode; + +typedef enum { + midgard_dest_override_lower = 0, + midgard_dest_override_upper = 1, + midgard_dest_override_none = 2 +} midgard_dest_override; + +typedef enum { + midgard_int_sign_extend = 0, + midgard_int_zero_extend = 1, + midgard_int_normal = 2, + midgard_int_shift = 3 +} midgard_int_mod; + +#define MIDGARD_FLOAT_MOD_ABS (1 << 0) +#define MIDGARD_FLOAT_MOD_NEG (1 << 1) + +typedef struct +__attribute__((__packed__)) +{ + /* Either midgard_int_mod or from midgard_float_mod_*, depending on the + * type of op */ + unsigned mod : 2; + + /* replicate lower half if dest = half, or low/high half selection if + * dest = full + */ + bool rep_low : 1; + bool rep_high : 1; /* unused if dest = full */ + bool half : 1; /* only matters if dest = full */ + unsigned swizzle : 8; +} +midgard_vector_alu_src; + +typedef struct +__attribute__((__packed__)) +{ + midgard_alu_op op : 8; + midgard_reg_mode reg_mode : 2; + unsigned src1 : 13; + unsigned src2 : 13; + midgard_dest_override dest_override : 2; + midgard_outmod_float outmod : 2; + unsigned mask : 8; +} +midgard_vector_alu; + +typedef struct +__attribute__((__packed__)) +{ + bool abs : 1; + bool negate : 1; + bool full : 1; /* 0 = half, 1 = full */ + unsigned component : 3; +} +midgard_scalar_alu_src; + +typedef struct +__attribute__((__packed__)) +{ + midgard_alu_op op : 8; + unsigned src1 : 6; + unsigned src2 : 11; + unsigned unknown : 1; + unsigned outmod : 2; + bool output_full : 1; + unsigned output_component : 3; +} +midgard_scalar_alu; + +typedef struct +__attribute__((__packed__)) +{ + unsigned src1_reg : 5; + unsigned src2_reg : 5; + unsigned out_reg : 5; + bool src2_imm : 1; +} +midgard_reg_info; + +/* In addition to conditional branches and jumps (unconditional branches), + * Midgard implements a bit of fixed function 
functionality used in fragment + * shaders via specially crafted branches. These have special branch opcodes, + * which perform a fixed-function operation and/or use the results of a + * fixed-function operation as the branch condition. */ + +typedef enum { + /* Regular branches */ + midgard_jmp_writeout_op_branch_uncond = 1, + midgard_jmp_writeout_op_branch_cond = 2, + + /* In a fragment shader, execute a discard_if instruction, with the + * corresponding condition code. Terminates the shader, so generally + * set the branch target to out of the shader */ + midgard_jmp_writeout_op_discard = 4, + + /* Branch if the tilebuffer is not yet ready. At the beginning of a + * fragment shader that reads from the tile buffer, for instance via + * ARM_shader_framebuffer_fetch or EXT_pixel_local_storage, this branch + * operation should be used as a loop. An instruction like + * "br.tilebuffer.always -1" does the trick, corresponding to + * "while(!is_tilebuffer_ready) */ + midgard_jmp_writeout_op_tilebuffer_pending = 6, + + /* In a fragment shader, try to write out the value pushed to r0 to the + * tilebuffer, subject to unknown state in r1.z and r1.w. If this + * succeeds, the shader terminates. If it fails, it branches to the + * specified branch target. Generally, this should be used in a loop to + * itself, acting as "do { write(r0); } while(!write_successful);" */ + midgard_jmp_writeout_op_writeout = 7, +} midgard_jmp_writeout_op; + +typedef enum { + midgard_condition_write0 = 0, + + /* These condition codes denote a conditional branch on FALSE and on + * TRUE respectively */ + midgard_condition_false = 1, + midgard_condition_true = 2, + + /* This condition code always branches. For a pure branch, the + * unconditional branch coding should be used instead, but for + * fixed-function branch opcodes, this is still useful */ + midgard_condition_always = 3, +} midgard_condition; + +typedef struct +__attribute__((__packed__)) +{ + midgard_jmp_writeout_op op : 3; /* == branch_uncond */ + unsigned dest_tag : 4; /* tag of branch destination */ + unsigned unknown : 2; + int offset : 7; +} +midgard_branch_uncond; + +typedef struct +__attribute__((__packed__)) +{ + midgard_jmp_writeout_op op : 3; /* == branch_cond */ + unsigned dest_tag : 4; /* tag of branch destination */ + int offset : 7; + midgard_condition cond : 2; +} +midgard_branch_cond; + +typedef struct +__attribute__((__packed__)) +{ + midgard_jmp_writeout_op op : 3; /* == branch_cond */ + unsigned dest_tag : 4; /* tag of branch destination */ + unsigned unknown : 2; + signed offset : 23; + unsigned cond : 16; +} +midgard_branch_extended; + +typedef struct +__attribute__((__packed__)) +{ + midgard_jmp_writeout_op op : 3; /* == writeout */ + unsigned unknown : 13; +} +midgard_writeout; + +/* + * Load/store words + */ + +typedef enum { + midgard_op_ld_st_noop = 0x03, + + /* Unclear why this is on the L/S unit, but (with an address of 0, + * appropriate swizzle, magic constant 0x24, and xy mask?) moves fp32 cube + * map coordinates in r27 to its cube map texture coordinate + * destination (e.g r29). 0x4 magic for lding from fp16 instead */ + + midgard_op_st_cubemap_coords = 0x0E, + + /* Used in OpenCL. Probably can ld other things as well */ + midgard_op_ld_global_id = 0x10, + + /* The L/S unit can do perspective division a clock faster than the ALU + * if you're lucky. Put the vec4 in r27, and call with 0x24 as the + * unknown state; the output will be <x/w, y/w, z/w, 1>. 
Replace w with + * z for the z version */ + midgard_op_ldst_perspective_division_z = 0x12, + midgard_op_ldst_perspective_division_w = 0x13, + + /* val in r27.y, address embedded, outputs result to argument. Invert val for sub. Let val = +-1 for inc/dec. */ + midgard_op_atomic_add = 0x40, + midgard_op_atomic_and = 0x44, + midgard_op_atomic_or = 0x48, + midgard_op_atomic_xor = 0x4C, + + midgard_op_atomic_imin = 0x50, + midgard_op_atomic_umin = 0x54, + midgard_op_atomic_imax = 0x58, + midgard_op_atomic_umax = 0x5C, + + midgard_op_atomic_xchg = 0x60, + + /* Used for compute shader's __global arguments, __local variables (or + * for register spilling) */ + + midgard_op_ld_char = 0x81, + midgard_op_ld_char2 = 0x84, + midgard_op_ld_short = 0x85, + midgard_op_ld_char4 = 0x88, /* short2, int, float */ + midgard_op_ld_short4 = 0x8C, /* int2, float2, long */ + midgard_op_ld_int4 = 0x90, /* float4, long2 */ + + midgard_op_ld_attr_32 = 0x94, + midgard_op_ld_attr_16 = 0x95, + midgard_op_ld_attr_32u = 0x96, + midgard_op_ld_attr_32i = 0x97, + midgard_op_ld_vary_32 = 0x98, + midgard_op_ld_vary_16 = 0x99, + midgard_op_ld_vary_32u = 0x9A, + midgard_op_ld_vary_32i = 0x9B, + midgard_op_ld_color_buffer_16 = 0x9D, + + midgard_op_ld_uniform_16 = 0xAC, + midgard_op_ld_uniform_32i = 0xA8, + + midgard_op_ld_uniform_32 = 0xB0, + midgard_op_ld_color_buffer_8 = 0xBA, + + midgard_op_st_char = 0xC0, + midgard_op_st_char2 = 0xC4, /* short */ + midgard_op_st_char4 = 0xC8, /* short2, int, float */ + midgard_op_st_short4 = 0xCC, /* int2, float2, long */ + midgard_op_st_int4 = 0xD0, /* float4, long2 */ + + midgard_op_st_vary_32 = 0xD4, + midgard_op_st_vary_16 = 0xD5, + midgard_op_st_vary_32u = 0xD6, + midgard_op_st_vary_32i = 0xD7, + + /* Value to st in r27, location r26.w as short2 */ + midgard_op_st_image_f = 0xD8, + midgard_op_st_image_ui = 0xDA, + midgard_op_st_image_i = 0xDB, +} midgard_load_store_op; + +typedef enum { + midgard_interp_centroid = 1, + midgard_interp_default = 2 +} midgard_interpolation; + +typedef enum { + midgard_varying_mod_none = 0, + + /* Other values unknown */ + + /* Take the would-be result and divide all components by its z/w + * (perspective division baked in with the load) */ + midgard_varying_mod_perspective_z = 2, + midgard_varying_mod_perspective_w = 3, +} midgard_varying_modifier; + +typedef struct +__attribute__((__packed__)) +{ + unsigned zero0 : 1; /* Always zero */ + + midgard_varying_modifier modifier : 2; + + unsigned zero1: 1; /* Always zero */ + + /* Varying qualifiers, zero if not a varying */ + unsigned flat : 1; + unsigned is_varying : 1; /* Always one for varying, but maybe something else? 
 */
+        midgard_interpolation interpolation : 2;
+
+        unsigned zero2 : 2; /* Always zero */
+}
+midgard_varying_parameter;
+
+typedef struct
+__attribute__((__packed__))
+{
+        midgard_load_store_op op : 8;
+        unsigned reg : 5;
+        unsigned mask : 4;
+        unsigned swizzle : 8;
+        unsigned unknown : 16;
+
+        unsigned varying_parameters : 10;
+
+        unsigned address : 9;
+}
+midgard_load_store_word;
+
+typedef struct
+__attribute__((__packed__))
+{
+        unsigned type : 4;
+        unsigned next_type : 4;
+        uint64_t word1 : 60;
+        uint64_t word2 : 60;
+}
+midgard_load_store;
+
+/* 8-bit register selector used in texture ops to select a bias/LOD/gradient
+ * register, shoved into the `bias` field */
+
+typedef struct
+__attribute__((__packed__))
+{
+        /* Combines with component_hi to form 2-bit component select out of
+         * xyzw, as the component for bias/LOD and the starting component of a
+         * gradient vector */
+
+        unsigned component_lo : 1;
+
+        /* Register select between r28/r29 */
+        unsigned select : 1;
+
+        /* For a half-register, selects the upper half */
+        unsigned upper : 1;
+
+        /* Specifies a full-register, clear for a half-register. Mutually
+         * exclusive with upper. */
+        unsigned full : 1;
+
+        /* Higher half of component_lo. Always seen to be set for LOD/bias
+         * and clear for processed gradients, but I'm not sure if that's a
+         * hardware requirement. */
+        unsigned component_hi : 1;
+
+        /* Padding to make this 8-bit */
+        unsigned zero : 3;
+}
+midgard_tex_register_select;
+
+/* Texture pipeline results are in r28-r29 */
+#define REG_TEX_BASE 28
+
+/* Texture opcodes... maybe? */
+#define TEXTURE_OP_NORMAL 0x11 /* texture */
+#define TEXTURE_OP_LOD 0x12 /* textureLod */
+#define TEXTURE_OP_TEXEL_FETCH 0x14 /* texelFetch */
+
+enum mali_sampler_type {
+        MALI_SAMPLER_UNK = 0x0,
+        MALI_SAMPLER_FLOAT = 0x1, /* sampler */
+        MALI_SAMPLER_UNSIGNED = 0x2, /* usampler */
+        MALI_SAMPLER_SIGNED = 0x3, /* isampler */
+};
+
+typedef struct
+__attribute__((__packed__))
+{
+        unsigned type : 4;
+        unsigned next_type : 4;
+
+        unsigned op : 6;
+        unsigned shadow : 1;
+        unsigned is_gather : 1;
+
+        /* A little obscure, but last is set for the last texture operation in
+         * a shader. cont appears to just be last's opposite (?). Yeah, I know,
+         * kind of funky.. BiOpen thinks it could do with memory hinting, or
+         * tile locking? */
+
+        unsigned cont : 1;
+        unsigned last : 1;
+
+        enum mali_texture_type format : 2;
+        unsigned zero : 2;
+
+        /* Is a register used to specify the
+         * LOD/bias/offset? If set, use the `bias` field as
+         * a register index. If clear, use the `bias` field
+         * as an immediate. */
+        unsigned lod_register : 1;
+
+        /* Is a register used to specify an offset? If set, use the
+         * offset_reg_* fields to encode this, duplicated for each of the
+         * components. If clear, there is implicitly always an immediate
+         * offset specified in offset_imm_* */
+        unsigned offset_register : 1;
+
+        unsigned in_reg_full : 1;
+        unsigned in_reg_select : 1;
+        unsigned in_reg_upper : 1;
+        unsigned in_reg_swizzle : 8;
+
+        unsigned unknown8 : 2;
+
+        unsigned out_full : 1;
+
+        enum mali_sampler_type sampler_type : 2;
+
+        unsigned out_reg_select : 1;
+        unsigned out_upper : 1;
+
+        unsigned mask : 4;
+
+        unsigned unknown2 : 2;
+
+        unsigned swizzle : 8;
+        unsigned unknown4 : 8;
+
+        unsigned unknownA : 4;
+
+        /* In immediate mode, each offset field is an immediate range [0, 7].
+         *
+         * In register mode, offset_x becomes a register full / select / upper
+         * triplet and a vec3 swizzle is splattered across offset_y/offset_z in
+         * a genuinely bizarre way.
+ * + * For texel fetches in immediate mode, the range is the full [-8, 7], + * but for normal texturing the top bit must be zero and a register + * used instead. It's not clear where this limitation is from. */ + + signed offset_x : 4; + signed offset_y : 4; + signed offset_z : 4; + + /* In immediate bias mode, for a normal texture op, this is + * texture bias, computed as int(2^8 * frac(biasf)), with + * bias_int = floor(bias). For a textureLod, it's that, but + * s/bias/lod. For a texel fetch, this is the LOD as-is. + * + * In register mode, this is a midgard_tex_register_select + * structure and bias_int is zero */ + + unsigned bias : 8; + signed bias_int : 8; + + unsigned texture_handle : 16; + unsigned sampler_handle : 16; +} +midgard_texture_word; + +#endif diff --git a/src/panfrost/midgard/midgard_compile.c b/src/panfrost/midgard/midgard_compile.c new file mode 100644 index 00000000000..9c1349094bd --- /dev/null +++ b/src/panfrost/midgard/midgard_compile.c @@ -0,0 +1,2901 @@ +/* + * Copyright (C) 2018-2019 Alyssa Rosenzweig <[email protected]> + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include <sys/types.h> +#include <sys/stat.h> +#include <sys/mman.h> +#include <fcntl.h> +#include <stdint.h> +#include <stdlib.h> +#include <stdio.h> +#include <err.h> + +#include "main/mtypes.h" +#include "compiler/glsl/glsl_to_nir.h" +#include "compiler/nir_types.h" +#include "main/imports.h" +#include "compiler/nir/nir_builder.h" +#include "util/half_float.h" +#include "util/u_math.h" +#include "util/u_debug.h" +#include "util/u_dynarray.h" +#include "util/list.h" +#include "main/mtypes.h" + +#include "midgard.h" +#include "midgard_nir.h" +#include "midgard_compile.h" +#include "midgard_ops.h" +#include "helpers.h" +#include "compiler.h" + +#include "disassemble.h" + +static const struct debug_named_value debug_options[] = { + {"msgs", MIDGARD_DBG_MSGS, "Print debug messages"}, + {"shaders", MIDGARD_DBG_SHADERS, "Dump shaders in NIR and MIR"}, + {"shaderdb", MIDGARD_DBG_SHADERDB, "Prints shader-db statistics"}, + DEBUG_NAMED_VALUE_END +}; + +DEBUG_GET_ONCE_FLAGS_OPTION(midgard_debug, "MIDGARD_MESA_DEBUG", debug_options, 0) + +unsigned SHADER_DB_COUNT = 0; + +int midgard_debug = 0; + +#define DBG(fmt, ...) 
\ + do { if (midgard_debug & MIDGARD_DBG_MSGS) \ + fprintf(stderr, "%s:%d: "fmt, \ + __FUNCTION__, __LINE__, ##__VA_ARGS__); } while (0) + +static bool +midgard_is_branch_unit(unsigned unit) +{ + return (unit == ALU_ENAB_BRANCH) || (unit == ALU_ENAB_BR_COMPACT); +} + +static void +midgard_block_add_successor(midgard_block *block, midgard_block *successor) +{ + block->successors[block->nr_successors++] = successor; + assert(block->nr_successors <= ARRAY_SIZE(block->successors)); +} + +/* Helpers to generate midgard_instruction's using macro magic, since every + * driver seems to do it that way */ + +#define EMIT(op, ...) emit_mir_instruction(ctx, v_##op(__VA_ARGS__)); + +#define M_LOAD_STORE(name, rname, uname) \ + static midgard_instruction m_##name(unsigned ssa, unsigned address) { \ + midgard_instruction i = { \ + .type = TAG_LOAD_STORE_4, \ + .mask = 0xF, \ + .ssa_args = { \ + .rname = ssa, \ + .uname = -1, \ + .src1 = -1 \ + }, \ + .load_store = { \ + .op = midgard_op_##name, \ + .swizzle = SWIZZLE_XYZW, \ + .address = address \ + } \ + }; \ + \ + return i; \ + } + +#define M_LOAD(name) M_LOAD_STORE(name, dest, src0) +#define M_STORE(name) M_LOAD_STORE(name, src0, dest) + +/* Inputs a NIR ALU source, with modifiers attached if necessary, and outputs + * the corresponding Midgard source */ + +static midgard_vector_alu_src +vector_alu_modifiers(nir_alu_src *src, bool is_int, unsigned broadcast_count, + bool half, bool sext) +{ + if (!src) return blank_alu_src; + + /* Figure out how many components there are so we can adjust the + * swizzle. Specifically we want to broadcast the last channel so + * things like ball2/3 work + */ + + if (broadcast_count) { + uint8_t last_component = src->swizzle[broadcast_count - 1]; + + for (unsigned c = broadcast_count; c < NIR_MAX_VEC_COMPONENTS; ++c) { + src->swizzle[c] = last_component; + } + } + + midgard_vector_alu_src alu_src = { + .rep_low = 0, + .rep_high = 0, + .half = half, + .swizzle = SWIZZLE_FROM_ARRAY(src->swizzle) + }; + + if (is_int) { + alu_src.mod = midgard_int_normal; + + /* Sign/zero-extend if needed */ + + if (half) { + alu_src.mod = sext ? + midgard_int_sign_extend + : midgard_int_zero_extend; + } + + /* These should have been lowered away */ + assert(!(src->abs || src->negate)); + } else { + alu_src.mod = (src->abs << 0) | (src->negate << 1); + } + + return alu_src; +} + +/* load/store instructions have both 32-bit and 16-bit variants, depending on + * whether we are using vectors composed of highp or mediump. At the moment, we + * don't support half-floats -- this requires changes in other parts of the + * compiler -- therefore the 16-bit versions are commented out. 
*/ + +//M_LOAD(ld_attr_16); +M_LOAD(ld_attr_32); +//M_LOAD(ld_vary_16); +M_LOAD(ld_vary_32); +//M_LOAD(ld_uniform_16); +M_LOAD(ld_uniform_32); +M_LOAD(ld_color_buffer_8); +//M_STORE(st_vary_16); +M_STORE(st_vary_32); +M_STORE(st_cubemap_coords); + +static midgard_instruction +v_alu_br_compact_cond(midgard_jmp_writeout_op op, unsigned tag, signed offset, unsigned cond) +{ + midgard_branch_cond branch = { + .op = op, + .dest_tag = tag, + .offset = offset, + .cond = cond + }; + + uint16_t compact; + memcpy(&compact, &branch, sizeof(branch)); + + midgard_instruction ins = { + .type = TAG_ALU_4, + .unit = ALU_ENAB_BR_COMPACT, + .prepacked_branch = true, + .compact_branch = true, + .br_compact = compact + }; + + if (op == midgard_jmp_writeout_op_writeout) + ins.writeout = true; + + return ins; +} + +static midgard_instruction +v_branch(bool conditional, bool invert) +{ + midgard_instruction ins = { + .type = TAG_ALU_4, + .unit = ALU_ENAB_BRANCH, + .compact_branch = true, + .branch = { + .conditional = conditional, + .invert_conditional = invert + } + }; + + return ins; +} + +static midgard_branch_extended +midgard_create_branch_extended( midgard_condition cond, + midgard_jmp_writeout_op op, + unsigned dest_tag, + signed quadword_offset) +{ + /* For unclear reasons, the condition code is repeated 8 times */ + uint16_t duplicated_cond = + (cond << 14) | + (cond << 12) | + (cond << 10) | + (cond << 8) | + (cond << 6) | + (cond << 4) | + (cond << 2) | + (cond << 0); + + midgard_branch_extended branch = { + .op = op, + .dest_tag = dest_tag, + .offset = quadword_offset, + .cond = duplicated_cond + }; + + return branch; +} + +static void +attach_constants(compiler_context *ctx, midgard_instruction *ins, void *constants, int name) +{ + ins->has_constants = true; + memcpy(&ins->constants, constants, 16); +} + +static int +glsl_type_size(const struct glsl_type *type, bool bindless) +{ + return glsl_count_attribute_slots(type, false); +} + +/* Lower fdot2 to a vector multiplication followed by channel addition */ +static void +midgard_nir_lower_fdot2_body(nir_builder *b, nir_alu_instr *alu) +{ + if (alu->op != nir_op_fdot2) + return; + + b->cursor = nir_before_instr(&alu->instr); + + nir_ssa_def *src0 = nir_ssa_for_alu_src(b, alu, 0); + nir_ssa_def *src1 = nir_ssa_for_alu_src(b, alu, 1); + + nir_ssa_def *product = nir_fmul(b, src0, src1); + + nir_ssa_def *sum = nir_fadd(b, + nir_channel(b, product, 0), + nir_channel(b, product, 1)); + + /* Replace the fdot2 with this sum */ + nir_ssa_def_rewrite_uses(&alu->dest.dest.ssa, nir_src_for_ssa(sum)); +} + +static int +midgard_nir_sysval_for_intrinsic(nir_intrinsic_instr *instr) +{ + switch (instr->intrinsic) { + case nir_intrinsic_load_viewport_scale: + return PAN_SYSVAL_VIEWPORT_SCALE; + case nir_intrinsic_load_viewport_offset: + return PAN_SYSVAL_VIEWPORT_OFFSET; + default: + return -1; + } +} + +static unsigned +nir_dest_index(compiler_context *ctx, nir_dest *dst) +{ + if (dst->is_ssa) + return dst->ssa.index; + else { + assert(!dst->reg.indirect); + return ctx->func->impl->ssa_alloc + dst->reg.reg->index; + } +} + +static int sysval_for_instr(compiler_context *ctx, nir_instr *instr, + unsigned *dest) +{ + nir_intrinsic_instr *intr; + nir_dest *dst = NULL; + nir_tex_instr *tex; + int sysval = -1; + + switch (instr->type) { + case nir_instr_type_intrinsic: + intr = nir_instr_as_intrinsic(instr); + sysval = midgard_nir_sysval_for_intrinsic(intr); + dst = &intr->dest; + break; + case nir_instr_type_tex: + tex = nir_instr_as_tex(instr); + if (tex->op != 
nir_texop_txs) + break; + + sysval = PAN_SYSVAL(TEXTURE_SIZE, + PAN_TXS_SYSVAL_ID(tex->texture_index, + nir_tex_instr_dest_size(tex) - + (tex->is_array ? 1 : 0), + tex->is_array)); + dst = &tex->dest; + break; + default: + break; + } + + if (dest && dst) + *dest = nir_dest_index(ctx, dst); + + return sysval; +} + +static void +midgard_nir_assign_sysval_body(compiler_context *ctx, nir_instr *instr) +{ + int sysval; + + sysval = sysval_for_instr(ctx, instr, NULL); + if (sysval < 0) + return; + + /* We have a sysval load; check if it's already been assigned */ + + if (_mesa_hash_table_u64_search(ctx->sysval_to_id, sysval)) + return; + + /* It hasn't -- so assign it now! */ + + unsigned id = ctx->sysval_count++; + _mesa_hash_table_u64_insert(ctx->sysval_to_id, sysval, (void *) ((uintptr_t) id + 1)); + ctx->sysvals[id] = sysval; +} + +static void +midgard_nir_assign_sysvals(compiler_context *ctx, nir_shader *shader) +{ + ctx->sysval_count = 0; + + nir_foreach_function(function, shader) { + if (!function->impl) continue; + + nir_foreach_block(block, function->impl) { + nir_foreach_instr_safe(instr, block) { + midgard_nir_assign_sysval_body(ctx, instr); + } + } + } +} + +static bool +midgard_nir_lower_fdot2(nir_shader *shader) +{ + bool progress = false; + + nir_foreach_function(function, shader) { + if (!function->impl) continue; + + nir_builder _b; + nir_builder *b = &_b; + nir_builder_init(b, function->impl); + + nir_foreach_block(block, function->impl) { + nir_foreach_instr_safe(instr, block) { + if (instr->type != nir_instr_type_alu) continue; + + nir_alu_instr *alu = nir_instr_as_alu(instr); + midgard_nir_lower_fdot2_body(b, alu); + + progress |= true; + } + } + + nir_metadata_preserve(function->impl, nir_metadata_block_index | nir_metadata_dominance); + + } + + return progress; +} + +/* Flushes undefined values to zero */ + +static void +optimise_nir(nir_shader *nir) +{ + bool progress; + unsigned lower_flrp = + (nir->options->lower_flrp16 ? 16 : 0) | + (nir->options->lower_flrp32 ? 32 : 0) | + (nir->options->lower_flrp64 ? 64 : 0); + + NIR_PASS(progress, nir, nir_lower_regs_to_ssa); + NIR_PASS(progress, nir, midgard_nir_lower_fdot2); + NIR_PASS(progress, nir, nir_lower_idiv); + + nir_lower_tex_options lower_tex_1st_pass_options = { + .lower_rect = true, + .lower_txp = ~0 + }; + + nir_lower_tex_options lower_tex_2nd_pass_options = { + .lower_txs_lod = true, + }; + + NIR_PASS(progress, nir, nir_lower_tex, &lower_tex_1st_pass_options); + NIR_PASS(progress, nir, nir_lower_tex, &lower_tex_2nd_pass_options); + + do { + progress = false; + + NIR_PASS(progress, nir, nir_lower_var_copies); + NIR_PASS(progress, nir, nir_lower_vars_to_ssa); + + NIR_PASS(progress, nir, nir_copy_prop); + NIR_PASS(progress, nir, nir_opt_dce); + NIR_PASS(progress, nir, nir_opt_dead_cf); + NIR_PASS(progress, nir, nir_opt_cse); + NIR_PASS(progress, nir, nir_opt_peephole_select, 64, false, true); + NIR_PASS(progress, nir, nir_opt_algebraic); + NIR_PASS(progress, nir, nir_opt_constant_folding); + + if (lower_flrp != 0) { + bool lower_flrp_progress = false; + NIR_PASS(lower_flrp_progress, + nir, + nir_lower_flrp, + lower_flrp, + false /* always_precise */, + nir->options->lower_ffma); + if (lower_flrp_progress) { + NIR_PASS(progress, nir, + nir_opt_constant_folding); + progress = true; + } + + /* Nothing should rematerialize any flrps, so we only + * need to do this lowering once. 
+ */ + lower_flrp = 0; + } + + NIR_PASS(progress, nir, nir_opt_undef); + NIR_PASS(progress, nir, nir_undef_to_zero); + + NIR_PASS(progress, nir, nir_opt_loop_unroll, + nir_var_shader_in | + nir_var_shader_out | + nir_var_function_temp); + + NIR_PASS(progress, nir, nir_opt_vectorize); + } while (progress); + + /* Must be run at the end to prevent creation of fsin/fcos ops */ + NIR_PASS(progress, nir, midgard_nir_scale_trig); + + do { + progress = false; + + NIR_PASS(progress, nir, nir_opt_dce); + NIR_PASS(progress, nir, nir_opt_algebraic); + NIR_PASS(progress, nir, nir_opt_constant_folding); + NIR_PASS(progress, nir, nir_copy_prop); + } while (progress); + + NIR_PASS(progress, nir, nir_opt_algebraic_late); + + /* We implement booleans as 32-bit 0/~0 */ + NIR_PASS(progress, nir, nir_lower_bool_to_int32); + + /* Now that booleans are lowered, we can run out late opts */ + NIR_PASS(progress, nir, midgard_nir_lower_algebraic_late); + + /* Lower mods for float ops only. Integer ops don't support modifiers + * (saturate doesn't make sense on integers, neg/abs require dedicated + * instructions) */ + + NIR_PASS(progress, nir, nir_lower_to_source_mods, nir_lower_float_source_mods); + NIR_PASS(progress, nir, nir_copy_prop); + NIR_PASS(progress, nir, nir_opt_dce); + + /* Take us out of SSA */ + NIR_PASS(progress, nir, nir_lower_locals_to_regs); + NIR_PASS(progress, nir, nir_convert_from_ssa, true); + + /* We are a vector architecture; write combine where possible */ + NIR_PASS(progress, nir, nir_move_vec_src_uses_to_dest); + NIR_PASS(progress, nir, nir_lower_vec_to_movs); + + NIR_PASS(progress, nir, nir_opt_dce); +} + +/* Front-half of aliasing the SSA slots, merely by inserting the flag in the + * appropriate hash table. Intentional off-by-one to avoid confusing NULL with + * r0. See the comments in compiler_context */ + +static void +alias_ssa(compiler_context *ctx, int dest, int src) +{ + _mesa_hash_table_u64_insert(ctx->ssa_to_alias, dest + 1, (void *) ((uintptr_t) src + 1)); + _mesa_set_add(ctx->leftover_ssa_to_alias, (void *) (uintptr_t) (dest + 1)); +} + +/* ...or undo it, after which the original index will be used (dummy move should be emitted alongside this) */ + +static void +unalias_ssa(compiler_context *ctx, int dest) +{ + _mesa_hash_table_u64_remove(ctx->ssa_to_alias, dest + 1); + /* TODO: Remove from leftover or no? 
*/ +} + +/* Do not actually emit a load; instead, cache the constant for inlining */ + +static void +emit_load_const(compiler_context *ctx, nir_load_const_instr *instr) +{ + nir_ssa_def def = instr->def; + + float *v = rzalloc_array(NULL, float, 4); + nir_const_load_to_arr(v, instr, f32); + _mesa_hash_table_u64_insert(ctx->ssa_constants, def.index + 1, v); +} + +static unsigned +nir_src_index(compiler_context *ctx, nir_src *src) +{ + if (src->is_ssa) + return src->ssa->index; + else { + assert(!src->reg.indirect); + return ctx->func->impl->ssa_alloc + src->reg.reg->index; + } +} + +static unsigned +nir_alu_src_index(compiler_context *ctx, nir_alu_src *src) +{ + return nir_src_index(ctx, &src->src); +} + +static bool +nir_is_non_scalar_swizzle(nir_alu_src *src, unsigned nr_components) +{ + unsigned comp = src->swizzle[0]; + + for (unsigned c = 1; c < nr_components; ++c) { + if (src->swizzle[c] != comp) + return true; + } + + return false; +} + +/* Midgard puts scalar conditionals in r31.w; move an arbitrary source (the + * output of a conditional test) into that register */ + +static void +emit_condition(compiler_context *ctx, nir_src *src, bool for_branch, unsigned component) +{ + int condition = nir_src_index(ctx, src); + + /* Source to swizzle the desired component into w */ + + const midgard_vector_alu_src alu_src = { + .swizzle = SWIZZLE(component, component, component, component), + }; + + /* There is no boolean move instruction. Instead, we simulate a move by + * ANDing the condition with itself to get it into r31.w */ + + midgard_instruction ins = { + .type = TAG_ALU_4, + + /* We need to set the conditional as close as possible */ + .precede_break = true, + .unit = for_branch ? UNIT_SMUL : UNIT_SADD, + .mask = 1 << COMPONENT_W, + + .ssa_args = { + .src0 = condition, + .src1 = condition, + .dest = SSA_FIXED_REGISTER(31), + }, + + .alu = { + .op = midgard_alu_op_iand, + .outmod = midgard_outmod_int_wrap, + .reg_mode = midgard_reg_mode_32, + .dest_override = midgard_dest_override_none, + .src1 = vector_alu_srco_unsigned(alu_src), + .src2 = vector_alu_srco_unsigned(alu_src) + }, + }; + + emit_mir_instruction(ctx, ins); +} + +/* Or, for mixed conditions (with csel_v), here's a vector version using all of + * r31 instead */ + +static void +emit_condition_mixed(compiler_context *ctx, nir_alu_src *src, unsigned nr_comp) +{ + int condition = nir_src_index(ctx, &src->src); + + /* Source to swizzle the desired component into w */ + + const midgard_vector_alu_src alu_src = { + .swizzle = SWIZZLE_FROM_ARRAY(src->swizzle), + }; + + /* There is no boolean move instruction. Instead, we simulate a move by + * ANDing the condition with itself to get it into r31.w */ + + midgard_instruction ins = { + .type = TAG_ALU_4, + .precede_break = true, + .mask = mask_of(nr_comp), + .ssa_args = { + .src0 = condition, + .src1 = condition, + .dest = SSA_FIXED_REGISTER(31), + }, + .alu = { + .op = midgard_alu_op_iand, + .outmod = midgard_outmod_int_wrap, + .reg_mode = midgard_reg_mode_32, + .dest_override = midgard_dest_override_none, + .src1 = vector_alu_srco_unsigned(alu_src), + .src2 = vector_alu_srco_unsigned(alu_src) + }, + }; + + emit_mir_instruction(ctx, ins); +} + + + +/* Likewise, indirect offsets are put in r27.w. 
TODO: Allow componentwise + * pinning to eliminate this move in all known cases */ + +static void +emit_indirect_offset(compiler_context *ctx, nir_src *src) +{ + int offset = nir_src_index(ctx, src); + + midgard_instruction ins = { + .type = TAG_ALU_4, + .mask = 1 << COMPONENT_W, + .ssa_args = { + .src0 = SSA_UNUSED_1, + .src1 = offset, + .dest = SSA_FIXED_REGISTER(REGISTER_OFFSET), + }, + .alu = { + .op = midgard_alu_op_imov, + .outmod = midgard_outmod_int_wrap, + .reg_mode = midgard_reg_mode_32, + .dest_override = midgard_dest_override_none, + .src1 = vector_alu_srco_unsigned(zero_alu_src), + .src2 = vector_alu_srco_unsigned(blank_alu_src_xxxx) + }, + }; + + emit_mir_instruction(ctx, ins); +} + +#define ALU_CASE(nir, _op) \ + case nir_op_##nir: \ + op = midgard_alu_op_##_op; \ + assert(src_bitsize == dst_bitsize); \ + break; + +#define ALU_CASE_BCAST(nir, _op, count) \ + case nir_op_##nir: \ + op = midgard_alu_op_##_op; \ + broadcast_swizzle = count; \ + assert(src_bitsize == dst_bitsize); \ + break; +static bool +nir_is_fzero_constant(nir_src src) +{ + if (!nir_src_is_const(src)) + return false; + + for (unsigned c = 0; c < nir_src_num_components(src); ++c) { + if (nir_src_comp_as_float(src, c) != 0.0) + return false; + } + + return true; +} + +/* Analyze the sizes of the inputs to determine which reg mode. Ops needed + * special treatment override this anyway. */ + +static midgard_reg_mode +reg_mode_for_nir(nir_alu_instr *instr) +{ + unsigned src_bitsize = nir_src_bit_size(instr->src[0].src); + + switch (src_bitsize) { + case 8: + return midgard_reg_mode_8; + case 16: + return midgard_reg_mode_16; + case 32: + return midgard_reg_mode_32; + case 64: + return midgard_reg_mode_64; + default: + unreachable("Invalid bit size"); + } +} + +static void +emit_alu(compiler_context *ctx, nir_alu_instr *instr) +{ + bool is_ssa = instr->dest.dest.is_ssa; + + unsigned dest = nir_dest_index(ctx, &instr->dest.dest); + unsigned nr_components = nir_dest_num_components(instr->dest.dest); + unsigned nr_inputs = nir_op_infos[instr->op].num_inputs; + + /* Most Midgard ALU ops have a 1:1 correspondance to NIR ops; these are + * supported. A few do not and are commented for now. Also, there are a + * number of NIR ops which Midgard does not support and need to be + * lowered, also TODO. This switch block emits the opcode and calling + * convention of the Midgard instruction; actual packing is done in + * emit_alu below */ + + unsigned op; + + /* Number of components valid to check for the instruction (the rest + * will be forced to the last), or 0 to use as-is. Relevant as + * ball-type instructions have a channel count in NIR but are all vec4 + * in Midgard */ + + unsigned broadcast_swizzle = 0; + + /* What register mode should we operate in? */ + midgard_reg_mode reg_mode = + reg_mode_for_nir(instr); + + /* Do we need a destination override? Used for inline + * type conversion */ + + midgard_dest_override dest_override = + midgard_dest_override_none; + + /* Should we use a smaller respective source and sign-extend? 
*/ + + bool half_1 = false, sext_1 = false; + bool half_2 = false, sext_2 = false; + + unsigned src_bitsize = nir_src_bit_size(instr->src[0].src); + unsigned dst_bitsize = nir_dest_bit_size(instr->dest.dest); + + switch (instr->op) { + ALU_CASE(fadd, fadd); + ALU_CASE(fmul, fmul); + ALU_CASE(fmin, fmin); + ALU_CASE(fmax, fmax); + ALU_CASE(imin, imin); + ALU_CASE(imax, imax); + ALU_CASE(umin, umin); + ALU_CASE(umax, umax); + ALU_CASE(ffloor, ffloor); + ALU_CASE(fround_even, froundeven); + ALU_CASE(ftrunc, ftrunc); + ALU_CASE(fceil, fceil); + ALU_CASE(fdot3, fdot3); + ALU_CASE(fdot4, fdot4); + ALU_CASE(iadd, iadd); + ALU_CASE(isub, isub); + ALU_CASE(imul, imul); + + /* Zero shoved as second-arg */ + ALU_CASE(iabs, iabsdiff); + + ALU_CASE(mov, imov); + + ALU_CASE(feq32, feq); + ALU_CASE(fne32, fne); + ALU_CASE(flt32, flt); + ALU_CASE(ieq32, ieq); + ALU_CASE(ine32, ine); + ALU_CASE(ilt32, ilt); + ALU_CASE(ult32, ult); + + /* We don't have a native b2f32 instruction. Instead, like many + * GPUs, we exploit booleans as 0/~0 for false/true, and + * correspondingly AND + * by 1.0 to do the type conversion. For the moment, prime us + * to emit: + * + * iand [whatever], #0 + * + * At the end of emit_alu (as MIR), we'll fix-up the constant + */ + + ALU_CASE(b2f32, iand); + ALU_CASE(b2i32, iand); + + /* Likewise, we don't have a dedicated f2b32 instruction, but + * we can do a "not equal to 0.0" test. */ + + ALU_CASE(f2b32, fne); + ALU_CASE(i2b32, ine); + + ALU_CASE(frcp, frcp); + ALU_CASE(frsq, frsqrt); + ALU_CASE(fsqrt, fsqrt); + ALU_CASE(fexp2, fexp2); + ALU_CASE(flog2, flog2); + + ALU_CASE(f2i32, f2i_rtz); + ALU_CASE(f2u32, f2u_rtz); + ALU_CASE(i2f32, i2f_rtz); + ALU_CASE(u2f32, u2f_rtz); + + ALU_CASE(f2i16, f2i_rtz); + ALU_CASE(f2u16, f2u_rtz); + ALU_CASE(i2f16, i2f_rtz); + ALU_CASE(u2f16, u2f_rtz); + + ALU_CASE(fsin, fsin); + ALU_CASE(fcos, fcos); + + /* Second op implicit #0 */ + ALU_CASE(inot, inor); + ALU_CASE(iand, iand); + ALU_CASE(ior, ior); + ALU_CASE(ixor, ixor); + ALU_CASE(ishl, ishl); + ALU_CASE(ishr, iasr); + ALU_CASE(ushr, ilsr); + + ALU_CASE_BCAST(b32all_fequal2, fball_eq, 2); + ALU_CASE_BCAST(b32all_fequal3, fball_eq, 3); + ALU_CASE(b32all_fequal4, fball_eq); + + ALU_CASE_BCAST(b32any_fnequal2, fbany_neq, 2); + ALU_CASE_BCAST(b32any_fnequal3, fbany_neq, 3); + ALU_CASE(b32any_fnequal4, fbany_neq); + + ALU_CASE_BCAST(b32all_iequal2, iball_eq, 2); + ALU_CASE_BCAST(b32all_iequal3, iball_eq, 3); + ALU_CASE(b32all_iequal4, iball_eq); + + ALU_CASE_BCAST(b32any_inequal2, ibany_neq, 2); + ALU_CASE_BCAST(b32any_inequal3, ibany_neq, 3); + ALU_CASE(b32any_inequal4, ibany_neq); + + /* Source mods will be shoved in later */ + ALU_CASE(fabs, fmov); + ALU_CASE(fneg, fmov); + ALU_CASE(fsat, fmov); + + /* For size conversion, we use a move. Ideally though we would squash + * these ops together; maybe that has to happen after in NIR as part of + * propagation...? An earlier algebraic pass ensured we step down by + * only / exactly one size. 
If stepping down, we use a dest override to + * reduce the size; if stepping up, we use a larger-sized move with a + * half source and a sign/zero-extension modifier */ + + case nir_op_i2i8: + case nir_op_i2i16: + case nir_op_i2i32: + /* If we end up upscale, we'll need a sign-extend on the + * operand (the second argument) */ + + sext_2 = true; + case nir_op_u2u8: + case nir_op_u2u16: + case nir_op_u2u32: { + op = midgard_alu_op_imov; + + if (dst_bitsize == (src_bitsize * 2)) { + /* Converting up */ + half_2 = true; + + /* Use a greater register mode */ + reg_mode++; + } else if (src_bitsize == (dst_bitsize * 2)) { + /* Converting down */ + dest_override = midgard_dest_override_lower; + } + + break; + } + + case nir_op_f2f16: { + assert(src_bitsize == 32); + + op = midgard_alu_op_fmov; + dest_override = midgard_dest_override_lower; + break; + } + + case nir_op_f2f32: { + assert(src_bitsize == 16); + + op = midgard_alu_op_fmov; + half_2 = true; + reg_mode++; + break; + } + + + /* For greater-or-equal, we lower to less-or-equal and flip the + * arguments */ + + case nir_op_fge: + case nir_op_fge32: + case nir_op_ige32: + case nir_op_uge32: { + op = + instr->op == nir_op_fge ? midgard_alu_op_fle : + instr->op == nir_op_fge32 ? midgard_alu_op_fle : + instr->op == nir_op_ige32 ? midgard_alu_op_ile : + instr->op == nir_op_uge32 ? midgard_alu_op_ule : + 0; + + /* Swap via temporary */ + nir_alu_src temp = instr->src[1]; + instr->src[1] = instr->src[0]; + instr->src[0] = temp; + + break; + } + + case nir_op_b32csel: { + /* Midgard features both fcsel and icsel, depending on + * the type of the arguments/output. However, as long + * as we're careful we can _always_ use icsel and + * _never_ need fcsel, since the latter does additional + * floating-point-specific processing whereas the + * former just moves bits on the wire. It's not obvious + * why these are separate opcodes, save for the ability + * to do things like sat/pos/abs/neg for free */ + + bool mixed = nir_is_non_scalar_swizzle(&instr->src[0], nr_components); + op = mixed ? midgard_alu_op_icsel_v : midgard_alu_op_icsel; + + /* csel works as a two-arg in Midgard, since the condition is hardcoded in r31.w */ + nr_inputs = 2; + + /* Emit the condition into r31 */ + + if (mixed) + emit_condition_mixed(ctx, &instr->src[0], nr_components); + else + emit_condition(ctx, &instr->src[0].src, false, instr->src[0].swizzle[0]); + + /* The condition is the first argument; move the other + * arguments up one to be a binary instruction for + * Midgard */ + + memmove(instr->src, instr->src + 1, 2 * sizeof(nir_alu_src)); + break; + } + + default: + DBG("Unhandled ALU op %s\n", nir_op_infos[instr->op].name); + assert(0); + return; + } + + /* Midgard can perform certain modifiers on output of an ALU op */ + unsigned outmod; + + if (midgard_is_integer_out_op(op)) { + outmod = midgard_outmod_int_wrap; + } else { + bool sat = instr->dest.saturate || instr->op == nir_op_fsat; + outmod = sat ? 
midgard_outmod_sat : midgard_outmod_none; + } + + /* fmax(a, 0.0) can turn into a .pos modifier as an optimization */ + + if (instr->op == nir_op_fmax) { + if (nir_is_fzero_constant(instr->src[0].src)) { + op = midgard_alu_op_fmov; + nr_inputs = 1; + outmod = midgard_outmod_pos; + instr->src[0] = instr->src[1]; + } else if (nir_is_fzero_constant(instr->src[1].src)) { + op = midgard_alu_op_fmov; + nr_inputs = 1; + outmod = midgard_outmod_pos; + } + } + + /* Fetch unit, quirks, etc information */ + unsigned opcode_props = alu_opcode_props[op].props; + bool quirk_flipped_r24 = opcode_props & QUIRK_FLIPPED_R24; + + /* src0 will always exist afaik, but src1 will not for 1-argument + * instructions. The latter can only be fetched if the instruction + * needs it, or else we may segfault. */ + + unsigned src0 = nir_alu_src_index(ctx, &instr->src[0]); + unsigned src1 = nr_inputs == 2 ? nir_alu_src_index(ctx, &instr->src[1]) : SSA_UNUSED_0; + + /* Rather than use the instruction generation helpers, we do it + * ourselves here to avoid the mess */ + + midgard_instruction ins = { + .type = TAG_ALU_4, + .ssa_args = { + .src0 = quirk_flipped_r24 ? SSA_UNUSED_1 : src0, + .src1 = quirk_flipped_r24 ? src0 : src1, + .dest = dest, + } + }; + + nir_alu_src *nirmods[2] = { NULL }; + + if (nr_inputs == 2) { + nirmods[0] = &instr->src[0]; + nirmods[1] = &instr->src[1]; + } else if (nr_inputs == 1) { + nirmods[quirk_flipped_r24] = &instr->src[0]; + } else { + assert(0); + } + + /* These were lowered to a move, so apply the corresponding mod */ + + if (instr->op == nir_op_fneg || instr->op == nir_op_fabs) { + nir_alu_src *s = nirmods[quirk_flipped_r24]; + + if (instr->op == nir_op_fneg) + s->negate = !s->negate; + + if (instr->op == nir_op_fabs) + s->abs = !s->abs; + } + + bool is_int = midgard_is_integer_op(op); + + ins.mask = mask_of(nr_components); + + midgard_vector_alu alu = { + .op = op, + .reg_mode = reg_mode, + .dest_override = dest_override, + .outmod = outmod, + + .src1 = vector_alu_srco_unsigned(vector_alu_modifiers(nirmods[0], is_int, broadcast_swizzle, half_1, sext_1)), + .src2 = vector_alu_srco_unsigned(vector_alu_modifiers(nirmods[1], is_int, broadcast_swizzle, half_2, sext_2)), + }; + + /* Apply writemask if non-SSA, keeping in mind that we can't write to components that don't exist */ + + if (!is_ssa) + ins.mask &= instr->dest.write_mask; + + ins.alu = alu; + + /* Late fixup for emulated instructions */ + + if (instr->op == nir_op_b2f32 || instr->op == nir_op_b2i32) { + /* Presently, our second argument is an inline #0 constant. 
+ * Switch over to an embedded 1.0 constant (that can't fit + * inline, since we're 32-bit, not 16-bit like the inline + * constants) */ + + ins.ssa_args.inline_constant = false; + ins.ssa_args.src1 = SSA_FIXED_REGISTER(REGISTER_CONSTANT); + ins.has_constants = true; + + if (instr->op == nir_op_b2f32) { + ins.constants[0] = 1.0f; + } else { + /* Type pun it into place */ + uint32_t one = 0x1; + memcpy(&ins.constants[0], &one, sizeof(uint32_t)); + } + + ins.alu.src2 = vector_alu_srco_unsigned(blank_alu_src_xxxx); + } else if (nr_inputs == 1 && !quirk_flipped_r24) { + /* Lots of instructions need a 0 plonked in */ + ins.ssa_args.inline_constant = false; + ins.ssa_args.src1 = SSA_FIXED_REGISTER(REGISTER_CONSTANT); + ins.has_constants = true; + ins.constants[0] = 0.0f; + ins.alu.src2 = vector_alu_srco_unsigned(blank_alu_src_xxxx); + } else if (instr->op == nir_op_inot) { + /* ~b = ~(b & b), so duplicate the source */ + ins.ssa_args.src1 = ins.ssa_args.src0; + ins.alu.src2 = ins.alu.src1; + } + + if ((opcode_props & UNITS_ALL) == UNIT_VLUT) { + /* To avoid duplicating the lookup tables (probably), true LUT + * instructions can only operate as if they were scalars. Lower + * them here by changing the component. */ + + uint8_t original_swizzle[4]; + memcpy(original_swizzle, nirmods[0]->swizzle, sizeof(nirmods[0]->swizzle)); + unsigned orig_mask = ins.mask; + + for (int i = 0; i < nr_components; ++i) { + /* Mask the associated component, dropping the + * instruction if needed */ + + ins.mask = 1 << i; + ins.mask &= orig_mask; + + if (!ins.mask) + continue; + + for (int j = 0; j < 4; ++j) + nirmods[0]->swizzle[j] = original_swizzle[i]; /* Pull from the correct component */ + + ins.alu.src1 = vector_alu_srco_unsigned(vector_alu_modifiers(nirmods[0], is_int, broadcast_swizzle, half_1, false)); + emit_mir_instruction(ctx, ins); + } + } else { + emit_mir_instruction(ctx, ins); + } +} + +#undef ALU_CASE + +/* Uniforms and UBOs use a shared code path, as uniforms are just (slightly + * optimized) versions of UBO #0 */ + +static void +emit_ubo_read( + compiler_context *ctx, + unsigned dest, + unsigned offset, + nir_src *indirect_offset, + unsigned index) +{ + /* TODO: half-floats */ + + if (!indirect_offset && offset < ctx->uniform_cutoff && index == 0) { + /* Fast path: For the first 16 uniforms, direct accesses are + * 0-cycle, since they're just a register fetch in the usual + * case. So, we alias the registers while we're still in + * SSA-space */ + + int reg_slot = 23 - offset; + alias_ssa(ctx, dest, SSA_FIXED_REGISTER(reg_slot)); + } else { + /* Otherwise, read from the 'special' UBO to access + * higher-indexed uniforms, at a performance cost. More + * generally, we're emitting a UBO read instruction. */ + + midgard_instruction ins = m_ld_uniform_32(dest, offset); + + /* TODO: Don't split */ + ins.load_store.varying_parameters = (offset & 7) << 7; + ins.load_store.address = offset >> 3; + + if (indirect_offset) { + emit_indirect_offset(ctx, indirect_offset); + ins.load_store.unknown = 0x8700 | index; /* xxx: what is this? */ + } else { + ins.load_store.unknown = 0x1E00 | index; /* xxx: what is this? */ + } + + /* TODO respect index */ + + emit_mir_instruction(ctx, ins); + } +} + +static void +emit_varying_read( + compiler_context *ctx, + unsigned dest, unsigned offset, + unsigned nr_comp, unsigned component, + nir_src *indirect_offset, nir_alu_type type) +{ + /* XXX: Half-floats? 
*/ + /* TODO: swizzle, mask */ + + midgard_instruction ins = m_ld_vary_32(dest, offset); + ins.mask = mask_of(nr_comp); + ins.load_store.swizzle = SWIZZLE_XYZW >> (2 * component); + + midgard_varying_parameter p = { + .is_varying = 1, + .interpolation = midgard_interp_default, + .flat = /*var->data.interpolation == INTERP_MODE_FLAT*/ 0 + }; + + unsigned u; + memcpy(&u, &p, sizeof(p)); + ins.load_store.varying_parameters = u; + + if (indirect_offset) { + /* We need to add in the dynamic index, moved to r27.w */ + emit_indirect_offset(ctx, indirect_offset); + ins.load_store.unknown = 0x79e; /* xxx: what is this? */ + } else { + /* Just a direct load */ + ins.load_store.unknown = 0x1e9e; /* xxx: what is this? */ + } + + /* Use the type appropriate load */ + switch (type) { + case nir_type_uint: + case nir_type_bool: + ins.load_store.op = midgard_op_ld_vary_32u; + break; + case nir_type_int: + ins.load_store.op = midgard_op_ld_vary_32i; + break; + case nir_type_float: + ins.load_store.op = midgard_op_ld_vary_32; + break; + default: + unreachable("Attempted to load unknown type"); + break; + } + + emit_mir_instruction(ctx, ins); +} + +static void +emit_sysval_read(compiler_context *ctx, nir_instr *instr) +{ + unsigned dest; + /* Figure out which uniform this is */ + int sysval = sysval_for_instr(ctx, instr, &dest); + void *val = _mesa_hash_table_u64_search(ctx->sysval_to_id, sysval); + + /* Sysvals are prefix uniforms */ + unsigned uniform = ((uintptr_t) val) - 1; + + /* Emit the read itself -- this is never indirect */ + emit_ubo_read(ctx, dest, uniform, NULL, 0); +} + +static void +emit_intrinsic(compiler_context *ctx, nir_intrinsic_instr *instr) +{ + unsigned offset = 0, reg; + + switch (instr->intrinsic) { + case nir_intrinsic_discard_if: + emit_condition(ctx, &instr->src[0], true, COMPONENT_X); + + /* fallthrough */ + + case nir_intrinsic_discard: { + bool conditional = instr->intrinsic == nir_intrinsic_discard_if; + struct midgard_instruction discard = v_branch(conditional, false); + discard.branch.target_type = TARGET_DISCARD; + emit_mir_instruction(ctx, discard); + + ctx->can_discard = true; + break; + } + + case nir_intrinsic_load_uniform: + case nir_intrinsic_load_ubo: + case nir_intrinsic_load_input: { + bool is_uniform = instr->intrinsic == nir_intrinsic_load_uniform; + bool is_ubo = instr->intrinsic == nir_intrinsic_load_ubo; + + /* Get the base type of the intrinsic */ + /* TODO: Infer type? Does it matter? */ + nir_alu_type t = + is_ubo ? nir_type_uint : nir_intrinsic_type(instr); + t = nir_alu_type_get_base_type(t); + + if (!is_ubo) { + offset = nir_intrinsic_base(instr); + } + + unsigned nr_comp = nir_intrinsic_dest_components(instr); + + nir_src *src_offset = nir_get_io_offset_src(instr); + + bool direct = nir_src_is_const(*src_offset); + + if (direct) + offset += nir_src_as_uint(*src_offset); + + /* We may need to apply a fractional offset */ + int component = instr->intrinsic == nir_intrinsic_load_input ? + nir_intrinsic_component(instr) : 0; + reg = nir_dest_index(ctx, &instr->dest); + + if (is_uniform && !ctx->is_blend) { + emit_ubo_read(ctx, reg, ctx->sysval_count + offset, !direct ? &instr->src[0] : NULL, 0); + } else if (is_ubo) { + nir_src index = instr->src[0]; + + /* We don't yet support indirect UBOs. For indirect + * block numbers (if that's possible), we don't know + * enough about the hardware yet. 
For indirect sources, + * we know what we need but we need to add some NIR + * support for lowering correctly with respect to + * 128-bit reads */ + + assert(nir_src_is_const(index)); + assert(nir_src_is_const(*src_offset)); + + /* TODO: Alignment */ + assert((offset & 0xF) == 0); + + uint32_t uindex = nir_src_as_uint(index) + 1; + emit_ubo_read(ctx, reg, offset / 16, NULL, uindex); + } else if (ctx->stage == MESA_SHADER_FRAGMENT && !ctx->is_blend) { + emit_varying_read(ctx, reg, offset, nr_comp, component, !direct ? &instr->src[0] : NULL, t); + } else if (ctx->is_blend) { + /* For blend shaders, load the input color, which is + * preloaded to r0 */ + + midgard_instruction move = v_mov(reg, blank_alu_src, SSA_FIXED_REGISTER(0)); + emit_mir_instruction(ctx, move); + } else if (ctx->stage == MESA_SHADER_VERTEX) { + midgard_instruction ins = m_ld_attr_32(reg, offset); + ins.load_store.unknown = 0x1E1E; /* XXX: What is this? */ + ins.mask = mask_of(nr_comp); + + /* Use the type appropriate load */ + switch (t) { + case nir_type_uint: + case nir_type_bool: + ins.load_store.op = midgard_op_ld_attr_32u; + break; + case nir_type_int: + ins.load_store.op = midgard_op_ld_attr_32i; + break; + case nir_type_float: + ins.load_store.op = midgard_op_ld_attr_32; + break; + default: + unreachable("Attempted to load unknown type"); + break; + } + + emit_mir_instruction(ctx, ins); + } else { + DBG("Unknown load\n"); + assert(0); + } + + break; + } + + /* Reads 128-bit value raw off the tilebuffer during blending, tasty */ + + case nir_intrinsic_load_raw_output_pan: + reg = nir_dest_index(ctx, &instr->dest); + assert(ctx->is_blend); + + midgard_instruction ins = m_ld_color_buffer_8(reg, 0); + emit_mir_instruction(ctx, ins); + break; + + case nir_intrinsic_load_blend_const_color_rgba: { + assert(ctx->is_blend); + reg = nir_dest_index(ctx, &instr->dest); + + /* Blend constants are embedded directly in the shader and + * patched in, so we use some magic routing */ + + midgard_instruction ins = v_mov(SSA_FIXED_REGISTER(REGISTER_CONSTANT), blank_alu_src, reg); + ins.has_constants = true; + ins.has_blend_constant = true; + emit_mir_instruction(ctx, ins); + break; + } + + case nir_intrinsic_store_output: + assert(nir_src_is_const(instr->src[1]) && "no indirect outputs"); + + offset = nir_intrinsic_base(instr) + nir_src_as_uint(instr->src[1]); + + reg = nir_src_index(ctx, &instr->src[0]); + + if (ctx->stage == MESA_SHADER_FRAGMENT) { + /* gl_FragColor is not emitted with load/store + * instructions. Instead, it gets plonked into + * r0 at the end of the shader and we do the + * framebuffer writeout dance. TODO: Defer + * writes */ + + midgard_instruction move = v_mov(reg, blank_alu_src, SSA_FIXED_REGISTER(0)); + emit_mir_instruction(ctx, move); + + /* Save the index we're writing to for later reference + * in the epilogue */ + + ctx->fragment_output = reg; + } else if (ctx->stage == MESA_SHADER_VERTEX) { + /* Varyings are written into one of two special + * varying register, r26 or r27. The register itself is + * selected as the register in the st_vary instruction, + * minus the base of 26. E.g. write into r27 and then + * call st_vary(1) */ + + midgard_instruction ins = v_mov(reg, blank_alu_src, SSA_FIXED_REGISTER(26)); + emit_mir_instruction(ctx, ins); + + /* We should have been vectorized, though we don't + * currently check that st_vary is emitted only once + * per slot (this is relevant, since there's not a mask + * parameter available on the store [set to 0 by the + * blob]). 
We do respect the component by adjusting the + * swizzle. */ + + unsigned component = nir_intrinsic_component(instr); + + midgard_instruction st = m_st_vary_32(SSA_FIXED_REGISTER(0), offset); + st.load_store.unknown = 0x1E9E; /* XXX: What is this? */ + st.load_store.swizzle = SWIZZLE_XYZW << (2*component); + emit_mir_instruction(ctx, st); + } else { + DBG("Unknown store\n"); + assert(0); + } + + break; + + /* Special case of store_output for lowered blend shaders */ + case nir_intrinsic_store_raw_output_pan: + assert (ctx->stage == MESA_SHADER_FRAGMENT); + reg = nir_src_index(ctx, &instr->src[0]); + + midgard_instruction move = v_mov(reg, blank_alu_src, SSA_FIXED_REGISTER(0)); + emit_mir_instruction(ctx, move); + ctx->fragment_output = reg; + + break; + + case nir_intrinsic_load_alpha_ref_float: + assert(instr->dest.is_ssa); + + float ref_value = ctx->alpha_ref; + + float *v = ralloc_array(NULL, float, 4); + memcpy(v, &ref_value, sizeof(float)); + _mesa_hash_table_u64_insert(ctx->ssa_constants, instr->dest.ssa.index + 1, v); + break; + + case nir_intrinsic_load_viewport_scale: + case nir_intrinsic_load_viewport_offset: + emit_sysval_read(ctx, &instr->instr); + break; + + default: + printf ("Unhandled intrinsic\n"); + assert(0); + break; + } +} + +static unsigned +midgard_tex_format(enum glsl_sampler_dim dim) +{ + switch (dim) { + case GLSL_SAMPLER_DIM_1D: + case GLSL_SAMPLER_DIM_BUF: + return MALI_TEX_1D; + + case GLSL_SAMPLER_DIM_2D: + case GLSL_SAMPLER_DIM_EXTERNAL: + return MALI_TEX_2D; + + case GLSL_SAMPLER_DIM_3D: + return MALI_TEX_3D; + + case GLSL_SAMPLER_DIM_CUBE: + return MALI_TEX_CUBE; + + default: + DBG("Unknown sampler dim type\n"); + assert(0); + return 0; + } +} + +/* Tries to attach an explicit LOD / bias as a constant. Returns whether this + * was successful */ + +static bool +pan_attach_constant_bias( + compiler_context *ctx, + nir_src lod, + midgard_texture_word *word) +{ + /* To attach as constant, it has to *be* constant */ + + if (!nir_src_is_const(lod)) + return false; + + float f = nir_src_as_float(lod); + + /* Break into fixed-point */ + signed lod_int = f; + float lod_frac = f - lod_int; + + /* Carry over negative fractions */ + if (lod_frac < 0.0) { + lod_int--; + lod_frac += 1.0; + } + + /* Encode */ + word->bias = float_to_ubyte(lod_frac); + word->bias_int = lod_int; + + return true; +} + +static enum mali_sampler_type +midgard_sampler_type(nir_alu_type t) { + switch (nir_alu_type_get_base_type(t)) + { + case nir_type_float: + return MALI_SAMPLER_FLOAT; + case nir_type_int: + return MALI_SAMPLER_SIGNED; + case nir_type_uint: + return MALI_SAMPLER_UNSIGNED; + default: + unreachable("Unknown sampler type"); + } +} + +static void +emit_texop_native(compiler_context *ctx, nir_tex_instr *instr, + unsigned midgard_texop) +{ + /* TODO */ + //assert (!instr->sampler); + //assert (!instr->texture_array_size); + + /* Allocate registers via a round robin scheme to alternate between the two registers */ + int reg = ctx->texture_op_count & 1; + int in_reg = reg, out_reg = reg; + + /* Make room for the reg */ + + if (ctx->texture_index[reg] > -1) + unalias_ssa(ctx, ctx->texture_index[reg]); + + int texture_index = instr->texture_index; + int sampler_index = texture_index; + + /* No helper to build texture words -- we do it all here */ + midgard_instruction ins = { + .type = TAG_TEXTURE_4, + .mask = 0xF, + .texture = { + .op = midgard_texop, + .format = midgard_tex_format(instr->sampler_dim), + .texture_handle = texture_index, + .sampler_handle = sampler_index, + + /* TODO: 
Regalloc it in */ + .swizzle = SWIZZLE_XYZW, + + /* TODO: half */ + .in_reg_full = 1, + .out_full = 1, + + .sampler_type = midgard_sampler_type(instr->dest_type), + } + }; + + for (unsigned i = 0; i < instr->num_srcs; ++i) { + int reg = SSA_FIXED_REGISTER(REGISTER_TEXTURE_BASE + in_reg); + int index = nir_src_index(ctx, &instr->src[i].src); + int nr_comp = nir_src_num_components(instr->src[i].src); + midgard_vector_alu_src alu_src = blank_alu_src; + + switch (instr->src[i].src_type) { + case nir_tex_src_coord: { + if (instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE) { + /* texelFetch is undefined on samplerCube */ + assert(midgard_texop != TEXTURE_OP_TEXEL_FETCH); + + /* For cubemaps, we need to load coords into + * special r27, and then use a special ld/st op + * to select the face and copy the xy into the + * texture register */ + + alu_src.swizzle = SWIZZLE(COMPONENT_X, COMPONENT_Y, COMPONENT_Z, COMPONENT_X); + + midgard_instruction move = v_mov(index, alu_src, SSA_FIXED_REGISTER(27)); + emit_mir_instruction(ctx, move); + + midgard_instruction st = m_st_cubemap_coords(reg, 0); + st.load_store.unknown = 0x24; /* XXX: What is this? */ + st.mask = 0x3; /* xy */ + st.load_store.swizzle = alu_src.swizzle; + emit_mir_instruction(ctx, st); + + ins.texture.in_reg_swizzle = swizzle_of(2); + } else { + ins.texture.in_reg_swizzle = alu_src.swizzle = swizzle_of(nr_comp); + + midgard_instruction mov = v_mov(index, alu_src, reg); + mov.mask = mask_of(nr_comp); + emit_mir_instruction(ctx, mov); + + if (midgard_texop == TEXTURE_OP_TEXEL_FETCH) { + /* Texel fetch opcodes care about the + * values of z and w, so we actually + * need to spill into a second register + * for a texel fetch with register bias + * (for non-2D). TODO: Implement that + */ + + assert(instr->sampler_dim == GLSL_SAMPLER_DIM_2D); + + midgard_instruction zero = v_mov(index, alu_src, reg); + zero.ssa_args.inline_constant = true; + zero.ssa_args.src1 = SSA_FIXED_REGISTER(REGISTER_CONSTANT); + zero.has_constants = true; + zero.mask = ~mov.mask; + emit_mir_instruction(ctx, zero); + + ins.texture.in_reg_swizzle = SWIZZLE_XYZZ; + } else { + /* Non-texel fetch doesn't need that + * nonsense. However we do use the Z + * for array indexing */ + bool is_3d = instr->sampler_dim == GLSL_SAMPLER_DIM_3D; + ins.texture.in_reg_swizzle = is_3d ? SWIZZLE_XYZZ : SWIZZLE_XYXZ; + } + } + + break; + } + + case nir_tex_src_bias: + case nir_tex_src_lod: { + /* Try as a constant if we can */ + + bool is_txf = midgard_texop == TEXTURE_OP_TEXEL_FETCH; + if (!is_txf && pan_attach_constant_bias(ctx, instr->src[i].src, &ins.texture)) + break; + + /* Otherwise we use a register. 
To keep RA simple, we + * put the bias/LOD into the w component of the input + * source, which is otherwise in xy */ + + alu_src.swizzle = SWIZZLE_XXXX; + + midgard_instruction mov = v_mov(index, alu_src, reg); + mov.mask = 1 << COMPONENT_W; + emit_mir_instruction(ctx, mov); + + ins.texture.lod_register = true; + + midgard_tex_register_select sel = { + .select = in_reg, + .full = 1, + + /* w */ + .component_lo = 1, + .component_hi = 1 + }; + + uint8_t packed; + memcpy(&packed, &sel, sizeof(packed)); + ins.texture.bias = packed; + + break; + }; + + default: + unreachable("Unknown texture source type\n"); + } + } + + /* Set registers to read and write from the same place */ + ins.texture.in_reg_select = in_reg; + ins.texture.out_reg_select = out_reg; + + emit_mir_instruction(ctx, ins); + + int o_reg = REGISTER_TEXTURE_BASE + out_reg, o_index = nir_dest_index(ctx, &instr->dest); + midgard_instruction ins2 = v_mov(SSA_FIXED_REGISTER(o_reg), blank_alu_src, o_index); + emit_mir_instruction(ctx, ins2); + + /* Used for .cont and .last hinting */ + ctx->texture_op_count++; +} + +static void +emit_tex(compiler_context *ctx, nir_tex_instr *instr) +{ + /* Fixup op, since only textureLod is permitted in VS but NIR can give + * generic tex in some cases (which confuses the hardware) */ + + bool is_vertex = ctx->stage == MESA_SHADER_VERTEX; + + if (is_vertex && instr->op == nir_texop_tex) + instr->op = nir_texop_txl; + + switch (instr->op) { + case nir_texop_tex: + case nir_texop_txb: + emit_texop_native(ctx, instr, TEXTURE_OP_NORMAL); + break; + case nir_texop_txl: + emit_texop_native(ctx, instr, TEXTURE_OP_LOD); + break; + case nir_texop_txf: + emit_texop_native(ctx, instr, TEXTURE_OP_TEXEL_FETCH); + break; + case nir_texop_txs: + emit_sysval_read(ctx, &instr->instr); + break; + default: + unreachable("Unhanlded texture op"); + } +} + +static void +emit_jump(compiler_context *ctx, nir_jump_instr *instr) +{ + switch (instr->type) { + case nir_jump_break: { + /* Emit a branch out of the loop */ + struct midgard_instruction br = v_branch(false, false); + br.branch.target_type = TARGET_BREAK; + br.branch.target_break = ctx->current_loop_depth; + emit_mir_instruction(ctx, br); + + DBG("break..\n"); + break; + } + + default: + DBG("Unknown jump type %d\n", instr->type); + break; + } +} + +static void +emit_instr(compiler_context *ctx, struct nir_instr *instr) +{ + switch (instr->type) { + case nir_instr_type_load_const: + emit_load_const(ctx, nir_instr_as_load_const(instr)); + break; + + case nir_instr_type_intrinsic: + emit_intrinsic(ctx, nir_instr_as_intrinsic(instr)); + break; + + case nir_instr_type_alu: + emit_alu(ctx, nir_instr_as_alu(instr)); + break; + + case nir_instr_type_tex: + emit_tex(ctx, nir_instr_as_tex(instr)); + break; + + case nir_instr_type_jump: + emit_jump(ctx, nir_instr_as_jump(instr)); + break; + + case nir_instr_type_ssa_undef: + /* Spurious */ + break; + + default: + DBG("Unhandled instruction type\n"); + break; + } +} + + +/* ALU instructions can inline or embed constants, which decreases register + * pressure and saves space. 
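+ *
+ * As a rough illustration (hypothetical MIR, not actual compiler output),
+ * a NIR sequence such as
+ *
+ *     ssa_2 = load_const (3.0)
+ *     ssa_3 = fadd ssa_1, ssa_2
+ *
+ * should end up as a single fadd whose second source reads the bundle's
+ * constant register (REGISTER_CONSTANT), rather than spending a separate
+ * move and a work register on the literal; embedded_to_inline_constant()
+ * below may then demote it further to a 16-bit inline constant.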
*/ + +#define CONDITIONAL_ATTACH(src) { \ + void *entry = _mesa_hash_table_u64_search(ctx->ssa_constants, alu->ssa_args.src + 1); \ +\ + if (entry) { \ + attach_constants(ctx, alu, entry, alu->ssa_args.src + 1); \ + alu->ssa_args.src = SSA_FIXED_REGISTER(REGISTER_CONSTANT); \ + } \ +} + +static void +inline_alu_constants(compiler_context *ctx) +{ + mir_foreach_instr(ctx, alu) { + /* Other instructions cannot inline constants */ + if (alu->type != TAG_ALU_4) continue; + + /* If there is already a constant here, we can do nothing */ + if (alu->has_constants) continue; + + /* It makes no sense to inline constants on a branch */ + if (alu->compact_branch || alu->prepacked_branch) continue; + + CONDITIONAL_ATTACH(src0); + + if (!alu->has_constants) { + CONDITIONAL_ATTACH(src1) + } else if (!alu->inline_constant) { + /* Corner case: _two_ vec4 constants, for instance with a + * csel. For this case, we can only use a constant + * register for one, we'll have to emit a move for the + * other. Note, if both arguments are constants, then + * necessarily neither argument depends on the value of + * any particular register. As the destination register + * will be wiped, that means we can spill the constant + * to the destination register. + */ + + void *entry = _mesa_hash_table_u64_search(ctx->ssa_constants, alu->ssa_args.src1 + 1); + unsigned scratch = alu->ssa_args.dest; + + if (entry) { + midgard_instruction ins = v_mov(SSA_FIXED_REGISTER(REGISTER_CONSTANT), blank_alu_src, scratch); + attach_constants(ctx, &ins, entry, alu->ssa_args.src1 + 1); + + /* Force a break XXX Defer r31 writes */ + ins.unit = UNIT_VLUT; + + /* Set the source */ + alu->ssa_args.src1 = scratch; + + /* Inject us -before- the last instruction which set r31 */ + mir_insert_instruction_before(mir_prev_op(alu), ins); + } + } + } +} + +/* Midgard supports two types of constants, embedded constants (128-bit) and + * inline constants (16-bit). Sometimes, especially with scalar ops, embedded + * constants can be demoted to inline constants, for space savings and + * sometimes a performance boost */ + +static void +embedded_to_inline_constant(compiler_context *ctx) +{ + mir_foreach_instr(ctx, ins) { + if (!ins->has_constants) continue; + + if (ins->ssa_args.inline_constant) continue; + + /* Blend constants must not be inlined by definition */ + if (ins->has_blend_constant) continue; + + /* We can inline 32-bit (sometimes) or 16-bit (usually) */ + bool is_16 = ins->alu.reg_mode == midgard_reg_mode_16; + bool is_32 = ins->alu.reg_mode == midgard_reg_mode_32; + + if (!(is_16 || is_32)) + continue; + + /* src1 cannot be an inline constant due to encoding + * restrictions. 
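+ * (For instance -- a hypothetical case -- an fadd with the constant in src0
+ * can simply have its operands swapped, since fadd commutes, whereas flt or
+ * fcsel would also need an opcode change, which we do not attempt here.)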
So, if possible we try to flip the arguments + * in that case */ + + int op = ins->alu.op; + + if (ins->ssa_args.src0 == SSA_FIXED_REGISTER(REGISTER_CONSTANT)) { + switch (op) { + /* These ops require an operational change to flip + * their arguments TODO */ + case midgard_alu_op_flt: + case midgard_alu_op_fle: + case midgard_alu_op_ilt: + case midgard_alu_op_ile: + case midgard_alu_op_fcsel: + case midgard_alu_op_icsel: + DBG("Missed non-commutative flip (%s)\n", alu_opcode_props[op].name); + default: + break; + } + + if (alu_opcode_props[op].props & OP_COMMUTES) { + /* Flip the SSA numbers */ + ins->ssa_args.src0 = ins->ssa_args.src1; + ins->ssa_args.src1 = SSA_FIXED_REGISTER(REGISTER_CONSTANT); + + /* And flip the modifiers */ + + unsigned src_temp; + + src_temp = ins->alu.src2; + ins->alu.src2 = ins->alu.src1; + ins->alu.src1 = src_temp; + } + } + + if (ins->ssa_args.src1 == SSA_FIXED_REGISTER(REGISTER_CONSTANT)) { + /* Extract the source information */ + + midgard_vector_alu_src *src; + int q = ins->alu.src2; + midgard_vector_alu_src *m = (midgard_vector_alu_src *) &q; + src = m; + + /* Component is from the swizzle, e.g. r26.w -> w component. TODO: What if x is masked out? */ + int component = src->swizzle & 3; + + /* Scale constant appropriately, if we can legally */ + uint16_t scaled_constant = 0; + + if (midgard_is_integer_op(op) || is_16) { + unsigned int *iconstants = (unsigned int *) ins->constants; + scaled_constant = (uint16_t) iconstants[component]; + + /* Constant overflow after resize */ + if (scaled_constant != iconstants[component]) + continue; + } else { + float original = (float) ins->constants[component]; + scaled_constant = _mesa_float_to_half(original); + + /* Check for loss of precision. If this is + * mediump, we don't care, but for a highp + * shader, we need to pay attention. NIR + * doesn't yet tell us which mode we're in! + * Practically this prevents most constants + * from being inlined, sadly. */ + + float fp32 = _mesa_half_to_float(scaled_constant); + + if (fp32 != original) + continue; + } + + /* We don't know how to handle these with a constant */ + + if (src->mod || src->half || src->rep_low || src->rep_high) { + DBG("Bailing inline constant...\n"); + continue; + } + + /* Make sure that the constant is not itself a + * vector by checking if all accessed values + * (by the swizzle) are the same. 
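+ *
+ * A worked example (values are made up): with embedded constants
+ * { 1.0, 1.0, 1.0, 2.0 }, an identity swizzle and a writemask of .xyz, every
+ * accessed value is 1.0, so the constant can be demoted to a scalar inline
+ * constant; widen the mask to .xyzw and the 2.0 is accessed too, so we must
+ * keep the embedded 128-bit form.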
*/ + + uint32_t *cons = (uint32_t *) ins->constants; + uint32_t value = cons[component]; + + bool is_vector = false; + unsigned mask = effective_writemask(&ins->alu, ins->mask); + + for (int c = 1; c < 4; ++c) { + /* We only care if this component is actually used */ + if (!(mask & (1 << c))) + continue; + + uint32_t test = cons[(src->swizzle >> (2 * c)) & 3]; + + if (test != value) { + is_vector = true; + break; + } + } + + if (is_vector) + continue; + + /* Get rid of the embedded constant */ + ins->has_constants = false; + ins->ssa_args.src1 = SSA_UNUSED_0; + ins->ssa_args.inline_constant = true; + ins->inline_constant = scaled_constant; + } + } +} + +/* Map normal SSA sources to other SSA sources / fixed registers (like + * uniforms) */ + +static void +map_ssa_to_alias(compiler_context *ctx, int *ref) +{ + /* Sign is used quite deliberately for unused */ + if (*ref < 0) + return; + + unsigned int alias = (uintptr_t) _mesa_hash_table_u64_search(ctx->ssa_to_alias, *ref + 1); + + if (alias) { + /* Remove entry in leftovers to avoid a redunant fmov */ + + struct set_entry *leftover = _mesa_set_search(ctx->leftover_ssa_to_alias, ((void *) (uintptr_t) (*ref + 1))); + + if (leftover) + _mesa_set_remove(ctx->leftover_ssa_to_alias, leftover); + + /* Assign the alias map */ + *ref = alias - 1; + return; + } +} + +/* Basic dead code elimination on the MIR itself, which cleans up e.g. the + * texture pipeline */ + +static bool +midgard_opt_dead_code_eliminate(compiler_context *ctx, midgard_block *block) +{ + bool progress = false; + + mir_foreach_instr_in_block_safe(block, ins) { + if (ins->type != TAG_ALU_4) continue; + if (ins->compact_branch) continue; + + if (ins->ssa_args.dest >= SSA_FIXED_MINIMUM) continue; + if (mir_is_live_after(ctx, block, ins, ins->ssa_args.dest)) continue; + + mir_remove_instruction(ins); + progress = true; + } + + return progress; +} + +/* Dead code elimination for branches at the end of a block - only one branch + * per block is legal semantically */ + +static void +midgard_opt_cull_dead_branch(compiler_context *ctx, midgard_block *block) +{ + bool branched = false; + + mir_foreach_instr_in_block_safe(block, ins) { + if (!midgard_is_branch_unit(ins->unit)) continue; + + /* We ignore prepacked branches since the fragment epilogue is + * just generally special */ + if (ins->prepacked_branch) continue; + + /* Discards are similarly special and may not correspond to the + * end of a block */ + + if (ins->branch.target_type == TARGET_DISCARD) continue; + + if (branched) { + /* We already branched, so this is dead */ + mir_remove_instruction(ins); + } + + branched = true; + } +} + +static bool +mir_nontrivial_mod(midgard_vector_alu_src src, bool is_int, unsigned mask) +{ + /* abs or neg */ + if (!is_int && src.mod) return true; + + /* Other int mods don't matter in isolation */ + if (is_int && src.mod == midgard_int_shift) return true; + + /* size-conversion */ + if (src.half) return true; + + /* swizzle */ + for (unsigned c = 0; c < 4; ++c) { + if (!(mask & (1 << c))) continue; + if (((src.swizzle >> (2*c)) & 3) != c) return true; + } + + return false; +} + +static bool +mir_nontrivial_source2_mod(midgard_instruction *ins) +{ + bool is_int = midgard_is_integer_op(ins->alu.op); + + midgard_vector_alu_src src2 = + vector_alu_from_unsigned(ins->alu.src2); + + return mir_nontrivial_mod(src2, is_int, ins->mask); +} + +static bool +mir_nontrivial_outmod(midgard_instruction *ins) +{ + bool is_int = midgard_is_integer_op(ins->alu.op); + unsigned mod = ins->alu.outmod; + + /* Type 
conversion is a sort of outmod */ + if (ins->alu.dest_override != midgard_dest_override_none) + return true; + + if (is_int) + return mod != midgard_outmod_int_wrap; + else + return mod != midgard_outmod_none; +} + +static bool +midgard_opt_copy_prop(compiler_context *ctx, midgard_block *block) +{ + bool progress = false; + + mir_foreach_instr_in_block_safe(block, ins) { + if (ins->type != TAG_ALU_4) continue; + if (!OP_IS_MOVE(ins->alu.op)) continue; + + unsigned from = ins->ssa_args.src1; + unsigned to = ins->ssa_args.dest; + + /* We only work on pure SSA */ + + if (to >= SSA_FIXED_MINIMUM) continue; + if (from >= SSA_FIXED_MINIMUM) continue; + if (to >= ctx->func->impl->ssa_alloc) continue; + if (from >= ctx->func->impl->ssa_alloc) continue; + + /* Constant propagation is not handled here, either */ + if (ins->ssa_args.inline_constant) continue; + if (ins->has_constants) continue; + + if (mir_nontrivial_source2_mod(ins)) continue; + if (mir_nontrivial_outmod(ins)) continue; + + /* We're clear -- rewrite */ + mir_rewrite_index_src(ctx, to, from); + mir_remove_instruction(ins); + progress |= true; + } + + return progress; +} + +/* fmov.pos is an idiom for fpos. Propoagate the .pos up to the source, so then + * the move can be propagated away entirely */ + +static bool +mir_compose_float_outmod(midgard_outmod_float *outmod, midgard_outmod_float comp) +{ + /* Nothing to do */ + if (comp == midgard_outmod_none) + return true; + + if (*outmod == midgard_outmod_none) { + *outmod = comp; + return true; + } + + /* TODO: Compose rules */ + return false; +} + +static bool +midgard_opt_pos_propagate(compiler_context *ctx, midgard_block *block) +{ + bool progress = false; + + mir_foreach_instr_in_block_safe(block, ins) { + if (ins->type != TAG_ALU_4) continue; + if (ins->alu.op != midgard_alu_op_fmov) continue; + if (ins->alu.outmod != midgard_outmod_pos) continue; + + /* TODO: Registers? */ + unsigned src = ins->ssa_args.src1; + if (src >= ctx->func->impl->ssa_alloc) continue; + assert(!mir_has_multiple_writes(ctx, src)); + + /* There might be a source modifier, too */ + if (mir_nontrivial_source2_mod(ins)) continue; + + /* Backpropagate the modifier */ + mir_foreach_instr_in_block_from_rev(block, v, mir_prev_op(ins)) { + if (v->type != TAG_ALU_4) continue; + if (v->ssa_args.dest != src) continue; + + /* Can we even take a float outmod? */ + if (midgard_is_integer_out_op(v->alu.op)) continue; + + midgard_outmod_float temp = v->alu.outmod; + progress |= mir_compose_float_outmod(&temp, ins->alu.outmod); + + /* Throw in the towel.. */ + if (!progress) break; + + /* Otherwise, transfer the modifier */ + v->alu.outmod = temp; + ins->alu.outmod = midgard_outmod_none; + + break; + } + } + + return progress; +} + +/* The following passes reorder MIR instructions to enable better scheduling */ + +static void +midgard_pair_load_store(compiler_context *ctx, midgard_block *block) +{ + mir_foreach_instr_in_block_safe(block, ins) { + if (ins->type != TAG_LOAD_STORE_4) continue; + + /* We've found a load/store op. Check if next is also load/store. 
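+ *
+ * The motivation (a sketch; the ops are illustrative): a load/store bundle
+ * can hold two words, so a sequence like
+ *
+ *     ld_vary_32 ...
+ *     fadd       ...
+ *     ld_attr_32 ...
+ *
+ * would otherwise schedule into two half-empty load/store bundles, whereas
+ * hoisting the second load next to the first lets them share one bundle.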
*/ + midgard_instruction *next_op = mir_next_op(ins); + if (&next_op->link != &block->instructions) { + if (next_op->type == TAG_LOAD_STORE_4) { + /* If so, we're done since we're a pair */ + ins = mir_next_op(ins); + continue; + } + + /* Maximum search distance to pair, to avoid register pressure disasters */ + int search_distance = 8; + + /* Otherwise, we have an orphaned load/store -- search for another load */ + mir_foreach_instr_in_block_from(block, c, mir_next_op(ins)) { + /* Terminate search if necessary */ + if (!(search_distance--)) break; + + if (c->type != TAG_LOAD_STORE_4) continue; + + /* Stores cannot be reordered, since they have + * dependencies. For the same reason, indirect + * loads cannot be reordered as their index is + * loaded in r27.w */ + + if (OP_IS_STORE(c->load_store.op)) continue; + + /* It appears the 0x800 bit is set whenever a + * load is direct, unset when it is indirect. + * Skip indirect loads. */ + + if (!(c->load_store.unknown & 0x800)) continue; + + /* We found one! Move it up to pair and remove it from the old location */ + + mir_insert_instruction_before(ins, *c); + mir_remove_instruction(c); + + break; + } + } + } +} + +/* If there are leftovers after the below pass, emit actual fmov + * instructions for the slow-but-correct path */ + +static void +emit_leftover_move(compiler_context *ctx) +{ + set_foreach(ctx->leftover_ssa_to_alias, leftover) { + int base = ((uintptr_t) leftover->key) - 1; + int mapped = base; + + map_ssa_to_alias(ctx, &mapped); + EMIT(mov, mapped, blank_alu_src, base); + } +} + +static void +actualise_ssa_to_alias(compiler_context *ctx) +{ + mir_foreach_instr(ctx, ins) { + map_ssa_to_alias(ctx, &ins->ssa_args.src0); + map_ssa_to_alias(ctx, &ins->ssa_args.src1); + } + + emit_leftover_move(ctx); +} + +static void +emit_fragment_epilogue(compiler_context *ctx) +{ + /* Special case: writing out constants requires us to include the move + * explicitly now, so shove it into r0 */ + + void *constant_value = _mesa_hash_table_u64_search(ctx->ssa_constants, ctx->fragment_output + 1); + + if (constant_value) { + midgard_instruction ins = v_mov(SSA_FIXED_REGISTER(REGISTER_CONSTANT), blank_alu_src, SSA_FIXED_REGISTER(0)); + attach_constants(ctx, &ins, constant_value, ctx->fragment_output + 1); + emit_mir_instruction(ctx, ins); + } + + /* Perform the actual fragment writeout. We have two writeout/branch + * instructions, forming a loop until writeout is successful as per the + * docs. 
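+ * In rough pseudo-assembly (a sketch, not the disassembler's exact syntax):
+ *
+ *     branch.writeout always, offset +0   ; attempt the writeout
+ *     branch.writeout always, offset -1   ; not complete? loop back and retry
+ *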
TODO: gl_FragDepth */ + + EMIT(alu_br_compact_cond, midgard_jmp_writeout_op_writeout, TAG_ALU_4, 0, midgard_condition_always); + EMIT(alu_br_compact_cond, midgard_jmp_writeout_op_writeout, TAG_ALU_4, -1, midgard_condition_always); +} + +static midgard_block * +emit_block(compiler_context *ctx, nir_block *block) +{ + midgard_block *this_block = calloc(sizeof(midgard_block), 1); + list_addtail(&this_block->link, &ctx->blocks); + + this_block->is_scheduled = false; + ++ctx->block_count; + + ctx->texture_index[0] = -1; + ctx->texture_index[1] = -1; + + /* Add us as a successor to the block we are following */ + if (ctx->current_block) + midgard_block_add_successor(ctx->current_block, this_block); + + /* Set up current block */ + list_inithead(&this_block->instructions); + ctx->current_block = this_block; + + nir_foreach_instr(instr, block) { + emit_instr(ctx, instr); + ++ctx->instruction_count; + } + + inline_alu_constants(ctx); + embedded_to_inline_constant(ctx); + + /* Perform heavylifting for aliasing */ + actualise_ssa_to_alias(ctx); + + midgard_pair_load_store(ctx, this_block); + + /* Append fragment shader epilogue (value writeout) */ + if (ctx->stage == MESA_SHADER_FRAGMENT) { + if (block == nir_impl_last_block(ctx->func->impl)) { + emit_fragment_epilogue(ctx); + } + } + + if (block == nir_start_block(ctx->func->impl)) + ctx->initial_block = this_block; + + if (block == nir_impl_last_block(ctx->func->impl)) + ctx->final_block = this_block; + + /* Allow the next control flow to access us retroactively, for + * branching etc */ + ctx->current_block = this_block; + + /* Document the fallthrough chain */ + ctx->previous_source_block = this_block; + + return this_block; +} + +static midgard_block *emit_cf_list(struct compiler_context *ctx, struct exec_list *list); + +static void +emit_if(struct compiler_context *ctx, nir_if *nif) +{ + /* Conditional branches expect the condition in r31.w; emit a move for + * that in the _previous_ block (which is the current block). 
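+ *
+ * The overall shape emitted for an if/else (a sketch of the block layout;
+ * indices are illustrative):
+ *
+ *     current block: condition -> r31.w, branch-if-false -> ELSE
+ *     THEN blocks:   then body, branch -> AFTER
+ *     ELSE blocks:   else body
+ *     AFTER:         code following the if
+ *
+ * If the else list turns out to be empty, the exit jump is removed and the
+ * conditional branch targets AFTER directly.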
*/ + emit_condition(ctx, &nif->condition, true, COMPONENT_X); + + /* Speculatively emit the branch, but we can't fill it in until later */ + EMIT(branch, true, true); + midgard_instruction *then_branch = mir_last_in_block(ctx->current_block); + + /* Emit the two subblocks */ + midgard_block *then_block = emit_cf_list(ctx, &nif->then_list); + + /* Emit a jump from the end of the then block to the end of the else */ + EMIT(branch, false, false); + midgard_instruction *then_exit = mir_last_in_block(ctx->current_block); + + /* Emit second block, and check if it's empty */ + + int else_idx = ctx->block_count; + int count_in = ctx->instruction_count; + midgard_block *else_block = emit_cf_list(ctx, &nif->else_list); + int after_else_idx = ctx->block_count; + + /* Now that we have the subblocks emitted, fix up the branches */ + + assert(then_block); + assert(else_block); + + if (ctx->instruction_count == count_in) { + /* The else block is empty, so don't emit an exit jump */ + mir_remove_instruction(then_exit); + then_branch->branch.target_block = after_else_idx; + } else { + then_branch->branch.target_block = else_idx; + then_exit->branch.target_block = after_else_idx; + } +} + +static void +emit_loop(struct compiler_context *ctx, nir_loop *nloop) +{ + /* Remember where we are */ + midgard_block *start_block = ctx->current_block; + + /* Allocate a loop number, growing the current inner loop depth */ + int loop_idx = ++ctx->current_loop_depth; + + /* Get index from before the body so we can loop back later */ + int start_idx = ctx->block_count; + + /* Emit the body itself */ + emit_cf_list(ctx, &nloop->body); + + /* Branch back to loop back */ + struct midgard_instruction br_back = v_branch(false, false); + br_back.branch.target_block = start_idx; + emit_mir_instruction(ctx, br_back); + + /* Mark down that branch in the graph. Note that we're really branching + * to the block *after* we started in. TODO: Why doesn't the branch + * itself have an off-by-one then...? */ + midgard_block_add_successor(ctx->current_block, start_block->successors[0]); + + /* Find the index of the block about to follow us (note: we don't add + * one; blocks are 0-indexed so we get a fencepost problem) */ + int break_block_idx = ctx->block_count; + + /* Fix up the break statements we emitted to point to the right place, + * now that we can allocate a block number for them */ + + list_for_each_entry_from(struct midgard_block, block, start_block, &ctx->blocks, link) { + mir_foreach_instr_in_block(block, ins) { + if (ins->type != TAG_ALU_4) continue; + if (!ins->compact_branch) continue; + if (ins->prepacked_branch) continue; + + /* We found a branch -- check the type to see if we need to do anything */ + if (ins->branch.target_type != TARGET_BREAK) continue; + + /* It's a break! Check if it's our break */ + if (ins->branch.target_break != loop_idx) continue; + + /* Okay, cool, we're breaking out of this loop. 
+ * Rewrite from a break to a goto */ + + ins->branch.target_type = TARGET_GOTO; + ins->branch.target_block = break_block_idx; + } + } + + /* Now that we've finished emitting the loop, free up the depth again + * so we play nice with recursion amid nested loops */ + --ctx->current_loop_depth; + + /* Dump loop stats */ + ++ctx->loop_count; +} + +static midgard_block * +emit_cf_list(struct compiler_context *ctx, struct exec_list *list) +{ + midgard_block *start_block = NULL; + + foreach_list_typed(nir_cf_node, node, node, list) { + switch (node->type) { + case nir_cf_node_block: { + midgard_block *block = emit_block(ctx, nir_cf_node_as_block(node)); + + if (!start_block) + start_block = block; + + break; + } + + case nir_cf_node_if: + emit_if(ctx, nir_cf_node_as_if(node)); + break; + + case nir_cf_node_loop: + emit_loop(ctx, nir_cf_node_as_loop(node)); + break; + + case nir_cf_node_function: + assert(0); + break; + } + } + + return start_block; +} + +/* Due to lookahead, we need to report the first tag executed in the command + * stream and in branch targets. An initial block might be empty, so iterate + * until we find one that 'works' */ + +static unsigned +midgard_get_first_tag_from_block(compiler_context *ctx, unsigned block_idx) +{ + midgard_block *initial_block = mir_get_block(ctx, block_idx); + + unsigned first_tag = 0; + + do { + midgard_bundle *initial_bundle = util_dynarray_element(&initial_block->bundles, midgard_bundle, 0); + + if (initial_bundle) { + first_tag = initial_bundle->tag; + break; + } + + /* Initial block is empty, try the next block */ + initial_block = list_first_entry(&(initial_block->link), midgard_block, link); + } while(initial_block != NULL); + + assert(first_tag); + return first_tag; +} + +int +midgard_compile_shader_nir(nir_shader *nir, midgard_program *program, bool is_blend) +{ + struct util_dynarray *compiled = &program->compiled; + + midgard_debug = debug_get_option_midgard_debug(); + + compiler_context ictx = { + .nir = nir, + .stage = nir->info.stage, + + .is_blend = is_blend, + .blend_constant_offset = 0, + + .alpha_ref = program->alpha_ref + }; + + compiler_context *ctx = &ictx; + + /* TODO: Decide this at runtime */ + ctx->uniform_cutoff = 8; + + /* Initialize at a global (not block) level hash tables */ + + ctx->ssa_constants = _mesa_hash_table_u64_create(NULL); + ctx->ssa_to_alias = _mesa_hash_table_u64_create(NULL); + ctx->hash_to_temp = _mesa_hash_table_u64_create(NULL); + ctx->sysval_to_id = _mesa_hash_table_u64_create(NULL); + ctx->leftover_ssa_to_alias = _mesa_set_create(NULL, _mesa_hash_pointer, _mesa_key_pointer_equal); + + /* Record the varying mapping for the command stream's bookkeeping */ + + struct exec_list *varyings = + ctx->stage == MESA_SHADER_VERTEX ? 
&nir->outputs : &nir->inputs; + + unsigned max_varying = 0; + nir_foreach_variable(var, varyings) { + unsigned loc = var->data.driver_location; + unsigned sz = glsl_type_size(var->type, FALSE); + + for (int c = 0; c < sz; ++c) { + program->varyings[loc + c] = var->data.location + c; + max_varying = MAX2(max_varying, loc + c); + } + } + + /* Lower gl_Position pre-optimisation, but after lowering vars to ssa + * (so we don't accidentally duplicate the epilogue since mesa/st has + * messed with our I/O quite a bit already) */ + + NIR_PASS_V(nir, nir_lower_vars_to_ssa); + + if (ctx->stage == MESA_SHADER_VERTEX) + NIR_PASS_V(nir, nir_lower_viewport_transform); + + NIR_PASS_V(nir, nir_lower_var_copies); + NIR_PASS_V(nir, nir_lower_vars_to_ssa); + NIR_PASS_V(nir, nir_split_var_copies); + NIR_PASS_V(nir, nir_lower_var_copies); + NIR_PASS_V(nir, nir_lower_global_vars_to_local); + NIR_PASS_V(nir, nir_lower_var_copies); + NIR_PASS_V(nir, nir_lower_vars_to_ssa); + + NIR_PASS_V(nir, nir_lower_io, nir_var_all, glsl_type_size, 0); + + /* Optimisation passes */ + + optimise_nir(nir); + + if (midgard_debug & MIDGARD_DBG_SHADERS) { + nir_print_shader(nir, stdout); + } + + /* Assign sysvals and counts, now that we're sure + * (post-optimisation) */ + + midgard_nir_assign_sysvals(ctx, nir); + + program->uniform_count = nir->num_uniforms; + program->sysval_count = ctx->sysval_count; + memcpy(program->sysvals, ctx->sysvals, sizeof(ctx->sysvals[0]) * ctx->sysval_count); + + program->attribute_count = (ctx->stage == MESA_SHADER_VERTEX) ? nir->num_inputs : 0; + program->varying_count = max_varying + 1; /* Fencepost off-by-one */ + + nir_foreach_function(func, nir) { + if (!func->impl) + continue; + + list_inithead(&ctx->blocks); + ctx->block_count = 0; + ctx->func = func; + + emit_cf_list(ctx, &func->impl->body); + emit_block(ctx, func->impl->end_block); + + break; /* TODO: Multi-function shaders */ + } + + util_dynarray_init(compiled, NULL); + + /* MIR-level optimizations */ + + bool progress = false; + + do { + progress = false; + + mir_foreach_block(ctx, block) { + progress |= midgard_opt_pos_propagate(ctx, block); + progress |= midgard_opt_copy_prop(ctx, block); + progress |= midgard_opt_dead_code_eliminate(ctx, block); + } + } while (progress); + + /* Nested control-flow can result in dead branches at the end of the + * block. This messes with our analysis and is just dead code, so cull + * them */ + mir_foreach_block(ctx, block) { + midgard_opt_cull_dead_branch(ctx, block); + } + + /* Schedule! */ + schedule_program(ctx); + + /* Now that all the bundles are scheduled and we can calculate block + * sizes, emit actual branch instructions rather than placeholders */ + + int br_block_idx = 0; + + mir_foreach_block(ctx, block) { + util_dynarray_foreach(&block->bundles, midgard_bundle, bundle) { + for (int c = 0; c < bundle->instruction_count; ++c) { + midgard_instruction *ins = bundle->instructions[c]; + + if (!midgard_is_branch_unit(ins->unit)) continue; + + if (ins->prepacked_branch) continue; + + /* Parse some basic branch info */ + bool is_compact = ins->unit == ALU_ENAB_BR_COMPACT; + bool is_conditional = ins->branch.conditional; + bool is_inverted = ins->branch.invert_conditional; + bool is_discard = ins->branch.target_type == TARGET_DISCARD; + + /* Determine the block we're jumping to */ + int target_number = ins->branch.target_block; + + /* Report the destination tag */ + int dest_tag = is_discard ? 
0 : midgard_get_first_tag_from_block(ctx, target_number); + + /* Count up the number of quadwords we're + * jumping over = number of quadwords until + * (br_block_idx, target_number) */ + + int quadword_offset = 0; + + if (is_discard) { + /* Jump to the end of the shader. We + * need to include not only the + * following blocks, but also the + * contents of our current block (since + * discard can come in the middle of + * the block) */ + + midgard_block *blk = mir_get_block(ctx, br_block_idx + 1); + + for (midgard_bundle *bun = bundle + 1; bun < (midgard_bundle *)((char*) block->bundles.data + block->bundles.size); ++bun) { + quadword_offset += quadword_size(bun->tag); + } + + mir_foreach_block_from(ctx, blk, b) { + quadword_offset += b->quadword_count; + } + + } else if (target_number > br_block_idx) { + /* Jump forward */ + + for (int idx = br_block_idx + 1; idx < target_number; ++idx) { + midgard_block *blk = mir_get_block(ctx, idx); + assert(blk); + + quadword_offset += blk->quadword_count; + } + } else { + /* Jump backwards */ + + for (int idx = br_block_idx; idx >= target_number; --idx) { + midgard_block *blk = mir_get_block(ctx, idx); + assert(blk); + + quadword_offset -= blk->quadword_count; + } + } + + /* Unconditional extended branches (far jumps) + * have issues, so we always use a conditional + * branch, setting the condition to always for + * unconditional. For compact unconditional + * branches, cond isn't used so it doesn't + * matter what we pick. */ + + midgard_condition cond = + !is_conditional ? midgard_condition_always : + is_inverted ? midgard_condition_false : + midgard_condition_true; + + midgard_jmp_writeout_op op = + is_discard ? midgard_jmp_writeout_op_discard : + (is_compact && !is_conditional) ? midgard_jmp_writeout_op_branch_uncond : + midgard_jmp_writeout_op_branch_cond; + + if (!is_compact) { + midgard_branch_extended branch = + midgard_create_branch_extended( + cond, op, + dest_tag, + quadword_offset); + + memcpy(&ins->branch_extended, &branch, sizeof(branch)); + } else if (is_conditional || is_discard) { + midgard_branch_cond branch = { + .op = op, + .dest_tag = dest_tag, + .offset = quadword_offset, + .cond = cond + }; + + assert(branch.offset == quadword_offset); + + memcpy(&ins->br_compact, &branch, sizeof(branch)); + } else { + assert(op == midgard_jmp_writeout_op_branch_uncond); + + midgard_branch_uncond branch = { + .op = op, + .dest_tag = dest_tag, + .offset = quadword_offset, + .unknown = 1 + }; + + assert(branch.offset == quadword_offset); + + memcpy(&ins->br_compact, &branch, sizeof(branch)); + } + } + } + + ++br_block_idx; + } + + /* Emit flat binary from the instruction arrays. Iterate each block in + * sequence. Save instruction boundaries such that lookahead tags can + * be assigned easily */ + + /* Cache _all_ bundles in source order for lookahead across failed branches */ + + int bundle_count = 0; + mir_foreach_block(ctx, block) { + bundle_count += block->bundles.size / sizeof(midgard_bundle); + } + midgard_bundle **source_order_bundles = malloc(sizeof(midgard_bundle *) * bundle_count); + int bundle_idx = 0; + mir_foreach_block(ctx, block) { + util_dynarray_foreach(&block->bundles, midgard_bundle, bundle) { + source_order_bundles[bundle_idx++] = bundle; + } + } + + int current_bundle = 0; + + /* Midgard prefetches instruction types, so during emission we + * need to lookahead. Unless this is the last instruction, in + * which we return 1. Or if this is the second to last and the + * last is an ALU, then it's also 1... 
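+ * A worked example (tags written symbolically): for source-order bundles
+ * [ALU, LD/ST, TEXTURE], the next_tag values passed down are
+ * [TAG_LOAD_STORE_4, TAG_TEXTURE_4, 1]; for [LD/ST, ALU] they are [1, 1],
+ * since a trailing ALU bundle is reported as 1 from its predecessor too.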
*/ + + mir_foreach_block(ctx, block) { + mir_foreach_bundle_in_block(block, bundle) { + int lookahead = 1; + + if (current_bundle + 1 < bundle_count) { + uint8_t next = source_order_bundles[current_bundle + 1]->tag; + + if (!(current_bundle + 2 < bundle_count) && IS_ALU(next)) { + lookahead = 1; + } else { + lookahead = next; + } + } + + emit_binary_bundle(ctx, bundle, compiled, lookahead); + ++current_bundle; + } + + /* TODO: Free deeper */ + //util_dynarray_fini(&block->instructions); + } + + free(source_order_bundles); + + /* Report the very first tag executed */ + program->first_tag = midgard_get_first_tag_from_block(ctx, 0); + + /* Deal with off-by-one related to the fencepost problem */ + program->work_register_count = ctx->work_registers + 1; + + program->can_discard = ctx->can_discard; + program->uniform_cutoff = ctx->uniform_cutoff; + + program->blend_patch_offset = ctx->blend_constant_offset; + + if (midgard_debug & MIDGARD_DBG_SHADERS) + disassemble_midgard(program->compiled.data, program->compiled.size); + + if (midgard_debug & MIDGARD_DBG_SHADERDB) { + unsigned nr_bundles = 0, nr_ins = 0, nr_quadwords = 0; + + /* Count instructions and bundles */ + + mir_foreach_instr_global(ctx, ins) { + nr_ins++; + } + + mir_foreach_block(ctx, block) { + nr_bundles += util_dynarray_num_elements( + &block->bundles, midgard_bundle); + + nr_quadwords += block->quadword_count; + } + + /* Calculate thread count. There are certain cutoffs by + * register count for thread count */ + + unsigned nr_registers = program->work_register_count; + + unsigned nr_threads = + (nr_registers <= 4) ? 4 : + (nr_registers <= 8) ? 2 : + 1; + + /* Dump stats */ + + fprintf(stderr, "shader%d - %s shader: " + "%u inst, %u bundles, %u quadwords, " + "%u registers, %u threads, %u loops\n", + SHADER_DB_COUNT++, + gl_shader_stage_name(ctx->stage), + nr_ins, nr_bundles, nr_quadwords, + nr_registers, nr_threads, + ctx->loop_count); + } + + + return 0; +} diff --git a/src/panfrost/midgard/midgard_compile.h b/src/panfrost/midgard/midgard_compile.h new file mode 100644 index 00000000000..147494b8e8a --- /dev/null +++ b/src/panfrost/midgard/midgard_compile.h @@ -0,0 +1,127 @@ +/* + * Copyright (C) 2018-2019 Alyssa Rosenzweig <[email protected]> + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#ifndef __MIDGARD_H_ +#define __MIDGARD_H_ + +#include "compiler/nir/nir.h" +#include "util/u_dynarray.h" + +/* Define the general compiler entry point */ + +#define MAX_SYSVAL_COUNT 32 + +/* Allow 2D of sysval IDs, while allowing nonparametric sysvals to equal + * their class for equal comparison */ + +#define PAN_SYSVAL(type, no) (((no) << 16) | PAN_SYSVAL_##type) +#define PAN_SYSVAL_TYPE(sysval) ((sysval) & 0xffff) +#define PAN_SYSVAL_ID(sysval) ((sysval) >> 16) + +/* Define some common types. We start at one for easy indexing of hash + * tables internal to the compiler */ + +enum { + PAN_SYSVAL_VIEWPORT_SCALE = 1, + PAN_SYSVAL_VIEWPORT_OFFSET = 2, + PAN_SYSVAL_TEXTURE_SIZE = 3, +} pan_sysval; + +#define PAN_TXS_SYSVAL_ID(texidx, dim, is_array) \ + ((texidx) | ((dim) << 7) | ((is_array) ? (1 << 9) : 0)) + +#define PAN_SYSVAL_ID_TO_TXS_TEX_IDX(id) ((id) & 0x7f) +#define PAN_SYSVAL_ID_TO_TXS_DIM(id) (((id) >> 7) & 0x3) +#define PAN_SYSVAL_ID_TO_TXS_IS_ARRAY(id) !!((id) & (1 << 9)) + +typedef struct { + int work_register_count; + int uniform_count; + int uniform_cutoff; + + int attribute_count; + int varying_count; + + /* Prepended before uniforms, mapping to SYSVAL_ names for the + * sysval */ + + unsigned sysval_count; + unsigned sysvals[MAX_SYSVAL_COUNT]; + + unsigned varyings[32]; + + /* Boolean properties of the program */ + bool can_discard; + bool writes_point_size; + + int first_tag; + + struct util_dynarray compiled; + + /* For a blend shader using a constant color -- patch point. If + * negative, there's no constant. */ + + int blend_patch_offset; + + /* IN: For a fragment shader with a lowered alpha test, the ref value */ + float alpha_ref; +} midgard_program; + +int +midgard_compile_shader_nir(nir_shader *nir, midgard_program *program, bool is_blend); + +/* NIR options are shared between the standalone compiler and the online + * compiler. Defining it here is the simplest, though maybe not the Right + * solution. 
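+ *
+ * A driver is expected to hand this struct to NIR when creating shaders
+ * destined for Midgard, e.g. (a sketch; the real call site lives in the
+ * Gallium driver, not in this file):
+ *
+ *     nir_shader *s = nir_shader_create(NULL, MESA_SHADER_FRAGMENT,
+ *                                       &midgard_nir_options, NULL);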
*/ + +static const nir_shader_compiler_options midgard_nir_options = { + .lower_ffma = true, + .lower_sub = true, + .lower_scmp = true, + .lower_flrp32 = true, + .lower_flrp64 = true, + .lower_ffract = true, + .lower_fmod = true, + .lower_fdiv = true, + .lower_idiv = true, + .lower_isign = true, + .lower_fpow = true, + .lower_find_lsb = true, + + .lower_wpos_pntc = true, + + /* TODO: We have native ops to help here, which we'll want to look into + * eventually */ + .lower_fsign = true, + + .vertex_id_zero_based = true, + .lower_extract_byte = true, + .lower_extract_word = true, + .lower_rotate = true, + + .lower_doubles_options = nir_lower_dmod, + + .vectorize_io = true, +}; + +#endif diff --git a/src/panfrost/midgard/midgard_emit.c b/src/panfrost/midgard/midgard_emit.c new file mode 100644 index 00000000000..3522e77d5b1 --- /dev/null +++ b/src/panfrost/midgard/midgard_emit.c @@ -0,0 +1,273 @@ +/* + * Copyright (C) 2018-2019 Alyssa Rosenzweig <[email protected]> + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "compiler.h" +#include "midgard_ops.h" + +/* Midgard IR only knows vector ALU types, but we sometimes need to actually + * use scalar ALU instructions, for functional or performance reasons. To do + * this, we just demote vector ALU payloads to scalar. */ + +static int +component_from_mask(unsigned mask) +{ + for (int c = 0; c < 8; ++c) { + if (mask & (1 << c)) + return c; + } + + assert(0); + return 0; +} + +static unsigned +vector_to_scalar_source(unsigned u, bool is_int, bool is_full) +{ + midgard_vector_alu_src v; + memcpy(&v, &u, sizeof(v)); + + /* TODO: Integers */ + + unsigned component = v.swizzle & 3; + bool upper = false; /* TODO */ + + midgard_scalar_alu_src s = { 0 }; + + if (is_full) { + /* For a 32-bit op, just check the source half flag */ + s.full = !v.half; + } else if (!v.half) { + /* For a 16-bit op that's not subdivided, never full */ + s.full = false; + } else { + /* We can't do 8-bit scalar, abort! 
*/ + assert(0); + } + + /* Component indexing takes size into account */ + + if (s.full) + s.component = component << 1; + else + s.component = component + (upper << 2); + + if (is_int) { + /* TODO */ + } else { + s.abs = v.mod & MIDGARD_FLOAT_MOD_ABS; + s.negate = v.mod & MIDGARD_FLOAT_MOD_NEG; + } + + unsigned o; + memcpy(&o, &s, sizeof(s)); + + return o & ((1 << 6) - 1); +} + +static midgard_scalar_alu +vector_to_scalar_alu(midgard_vector_alu v, midgard_instruction *ins) +{ + bool is_int = midgard_is_integer_op(v.op); + bool is_full = v.reg_mode == midgard_reg_mode_32; + bool is_inline_constant = ins->ssa_args.inline_constant; + + /* The output component is from the mask */ + midgard_scalar_alu s = { + .op = v.op, + .src1 = vector_to_scalar_source(v.src1, is_int, is_full), + .src2 = !is_inline_constant ? vector_to_scalar_source(v.src2, is_int, is_full) : 0, + .unknown = 0, + .outmod = v.outmod, + .output_full = is_full, + .output_component = component_from_mask(ins->mask), + }; + + /* Full components are physically spaced out */ + if (is_full) { + assert(s.output_component < 4); + s.output_component <<= 1; + } + + /* Inline constant is passed along rather than trying to extract it + * from v */ + + if (ins->ssa_args.inline_constant) { + uint16_t imm = 0; + int lower_11 = ins->inline_constant & ((1 << 12) - 1); + imm |= (lower_11 >> 9) & 3; + imm |= (lower_11 >> 6) & 4; + imm |= (lower_11 >> 2) & 0x38; + imm |= (lower_11 & 63) << 6; + + s.src2 = imm; + } + + return s; +} + +static void +emit_alu_bundle(compiler_context *ctx, + midgard_bundle *bundle, + struct util_dynarray *emission, + unsigned lookahead) +{ + /* Emit the control word */ + util_dynarray_append(emission, uint32_t, bundle->control | lookahead); + + /* Next up, emit register words */ + for (unsigned i = 0; i < bundle->instruction_count; ++i) { + midgard_instruction *ins = bundle->instructions[i]; + + /* Check if this instruction has registers */ + if (ins->compact_branch || ins->prepacked_branch) continue; + + /* Otherwise, just emit the registers */ + uint16_t reg_word = 0; + memcpy(®_word, &ins->registers, sizeof(uint16_t)); + util_dynarray_append(emission, uint16_t, reg_word); + } + + /* Now, we emit the body itself */ + for (unsigned i = 0; i < bundle->instruction_count; ++i) { + midgard_instruction *ins = bundle->instructions[i]; + + /* Where is this body */ + unsigned size = 0; + void *source = NULL; + + /* In case we demote to a scalar */ + midgard_scalar_alu scalarized; + + if (ins->unit & UNITS_ANY_VECTOR) { + if (ins->alu.reg_mode == midgard_reg_mode_32) + ins->alu.mask = expand_writemask_32(ins->mask); + else + ins->alu.mask = ins->mask; + + size = sizeof(midgard_vector_alu); + source = &ins->alu; + } else if (ins->unit == ALU_ENAB_BR_COMPACT) { + size = sizeof(midgard_branch_cond); + source = &ins->br_compact; + } else if (ins->compact_branch) { /* misnomer */ + size = sizeof(midgard_branch_extended); + source = &ins->branch_extended; + } else { + size = sizeof(midgard_scalar_alu); + scalarized = vector_to_scalar_alu(ins->alu, ins); + source = &scalarized; + } + + memcpy(util_dynarray_grow_bytes(emission, 1, size), source, size); + } + + /* Emit padding (all zero) */ + memset(util_dynarray_grow_bytes(emission, 1, bundle->padding), 0, bundle->padding); + + /* Tack on constants */ + + if (bundle->has_embedded_constants) { + util_dynarray_append(emission, float, bundle->constants[0]); + util_dynarray_append(emission, float, bundle->constants[1]); + util_dynarray_append(emission, float, bundle->constants[2]); + 
util_dynarray_append(emission, float, bundle->constants[3]); + } +} + +/* After everything is scheduled, emit whole bundles at a time */ + +void +emit_binary_bundle(compiler_context *ctx, + midgard_bundle *bundle, + struct util_dynarray *emission, + int next_tag) +{ + int lookahead = next_tag << 4; + + switch (bundle->tag) { + case TAG_ALU_4: + case TAG_ALU_8: + case TAG_ALU_12: + case TAG_ALU_16: + emit_alu_bundle(ctx, bundle, emission, lookahead); + break; + + case TAG_LOAD_STORE_4: { + /* One or two composing instructions */ + + uint64_t current64, next64 = LDST_NOP; + + /* Copy masks */ + + for (unsigned i = 0; i < bundle->instruction_count; ++i) { + bundle->instructions[i]->load_store.mask = + bundle->instructions[i]->mask; + } + + memcpy(¤t64, &bundle->instructions[0]->load_store, sizeof(current64)); + + if (bundle->instruction_count == 2) + memcpy(&next64, &bundle->instructions[1]->load_store, sizeof(next64)); + + midgard_load_store instruction = { + .type = bundle->tag, + .next_type = next_tag, + .word1 = current64, + .word2 = next64 + }; + + util_dynarray_append(emission, midgard_load_store, instruction); + + break; + } + + case TAG_TEXTURE_4: + case TAG_TEXTURE_4_VTX: { + /* Texture instructions are easy, since there is no pipelining + * nor VLIW to worry about. We may need to set .cont/.last + * flags. */ + + midgard_instruction *ins = bundle->instructions[0]; + + ins->texture.type = bundle->tag; + ins->texture.next_type = next_tag; + ins->texture.mask = ins->mask; + + ctx->texture_op_count--; + + if (ins->texture.op == TEXTURE_OP_NORMAL) { + bool continues = ctx->texture_op_count > 0; + ins->texture.cont = continues; + ins->texture.last = !continues; + } else { + ins->texture.cont = ins->texture.last = 1; + } + + util_dynarray_append(emission, midgard_texture_word, ins->texture); + break; + } + + default: + unreachable("Unknown midgard instruction type\n"); + } +} diff --git a/src/panfrost/midgard/midgard_liveness.c b/src/panfrost/midgard/midgard_liveness.c new file mode 100644 index 00000000000..a18d8b9f8ad --- /dev/null +++ b/src/panfrost/midgard/midgard_liveness.c @@ -0,0 +1,114 @@ +/* + * Copyright (C) 2018-2019 Alyssa Rosenzweig <[email protected]> + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +/* mir_is_live_after performs liveness analysis on the MIR, used primarily + * as part of register allocation. 
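+ * The rule implemented: a value is live after an instruction if it is read
+ * later in the same block, or read anywhere in a (transitive) successor
+ * block. For example, with blocks A -> B -> C, a value written in A and read
+ * only in C is live at the end of both A and B; the visited flags keep the
+ * walk from spinning on loop back-edges.
+ *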
TODO: Algorithmic improvements for + * compiler performance (this is the worst algorithm possible -- see + * backlog with Connor on IRC) */ + +#include "compiler.h" + +static bool +midgard_is_live_in_instr(midgard_instruction *ins, int src) +{ + if (ins->compact_branch) + return false; + + if (ins->ssa_args.src0 == src) + return true; + + if (!ins->ssa_args.inline_constant && ins->ssa_args.src1 == src) + return true; + + return false; +} + +/* Determine if a variable is live in the successors of a block */ +static bool +is_live_after_successors(compiler_context *ctx, midgard_block *bl, int src) +{ + for (unsigned i = 0; i < bl->nr_successors; ++i) { + midgard_block *succ = bl->successors[i]; + + /* If we already visited, the value we're seeking + * isn't down this path (or we would have short + * circuited */ + + if (succ->visited) continue; + + /* Otherwise (it's visited *now*), check the block */ + + succ->visited = true; + + mir_foreach_instr_in_block(succ, ins) { + if (midgard_is_live_in_instr(ins, src)) + return true; + } + + /* ...and also, check *its* successors */ + if (is_live_after_successors(ctx, succ, src)) + return true; + + } + + /* Welp. We're really not live. */ + + return false; +} + +bool +mir_is_live_after(compiler_context *ctx, midgard_block *block, midgard_instruction *start, int src) +{ + /* Check the rest of the block for liveness */ + + mir_foreach_instr_in_block_from(block, ins, mir_next_op(start)) { + if (midgard_is_live_in_instr(ins, src)) + return true; + } + + /* Check the rest of the blocks for liveness recursively */ + + bool succ = is_live_after_successors(ctx, block, src); + + mir_foreach_block(ctx, block) { + block->visited = false; + } + + return succ; +} + +/* Just a quick check -- is it written more than once? (I.e. are we definitely + * not SSA?) */ + +bool +mir_has_multiple_writes(compiler_context *ctx, int dest) +{ + unsigned write_count = 0; + + mir_foreach_instr_global(ctx, ins) { + if (ins->ssa_args.dest == dest) + write_count++; + } + + return write_count > 1; +} diff --git a/src/panfrost/midgard/midgard_nir.h b/src/panfrost/midgard/midgard_nir.h new file mode 100644 index 00000000000..85eadd34631 --- /dev/null +++ b/src/panfrost/midgard/midgard_nir.h @@ -0,0 +1,5 @@ +#include <stdbool.h> +#include "nir.h" + +bool midgard_nir_lower_algebraic_late(nir_shader *shader); +bool midgard_nir_scale_trig(nir_shader *shader); diff --git a/src/panfrost/midgard/midgard_nir_algebraic.py b/src/panfrost/midgard/midgard_nir_algebraic.py new file mode 100644 index 00000000000..faf83364c3a --- /dev/null +++ b/src/panfrost/midgard/midgard_nir_algebraic.py @@ -0,0 +1,96 @@ +# +# Copyright (C) 2018 Alyssa Rosenzweig +# +# Copyright (C) 2016 Intel Corporation +# +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice (including the next +# paragraph) shall be included in all copies or substantial portions of the +# Software. 
+# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +# IN THE SOFTWARE. + +import argparse +import sys +import math + +a = 'a' +b = 'b' +c = 'c' + +algebraic_late = [ + # ineg must be lowered late, but only for integers; floats will try to + # have modifiers attached... hence why this has to be here rather than + # a more standard lower_negate approach + + (('ineg', a), ('isub', 0, a)), + + # These two special-cases save space/an op than the actual csel op + + # scheduler flexibility + + (('b32csel', a, 'b@32', 0), ('iand', a, b)), + (('b32csel', a, 0, 'b@32'), ('iand', ('inot', a), b)), +] + + +# Midgard is able to type convert down by only one "step" per instruction; if +# NIR wants more than one step, we need to break up into multiple instructions + +converts = [ + (('i2i8', 'a@32'), ('i2i8', ('i2i16', a))), + (('u2u8', 'a@32'), ('u2u8', ('u2u16', a))), + + (('i2i32', 'a@8'), ('i2i32', ('i2i16', a))), + (('u2u32', 'a@8'), ('u2u32', ('u2u16', a))), + + (('f2i32', 'a@16'), ('f2i32', ('f2f32', a))), + (('f2u32', 'a@16'), ('f2u32', ('f2f32', a))), + + # Totally redundant + (('~f2f16', ('f2f32', 'a@16')), a), + + (('pack_half_2x16_split', 'a@32', 'b@32'), ('ior', ('ishl', ('i2i32', ('f2f16', b)), 16), ('i2i32', ('f2f16', a)))), +] + +# Midgard scales fsin/fcos arguments by pi. +# Pass must be run only once, after the main loop + +scale_trig = [ + (('fsin', a), ('fsin', ('fdiv', a, math.pi))), + (('fcos', a), ('fcos', ('fdiv', a, math.pi))), +] + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument('-p', '--import-path', required=True) + args = parser.parse_args() + sys.path.insert(0, args.import_path) + run() + + +def run(): + import nir_algebraic # pylint: disable=import-error + + print('#include "midgard_nir.h"') + + print(nir_algebraic.AlgebraicPass("midgard_nir_lower_algebraic_late", + algebraic_late + converts).render()) + + print(nir_algebraic.AlgebraicPass("midgard_nir_scale_trig", + scale_trig).render()) + + +if __name__ == '__main__': + main() diff --git a/src/panfrost/midgard/midgard_ops.c b/src/panfrost/midgard/midgard_ops.c new file mode 100644 index 00000000000..ccd750cff83 --- /dev/null +++ b/src/panfrost/midgard/midgard_ops.c @@ -0,0 +1,221 @@ +/* Copyright (c) 2018-2019 Alyssa Rosenzweig ([email protected]) + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include "midgard.h" + +/* Include the definitions of the macros and such */ + +#define MIDGARD_OPS_TABLE +#include "helpers.h" +#undef MIDGARD_OPS_TABLE + +/* Table of mapping opcodes to accompanying properties. This is used for both + * the disassembler and the compiler. It is placed in a .c file like this to + * avoid duplications in the binary */ + +struct mir_op_props alu_opcode_props[256] = { + [midgard_alu_op_fadd] = {"fadd", UNITS_ADD | OP_COMMUTES}, + [midgard_alu_op_fmul] = {"fmul", UNITS_MUL | UNIT_VLUT | OP_COMMUTES}, + [midgard_alu_op_fmin] = {"fmin", UNITS_MUL | UNITS_ADD | OP_COMMUTES}, + [midgard_alu_op_fmax] = {"fmax", UNITS_MUL | UNITS_ADD | OP_COMMUTES}, + [midgard_alu_op_imin] = {"imin", UNITS_MOST | OP_COMMUTES}, + [midgard_alu_op_imax] = {"imax", UNITS_MOST | OP_COMMUTES}, + [midgard_alu_op_umin] = {"umin", UNITS_MOST | OP_COMMUTES}, + [midgard_alu_op_umax] = {"umax", UNITS_MOST | OP_COMMUTES}, + [midgard_alu_op_ihadd] = {"ihadd", UNITS_ADD | OP_COMMUTES}, + [midgard_alu_op_uhadd] = {"uhadd", UNITS_ADD | OP_COMMUTES}, + [midgard_alu_op_irhadd] = {"irhadd", UNITS_ADD | OP_COMMUTES}, + [midgard_alu_op_urhadd] = {"urhadd", UNITS_ADD | OP_COMMUTES}, + + [midgard_alu_op_fmov] = {"fmov", UNITS_ALL | QUIRK_FLIPPED_R24}, + [midgard_alu_op_fmov_rtz] = {"fmov_rtz", UNITS_ALL | QUIRK_FLIPPED_R24}, + [midgard_alu_op_fmov_rtn] = {"fmov_rtn", UNITS_ALL | QUIRK_FLIPPED_R24}, + [midgard_alu_op_fmov_rtp] = {"fmov_rtp", UNITS_ALL | QUIRK_FLIPPED_R24}, + [midgard_alu_op_fround] = {"fround", UNITS_ADD}, + [midgard_alu_op_froundeven] = {"froundeven", UNITS_ADD}, + [midgard_alu_op_ftrunc] = {"ftrunc", UNITS_ADD}, + [midgard_alu_op_ffloor] = {"ffloor", UNITS_ADD}, + [midgard_alu_op_fceil] = {"fceil", UNITS_ADD}, + [midgard_alu_op_ffma] = {"ffma", UNIT_VLUT}, + + /* Though they output a scalar, they need to run on a vector unit + * since they process vectors */ + [midgard_alu_op_fdot3] = {"fdot3", UNIT_VMUL | OP_CHANNEL_COUNT(3) | OP_COMMUTES}, + [midgard_alu_op_fdot3r] = {"fdot3r", UNIT_VMUL | OP_CHANNEL_COUNT(3) | OP_COMMUTES}, + [midgard_alu_op_fdot4] = {"fdot4", UNIT_VMUL | OP_CHANNEL_COUNT(4) | OP_COMMUTES}, + + /* Incredibly, iadd can run on vmul, etc */ + [midgard_alu_op_iadd] = {"iadd", UNITS_MOST | OP_COMMUTES}, + [midgard_alu_op_iaddsat] = {"iaddsat", UNITS_ADD | OP_COMMUTES}, + [midgard_alu_op_uaddsat] = {"uaddsat", UNITS_ADD | OP_COMMUTES}, + [midgard_alu_op_iabsdiff] = {"iabsdiff", UNITS_ADD}, + [midgard_alu_op_uabsdiff] = {"uabsdiff", UNITS_ADD}, + [midgard_alu_op_ichoose] = {"ichoose", UNITS_ADD}, + [midgard_alu_op_isub] = {"isub", UNITS_MOST}, + [midgard_alu_op_isubsat] = {"isubsat", UNITS_MOST}, + [midgard_alu_op_usubsat] = {"usubsat", UNITS_MOST}, + [midgard_alu_op_imul] = {"imul", UNITS_MUL | OP_COMMUTES}, + [midgard_alu_op_imov] = {"imov", UNITS_MOST | QUIRK_FLIPPED_R24}, + + /* For vector comparisons, use ball etc */ + [midgard_alu_op_feq] = {"feq", UNITS_MOST | OP_TYPE_CONVERT | OP_COMMUTES}, + [midgard_alu_op_fne] = {"fne", UNITS_MOST | OP_TYPE_CONVERT | OP_COMMUTES}, + [midgard_alu_op_fle] = {"fle", UNITS_MOST | OP_TYPE_CONVERT}, + [midgard_alu_op_flt] = {"flt", UNITS_MOST | OP_TYPE_CONVERT}, + [midgard_alu_op_ieq] = {"ieq", UNITS_MOST | OP_COMMUTES}, + [midgard_alu_op_ine] = {"ine", 
UNITS_MOST | OP_COMMUTES}, + [midgard_alu_op_ilt] = {"ilt", UNITS_MOST}, + [midgard_alu_op_ile] = {"ile", UNITS_MOST}, + [midgard_alu_op_ult] = {"ult", UNITS_MOST}, + [midgard_alu_op_ule] = {"ule", UNITS_MOST}, + + [midgard_alu_op_icsel] = {"icsel", UNITS_ADD}, + [midgard_alu_op_icsel_v] = {"icsel_v", UNITS_ADD}, /* Acts as bitselect() */ + [midgard_alu_op_fcsel_v] = {"fcsel_v", UNITS_ADD}, + [midgard_alu_op_fcsel] = {"fcsel", UNITS_ADD | UNIT_SMUL}, + + [midgard_alu_op_frcp] = {"frcp", UNIT_VLUT}, + [midgard_alu_op_frsqrt] = {"frsqrt", UNIT_VLUT}, + [midgard_alu_op_fsqrt] = {"fsqrt", UNIT_VLUT}, + [midgard_alu_op_fpow_pt1] = {"fpow_pt1", UNIT_VLUT}, + [midgard_alu_op_fpown_pt1] = {"fpown_pt1", UNIT_VLUT}, + [midgard_alu_op_fpowr_pt1] = {"fpowr_pt1", UNIT_VLUT}, + [midgard_alu_op_fexp2] = {"fexp2", UNIT_VLUT}, + [midgard_alu_op_flog2] = {"flog2", UNIT_VLUT}, + + [midgard_alu_op_f2i_rte] = {"f2i_rte", UNITS_ADD | OP_TYPE_CONVERT}, + [midgard_alu_op_f2i_rtz] = {"f2i_rtz", UNITS_ADD | OP_TYPE_CONVERT}, + [midgard_alu_op_f2i_rtn] = {"f2i_rtn", UNITS_ADD | OP_TYPE_CONVERT}, + [midgard_alu_op_f2i_rtp] = {"f2i_rtp", UNITS_ADD | OP_TYPE_CONVERT}, + [midgard_alu_op_f2u_rte] = {"f2i_rte", UNITS_ADD | OP_TYPE_CONVERT}, + [midgard_alu_op_f2u_rtz] = {"f2i_rtz", UNITS_ADD | OP_TYPE_CONVERT}, + [midgard_alu_op_f2u_rtn] = {"f2i_rtn", UNITS_ADD | OP_TYPE_CONVERT}, + [midgard_alu_op_f2u_rtp] = {"f2i_rtp", UNITS_ADD | OP_TYPE_CONVERT}, + [midgard_alu_op_i2f_rte] = {"i2f", UNITS_ADD | OP_TYPE_CONVERT}, + [midgard_alu_op_i2f_rtz] = {"i2f_rtz", UNITS_ADD | OP_TYPE_CONVERT}, + [midgard_alu_op_i2f_rtn] = {"i2f_rtn", UNITS_ADD | OP_TYPE_CONVERT}, + [midgard_alu_op_i2f_rtp] = {"i2f_rtp", UNITS_ADD | OP_TYPE_CONVERT}, + [midgard_alu_op_u2f_rte] = {"u2f", UNITS_ADD | OP_TYPE_CONVERT}, + [midgard_alu_op_u2f_rtz] = {"u2f_rtz", UNITS_ADD | OP_TYPE_CONVERT}, + [midgard_alu_op_u2f_rtn] = {"u2f_rtn", UNITS_ADD | OP_TYPE_CONVERT}, + [midgard_alu_op_u2f_rtp] = {"u2f_rtp", UNITS_ADD | OP_TYPE_CONVERT}, + + [midgard_alu_op_fsin] = {"fsin", UNIT_VLUT}, + [midgard_alu_op_fcos] = {"fcos", UNIT_VLUT}, + + /* XXX: Test case where it's right on smul but not sadd */ + [midgard_alu_op_iand] = {"iand", UNITS_MOST | OP_COMMUTES}, + [midgard_alu_op_iandnot] = {"iandnot", UNITS_MOST}, + + [midgard_alu_op_ior] = {"ior", UNITS_MOST | OP_COMMUTES}, + [midgard_alu_op_iornot] = {"iornot", UNITS_MOST | OP_COMMUTES}, + [midgard_alu_op_inor] = {"inor", UNITS_MOST | OP_COMMUTES}, + [midgard_alu_op_ixor] = {"ixor", UNITS_MOST | OP_COMMUTES}, + [midgard_alu_op_inxor] = {"inxor", UNITS_MOST | OP_COMMUTES}, + [midgard_alu_op_iclz] = {"iclz", UNITS_ADD}, + [midgard_alu_op_ibitcount8] = {"ibitcount8", UNITS_ADD}, + [midgard_alu_op_inand] = {"inand", UNITS_MOST}, + [midgard_alu_op_ishl] = {"ishl", UNITS_ADD}, + [midgard_alu_op_iasr] = {"iasr", UNITS_ADD}, + [midgard_alu_op_ilsr] = {"ilsr", UNITS_ADD}, + + [midgard_alu_op_fball_eq] = {"fball_eq", UNITS_VECTOR | OP_CHANNEL_COUNT(4) | OP_COMMUTES}, + [midgard_alu_op_fbany_neq] = {"fbany_neq", UNITS_VECTOR | OP_CHANNEL_COUNT(4) | OP_COMMUTES}, + [midgard_alu_op_iball_eq] = {"iball_eq", UNITS_VECTOR | OP_CHANNEL_COUNT(4) | OP_COMMUTES}, + [midgard_alu_op_iball_neq] = {"iball_neq", UNITS_VECTOR | OP_CHANNEL_COUNT(4) | OP_COMMUTES}, + [midgard_alu_op_ibany_eq] = {"ibany_eq", UNITS_VECTOR | OP_CHANNEL_COUNT(4) | OP_COMMUTES}, + [midgard_alu_op_ibany_neq] = {"ibany_neq", UNITS_VECTOR | OP_CHANNEL_COUNT(4) | OP_COMMUTES}, + + /* These instructions are not yet emitted by the compiler, so + * don't speculate about 
units yet */ + [midgard_alu_op_ishladd] = {"ishladd", 0}, + + [midgard_alu_op_uball_lt] = {"uball_lt", 0}, + [midgard_alu_op_uball_lte] = {"uball_lte", 0}, + [midgard_alu_op_iball_lt] = {"iball_lt", 0}, + [midgard_alu_op_iball_lte] = {"iball_lte", 0}, + [midgard_alu_op_ubany_lt] = {"ubany_lt", 0}, + [midgard_alu_op_ubany_lte] = {"ubany_lte", 0}, + [midgard_alu_op_ibany_lt] = {"ibany_lt", 0}, + [midgard_alu_op_ibany_lte] = {"ibany_lte", 0}, + + [midgard_alu_op_freduce] = {"freduce", 0}, + [midgard_alu_op_bball_eq] = {"bball_eq", 0 | OP_CHANNEL_COUNT(4) | OP_COMMUTES}, + [midgard_alu_op_bbany_neq] = {"bball_eq", 0 | OP_CHANNEL_COUNT(4) | OP_COMMUTES}, + [midgard_alu_op_fatan2_pt1] = {"fatan2_pt1", 0}, + [midgard_alu_op_fatan_pt2] = {"fatan_pt2", 0}, +}; + +const char *load_store_opcode_names[256] = { + [midgard_op_st_cubemap_coords] = "st_cubemap_coords", + [midgard_op_ld_global_id] = "ld_global_id", + [midgard_op_ldst_perspective_division_z] = "ldst_perspective_division_z", + [midgard_op_ldst_perspective_division_w] = "ldst_perspective_division_w", + + [midgard_op_atomic_add] = "atomic_add", + [midgard_op_atomic_and] = "atomic_and", + [midgard_op_atomic_or] = "atomic_or", + [midgard_op_atomic_xor] = "atomic_xor", + [midgard_op_atomic_imin] = "atomic_imin", + [midgard_op_atomic_umin] = "atomic_umin", + [midgard_op_atomic_imax] = "atomic_imax", + [midgard_op_atomic_umax] = "atomic_umax", + [midgard_op_atomic_xchg] = "atomic_xchg", + + [midgard_op_ld_char] = "ld_char", + [midgard_op_ld_char2] = "ld_char2", + [midgard_op_ld_short] = "ld_short", + [midgard_op_ld_char4] = "ld_char4", + [midgard_op_ld_short4] = "ld_short4", + [midgard_op_ld_int4] = "ld_int4", + + [midgard_op_ld_attr_32] = "ld_attr_32", + [midgard_op_ld_attr_16] = "ld_attr_16", + [midgard_op_ld_attr_32i] = "ld_attr_32i", + [midgard_op_ld_attr_32u] = "ld_attr_32u", + + [midgard_op_ld_vary_32] = "ld_vary_32", + [midgard_op_ld_vary_16] = "ld_vary_16", + [midgard_op_ld_vary_32i] = "ld_vary_32i", + [midgard_op_ld_vary_32u] = "ld_vary_32u", + + [midgard_op_ld_color_buffer_16] = "ld_color_buffer_16", + + [midgard_op_ld_uniform_16] = "ld_uniform_16", + [midgard_op_ld_uniform_32] = "ld_uniform_32", + [midgard_op_ld_uniform_32i] = "ld_uniform_32i", + [midgard_op_ld_color_buffer_8] = "ld_color_buffer_8", + + [midgard_op_st_char] = "st_char", + [midgard_op_st_char2] = "st_char2", + [midgard_op_st_char4] = "st_char4", + [midgard_op_st_short4] = "st_short4", + [midgard_op_st_int4] = "st_int4", + + [midgard_op_st_vary_32] = "st_vary_32", + [midgard_op_st_vary_16] = "st_vary_16", + [midgard_op_st_vary_32i] = "st_vary_32i", + [midgard_op_st_vary_32u] = "st_vary_32u", + + [midgard_op_st_image_f] = "st_image_f", + [midgard_op_st_image_ui] = "st_image_ui", + [midgard_op_st_image_i] = "st_image_i", +}; diff --git a/src/panfrost/midgard/midgard_ops.h b/src/panfrost/midgard/midgard_ops.h new file mode 100644 index 00000000000..64c91a5bcac --- /dev/null +++ b/src/panfrost/midgard/midgard_ops.h @@ -0,0 +1,73 @@ +/* Copyright (c) 2018-2019 Alyssa Rosenzweig ([email protected]) + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright 
notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include "helpers.h" + +/* Forward declare */ + +extern struct mir_op_props alu_opcode_props[256]; +extern const char *load_store_opcode_names[256]; + +/* Is this opcode that of an integer (regardless of signedness)? Instruction + * names authoritatively determine types */ + +static inline bool +midgard_is_integer_op(int op) +{ + const char *name = alu_opcode_props[op].name; + + if (!name) + return false; + + return (name[0] == 'i') || (name[0] == 'u'); +} + +/* Does this opcode *write* an integer? Same as is_integer_op, unless it's a + * conversion between int<->float in which case we do the opposite */ + +static inline bool +midgard_is_integer_out_op(int op) +{ + bool is_int = midgard_is_integer_op(op); + bool is_conversion = alu_opcode_props[op].props & OP_TYPE_CONVERT; + + return is_int ^ is_conversion; +} + +/* Determines effective writemask, taking quirks and expansion into account */ + +static inline unsigned +effective_writemask(midgard_vector_alu *alu, unsigned existing_mask) +{ + /* Channel count is off-by-one to fit in two-bits (0 channel makes no + * sense) */ + + unsigned channel_count = GET_CHANNEL_COUNT(alu_opcode_props[alu->op].props); + + /* If there is a fixed channel count, construct the appropriate mask */ + + if (channel_count) + return (1 << channel_count) - 1; + + return existing_mask; +}; + + diff --git a/src/panfrost/midgard/midgard_print.c b/src/panfrost/midgard/midgard_print.c new file mode 100644 index 00000000000..6e10429ccee --- /dev/null +++ b/src/panfrost/midgard/midgard_print.c @@ -0,0 +1,137 @@ +/* + * Copyright (C) 2018-2019 Alyssa Rosenzweig <[email protected]> + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "compiler.h" +#include "helpers.h" +#include "midgard_ops.h" + +/* Pretty printer for Midgard IR, for use debugging compiler-internal + * passes like register allocation. 
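+ * As an illustrative sketch (the indices below are arbitrary), a pre-RA
+ * ALU instruction prints roughly as
+ *
+ *      fadd 3, 1, 2
+ *      iadd 5, 4, #16
+ *
+ * with fixed registers printed as rN (r17-r23 show up as uN).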
The output superficially resembles + * Midgard assembly, with the exception that unit information and such is + * (normally) omitted, and generic indices are usually used instead of + * registers */ + +static void +mir_print_source(int source) +{ + if (source >= SSA_FIXED_MINIMUM) { + /* Specific register */ + int reg = SSA_REG_FROM_FIXED(source); + + /* TODO: Moving threshold */ + if (reg > 16 && reg < 24) + printf("u%d", 23 - reg); + else + printf("r%d", reg); + } else { + printf("%d", source); + } +} + +void +mir_print_instruction(midgard_instruction *ins) +{ + printf("\t"); + + switch (ins->type) { + case TAG_ALU_4: { + midgard_alu_op op = ins->alu.op; + const char *name = alu_opcode_props[op].name; + + if (ins->unit) + printf("%d.", ins->unit); + + printf("%s", name ? name : "??"); + break; + } + + case TAG_LOAD_STORE_4: { + midgard_load_store_op op = ins->load_store.op; + const char *name = load_store_opcode_names[op]; + + assert(name); + printf("%s", name); + break; + } + + case TAG_TEXTURE_4: { + printf("texture"); + break; + } + + default: + assert(0); + } + + ssa_args *args = &ins->ssa_args; + + printf(" %d, ", args->dest); + + mir_print_source(args->src0); + printf(", "); + + if (args->inline_constant) + printf("#%d", ins->inline_constant); + else + mir_print_source(args->src1); + + if (ins->has_constants) + printf(" <%f, %f, %f, %f>", ins->constants[0], ins->constants[1], ins->constants[2], ins->constants[3]); + + printf("\n"); +} + +/* Dumps MIR for a block or entire shader respective */ + +void +mir_print_block(midgard_block *block) +{ + printf("{\n"); + + mir_foreach_instr_in_block(block, ins) { + mir_print_instruction(ins); + } + + printf("}\n"); +} + +void +mir_print_shader(compiler_context *ctx) +{ + mir_foreach_block(ctx, block) { + mir_print_block(block); + } +} + +void +mir_print_bundle(midgard_bundle *bundle) +{ + printf("[\n"); + + for (unsigned i = 0; i < bundle->instruction_count; ++i) { + midgard_instruction *ins = bundle->instructions[i]; + mir_print_instruction(ins); + } + + printf("]\n"); +} diff --git a/src/panfrost/midgard/midgard_ra.c b/src/panfrost/midgard/midgard_ra.c new file mode 100644 index 00000000000..cfe091326ed --- /dev/null +++ b/src/panfrost/midgard/midgard_ra.c @@ -0,0 +1,506 @@ +/* + * Copyright (C) 2018-2019 Alyssa Rosenzweig <[email protected]> + * Copyright (C) 2019 Collabora, Ltd. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "compiler.h" +#include "midgard_ops.h" +#include "util/register_allocate.h" +#include "util/u_math.h" + +/* For work registers, we can subdivide in various ways. So we create + * classes for the various sizes and conflict accordingly, keeping in + * mind that physical registers are divided along 128-bit boundaries. + * The important part is that 128-bit boundaries are not crossed. + * + * For each 128-bit register, we can subdivide to 32-bits 10 ways + * + * vec4: xyzw + * vec3: xyz, yzw + * vec2: xy, yz, zw, + * vec1: x, y, z, w + * + * For each 64-bit register, we can subdivide similarly to 16-bit + * (TODO: half-float RA, not that we support fp16 yet) + */ + +#define WORK_STRIDE 10 + +/* Prepacked masks/swizzles for virtual register types */ +static unsigned reg_type_to_mask[WORK_STRIDE] = { + 0xF, /* xyzw */ + 0x7, 0x7 << 1, /* xyz */ + 0x3, 0x3 << 1, 0x3 << 2, /* xy */ + 0x1, 0x1 << 1, 0x1 << 2, 0x1 << 3 /* x */ +}; + +static unsigned reg_type_to_swizzle[WORK_STRIDE] = { + SWIZZLE(COMPONENT_X, COMPONENT_Y, COMPONENT_Z, COMPONENT_W), + + SWIZZLE(COMPONENT_X, COMPONENT_Y, COMPONENT_Z, COMPONENT_W), + SWIZZLE(COMPONENT_Y, COMPONENT_Z, COMPONENT_W, COMPONENT_W), + + SWIZZLE(COMPONENT_X, COMPONENT_Y, COMPONENT_Z, COMPONENT_W), + SWIZZLE(COMPONENT_Y, COMPONENT_Z, COMPONENT_Z, COMPONENT_W), + SWIZZLE(COMPONENT_Z, COMPONENT_W, COMPONENT_Z, COMPONENT_W), + + SWIZZLE(COMPONENT_X, COMPONENT_Y, COMPONENT_Z, COMPONENT_W), + SWIZZLE(COMPONENT_Y, COMPONENT_Y, COMPONENT_Z, COMPONENT_W), + SWIZZLE(COMPONENT_Z, COMPONENT_Y, COMPONENT_Z, COMPONENT_W), + SWIZZLE(COMPONENT_W, COMPONENT_Y, COMPONENT_Z, COMPONENT_W), +}; + +struct phys_reg { + unsigned reg; + unsigned mask; + unsigned swizzle; +}; + +/* Given the mask/swizzle of both the register and the original source, + * compose to find the actual mask/swizzle to give the hardware */ + +static unsigned +compose_writemask(unsigned mask, struct phys_reg reg) +{ + /* Note: the reg mask is guaranteed to be contiguous. So we shift + * into the X place, compose via a simple AND, and shift back */ + + unsigned shift = __builtin_ctz(reg.mask); + return ((reg.mask >> shift) & mask) << shift; +} + +static unsigned +compose_swizzle(unsigned swizzle, unsigned mask, + struct phys_reg reg, struct phys_reg dst) +{ + unsigned out = pan_compose_swizzle(swizzle, reg.swizzle); + + /* Based on the register mask, we need to adjust over. E.g if we're + * writing to yz, a base swizzle of xy__ becomes _xy_. Save the + * original first component (x). 
But to prevent duplicate shifting + * (only applies to ALU -- mask param is set to xyzw out on L/S to + * prevent changes), we have to account for the shift inherent to the + * original writemask */ + + unsigned rep = out & 0x3; + unsigned shift = __builtin_ctz(dst.mask) - __builtin_ctz(mask); + unsigned shifted = out << (2*shift); + + /* ..but we fill in the gaps so it appears to replicate */ + + for (unsigned s = 0; s < shift; ++s) + shifted |= rep << (2*s); + + return shifted; +} + +/* When we're 'squeezing down' the values in the IR, we maintain a hash + * as such */ + +static unsigned +find_or_allocate_temp(compiler_context *ctx, unsigned hash) +{ + if ((hash < 0) || (hash >= SSA_FIXED_MINIMUM)) + return hash; + + unsigned temp = (uintptr_t) _mesa_hash_table_u64_search( + ctx->hash_to_temp, hash + 1); + + if (temp) + return temp - 1; + + /* If no temp is find, allocate one */ + temp = ctx->temp_count++; + ctx->max_hash = MAX2(ctx->max_hash, hash); + + _mesa_hash_table_u64_insert(ctx->hash_to_temp, + hash + 1, (void *) ((uintptr_t) temp + 1)); + + return temp; +} + +/* Callback for register allocation selection, trivial default for now */ + +static unsigned int +midgard_ra_select_callback(struct ra_graph *g, BITSET_WORD *regs, void *data) +{ + /* Choose the first available register to minimise register pressure */ + + for (int i = 0; i < (16 * WORK_STRIDE); ++i) { + if (BITSET_TEST(regs, i)) { + return i; + } + } + + assert(0); + return 0; +} + +/* Helper to return the default phys_reg for a given register */ + +static struct phys_reg +default_phys_reg(int reg) +{ + struct phys_reg r = { + .reg = reg, + .mask = 0xF, /* xyzw */ + .swizzle = 0xE4 /* xyzw */ + }; + + return r; +} + +/* Determine which physical register, swizzle, and mask a virtual + * register corresponds to */ + +static struct phys_reg +index_to_reg(compiler_context *ctx, struct ra_graph *g, int reg) +{ + /* Check for special cases */ + if (reg >= SSA_FIXED_MINIMUM) + return default_phys_reg(SSA_REG_FROM_FIXED(reg)); + else if ((reg < 0) || !g) + return default_phys_reg(REGISTER_UNUSED); + + /* Special cases aside, we pick the underlying register */ + int virt = ra_get_node_reg(g, reg); + + /* Divide out the register and classification */ + int phys = virt / WORK_STRIDE; + int type = virt % WORK_STRIDE; + + struct phys_reg r = { + .reg = phys, + .mask = reg_type_to_mask[type], + .swizzle = reg_type_to_swizzle[type] + }; + + /* Report that we actually use this register, and return it */ + ctx->work_registers = MAX2(ctx->work_registers, phys); + return r; +} + +/* This routine performs the actual register allocation. 
It should be succeeded + * by install_registers */ + +struct ra_graph * +allocate_registers(compiler_context *ctx) +{ + /* The number of vec4 work registers available depends on when the + * uniforms start, so compute that first */ + + int work_count = 16 - MAX2((ctx->uniform_cutoff - 8), 0); + + int virtual_count = work_count * WORK_STRIDE; + + /* First, initialize the RA */ + struct ra_regs *regs = ra_alloc_reg_set(NULL, virtual_count, true); + + int work_vec4 = ra_alloc_reg_class(regs); + int work_vec3 = ra_alloc_reg_class(regs); + int work_vec2 = ra_alloc_reg_class(regs); + int work_vec1 = ra_alloc_reg_class(regs); + + unsigned classes[4] = { + work_vec1, + work_vec2, + work_vec3, + work_vec4 + }; + + /* Add the full set of work registers */ + for (unsigned i = 0; i < work_count; ++i) { + int base = WORK_STRIDE * i; + + /* Build a full set of subdivisions */ + ra_class_add_reg(regs, work_vec4, base); + ra_class_add_reg(regs, work_vec3, base + 1); + ra_class_add_reg(regs, work_vec3, base + 2); + ra_class_add_reg(regs, work_vec2, base + 3); + ra_class_add_reg(regs, work_vec2, base + 4); + ra_class_add_reg(regs, work_vec2, base + 5); + ra_class_add_reg(regs, work_vec1, base + 6); + ra_class_add_reg(regs, work_vec1, base + 7); + ra_class_add_reg(regs, work_vec1, base + 8); + ra_class_add_reg(regs, work_vec1, base + 9); + + for (unsigned a = 0; a < 10; ++a) { + unsigned mask1 = reg_type_to_mask[a]; + + for (unsigned b = 0; b < 10; ++b) { + unsigned mask2 = reg_type_to_mask[b]; + + if (mask1 & mask2) + ra_add_reg_conflict(regs, + base + a, base + b); + } + } + } + + /* We're done setting up */ + ra_set_finalize(regs, NULL); + + /* Transform the MIR into squeezed index form */ + mir_foreach_block(ctx, block) { + mir_foreach_instr_in_block(block, ins) { + if (ins->compact_branch) continue; + + ins->ssa_args.dest = find_or_allocate_temp(ctx, ins->ssa_args.dest); + ins->ssa_args.src0 = find_or_allocate_temp(ctx, ins->ssa_args.src0); + + if (!ins->ssa_args.inline_constant) + ins->ssa_args.src1 = find_or_allocate_temp(ctx, ins->ssa_args.src1); + + } + } + + /* No register allocation to do with no SSA */ + + if (!ctx->temp_count) + return NULL; + + /* Let's actually do register allocation */ + int nodes = ctx->temp_count; + struct ra_graph *g = ra_alloc_interference_graph(regs, nodes); + + /* Determine minimum size needed to hold values, to indirectly + * determine class */ + + unsigned *found_class = calloc(sizeof(unsigned), ctx->temp_count); + + mir_foreach_block(ctx, block) { + mir_foreach_instr_in_block(block, ins) { + if (ins->compact_branch) continue; + if (ins->ssa_args.dest < 0) continue; + if (ins->ssa_args.dest >= SSA_FIXED_MINIMUM) continue; + + int class = util_logbase2(ins->mask) + 1; + + /* Use the largest class if there's ambiguity, this + * handles partial writes */ + + int dest = ins->ssa_args.dest; + found_class[dest] = MAX2(found_class[dest], class); + } + } + + for (unsigned i = 0; i < ctx->temp_count; ++i) { + unsigned class = found_class[i]; + if (!class) continue; + ra_set_node_class(g, i, classes[class - 1]); + } + + /* Determine liveness */ + + int *live_start = malloc(nodes * sizeof(int)); + int *live_end = malloc(nodes * sizeof(int)); + + /* Initialize as non-existent */ + + for (int i = 0; i < nodes; ++i) { + live_start[i] = live_end[i] = -1; + } + + int d = 0; + + mir_foreach_block(ctx, block) { + mir_foreach_instr_in_block(block, ins) { + if (ins->compact_branch) continue; + + /* Dest is < 0 for st_vary instructions, which break + * the usual SSA conventions. 
Liveness analysis doesn't + * make sense on these instructions, so skip them to + * avoid memory corruption */ + + if (ins->ssa_args.dest < 0) continue; + + if (ins->ssa_args.dest < SSA_FIXED_MINIMUM) { + /* If this destination is not yet live, it is + * now since we just wrote it */ + + int dest = ins->ssa_args.dest; + + if (live_start[dest] == -1) + live_start[dest] = d; + } + + /* Since we just used a source, the source might be + * dead now. Scan the rest of the block for + * invocations, and if there are none, the source dies + * */ + + int sources[2] = { + ins->ssa_args.src0, ins->ssa_args.src1 + }; + + for (int src = 0; src < 2; ++src) { + int s = sources[src]; + + if (s < 0) continue; + + if (s >= SSA_FIXED_MINIMUM) continue; + + if (!mir_is_live_after(ctx, block, ins, s)) { + live_end[s] = d; + } + } + + ++d; + } + } + + /* If a node still hasn't been killed, kill it now */ + + for (int i = 0; i < nodes; ++i) { + /* live_start == -1 most likely indicates a pinned output */ + + if (live_end[i] == -1) + live_end[i] = d; + } + + /* Setup interference between nodes that are live at the same time */ + + for (int i = 0; i < nodes; ++i) { + for (int j = i + 1; j < nodes; ++j) { + bool j_overlaps_i = live_start[j] < live_end[i]; + bool i_overlaps_j = live_end[j] < live_start[i]; + + if (i_overlaps_j || j_overlaps_i) + ra_add_node_interference(g, i, j); + } + } + + ra_set_select_reg_callback(g, midgard_ra_select_callback, NULL); + + if (!ra_allocate(g)) { + unreachable("Error allocating registers\n"); + } + + /* Cleanup */ + free(live_start); + free(live_end); + + return g; +} + +/* Once registers have been decided via register allocation + * (allocate_registers), we need to rewrite the MIR to use registers instead of + * indices */ + +static void +install_registers_instr( + compiler_context *ctx, + struct ra_graph *g, + midgard_instruction *ins) +{ + ssa_args args = ins->ssa_args; + + switch (ins->type) { + case TAG_ALU_4: { + int adjusted_src = args.inline_constant ? -1 : args.src1; + struct phys_reg src1 = index_to_reg(ctx, g, args.src0); + struct phys_reg src2 = index_to_reg(ctx, g, adjusted_src); + struct phys_reg dest = index_to_reg(ctx, g, args.dest); + + unsigned uncomposed_mask = ins->mask; + ins->mask = compose_writemask(uncomposed_mask, dest); + + /* Adjust the dest mask if necessary. Mostly this is a no-op + * but it matters for dot products */ + dest.mask = effective_writemask(&ins->alu, ins->mask); + + midgard_vector_alu_src mod1 = + vector_alu_from_unsigned(ins->alu.src1); + mod1.swizzle = compose_swizzle(mod1.swizzle, uncomposed_mask, src1, dest); + ins->alu.src1 = vector_alu_srco_unsigned(mod1); + + ins->registers.src1_reg = src1.reg; + + ins->registers.src2_imm = args.inline_constant; + + if (args.inline_constant) { + /* Encode inline 16-bit constant. 
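+ * As a worked illustration of the arithmetic below (value arbitrary):
+ * inline_constant = 0x1234 gives src2_reg = 0x1234 >> 11 = 2,
+ * lower_11 = 0x234, imm = (0x2 | (0x34 << 3)) = 0x1a2,
+ * and src2 = 0x1a2 << 2 = 0x688.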
See disassembler for + * where the algorithm is from */ + + ins->registers.src2_reg = ins->inline_constant >> 11; + + int lower_11 = ins->inline_constant & ((1 << 12) - 1); + uint16_t imm = ((lower_11 >> 8) & 0x7) | + ((lower_11 & 0xFF) << 3); + + ins->alu.src2 = imm << 2; + } else { + midgard_vector_alu_src mod2 = + vector_alu_from_unsigned(ins->alu.src2); + mod2.swizzle = compose_swizzle( + mod2.swizzle, uncomposed_mask, src2, dest); + ins->alu.src2 = vector_alu_srco_unsigned(mod2); + + ins->registers.src2_reg = src2.reg; + } + + ins->registers.out_reg = dest.reg; + break; + } + + case TAG_LOAD_STORE_4: { + if (OP_IS_STORE_VARY(ins->load_store.op)) { + /* TODO: use ssa_args for st_vary */ + ins->load_store.reg = 0; + } else { + /* Which physical register we read off depends on + * whether we are loading or storing -- think about the + * logical dataflow */ + + unsigned r = OP_IS_STORE(ins->load_store.op) ? + args.src0 : args.dest; + struct phys_reg src = index_to_reg(ctx, g, r); + + ins->load_store.reg = src.reg; + + ins->load_store.swizzle = compose_swizzle( + ins->load_store.swizzle, 0xF, + default_phys_reg(0), src); + + ins->mask = compose_writemask( + ins->mask, src); + } + + break; + } + + default: + break; + } +} + +void +install_registers(compiler_context *ctx, struct ra_graph *g) +{ + mir_foreach_block(ctx, block) { + mir_foreach_instr_in_block(block, ins) { + if (ins->compact_branch) continue; + install_registers_instr(ctx, g, ins); + } + } + +} diff --git a/src/panfrost/midgard/midgard_ra_pipeline.c b/src/panfrost/midgard/midgard_ra_pipeline.c new file mode 100644 index 00000000000..cd64bdf29e5 --- /dev/null +++ b/src/panfrost/midgard/midgard_ra_pipeline.c @@ -0,0 +1,96 @@ +/* + * Copyright (C) 2019 Alyssa Rosenzweig <[email protected]> + * Copyright (C) 2019 Collabora, Ltd. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "compiler.h" + +/* Creates pipeline registers. This is a prepass run before the main register + * allocator but after scheduling, once bundles are created. It works by + * iterating the scheduled IR, checking if a value is ever used after the end + * of the current bundle. If it is not, it is promoted to a bundle-specific + * pipeline register. + * + * Pipeline registers are only written from the first two stages of the + * pipeline (vmul/sadd) lasting the duration of the bundle only. There are two + * 128-bit pipeline registers available (r24/r25). 
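+ * As an illustrative sketch (indices arbitrary): in a bundle containing
+ *
+ *     fmul 5, 1, 2
+ *     fadd 6, 5, 3
+ *
+ * where index 5 is never read after the bundle, the pass rewrites 5 to
+ * r24, so the intermediate value never needs a work register.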
The upshot is that no actual + * register allocation is needed; we can _always_ promote a value to a pipeline + * register, liveness permitting. This greatly simplifies the logic of this + * passing, negating the need for a proper RA like work registers. + */ + +static bool +mir_pipeline_ins( + compiler_context *ctx, + midgard_block *block, + midgard_bundle *bundle, unsigned i, + unsigned pipeline_count) +{ + midgard_instruction *ins = bundle->instructions[i]; + unsigned dest = ins->ssa_args.dest; + + /* Check to make sure we're legal */ + + if (ins->compact_branch) + return false; + + /* Don't allow non-SSA. Pipelining registers is theoretically possible, + * but the analysis is much hairier, so don't bother quite yet */ + if ((dest < 0) || (dest >= ctx->func->impl->ssa_alloc)) + return false; + + /* Make sure they're not lying to us. Blend shaders lie. TODO: Fix your + * bad code Alyssa */ + + if (mir_has_multiple_writes(ctx, dest)) + return false; + + /* We want to know if we live after this bundle, so check if + * we're live after the last instruction of the bundle */ + + midgard_instruction *end = bundle->instructions[ + bundle->instruction_count - 1]; + + if (mir_is_live_after(ctx, block, end, ins->ssa_args.dest)) + return false; + + /* We're only live in this bundle -- pipeline! */ + + mir_rewrite_index(ctx, dest, SSA_FIXED_REGISTER(24 + pipeline_count)); + + return true; +} + +void +mir_create_pipeline_registers(compiler_context *ctx) +{ + mir_foreach_block(ctx, block) { + mir_foreach_bundle_in_block(block, bundle) { + if (!mir_is_alu_bundle(bundle)) continue; + if (bundle->instruction_count < 2) continue; + + /* Only first 2 instructions could pipeline */ + bool succ = mir_pipeline_ins(ctx, block, bundle, 0, 0); + mir_pipeline_ins(ctx, block, bundle, 1, succ); + } + } +} diff --git a/src/panfrost/midgard/midgard_schedule.c b/src/panfrost/midgard/midgard_schedule.c new file mode 100644 index 00000000000..7a3841e4d44 --- /dev/null +++ b/src/panfrost/midgard/midgard_schedule.c @@ -0,0 +1,541 @@ +/* + * Copyright (C) 2018-2019 Alyssa Rosenzweig <[email protected]> + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "compiler.h" +#include "midgard_ops.h" +#include "util/u_memory.h" + +/* Create a mask of accessed components from a swizzle to figure out vector + * dependencies */ + +static unsigned +swizzle_to_access_mask(unsigned swizzle) +{ + unsigned component_mask = 0; + + for (int i = 0; i < 4; ++i) { + unsigned c = (swizzle >> (2 * i)) & 3; + component_mask |= (1 << c); + } + + return component_mask; +} + +/* Does the mask cover more than a scalar? */ + +static bool +is_single_component_mask(unsigned mask) +{ + int components = 0; + + for (int c = 0; c < 8; ++c) { + if (mask & (1 << c)) + components++; + } + + return components == 1; +} + +/* Checks for an SSA data hazard between two adjacent instructions, keeping in + * mind that we are a vector architecture and we can write to different + * components simultaneously */ + +static bool +can_run_concurrent_ssa(midgard_instruction *first, midgard_instruction *second) +{ + /* Each instruction reads some registers and writes to a register. See + * where the first writes */ + + /* Figure out where exactly we wrote to */ + int source = first->ssa_args.dest; + int source_mask = first->mask; + + /* As long as the second doesn't read from the first, we're okay */ + if (second->ssa_args.src0 == source) { + if (first->type == TAG_ALU_4) { + /* Figure out which components we just read from */ + + int q = second->alu.src1; + midgard_vector_alu_src *m = (midgard_vector_alu_src *) &q; + + /* Check if there are components in common, and fail if so */ + if (swizzle_to_access_mask(m->swizzle) & source_mask) + return false; + } else + return false; + + } + + if (second->ssa_args.src1 == source) + return false; + + /* Otherwise, it's safe in that regard. Another data hazard is both + * writing to the same place, of course */ + + if (second->ssa_args.dest == source) { + /* ...but only if the components overlap */ + + if (second->mask & source_mask) + return false; + } + + /* ...That's it */ + return true; +} + +static bool +midgard_has_hazard( + midgard_instruction **segment, unsigned segment_size, + midgard_instruction *ains) +{ + for (int s = 0; s < segment_size; ++s) + if (!can_run_concurrent_ssa(segment[s], ains)) + return true; + + return false; + + +} + +/* Schedules, but does not emit, a single basic block. 
After scheduling, the + * final tag and size of the block are known, which are necessary for branching + * */ + +static midgard_bundle +schedule_bundle(compiler_context *ctx, midgard_block *block, midgard_instruction *ins, int *skip) +{ + int instructions_emitted = 0, packed_idx = 0; + midgard_bundle bundle = { 0 }; + + uint8_t tag = ins->type; + + /* Default to the instruction's tag */ + bundle.tag = tag; + + switch (ins->type) { + case TAG_ALU_4: { + uint32_t control = 0; + size_t bytes_emitted = sizeof(control); + + /* TODO: Constant combining */ + int index = 0, last_unit = 0; + + /* Previous instructions, for the purpose of parallelism */ + midgard_instruction *segment[4] = {0}; + int segment_size = 0; + + instructions_emitted = -1; + midgard_instruction *pins = ins; + + unsigned constant_count = 0; + + for (;;) { + midgard_instruction *ains = pins; + + /* Advance instruction pointer */ + if (index) { + ains = mir_next_op(pins); + pins = ains; + } + + /* Out-of-work condition */ + if ((struct list_head *) ains == &block->instructions) + break; + + /* Ensure that the chain can continue */ + if (ains->type != TAG_ALU_4) break; + + /* If there's already something in the bundle and we + * have weird scheduler constraints, break now */ + if (ains->precede_break && index) break; + + /* According to the presentation "The ARM + * Mali-T880 Mobile GPU" from HotChips 27, + * there are two pipeline stages. Branching + * position determined experimentally. Lines + * are executed in parallel: + * + * [ VMUL ] [ SADD ] + * [ VADD ] [ SMUL ] [ LUT ] [ BRANCH ] + * + * Verify that there are no ordering dependencies here. + * + * TODO: Allow for parallelism!!! + */ + + /* Pick a unit for it if it doesn't force a particular unit */ + + int unit = ains->unit; + + if (!unit) { + int op = ains->alu.op; + int units = alu_opcode_props[op].props; + + bool scalarable = units & UNITS_SCALAR; + bool could_scalar = is_single_component_mask(ains->mask); + + /* Only 16/32-bit can run on a scalar unit */ + could_scalar &= ains->alu.reg_mode != midgard_reg_mode_8; + could_scalar &= ains->alu.reg_mode != midgard_reg_mode_64; + could_scalar &= ains->alu.dest_override == midgard_dest_override_none; + + if (ains->alu.reg_mode == midgard_reg_mode_16) { + /* If we're running in 16-bit mode, we + * can't have any 8-bit sources on the + * scalar unit (since the scalar unit + * doesn't understand 8-bit) */ + + midgard_vector_alu_src s1 = + vector_alu_from_unsigned(ains->alu.src1); + + could_scalar &= !s1.half; + + if (!ains->ssa_args.inline_constant) { + midgard_vector_alu_src s2 = + vector_alu_from_unsigned(ains->alu.src2); + + could_scalar &= !s2.half; + } + + } + + bool scalar = could_scalar && scalarable; + + /* TODO: Check ahead-of-time for other scalar + * hazards that otherwise get aborted out */ + + if (scalar) + assert(units & UNITS_SCALAR); + + if (!scalar) { + if (last_unit >= UNIT_VADD) { + if (units & UNIT_VLUT) + unit = UNIT_VLUT; + else + break; + } else { + if ((units & UNIT_VMUL) && last_unit < UNIT_VMUL) + unit = UNIT_VMUL; + else if ((units & UNIT_VADD) && !(control & UNIT_VADD)) + unit = UNIT_VADD; + else if (units & UNIT_VLUT) + unit = UNIT_VLUT; + else + break; + } + } else { + if (last_unit >= UNIT_VADD) { + if ((units & UNIT_SMUL) && !(control & UNIT_SMUL)) + unit = UNIT_SMUL; + else if (units & UNIT_VLUT) + unit = UNIT_VLUT; + else + break; + } else { + if ((units & UNIT_SADD) && !(control & UNIT_SADD) && !midgard_has_hazard(segment, segment_size, ains)) + unit = UNIT_SADD; + else if (units & 
UNIT_SMUL) + unit = ((units & UNIT_VMUL) && !(control & UNIT_VMUL)) ? UNIT_VMUL : UNIT_SMUL; + else if ((units & UNIT_VADD) && !(control & UNIT_VADD)) + unit = UNIT_VADD; + else + break; + } + } + + assert(unit & units); + } + + /* Late unit check, this time for encoding (not parallelism) */ + if (unit <= last_unit) break; + + /* Clear the segment */ + if (last_unit < UNIT_VADD && unit >= UNIT_VADD) + segment_size = 0; + + if (midgard_has_hazard(segment, segment_size, ains)) + break; + + /* We're good to go -- emit the instruction */ + ains->unit = unit; + + segment[segment_size++] = ains; + + /* We try to reuse constants if possible, by adjusting + * the swizzle */ + + if (ains->has_blend_constant) { + /* Everything conflicts with the blend constant */ + if (bundle.has_embedded_constants) + break; + + bundle.has_blend_constant = 1; + bundle.has_embedded_constants = 1; + } else if (ains->has_constants && ains->alu.reg_mode == midgard_reg_mode_16) { + /* TODO: DRY with the analysis pass */ + + if (bundle.has_blend_constant) + break; + + if (constant_count) + break; + + /* TODO: Fix packing XXX */ + uint16_t *bundles = (uint16_t *) bundle.constants; + uint32_t *constants = (uint32_t *) ains->constants; + + /* Copy them wholesale */ + for (unsigned i = 0; i < 4; ++i) + bundles[i] = constants[i]; + + bundle.has_embedded_constants = true; + constant_count = 4; + } else if (ains->has_constants) { + /* By definition, blend constants conflict with + * everything, so if there are already + * constants we break the bundle *now* */ + + if (bundle.has_blend_constant) + break; + + /* For anything but blend constants, we can do + * proper analysis, however */ + + /* TODO: Mask by which are used */ + uint32_t *constants = (uint32_t *) ains->constants; + uint32_t *bundles = (uint32_t *) bundle.constants; + + uint32_t indices[4] = { 0 }; + bool break_bundle = false; + + for (unsigned i = 0; i < 4; ++i) { + uint32_t cons = constants[i]; + bool constant_found = false; + + /* Search for the constant */ + for (unsigned j = 0; j < constant_count; ++j) { + if (bundles[j] != cons) + continue; + + /* We found it, reuse */ + indices[i] = j; + constant_found = true; + break; + } + + if (constant_found) + continue; + + /* We didn't find it, so allocate it */ + unsigned idx = constant_count++; + + if (idx >= 4) { + /* Uh-oh, out of space */ + break_bundle = true; + break; + } + + /* We have space, copy it in! */ + bundles[idx] = cons; + indices[i] = idx; + } + + if (break_bundle) + break; + + /* Cool, we have it in. So use indices as a + * swizzle */ + + unsigned swizzle = SWIZZLE_FROM_ARRAY(indices); + unsigned r_constant = SSA_FIXED_REGISTER(REGISTER_CONSTANT); + + if (ains->ssa_args.src0 == r_constant) + ains->alu.src1 = vector_alu_apply_swizzle(ains->alu.src1, swizzle); + + if (ains->ssa_args.src1 == r_constant) + ains->alu.src2 = vector_alu_apply_swizzle(ains->alu.src2, swizzle); + + bundle.has_embedded_constants = true; + } + + if (ains->unit & UNITS_ANY_VECTOR) { + bytes_emitted += sizeof(midgard_reg_info); + bytes_emitted += sizeof(midgard_vector_alu); + } else if (ains->compact_branch) { + /* All of r0 has to be written out along with + * the branch writeout */ + + if (ains->writeout) { + /* The rules for when "bare" writeout + * is safe are when all components are + * r0 are written out in the final + * bundle, earlier than VLUT, where any + * register dependencies of r0 are from + * an earlier bundle. We can't verify + * this before RA, so we don't try. 
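+ * Instead, the branch is required to open its own bundle (the
+ * index check below) and a move targeting r0 is injected into the
+ * VMUL slot, so r0 is written within this bundle regardless.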
*/ + + if (index != 0) + break; + + /* Inject a move */ + midgard_instruction ins = v_mov(0, blank_alu_src, SSA_FIXED_REGISTER(0)); + ins.unit = UNIT_VMUL; + control |= ins.unit; + + /* TODO don't leak */ + midgard_instruction *move = + mem_dup(&ins, sizeof(midgard_instruction)); + bytes_emitted += sizeof(midgard_reg_info); + bytes_emitted += sizeof(midgard_vector_alu); + bundle.instructions[packed_idx++] = move; + } + + if (ains->unit == ALU_ENAB_BRANCH) { + bytes_emitted += sizeof(midgard_branch_extended); + } else { + bytes_emitted += sizeof(ains->br_compact); + } + } else { + bytes_emitted += sizeof(midgard_reg_info); + bytes_emitted += sizeof(midgard_scalar_alu); + } + + /* Defer marking until after writing to allow for break */ + control |= ains->unit; + last_unit = ains->unit; + ++instructions_emitted; + ++index; + } + + int padding = 0; + + /* Pad ALU op to nearest word */ + + if (bytes_emitted & 15) { + padding = 16 - (bytes_emitted & 15); + bytes_emitted += padding; + } + + /* Constants must always be quadwords */ + if (bundle.has_embedded_constants) + bytes_emitted += 16; + + /* Size ALU instruction for tag */ + bundle.tag = (TAG_ALU_4) + (bytes_emitted / 16) - 1; + bundle.padding = padding; + bundle.control = bundle.tag | control; + + break; + } + + case TAG_LOAD_STORE_4: { + /* Load store instructions have two words at once. If + * we only have one queued up, we need to NOP pad. + * Otherwise, we store both in succession to save space + * and cycles -- letting them go in parallel -- skip + * the next. The usefulness of this optimisation is + * greatly dependent on the quality of the instruction + * scheduler. + */ + + midgard_instruction *next_op = mir_next_op(ins); + + if ((struct list_head *) next_op != &block->instructions && next_op->type == TAG_LOAD_STORE_4) { + /* TODO: Concurrency check */ + instructions_emitted++; + } + + break; + } + + case TAG_TEXTURE_4: { + /* Which tag we use depends on the shader stage */ + bool in_frag = ctx->stage == MESA_SHADER_FRAGMENT; + bundle.tag = in_frag ? TAG_TEXTURE_4 : TAG_TEXTURE_4_VTX; + break; + } + + default: + unreachable("Unknown tag"); + break; + } + + /* Copy the instructions into the bundle */ + bundle.instruction_count = instructions_emitted + 1 + packed_idx; + + midgard_instruction *uins = ins; + for (; packed_idx < bundle.instruction_count; ++packed_idx) { + bundle.instructions[packed_idx] = uins; + uins = mir_next_op(uins); + } + + *skip = instructions_emitted; + + return bundle; +} + +/* Schedule a single block by iterating its instruction to create bundles. + * While we go, tally about the bundle sizes to compute the block size. */ + +static void +schedule_block(compiler_context *ctx, midgard_block *block) +{ + util_dynarray_init(&block->bundles, NULL); + + block->quadword_count = 0; + + mir_foreach_instr_in_block(block, ins) { + int skip; + midgard_bundle bundle = schedule_bundle(ctx, block, ins, &skip); + util_dynarray_append(&block->bundles, midgard_bundle, bundle); + + if (bundle.has_blend_constant) { + /* TODO: Multiblock? 
*/ + int quadwords_within_block = block->quadword_count + quadword_size(bundle.tag) - 1; + ctx->blend_constant_offset = quadwords_within_block * 0x10; + } + + while(skip--) + ins = mir_next_op(ins); + + block->quadword_count += quadword_size(bundle.tag); + } + + block->is_scheduled = true; +} + +void +schedule_program(compiler_context *ctx) +{ + /* We run RA prior to scheduling */ + + mir_foreach_block(ctx, block) { + schedule_block(ctx, block); + } + + /* Pipeline registers creation is a prepass before RA */ + mir_create_pipeline_registers(ctx); + + struct ra_graph *g = allocate_registers(ctx); + install_registers(ctx, g); +} diff --git a/src/panfrost/midgard/mir.c b/src/panfrost/midgard/mir.c new file mode 100644 index 00000000000..6adc1350c0a --- /dev/null +++ b/src/panfrost/midgard/mir.c @@ -0,0 +1,53 @@ +/* + * Copyright (C) 2019 Alyssa Rosenzweig <[email protected]> + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "compiler.h" + +void +mir_rewrite_index_src(compiler_context *ctx, unsigned old, unsigned new) +{ + mir_foreach_instr_global(ctx, ins) { + if (ins->ssa_args.src0 == old) + ins->ssa_args.src0 = new; + + if (ins->ssa_args.src1 == old && + !ins->ssa_args.inline_constant) + ins->ssa_args.src1 = new; + } +} + +void +mir_rewrite_index_dst(compiler_context *ctx, unsigned old, unsigned new) +{ + mir_foreach_instr_global(ctx, ins) { + if (ins->ssa_args.dest == old) + ins->ssa_args.dest = new; + } +} + +void +mir_rewrite_index(compiler_context *ctx, unsigned old, unsigned new) +{ + mir_rewrite_index_src(ctx, old, new); + mir_rewrite_index_dst(ctx, old, new); +} |
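A minimal usage sketch for the rewrite helpers above. The pass and its
assumptions are hypothetical (a modifier-free move between block-local SSA
indices); only mir_rewrite_index and the ssa_args fields are taken from the
sources in this patch:

static void
example_propagate_copy(compiler_context *ctx, midgard_instruction *mov)
{
        /* Assumes 'mov' copies one SSA index to another with no
         * modifiers or inline constant attached */
        unsigned old = mov->ssa_args.dest;
        unsigned new = mov->ssa_args.src0;

        /* Point every reader and writer of 'old' at 'new'; the move
         * itself becomes a self-move a later cleanup could drop */
        mir_rewrite_index(ctx, old, new);
}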