Diffstat (limited to 'src/broadcom/compiler')
-rw-r--r-- | src/broadcom/compiler/nir_to_vir.c             | 1963
-rw-r--r-- | src/broadcom/compiler/qpu_schedule.c           | 1362
-rw-r--r-- | src/broadcom/compiler/qpu_validate.c           |  208
-rw-r--r-- | src/broadcom/compiler/v3d_compiler.c           |   43
-rw-r--r-- | src/broadcom/compiler/v3d_compiler.h           |  927
-rw-r--r-- | src/broadcom/compiler/v3d_nir_lower_io.c       |  176
-rw-r--r-- | src/broadcom/compiler/vir.c                    |  907
-rw-r--r-- | src/broadcom/compiler/vir_dump.c               |  339
-rw-r--r-- | src/broadcom/compiler/vir_live_variables.c     |  340
-rw-r--r-- | src/broadcom/compiler/vir_lower_uniforms.c     |  209
-rw-r--r-- | src/broadcom/compiler/vir_opt_copy_propagate.c |  233
-rw-r--r-- | src/broadcom/compiler/vir_opt_dead_code.c      |  162
-rw-r--r-- | src/broadcom/compiler/vir_register_allocate.c  |  254
-rw-r--r-- | src/broadcom/compiler/vir_to_qpu.c             |  359
14 files changed, 7482 insertions, 0 deletions
diff --git a/src/broadcom/compiler/nir_to_vir.c b/src/broadcom/compiler/nir_to_vir.c new file mode 100644 index 00000000000..3fd914fa863 --- /dev/null +++ b/src/broadcom/compiler/nir_to_vir.c @@ -0,0 +1,1963 @@ +/* + * Copyright © 2016 Broadcom + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include <inttypes.h> +#include "util/u_format.h" +#include "util/u_math.h" +#include "util/u_memory.h" +#include "util/ralloc.h" +#include "util/hash_table.h" +#include "compiler/nir/nir.h" +#include "compiler/nir/nir_builder.h" +#include "v3d_compiler.h" + +/* We don't do any address packing. */ +#define __gen_user_data void +#define __gen_address_type uint32_t +#define __gen_address_offset(reloc) (*reloc) +#define __gen_emit_reloc(cl, reloc) +#include "cle/v3d_packet_v33_pack.h" + +static struct qreg +ntq_get_src(struct v3d_compile *c, nir_src src, int i); +static void +ntq_emit_cf_list(struct v3d_compile *c, struct exec_list *list); + +static void +resize_qreg_array(struct v3d_compile *c, + struct qreg **regs, + uint32_t *size, + uint32_t decl_size) +{ + if (*size >= decl_size) + return; + + uint32_t old_size = *size; + *size = MAX2(*size * 2, decl_size); + *regs = reralloc(c, *regs, struct qreg, *size); + if (!*regs) { + fprintf(stderr, "Malloc failure\n"); + abort(); + } + + for (uint32_t i = old_size; i < *size; i++) + (*regs)[i] = c->undef; +} + +static struct qreg +vir_SFU(struct v3d_compile *c, int waddr, struct qreg src) +{ + vir_FMOV_dest(c, vir_reg(QFILE_MAGIC, waddr), src); + return vir_FMOV(c, vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_R4)); +} + +static struct qreg +vir_LDTMU(struct v3d_compile *c) +{ + vir_NOP(c)->qpu.sig.ldtmu = true; + return vir_MOV(c, vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_R4)); +} + +static struct qreg +indirect_uniform_load(struct v3d_compile *c, nir_intrinsic_instr *intr) +{ + struct qreg indirect_offset = ntq_get_src(c, intr->src[0], 0); + uint32_t offset = nir_intrinsic_base(intr); + struct v3d_ubo_range *range = NULL; + unsigned i; + + for (i = 0; i < c->num_ubo_ranges; i++) { + range = &c->ubo_ranges[i]; + if (offset >= range->src_offset && + offset < range->src_offset + range->size) { + break; + } + } + /* The driver-location-based offset always has to be within a declared + * uniform range. 
+ */ + assert(i != c->num_ubo_ranges); + if (!c->ubo_range_used[i]) { + c->ubo_range_used[i] = true; + range->dst_offset = c->next_ubo_dst_offset; + c->next_ubo_dst_offset += range->size; + } + + offset -= range->src_offset; + + if (range->dst_offset + offset != 0) { + indirect_offset = vir_ADD(c, indirect_offset, + vir_uniform_ui(c, range->dst_offset + + offset)); + } + + /* Adjust for where we stored the TGSI register base. */ + vir_ADD_dest(c, + vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUA), + vir_uniform(c, QUNIFORM_UBO_ADDR, 0), + indirect_offset); + + return vir_LDTMU(c); +} + +static struct qreg * +ntq_init_ssa_def(struct v3d_compile *c, nir_ssa_def *def) +{ + struct qreg *qregs = ralloc_array(c->def_ht, struct qreg, + def->num_components); + _mesa_hash_table_insert(c->def_ht, def, qregs); + return qregs; +} + +/** + * This function is responsible for getting VIR results into the associated + * storage for a NIR instruction. + * + * If it's a NIR SSA def, then we just set the associated hash table entry to + * the new result. + * + * If it's a NIR reg, then we need to update the existing qreg assigned to the + * NIR destination with the incoming value. To do that without introducing + * new MOVs, we require that the incoming qreg either be a uniform, or be + * SSA-defined by the previous VIR instruction in the block and rewritable by + * this function. That lets us sneak ahead and insert the SF flag beforehand + * (knowing that the previous instruction doesn't depend on flags) and rewrite + * its destination to be the NIR reg's destination + */ +static void +ntq_store_dest(struct v3d_compile *c, nir_dest *dest, int chan, + struct qreg result) +{ + struct qinst *last_inst = NULL; + if (!list_empty(&c->cur_block->instructions)) + last_inst = (struct qinst *)c->cur_block->instructions.prev; + + assert(result.file == QFILE_UNIF || + (result.file == QFILE_TEMP && + last_inst && last_inst == c->defs[result.index])); + + if (dest->is_ssa) { + assert(chan < dest->ssa.num_components); + + struct qreg *qregs; + struct hash_entry *entry = + _mesa_hash_table_search(c->def_ht, &dest->ssa); + + if (entry) + qregs = entry->data; + else + qregs = ntq_init_ssa_def(c, &dest->ssa); + + qregs[chan] = result; + } else { + nir_register *reg = dest->reg.reg; + assert(dest->reg.base_offset == 0); + assert(reg->num_array_elems == 0); + struct hash_entry *entry = + _mesa_hash_table_search(c->def_ht, reg); + struct qreg *qregs = entry->data; + + /* Insert a MOV if the source wasn't an SSA def in the + * previous instruction. + */ + if (result.file == QFILE_UNIF) { + result = vir_MOV(c, result); + last_inst = c->defs[result.index]; + } + + /* We know they're both temps, so just rewrite index. */ + c->defs[last_inst->dst.index] = NULL; + last_inst->dst.index = qregs[chan].index; + + /* If we're in control flow, then make this update of the reg + * conditional on the execution mask. + */ + if (c->execute.file != QFILE_NULL) { + last_inst->dst.index = qregs[chan].index; + + /* Set the flags to the current exec mask. To insert + * the flags push, we temporarily remove our SSA + * instruction. 
+ */ + list_del(&last_inst->link); + vir_PF(c, c->execute, V3D_QPU_PF_PUSHZ); + list_addtail(&last_inst->link, + &c->cur_block->instructions); + + vir_set_cond(last_inst, V3D_QPU_COND_IFA); + last_inst->cond_is_exec_mask = true; + } + } +} + +static struct qreg +ntq_get_src(struct v3d_compile *c, nir_src src, int i) +{ + struct hash_entry *entry; + if (src.is_ssa) { + entry = _mesa_hash_table_search(c->def_ht, src.ssa); + assert(i < src.ssa->num_components); + } else { + nir_register *reg = src.reg.reg; + entry = _mesa_hash_table_search(c->def_ht, reg); + assert(reg->num_array_elems == 0); + assert(src.reg.base_offset == 0); + assert(i < reg->num_components); + } + + struct qreg *qregs = entry->data; + return qregs[i]; +} + +static struct qreg +ntq_get_alu_src(struct v3d_compile *c, nir_alu_instr *instr, + unsigned src) +{ + assert(util_is_power_of_two(instr->dest.write_mask)); + unsigned chan = ffs(instr->dest.write_mask) - 1; + struct qreg r = ntq_get_src(c, instr->src[src].src, + instr->src[src].swizzle[chan]); + + assert(!instr->src[src].abs); + assert(!instr->src[src].negate); + + return r; +}; + +static inline struct qreg +vir_SAT(struct v3d_compile *c, struct qreg val) +{ + return vir_FMAX(c, + vir_FMIN(c, val, vir_uniform_f(c, 1.0)), + vir_uniform_f(c, 0.0)); +} + +static struct qreg +ntq_umul(struct v3d_compile *c, struct qreg src0, struct qreg src1) +{ + vir_MULTOP(c, src0, src1); + return vir_UMUL24(c, src0, src1); +} + +static struct qreg +ntq_minify(struct v3d_compile *c, struct qreg size, struct qreg level) +{ + return vir_MAX(c, vir_SHR(c, size, level), vir_uniform_ui(c, 1)); +} + +static void +ntq_emit_txs(struct v3d_compile *c, nir_tex_instr *instr) +{ + unsigned unit = instr->texture_index; + int lod_index = nir_tex_instr_src_index(instr, nir_tex_src_lod); + int dest_size = nir_tex_instr_dest_size(instr); + + struct qreg lod = c->undef; + if (lod_index != -1) + lod = ntq_get_src(c, instr->src[lod_index].src, 0); + + for (int i = 0; i < dest_size; i++) { + assert(i < 3); + enum quniform_contents contents; + + if (instr->is_array && i == dest_size - 1) + contents = QUNIFORM_TEXTURE_ARRAY_SIZE; + else + contents = QUNIFORM_TEXTURE_WIDTH + i; + + struct qreg size = vir_uniform(c, contents, unit); + + switch (instr->sampler_dim) { + case GLSL_SAMPLER_DIM_1D: + case GLSL_SAMPLER_DIM_2D: + case GLSL_SAMPLER_DIM_3D: + case GLSL_SAMPLER_DIM_CUBE: + /* Don't minify the array size. */ + if (!(instr->is_array && i == dest_size - 1)) { + size = ntq_minify(c, size, lod); + } + break; + + case GLSL_SAMPLER_DIM_RECT: + /* There's no LOD field for rects */ + break; + + default: + unreachable("Bad sampler type"); + } + + ntq_store_dest(c, &instr->dest, i, size); + } +} + +static void +ntq_emit_tex(struct v3d_compile *c, nir_tex_instr *instr) +{ + unsigned unit = instr->texture_index; + + /* Since each texture sampling op requires uploading uniforms to + * reference the texture, there's no HW support for texture size and + * you just upload uniforms containing the size. 
+ */ + switch (instr->op) { + case nir_texop_query_levels: + ntq_store_dest(c, &instr->dest, 0, + vir_uniform(c, QUNIFORM_TEXTURE_LEVELS, unit)); + return; + case nir_texop_txs: + ntq_emit_txs(c, instr); + return; + default: + break; + } + + struct V3D33_TEXTURE_UNIFORM_PARAMETER_0_CFG_MODE1 p0_unpacked = { + V3D33_TEXTURE_UNIFORM_PARAMETER_0_CFG_MODE1_header, + + .fetch_sample_mode = instr->op == nir_texop_txf, + }; + + switch (instr->sampler_dim) { + case GLSL_SAMPLER_DIM_1D: + if (instr->is_array) + p0_unpacked.lookup_type = TEXTURE_1D_ARRAY; + else + p0_unpacked.lookup_type = TEXTURE_1D; + break; + case GLSL_SAMPLER_DIM_2D: + case GLSL_SAMPLER_DIM_RECT: + if (instr->is_array) + p0_unpacked.lookup_type = TEXTURE_2D_ARRAY; + else + p0_unpacked.lookup_type = TEXTURE_2D; + break; + case GLSL_SAMPLER_DIM_3D: + p0_unpacked.lookup_type = TEXTURE_3D; + break; + case GLSL_SAMPLER_DIM_CUBE: + p0_unpacked.lookup_type = TEXTURE_CUBE_MAP; + break; + default: + unreachable("Bad sampler type"); + } + + struct qreg coords[5]; + int next_coord = 0; + for (unsigned i = 0; i < instr->num_srcs; i++) { + switch (instr->src[i].src_type) { + case nir_tex_src_coord: + for (int j = 0; j < instr->coord_components; j++) { + coords[next_coord++] = + ntq_get_src(c, instr->src[i].src, j); + } + if (instr->coord_components < 2) + coords[next_coord++] = vir_uniform_f(c, 0.5); + break; + case nir_tex_src_bias: + coords[next_coord++] = + ntq_get_src(c, instr->src[i].src, 0); + + p0_unpacked.bias_supplied = true; + break; + case nir_tex_src_lod: + /* XXX: Needs base level addition */ + coords[next_coord++] = + ntq_get_src(c, instr->src[i].src, 0); + + if (instr->op != nir_texop_txf && + instr->op != nir_texop_tg4) { + p0_unpacked.disable_autolod_use_bias_only = true; + } + break; + case nir_tex_src_comparator: + coords[next_coord++] = + ntq_get_src(c, instr->src[i].src, 0); + + p0_unpacked.shadow = true; + break; + + case nir_tex_src_offset: { + nir_const_value *offset = + nir_src_as_const_value(instr->src[i].src); + p0_unpacked.texel_offset_for_s_coordinate = + offset->i32[0]; + + if (instr->coord_components >= 2) + p0_unpacked.texel_offset_for_t_coordinate = + offset->i32[1]; + + if (instr->coord_components >= 3) + p0_unpacked.texel_offset_for_r_coordinate = + offset->i32[2]; + break; + } + + default: + unreachable("unknown texture source"); + } + } + + uint32_t p0_packed; + V3D33_TEXTURE_UNIFORM_PARAMETER_0_CFG_MODE1_pack(NULL, + (uint8_t *)&p0_packed, + &p0_unpacked); + + /* There is no native support for GL texture rectangle coordinates, so + * we have to rescale from ([0, width], [0, height]) to ([0, 1], [0, + * 1]). 
+ */ + if (instr->sampler_dim == GLSL_SAMPLER_DIM_RECT) { + coords[0] = vir_FMUL(c, coords[0], + vir_uniform(c, QUNIFORM_TEXRECT_SCALE_X, + unit)); + coords[1] = vir_FMUL(c, coords[1], + vir_uniform(c, QUNIFORM_TEXRECT_SCALE_Y, + unit)); + } + + struct qreg texture_u[] = { + vir_uniform(c, QUNIFORM_TEXTURE_CONFIG_P0_0 + unit, p0_packed), + vir_uniform(c, QUNIFORM_TEXTURE_CONFIG_P1, unit), + }; + uint32_t next_texture_u = 0; + + for (int i = 0; i < next_coord; i++) { + struct qreg dst; + + if (i == next_coord - 1) + dst = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUL); + else + dst = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMU); + + struct qinst *tmu = vir_MOV_dest(c, dst, coords[i]); + + if (i < 2) { + tmu->has_implicit_uniform = true; + tmu->src[vir_get_implicit_uniform_src(tmu)] = + texture_u[next_texture_u++]; + } + } + + bool return_16 = (c->key->tex[unit].return_size == 16 || + p0_unpacked.shadow); + + struct qreg return_values[4]; + for (int i = 0; i < c->key->tex[unit].return_channels; i++) + return_values[i] = vir_LDTMU(c); + /* Swizzling .zw of an RG texture should give undefined results, not + * crash the compiler. + */ + for (int i = c->key->tex[unit].return_channels; i < 4; i++) + return_values[i] = c->undef; + + for (int i = 0; i < nir_tex_instr_dest_size(instr); i++) { + struct qreg chan; + + if (return_16) { + STATIC_ASSERT(PIPE_SWIZZLE_X == 0); + chan = return_values[i / 2]; + + enum v3d_qpu_input_unpack unpack; + if (i & 1) + unpack = V3D_QPU_UNPACK_H; + else + unpack = V3D_QPU_UNPACK_L; + + chan = vir_FMOV(c, chan); + vir_set_unpack(c->defs[chan.index], 0, unpack); + } else { + chan = vir_MOV(c, return_values[i]); + } + ntq_store_dest(c, &instr->dest, i, chan); + } +} + +static struct qreg +ntq_fsincos(struct v3d_compile *c, struct qreg src, bool is_cos) +{ + struct qreg input = vir_FMUL(c, src, vir_uniform_f(c, 1.0f / M_PI)); + if (is_cos) + input = vir_FADD(c, input, vir_uniform_f(c, 0.5)); + + struct qreg periods = vir_FROUND(c, input); + struct qreg sin_output = vir_SFU(c, V3D_QPU_WADDR_SIN, + vir_FSUB(c, input, periods)); + return vir_XOR(c, sin_output, vir_SHL(c, + vir_FTOIN(c, periods), + vir_uniform_ui(c, -1))); +} + +static struct qreg +ntq_fsign(struct v3d_compile *c, struct qreg src) +{ + struct qreg t = vir_get_temp(c); + + vir_MOV_dest(c, t, vir_uniform_f(c, 0.0)); + vir_PF(c, vir_FMOV(c, src), V3D_QPU_PF_PUSHZ); + vir_MOV_cond(c, V3D_QPU_COND_IFNA, t, vir_uniform_f(c, 1.0)); + vir_PF(c, vir_FMOV(c, src), V3D_QPU_PF_PUSHN); + vir_MOV_cond(c, V3D_QPU_COND_IFA, t, vir_uniform_f(c, -1.0)); + return vir_MOV(c, t); +} + +static struct qreg +ntq_isign(struct v3d_compile *c, struct qreg src) +{ + struct qreg t = vir_get_temp(c); + + vir_MOV_dest(c, t, vir_uniform_ui(c, 0)); + vir_PF(c, vir_MOV(c, src), V3D_QPU_PF_PUSHZ); + vir_MOV_cond(c, V3D_QPU_COND_IFNA, t, vir_uniform_ui(c, 1)); + vir_PF(c, vir_MOV(c, src), V3D_QPU_PF_PUSHN); + vir_MOV_cond(c, V3D_QPU_COND_IFA, t, vir_uniform_ui(c, -1)); + return vir_MOV(c, t); +} + +static void +emit_fragcoord_input(struct v3d_compile *c, int attr) +{ + c->inputs[attr * 4 + 0] = vir_FXCD(c); + c->inputs[attr * 4 + 1] = vir_FYCD(c); + c->inputs[attr * 4 + 2] = c->payload_z; + c->inputs[attr * 4 + 3] = vir_SFU(c, V3D_QPU_WADDR_RECIP, + c->payload_w); +} + +static struct qreg +emit_fragment_varying(struct v3d_compile *c, nir_variable *var, + uint8_t swizzle) +{ + struct qreg vary = vir_reg(QFILE_VARY, ~0); + struct qreg r5 = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_R5); + + /* For gl_PointCoord input or distance along a line, we'll be called + * 
with no nir_variable, and we don't count toward VPM size so we + * don't track an input slot. + */ + if (!var) { + return vir_FADD(c, vir_FMUL(c, vary, c->payload_w), r5); + } + + int i = c->num_inputs++; + c->input_slots[i] = v3d_slot_from_slot_and_component(var->data.location, + swizzle); + + switch (var->data.interpolation) { + case INTERP_MODE_NONE: + case INTERP_MODE_SMOOTH: + if (var->data.centroid) { + return vir_FADD(c, vir_FMUL(c, vary, + c->payload_w_centroid), r5); + } else { + return vir_FADD(c, vir_FMUL(c, vary, c->payload_w), r5); + } + case INTERP_MODE_NOPERSPECTIVE: + /* C appears after the mov from the varying. + XXX: improve ldvary setup. + */ + return vir_FADD(c, vir_MOV(c, vary), r5); + case INTERP_MODE_FLAT: + BITSET_SET(c->flat_shade_flags, i); + vir_MOV_dest(c, c->undef, vary); + return vir_MOV(c, r5); + default: + unreachable("Bad interp mode"); + } +} + +static void +emit_fragment_input(struct v3d_compile *c, int attr, nir_variable *var) +{ + for (int i = 0; i < glsl_get_vector_elements(var->type); i++) { + c->inputs[attr * 4 + i] = + emit_fragment_varying(c, var, i); + } +} + +static void +add_output(struct v3d_compile *c, + uint32_t decl_offset, + uint8_t slot, + uint8_t swizzle) +{ + uint32_t old_array_size = c->outputs_array_size; + resize_qreg_array(c, &c->outputs, &c->outputs_array_size, + decl_offset + 1); + + if (old_array_size != c->outputs_array_size) { + c->output_slots = reralloc(c, + c->output_slots, + struct v3d_varying_slot, + c->outputs_array_size); + } + + c->output_slots[decl_offset] = + v3d_slot_from_slot_and_component(slot, swizzle); +} + +static void +declare_uniform_range(struct v3d_compile *c, uint32_t start, uint32_t size) +{ + unsigned array_id = c->num_ubo_ranges++; + if (array_id >= c->ubo_ranges_array_size) { + c->ubo_ranges_array_size = MAX2(c->ubo_ranges_array_size * 2, + array_id + 1); + c->ubo_ranges = reralloc(c, c->ubo_ranges, + struct v3d_ubo_range, + c->ubo_ranges_array_size); + c->ubo_range_used = reralloc(c, c->ubo_range_used, + bool, + c->ubo_ranges_array_size); + } + + c->ubo_ranges[array_id].dst_offset = 0; + c->ubo_ranges[array_id].src_offset = start; + c->ubo_ranges[array_id].size = size; + c->ubo_range_used[array_id] = false; +} + +/** + * If compare_instr is a valid comparison instruction, emits the + * compare_instr's comparison and returns the sel_instr's return value based + * on the compare_instr's result. 
+ */ +static bool +ntq_emit_comparison(struct v3d_compile *c, struct qreg *dest, + nir_alu_instr *compare_instr, + nir_alu_instr *sel_instr) +{ + struct qreg src0 = ntq_get_alu_src(c, compare_instr, 0); + struct qreg src1 = ntq_get_alu_src(c, compare_instr, 1); + bool cond_invert = false; + + switch (compare_instr->op) { + case nir_op_feq: + case nir_op_seq: + vir_PF(c, vir_FCMP(c, src0, src1), V3D_QPU_PF_PUSHZ); + break; + case nir_op_ieq: + vir_PF(c, vir_XOR(c, src0, src1), V3D_QPU_PF_PUSHZ); + break; + + case nir_op_fne: + case nir_op_sne: + vir_PF(c, vir_FCMP(c, src0, src1), V3D_QPU_PF_PUSHZ); + cond_invert = true; + break; + case nir_op_ine: + vir_PF(c, vir_XOR(c, src0, src1), V3D_QPU_PF_PUSHZ); + cond_invert = true; + break; + + case nir_op_fge: + case nir_op_sge: + vir_PF(c, vir_FCMP(c, src1, src0), V3D_QPU_PF_PUSHC); + break; + case nir_op_ige: + vir_PF(c, vir_MIN(c, src1, src0), V3D_QPU_PF_PUSHC); + cond_invert = true; + break; + case nir_op_uge: + vir_PF(c, vir_SUB(c, src0, src1), V3D_QPU_PF_PUSHC); + cond_invert = true; + break; + + case nir_op_slt: + case nir_op_flt: + vir_PF(c, vir_FCMP(c, src0, src1), V3D_QPU_PF_PUSHN); + break; + case nir_op_ilt: + vir_PF(c, vir_MIN(c, src1, src0), V3D_QPU_PF_PUSHC); + break; + case nir_op_ult: + vir_PF(c, vir_SUB(c, src0, src1), V3D_QPU_PF_PUSHC); + break; + + default: + return false; + } + + enum v3d_qpu_cond cond = (cond_invert ? + V3D_QPU_COND_IFNA : + V3D_QPU_COND_IFA); + + switch (sel_instr->op) { + case nir_op_seq: + case nir_op_sne: + case nir_op_sge: + case nir_op_slt: + *dest = vir_SEL(c, cond, + vir_uniform_f(c, 1.0), vir_uniform_f(c, 0.0)); + break; + + case nir_op_bcsel: + *dest = vir_SEL(c, cond, + ntq_get_alu_src(c, sel_instr, 1), + ntq_get_alu_src(c, sel_instr, 2)); + break; + + default: + *dest = vir_SEL(c, cond, + vir_uniform_ui(c, ~0), vir_uniform_ui(c, 0)); + break; + } + + /* Make the temporary for nir_store_dest(). */ + *dest = vir_MOV(c, *dest); + + return true; +} + +/** + * Attempts to fold a comparison generating a boolean result into the + * condition code for selecting between two values, instead of comparing the + * boolean result against 0 to generate the condition code. + */ +static struct qreg ntq_emit_bcsel(struct v3d_compile *c, nir_alu_instr *instr, + struct qreg *src) +{ + if (!instr->src[0].src.is_ssa) + goto out; + if (instr->src[0].src.ssa->parent_instr->type != nir_instr_type_alu) + goto out; + nir_alu_instr *compare = + nir_instr_as_alu(instr->src[0].src.ssa->parent_instr); + if (!compare) + goto out; + + struct qreg dest; + if (ntq_emit_comparison(c, &dest, compare, instr)) + return dest; + +out: + vir_PF(c, src[0], V3D_QPU_PF_PUSHZ); + return vir_MOV(c, vir_SEL(c, V3D_QPU_COND_IFNA, src[1], src[2])); +} + + +static void +ntq_emit_alu(struct v3d_compile *c, nir_alu_instr *instr) +{ + /* This should always be lowered to ALU operations for V3D. */ + assert(!instr->dest.saturate); + + /* Vectors are special in that they have non-scalarized writemasks, + * and just take the first swizzle channel for each argument in order + * into each writemask channel. 
+ */ + if (instr->op == nir_op_vec2 || + instr->op == nir_op_vec3 || + instr->op == nir_op_vec4) { + struct qreg srcs[4]; + for (int i = 0; i < nir_op_infos[instr->op].num_inputs; i++) + srcs[i] = ntq_get_src(c, instr->src[i].src, + instr->src[i].swizzle[0]); + for (int i = 0; i < nir_op_infos[instr->op].num_inputs; i++) + ntq_store_dest(c, &instr->dest.dest, i, + vir_MOV(c, srcs[i])); + return; + } + + /* General case: We can just grab the one used channel per src. */ + struct qreg src[nir_op_infos[instr->op].num_inputs]; + for (int i = 0; i < nir_op_infos[instr->op].num_inputs; i++) { + src[i] = ntq_get_alu_src(c, instr, i); + } + + struct qreg result; + + switch (instr->op) { + case nir_op_fmov: + case nir_op_imov: + result = vir_MOV(c, src[0]); + break; + case nir_op_fmul: + result = vir_FMUL(c, src[0], src[1]); + break; + case nir_op_fadd: + result = vir_FADD(c, src[0], src[1]); + break; + case nir_op_fsub: + result = vir_FSUB(c, src[0], src[1]); + break; + case nir_op_fmin: + result = vir_FMIN(c, src[0], src[1]); + break; + case nir_op_fmax: + result = vir_FMAX(c, src[0], src[1]); + break; + + case nir_op_f2i32: + result = vir_FTOIZ(c, src[0]); + break; + case nir_op_f2u32: + result = vir_FTOUZ(c, src[0]); + break; + case nir_op_i2f32: + result = vir_ITOF(c, src[0]); + break; + case nir_op_u2f32: + result = vir_UTOF(c, src[0]); + break; + case nir_op_b2f: + result = vir_AND(c, src[0], vir_uniform_f(c, 1.0)); + break; + case nir_op_b2i: + result = vir_AND(c, src[0], vir_uniform_ui(c, 1)); + break; + case nir_op_i2b: + case nir_op_f2b: + vir_PF(c, src[0], V3D_QPU_PF_PUSHZ); + result = vir_MOV(c, vir_SEL(c, V3D_QPU_COND_IFNA, + vir_uniform_ui(c, ~0), + vir_uniform_ui(c, 0))); + break; + + case nir_op_iadd: + result = vir_ADD(c, src[0], src[1]); + break; + case nir_op_ushr: + result = vir_SHR(c, src[0], src[1]); + break; + case nir_op_isub: + result = vir_SUB(c, src[0], src[1]); + break; + case nir_op_ishr: + result = vir_ASR(c, src[0], src[1]); + break; + case nir_op_ishl: + result = vir_SHL(c, src[0], src[1]); + break; + case nir_op_imin: + result = vir_MIN(c, src[0], src[1]); + break; + case nir_op_umin: + result = vir_UMIN(c, src[0], src[1]); + break; + case nir_op_imax: + result = vir_MAX(c, src[0], src[1]); + break; + case nir_op_umax: + result = vir_UMAX(c, src[0], src[1]); + break; + case nir_op_iand: + result = vir_AND(c, src[0], src[1]); + break; + case nir_op_ior: + result = vir_OR(c, src[0], src[1]); + break; + case nir_op_ixor: + result = vir_XOR(c, src[0], src[1]); + break; + case nir_op_inot: + result = vir_NOT(c, src[0]); + break; + + case nir_op_imul: + result = ntq_umul(c, src[0], src[1]); + break; + + case nir_op_seq: + case nir_op_sne: + case nir_op_sge: + case nir_op_slt: + case nir_op_feq: + case nir_op_fne: + case nir_op_fge: + case nir_op_flt: + case nir_op_ieq: + case nir_op_ine: + case nir_op_ige: + case nir_op_uge: + case nir_op_ilt: + case nir_op_ult: + if (!ntq_emit_comparison(c, &result, instr, instr)) { + fprintf(stderr, "Bad comparison instruction\n"); + } + break; + + case nir_op_bcsel: + result = ntq_emit_bcsel(c, instr, src); + break; + case nir_op_fcsel: + vir_PF(c, src[0], V3D_QPU_PF_PUSHZ); + result = vir_MOV(c, vir_SEL(c, V3D_QPU_COND_IFNA, + src[1], src[2])); + break; + + case nir_op_frcp: + result = vir_SFU(c, V3D_QPU_WADDR_RECIP, src[0]); + break; + case nir_op_frsq: + result = vir_SFU(c, V3D_QPU_WADDR_RSQRT, src[0]); + break; + case nir_op_fexp2: + result = vir_SFU(c, V3D_QPU_WADDR_EXP, src[0]); + break; + case nir_op_flog2: + result = 
vir_SFU(c, V3D_QPU_WADDR_LOG, src[0]); + break; + + case nir_op_fceil: + result = vir_FCEIL(c, src[0]); + break; + case nir_op_ffloor: + result = vir_FFLOOR(c, src[0]); + break; + case nir_op_fround_even: + result = vir_FROUND(c, src[0]); + break; + case nir_op_ftrunc: + result = vir_FTRUNC(c, src[0]); + break; + case nir_op_ffract: + result = vir_FSUB(c, src[0], vir_FFLOOR(c, src[0])); + break; + + case nir_op_fsin: + result = ntq_fsincos(c, src[0], false); + break; + case nir_op_fcos: + result = ntq_fsincos(c, src[0], true); + break; + + case nir_op_fsign: + result = ntq_fsign(c, src[0]); + break; + case nir_op_isign: + result = ntq_isign(c, src[0]); + break; + + case nir_op_fabs: { + result = vir_FMOV(c, src[0]); + vir_set_unpack(c->defs[result.index], 0, V3D_QPU_UNPACK_ABS); + break; + } + + case nir_op_iabs: + result = vir_MAX(c, src[0], + vir_SUB(c, vir_uniform_ui(c, 0), src[0])); + break; + + case nir_op_fddx: + case nir_op_fddx_coarse: + case nir_op_fddx_fine: + result = vir_FDX(c, src[0]); + break; + + case nir_op_fddy: + case nir_op_fddy_coarse: + case nir_op_fddy_fine: + result = vir_FDY(c, src[0]); + break; + + default: + fprintf(stderr, "unknown NIR ALU inst: "); + nir_print_instr(&instr->instr, stderr); + fprintf(stderr, "\n"); + abort(); + } + + /* We have a scalar result, so the instruction should only have a + * single channel written to. + */ + assert(util_is_power_of_two(instr->dest.write_mask)); + ntq_store_dest(c, &instr->dest.dest, + ffs(instr->dest.write_mask) - 1, result); +} + +static void +emit_frag_end(struct v3d_compile *c) +{ + uint32_t discard_cond = V3D_QPU_COND_NONE; + if (c->s->info.fs.uses_discard) { + vir_PF(c, vir_MOV(c, c->discard), V3D_QPU_PF_PUSHZ); + discard_cond = V3D_QPU_COND_IFA; + } + + /* XXX + if (c->output_sample_mask_index != -1) { + vir_MS_MASK(c, c->outputs[c->output_sample_mask_index]); + } + */ + + if (c->output_position_index != -1) { + struct qinst *inst = vir_MOV_dest(c, + vir_reg(QFILE_TLBU, 0), + c->outputs[c->output_position_index]); + + inst->src[vir_get_implicit_uniform_src(inst)] = + vir_uniform_ui(c, + (1 << 2) | /* per pixel */ + (2 << 6) /* type */ | + 0xffffff00); + } + + /* XXX: Performance improvement: Merge Z write and color writes TLB + * uniform setup + */ + + if (c->output_color_var) { + nir_variable *var = c->output_color_var; + struct qreg *color = &c->outputs[var->data.driver_location * 4]; + int num_components = glsl_get_vector_elements(var->type); + uint32_t conf = ~0; + struct qinst *inst; + + assert(num_components != 0); + switch (glsl_get_base_type(var->type)) { + case GLSL_TYPE_UINT: + case GLSL_TYPE_INT: + conf = ((1 << 2) | /* per pixel */ + ((7 - 0) << 3) | /* rt */ + (1 << 6) /* type */ | + (num_components - 1) | + 0xffffff00); + + + inst = vir_MOV_dest(c, vir_reg(QFILE_TLBU, 0), color[0]); + vir_set_cond(inst, discard_cond); + inst->src[vir_get_implicit_uniform_src(inst)] = + vir_uniform_ui(c, conf); + + for (int i = 1; i < num_components; i++) { + inst = vir_MOV_dest(c, vir_reg(QFILE_TLB, 0), + color[i]); + vir_set_cond(inst, discard_cond); + } + break; + + default: { + struct qreg r = color[0]; + struct qreg g = color[1]; + struct qreg b = color[2]; + struct qreg a = color[3]; + + if (c->fs_key->swap_color_rb) { + r = color[2]; + b = color[0]; + } + + inst = vir_VFPACK_dest(c, vir_reg(QFILE_TLB, 0), r, g); + vir_set_cond(inst, discard_cond); + inst = vir_VFPACK_dest(c, vir_reg(QFILE_TLB, 0), b, a); + vir_set_cond(inst, discard_cond); + break; + } + } + } +} + +static void 
+emit_scaled_viewport_write(struct v3d_compile *c, struct qreg rcp_w) +{ + for (int i = 0; i < 2; i++) { + struct qreg coord = c->outputs[c->output_position_index + i]; + coord = vir_FMUL(c, coord, + vir_uniform(c, QUNIFORM_VIEWPORT_X_SCALE + i, + 0)); + coord = vir_FMUL(c, coord, rcp_w); + vir_FTOIN_dest(c, vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_VPM), + coord); + } + +} + +static void +emit_zs_write(struct v3d_compile *c, struct qreg rcp_w) +{ + struct qreg zscale = vir_uniform(c, QUNIFORM_VIEWPORT_Z_SCALE, 0); + struct qreg zoffset = vir_uniform(c, QUNIFORM_VIEWPORT_Z_OFFSET, 0); + + vir_FADD_dest(c, vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_VPM), + vir_FMUL(c, vir_FMUL(c, + c->outputs[c->output_position_index + 2], + zscale), + rcp_w), + zoffset); +} + +static void +emit_rcp_wc_write(struct v3d_compile *c, struct qreg rcp_w) +{ + vir_VPM_WRITE(c, rcp_w); +} + +static void +emit_point_size_write(struct v3d_compile *c) +{ + struct qreg point_size; + + if (c->output_point_size_index != -1) + point_size = c->outputs[c->output_point_size_index]; + else + point_size = vir_uniform_f(c, 1.0); + + /* Workaround: HW-2726 PTB does not handle zero-size points (BCM2835, + * BCM21553). + */ + point_size = vir_FMAX(c, point_size, vir_uniform_f(c, .125)); + + vir_VPM_WRITE(c, point_size); +} + +static void +emit_vpm_write_setup(struct v3d_compile *c) +{ + uint32_t packed; + struct V3D33_VPM_GENERIC_BLOCK_WRITE_SETUP unpacked = { + V3D33_VPM_GENERIC_BLOCK_WRITE_SETUP_header, + + .horiz = true, + .laned = false, + .segs = true, + .stride = 1, + .size = VPM_SETUP_SIZE_32_BIT, + .addr = 0, + }; + + V3D33_VPM_GENERIC_BLOCK_WRITE_SETUP_pack(NULL, + (uint8_t *)&packed, + &unpacked); + vir_VPMSETUP(c, vir_uniform_ui(c, packed)); +} + +static void +emit_vert_end(struct v3d_compile *c) +{ + struct qreg rcp_w = vir_SFU(c, V3D_QPU_WADDR_RECIP, + c->outputs[c->output_position_index + 3]); + + emit_vpm_write_setup(c); + + if (c->vs_key->is_coord) { + for (int i = 0; i < 4; i++) + vir_VPM_WRITE(c, c->outputs[c->output_position_index + i]); + emit_scaled_viewport_write(c, rcp_w); + if (c->vs_key->per_vertex_point_size) { + emit_point_size_write(c); + /* emit_rcp_wc_write(c, rcp_w); */ + } + /* XXX: Z-only rendering */ + if (0) + emit_zs_write(c, rcp_w); + } else { + emit_scaled_viewport_write(c, rcp_w); + emit_zs_write(c, rcp_w); + emit_rcp_wc_write(c, rcp_w); + if (c->vs_key->per_vertex_point_size) + emit_point_size_write(c); + } + + for (int i = 0; i < c->vs_key->num_fs_inputs; i++) { + struct v3d_varying_slot input = c->vs_key->fs_inputs[i]; + int j; + + for (j = 0; j < c->num_outputs; j++) { + struct v3d_varying_slot output = c->output_slots[j]; + + if (!memcmp(&input, &output, sizeof(input))) { + vir_VPM_WRITE(c, c->outputs[j]); + break; + } + } + /* Emit padding if we didn't find a declared VS output for + * this FS input. 
+ */ + if (j == c->num_outputs) + vir_VPM_WRITE(c, vir_uniform_f(c, 0.0)); + } +} + +void +v3d_optimize_nir(struct nir_shader *s) +{ + bool progress; + + do { + progress = false; + + NIR_PASS_V(s, nir_lower_vars_to_ssa); + NIR_PASS(progress, s, nir_lower_alu_to_scalar); + NIR_PASS(progress, s, nir_lower_phis_to_scalar); + NIR_PASS(progress, s, nir_copy_prop); + NIR_PASS(progress, s, nir_opt_remove_phis); + NIR_PASS(progress, s, nir_opt_dce); + NIR_PASS(progress, s, nir_opt_dead_cf); + NIR_PASS(progress, s, nir_opt_cse); + NIR_PASS(progress, s, nir_opt_peephole_select, 8); + NIR_PASS(progress, s, nir_opt_algebraic); + NIR_PASS(progress, s, nir_opt_constant_folding); + NIR_PASS(progress, s, nir_opt_undef); + } while (progress); +} + +static int +driver_location_compare(const void *in_a, const void *in_b) +{ + const nir_variable *const *a = in_a; + const nir_variable *const *b = in_b; + + return (*a)->data.driver_location - (*b)->data.driver_location; +} + +static struct qreg +ntq_emit_vpm_read(struct v3d_compile *c, + uint32_t *num_components_queued, + uint32_t *remaining, + uint32_t vpm_index) +{ + struct qreg vpm = vir_reg(QFILE_VPM, vpm_index); + + if (*num_components_queued != 0) { + (*num_components_queued)--; + c->num_inputs++; + return vir_MOV(c, vpm); + } + + uint32_t num_components = MIN2(*remaining, 32); + + struct V3D33_VPM_GENERIC_BLOCK_READ_SETUP unpacked = { + V3D33_VPM_GENERIC_BLOCK_READ_SETUP_header, + + .horiz = true, + .laned = false, + /* If the field is 0, that means a read count of 32. */ + .num = num_components & 31, + .segs = true, + .stride = 1, + .size = VPM_SETUP_SIZE_32_BIT, + .addr = c->num_inputs, + }; + + uint32_t packed; + V3D33_VPM_GENERIC_BLOCK_READ_SETUP_pack(NULL, + (uint8_t *)&packed, + &unpacked); + vir_VPMSETUP(c, vir_uniform_ui(c, packed)); + + *num_components_queued = num_components - 1; + *remaining -= num_components; + c->num_inputs++; + + return vir_MOV(c, vpm); +} + +static void +ntq_setup_inputs(struct v3d_compile *c) +{ + unsigned num_entries = 0; + unsigned num_components = 0; + nir_foreach_variable(var, &c->s->inputs) { + num_entries++; + num_components += glsl_get_components(var->type); + } + + nir_variable *vars[num_entries]; + + unsigned i = 0; + nir_foreach_variable(var, &c->s->inputs) + vars[i++] = var; + + /* Sort the variables so that we emit the input setup in + * driver_location order. This is required for VPM reads, whose data + * is fetched into the VPM in driver_location (TGSI register index) + * order. 
+ */ + qsort(&vars, num_entries, sizeof(*vars), driver_location_compare); + + uint32_t vpm_components_queued = 0; + if (c->s->stage == MESA_SHADER_VERTEX) { + bool uses_iid = c->s->info.system_values_read & + (1ull << SYSTEM_VALUE_INSTANCE_ID); + bool uses_vid = c->s->info.system_values_read & + (1ull << SYSTEM_VALUE_VERTEX_ID); + + num_components += uses_iid; + num_components += uses_vid; + + if (uses_iid) { + c->iid = ntq_emit_vpm_read(c, &vpm_components_queued, + &num_components, ~0); + } + + if (uses_vid) { + c->vid = ntq_emit_vpm_read(c, &vpm_components_queued, + &num_components, ~0); + } + } + + for (unsigned i = 0; i < num_entries; i++) { + nir_variable *var = vars[i]; + unsigned array_len = MAX2(glsl_get_length(var->type), 1); + unsigned loc = var->data.driver_location; + + assert(array_len == 1); + (void)array_len; + resize_qreg_array(c, &c->inputs, &c->inputs_array_size, + (loc + 1) * 4); + + if (c->s->stage == MESA_SHADER_FRAGMENT) { + if (var->data.location == VARYING_SLOT_POS) { + emit_fragcoord_input(c, loc); + } else if (var->data.location == VARYING_SLOT_PNTC || + (var->data.location >= VARYING_SLOT_VAR0 && + (c->fs_key->point_sprite_mask & + (1 << (var->data.location - + VARYING_SLOT_VAR0))))) { + c->inputs[loc * 4 + 0] = c->point_x; + c->inputs[loc * 4 + 1] = c->point_y; + } else { + emit_fragment_input(c, loc, var); + } + } else { + int var_components = glsl_get_components(var->type); + + for (int i = 0; i < var_components; i++) { + c->inputs[loc * 4 + i] = + ntq_emit_vpm_read(c, + &vpm_components_queued, + &num_components, + loc * 4 + i); + + } + c->vattr_sizes[loc] = var_components; + } + } + + if (c->s->stage == MESA_SHADER_VERTEX) { + assert(vpm_components_queued == 0); + assert(num_components == 0); + } +} + +static void +ntq_setup_outputs(struct v3d_compile *c) +{ + nir_foreach_variable(var, &c->s->outputs) { + unsigned array_len = MAX2(glsl_get_length(var->type), 1); + unsigned loc = var->data.driver_location * 4; + + assert(array_len == 1); + (void)array_len; + + for (int i = 0; i < 4; i++) + add_output(c, loc + i, var->data.location, i); + + if (c->s->stage == MESA_SHADER_FRAGMENT) { + switch (var->data.location) { + case FRAG_RESULT_COLOR: + case FRAG_RESULT_DATA0: + c->output_color_var = var; + break; + case FRAG_RESULT_DEPTH: + c->output_position_index = loc; + break; + case FRAG_RESULT_SAMPLE_MASK: + c->output_sample_mask_index = loc; + break; + } + } else { + switch (var->data.location) { + case VARYING_SLOT_POS: + c->output_position_index = loc; + break; + case VARYING_SLOT_PSIZ: + c->output_point_size_index = loc; + break; + } + } + } +} + +static void +ntq_setup_uniforms(struct v3d_compile *c) +{ + nir_foreach_variable(var, &c->s->uniforms) { + uint32_t vec4_count = glsl_count_attribute_slots(var->type, + false); + unsigned vec4_size = 4 * sizeof(float); + + declare_uniform_range(c, var->data.driver_location * vec4_size, + vec4_count * vec4_size); + + } +} + +/** + * Sets up the mapping from nir_register to struct qreg *. + * + * Each nir_register gets a struct qreg per 32-bit component being stored. 
+ */ +static void +ntq_setup_registers(struct v3d_compile *c, struct exec_list *list) +{ + foreach_list_typed(nir_register, nir_reg, node, list) { + unsigned array_len = MAX2(nir_reg->num_array_elems, 1); + struct qreg *qregs = ralloc_array(c->def_ht, struct qreg, + array_len * + nir_reg->num_components); + + _mesa_hash_table_insert(c->def_ht, nir_reg, qregs); + + for (int i = 0; i < array_len * nir_reg->num_components; i++) + qregs[i] = vir_get_temp(c); + } +} + +static void +ntq_emit_load_const(struct v3d_compile *c, nir_load_const_instr *instr) +{ + struct qreg *qregs = ntq_init_ssa_def(c, &instr->def); + for (int i = 0; i < instr->def.num_components; i++) + qregs[i] = vir_uniform_ui(c, instr->value.u32[i]); + + _mesa_hash_table_insert(c->def_ht, &instr->def, qregs); +} + +static void +ntq_emit_ssa_undef(struct v3d_compile *c, nir_ssa_undef_instr *instr) +{ + struct qreg *qregs = ntq_init_ssa_def(c, &instr->def); + + /* VIR needs there to be *some* value, so pick 0 (same as for + * ntq_setup_registers(). + */ + for (int i = 0; i < instr->def.num_components; i++) + qregs[i] = vir_uniform_ui(c, 0); +} + +static void +ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr) +{ + nir_const_value *const_offset; + unsigned offset; + + switch (instr->intrinsic) { + case nir_intrinsic_load_uniform: + assert(instr->num_components == 1); + const_offset = nir_src_as_const_value(instr->src[0]); + if (const_offset) { + offset = nir_intrinsic_base(instr) + const_offset->u32[0]; + assert(offset % 4 == 0); + /* We need dwords */ + offset = offset / 4; + ntq_store_dest(c, &instr->dest, 0, + vir_uniform(c, QUNIFORM_UNIFORM, + offset)); + } else { + ntq_store_dest(c, &instr->dest, 0, + indirect_uniform_load(c, instr)); + } + break; + + case nir_intrinsic_load_ubo: + for (int i = 0; i < instr->num_components; i++) { + int ubo = nir_src_as_const_value(instr->src[0])->u32[0]; + + /* Adjust for where we stored the TGSI register base. */ + vir_ADD_dest(c, + vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUA), + vir_uniform(c, QUNIFORM_UBO_ADDR, 1 + ubo), + vir_ADD(c, + ntq_get_src(c, instr->src[1], 0), + vir_uniform_ui(c, i * 4))); + + ntq_store_dest(c, &instr->dest, i, vir_LDTMU(c)); + } + break; + + const_offset = nir_src_as_const_value(instr->src[0]); + if (const_offset) { + offset = nir_intrinsic_base(instr) + const_offset->u32[0]; + assert(offset % 4 == 0); + /* We need dwords */ + offset = offset / 4; + ntq_store_dest(c, &instr->dest, 0, + vir_uniform(c, QUNIFORM_UNIFORM, + offset)); + } else { + ntq_store_dest(c, &instr->dest, 0, + indirect_uniform_load(c, instr)); + } + break; + + case nir_intrinsic_load_user_clip_plane: + for (int i = 0; i < instr->num_components; i++) { + ntq_store_dest(c, &instr->dest, i, + vir_uniform(c, QUNIFORM_USER_CLIP_PLANE, + nir_intrinsic_ucp_id(instr) * + 4 + i)); + } + break; + + case nir_intrinsic_load_alpha_ref_float: + ntq_store_dest(c, &instr->dest, 0, + vir_uniform(c, QUNIFORM_ALPHA_REF, 0)); + break; + + case nir_intrinsic_load_sample_mask_in: + ntq_store_dest(c, &instr->dest, 0, + vir_uniform(c, QUNIFORM_SAMPLE_MASK, 0)); + break; + + case nir_intrinsic_load_front_face: + /* The register contains 0 (front) or 1 (back), and we need to + * turn it into a NIR bool where true means front. 
+ */ + ntq_store_dest(c, &instr->dest, 0, + vir_ADD(c, + vir_uniform_ui(c, -1), + vir_REVF(c))); + break; + + case nir_intrinsic_load_instance_id: + ntq_store_dest(c, &instr->dest, 0, vir_MOV(c, c->iid)); + break; + + case nir_intrinsic_load_vertex_id: + ntq_store_dest(c, &instr->dest, 0, vir_MOV(c, c->vid)); + break; + + case nir_intrinsic_load_input: + const_offset = nir_src_as_const_value(instr->src[0]); + assert(const_offset && "v3d doesn't support indirect inputs"); + for (int i = 0; i < instr->num_components; i++) { + offset = nir_intrinsic_base(instr) + const_offset->u32[0]; + int comp = nir_intrinsic_component(instr) + i; + ntq_store_dest(c, &instr->dest, i, + vir_MOV(c, c->inputs[offset * 4 + comp])); + } + break; + + case nir_intrinsic_store_output: + const_offset = nir_src_as_const_value(instr->src[1]); + assert(const_offset && "v3d doesn't support indirect outputs"); + offset = ((nir_intrinsic_base(instr) + + const_offset->u32[0]) * 4 + + nir_intrinsic_component(instr)); + + for (int i = 0; i < instr->num_components; i++) { + c->outputs[offset + i] = + vir_MOV(c, ntq_get_src(c, instr->src[0], i)); + } + c->num_outputs = MAX2(c->num_outputs, + offset + instr->num_components); + break; + + case nir_intrinsic_discard: + if (c->execute.file != QFILE_NULL) { + vir_PF(c, c->execute, V3D_QPU_PF_PUSHZ); + vir_MOV_cond(c, V3D_QPU_COND_IFA, c->discard, + vir_uniform_ui(c, ~0)); + } else { + vir_MOV_dest(c, c->discard, vir_uniform_ui(c, ~0)); + } + break; + + case nir_intrinsic_discard_if: { + /* true (~0) if we're discarding */ + struct qreg cond = ntq_get_src(c, instr->src[0], 0); + + if (c->execute.file != QFILE_NULL) { + /* execute == 0 means the channel is active. Invert + * the condition so that we can use zero as "executing + * and discarding." + */ + vir_PF(c, vir_AND(c, c->execute, vir_NOT(c, cond)), + V3D_QPU_PF_PUSHZ); + vir_MOV_cond(c, V3D_QPU_COND_IFA, c->discard, cond); + } else { + vir_OR_dest(c, c->discard, c->discard, cond); + } + + break; + } + + default: + fprintf(stderr, "Unknown intrinsic: "); + nir_print_instr(&instr->instr, stderr); + fprintf(stderr, "\n"); + break; + } +} + +/* Clears (activates) the execute flags for any channels whose jump target + * matches this block. + */ +static void +ntq_activate_execute_for_block(struct v3d_compile *c) +{ + vir_PF(c, vir_SUB(c, c->execute, vir_uniform_ui(c, c->cur_block->index)), + V3D_QPU_PF_PUSHZ); + + vir_MOV_cond(c, V3D_QPU_COND_IFA, c->execute, vir_uniform_ui(c, 0)); +} + +static void +ntq_emit_if(struct v3d_compile *c, nir_if *if_stmt) +{ + nir_block *nir_else_block = nir_if_first_else_block(if_stmt); + bool empty_else_block = + (nir_else_block == nir_if_last_else_block(if_stmt) && + exec_list_is_empty(&nir_else_block->instr_list)); + + struct qblock *then_block = vir_new_block(c); + struct qblock *after_block = vir_new_block(c); + struct qblock *else_block; + if (empty_else_block) + else_block = after_block; + else + else_block = vir_new_block(c); + + bool was_top_level = false; + if (c->execute.file == QFILE_NULL) { + c->execute = vir_MOV(c, vir_uniform_ui(c, 0)); + was_top_level = true; + } + + /* Set A for executing (execute == 0) and jumping (if->condition == + * 0) channels, and then update execute flags for those to point to + * the ELSE block. 
+ */ + vir_PF(c, vir_OR(c, + c->execute, + ntq_get_src(c, if_stmt->condition, 0)), + V3D_QPU_PF_PUSHZ); + vir_MOV_cond(c, V3D_QPU_COND_IFA, + c->execute, + vir_uniform_ui(c, else_block->index)); + + /* Jump to ELSE if nothing is active for THEN, otherwise fall + * through. + */ + vir_PF(c, c->execute, V3D_QPU_PF_PUSHZ); + vir_BRANCH(c, V3D_QPU_BRANCH_COND_ALLNA); + vir_link_blocks(c->cur_block, else_block); + vir_link_blocks(c->cur_block, then_block); + + /* Process the THEN block. */ + vir_set_emit_block(c, then_block); + ntq_emit_cf_list(c, &if_stmt->then_list); + + if (!empty_else_block) { + /* Handle the end of the THEN block. First, all currently + * active channels update their execute flags to point to + * ENDIF + */ + vir_PF(c, c->execute, V3D_QPU_PF_PUSHZ); + vir_MOV_cond(c, V3D_QPU_COND_IFA, c->execute, + vir_uniform_ui(c, after_block->index)); + + /* If everything points at ENDIF, then jump there immediately. */ + vir_PF(c, vir_SUB(c, c->execute, + vir_uniform_ui(c, after_block->index)), + V3D_QPU_PF_PUSHZ); + vir_BRANCH(c, V3D_QPU_BRANCH_COND_ALLA); + vir_link_blocks(c->cur_block, after_block); + vir_link_blocks(c->cur_block, else_block); + + vir_set_emit_block(c, else_block); + ntq_activate_execute_for_block(c); + ntq_emit_cf_list(c, &if_stmt->else_list); + } + + vir_link_blocks(c->cur_block, after_block); + + vir_set_emit_block(c, after_block); + if (was_top_level) + c->execute = c->undef; + else + ntq_activate_execute_for_block(c); +} + +static void +ntq_emit_jump(struct v3d_compile *c, nir_jump_instr *jump) +{ + switch (jump->type) { + case nir_jump_break: + vir_PF(c, c->execute, V3D_QPU_PF_PUSHZ); + vir_MOV_cond(c, V3D_QPU_COND_IFA, c->execute, + vir_uniform_ui(c, c->loop_break_block->index)); + break; + + case nir_jump_continue: + vir_PF(c, c->execute, V3D_QPU_PF_PUSHZ); + vir_MOV_cond(c, V3D_QPU_COND_IFA, c->execute, + vir_uniform_ui(c, c->loop_cont_block->index)); + break; + + case nir_jump_return: + unreachable("All returns shouold be lowered\n"); + } +} + +static void +ntq_emit_instr(struct v3d_compile *c, nir_instr *instr) +{ + switch (instr->type) { + case nir_instr_type_alu: + ntq_emit_alu(c, nir_instr_as_alu(instr)); + break; + + case nir_instr_type_intrinsic: + ntq_emit_intrinsic(c, nir_instr_as_intrinsic(instr)); + break; + + case nir_instr_type_load_const: + ntq_emit_load_const(c, nir_instr_as_load_const(instr)); + break; + + case nir_instr_type_ssa_undef: + ntq_emit_ssa_undef(c, nir_instr_as_ssa_undef(instr)); + break; + + case nir_instr_type_tex: + ntq_emit_tex(c, nir_instr_as_tex(instr)); + break; + + case nir_instr_type_jump: + ntq_emit_jump(c, nir_instr_as_jump(instr)); + break; + + default: + fprintf(stderr, "Unknown NIR instr type: "); + nir_print_instr(instr, stderr); + fprintf(stderr, "\n"); + abort(); + } +} + +static void +ntq_emit_block(struct v3d_compile *c, nir_block *block) +{ + nir_foreach_instr(instr, block) { + ntq_emit_instr(c, instr); + } +} + +static void ntq_emit_cf_list(struct v3d_compile *c, struct exec_list *list); + +static void +ntq_emit_loop(struct v3d_compile *c, nir_loop *loop) +{ + bool was_top_level = false; + if (c->execute.file == QFILE_NULL) { + c->execute = vir_MOV(c, vir_uniform_ui(c, 0)); + was_top_level = true; + } + + struct qblock *save_loop_cont_block = c->loop_cont_block; + struct qblock *save_loop_break_block = c->loop_break_block; + + c->loop_cont_block = vir_new_block(c); + c->loop_break_block = vir_new_block(c); + + vir_link_blocks(c->cur_block, c->loop_cont_block); + vir_set_emit_block(c, c->loop_cont_block); + 
ntq_activate_execute_for_block(c); + + ntq_emit_cf_list(c, &loop->body); + + /* Re-enable any previous continues now, so our ANYA check below + * works. + * + * XXX: Use the .ORZ flags update, instead. + */ + vir_PF(c, vir_SUB(c, + c->execute, + vir_uniform_ui(c, c->loop_cont_block->index)), + V3D_QPU_PF_PUSHZ); + vir_MOV_cond(c, V3D_QPU_COND_IFA, c->execute, vir_uniform_ui(c, 0)); + + vir_PF(c, c->execute, V3D_QPU_PF_PUSHZ); + + vir_BRANCH(c, V3D_QPU_BRANCH_COND_ANYA); + vir_link_blocks(c->cur_block, c->loop_cont_block); + vir_link_blocks(c->cur_block, c->loop_break_block); + + vir_set_emit_block(c, c->loop_break_block); + if (was_top_level) + c->execute = c->undef; + else + ntq_activate_execute_for_block(c); + + c->loop_break_block = save_loop_break_block; + c->loop_cont_block = save_loop_cont_block; +} + +static void +ntq_emit_function(struct v3d_compile *c, nir_function_impl *func) +{ + fprintf(stderr, "FUNCTIONS not handled.\n"); + abort(); +} + +static void +ntq_emit_cf_list(struct v3d_compile *c, struct exec_list *list) +{ + foreach_list_typed(nir_cf_node, node, node, list) { + switch (node->type) { + case nir_cf_node_block: + ntq_emit_block(c, nir_cf_node_as_block(node)); + break; + + case nir_cf_node_if: + ntq_emit_if(c, nir_cf_node_as_if(node)); + break; + + case nir_cf_node_loop: + ntq_emit_loop(c, nir_cf_node_as_loop(node)); + break; + + case nir_cf_node_function: + ntq_emit_function(c, nir_cf_node_as_function(node)); + break; + + default: + fprintf(stderr, "Unknown NIR node type\n"); + abort(); + } + } +} + +static void +ntq_emit_impl(struct v3d_compile *c, nir_function_impl *impl) +{ + ntq_setup_registers(c, &impl->registers); + ntq_emit_cf_list(c, &impl->body); +} + +static void +nir_to_vir(struct v3d_compile *c) +{ + if (c->s->stage == MESA_SHADER_FRAGMENT) { + c->payload_w = vir_MOV(c, vir_reg(QFILE_REG, 0)); + c->payload_w_centroid = vir_MOV(c, vir_reg(QFILE_REG, 1)); + c->payload_z = vir_MOV(c, vir_reg(QFILE_REG, 2)); + + if (c->s->info.fs.uses_discard) + c->discard = vir_MOV(c, vir_uniform_ui(c, 0)); + + if (c->fs_key->is_points) { + c->point_x = emit_fragment_varying(c, NULL, 0); + c->point_y = emit_fragment_varying(c, NULL, 0); + } else if (c->fs_key->is_lines) { + c->line_x = emit_fragment_varying(c, NULL, 0); + } + } + + ntq_setup_inputs(c); + ntq_setup_outputs(c); + ntq_setup_uniforms(c); + ntq_setup_registers(c, &c->s->registers); + + /* Find the main function and emit the body. 
*/ + nir_foreach_function(function, c->s) { + assert(strcmp(function->name, "main") == 0); + assert(function->impl); + ntq_emit_impl(c, function->impl); + } +} + +const nir_shader_compiler_options v3d_nir_options = { + .lower_extract_byte = true, + .lower_extract_word = true, + .lower_bitfield_insert = true, + .lower_bitfield_extract = true, + .lower_ffma = true, + .lower_flrp32 = true, + .lower_fpow = true, + .lower_fsat = true, + .lower_fsqrt = true, + .lower_negate = true, + .native_integers = true, +}; + + +#if 0 +static int +count_nir_instrs(nir_shader *nir) +{ + int count = 0; + nir_foreach_function(function, nir) { + if (!function->impl) + continue; + nir_foreach_block(block, function->impl) { + nir_foreach_instr(instr, block) + count++; + } + } + return count; +} +#endif + +void +v3d_nir_to_vir(struct v3d_compile *c) +{ + if (V3D_DEBUG & (V3D_DEBUG_NIR | + v3d_debug_flag_for_shader_stage(c->s->stage))) { + fprintf(stderr, "%s prog %d/%d NIR:\n", + vir_get_stage_name(c), + c->program_id, c->variant_id); + nir_print_shader(c->s, stderr); + } + + nir_to_vir(c); + + switch (c->s->stage) { + case MESA_SHADER_FRAGMENT: + emit_frag_end(c); + break; + case MESA_SHADER_VERTEX: + emit_vert_end(c); + break; + default: + unreachable("bad stage"); + } + + if (V3D_DEBUG & (V3D_DEBUG_VIR | + v3d_debug_flag_for_shader_stage(c->s->stage))) { + fprintf(stderr, "%s prog %d/%d pre-opt VIR:\n", + vir_get_stage_name(c), + c->program_id, c->variant_id); + vir_dump(c); + fprintf(stderr, "\n"); + } + + vir_optimize(c); + vir_lower_uniforms(c); + + /* XXX: vir_schedule_instructions(c); */ + + if (V3D_DEBUG & (V3D_DEBUG_VIR | + v3d_debug_flag_for_shader_stage(c->s->stage))) { + fprintf(stderr, "%s prog %d/%d VIR:\n", + vir_get_stage_name(c), + c->program_id, c->variant_id); + vir_dump(c); + fprintf(stderr, "\n"); + } + + v3d_vir_to_qpu(c); +} diff --git a/src/broadcom/compiler/qpu_schedule.c b/src/broadcom/compiler/qpu_schedule.c new file mode 100644 index 00000000000..b5a0aa9a34a --- /dev/null +++ b/src/broadcom/compiler/qpu_schedule.c @@ -0,0 +1,1362 @@ +/* + * Copyright © 2010 Intel Corporation + * Copyright © 2014-2017 Broadcom + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +/** + * @file + * + * The basic model of the list scheduler is to take a basic block, compute a + * DAG of the dependencies, and make a list of the DAG heads. 
Heuristically + * pick a DAG head, then put all the children that are now DAG heads into the + * list of things to schedule. + * + * The goal of scheduling here is to pack pairs of operations together in a + * single QPU instruction. + */ + +#include "qpu/qpu_disasm.h" +#include "v3d_compiler.h" +#include "util/ralloc.h" + +static bool debug; + +struct schedule_node_child; + +struct schedule_node { + struct list_head link; + struct qinst *inst; + struct schedule_node_child *children; + uint32_t child_count; + uint32_t child_array_size; + uint32_t parent_count; + + /* Longest cycles + instruction_latency() of any parent of this node. */ + uint32_t unblocked_time; + + /** + * Minimum number of cycles from scheduling this instruction until the + * end of the program, based on the slowest dependency chain through + * the children. + */ + uint32_t delay; + + /** + * cycles between this instruction being scheduled and when its result + * can be consumed. + */ + uint32_t latency; +}; + +struct schedule_node_child { + struct schedule_node *node; + bool write_after_read; +}; + +/* When walking the instructions in reverse, we need to swap before/after in + * add_dep(). + */ +enum direction { F, R }; + +struct schedule_state { + struct schedule_node *last_r[6]; + struct schedule_node *last_rf[64]; + struct schedule_node *last_sf; + struct schedule_node *last_vpm_read; + struct schedule_node *last_tmu_write; + struct schedule_node *last_tlb; + struct schedule_node *last_vpm; + struct schedule_node *last_unif; + struct schedule_node *last_rtop; + enum direction dir; + /* Estimated cycle when the current instruction would start. */ + uint32_t time; +}; + +static void +add_dep(struct schedule_state *state, + struct schedule_node *before, + struct schedule_node *after, + bool write) +{ + bool write_after_read = !write && state->dir == R; + + if (!before || !after) + return; + + assert(before != after); + + if (state->dir == R) { + struct schedule_node *t = before; + before = after; + after = t; + } + + for (int i = 0; i < before->child_count; i++) { + if (before->children[i].node == after && + (before->children[i].write_after_read == write_after_read)) { + return; + } + } + + if (before->child_array_size <= before->child_count) { + before->child_array_size = MAX2(before->child_array_size * 2, 16); + before->children = reralloc(before, before->children, + struct schedule_node_child, + before->child_array_size); + } + + before->children[before->child_count].node = after; + before->children[before->child_count].write_after_read = + write_after_read; + before->child_count++; + after->parent_count++; +} + +static void +add_read_dep(struct schedule_state *state, + struct schedule_node *before, + struct schedule_node *after) +{ + add_dep(state, before, after, false); +} + +static void +add_write_dep(struct schedule_state *state, + struct schedule_node **before, + struct schedule_node *after) +{ + add_dep(state, *before, after, true); + *before = after; +} + +static bool +qpu_inst_is_tlb(const struct v3d_qpu_instr *inst) +{ + if (inst->type != V3D_QPU_INSTR_TYPE_ALU) + return false; + + if (inst->alu.add.magic_write && + (inst->alu.add.waddr == V3D_QPU_WADDR_TLB || + inst->alu.add.waddr == V3D_QPU_WADDR_TLBU)) + return true; + + if (inst->alu.mul.magic_write && + (inst->alu.mul.waddr == V3D_QPU_WADDR_TLB || + inst->alu.mul.waddr == V3D_QPU_WADDR_TLBU)) + return true; + + return false; +} + +static void +process_mux_deps(struct schedule_state *state, struct schedule_node *n, + enum v3d_qpu_mux mux) +{ + switch 
(mux) { + case V3D_QPU_MUX_A: + add_read_dep(state, state->last_rf[n->inst->qpu.raddr_a], n); + break; + case V3D_QPU_MUX_B: + add_read_dep(state, state->last_rf[n->inst->qpu.raddr_b], n); + break; + default: + add_read_dep(state, state->last_r[mux - V3D_QPU_MUX_R0], n); + break; + } +} + + +static void +process_waddr_deps(struct schedule_state *state, struct schedule_node *n, + uint32_t waddr, bool magic) +{ + if (!magic) { + add_write_dep(state, &state->last_rf[waddr], n); + } else if (v3d_qpu_magic_waddr_is_tmu(waddr)) { + add_write_dep(state, &state->last_tmu_write, n); + } else if (v3d_qpu_magic_waddr_is_sfu(waddr)) { + /* Handled by v3d_qpu_writes_r4() check. */ + } else { + switch (waddr) { + case V3D_QPU_WADDR_R0: + case V3D_QPU_WADDR_R1: + case V3D_QPU_WADDR_R2: + case V3D_QPU_WADDR_R3: + case V3D_QPU_WADDR_R4: + case V3D_QPU_WADDR_R5: + add_write_dep(state, + &state->last_r[waddr - V3D_QPU_WADDR_R0], + n); + break; + + case V3D_QPU_WADDR_VPM: + case V3D_QPU_WADDR_VPMU: + add_write_dep(state, &state->last_vpm, n); + break; + + case V3D_QPU_WADDR_TLB: + case V3D_QPU_WADDR_TLBU: + add_write_dep(state, &state->last_tlb, n); + break; + + case V3D_QPU_WADDR_NOP: + break; + + default: + fprintf(stderr, "Unknown waddr %d\n", waddr); + abort(); + } + } +} + +static void +process_cond_deps(struct schedule_state *state, struct schedule_node *n, + enum v3d_qpu_cond cond) +{ + if (cond != V3D_QPU_COND_NONE) + add_read_dep(state, state->last_sf, n); +} + +static void +process_pf_deps(struct schedule_state *state, struct schedule_node *n, + enum v3d_qpu_pf pf) +{ + if (pf != V3D_QPU_PF_NONE) + add_write_dep(state, &state->last_sf, n); +} + +static void +process_uf_deps(struct schedule_state *state, struct schedule_node *n, + enum v3d_qpu_uf uf) +{ + if (uf != V3D_QPU_UF_NONE) + add_write_dep(state, &state->last_sf, n); +} + +/** + * Common code for dependencies that need to be tracked both forward and + * backward. + * + * This is for things like "all reads of r4 have to happen between the r4 + * writes that surround them". + */ +static void +calculate_deps(struct schedule_state *state, struct schedule_node *n) +{ + struct qinst *qinst = n->inst; + struct v3d_qpu_instr *inst = &qinst->qpu; + + if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH) { + if (inst->branch.cond != V3D_QPU_BRANCH_COND_ALWAYS) + add_read_dep(state, state->last_sf, n); + + /* XXX: BDI */ + /* XXX: BDU */ + /* XXX: ub */ + /* XXX: raddr_a */ + + add_write_dep(state, &state->last_unif, n); + return; + } + + assert(inst->type == V3D_QPU_INSTR_TYPE_ALU); + + /* XXX: LOAD_IMM */ + + if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 0) + process_mux_deps(state, n, inst->alu.add.a); + if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 1) + process_mux_deps(state, n, inst->alu.add.b); + + if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 0) + process_mux_deps(state, n, inst->alu.mul.a); + if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 1) + process_mux_deps(state, n, inst->alu.mul.b); + + switch (inst->alu.add.op) { + case V3D_QPU_A_VPMSETUP: + /* Could distinguish read/write by unpacking the uniform. 
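+                 * Until that's done, conservatively order VPMSETUP against
+                 * both the VPM writes and the VPM reads.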
*/ + add_write_dep(state, &state->last_vpm, n); + add_write_dep(state, &state->last_vpm_read, n); + break; + + case V3D_QPU_A_STVPMV: + case V3D_QPU_A_STVPMD: + case V3D_QPU_A_STVPMP: + add_write_dep(state, &state->last_vpm, n); + break; + + case V3D_QPU_A_MSF: + add_read_dep(state, state->last_tlb, n); + break; + + case V3D_QPU_A_SETMSF: + case V3D_QPU_A_SETREVF: + add_write_dep(state, &state->last_tlb, n); + break; + + case V3D_QPU_A_FLAPUSH: + case V3D_QPU_A_FLBPUSH: + case V3D_QPU_A_VFLA: + case V3D_QPU_A_VFLNA: + case V3D_QPU_A_VFLB: + case V3D_QPU_A_VFLNB: + add_read_dep(state, state->last_sf, n); + break; + + case V3D_QPU_A_FLBPOP: + add_write_dep(state, &state->last_sf, n); + break; + + default: + break; + } + + switch (inst->alu.mul.op) { + case V3D_QPU_M_MULTOP: + case V3D_QPU_M_UMUL24: + /* MULTOP sets rtop, and UMUL24 implicitly reads rtop and + * resets it to 0. We could possibly reorder umul24s relative + * to each other, but for now just keep all the MUL parts in + * order. + */ + add_write_dep(state, &state->last_rtop, n); + break; + default: + break; + } + + if (inst->alu.add.op != V3D_QPU_A_NOP) { + process_waddr_deps(state, n, inst->alu.add.waddr, + inst->alu.add.magic_write); + } + if (inst->alu.mul.op != V3D_QPU_M_NOP) { + process_waddr_deps(state, n, inst->alu.mul.waddr, + inst->alu.mul.magic_write); + } + + if (v3d_qpu_writes_r3(inst)) + add_write_dep(state, &state->last_r[3], n); + if (v3d_qpu_writes_r4(inst)) + add_write_dep(state, &state->last_r[4], n); + if (v3d_qpu_writes_r5(inst)) + add_write_dep(state, &state->last_r[5], n); + + if (inst->sig.thrsw) { + /* All accumulator contents and flags are undefined after the + * switch. + */ + for (int i = 0; i < ARRAY_SIZE(state->last_r); i++) + add_write_dep(state, &state->last_r[i], n); + add_write_dep(state, &state->last_sf, n); + + /* Scoreboard-locking operations have to stay after the last + * thread switch. + */ + add_write_dep(state, &state->last_tlb, n); + + add_write_dep(state, &state->last_tmu_write, n); + } + + if (inst->sig.ldtmu) { + /* TMU loads are coming from a FIFO, so ordering is important. 
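+                 * Treating every ldtmu as a write of the TMU state keeps the
+                 * loads in program order relative to each other and to the
+                 * texture requests that fill the FIFO.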
+ */ + add_write_dep(state, &state->last_tmu_write, n); + } + + if (inst->sig.ldtlb | inst->sig.ldtlbu) + add_read_dep(state, state->last_tlb, n); + + if (inst->sig.ldvpm) + add_write_dep(state, &state->last_vpm_read, n); + + /* inst->sig.ldunif or sideband uniform read */ + if (qinst->uniform != ~0) + add_write_dep(state, &state->last_unif, n); + + process_cond_deps(state, n, inst->flags.ac); + process_cond_deps(state, n, inst->flags.mc); + process_pf_deps(state, n, inst->flags.apf); + process_pf_deps(state, n, inst->flags.mpf); + process_uf_deps(state, n, inst->flags.auf); + process_uf_deps(state, n, inst->flags.muf); +} + +static void +calculate_forward_deps(struct v3d_compile *c, struct list_head *schedule_list) +{ + struct schedule_state state; + + memset(&state, 0, sizeof(state)); + state.dir = F; + + list_for_each_entry(struct schedule_node, node, schedule_list, link) + calculate_deps(&state, node); +} + +static void +calculate_reverse_deps(struct v3d_compile *c, struct list_head *schedule_list) +{ + struct list_head *node; + struct schedule_state state; + + memset(&state, 0, sizeof(state)); + state.dir = R; + + for (node = schedule_list->prev; schedule_list != node; node = node->prev) { + calculate_deps(&state, (struct schedule_node *)node); + } +} + +struct choose_scoreboard { + int tick; + int last_sfu_write_tick; + int last_ldvary_tick; + int last_uniforms_reset_tick; + uint32_t last_waddr_add, last_waddr_mul; + bool tlb_locked; +}; + +static bool +mux_reads_too_soon(struct choose_scoreboard *scoreboard, + const struct v3d_qpu_instr *inst, enum v3d_qpu_mux mux) +{ + switch (mux) { + case V3D_QPU_MUX_A: + if (scoreboard->last_waddr_add == inst->raddr_a || + scoreboard->last_waddr_mul == inst->raddr_a) { + return true; + } + break; + + case V3D_QPU_MUX_B: + if (scoreboard->last_waddr_add == inst->raddr_b || + scoreboard->last_waddr_mul == inst->raddr_b) { + return true; + } + break; + + case V3D_QPU_MUX_R4: + if (scoreboard->tick - scoreboard->last_sfu_write_tick <= 2) + return true; + break; + + case V3D_QPU_MUX_R5: + if (scoreboard->tick - scoreboard->last_ldvary_tick <= 1) + return true; + break; + default: + break; + } + + return false; +} + +static bool +reads_too_soon_after_write(struct choose_scoreboard *scoreboard, + struct qinst *qinst) +{ + const struct v3d_qpu_instr *inst = &qinst->qpu; + + /* XXX: Branching off of raddr. */ + if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH) + return false; + + assert(inst->type == V3D_QPU_INSTR_TYPE_ALU); + + if (inst->alu.add.op != V3D_QPU_A_NOP) { + if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 0 && + mux_reads_too_soon(scoreboard, inst, inst->alu.add.a)) { + return true; + } + if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 1 && + mux_reads_too_soon(scoreboard, inst, inst->alu.add.b)) { + return true; + } + } + + if (inst->alu.mul.op != V3D_QPU_M_NOP) { + if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 0 && + mux_reads_too_soon(scoreboard, inst, inst->alu.mul.a)) { + return true; + } + if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 1 && + mux_reads_too_soon(scoreboard, inst, inst->alu.mul.b)) { + return true; + } + } + + /* XXX: imm */ + + return false; +} + +static bool +writes_too_soon_after_write(struct choose_scoreboard *scoreboard, + struct qinst *qinst) +{ + const struct v3d_qpu_instr *inst = &qinst->qpu; + + /* Don't schedule any other r4 write too soon after an SFU write. + * This would normally be prevented by dependency tracking, but might + * occur if a dead SFU computation makes it to scheduling. 
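+         *
+         * For example, something like
+         *
+         *    mov recip, rf3   ; dead SFU op, result never consumed
+         *    mov r4, rf0
+         *
+         * would be broken if scheduled back to back: the SFU result still
+         * arrives in r4 two instructions later and stomps the value the
+         * second mov just wrote.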
+ */ + if (scoreboard->tick - scoreboard->last_sfu_write_tick < 2 && + v3d_qpu_writes_r4(inst)) + return true; + + return false; +} + +static bool +pixel_scoreboard_too_soon(struct choose_scoreboard *scoreboard, + const struct v3d_qpu_instr *inst) +{ + return (scoreboard->tick == 0 && qpu_inst_is_tlb(inst)); +} + +static int +get_instruction_priority(const struct v3d_qpu_instr *inst) +{ + uint32_t baseline_score; + uint32_t next_score = 0; + + /* Schedule TLB operations as late as possible, to get more + * parallelism between shaders. + */ + if (qpu_inst_is_tlb(inst)) + return next_score; + next_score++; + + /* Schedule texture read results collection late to hide latency. */ + if (inst->sig.ldtmu) + return next_score; + next_score++; + + /* Default score for things that aren't otherwise special. */ + baseline_score = next_score; + next_score++; + + /* Schedule texture read setup early to hide their latency better. */ + if (inst->type == V3D_QPU_INSTR_TYPE_ALU && + ((inst->alu.add.magic_write && + v3d_qpu_magic_waddr_is_tmu(inst->alu.add.waddr)) || + (inst->alu.mul.magic_write && + v3d_qpu_magic_waddr_is_tmu(inst->alu.mul.waddr)))) { + return next_score; + } + next_score++; + + return baseline_score; +} + +static bool +qpu_magic_waddr_is_periph(enum v3d_qpu_waddr waddr) +{ + return (v3d_qpu_magic_waddr_is_tmu(waddr) || + v3d_qpu_magic_waddr_is_sfu(waddr) || + v3d_qpu_magic_waddr_is_tlb(waddr) || + v3d_qpu_magic_waddr_is_vpm(waddr) || + v3d_qpu_magic_waddr_is_tsy(waddr)); +} + +static bool +qpu_accesses_peripheral(const struct v3d_qpu_instr *inst) +{ + if (inst->type == V3D_QPU_INSTR_TYPE_ALU) { + if (inst->alu.add.op != V3D_QPU_A_NOP && + inst->alu.add.magic_write && + qpu_magic_waddr_is_periph(inst->alu.add.waddr)) { + return true; + } + + if (inst->alu.mul.op != V3D_QPU_M_NOP && + inst->alu.mul.magic_write && + qpu_magic_waddr_is_periph(inst->alu.mul.waddr)) { + return true; + } + } + + return (inst->sig.ldvpm || + inst->sig.ldtmu || + inst->sig.ldtlb || + inst->sig.ldtlbu); +} + +static bool +qpu_merge_inst(const struct v3d_device_info *devinfo, + struct v3d_qpu_instr *result, + const struct v3d_qpu_instr *a, + const struct v3d_qpu_instr *b) +{ + if (a->type != V3D_QPU_INSTR_TYPE_ALU || + b->type != V3D_QPU_INSTR_TYPE_ALU) { + return false; + } + + /* Can't do more than one peripheral access in an instruction. 
*/ + if (qpu_accesses_peripheral(a) && qpu_accesses_peripheral(b)) + return false; + + struct v3d_qpu_instr merge = *a; + + if (b->alu.add.op != V3D_QPU_A_NOP) { + if (a->alu.add.op != V3D_QPU_A_NOP) + return false; + merge.alu.add = b->alu.add; + + merge.flags.ac = b->flags.ac; + merge.flags.apf = b->flags.apf; + merge.flags.auf = b->flags.auf; + } + + if (b->alu.mul.op != V3D_QPU_M_NOP) { + if (a->alu.mul.op != V3D_QPU_M_NOP) + return false; + merge.alu.mul = b->alu.mul; + + merge.flags.mc = b->flags.mc; + merge.flags.mpf = b->flags.mpf; + merge.flags.muf = b->flags.muf; + } + + if (v3d_qpu_uses_mux(b, V3D_QPU_MUX_A)) { + if (v3d_qpu_uses_mux(a, V3D_QPU_MUX_A) && + a->raddr_a != b->raddr_a) { + return false; + } + merge.raddr_a = b->raddr_a; + } + + if (v3d_qpu_uses_mux(b, V3D_QPU_MUX_B)) { + if (v3d_qpu_uses_mux(a, V3D_QPU_MUX_B) && + a->raddr_b != b->raddr_b) { + return false; + } + merge.raddr_b = b->raddr_b; + } + + merge.sig.thrsw |= b->sig.thrsw; + merge.sig.ldunif |= b->sig.ldunif; + merge.sig.ldtmu |= b->sig.ldtmu; + merge.sig.ldvary |= b->sig.ldvary; + merge.sig.ldvpm |= b->sig.ldvpm; + merge.sig.small_imm |= b->sig.small_imm; + merge.sig.ldtlb |= b->sig.ldtlb; + merge.sig.ldtlbu |= b->sig.ldtlbu; + merge.sig.ucb |= b->sig.ucb; + merge.sig.rotate |= b->sig.rotate; + merge.sig.wrtmuc |= b->sig.wrtmuc; + + uint64_t packed; + bool ok = v3d_qpu_instr_pack(devinfo, &merge, &packed); + + *result = merge; + /* No modifying the real instructions on failure. */ + assert(ok || (a != result && b != result)); + + return ok; +} + +static struct schedule_node * +choose_instruction_to_schedule(const struct v3d_device_info *devinfo, + struct choose_scoreboard *scoreboard, + struct list_head *schedule_list, + struct schedule_node *prev_inst) +{ + struct schedule_node *chosen = NULL; + int chosen_prio = 0; + + /* Don't pair up anything with a thread switch signal -- emit_thrsw() + * will handle pairing it along with filling the delay slots. + */ + if (prev_inst) { + if (prev_inst->inst->qpu.sig.thrsw) + return NULL; + } + + list_for_each_entry(struct schedule_node, n, schedule_list, link) { + const struct v3d_qpu_instr *inst = &n->inst->qpu; + + /* Don't choose the branch instruction until it's the last one + * left. We'll move it up to fit its delay slots after we + * choose it. + */ + if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH && + !list_is_singular(schedule_list)) { + continue; + } + + /* "An instruction must not read from a location in physical + * regfile A or B that was written to by the previous + * instruction." + */ + if (reads_too_soon_after_write(scoreboard, n->inst)) + continue; + + if (writes_too_soon_after_write(scoreboard, n->inst)) + continue; + + /* "A scoreboard wait must not occur in the first two + * instructions of a fragment shader. This is either the + * explicit Wait for Scoreboard signal or an implicit wait + * with the first tile-buffer read or write instruction." + */ + if (pixel_scoreboard_too_soon(scoreboard, inst)) + continue; + + /* ldunif and ldvary both write r5, but ldunif does so a tick + * sooner. If the ldvary's r5 wasn't used, then ldunif might + * otherwise get scheduled so ldunif and ldvary try to update + * r5 in the same tick. + */ + if (inst->sig.ldunif && + scoreboard->tick == scoreboard->last_ldvary_tick + 1) { + continue; + } + + /* If we're trying to pair with another instruction, check + * that they're compatible. + */ + if (prev_inst) { + /* Don't pair up a thread switch signal -- we'll + * handle pairing it when we pick it on its own. 
+ */ + if (inst->sig.thrsw) + continue; + + if (prev_inst->inst->uniform != -1 && + n->inst->uniform != -1) + continue; + + /* Don't merge in something that will lock the TLB. + * Hopwefully what we have in inst will release some + * other instructions, allowing us to delay the + * TLB-locking instruction until later. + */ + if (!scoreboard->tlb_locked && qpu_inst_is_tlb(inst)) + continue; + + struct v3d_qpu_instr merged_inst; + if (!qpu_merge_inst(devinfo, &merged_inst, + &prev_inst->inst->qpu, inst)) { + continue; + } + } + + int prio = get_instruction_priority(inst); + + /* Found a valid instruction. If nothing better comes along, + * this one works. + */ + if (!chosen) { + chosen = n; + chosen_prio = prio; + continue; + } + + if (prio > chosen_prio) { + chosen = n; + chosen_prio = prio; + } else if (prio < chosen_prio) { + continue; + } + + if (n->delay > chosen->delay) { + chosen = n; + chosen_prio = prio; + } else if (n->delay < chosen->delay) { + continue; + } + } + + return chosen; +} + +static void +update_scoreboard_for_magic_waddr(struct choose_scoreboard *scoreboard, + enum v3d_qpu_waddr waddr) +{ + if (v3d_qpu_magic_waddr_is_sfu(waddr)) + scoreboard->last_sfu_write_tick = scoreboard->tick; +} + +static void +update_scoreboard_for_chosen(struct choose_scoreboard *scoreboard, + const struct v3d_qpu_instr *inst) +{ + scoreboard->last_waddr_add = ~0; + scoreboard->last_waddr_mul = ~0; + + if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH) + return; + + assert(inst->type == V3D_QPU_INSTR_TYPE_ALU); + + if (inst->alu.add.op != V3D_QPU_A_NOP) { + if (inst->alu.add.magic_write) { + update_scoreboard_for_magic_waddr(scoreboard, + inst->alu.add.waddr); + } else { + scoreboard->last_waddr_add = inst->alu.add.waddr; + } + } + + if (inst->alu.mul.op != V3D_QPU_M_NOP) { + if (inst->alu.mul.magic_write) { + update_scoreboard_for_magic_waddr(scoreboard, + inst->alu.mul.waddr); + } else { + scoreboard->last_waddr_mul = inst->alu.mul.waddr; + } + } + + if (inst->sig.ldvary) + scoreboard->last_ldvary_tick = scoreboard->tick; + + if (qpu_inst_is_tlb(inst)) + scoreboard->tlb_locked = true; +} + +static void +dump_state(const struct v3d_device_info *devinfo, + struct list_head *schedule_list) +{ + list_for_each_entry(struct schedule_node, n, schedule_list, link) { + fprintf(stderr, " t=%4d: ", n->unblocked_time); + v3d_qpu_dump(devinfo, &n->inst->qpu); + fprintf(stderr, "\n"); + + for (int i = 0; i < n->child_count; i++) { + struct schedule_node *child = n->children[i].node; + if (!child) + continue; + + fprintf(stderr, " - "); + v3d_qpu_dump(devinfo, &child->inst->qpu); + fprintf(stderr, " (%d parents, %c)\n", + child->parent_count, + n->children[i].write_after_read ? 'w' : 'r'); + } + } +} + +static uint32_t magic_waddr_latency(enum v3d_qpu_waddr waddr, + const struct v3d_qpu_instr *after) +{ + /* Apply some huge latency between texture fetch requests and getting + * their results back. + * + * FIXME: This is actually pretty bogus. If we do: + * + * mov tmu0_s, a + * <a bit of math> + * mov tmu0_s, b + * load_tmu0 + * <more math> + * load_tmu0 + * + * we count that as worse than + * + * mov tmu0_s, a + * mov tmu0_s, b + * <lots of math> + * load_tmu0 + * <more math> + * load_tmu0 + * + * because we associate the first load_tmu0 with the *second* tmu0_s. + */ + if (v3d_qpu_magic_waddr_is_tmu(waddr) && after->sig.ldtmu) + return 100; + + /* Assume that anything depending on us is consuming the SFU result. 
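+         * The result lands in r4 two instructions after the write, so a
+         * latency of 3 roughly covers those delay slots plus the consumer
+         * itself.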
*/ + if (v3d_qpu_magic_waddr_is_sfu(waddr)) + return 3; + + return 1; +} + +static uint32_t +instruction_latency(struct schedule_node *before, struct schedule_node *after) +{ + const struct v3d_qpu_instr *before_inst = &before->inst->qpu; + const struct v3d_qpu_instr *after_inst = &after->inst->qpu; + uint32_t latency = 1; + + if (before_inst->type != V3D_QPU_INSTR_TYPE_ALU || + after_inst->type != V3D_QPU_INSTR_TYPE_ALU) + return latency; + + if (before_inst->alu.add.magic_write) { + latency = MAX2(latency, + magic_waddr_latency(before_inst->alu.add.waddr, + after_inst)); + } + + if (before_inst->alu.mul.magic_write) { + latency = MAX2(latency, + magic_waddr_latency(before_inst->alu.mul.waddr, + after_inst)); + } + + return latency; +} + +/** Recursive computation of the delay member of a node. */ +static void +compute_delay(struct schedule_node *n) +{ + if (!n->child_count) { + n->delay = 1; + } else { + for (int i = 0; i < n->child_count; i++) { + if (!n->children[i].node->delay) + compute_delay(n->children[i].node); + n->delay = MAX2(n->delay, + n->children[i].node->delay + + instruction_latency(n, n->children[i].node)); + } + } +} + +static void +mark_instruction_scheduled(struct list_head *schedule_list, + uint32_t time, + struct schedule_node *node, + bool war_only) +{ + if (!node) + return; + + for (int i = node->child_count - 1; i >= 0; i--) { + struct schedule_node *child = + node->children[i].node; + + if (!child) + continue; + + if (war_only && !node->children[i].write_after_read) + continue; + + /* If the requirement is only that the node not appear before + * the last read of its destination, then it can be scheduled + * immediately after (or paired with!) the thing reading the + * destination. + */ + uint32_t latency = 0; + if (!war_only) { + latency = instruction_latency(node, + node->children[i].node); + } + + child->unblocked_time = MAX2(child->unblocked_time, + time + latency); + child->parent_count--; + if (child->parent_count == 0) + list_add(&child->link, schedule_list); + + node->children[i].node = NULL; + } +} + +static struct qinst * +vir_nop() +{ + struct qreg undef = { QFILE_NULL, 0 }; + struct qinst *qinst = vir_add_inst(V3D_QPU_A_NOP, undef, undef, undef); + + return qinst; +} + +#if 0 +static struct qinst * +nop_after(struct qinst *inst) +{ + struct qinst *q = vir_nop(); + + list_add(&q->link, &inst->link); + + return q; +} + +/** + * Emits a THRSW/LTHRSW signal in the stream, trying to move it up to pair + * with another instruction. + */ +static void +emit_thrsw(struct v3d_compile *c, + struct choose_scoreboard *scoreboard, + const struct v3d_qpu_instr *inst) +{ + /* There should be nothing in a thrsw inst being scheduled other than + * the signal bits. + */ + assert(inst->type == V3D_QPU_INSTR_TYPE_ALU); + assert(inst->alu.add.op == V3D_QPU_A_NOP); + assert(inst->alu.mul.op == V3D_QPU_M_NOP); + + /* Try to find an earlier scheduled instruction that we can merge the + * thrsw into. + */ + int thrsw_ip = c->qpu_inst_count; + for (int i = 1; i <= MIN2(c->qpu_inst_count, 3); i++) { + uint64_t prev_instr = c->qpu_insts[c->qpu_inst_count - i]; + uint32_t prev_sig = QPU_GET_FIELD(prev_instr, QPU_SIG); + + if (prev_sig == QPU_SIG_NONE) + thrsw_ip = c->qpu_inst_count - i; + } + + if (thrsw_ip != c->qpu_inst_count) { + /* Merge the thrsw into the existing instruction. 
*/ + c->qpu_insts[thrsw_ip] = + QPU_UPDATE_FIELD(c->qpu_insts[thrsw_ip], sig, QPU_SIG); + } else { + qpu_serialize_one_inst(c, inst); + update_scoreboard_for_chosen(scoreboard, inst); + } + + /* Fill the delay slots. */ + while (c->qpu_inst_count < thrsw_ip + 3) { + update_scoreboard_for_chosen(scoreboard, v3d_qpu_nop()); + qpu_serialize_one_inst(c, v3d_qpu_nop()); + } +} +#endif + +static uint32_t +schedule_instructions(struct v3d_compile *c, + struct choose_scoreboard *scoreboard, + struct qblock *block, + struct list_head *schedule_list, + enum quniform_contents *orig_uniform_contents, + uint32_t *orig_uniform_data, + uint32_t *next_uniform) +{ + const struct v3d_device_info *devinfo = c->devinfo; + uint32_t time = 0; + + if (debug) { + fprintf(stderr, "initial deps:\n"); + dump_state(devinfo, schedule_list); + fprintf(stderr, "\n"); + } + + /* Remove non-DAG heads from the list. */ + list_for_each_entry_safe(struct schedule_node, n, schedule_list, link) { + if (n->parent_count != 0) + list_del(&n->link); + } + + while (!list_empty(schedule_list)) { + struct schedule_node *chosen = + choose_instruction_to_schedule(devinfo, + scoreboard, + schedule_list, + NULL); + struct schedule_node *merge = NULL; + + /* If there are no valid instructions to schedule, drop a NOP + * in. + */ + struct qinst *qinst = chosen ? chosen->inst : vir_nop(); + struct v3d_qpu_instr *inst = &qinst->qpu; + + if (debug) { + fprintf(stderr, "t=%4d: current list:\n", + time); + dump_state(devinfo, schedule_list); + fprintf(stderr, "t=%4d: chose: ", time); + v3d_qpu_dump(devinfo, inst); + fprintf(stderr, "\n"); + } + + /* Schedule this instruction onto the QPU list. Also try to + * find an instruction to pair with it. + */ + if (chosen) { + time = MAX2(chosen->unblocked_time, time); + list_del(&chosen->link); + mark_instruction_scheduled(schedule_list, time, + chosen, true); + + merge = choose_instruction_to_schedule(devinfo, + scoreboard, + schedule_list, + chosen); + if (merge) { + time = MAX2(merge->unblocked_time, time); + list_del(&merge->link); + (void)qpu_merge_inst(devinfo, inst, + inst, &merge->inst->qpu); + if (merge->inst->uniform != -1) { + chosen->inst->uniform = + merge->inst->uniform; + } + + if (debug) { + fprintf(stderr, "t=%4d: merging: ", + time); + v3d_qpu_dump(devinfo, &merge->inst->qpu); + fprintf(stderr, "\n"); + fprintf(stderr, " result: "); + v3d_qpu_dump(devinfo, inst); + fprintf(stderr, "\n"); + } + } + } + + /* Update the uniform index for the rewritten location -- + * branch target updating will still need to change + * c->uniform_data[] using this index. + */ + if (qinst->uniform != -1) { + if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH) + block->branch_uniform = *next_uniform; + + c->uniform_data[*next_uniform] = + orig_uniform_data[qinst->uniform]; + c->uniform_contents[*next_uniform] = + orig_uniform_contents[qinst->uniform]; + qinst->uniform = *next_uniform; + (*next_uniform)++; + } + + if (debug) { + fprintf(stderr, "\n"); + } + + /* Now that we've scheduled a new instruction, some of its + * children can be promoted to the list of instructions ready to + * be scheduled. Update the children's unblocked time for this + * DAG edge as we do so. + */ + mark_instruction_scheduled(schedule_list, time, chosen, false); + + if (merge) { + mark_instruction_scheduled(schedule_list, time, merge, + false); + + /* The merged VIR instruction doesn't get re-added to the + * block, so free it now. 
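+                         * It was already unlinked from schedule_list, and
+                         * qpu_merge_inst() folded its operation, flags and
+                         * signals into the chosen instruction, so nothing
+                         * else refers to it.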
+ */ + free(merge->inst); + } + + if (0 && inst->sig.thrsw) { + /* XXX emit_thrsw(c, scoreboard, qinst); */ + } else { + c->qpu_inst_count++; + list_addtail(&qinst->link, &block->instructions); + update_scoreboard_for_chosen(scoreboard, inst); + } + + scoreboard->tick++; + time++; + + if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH || + inst->sig.thrsw /* XXX */) { + block->branch_qpu_ip = c->qpu_inst_count - 1; + /* Fill the delay slots. + * + * We should fill these with actual instructions, + * instead, but that will probably need to be done + * after this, once we know what the leading + * instructions of the successors are (so we can + * handle A/B register file write latency) + */ + /* XXX: scoreboard */ + int slots = (inst->type == V3D_QPU_INSTR_TYPE_BRANCH ? + 3 : 2); + for (int i = 0; i < slots; i++) { + struct qinst *nop = vir_nop(); + list_addtail(&nop->link, &block->instructions); + + update_scoreboard_for_chosen(scoreboard, + &nop->qpu); + c->qpu_inst_count++; + scoreboard->tick++; + time++; + } + } + } + + return time; +} + +static uint32_t +qpu_schedule_instructions_block(struct v3d_compile *c, + struct choose_scoreboard *scoreboard, + struct qblock *block, + enum quniform_contents *orig_uniform_contents, + uint32_t *orig_uniform_data, + uint32_t *next_uniform) +{ + void *mem_ctx = ralloc_context(NULL); + struct list_head schedule_list; + + list_inithead(&schedule_list); + + /* Wrap each instruction in a scheduler structure. */ + while (!list_empty(&block->instructions)) { + struct qinst *qinst = (struct qinst *)block->instructions.next; + struct schedule_node *n = + rzalloc(mem_ctx, struct schedule_node); + + n->inst = qinst; + + list_del(&qinst->link); + list_addtail(&n->link, &schedule_list); + } + + calculate_forward_deps(c, &schedule_list); + calculate_reverse_deps(c, &schedule_list); + + list_for_each_entry(struct schedule_node, n, &schedule_list, link) { + compute_delay(n); + } + + uint32_t cycles = schedule_instructions(c, scoreboard, block, + &schedule_list, + orig_uniform_contents, + orig_uniform_data, + next_uniform); + + ralloc_free(mem_ctx); + + return cycles; +} + +static void +qpu_set_branch_targets(struct v3d_compile *c) +{ + vir_for_each_block(block, c) { + /* The end block of the program has no branch. */ + if (!block->successors[0]) + continue; + + /* If there was no branch instruction, then the successor + * block must follow immediately after this one. + */ + if (block->branch_qpu_ip == ~0) { + assert(block->end_qpu_ip + 1 == + block->successors[0]->start_qpu_ip); + continue; + } + + /* Walk back through the delay slots to find the branch + * instr. + */ + struct list_head *entry = block->instructions.prev; + for (int i = 0; i < 3; i++) + entry = entry->prev; + struct qinst *branch = container_of(entry, branch, link); + assert(branch->qpu.type == V3D_QPU_INSTR_TYPE_BRANCH); + + /* Make sure that the if-we-don't-jump + * successor was scheduled just after the + * delay slots. + */ + assert(!block->successors[1] || + block->successors[1]->start_qpu_ip == + block->branch_qpu_ip + 4); + + branch->qpu.branch.offset = + ((block->successors[0]->start_qpu_ip - + (block->branch_qpu_ip + 4)) * + sizeof(uint64_t)); + + /* Set up the relative offset to jump in the + * uniform stream. + * + * Use a temporary here, because + * uniform_data[inst->uniform] may be shared + * between multiple instructions. 
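+                  *
+                  * The offset is relative to the uniform just after the
+                  * branch's own, and is in bytes (each uniform is 4 bytes).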
+ */ + assert(c->uniform_contents[branch->uniform] == QUNIFORM_CONSTANT); + c->uniform_data[branch->uniform] = + (block->successors[0]->start_uniform - + (block->branch_uniform + 1)) * 4; + } +} + +uint32_t +v3d_qpu_schedule_instructions(struct v3d_compile *c) +{ + const struct v3d_device_info *devinfo = c->devinfo; + + /* We reorder the uniforms as we schedule instructions, so save the + * old data off and replace it. + */ + uint32_t *uniform_data = c->uniform_data; + enum quniform_contents *uniform_contents = c->uniform_contents; + c->uniform_contents = ralloc_array(c, enum quniform_contents, + c->num_uniforms); + c->uniform_data = ralloc_array(c, uint32_t, c->num_uniforms); + c->uniform_array_size = c->num_uniforms; + uint32_t next_uniform = 0; + + struct choose_scoreboard scoreboard; + memset(&scoreboard, 0, sizeof(scoreboard)); + scoreboard.last_waddr_add = ~0; + scoreboard.last_waddr_mul = ~0; + scoreboard.last_ldvary_tick = -10; + scoreboard.last_sfu_write_tick = -10; + scoreboard.last_uniforms_reset_tick = -10; + + if (debug) { + fprintf(stderr, "Pre-schedule instructions\n"); + vir_for_each_block(block, c) { + fprintf(stderr, "BLOCK %d\n", block->index); + list_for_each_entry(struct qinst, qinst, + &block->instructions, link) { + v3d_qpu_dump(devinfo, &qinst->qpu); + fprintf(stderr, "\n"); + } + } + fprintf(stderr, "\n"); + } + + uint32_t cycles = 0; + vir_for_each_block(block, c) { + block->start_qpu_ip = c->qpu_inst_count; + block->branch_qpu_ip = ~0; + block->start_uniform = next_uniform; + + cycles += qpu_schedule_instructions_block(c, + &scoreboard, + block, + uniform_contents, + uniform_data, + &next_uniform); + + block->end_qpu_ip = c->qpu_inst_count - 1; + } + + qpu_set_branch_targets(c); + + assert(next_uniform == c->num_uniforms); + + return cycles; +} diff --git a/src/broadcom/compiler/qpu_validate.c b/src/broadcom/compiler/qpu_validate.c new file mode 100644 index 00000000000..d99d76a8beb --- /dev/null +++ b/src/broadcom/compiler/qpu_validate.c @@ -0,0 +1,208 @@ +/* + * Copyright © 2014 Broadcom + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +/** + * @file + * + * Validates the QPU instruction sequence after register allocation and + * scheduling. 
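+ *
+ * On a failure, the whole program is dumped with the offending instruction
+ * marked and the compiler aborts.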
+ */ + +#include <assert.h> +#include <stdio.h> +#include <stdlib.h> +#include "v3d_compiler.h" +#include "qpu/qpu_disasm.h" + +struct v3d_qpu_validate_state { + struct v3d_compile *c; + const struct v3d_qpu_instr *last; + int ip; + int last_sfu_write; +}; + +static void +fail_instr(struct v3d_qpu_validate_state *state, const char *msg) +{ + struct v3d_compile *c = state->c; + + fprintf(stderr, "v3d_qpu_validate at ip %d: %s:\n", state->ip, msg); + + int dump_ip = 0; + vir_for_each_inst_inorder(inst, c) { + v3d_qpu_dump(c->devinfo, &inst->qpu); + + if (dump_ip++ == state->ip) + fprintf(stderr, " *** ERROR ***"); + + fprintf(stderr, "\n"); + } + + fprintf(stderr, "\n"); + abort(); +} + +static bool +qpu_magic_waddr_matches(const struct v3d_qpu_instr *inst, + bool (*predicate)(enum v3d_qpu_waddr waddr)) +{ + if (inst->type == V3D_QPU_INSTR_TYPE_ALU) + return false; + + if (inst->alu.add.op != V3D_QPU_A_NOP && + inst->alu.add.magic_write && + predicate(inst->alu.add.waddr)) + return true; + + if (inst->alu.mul.op != V3D_QPU_M_NOP && + inst->alu.mul.magic_write && + predicate(inst->alu.mul.waddr)) + return true; + + return false; +} + +static void +qpu_validate_inst(struct v3d_qpu_validate_state *state, struct qinst *qinst) +{ + const struct v3d_qpu_instr *inst = &qinst->qpu; + + if (inst->type != V3D_QPU_INSTR_TYPE_ALU) + return; + + /* LDVARY writes r5 two instructions later and LDUNIF writes + * r5 one instruction later, which is illegal to have + * together. + */ + if (state->last && state->last->sig.ldvary && inst->sig.ldunif) { + fail_instr(state, "LDUNIF after a LDVARY"); + } + + int tmu_writes = 0; + int sfu_writes = 0; + int vpm_writes = 0; + int tlb_writes = 0; + int tsy_writes = 0; + + if (inst->alu.add.op != V3D_QPU_A_NOP) { + if (inst->alu.add.magic_write) { + if (v3d_qpu_magic_waddr_is_tmu(inst->alu.add.waddr)) + tmu_writes++; + if (v3d_qpu_magic_waddr_is_sfu(inst->alu.add.waddr)) + sfu_writes++; + if (v3d_qpu_magic_waddr_is_vpm(inst->alu.add.waddr)) + vpm_writes++; + if (v3d_qpu_magic_waddr_is_tlb(inst->alu.add.waddr)) + tlb_writes++; + if (v3d_qpu_magic_waddr_is_tsy(inst->alu.add.waddr)) + tsy_writes++; + } + } + + if (inst->alu.mul.op != V3D_QPU_M_NOP) { + if (inst->alu.mul.magic_write) { + if (v3d_qpu_magic_waddr_is_tmu(inst->alu.mul.waddr)) + tmu_writes++; + if (v3d_qpu_magic_waddr_is_sfu(inst->alu.mul.waddr)) + sfu_writes++; + if (v3d_qpu_magic_waddr_is_vpm(inst->alu.mul.waddr)) + vpm_writes++; + if (v3d_qpu_magic_waddr_is_tlb(inst->alu.mul.waddr)) + tlb_writes++; + if (v3d_qpu_magic_waddr_is_tsy(inst->alu.mul.waddr)) + tsy_writes++; + } + } + + (void)qpu_magic_waddr_matches; /* XXX */ + + /* SFU r4 results come back two instructions later. No doing + * r4 read/writes or other SFU lookups until it's done. + */ + if (state->ip - state->last_sfu_write < 2) { + if (v3d_qpu_uses_mux(inst, V3D_QPU_MUX_R4)) + fail_instr(state, "R4 read too soon after SFU"); + + if (v3d_qpu_writes_r4(inst)) + fail_instr(state, "R4 write too soon after SFU"); + + if (sfu_writes) + fail_instr(state, "SFU write too soon after SFU"); + } + + /* XXX: The docs say VPM can happen with the others, but the simulator + * disagrees. 
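+         * Count VPM accesses against the same limit until that's sorted
+         * out.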
+ */ + if (tmu_writes + + sfu_writes + + vpm_writes + + tlb_writes + + tsy_writes + + inst->sig.ldtmu + + inst->sig.ldtlb + + inst->sig.ldvpm + + inst->sig.ldtlbu > 1) { + fail_instr(state, + "Only one of [TMU, SFU, TSY, TLB read, VPM] allowed"); + } + + if (sfu_writes) + state->last_sfu_write = state->ip; +} + +static void +qpu_validate_block(struct v3d_qpu_validate_state *state, struct qblock *block) +{ + vir_for_each_inst(qinst, block) { + qpu_validate_inst(state, qinst); + + state->last = &qinst->qpu; + state->ip++; + } +} + +/** + * Checks for the instruction restrictions from page 37 ("Summary of + * Instruction Restrictions"). + */ +void +qpu_validate(struct v3d_compile *c) +{ + /* We don't want to do validation in release builds, but we want to + * keep compiling the validation code to make sure it doesn't get + * broken. + */ +#ifndef DEBUG + return; +#endif + + struct v3d_qpu_validate_state state = { + .c = c, + .last_sfu_write = -10, + .ip = 0, + }; + + vir_for_each_block(block, c) { + qpu_validate_block(&state, block); + } +} diff --git a/src/broadcom/compiler/v3d_compiler.c b/src/broadcom/compiler/v3d_compiler.c new file mode 100644 index 00000000000..acce09db3fa --- /dev/null +++ b/src/broadcom/compiler/v3d_compiler.c @@ -0,0 +1,43 @@ +/* + * Copyright © 2016 Broadcom + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ + +struct v3d_compiler * +v3d_compiler_init(void) +{ + struct v3d_compile *c = rzalloc(struct v3d_compile); + + return c; +} + +void +v3d_add_qpu_inst(struct v3d_compiler *c, uint64_t inst) +{ + if (c->qpu_inst_count >= c->qpu_inst_size) { + c->qpu_inst_size = MAX2(c->qpu_inst_size * 2, 16); + c->qpu_insts = reralloc(c, c->qpu_insts, uint64_t, + c->qpu_inst_size_array_size); + + } + + c->qpu_insts[c->qpu_inst_count++] = inst; +} diff --git a/src/broadcom/compiler/v3d_compiler.h b/src/broadcom/compiler/v3d_compiler.h new file mode 100644 index 00000000000..e0eeefe245a --- /dev/null +++ b/src/broadcom/compiler/v3d_compiler.h @@ -0,0 +1,927 @@ +/* + * Copyright © 2016 Broadcom + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#ifndef V3D_COMPILER_H +#define V3D_COMPILER_H + +#include <assert.h> +#include <stdio.h> +#include <stdlib.h> +#include <stdbool.h> +#include <stdint.h> +#include <string.h> + +#include "util/macros.h" +#include "common/v3d_debug.h" +#include "compiler/nir/nir.h" +#include "util/list.h" +#include "util/u_math.h" + +#include "qpu/qpu_instr.h" +#include "pipe/p_state.h" + +#define V3D_MAX_TEXTURE_SAMPLERS 32 +#define V3D_MAX_SAMPLES 4 +#define V3D_MAX_FS_INPUTS 64 +#define V3D_MAX_VS_INPUTS 64 + +struct nir_builder; + +struct v3d_fs_inputs { + /** + * Array of the meanings of the VPM inputs this shader needs. + * + * It doesn't include those that aren't part of the VPM, like + * point/line coordinates. + */ + struct v3d_varying_slot *input_slots; + uint32_t num_inputs; +}; + +enum qfile { + /** An unused source or destination register. */ + QFILE_NULL, + + /** A physical register, such as the W coordinate payload. */ + QFILE_REG, + /** One of the regsiters for fixed function interactions. */ + QFILE_MAGIC, + + /** + * A virtual register, that will be allocated to actual accumulator + * or physical registers later. + */ + QFILE_TEMP, + QFILE_VARY, + QFILE_UNIF, + QFILE_TLB, + QFILE_TLBU, + + /** + * VPM reads use this with an index value to say what part of the VPM + * is being read. + */ + QFILE_VPM, + + /** + * Stores an immediate value in the index field that will be used + * directly by qpu_load_imm(). + */ + QFILE_LOAD_IMM, + + /** + * Stores an immediate value in the index field that can be turned + * into a small immediate field by qpu_encode_small_immediate(). + */ + QFILE_SMALL_IMM, +}; + +/** + * A reference to a QPU register or a virtual temp register. 
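+ *
+ * The file selects which namespace the index lives in: for example,
+ * vir_reg(QFILE_TEMP, 12) is virtual temp 12, while
+ * vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TLB) names the TLB write address.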
+ */ +struct qreg { + enum qfile file; + uint32_t index; +}; + +static inline struct qreg vir_reg(enum qfile file, uint32_t index) +{ + return (struct qreg){file, index}; +} + +/** + * A reference to an actual register at the QPU level, for register + * allocation. + */ +struct qpu_reg { + bool magic; + int index; +}; + +struct qinst { + /** Entry in qblock->instructions */ + struct list_head link; + + /** + * The instruction being wrapped. Its condition codes, pack flags, + * signals, etc. will all be used, with just the register references + * being replaced by the contents of qinst->dst and qinst->src[]. + */ + struct v3d_qpu_instr qpu; + + /* Pre-register-allocation references to src/dst registers */ + struct qreg dst; + struct qreg src[3]; + bool cond_is_exec_mask; + bool has_implicit_uniform; + + /* After vir_to_qpu.c: If instr reads a uniform, which uniform from + * the uncompiled stream it is. + */ + int uniform; +}; + +enum quniform_contents { + /** + * Indicates that a constant 32-bit value is copied from the program's + * uniform contents. + */ + QUNIFORM_CONSTANT, + /** + * Indicates that the program's uniform contents are used as an index + * into the GL uniform storage. + */ + QUNIFORM_UNIFORM, + + /** @{ + * Scaling factors from clip coordinates to relative to the viewport + * center. + * + * This is used by the coordinate and vertex shaders to produce the + * 32-bit entry consisting of 2 16-bit fields with 12.4 signed fixed + * point offsets from the viewport ccenter. + */ + QUNIFORM_VIEWPORT_X_SCALE, + QUNIFORM_VIEWPORT_Y_SCALE, + /** @} */ + + QUNIFORM_VIEWPORT_Z_OFFSET, + QUNIFORM_VIEWPORT_Z_SCALE, + + QUNIFORM_USER_CLIP_PLANE, + + /** + * A reference to a texture config parameter 0 uniform. + * + * This is a uniform implicitly loaded with a QPU_W_TMU* write, which + * defines texture type, miplevels, and such. It will be found as a + * parameter to the first QOP_TEX_[STRB] instruction in a sequence. + */ + QUNIFORM_TEXTURE_CONFIG_P0_0, + QUNIFORM_TEXTURE_CONFIG_P0_1, + QUNIFORM_TEXTURE_CONFIG_P0_2, + QUNIFORM_TEXTURE_CONFIG_P0_3, + QUNIFORM_TEXTURE_CONFIG_P0_4, + QUNIFORM_TEXTURE_CONFIG_P0_5, + QUNIFORM_TEXTURE_CONFIG_P0_6, + QUNIFORM_TEXTURE_CONFIG_P0_7, + QUNIFORM_TEXTURE_CONFIG_P0_8, + QUNIFORM_TEXTURE_CONFIG_P0_9, + QUNIFORM_TEXTURE_CONFIG_P0_10, + QUNIFORM_TEXTURE_CONFIG_P0_11, + QUNIFORM_TEXTURE_CONFIG_P0_12, + QUNIFORM_TEXTURE_CONFIG_P0_13, + QUNIFORM_TEXTURE_CONFIG_P0_14, + QUNIFORM_TEXTURE_CONFIG_P0_15, + QUNIFORM_TEXTURE_CONFIG_P0_16, + QUNIFORM_TEXTURE_CONFIG_P0_17, + QUNIFORM_TEXTURE_CONFIG_P0_18, + QUNIFORM_TEXTURE_CONFIG_P0_19, + QUNIFORM_TEXTURE_CONFIG_P0_20, + QUNIFORM_TEXTURE_CONFIG_P0_21, + QUNIFORM_TEXTURE_CONFIG_P0_22, + QUNIFORM_TEXTURE_CONFIG_P0_23, + QUNIFORM_TEXTURE_CONFIG_P0_24, + QUNIFORM_TEXTURE_CONFIG_P0_25, + QUNIFORM_TEXTURE_CONFIG_P0_26, + QUNIFORM_TEXTURE_CONFIG_P0_27, + QUNIFORM_TEXTURE_CONFIG_P0_28, + QUNIFORM_TEXTURE_CONFIG_P0_29, + QUNIFORM_TEXTURE_CONFIG_P0_30, + QUNIFORM_TEXTURE_CONFIG_P0_31, + QUNIFORM_TEXTURE_CONFIG_P0_32, + + /** + * A reference to a texture config parameter 1 uniform. + * + * This is a uniform implicitly loaded with a QPU_W_TMU* write, which + * defines texture width, height, filters, and wrap modes. It will be + * found as a parameter to the second QOP_TEX_[STRB] instruction in a + * sequence. 
+ */ + QUNIFORM_TEXTURE_CONFIG_P1, + + QUNIFORM_TEXTURE_FIRST_LEVEL, + + QUNIFORM_TEXTURE_WIDTH, + QUNIFORM_TEXTURE_HEIGHT, + QUNIFORM_TEXTURE_DEPTH, + QUNIFORM_TEXTURE_ARRAY_SIZE, + QUNIFORM_TEXTURE_LEVELS, + + QUNIFORM_TEXTURE_MSAA_ADDR, + + QUNIFORM_UBO_ADDR, + + QUNIFORM_TEXRECT_SCALE_X, + QUNIFORM_TEXRECT_SCALE_Y, + + QUNIFORM_TEXTURE_BORDER_COLOR, + + QUNIFORM_STENCIL, + + QUNIFORM_ALPHA_REF, + QUNIFORM_SAMPLE_MASK, +}; + +struct v3d_varying_slot { + uint8_t slot_and_component; +}; + +static inline struct v3d_varying_slot +v3d_slot_from_slot_and_component(uint8_t slot, uint8_t component) +{ + assert(slot < 255 / 4); + return (struct v3d_varying_slot){ (slot << 2) + component }; +} + +static inline uint8_t v3d_slot_get_slot(struct v3d_varying_slot slot) +{ + return slot.slot_and_component >> 2; +} + +static inline uint8_t v3d_slot_get_component(struct v3d_varying_slot slot) +{ + return slot.slot_and_component & 3; +} + +struct v3d_ubo_range { + /** + * offset in bytes from the start of the ubo where this range is + * uploaded. + * + * Only set once used is set. + */ + uint32_t dst_offset; + + /** + * offset in bytes from the start of the gallium uniforms where the + * data comes from. + */ + uint32_t src_offset; + + /** size in bytes of this ubo range */ + uint32_t size; +}; + +struct v3d_key { + void *shader_state; + struct { + uint8_t swizzle[4]; + uint8_t return_size; + uint8_t return_channels; + union { + struct { + unsigned compare_mode:1; + unsigned compare_func:3; + unsigned wrap_s:3; + unsigned wrap_t:3; + }; + struct { + uint16_t msaa_width, msaa_height; + }; + }; + } tex[V3D_MAX_TEXTURE_SAMPLERS]; + uint8_t ucp_enables; +}; + +struct v3d_fs_key { + struct v3d_key base; + bool depth_enabled; + bool is_points; + bool is_lines; + bool alpha_test; + bool point_coord_upper_left; + bool light_twoside; + bool msaa; + bool sample_coverage; + bool sample_alpha_to_coverage; + bool sample_alpha_to_one; + bool clamp_color; + bool swap_color_rb; + uint8_t alpha_test_func; + uint8_t logicop_func; + uint32_t point_sprite_mask; + + struct pipe_rt_blend_state blend; +}; + +struct v3d_vs_key { + struct v3d_key base; + + struct v3d_varying_slot fs_inputs[V3D_MAX_FS_INPUTS]; + uint8_t num_fs_inputs; + + bool is_coord; + bool per_vertex_point_size; + bool clamp_color; +}; + +/** A basic block of VIR intructions. */ +struct qblock { + struct list_head link; + + struct list_head instructions; + + struct set *predecessors; + struct qblock *successors[2]; + + int index; + + /* Instruction IPs for the first and last instruction of the block. + * Set by qpu_schedule.c. + */ + uint32_t start_qpu_ip; + uint32_t end_qpu_ip; + + /* Instruction IP for the branch instruction of the block. Set by + * qpu_schedule.c. + */ + uint32_t branch_qpu_ip; + + /** Offset within the uniform stream at the start of the block. */ + uint32_t start_uniform; + /** Offset within the uniform stream of the branch instruction */ + uint32_t branch_uniform; + + /** @{ used by v3d_vir_live_variables.c */ + BITSET_WORD *def; + BITSET_WORD *use; + BITSET_WORD *live_in; + BITSET_WORD *live_out; + int start_ip, end_ip; + /** @} */ +}; + +/** + * Compiler state saved across compiler invocations, for any expensive global + * setup. 
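+ * The main example is the register-allocation state built by
+ * vir_init_reg_sets().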
+ */ +struct v3d_compiler { + const struct v3d_device_info *devinfo; + struct ra_regs *regs; + unsigned int reg_class[3]; +}; + +struct v3d_compile { + const struct v3d_device_info *devinfo; + nir_shader *s; + nir_function_impl *impl; + struct exec_list *cf_node_list; + const struct v3d_compiler *compiler; + + /** + * Mapping from nir_register * or nir_ssa_def * to array of struct + * qreg for the values. + */ + struct hash_table *def_ht; + + /* For each temp, the instruction generating its value. */ + struct qinst **defs; + uint32_t defs_array_size; + + /** + * Inputs to the shader, arranged by TGSI declaration order. + * + * Not all fragment shader QFILE_VARY reads are present in this array. + */ + struct qreg *inputs; + struct qreg *outputs; + bool msaa_per_sample_output; + struct qreg color_reads[V3D_MAX_SAMPLES]; + struct qreg sample_colors[V3D_MAX_SAMPLES]; + uint32_t inputs_array_size; + uint32_t outputs_array_size; + uint32_t uniforms_array_size; + + /* Booleans for whether the corresponding QFILE_VARY[i] is + * flat-shaded. This doesn't count gl_FragColor flat-shading, which is + * controlled by shader->color_inputs and rasterizer->flatshade in the + * gallium driver. + */ + BITSET_WORD flat_shade_flags[BITSET_WORDS(V3D_MAX_FS_INPUTS)]; + + struct v3d_ubo_range *ubo_ranges; + bool *ubo_range_used; + uint32_t ubo_ranges_array_size; + /** Number of uniform areas tracked in ubo_ranges. */ + uint32_t num_ubo_ranges; + uint32_t next_ubo_dst_offset; + + /* State for whether we're executing on each channel currently. 0 if + * yes, otherwise a block number + 1 that the channel jumped to. + */ + struct qreg execute; + + struct qreg line_x, point_x, point_y; + + /** + * Instance ID, which comes in before the vertex attribute payload if + * the shader record requests it. + */ + struct qreg iid; + + /** + * Vertex ID, which comes in before the vertex attribute payload + * (after Instance ID) if the shader record requests it. + */ + struct qreg vid; + + /* Fragment shader payload regs. */ + struct qreg payload_w, payload_w_centroid, payload_z; + + /** boolean (~0 -> true) if the fragment has been discarded. */ + struct qreg discard; + + uint8_t vattr_sizes[V3D_MAX_VS_INPUTS]; + uint32_t num_vpm_writes; + + /** + * Array of the VARYING_SLOT_* of all FS QFILE_VARY reads. + * + * This includes those that aren't part of the VPM varyings, like + * point/line coordinates. + */ + struct v3d_varying_slot input_slots[V3D_MAX_FS_INPUTS]; + + /** + * An entry per outputs[] in the VS indicating what the VARYING_SLOT_* + * of the output is. Used to emit from the VS in the order that the + * FS needs. + */ + struct v3d_varying_slot *output_slots; + + struct pipe_shader_state *shader_state; + struct v3d_key *key; + struct v3d_fs_key *fs_key; + struct v3d_vs_key *vs_key; + + /* Live ranges of temps. 
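+         *
+         * temp_start[i] and temp_end[i] are the first and last instruction
+         * IPs at which temp i is live, filled in by
+         * vir_calculate_live_intervals().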
*/ + int *temp_start, *temp_end; + + uint32_t *uniform_data; + enum quniform_contents *uniform_contents; + uint32_t uniform_array_size; + uint32_t num_uniforms; + uint32_t num_outputs; + uint32_t output_position_index; + nir_variable *output_color_var; + uint32_t output_point_size_index; + uint32_t output_sample_mask_index; + + struct qreg undef; + uint32_t num_temps; + + struct list_head blocks; + int next_block_index; + struct qblock *cur_block; + struct qblock *loop_cont_block; + struct qblock *loop_break_block; + + uint64_t *qpu_insts; + uint32_t qpu_inst_count; + uint32_t qpu_inst_size; + + /* For the FS, the number of varying inputs not counting the + * point/line varyings payload + */ + uint32_t num_inputs; + + /** + * Number of inputs from num_inputs remaining to be queued to the read + * FIFO in the VS/CS. + */ + uint32_t num_inputs_remaining; + + /* Number of inputs currently in the read FIFO for the VS/CS */ + uint32_t num_inputs_in_fifo; + + /** Next offset in the VPM to read from in the VS/CS */ + uint32_t vpm_read_offset; + + uint32_t program_id; + uint32_t variant_id; + + /* Set to compile program in threaded FS mode, where SIG_THREAD_SWITCH + * is used to hide texturing latency at the cost of limiting ourselves + * to the bottom half of physical reg space. + */ + bool fs_threaded; + + bool last_thrsw_at_top_level; + + bool failed; +}; + +struct v3d_uniform_list { + enum quniform_contents *contents; + uint32_t *data; + uint32_t count; +}; + +struct v3d_prog_data { + struct v3d_uniform_list uniforms; + + struct v3d_ubo_range *ubo_ranges; + uint32_t num_ubo_ranges; + uint32_t ubo_size; + + uint8_t num_inputs; + +}; + +struct v3d_vs_prog_data { + struct v3d_prog_data base; + + bool uses_iid, uses_vid; + + /* Number of components read from each vertex attribute. */ + uint8_t vattr_sizes[32]; + + /* Total number of components read, for the shader state record. */ + uint32_t vpm_input_size; + + /* Total number of components written, for the shader state record. */ + uint32_t vpm_output_size; +}; + +struct v3d_fs_prog_data { + struct v3d_prog_data base; + + struct v3d_varying_slot input_slots[V3D_MAX_FS_INPUTS]; + + /** bitmask of which inputs are color inputs, for flat shade handling. */ + uint32_t color_inputs[BITSET_WORDS(V3D_MAX_FS_INPUTS)]; + + /* Bitmask for whether the corresponding input is flat-shaded, + * independent of rasterizer (gl_FragColor) flat-shading. + */ + BITSET_WORD flat_shade_flags[BITSET_WORDS(V3D_MAX_FS_INPUTS)]; + + bool writes_z; +}; + +/* Special nir_load_input intrinsic index for loading the current TLB + * destination color. 
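+ * The value just needs to be well above any driver_location a real input
+ * could have.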
+ */ +#define V3D_NIR_TLB_COLOR_READ_INPUT 2000000000 + +#define V3D_NIR_MS_MASK_OUTPUT 2000000000 + +extern const nir_shader_compiler_options v3d_nir_options; + +const struct v3d_compiler *v3d_compiler_init(const struct v3d_device_info *devinfo); +void v3d_compiler_free(const struct v3d_compiler *compiler); +void v3d_optimize_nir(struct nir_shader *s); + +uint64_t *v3d_compile_vs(const struct v3d_compiler *compiler, + struct v3d_vs_key *key, + struct v3d_vs_prog_data *prog_data, + nir_shader *s, + int program_id, int variant_id, + uint32_t *final_assembly_size); + +uint64_t *v3d_compile_fs(const struct v3d_compiler *compiler, + struct v3d_fs_key *key, + struct v3d_fs_prog_data *prog_data, + nir_shader *s, + int program_id, int variant_id, + uint32_t *final_assembly_size); + +void v3d_nir_to_vir(struct v3d_compile *c); + +void vir_compile_destroy(struct v3d_compile *c); +const char *vir_get_stage_name(struct v3d_compile *c); +struct qblock *vir_new_block(struct v3d_compile *c); +void vir_set_emit_block(struct v3d_compile *c, struct qblock *block); +void vir_link_blocks(struct qblock *predecessor, struct qblock *successor); +struct qblock *vir_entry_block(struct v3d_compile *c); +struct qblock *vir_exit_block(struct v3d_compile *c); +struct qinst *vir_add_inst(enum v3d_qpu_add_op op, struct qreg dst, + struct qreg src0, struct qreg src1); +struct qinst *vir_mul_inst(enum v3d_qpu_mul_op op, struct qreg dst, + struct qreg src0, struct qreg src1); +struct qinst *vir_branch_inst(enum v3d_qpu_branch_cond cond, struct qreg src0); +void vir_remove_instruction(struct v3d_compile *c, struct qinst *qinst); +struct qreg vir_uniform(struct v3d_compile *c, + enum quniform_contents contents, + uint32_t data); +void vir_schedule_instructions(struct v3d_compile *c); +struct v3d_qpu_instr v3d_qpu_nop(void); + +struct qreg vir_emit_def(struct v3d_compile *c, struct qinst *inst); +struct qinst *vir_emit_nondef(struct v3d_compile *c, struct qinst *inst); +void vir_set_cond(struct qinst *inst, enum v3d_qpu_cond cond); +void vir_set_pf(struct qinst *inst, enum v3d_qpu_pf pf); +void vir_set_unpack(struct qinst *inst, int src, + enum v3d_qpu_input_unpack unpack); + +struct qreg vir_get_temp(struct v3d_compile *c); +void vir_calculate_live_intervals(struct v3d_compile *c); +bool vir_has_implicit_uniform(struct qinst *inst); +int vir_get_implicit_uniform_src(struct qinst *inst); +int vir_get_non_sideband_nsrc(struct qinst *inst); +int vir_get_nsrc(struct qinst *inst); +bool vir_has_side_effects(struct v3d_compile *c, struct qinst *inst); +bool vir_get_add_op(struct qinst *inst, enum v3d_qpu_add_op *op); +bool vir_get_mul_op(struct qinst *inst, enum v3d_qpu_mul_op *op); +bool vir_is_raw_mov(struct qinst *inst); +bool vir_is_tex(struct qinst *inst); +bool vir_is_add(struct qinst *inst); +bool vir_is_mul(struct qinst *inst); +bool vir_is_float_input(struct qinst *inst); +bool vir_depends_on_flags(struct qinst *inst); +bool vir_writes_r3(struct qinst *inst); +bool vir_writes_r4(struct qinst *inst); +struct qreg vir_follow_movs(struct v3d_compile *c, struct qreg reg); +uint8_t vir_channels_written(struct qinst *inst); + +void vir_dump(struct v3d_compile *c); +void vir_dump_inst(struct v3d_compile *c, struct qinst *inst); + +void vir_validate(struct v3d_compile *c); + +void vir_optimize(struct v3d_compile *c); +bool vir_opt_algebraic(struct v3d_compile *c); +bool vir_opt_constant_folding(struct v3d_compile *c); +bool vir_opt_copy_propagate(struct v3d_compile *c); +bool vir_opt_dead_code(struct v3d_compile *c); +bool 
vir_opt_peephole_sf(struct v3d_compile *c); +bool vir_opt_small_immediates(struct v3d_compile *c); +bool vir_opt_vpm(struct v3d_compile *c); +void v3d_nir_lower_blend(nir_shader *s, struct v3d_compile *c); +void v3d_nir_lower_io(nir_shader *s, struct v3d_compile *c); +void v3d_nir_lower_txf_ms(nir_shader *s, struct v3d_compile *c); +void vir_lower_uniforms(struct v3d_compile *c); + +void v3d_vir_to_qpu(struct v3d_compile *c); +uint32_t v3d_qpu_schedule_instructions(struct v3d_compile *c); +void qpu_validate(struct v3d_compile *c); +struct qpu_reg *v3d_register_allocate(struct v3d_compile *c); +bool vir_init_reg_sets(struct v3d_compiler *compiler); + +void vir_PF(struct v3d_compile *c, struct qreg src, enum v3d_qpu_pf pf); + +static inline bool +quniform_contents_is_texture_p0(enum quniform_contents contents) +{ + return (contents >= QUNIFORM_TEXTURE_CONFIG_P0_0 && + contents < (QUNIFORM_TEXTURE_CONFIG_P0_0 + + V3D_MAX_TEXTURE_SAMPLERS)); +} + +static inline struct qreg +vir_uniform_ui(struct v3d_compile *c, uint32_t ui) +{ + return vir_uniform(c, QUNIFORM_CONSTANT, ui); +} + +static inline struct qreg +vir_uniform_f(struct v3d_compile *c, float f) +{ + return vir_uniform(c, QUNIFORM_CONSTANT, fui(f)); +} + +#define VIR_ALU0(name, vir_inst, op) \ +static inline struct qreg \ +vir_##name(struct v3d_compile *c) \ +{ \ + return vir_emit_def(c, vir_inst(op, c->undef, \ + c->undef, c->undef)); \ +} \ +static inline struct qinst * \ +vir_##name##_dest(struct v3d_compile *c, struct qreg dest) \ +{ \ + return vir_emit_nondef(c, vir_inst(op, dest, \ + c->undef, c->undef)); \ +} + +#define VIR_ALU1(name, vir_inst, op) \ +static inline struct qreg \ +vir_##name(struct v3d_compile *c, struct qreg a) \ +{ \ + return vir_emit_def(c, vir_inst(op, c->undef, \ + a, c->undef)); \ +} \ +static inline struct qinst * \ +vir_##name##_dest(struct v3d_compile *c, struct qreg dest, \ + struct qreg a) \ +{ \ + return vir_emit_nondef(c, vir_inst(op, dest, a, \ + c->undef)); \ +} + +#define VIR_ALU2(name, vir_inst, op) \ +static inline struct qreg \ +vir_##name(struct v3d_compile *c, struct qreg a, struct qreg b) \ +{ \ + return vir_emit_def(c, vir_inst(op, c->undef, a, b)); \ +} \ +static inline struct qinst * \ +vir_##name##_dest(struct v3d_compile *c, struct qreg dest, \ + struct qreg a, struct qreg b) \ +{ \ + return vir_emit_nondef(c, vir_inst(op, dest, a, b)); \ +} + +#define VIR_NODST_1(name, vir_inst, op) \ +static inline struct qinst * \ +vir_##name(struct v3d_compile *c, struct qreg a) \ +{ \ + return vir_emit_nondef(c, vir_inst(op, c->undef, \ + a, c->undef)); \ +} + +#define VIR_NODST_2(name, vir_inst, op) \ +static inline struct qinst * \ +vir_##name(struct v3d_compile *c, struct qreg a, struct qreg b) \ +{ \ + return vir_emit_nondef(c, vir_inst(op, c->undef, \ + a, b)); \ +} + +#define VIR_A_ALU2(name) VIR_ALU2(name, vir_add_inst, V3D_QPU_A_##name) +#define VIR_M_ALU2(name) VIR_ALU2(name, vir_mul_inst, V3D_QPU_M_##name) +#define VIR_A_ALU1(name) VIR_ALU1(name, vir_add_inst, V3D_QPU_A_##name) +#define VIR_M_ALU1(name) VIR_ALU1(name, vir_mul_inst, V3D_QPU_M_##name) +#define VIR_A_ALU0(name) VIR_ALU0(name, vir_add_inst, V3D_QPU_A_##name) +#define VIR_M_ALU0(name) VIR_ALU0(name, vir_mul_inst, V3D_QPU_M_##name) +#define VIR_A_NODST_2(name) VIR_NODST_2(name, vir_add_inst, V3D_QPU_A_##name) +#define VIR_M_NODST_2(name) VIR_NODST_2(name, vir_mul_inst, V3D_QPU_M_##name) +#define VIR_A_NODST_1(name) VIR_NODST_1(name, vir_add_inst, V3D_QPU_A_##name) +#define VIR_M_NODST_1(name) VIR_NODST_1(name, vir_mul_inst, 
V3D_QPU_M_##name) + +VIR_A_ALU2(FADD) +VIR_A_ALU2(VFPACK) +VIR_A_ALU2(FSUB) +VIR_A_ALU2(FMIN) +VIR_A_ALU2(FMAX) + +VIR_A_ALU2(ADD) +VIR_A_ALU2(SUB) +VIR_A_ALU2(SHL) +VIR_A_ALU2(SHR) +VIR_A_ALU2(ASR) +VIR_A_ALU2(ROR) +VIR_A_ALU2(MIN) +VIR_A_ALU2(MAX) +VIR_A_ALU2(UMIN) +VIR_A_ALU2(UMAX) +VIR_A_ALU2(AND) +VIR_A_ALU2(OR) +VIR_A_ALU2(XOR) +VIR_A_ALU2(VADD) +VIR_A_ALU2(VSUB) +VIR_A_ALU1(NOT) +VIR_A_ALU1(NEG) +VIR_A_ALU1(FLAPUSH) +VIR_A_ALU1(FLBPUSH) +VIR_A_ALU1(FLBPOP) +VIR_A_ALU1(SETMSF) +VIR_A_ALU1(SETREVF) +VIR_A_ALU1(TIDX) +VIR_A_ALU1(EIDX) + +VIR_A_ALU0(FXCD) +VIR_A_ALU0(XCD) +VIR_A_ALU0(FYCD) +VIR_A_ALU0(YCD) +VIR_A_ALU0(MSF) +VIR_A_ALU0(REVF) +VIR_A_NODST_1(VPMSETUP) +VIR_A_ALU2(FCMP) +VIR_A_ALU2(VFMAX) + +VIR_A_ALU1(FROUND) +VIR_A_ALU1(FTOIN) +VIR_A_ALU1(FTRUNC) +VIR_A_ALU1(FTOIZ) +VIR_A_ALU1(FFLOOR) +VIR_A_ALU1(FTOUZ) +VIR_A_ALU1(FCEIL) +VIR_A_ALU1(FTOC) + +VIR_A_ALU1(FDX) +VIR_A_ALU1(FDY) + +VIR_A_ALU1(ITOF) +VIR_A_ALU1(CLZ) +VIR_A_ALU1(UTOF) + +VIR_M_ALU2(UMUL24) +VIR_M_ALU2(FMUL) +VIR_M_ALU2(SMUL24) +VIR_M_NODST_2(MULTOP) + +VIR_M_ALU1(MOV) +VIR_M_ALU1(FMOV) + +static inline struct qinst * +vir_MOV_cond(struct v3d_compile *c, enum v3d_qpu_cond cond, + struct qreg dest, struct qreg src) +{ + struct qinst *mov = vir_MOV_dest(c, dest, src); + vir_set_cond(mov, cond); + return mov; +} + +static inline struct qreg +vir_SEL(struct v3d_compile *c, enum v3d_qpu_cond cond, + struct qreg src0, struct qreg src1) +{ + struct qreg t = vir_get_temp(c); + vir_MOV_dest(c, t, src1); + vir_MOV_cond(c, cond, t, src0); + return t; +} + +static inline void +vir_VPM_WRITE(struct v3d_compile *c, struct qreg val) +{ + vir_MOV_dest(c, vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_VPM), val); +} + +static inline struct qinst * +vir_NOP(struct v3d_compile *c) +{ + return vir_emit_nondef(c, vir_add_inst(V3D_QPU_A_NOP, + c->undef, c->undef, c->undef)); +} +/* +static inline struct qreg +vir_LOAD_IMM(struct v3d_compile *c, uint32_t val) +{ + return vir_emit_def(c, vir_inst(QOP_LOAD_IMM, c->undef, + vir_reg(QFILE_LOAD_IMM, val), c->undef)); +} + +static inline struct qreg +vir_LOAD_IMM_U2(struct v3d_compile *c, uint32_t val) +{ + return vir_emit_def(c, vir_inst(QOP_LOAD_IMM_U2, c->undef, + vir_reg(QFILE_LOAD_IMM, val), + c->undef)); +} +static inline struct qreg +vir_LOAD_IMM_I2(struct v3d_compile *c, uint32_t val) +{ + return vir_emit_def(c, vir_inst(QOP_LOAD_IMM_I2, c->undef, + vir_reg(QFILE_LOAD_IMM, val), + c->undef)); +} +*/ + +static inline struct qinst * +vir_BRANCH(struct v3d_compile *c, enum v3d_qpu_cond cond) +{ + /* The actual uniform_data value will be set at scheduling time */ + return vir_emit_nondef(c, vir_branch_inst(cond, vir_uniform_ui(c, 0))); +} + +#define vir_for_each_block(block, c) \ + list_for_each_entry(struct qblock, block, &c->blocks, link) + +#define vir_for_each_block_rev(block, c) \ + list_for_each_entry_rev(struct qblock, block, &c->blocks, link) + +/* Loop over the non-NULL members of the successors array. */ +#define vir_for_each_successor(succ, block) \ + for (struct qblock *succ = block->successors[0]; \ + succ != NULL; \ + succ = (succ == block->successors[1] ? 
NULL : \ + block->successors[1])) + +#define vir_for_each_inst(inst, block) \ + list_for_each_entry(struct qinst, inst, &block->instructions, link) + +#define vir_for_each_inst_rev(inst, block) \ + list_for_each_entry_rev(struct qinst, inst, &block->instructions, link) + +#define vir_for_each_inst_safe(inst, block) \ + list_for_each_entry_safe(struct qinst, inst, &block->instructions, link) + +#define vir_for_each_inst_inorder(inst, c) \ + vir_for_each_block(_block, c) \ + vir_for_each_inst(inst, _block) + +#endif /* V3D_COMPILER_H */ diff --git a/src/broadcom/compiler/v3d_nir_lower_io.c b/src/broadcom/compiler/v3d_nir_lower_io.c new file mode 100644 index 00000000000..9cdcc02195c --- /dev/null +++ b/src/broadcom/compiler/v3d_nir_lower_io.c @@ -0,0 +1,176 @@ +/* + * Copyright © 2015 Broadcom + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include "compiler/v3d_compiler.h" +#include "compiler/nir/nir_builder.h" + +/** + * Walks the NIR generated by TGSI-to-NIR or GLSL-to-NIR to lower its io + * intrinsics into something amenable to the V3D architecture. + * + * Currently, it splits VS inputs and uniforms into scalars, drops any + * non-position outputs in coordinate shaders, and fixes up the addressing on + * indirect uniform loads. FS input and VS output scalarization is handled by + * nir_lower_io_to_scalar(). + */ + +static void +replace_intrinsic_with_vec(nir_builder *b, nir_intrinsic_instr *intr, + nir_ssa_def **comps) +{ + + /* Batch things back together into a vector. This will get split by + * the later ALU scalarization pass. + */ + nir_ssa_def *vec = nir_vec(b, comps, intr->num_components); + + /* Replace the old intrinsic with a reference to our reconstructed + * vector. 
+ */ + nir_ssa_def_rewrite_uses(&intr->dest.ssa, nir_src_for_ssa(vec)); + nir_instr_remove(&intr->instr); +} + +static void +v3d_nir_lower_output(struct v3d_compile *c, nir_builder *b, + nir_intrinsic_instr *intr) +{ + nir_variable *output_var = NULL; + nir_foreach_variable(var, &c->s->outputs) { + if (var->data.driver_location == nir_intrinsic_base(intr)) { + output_var = var; + break; + } + } + assert(output_var); + + if (c->vs_key) { + int slot = output_var->data.location; + bool used = false; + + switch (slot) { + case VARYING_SLOT_PSIZ: + case VARYING_SLOT_POS: + used = true; + break; + + default: + for (int i = 0; i < c->vs_key->num_fs_inputs; i++) { + if (v3d_slot_get_slot(c->vs_key->fs_inputs[i]) == slot) { + used = true; + break; + } + } + break; + } + + if (!used) + nir_instr_remove(&intr->instr); + } +} + +static void +v3d_nir_lower_uniform(struct v3d_compile *c, nir_builder *b, + nir_intrinsic_instr *intr) +{ + b->cursor = nir_before_instr(&intr->instr); + + /* Generate scalar loads equivalent to the original vector. */ + nir_ssa_def *dests[4]; + for (unsigned i = 0; i < intr->num_components; i++) { + nir_intrinsic_instr *intr_comp = + nir_intrinsic_instr_create(c->s, intr->intrinsic); + intr_comp->num_components = 1; + nir_ssa_dest_init(&intr_comp->instr, &intr_comp->dest, 1, 32, NULL); + + /* Convert the uniform offset to bytes. If it happens + * to be a constant, constant-folding will clean up + * the shift for us. + */ + nir_intrinsic_set_base(intr_comp, + nir_intrinsic_base(intr) * 16 + + i * 4); + + intr_comp->src[0] = + nir_src_for_ssa(nir_ishl(b, intr->src[0].ssa, + nir_imm_int(b, 4))); + + dests[i] = &intr_comp->dest.ssa; + + nir_builder_instr_insert(b, &intr_comp->instr); + } + + replace_intrinsic_with_vec(b, intr, dests); +} + +static void +v3d_nir_lower_io_instr(struct v3d_compile *c, nir_builder *b, + struct nir_instr *instr) +{ + if (instr->type != nir_instr_type_intrinsic) + return; + nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); + + switch (intr->intrinsic) { + case nir_intrinsic_load_input: + break; + + case nir_intrinsic_store_output: + v3d_nir_lower_output(c, b, intr); + break; + + case nir_intrinsic_load_uniform: + v3d_nir_lower_uniform(c, b, intr); + break; + + case nir_intrinsic_load_user_clip_plane: + default: + break; + } +} + +static bool +v3d_nir_lower_io_impl(struct v3d_compile *c, nir_function_impl *impl) +{ + nir_builder b; + nir_builder_init(&b, impl); + + nir_foreach_block(block, impl) { + nir_foreach_instr_safe(instr, block) + v3d_nir_lower_io_instr(c, &b, instr); + } + + nir_metadata_preserve(impl, nir_metadata_block_index | + nir_metadata_dominance); + + return true; +} + +void +v3d_nir_lower_io(nir_shader *s, struct v3d_compile *c) +{ + nir_foreach_function(function, s) { + if (function->impl) + v3d_nir_lower_io_impl(c, function->impl); + } +} diff --git a/src/broadcom/compiler/vir.c b/src/broadcom/compiler/vir.c new file mode 100644 index 00000000000..35df757a208 --- /dev/null +++ b/src/broadcom/compiler/vir.c @@ -0,0 +1,907 @@ +/* + * Copyright © 2016-2017 Broadcom + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The 
above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include "v3d_compiler.h" + +int +vir_get_non_sideband_nsrc(struct qinst *inst) +{ + switch (inst->qpu.type) { + case V3D_QPU_INSTR_TYPE_BRANCH: + return 0; + case V3D_QPU_INSTR_TYPE_ALU: + if (inst->qpu.alu.add.op != V3D_QPU_A_NOP) + return v3d_qpu_add_op_num_src(inst->qpu.alu.add.op); + else + return v3d_qpu_mul_op_num_src(inst->qpu.alu.mul.op); + } + + return 0; +} + +int +vir_get_nsrc(struct qinst *inst) +{ + int nsrc = vir_get_non_sideband_nsrc(inst); + + if (vir_has_implicit_uniform(inst)) + nsrc++; + + return nsrc; +} + +bool +vir_has_implicit_uniform(struct qinst *inst) +{ + switch (inst->qpu.type) { + case V3D_QPU_INSTR_TYPE_BRANCH: + return true; + case V3D_QPU_INSTR_TYPE_ALU: + switch (inst->dst.file) { + case QFILE_TLBU: + return true; + default: + return inst->has_implicit_uniform; + } + } + return false; +} + +/* The sideband uniform for textures gets stored after the normal ALU + * arguments. + */ +int +vir_get_implicit_uniform_src(struct qinst *inst) +{ + return vir_get_nsrc(inst) - 1; +} + +/** + * Returns whether the instruction has any side effects that must be + * preserved. + */ +bool +vir_has_side_effects(struct v3d_compile *c, struct qinst *inst) +{ + switch (inst->qpu.type) { + case V3D_QPU_INSTR_TYPE_BRANCH: + return true; + case V3D_QPU_INSTR_TYPE_ALU: + switch (inst->qpu.alu.add.op) { + case V3D_QPU_A_SETREVF: + case V3D_QPU_A_SETMSF: + case V3D_QPU_A_VPMSETUP: + return true; + default: + break; + } + + switch (inst->qpu.alu.mul.op) { + case V3D_QPU_M_MULTOP: + return true; + default: + break; + } + } + + if (inst->qpu.sig.ldtmu) + return true; + + return false; +} + +bool +vir_is_float_input(struct qinst *inst) +{ + /* XXX: More instrs */ + switch (inst->qpu.type) { + case V3D_QPU_INSTR_TYPE_BRANCH: + return false; + case V3D_QPU_INSTR_TYPE_ALU: + switch (inst->qpu.alu.add.op) { + case V3D_QPU_A_FADD: + case V3D_QPU_A_FSUB: + case V3D_QPU_A_FMIN: + case V3D_QPU_A_FMAX: + case V3D_QPU_A_FTOIN: + return true; + default: + break; + } + + switch (inst->qpu.alu.mul.op) { + case V3D_QPU_M_FMOV: + case V3D_QPU_M_VFMUL: + case V3D_QPU_M_FMUL: + return true; + default: + break; + } + } + + return false; +} + +bool +vir_is_raw_mov(struct qinst *inst) +{ + if (inst->qpu.type != V3D_QPU_INSTR_TYPE_ALU || + (inst->qpu.alu.mul.op != V3D_QPU_M_FMOV && + inst->qpu.alu.mul.op != V3D_QPU_M_MOV)) { + return false; + } + + if (inst->qpu.alu.add.output_pack != V3D_QPU_PACK_NONE || + inst->qpu.alu.mul.output_pack != V3D_QPU_PACK_NONE) { + return false; + } + + if (inst->qpu.flags.ac != V3D_QPU_COND_NONE || + inst->qpu.flags.mc != V3D_QPU_COND_NONE) + return false; + + return true; +} + +bool +vir_is_add(struct qinst *inst) +{ + return (inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU && + inst->qpu.alu.add.op != V3D_QPU_A_NOP); +} + +bool +vir_is_mul(struct qinst *inst) +{ + return (inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU && + 
inst->qpu.alu.mul.op != V3D_QPU_M_NOP); +} + +bool +vir_is_tex(struct qinst *inst) +{ + if (inst->dst.file == QFILE_MAGIC) + return v3d_qpu_magic_waddr_is_tmu(inst->dst.index); + + return false; +} + +bool +vir_depends_on_flags(struct qinst *inst) +{ + if (inst->qpu.type == V3D_QPU_INSTR_TYPE_BRANCH) { + return (inst->qpu.branch.cond != V3D_QPU_BRANCH_COND_ALWAYS); + } else { + return (inst->qpu.flags.ac != V3D_QPU_COND_NONE && + inst->qpu.flags.mc != V3D_QPU_COND_NONE); + } +} + +bool +vir_writes_r3(struct qinst *inst) +{ + for (int i = 0; i < vir_get_nsrc(inst); i++) { + switch (inst->src[i].file) { + case QFILE_VARY: + case QFILE_VPM: + return true; + default: + break; + } + } + + return false; +} + +bool +vir_writes_r4(struct qinst *inst) +{ + switch (inst->dst.file) { + case QFILE_MAGIC: + switch (inst->dst.index) { + case V3D_QPU_WADDR_RECIP: + case V3D_QPU_WADDR_RSQRT: + case V3D_QPU_WADDR_EXP: + case V3D_QPU_WADDR_LOG: + case V3D_QPU_WADDR_SIN: + return true; + } + break; + default: + break; + } + + if (inst->qpu.sig.ldtmu) + return true; + + return false; +} + +void +vir_set_unpack(struct qinst *inst, int src, + enum v3d_qpu_input_unpack unpack) +{ + assert(src == 0 || src == 1); + + if (vir_is_add(inst)) { + if (src == 0) + inst->qpu.alu.add.a_unpack = unpack; + else + inst->qpu.alu.add.b_unpack = unpack; + } else { + assert(vir_is_mul(inst)); + if (src == 0) + inst->qpu.alu.mul.a_unpack = unpack; + else + inst->qpu.alu.mul.b_unpack = unpack; + } +} + +void +vir_set_cond(struct qinst *inst, enum v3d_qpu_cond cond) +{ + if (vir_is_add(inst)) { + inst->qpu.flags.ac = cond; + } else { + assert(vir_is_mul(inst)); + inst->qpu.flags.mc = cond; + } +} + +void +vir_set_pf(struct qinst *inst, enum v3d_qpu_pf pf) +{ + if (vir_is_add(inst)) { + inst->qpu.flags.apf = pf; + } else { + assert(vir_is_mul(inst)); + inst->qpu.flags.mpf = pf; + } +} + +#if 0 +uint8_t +vir_channels_written(struct qinst *inst) +{ + if (vir_is_mul(inst)) { + switch (inst->dst.pack) { + case QPU_PACK_MUL_NOP: + case QPU_PACK_MUL_8888: + return 0xf; + case QPU_PACK_MUL_8A: + return 0x1; + case QPU_PACK_MUL_8B: + return 0x2; + case QPU_PACK_MUL_8C: + return 0x4; + case QPU_PACK_MUL_8D: + return 0x8; + } + } else { + switch (inst->dst.pack) { + case QPU_PACK_A_NOP: + case QPU_PACK_A_8888: + case QPU_PACK_A_8888_SAT: + case QPU_PACK_A_32_SAT: + return 0xf; + case QPU_PACK_A_8A: + case QPU_PACK_A_8A_SAT: + return 0x1; + case QPU_PACK_A_8B: + case QPU_PACK_A_8B_SAT: + return 0x2; + case QPU_PACK_A_8C: + case QPU_PACK_A_8C_SAT: + return 0x4; + case QPU_PACK_A_8D: + case QPU_PACK_A_8D_SAT: + return 0x8; + case QPU_PACK_A_16A: + case QPU_PACK_A_16A_SAT: + return 0x3; + case QPU_PACK_A_16B: + case QPU_PACK_A_16B_SAT: + return 0xc; + } + } + unreachable("Bad pack field"); +} +#endif + +struct qreg +vir_get_temp(struct v3d_compile *c) +{ + struct qreg reg; + + reg.file = QFILE_TEMP; + reg.index = c->num_temps++; + + if (c->num_temps > c->defs_array_size) { + uint32_t old_size = c->defs_array_size; + c->defs_array_size = MAX2(old_size * 2, 16); + c->defs = reralloc(c, c->defs, struct qinst *, + c->defs_array_size); + memset(&c->defs[old_size], 0, + sizeof(c->defs[0]) * (c->defs_array_size - old_size)); + } + + return reg; +} + +struct qinst * +vir_add_inst(enum v3d_qpu_add_op op, struct qreg dst, struct qreg src0, struct qreg src1) +{ + struct qinst *inst = calloc(1, sizeof(*inst)); + + inst->qpu = v3d_qpu_nop(); + inst->qpu.alu.add.op = op; + + inst->dst = dst; + inst->src[0] = src0; + inst->src[1] = src1; + inst->uniform = ~0; + 
+ return inst; +} + +struct qinst * +vir_mul_inst(enum v3d_qpu_mul_op op, struct qreg dst, struct qreg src0, struct qreg src1) +{ + struct qinst *inst = calloc(1, sizeof(*inst)); + + inst->qpu = v3d_qpu_nop(); + inst->qpu.alu.mul.op = op; + + inst->dst = dst; + inst->src[0] = src0; + inst->src[1] = src1; + inst->uniform = ~0; + + return inst; +} + +struct qinst * +vir_branch_inst(enum v3d_qpu_branch_cond cond, struct qreg src) +{ + struct qinst *inst = calloc(1, sizeof(*inst)); + + inst->qpu = v3d_qpu_nop(); + inst->qpu.type = V3D_QPU_INSTR_TYPE_BRANCH; + inst->qpu.branch.cond = cond; + inst->qpu.branch.msfign = V3D_QPU_MSFIGN_NONE; + inst->qpu.branch.bdi = V3D_QPU_BRANCH_DEST_REL; + inst->qpu.branch.ub = true; + inst->qpu.branch.bdu = V3D_QPU_BRANCH_DEST_REL; + + inst->dst = vir_reg(QFILE_NULL, 0); + inst->src[0] = src; + inst->uniform = ~0; + + return inst; +} + +static void +vir_emit(struct v3d_compile *c, struct qinst *inst) +{ + list_addtail(&inst->link, &c->cur_block->instructions); + + if (inst->dst.file == QFILE_MAGIC && + inst->dst.index == V3D_QPU_WADDR_VPM) + c->num_vpm_writes++; +} + +/* Updates inst to write to a new temporary, emits it, and notes the def. */ +struct qreg +vir_emit_def(struct v3d_compile *c, struct qinst *inst) +{ + assert(inst->dst.file == QFILE_NULL); + + inst->dst = vir_get_temp(c); + + if (inst->dst.file == QFILE_TEMP) + c->defs[inst->dst.index] = inst; + + vir_emit(c, inst); + + return inst->dst; +} + +struct qinst * +vir_emit_nondef(struct v3d_compile *c, struct qinst *inst) +{ + if (inst->dst.file == QFILE_TEMP) + c->defs[inst->dst.index] = NULL; + + vir_emit(c, inst); + + return inst; +} + +struct qblock * +vir_new_block(struct v3d_compile *c) +{ + struct qblock *block = rzalloc(c, struct qblock); + + list_inithead(&block->instructions); + + block->predecessors = _mesa_set_create(block, + _mesa_hash_pointer, + _mesa_key_pointer_equal); + + block->index = c->next_block_index++; + + return block; +} + +void +vir_set_emit_block(struct v3d_compile *c, struct qblock *block) +{ + c->cur_block = block; + list_addtail(&block->link, &c->blocks); +} + +struct qblock * +vir_entry_block(struct v3d_compile *c) +{ + return list_first_entry(&c->blocks, struct qblock, link); +} + +struct qblock * +vir_exit_block(struct v3d_compile *c) +{ + return list_last_entry(&c->blocks, struct qblock, link); +} + +void +vir_link_blocks(struct qblock *predecessor, struct qblock *successor) +{ + _mesa_set_add(successor->predecessors, predecessor); + if (predecessor->successors[0]) { + assert(!predecessor->successors[1]); + predecessor->successors[1] = successor; + } else { + predecessor->successors[0] = successor; + } +} + +const struct v3d_compiler * +v3d_compiler_init(const struct v3d_device_info *devinfo) +{ + struct v3d_compiler *compiler = rzalloc(NULL, struct v3d_compiler); + if (!compiler) + return NULL; + + compiler->devinfo = devinfo; + + if (!vir_init_reg_sets(compiler)) { + ralloc_free(compiler); + return NULL; + } + + return compiler; +} + +void +v3d_compiler_free(const struct v3d_compiler *compiler) +{ + ralloc_free((void *)compiler); +} + +static struct v3d_compile * +vir_compile_init(const struct v3d_compiler *compiler, + struct v3d_key *key, + nir_shader *s, + int program_id, int variant_id) +{ + struct v3d_compile *c = rzalloc(NULL, struct v3d_compile); + + c->compiler = compiler; + c->devinfo = compiler->devinfo; + c->key = key; + c->program_id = program_id; + c->variant_id = variant_id; + + s = nir_shader_clone(c, s); + c->s = s; + + list_inithead(&c->blocks); + 
vir_set_emit_block(c, vir_new_block(c)); + + c->output_position_index = -1; + c->output_point_size_index = -1; + c->output_sample_mask_index = -1; + + c->def_ht = _mesa_hash_table_create(c, _mesa_hash_pointer, + _mesa_key_pointer_equal); + + return c; +} + +static void +v3d_lower_nir(struct v3d_compile *c) +{ + struct nir_lower_tex_options tex_options = { + .lower_rect = false, /* XXX */ + .lower_txp = ~0, + /* Apply swizzles to all samplers. */ + .swizzle_result = ~0, + }; + + /* Lower the format swizzle and (for 32-bit returns) + * ARB_texture_swizzle-style swizzle. + */ + for (int i = 0; i < ARRAY_SIZE(c->key->tex); i++) { + for (int j = 0; j < 4; j++) + tex_options.swizzles[i][j] = c->key->tex[i].swizzle[j]; + } + + NIR_PASS_V(c->s, nir_lower_tex, &tex_options); +} + +static void +v3d_lower_nir_late(struct v3d_compile *c) +{ + NIR_PASS_V(c->s, v3d_nir_lower_io, c); + NIR_PASS_V(c->s, nir_lower_idiv); +} + +static void +v3d_set_prog_data_uniforms(struct v3d_compile *c, + struct v3d_prog_data *prog_data) +{ + int count = c->num_uniforms; + struct v3d_uniform_list *ulist = &prog_data->uniforms; + + ulist->count = count; + ulist->data = ralloc_array(prog_data, uint32_t, count); + memcpy(ulist->data, c->uniform_data, + count * sizeof(*ulist->data)); + ulist->contents = ralloc_array(prog_data, enum quniform_contents, count); + memcpy(ulist->contents, c->uniform_contents, + count * sizeof(*ulist->contents)); +} + +/* Copy the compiler UBO range state to the compiled shader, dropping out + * arrays that were never referenced by an indirect load. + * + * (Note that QIR dead code elimination of an array access still leaves that + * array alive, though) + */ +static void +v3d_set_prog_data_ubo(struct v3d_compile *c, + struct v3d_prog_data *prog_data) +{ + if (!c->num_ubo_ranges) + return; + + prog_data->num_ubo_ranges = 0; + prog_data->ubo_ranges = ralloc_array(prog_data, struct v3d_ubo_range, + c->num_ubo_ranges); + for (int i = 0; i < c->num_ubo_ranges; i++) { + if (!c->ubo_range_used[i]) + continue; + + struct v3d_ubo_range *range = &c->ubo_ranges[i]; + prog_data->ubo_ranges[prog_data->num_ubo_ranges++] = *range; + prog_data->ubo_size += range->size; + } + + if (prog_data->ubo_size) { + if (V3D_DEBUG & V3D_DEBUG_SHADERDB) { + fprintf(stderr, "SHADER-DB: %s prog %d/%d: %d UBO uniforms\n", + vir_get_stage_name(c), + c->program_id, c->variant_id, + prog_data->ubo_size / 4); + } + } +} + +static void +v3d_set_prog_data(struct v3d_compile *c, + struct v3d_prog_data *prog_data) +{ + v3d_set_prog_data_uniforms(c, prog_data); + v3d_set_prog_data_ubo(c, prog_data); +} + +static uint64_t * +v3d_return_qpu_insts(struct v3d_compile *c, uint32_t *final_assembly_size) +{ + *final_assembly_size = c->qpu_inst_count * sizeof(uint64_t); + + uint64_t *qpu_insts = malloc(*final_assembly_size); + if (!qpu_insts) + return NULL; + + memcpy(qpu_insts, c->qpu_insts, *final_assembly_size); + + vir_compile_destroy(c); + + return qpu_insts; +} + +uint64_t *v3d_compile_vs(const struct v3d_compiler *compiler, + struct v3d_vs_key *key, + struct v3d_vs_prog_data *prog_data, + nir_shader *s, + int program_id, int variant_id, + uint32_t *final_assembly_size) +{ + struct v3d_compile *c = vir_compile_init(compiler, &key->base, s, + program_id, variant_id); + + c->vs_key = key; + + v3d_lower_nir(c); + + if (key->clamp_color) + NIR_PASS_V(c->s, nir_lower_clamp_color_outputs); + + if (key->base.ucp_enables) { + NIR_PASS_V(c->s, nir_lower_clip_vs, key->base.ucp_enables); + NIR_PASS_V(c->s, nir_lower_io_to_scalar, + 
nir_var_shader_out); + } + + /* Note: VS output scalarizing must happen after nir_lower_clip_vs. */ + NIR_PASS_V(c->s, nir_lower_io_to_scalar, nir_var_shader_out); + + v3d_lower_nir_late(c); + v3d_optimize_nir(c->s); + NIR_PASS_V(c->s, nir_convert_from_ssa, true); + + v3d_nir_to_vir(c); + + v3d_set_prog_data(c, &prog_data->base); + + prog_data->base.num_inputs = c->num_inputs; + + /* The vertex data gets format converted by the VPM so that + * each attribute channel takes up a VPM column. Precompute + * the sizes for the shader record. + */ + for (int i = 0; i < ARRAY_SIZE(prog_data->vattr_sizes); i++) { + prog_data->vattr_sizes[i] = c->vattr_sizes[i]; + prog_data->vpm_input_size += c->vattr_sizes[i]; + } + + /* Input/output segment size are in 8x32-bit multiples. */ + prog_data->vpm_input_size = align(prog_data->vpm_input_size, 8) / 8; + prog_data->vpm_output_size = align(c->num_vpm_writes, 8) / 8; + + prog_data->uses_vid = (s->info.system_values_read & + (1ull << SYSTEM_VALUE_VERTEX_ID)); + prog_data->uses_iid = (s->info.system_values_read & + (1ull << SYSTEM_VALUE_INSTANCE_ID)); + + return v3d_return_qpu_insts(c, final_assembly_size); +} + +static void +v3d_set_fs_prog_data_inputs(struct v3d_compile *c, + struct v3d_fs_prog_data *prog_data) +{ + prog_data->base.num_inputs = c->num_inputs; + memcpy(prog_data->input_slots, c->input_slots, + c->num_inputs * sizeof(*c->input_slots)); + + for (int i = 0; i < c->num_inputs; i++) { + struct v3d_varying_slot v3d_slot = c->input_slots[i]; + uint8_t slot = v3d_slot_get_slot(v3d_slot); + + if (slot == VARYING_SLOT_COL0 || + slot == VARYING_SLOT_COL1 || + slot == VARYING_SLOT_BFC0 || + slot == VARYING_SLOT_BFC1) { + BITSET_SET(prog_data->color_inputs, i); + } + + if (BITSET_TEST(c->flat_shade_flags, i)) + BITSET_SET(prog_data->flat_shade_flags, i); + } +} + +uint64_t *v3d_compile_fs(const struct v3d_compiler *compiler, + struct v3d_fs_key *key, + struct v3d_fs_prog_data *prog_data, + nir_shader *s, + int program_id, int variant_id, + uint32_t *final_assembly_size) +{ + struct v3d_compile *c = vir_compile_init(compiler, &key->base, s, + program_id, variant_id); + + c->fs_key = key; + + v3d_lower_nir(c); + + if (key->light_twoside) + NIR_PASS_V(c->s, nir_lower_two_sided_color); + + if (key->clamp_color) + NIR_PASS_V(c->s, nir_lower_clamp_color_outputs); + + if (key->alpha_test) { + NIR_PASS_V(c->s, nir_lower_alpha_test, key->alpha_test_func, + false); + } + + if (key->base.ucp_enables) + NIR_PASS_V(c->s, nir_lower_clip_fs, key->base.ucp_enables); + + /* Note: FS input scalarizing must happen after + * nir_lower_two_sided_color, which only handles a vec4 at a time. 
+ */ + NIR_PASS_V(c->s, nir_lower_io_to_scalar, nir_var_shader_in); + + v3d_lower_nir_late(c); + v3d_optimize_nir(c->s); + NIR_PASS_V(c->s, nir_convert_from_ssa, true); + + v3d_nir_to_vir(c); + + v3d_set_prog_data(c, &prog_data->base); + v3d_set_fs_prog_data_inputs(c, prog_data); + if (c->s->info.outputs_written & (1 << FRAG_RESULT_DEPTH)) + prog_data->writes_z = true; + + return v3d_return_qpu_insts(c, final_assembly_size); +} + +void +vir_remove_instruction(struct v3d_compile *c, struct qinst *qinst) +{ + if (qinst->dst.file == QFILE_TEMP) + c->defs[qinst->dst.index] = NULL; + + list_del(&qinst->link); + free(qinst); +} + +struct qreg +vir_follow_movs(struct v3d_compile *c, struct qreg reg) +{ + /* XXX + int pack = reg.pack; + + while (reg.file == QFILE_TEMP && + c->defs[reg.index] && + (c->defs[reg.index]->op == QOP_MOV || + c->defs[reg.index]->op == QOP_FMOV) && + !c->defs[reg.index]->dst.pack && + !c->defs[reg.index]->src[0].pack) { + reg = c->defs[reg.index]->src[0]; + } + + reg.pack = pack; + */ + return reg; +} + +void +vir_compile_destroy(struct v3d_compile *c) +{ + vir_for_each_block(block, c) { + while (!list_empty(&block->instructions)) { + struct qinst *qinst = + list_first_entry(&block->instructions, + struct qinst, link); + vir_remove_instruction(c, qinst); + } + } + + ralloc_free(c); +} + +struct qreg +vir_uniform(struct v3d_compile *c, + enum quniform_contents contents, + uint32_t data) +{ + for (int i = 0; i < c->num_uniforms; i++) { + if (c->uniform_contents[i] == contents && + c->uniform_data[i] == data) { + return vir_reg(QFILE_UNIF, i); + } + } + + uint32_t uniform = c->num_uniforms++; + + if (uniform >= c->uniform_array_size) { + c->uniform_array_size = MAX2(MAX2(16, uniform + 1), + c->uniform_array_size * 2); + + c->uniform_data = reralloc(c, c->uniform_data, + uint32_t, + c->uniform_array_size); + c->uniform_contents = reralloc(c, c->uniform_contents, + enum quniform_contents, + c->uniform_array_size); + } + + c->uniform_contents[uniform] = contents; + c->uniform_data[uniform] = data; + + return vir_reg(QFILE_UNIF, uniform); +} + +void +vir_PF(struct v3d_compile *c, struct qreg src, enum v3d_qpu_pf pf) +{ + struct qinst *last_inst = NULL; + + if (!list_empty(&c->cur_block->instructions)) + last_inst = (struct qinst *)c->cur_block->instructions.prev; + + if (src.file != QFILE_TEMP || + !c->defs[src.index] || + last_inst != c->defs[src.index]) { + /* XXX: Make the MOV be the appropriate type */ + last_inst = vir_MOV_dest(c, vir_reg(QFILE_NULL, 0), src); + last_inst = (struct qinst *)c->cur_block->instructions.prev; + } + + vir_set_pf(last_inst, pf); +} + +#define OPTPASS(func) \ + do { \ + bool stage_progress = func(c); \ + if (stage_progress) { \ + progress = true; \ + if (print_opt_debug) { \ + fprintf(stderr, \ + "VIR opt pass %2d: %s progress\n", \ + pass, #func); \ + } \ + /*XXX vir_validate(c);*/ \ + } \ + } while (0) + +void +vir_optimize(struct v3d_compile *c) +{ + bool print_opt_debug = false; + int pass = 1; + + while (true) { + bool progress = false; + + OPTPASS(vir_opt_copy_propagate); + OPTPASS(vir_opt_dead_code); + + if (!progress) + break; + + pass++; + } +} + +const char * +vir_get_stage_name(struct v3d_compile *c) +{ + if (c->vs_key && c->vs_key->is_coord) + return "MESA_SHADER_COORD"; + else + return gl_shader_stage_name(c->s->stage); +} diff --git a/src/broadcom/compiler/vir_dump.c b/src/broadcom/compiler/vir_dump.c new file mode 100644 index 00000000000..ad5c061a138 --- /dev/null +++ b/src/broadcom/compiler/vir_dump.c @@ -0,0 +1,339 @@ +/* + * 
Copyright © 2016-2017 Broadcom + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include "v3d_compiler.h" + +static void +vir_print_reg(struct v3d_compile *c, struct qreg reg) +{ + static const char *files[] = { + [QFILE_TEMP] = "t", + [QFILE_VARY] = "v", + [QFILE_UNIF] = "u", + [QFILE_TLB] = "tlb", + [QFILE_TLBU] = "tlbu", + }; + static const char *quniform_names[] = { + [QUNIFORM_VIEWPORT_X_SCALE] = "vp_x_scale", + [QUNIFORM_VIEWPORT_Y_SCALE] = "vp_y_scale", + [QUNIFORM_VIEWPORT_Z_OFFSET] = "vp_z_offset", + [QUNIFORM_VIEWPORT_Z_SCALE] = "vp_z_scale", + }; + + switch (reg.file) { + + case QFILE_NULL: + fprintf(stderr, "null"); + break; + + case QFILE_LOAD_IMM: + fprintf(stderr, "0x%08x (%f)", reg.index, uif(reg.index)); + break; + + case QFILE_REG: + fprintf(stderr, "rf%d", reg.index); + break; + + case QFILE_MAGIC: + fprintf(stderr, "%s", v3d_qpu_magic_waddr_name(reg.index)); + break; + + case QFILE_SMALL_IMM: + if ((int)reg.index >= -16 && (int)reg.index <= 15) + fprintf(stderr, "%d", reg.index); + else + fprintf(stderr, "%f", uif(reg.index)); + break; + + case QFILE_VPM: + fprintf(stderr, "vpm%d.%d", + reg.index / 4, reg.index % 4); + break; + + case QFILE_TLB: + fprintf(stderr, "%s", files[reg.file]); + break; + + case QFILE_UNIF: { + enum quniform_contents contents = c->uniform_contents[reg.index]; + + fprintf(stderr, "%s%d", files[reg.file], reg.index); + + switch (contents) { + case QUNIFORM_CONSTANT: + fprintf(stderr, " (0x%08x / %f)", + c->uniform_data[reg.index], + uif(c->uniform_data[reg.index])); + break; + + case QUNIFORM_UNIFORM: + fprintf(stderr, " (push[%d])", + c->uniform_data[reg.index]); + break; + + case QUNIFORM_TEXTURE_CONFIG_P1: + fprintf(stderr, " (tex[%d].p1)", + c->uniform_data[reg.index]); + break; + + case QUNIFORM_TEXTURE_WIDTH: + fprintf(stderr, " (tex[%d].width)", + c->uniform_data[reg.index]); + break; + case QUNIFORM_TEXTURE_HEIGHT: + fprintf(stderr, " (tex[%d].height)", + c->uniform_data[reg.index]); + break; + case QUNIFORM_TEXTURE_DEPTH: + fprintf(stderr, " (tex[%d].depth)", + c->uniform_data[reg.index]); + break; + case QUNIFORM_TEXTURE_ARRAY_SIZE: + fprintf(stderr, " (tex[%d].array_size)", + c->uniform_data[reg.index]); + break; + case QUNIFORM_TEXTURE_LEVELS: + fprintf(stderr, " (tex[%d].levels)", + c->uniform_data[reg.index]); + break; + + case QUNIFORM_UBO_ADDR: + fprintf(stderr, " (ubo[%d])", + c->uniform_data[reg.index]); + break; + + default: + if 
(quniform_contents_is_texture_p0(contents)) { + fprintf(stderr, " (tex[%d].p0: 0x%08x)", + contents - QUNIFORM_TEXTURE_CONFIG_P0_0, + c->uniform_data[reg.index]); + } else if (contents < ARRAY_SIZE(quniform_names)) { + fprintf(stderr, " (%s)", + quniform_names[contents]); + } else { + fprintf(stderr, " (%d / 0x%08x)", contents, + c->uniform_data[reg.index]); + } + } + + break; + } + + default: + fprintf(stderr, "%s%d", files[reg.file], reg.index); + break; + } +} + +static void +vir_dump_sig(struct v3d_compile *c, struct qinst *inst) +{ + struct v3d_qpu_sig *sig = &inst->qpu.sig; + + if (sig->thrsw) + fprintf(stderr, "; thrsw"); + if (sig->ldvary) + fprintf(stderr, "; ldvary"); + if (sig->ldvpm) + fprintf(stderr, "; ldvpm"); + if (sig->ldtmu) + fprintf(stderr, "; ldtmu"); + if (sig->ldunif) + fprintf(stderr, "; ldunif"); + if (sig->wrtmuc) + fprintf(stderr, "; wrtmuc"); +} + +static void +vir_dump_alu(struct v3d_compile *c, struct qinst *inst) +{ + struct v3d_qpu_instr *instr = &inst->qpu; + int nsrc = vir_get_non_sideband_nsrc(inst); + int sideband_nsrc = vir_get_nsrc(inst); + enum v3d_qpu_input_unpack unpack[2]; + + if (inst->qpu.alu.add.op != V3D_QPU_A_NOP) { + fprintf(stderr, "%s", v3d_qpu_add_op_name(instr->alu.add.op)); + fprintf(stderr, "%s", v3d_qpu_cond_name(instr->flags.ac)); + fprintf(stderr, "%s", v3d_qpu_pf_name(instr->flags.apf)); + fprintf(stderr, "%s", v3d_qpu_uf_name(instr->flags.auf)); + fprintf(stderr, " "); + + vir_print_reg(c, inst->dst); + fprintf(stderr, "%s", v3d_qpu_pack_name(instr->alu.add.output_pack)); + + unpack[0] = instr->alu.add.a_unpack; + unpack[1] = instr->alu.add.b_unpack; + } else { + fprintf(stderr, "%s", v3d_qpu_mul_op_name(instr->alu.mul.op)); + fprintf(stderr, "%s", v3d_qpu_cond_name(instr->flags.mc)); + fprintf(stderr, "%s", v3d_qpu_pf_name(instr->flags.mpf)); + fprintf(stderr, "%s", v3d_qpu_uf_name(instr->flags.muf)); + fprintf(stderr, " "); + + vir_print_reg(c, inst->dst); + fprintf(stderr, "%s", v3d_qpu_pack_name(instr->alu.mul.output_pack)); + + unpack[0] = instr->alu.mul.a_unpack; + unpack[1] = instr->alu.mul.b_unpack; + } + + for (int i = 0; i < sideband_nsrc; i++) { + fprintf(stderr, ", "); + vir_print_reg(c, inst->src[i]); + if (i < nsrc) + fprintf(stderr, "%s", v3d_qpu_unpack_name(unpack[i])); + } + + vir_dump_sig(c, inst); +} + +void +vir_dump_inst(struct v3d_compile *c, struct qinst *inst) +{ + struct v3d_qpu_instr *instr = &inst->qpu; + + switch (inst->qpu.type) { + case V3D_QPU_INSTR_TYPE_ALU: + vir_dump_alu(c, inst); + break; + case V3D_QPU_INSTR_TYPE_BRANCH: + fprintf(stderr, "b"); + if (instr->branch.ub) + fprintf(stderr, "u"); + + fprintf(stderr, "%s", + v3d_qpu_branch_cond_name(instr->branch.cond)); + fprintf(stderr, "%s", v3d_qpu_msfign_name(instr->branch.msfign)); + + switch (instr->branch.bdi) { + case V3D_QPU_BRANCH_DEST_ABS: + fprintf(stderr, " zero_addr+0x%08x", instr->branch.offset); + break; + + case V3D_QPU_BRANCH_DEST_REL: + fprintf(stderr, " %d", instr->branch.offset); + break; + + case V3D_QPU_BRANCH_DEST_LINK_REG: + fprintf(stderr, " lri"); + break; + + case V3D_QPU_BRANCH_DEST_REGFILE: + fprintf(stderr, " rf%d", instr->branch.raddr_a); + break; + } + + if (instr->branch.ub) { + switch (instr->branch.bdu) { + case V3D_QPU_BRANCH_DEST_ABS: + fprintf(stderr, ", a:unif"); + break; + + case V3D_QPU_BRANCH_DEST_REL: + fprintf(stderr, ", r:unif"); + break; + + case V3D_QPU_BRANCH_DEST_LINK_REG: + fprintf(stderr, ", lri"); + break; + + case V3D_QPU_BRANCH_DEST_REGFILE: + fprintf(stderr, ", rf%d", instr->branch.raddr_a); + 
break; + } + } + + if (vir_has_implicit_uniform(inst)) { + fprintf(stderr, " "); + vir_print_reg(c, inst->src[vir_get_implicit_uniform_src(inst)]); + } + + break; + } +} + +void +vir_dump(struct v3d_compile *c) +{ + int ip = 0; + + vir_for_each_block(block, c) { + fprintf(stderr, "BLOCK %d:\n", block->index); + vir_for_each_inst(inst, block) { + if (c->temp_start) { + bool first = true; + + for (int i = 0; i < c->num_temps; i++) { + if (c->temp_start[i] != ip) + continue; + + if (first) { + first = false; + } else { + fprintf(stderr, ", "); + } + fprintf(stderr, "S%4d", i); + } + + if (first) + fprintf(stderr, " "); + else + fprintf(stderr, " "); + } + + if (c->temp_end) { + bool first = true; + + for (int i = 0; i < c->num_temps; i++) { + if (c->temp_end[i] != ip) + continue; + + if (first) { + first = false; + } else { + fprintf(stderr, ", "); + } + fprintf(stderr, "E%4d", i); + } + + if (first) + fprintf(stderr, " "); + else + fprintf(stderr, " "); + } + + vir_dump_inst(c, inst); + fprintf(stderr, "\n"); + ip++; + } + if (block->successors[1]) { + fprintf(stderr, "-> BLOCK %d, %d\n", + block->successors[0]->index, + block->successors[1]->index); + } else if (block->successors[0]) { + fprintf(stderr, "-> BLOCK %d\n", + block->successors[0]->index); + } + } +} diff --git a/src/broadcom/compiler/vir_live_variables.c b/src/broadcom/compiler/vir_live_variables.c new file mode 100644 index 00000000000..217b716fd9f --- /dev/null +++ b/src/broadcom/compiler/vir_live_variables.c @@ -0,0 +1,340 @@ +/* + * Copyright © 2012 Intel Corporation + * Copyright © 2016 Broadcom + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ + +#define MAX_INSTRUCTION (1 << 30) + +#include "util/ralloc.h" +#include "util/register_allocate.h" +#include "v3d_compiler.h" + +struct partial_update_state { + struct qinst *insts[4]; + uint8_t channels; +}; + +static uint32_t +int_hash(const void *key) +{ + return _mesa_hash_data(key, sizeof(int)); +} + +static bool +int_compare(const void *key1, const void *key2) +{ + return *(const int *)key1 == *(const int *)key2; +} + +static int +vir_reg_to_var(struct qreg reg) +{ + if (reg.file == QFILE_TEMP) + return reg.index; + + return -1; +} + +static void +vir_setup_use(struct v3d_compile *c, struct qblock *block, int ip, + struct qreg src) +{ + int var = vir_reg_to_var(src); + if (var == -1) + return; + + c->temp_start[var] = MIN2(c->temp_start[var], ip); + c->temp_end[var] = MAX2(c->temp_end[var], ip); + + /* The use[] bitset marks when the block makes + * use of a variable without having completely + * defined that variable within the block. + */ + if (!BITSET_TEST(block->def, var)) + BITSET_SET(block->use, var); +} + +static struct partial_update_state * +get_partial_update_state(struct hash_table *partial_update_ht, + struct qinst *inst) +{ + struct hash_entry *entry = + _mesa_hash_table_search(partial_update_ht, + &inst->dst.index); + if (entry) + return entry->data; + + struct partial_update_state *state = + rzalloc(partial_update_ht, struct partial_update_state); + + _mesa_hash_table_insert(partial_update_ht, &inst->dst.index, state); + + return state; +} + +static void +vir_setup_def(struct v3d_compile *c, struct qblock *block, int ip, + struct hash_table *partial_update_ht, struct qinst *inst) +{ + if (inst->qpu.type != V3D_QPU_INSTR_TYPE_ALU) + return; + + /* The def[] bitset marks when an initialization in a + * block completely screens off previous updates of + * that variable. + */ + int var = vir_reg_to_var(inst->dst); + if (var == -1) + return; + + c->temp_start[var] = MIN2(c->temp_start[var], ip); + c->temp_end[var] = MAX2(c->temp_end[var], ip); + + /* If we've already tracked this as a def, or already used it within + * the block, there's nothing to do. + */ + if (BITSET_TEST(block->use, var) || BITSET_TEST(block->def, var)) + return; + + /* Easy, common case: unconditional full register update. + * + * We treat conditioning on the exec mask as the same as not being + * conditional. This makes sure that if the register gets set on + * either side of an if, it is treated as being screened off before + * the if. Otherwise, if there was no intervening def, its live + * interval doesn't extend back to the start of the program, and if too + * many registers did that we'd fail to register allocate. + */ + if (((inst->qpu.flags.ac == V3D_QPU_COND_NONE && + inst->qpu.flags.mc == V3D_QPU_COND_NONE) || + inst->cond_is_exec_mask) && + inst->qpu.alu.add.output_pack == V3D_QPU_PACK_NONE && + inst->qpu.alu.mul.output_pack == V3D_QPU_PACK_NONE) { + BITSET_SET(block->def, var); + return; + } + + /* Finally, look at the condition code and packing and mark it as a + * def. We need to make sure that we understand sequences of + * instructions like: + * + * mov.zs t0, t1 + * mov.zc t0, t2 + * + * or: + * + * mmov t0.8a, t1 + * mmov t0.8b, t2 + * mmov t0.8c, t3 + * mmov t0.8d, t4 + * + * as defining the temp within the block, because otherwise dst's live + * range will get extended up the control flow to the top of the + * program. 
+ */ + struct partial_update_state *state = + get_partial_update_state(partial_update_ht, inst); + uint8_t mask = 0xf; /* XXX vir_channels_written(inst); */ + + if (inst->qpu.flags.ac == V3D_QPU_COND_NONE && + inst->qpu.flags.mc == V3D_QPU_COND_NONE) { + state->channels |= mask; + } else { + for (int i = 0; i < 4; i++) { + if (!(mask & (1 << i))) + continue; + + /* XXXif (state->insts[i] && + state->insts[i]->cond == + qpu_cond_complement(inst->cond)) + state->channels |= 1 << i; + else + */ + state->insts[i] = inst; + } + } + + if (state->channels == 0xf) + BITSET_SET(block->def, var); +} + +static void +sf_state_clear(struct hash_table *partial_update_ht) +{ + struct hash_entry *entry; + + hash_table_foreach(partial_update_ht, entry) { + struct partial_update_state *state = entry->data; + + for (int i = 0; i < 4; i++) { + if (state->insts[i] && + (state->insts[i]->qpu.flags.ac != V3D_QPU_COND_NONE || + state->insts[i]->qpu.flags.mc != V3D_QPU_COND_NONE)) + state->insts[i] = NULL; + } + } +} + +/* Sets up the def/use arrays for when variables are used-before-defined or + * defined-before-used in the block. + * + * Also initializes the temp_start/temp_end to cover just the instruction IPs + * where the variable is used, which will be extended later in + * vir_compute_start_end(). + */ +static void +vir_setup_def_use(struct v3d_compile *c) +{ + struct hash_table *partial_update_ht = + _mesa_hash_table_create(c, int_hash, int_compare); + int ip = 0; + + vir_for_each_block(block, c) { + block->start_ip = ip; + + _mesa_hash_table_clear(partial_update_ht, NULL); + + vir_for_each_inst(inst, block) { + for (int i = 0; i < vir_get_nsrc(inst); i++) + vir_setup_use(c, block, ip, inst->src[i]); + + vir_setup_def(c, block, ip, partial_update_ht, inst); + + if (false /* XXX inst->uf */) + sf_state_clear(partial_update_ht); + + /* Payload registers: r0/1/2 contain W, centroid W, + * and Z at program start. Register allocation will + * force their nodes to R0/1/2. + */ + if (inst->src[0].file == QFILE_REG) { + switch (inst->src[0].index) { + case 0: + case 1: + case 2: + c->temp_start[inst->dst.index] = 0; + break; + } + } + + ip++; + } + block->end_ip = ip; + } + + _mesa_hash_table_destroy(partial_update_ht, NULL); +} + +static bool +vir_live_variables_dataflow(struct v3d_compile *c, int bitset_words) +{ + bool cont = false; + + vir_for_each_block_rev(block, c) { + /* Update live_out: Any successor using the variable + * on entrance needs us to have the variable live on + * exit. + */ + vir_for_each_successor(succ, block) { + for (int i = 0; i < bitset_words; i++) { + BITSET_WORD new_live_out = (succ->live_in[i] & + ~block->live_out[i]); + if (new_live_out) { + block->live_out[i] |= new_live_out; + cont = true; + } + } + } + + /* Update live_in */ + for (int i = 0; i < bitset_words; i++) { + BITSET_WORD new_live_in = (block->use[i] | + (block->live_out[i] & + ~block->def[i])); + if (new_live_in & ~block->live_in[i]) { + block->live_in[i] |= new_live_in; + cont = true; + } + } + } + + return cont; +} + +/** + * Extend the start/end ranges for each variable to account for the + * new information calculated from control flow. 
+ */ +static void +vir_compute_start_end(struct v3d_compile *c, int num_vars) +{ + vir_for_each_block(block, c) { + for (int i = 0; i < num_vars; i++) { + if (BITSET_TEST(block->live_in, i)) { + c->temp_start[i] = MIN2(c->temp_start[i], + block->start_ip); + c->temp_end[i] = MAX2(c->temp_end[i], + block->start_ip); + } + + if (BITSET_TEST(block->live_out, i)) { + c->temp_start[i] = MIN2(c->temp_start[i], + block->end_ip); + c->temp_end[i] = MAX2(c->temp_end[i], + block->end_ip); + } + } + } +} + +void +vir_calculate_live_intervals(struct v3d_compile *c) +{ + int bitset_words = BITSET_WORDS(c->num_temps); + + /* If we called this function more than once, then we should be + * freeing the previous arrays. + */ + assert(!c->temp_start); + + c->temp_start = rzalloc_array(c, int, c->num_temps); + c->temp_end = rzalloc_array(c, int, c->num_temps); + + for (int i = 0; i < c->num_temps; i++) { + c->temp_start[i] = MAX_INSTRUCTION; + c->temp_end[i] = -1; + } + + vir_for_each_block(block, c) { + block->def = rzalloc_array(c, BITSET_WORD, bitset_words); + block->use = rzalloc_array(c, BITSET_WORD, bitset_words); + block->live_in = rzalloc_array(c, BITSET_WORD, bitset_words); + block->live_out = rzalloc_array(c, BITSET_WORD, bitset_words); + } + + vir_setup_def_use(c); + + while (vir_live_variables_dataflow(c, bitset_words)) + ; + + vir_compute_start_end(c, c->num_temps); +} diff --git a/src/broadcom/compiler/vir_lower_uniforms.c b/src/broadcom/compiler/vir_lower_uniforms.c new file mode 100644 index 00000000000..b2741994a2d --- /dev/null +++ b/src/broadcom/compiler/vir_lower_uniforms.c @@ -0,0 +1,209 @@ +/* + * Copyright © 2014 Broadcom + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +/** + * @file v3d_vir_lower_uniforms.c + * + * This is the pre-code-generation pass for fixing up instructions that try to + * read from multiple uniform values. 
+ */ + +#include "v3d_compiler.h" +#include "util/hash_table.h" +#include "util/u_math.h" + +static inline uint32_t +index_hash(const void *key) +{ + return (uintptr_t)key; +} + +static inline bool +index_compare(const void *a, const void *b) +{ + return a == b; +} + +static void +add_uniform(struct hash_table *ht, struct qreg reg) +{ + struct hash_entry *entry; + void *key = (void *)(uintptr_t)(reg.index + 1); + + entry = _mesa_hash_table_search(ht, key); + if (entry) { + entry->data++; + } else { + _mesa_hash_table_insert(ht, key, (void *)(uintptr_t)1); + } +} + +static void +remove_uniform(struct hash_table *ht, struct qreg reg) +{ + struct hash_entry *entry; + void *key = (void *)(uintptr_t)(reg.index + 1); + + entry = _mesa_hash_table_search(ht, key); + assert(entry); + entry->data--; + if (entry->data == NULL) + _mesa_hash_table_remove(ht, entry); +} + +static bool +is_lowerable_uniform(struct qinst *inst, int i) +{ + if (inst->src[i].file != QFILE_UNIF) + return false; + if (vir_has_implicit_uniform(inst)) + return i != vir_get_implicit_uniform_src(inst); + return true; +} + +/* Returns the number of different uniform values referenced by the + * instruction. + */ +static uint32_t +vir_get_instruction_uniform_count(struct qinst *inst) +{ + uint32_t count = 0; + + for (int i = 0; i < vir_get_nsrc(inst); i++) { + if (inst->src[i].file != QFILE_UNIF) + continue; + + bool is_duplicate = false; + for (int j = 0; j < i; j++) { + if (inst->src[j].file == QFILE_UNIF && + inst->src[j].index == inst->src[i].index) { + is_duplicate = true; + break; + } + } + if (!is_duplicate) + count++; + } + + return count; +} + +void +vir_lower_uniforms(struct v3d_compile *c) +{ + struct hash_table *ht = + _mesa_hash_table_create(c, index_hash, index_compare); + + /* Walk the instruction list, finding which instructions have more + * than one uniform referenced, and add those uniform values to the + * ht. + */ + vir_for_each_inst_inorder(inst, c) { + uint32_t nsrc = vir_get_nsrc(inst); + + if (vir_get_instruction_uniform_count(inst) <= 1) + continue; + + for (int i = 0; i < nsrc; i++) { + if (is_lowerable_uniform(inst, i)) + add_uniform(ht, inst->src[i]); + } + } + + while (ht->entries) { + /* Find the most commonly used uniform in instructions that + * need a uniform lowered. + */ + uint32_t max_count = 0; + uint32_t max_index = 0; + struct hash_entry *entry; + hash_table_foreach(ht, entry) { + uint32_t count = (uintptr_t)entry->data; + uint32_t index = (uintptr_t)entry->key - 1; + if (count > max_count) { + max_count = count; + max_index = index; + } + } + + struct qreg unif = vir_reg(QFILE_UNIF, max_index); + + /* Now, find the instructions using this uniform and make them + * reference a temp instead. + */ + vir_for_each_block(block, c) { + struct qinst *mov = NULL; + + vir_for_each_inst(inst, block) { + uint32_t nsrc = vir_get_nsrc(inst); + + uint32_t count = vir_get_instruction_uniform_count(inst); + + if (count <= 1) + continue; + + /* If the block doesn't have a load of the + * uniform yet, add it. We could potentially + * do better and CSE MOVs from multiple blocks + * into dominating blocks, except that may + * cause troubles for register allocation. 
+ */ + if (!mov) { + mov = vir_mul_inst(V3D_QPU_M_MOV, + vir_get_temp(c), + unif, c->undef); + list_add(&mov->link, + &block->instructions); + c->defs[mov->dst.index] = mov; + } + + bool removed = false; + for (int i = 0; i < nsrc; i++) { + if (is_lowerable_uniform(inst, i) && + inst->src[i].index == max_index) { + inst->src[i].file = + mov->dst.file; + inst->src[i].index = + mov->dst.index; + remove_uniform(ht, unif); + removed = true; + } + } + if (removed) + count--; + + /* If the instruction doesn't need lowering any more, + * then drop it from the list. + */ + if (count <= 1) { + for (int i = 0; i < nsrc; i++) { + if (is_lowerable_uniform(inst, i)) + remove_uniform(ht, inst->src[i]); + } + } + } + } + } + + _mesa_hash_table_destroy(ht, NULL); +} diff --git a/src/broadcom/compiler/vir_opt_copy_propagate.c b/src/broadcom/compiler/vir_opt_copy_propagate.c new file mode 100644 index 00000000000..2a22a1b5521 --- /dev/null +++ b/src/broadcom/compiler/vir_opt_copy_propagate.c @@ -0,0 +1,233 @@ +/* + * Copyright © 2014 Broadcom + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +/** + * @file v3d_opt_copy_propagation.c + * + * This implements simple copy propagation for VIR without control flow. + * + * For each temp, it keeps a qreg of which source it was MOVed from, if it + * was. If we see that used later, we can just reuse the source value, since + * we know we don't have control flow, and we have SSA for our values so + * there's no killing to worry about. + */ + +#include "v3d_compiler.h" + +static bool +is_copy_mov(struct qinst *inst) +{ + if (!inst) + return false; + + if (inst->qpu.type != V3D_QPU_INSTR_TYPE_ALU || + (inst->qpu.alu.mul.op != V3D_QPU_M_FMOV && + inst->qpu.alu.mul.op != V3D_QPU_M_MOV)) { + return false; + } + + if (inst->dst.file != QFILE_TEMP) + return false; + + if (inst->src[0].file != QFILE_TEMP && + inst->src[0].file != QFILE_UNIF) { + return false; + } + + if (inst->qpu.alu.add.output_pack != V3D_QPU_PACK_NONE || + inst->qpu.alu.mul.output_pack != V3D_QPU_PACK_NONE) { + return false; + } + + if (inst->qpu.flags.ac != V3D_QPU_COND_NONE || + inst->qpu.flags.mc != V3D_QPU_COND_NONE) { + return false; + } + + switch (inst->src[0].file) { + case QFILE_MAGIC: + /* No copy propagating from R3/R4/R5 -- the MOVs from those + * are there to register allocate values produced into R3/4/5 + * to other regs (though hopefully r3/4/5). 
+ */ + switch (inst->src[0].index) { + case V3D_QPU_WADDR_R3: + case V3D_QPU_WADDR_R4: + case V3D_QPU_WADDR_R5: + return false; + default: + break; + } + break; + + case QFILE_REG: + switch (inst->src[0].index) { + case 0: + case 1: + case 2: + /* MOVs from rf0/1/2 are only to track the live + * intervals for W/centroid W/Z. + */ + return false; + } + break; + + default: + break; + } + + return true; +} + +static bool +vir_has_unpack(struct qinst *inst, int chan) +{ + assert(chan == 0 || chan == 1); + + if (vir_is_add(inst)) { + if (chan == 0) + return inst->qpu.alu.add.a_unpack != V3D_QPU_UNPACK_NONE; + else + return inst->qpu.alu.add.b_unpack != V3D_QPU_UNPACK_NONE; + } else { + if (chan == 0) + return inst->qpu.alu.mul.a_unpack != V3D_QPU_UNPACK_NONE; + else + return inst->qpu.alu.mul.b_unpack != V3D_QPU_UNPACK_NONE; + } +} + +static bool +try_copy_prop(struct v3d_compile *c, struct qinst *inst, struct qinst **movs) +{ + bool debug = false; + bool progress = false; + + for (int i = 0; i < vir_get_nsrc(inst); i++) { + if (inst->src[i].file != QFILE_TEMP) + continue; + + /* We have two ways of finding MOVs we can copy propagate + * from. One is if it's an SSA def: then we can reuse it from + * any block in the program, as long as its source is also an + * SSA def. Alternatively, if it's in the "movs" array + * tracked within the block, then we know the sources for it + * haven't been changed since we saw the instruction within + * our block. + */ + struct qinst *mov = movs[inst->src[i].index]; + if (!mov) { + if (!is_copy_mov(c->defs[inst->src[i].index])) + continue; + mov = c->defs[inst->src[i].index]; + + if (mov->src[0].file == QFILE_TEMP && + !c->defs[mov->src[0].index]) + continue; + } + + if (vir_has_unpack(mov, 0)) { + /* Make sure that the meaning of the unpack + * would be the same between the two + * instructions. + */ + if (vir_is_float_input(inst) != + vir_is_float_input(mov)) { + continue; + } + /* No composing the unpacks. */ + if (vir_has_unpack(inst, i)) + continue; + } + + if (debug) { + fprintf(stderr, "Copy propagate: "); + vir_dump_inst(c, inst); + fprintf(stderr, "\n"); + } + + inst->src[i] = mov->src[0]; + if (vir_has_unpack(mov, 0)) { + enum v3d_qpu_input_unpack unpack = mov->qpu.alu.mul.a_unpack; + + vir_set_unpack(inst, i, unpack); + } + + if (debug) { + fprintf(stderr, "to: "); + vir_dump_inst(c, inst); + fprintf(stderr, "\n"); + } + + progress = true; + } + + return progress; +} + +static void +apply_kills(struct v3d_compile *c, struct qinst **movs, struct qinst *inst) +{ + if (inst->dst.file != QFILE_TEMP) + return; + + for (int i = 0; i < c->num_temps; i++) { + if (movs[i] && + (movs[i]->dst.index == inst->dst.index || + (movs[i]->src[0].file == QFILE_TEMP && + movs[i]->src[0].index == inst->dst.index))) { + movs[i] = NULL; + } + } +} + +bool +vir_opt_copy_propagate(struct v3d_compile *c) +{ + bool progress = false; + struct qinst **movs; + + movs = ralloc_array(c, struct qinst *, c->num_temps); + if (!movs) + return false; + + vir_for_each_block(block, c) { + /* The MOVs array tracks only available movs within the + * block. 
+ */ + memset(movs, 0, sizeof(struct qinst *) * c->num_temps); + + vir_for_each_inst(inst, block) { + progress = try_copy_prop(c, inst, movs) || progress; + + apply_kills(c, movs, inst); + + if (is_copy_mov(inst)) + movs[inst->dst.index] = inst; + } + } + + ralloc_free(movs); + + return progress; +} diff --git a/src/broadcom/compiler/vir_opt_dead_code.c b/src/broadcom/compiler/vir_opt_dead_code.c new file mode 100644 index 00000000000..9e0ef20b6db --- /dev/null +++ b/src/broadcom/compiler/vir_opt_dead_code.c @@ -0,0 +1,162 @@ +/* + * Copyright © 2014 Broadcom + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +/** + * @file v3d_opt_dead_code.c + * + * This is a simple dead code eliminator for SSA values in VIR. + * + * It walks all the instructions finding what temps are used, then walks again + * to remove instructions writing unused temps. + * + * This is an inefficient implementation if you have long chains of + * instructions where the entire chain is dead, but we expect those to have + * been eliminated at the NIR level, and here we're just cleaning up small + * problems produced by NIR->VIR. + */ + +#include "v3d_compiler.h" + +static bool debug; + +static void +dce(struct v3d_compile *c, struct qinst *inst) +{ + if (debug) { + fprintf(stderr, "Removing: "); + vir_dump_inst(c, inst); + fprintf(stderr, "\n"); + } + assert(inst->qpu.flags.apf == V3D_QPU_PF_NONE); + assert(inst->qpu.flags.mpf == V3D_QPU_PF_NONE); + vir_remove_instruction(c, inst); +} + +static bool +has_nonremovable_reads(struct v3d_compile *c, struct qinst *inst) +{ + for (int i = 0; i < vir_get_nsrc(inst); i++) { + if (inst->src[i].file == QFILE_VPM) { + /* Instance ID, Vertex ID: Should have been removed at + * the NIR level + */ + if (inst->src[i].index == ~0) + return true; + + uint32_t attr = inst->src[i].index / 4; + uint32_t offset = inst->src[i].index % 4; + + if (c->vattr_sizes[attr] != offset) + return true; + + /* Can't get rid of the last VPM read, or the + * simulator (at least) throws an error. + */ + uint32_t total_size = 0; + for (uint32_t i = 0; i < ARRAY_SIZE(c->vattr_sizes); i++) + total_size += c->vattr_sizes[i]; + if (total_size == 1) + return true; + } + + /* Dead code removal of varyings is tricky, so just assert + * that it all happened at the NIR level. 
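+ * Here we simply treat any remaining QFILE_VARY read as
+ * non-removable.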
+ */ + if (inst->src[i].file == QFILE_VARY) + return true; + } + + return false; +} + +bool +vir_opt_dead_code(struct v3d_compile *c) +{ + bool progress = false; + bool *used = calloc(c->num_temps, sizeof(bool)); + + vir_for_each_inst_inorder(inst, c) { + for (int i = 0; i < vir_get_nsrc(inst); i++) { + if (inst->src[i].file == QFILE_TEMP) + used[inst->src[i].index] = true; + } + } + + vir_for_each_block(block, c) { + vir_for_each_inst_safe(inst, block) { + if (inst->dst.file != QFILE_NULL && + !(inst->dst.file == QFILE_TEMP && + !used[inst->dst.index])) { + continue; + } + + if (vir_has_side_effects(c, inst)) + continue; + + if (inst->qpu.flags.apf != V3D_QPU_PF_NONE || + inst->qpu.flags.mpf != V3D_QPU_PF_NONE|| + has_nonremovable_reads(c, inst)) { + /* If we can't remove the instruction, but we + * don't need its destination value, just + * remove the destination. The register + * allocator would trivially color it and it + * wouldn't cause any register pressure, but + * it's nicer to read the VIR code without + * unused destination regs. + */ + if (inst->dst.file == QFILE_TEMP) { + if (debug) { + fprintf(stderr, + "Removing dst from: "); + vir_dump_inst(c, inst); + fprintf(stderr, "\n"); + } + c->defs[inst->dst.index] = NULL; + inst->dst.file = QFILE_NULL; + progress = true; + } + continue; + } + + for (int i = 0; i < vir_get_nsrc(inst); i++) { + if (inst->src[i].file != QFILE_VPM) + continue; + uint32_t attr = inst->src[i].index / 4; + uint32_t offset = (inst->src[i].index % 4); + + if (c->vattr_sizes[attr] == offset) { + c->num_inputs--; + c->vattr_sizes[attr]--; + } + } + + dce(c, inst); + progress = true; + continue; + } + } + + free(used); + + return progress; +} diff --git a/src/broadcom/compiler/vir_register_allocate.c b/src/broadcom/compiler/vir_register_allocate.c new file mode 100644 index 00000000000..9ebf2cd69b4 --- /dev/null +++ b/src/broadcom/compiler/vir_register_allocate.c @@ -0,0 +1,254 @@ +/* + * Copyright © 2014 Broadcom + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ + +#include "util/ralloc.h" +#include "util/register_allocate.h" +#include "v3d_compiler.h" + +#define QPU_R(i) { .magic = false, .index = i } + +#define ACC_INDEX 0 +#define ACC_COUNT 5 +#define PHYS_INDEX (ACC_INDEX + ACC_COUNT) +#define PHYS_COUNT 64 + +bool +vir_init_reg_sets(struct v3d_compiler *compiler) +{ + compiler->regs = ra_alloc_reg_set(compiler, PHYS_INDEX + PHYS_COUNT, + true); + if (!compiler->regs) + return false; + + /* Allocate 3 regfile classes, for the ways the physical register file + * can be divided up for fragment shader threading. + */ + for (int threads = 0; threads < 3; threads++) { + compiler->reg_class[threads] = + ra_alloc_reg_class(compiler->regs); + + for (int i = PHYS_INDEX; + i < PHYS_INDEX + (PHYS_COUNT >> threads); i++) { + ra_class_add_reg(compiler->regs, + compiler->reg_class[threads], i); + } + + for (int i = ACC_INDEX + 0; i < ACC_INDEX + ACC_COUNT; i++) { + ra_class_add_reg(compiler->regs, + compiler->reg_class[threads], i); + } + } + + ra_set_finalize(compiler->regs, NULL); + + return true; +} + +struct node_to_temp_map { + uint32_t temp; + uint32_t priority; +}; + +static int +node_to_temp_priority(const void *in_a, const void *in_b) +{ + const struct node_to_temp_map *a = in_a; + const struct node_to_temp_map *b = in_b; + + return a->priority - b->priority; +} + +#define CLASS_BIT_PHYS (1 << 0) +#define CLASS_BIT_R0_R2 (1 << 1) +#define CLASS_BIT_R3 (1 << 2) +#define CLASS_BIT_R4 (1 << 3) + +/** + * Returns a mapping from QFILE_TEMP indices to struct qpu_regs. + * + * The return value should be freed by the caller. + */ +struct qpu_reg * +v3d_register_allocate(struct v3d_compile *c) +{ + struct node_to_temp_map map[c->num_temps]; + uint32_t temp_to_node[c->num_temps]; + uint8_t class_bits[c->num_temps]; + struct qpu_reg *temp_registers = calloc(c->num_temps, + sizeof(*temp_registers)); + int acc_nodes[ACC_COUNT]; + + struct ra_graph *g = ra_alloc_interference_graph(c->compiler->regs, + c->num_temps + + ARRAY_SIZE(acc_nodes)); + + /* Make some fixed nodes for the accumulators, which we will need to + * interfere with when ops have implied r3/r4 writes or for the thread + * switches. We could represent these as classes for the nodes to + * live in, but the classes take up a lot of memory to set up, so we + * don't want to make too many. + */ + for (int i = 0; i < ARRAY_SIZE(acc_nodes); i++) { + acc_nodes[i] = c->num_temps + i; + ra_set_node_reg(g, acc_nodes[i], ACC_INDEX + i); + } + + /* Compute the live ranges so we can figure out interference. */ + vir_calculate_live_intervals(c); + + for (uint32_t i = 0; i < c->num_temps; i++) { + map[i].temp = i; + map[i].priority = c->temp_end[i] - c->temp_start[i]; + } + qsort(map, c->num_temps, sizeof(map[0]), node_to_temp_priority); + for (uint32_t i = 0; i < c->num_temps; i++) { + temp_to_node[map[i].temp] = i; + } + + /* Figure out our register classes and preallocated registers. We + * start with any temp being able to be in any file, then instructions + * incrementally remove bits that the temp definitely can't be in. + */ + memset(class_bits, + CLASS_BIT_PHYS | CLASS_BIT_R0_R2 | CLASS_BIT_R3 | CLASS_BIT_R4, + sizeof(class_bits)); + + int ip = 0; + vir_for_each_inst_inorder(inst, c) { + /* If the instruction writes r3/r4 (and optionally moves its + * result to a temp), nothing else can be stored in r3/r4 across + * it. 
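+ * Model that by making every temp whose live range crosses this
+ * instruction interfere with the fixed accumulator nodes created
+ * above.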
+ */ + if (vir_writes_r3(inst)) { + for (int i = 0; i < c->num_temps; i++) { + if (c->temp_start[i] < ip && + c->temp_end[i] > ip) { + ra_add_node_interference(g, + temp_to_node[i], + acc_nodes[3]); + } + } + } + if (vir_writes_r4(inst)) { + for (int i = 0; i < c->num_temps; i++) { + if (c->temp_start[i] < ip && + c->temp_end[i] > ip) { + ra_add_node_interference(g, + temp_to_node[i], + acc_nodes[4]); + } + } + } + + if (inst->src[0].file == QFILE_REG) { + switch (inst->src[0].index) { + case 0: + case 1: + case 2: + /* Payload setup instructions: Force allocate + * the dst to the given register (so the MOV + * will disappear). + */ + assert(inst->qpu.alu.mul.op == V3D_QPU_M_MOV); + assert(inst->dst.file == QFILE_TEMP); + ra_set_node_reg(g, + temp_to_node[inst->dst.index], + PHYS_INDEX + + inst->src[0].index); + break; + } + } + +#if 0 + switch (inst->op) { + case QOP_THRSW: + /* All accumulators are invalidated across a thread + * switch. + */ + for (int i = 0; i < c->num_temps; i++) { + if (c->temp_start[i] < ip && c->temp_end[i] > ip) + class_bits[i] &= ~(CLASS_BIT_R0_R3 | + CLASS_BIT_R4); + } + break; + + default: + break; + } +#endif + + ip++; + } + + for (uint32_t i = 0; i < c->num_temps; i++) { + ra_set_node_class(g, temp_to_node[i], + c->compiler->reg_class[c->fs_threaded]); + } + + for (uint32_t i = 0; i < c->num_temps; i++) { + for (uint32_t j = i + 1; j < c->num_temps; j++) { + if (!(c->temp_start[i] >= c->temp_end[j] || + c->temp_start[j] >= c->temp_end[i])) { + ra_add_node_interference(g, + temp_to_node[i], + temp_to_node[j]); + } + } + } + + bool ok = ra_allocate(g); + if (!ok) { + if (!c->fs_threaded) { + fprintf(stderr, "Failed to register allocate:\n"); + vir_dump(c); + } + + c->failed = true; + free(temp_registers); + return NULL; + } + + for (uint32_t i = 0; i < c->num_temps; i++) { + int ra_reg = ra_get_node_reg(g, temp_to_node[i]); + if (ra_reg < PHYS_INDEX) { + temp_registers[i].magic = true; + temp_registers[i].index = (V3D_QPU_WADDR_R0 + + ra_reg - ACC_INDEX); + } else { + temp_registers[i].magic = false; + temp_registers[i].index = ra_reg - PHYS_INDEX; + } + + /* If the value's never used, just write to the NOP register + * for clarity in debug output. + */ + if (c->temp_start[i] == c->temp_end[i]) { + temp_registers[i].magic = true; + temp_registers[i].index = V3D_QPU_WADDR_NOP; + } + } + + ralloc_free(g); + + return temp_registers; +} diff --git a/src/broadcom/compiler/vir_to_qpu.c b/src/broadcom/compiler/vir_to_qpu.c new file mode 100644 index 00000000000..78bcea1e302 --- /dev/null +++ b/src/broadcom/compiler/vir_to_qpu.c @@ -0,0 +1,359 @@ +/* + * Copyright © 2016 Broadcom + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include "compiler/v3d_compiler.h" +#include "qpu/qpu_instr.h" +#include "qpu/qpu_disasm.h" + +static inline struct qpu_reg +qpu_reg(int index) +{ + struct qpu_reg reg = { + .magic = false, + .index = index, + }; + return reg; +} + +static inline struct qpu_reg +qpu_magic(enum v3d_qpu_waddr waddr) +{ + struct qpu_reg reg = { + .magic = true, + .index = waddr, + }; + return reg; +} + +static inline struct qpu_reg +qpu_acc(int acc) +{ + return qpu_magic(V3D_QPU_WADDR_R0 + acc); +} + +struct v3d_qpu_instr +v3d_qpu_nop(void) +{ + struct v3d_qpu_instr instr = { + .type = V3D_QPU_INSTR_TYPE_ALU, + .alu = { + .add = { + .op = V3D_QPU_A_NOP, + .waddr = V3D_QPU_WADDR_NOP, + .magic_write = true, + }, + .mul = { + .op = V3D_QPU_M_NOP, + .waddr = V3D_QPU_WADDR_NOP, + .magic_write = true, + }, + } + }; + + return instr; +} + +static struct qinst * +vir_nop(void) +{ + struct qreg undef = { QFILE_NULL, 0 }; + struct qinst *qinst = vir_add_inst(V3D_QPU_A_NOP, undef, undef, undef); + + return qinst; +} + +static struct qinst * +new_qpu_nop_before(struct qinst *inst) +{ + struct qinst *q = vir_nop(); + + list_addtail(&q->link, &inst->link); + + return q; +} + +static void +new_ldunif_instr(struct qinst *inst, int i) +{ + struct qinst *ldunif = new_qpu_nop_before(inst); + + ldunif->qpu.sig.ldunif = true; + assert(inst->src[i].file == QFILE_UNIF); + ldunif->uniform = inst->src[i].index; +} + +/** + * Allocates the src register (accumulator or register file) into the RADDR + * fields of the instruction. 
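+ * Accumulators map directly to the r0-r5 mux values.  Register file
+ * reads use raddr_a when it is free (or already holds this index) and
+ * fall back to raddr_b otherwise.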
+ */ +static void +set_src(struct v3d_qpu_instr *instr, enum v3d_qpu_mux *mux, struct qpu_reg src) +{ + if (src.magic) { + assert(src.index >= V3D_QPU_WADDR_R0 && + src.index <= V3D_QPU_WADDR_R5); + *mux = src.index - V3D_QPU_WADDR_R0 + V3D_QPU_MUX_R0; + return; + } + + if (instr->alu.add.a != V3D_QPU_MUX_A && + instr->alu.add.b != V3D_QPU_MUX_A && + instr->alu.mul.a != V3D_QPU_MUX_A && + instr->alu.mul.b != V3D_QPU_MUX_A) { + instr->raddr_a = src.index; + *mux = V3D_QPU_MUX_A; + } else { + if (instr->raddr_a == src.index) { + *mux = V3D_QPU_MUX_A; + } else { + assert(!(instr->alu.add.a == V3D_QPU_MUX_B && + instr->alu.add.b == V3D_QPU_MUX_B && + instr->alu.mul.a == V3D_QPU_MUX_B && + instr->alu.mul.b == V3D_QPU_MUX_B) || + src.index == instr->raddr_b); + + instr->raddr_b = src.index; + *mux = V3D_QPU_MUX_B; + } + } +} + +static void +v3d_generate_code_block(struct v3d_compile *c, + struct qblock *block, + struct qpu_reg *temp_registers) +{ + int last_vpm_read_index = -1; + + vir_for_each_inst(qinst, block) { +#if 0 + fprintf(stderr, "translating qinst to qpu: "); + vir_dump_inst(c, qinst); + fprintf(stderr, "\n"); +#endif + + struct qinst *temp; + + if (vir_has_implicit_uniform(qinst)) { + int src = vir_get_implicit_uniform_src(qinst); + assert(qinst->src[src].file == QFILE_UNIF); + qinst->uniform = qinst->src[src].index; + c->num_uniforms++; + } + + int nsrc = vir_get_non_sideband_nsrc(qinst); + struct qpu_reg src[ARRAY_SIZE(qinst->src)]; + bool emitted_ldunif = false; + for (int i = 0; i < nsrc; i++) { + int index = qinst->src[i].index; + switch (qinst->src[i].file) { + case QFILE_REG: + src[i] = qpu_reg(qinst->src[i].index); + break; + case QFILE_MAGIC: + src[i] = qpu_magic(qinst->src[i].index); + break; + case QFILE_NULL: + case QFILE_LOAD_IMM: + src[i] = qpu_acc(0); + break; + case QFILE_TEMP: + src[i] = temp_registers[index]; + break; + case QFILE_UNIF: + if (!emitted_ldunif) { + new_ldunif_instr(qinst, i); + c->num_uniforms++; + emitted_ldunif = true; + } + + src[i] = qpu_acc(5); + break; + case QFILE_VARY: + temp = new_qpu_nop_before(qinst); + temp->qpu.sig.ldvary = true; + + src[i] = qpu_acc(3); + break; + case QFILE_SMALL_IMM: + abort(); /* XXX */ +#if 0 + src[i].mux = QPU_MUX_SMALL_IMM; + src[i].addr = qpu_encode_small_immediate(qinst->src[i].index); + /* This should only have returned a valid + * small immediate field, not ~0 for failure. 
+ */ + assert(src[i].addr <= 47); +#endif + break; + + case QFILE_VPM: + assert((int)qinst->src[i].index >= + last_vpm_read_index); + (void)last_vpm_read_index; + last_vpm_read_index = qinst->src[i].index; + + temp = new_qpu_nop_before(qinst); + temp->qpu.sig.ldvpm = true; + + src[i] = qpu_acc(3); + break; + + case QFILE_TLB: + case QFILE_TLBU: + unreachable("bad vir src file"); + } + } + + struct qpu_reg dst; + switch (qinst->dst.file) { + case QFILE_NULL: + dst = qpu_magic(V3D_QPU_WADDR_NOP); + break; + + case QFILE_REG: + dst = qpu_reg(qinst->dst.index); + break; + + case QFILE_MAGIC: + dst = qpu_magic(qinst->dst.index); + break; + + case QFILE_TEMP: + dst = temp_registers[qinst->dst.index]; + break; + + case QFILE_VPM: + dst = qpu_magic(V3D_QPU_WADDR_VPM); + break; + + case QFILE_TLB: + dst = qpu_magic(V3D_QPU_WADDR_TLB); + break; + + case QFILE_TLBU: + dst = qpu_magic(V3D_QPU_WADDR_TLBU); + break; + + case QFILE_VARY: + case QFILE_UNIF: + case QFILE_SMALL_IMM: + case QFILE_LOAD_IMM: + assert(!"not reached"); + break; + } + + if (qinst->qpu.type == V3D_QPU_INSTR_TYPE_ALU) { + if (qinst->qpu.alu.add.op != V3D_QPU_A_NOP) { + assert(qinst->qpu.alu.mul.op == V3D_QPU_M_NOP); + if (nsrc >= 1) { + set_src(&qinst->qpu, + &qinst->qpu.alu.add.a, src[0]); + } + if (nsrc >= 2) { + set_src(&qinst->qpu, + &qinst->qpu.alu.add.b, src[1]); + } + + qinst->qpu.alu.add.waddr = dst.index; + qinst->qpu.alu.add.magic_write = dst.magic; + } else { + if (nsrc >= 1) { + set_src(&qinst->qpu, + &qinst->qpu.alu.mul.a, src[0]); + } + if (nsrc >= 2) { + set_src(&qinst->qpu, + &qinst->qpu.alu.mul.b, src[1]); + } + + qinst->qpu.alu.mul.waddr = dst.index; + qinst->qpu.alu.mul.magic_write = dst.magic; + } + } else { + assert(qinst->qpu.type == V3D_QPU_INSTR_TYPE_BRANCH); + } + } +} + + +static void +v3d_dump_qpu(struct v3d_compile *c) +{ + fprintf(stderr, "%s prog %d/%d QPU:\n", + vir_get_stage_name(c), + c->program_id, c->variant_id); + + for (int i = 0; i < c->qpu_inst_count; i++) { + const char *str = v3d_qpu_disasm(c->devinfo, c->qpu_insts[i]); + fprintf(stderr, "0x%016"PRIx64" %s\n", c->qpu_insts[i], str); + } + fprintf(stderr, "\n"); +} + +void +v3d_vir_to_qpu(struct v3d_compile *c) +{ + struct qpu_reg *temp_registers = v3d_register_allocate(c); + struct qblock *end_block = list_last_entry(&c->blocks, + struct qblock, link); + + /* Reset the uniform count to how many will be actually loaded by the + * generated QPU code. + */ + c->num_uniforms = 0; + + vir_for_each_block(block, c) + v3d_generate_code_block(c, block, temp_registers); + + struct qinst *thrsw = vir_nop(); + list_addtail(&thrsw->link, &end_block->instructions); + thrsw->qpu.sig.thrsw = true; + + uint32_t cycles = v3d_qpu_schedule_instructions(c); + + c->qpu_insts = rzalloc_array(c, uint64_t, c->qpu_inst_count); + int i = 0; + vir_for_each_inst_inorder(inst, c) { + bool ok = v3d_qpu_instr_pack(c->devinfo, &inst->qpu, + &c->qpu_insts[i++]); + assert(ok); (void) ok; + } + assert(i == c->qpu_inst_count); + + if (V3D_DEBUG & V3D_DEBUG_SHADERDB) { + fprintf(stderr, "SHADER-DB: %s prog %d/%d: %d estimated cycles\n", + vir_get_stage_name(c), + c->program_id, c->variant_id, + cycles); + } + + if (V3D_DEBUG & (V3D_DEBUG_QPU | + v3d_debug_flag_for_shader_stage(c->s->stage))) { + v3d_dump_qpu(c); + } + + qpu_validate(c); + + free(temp_registers); +} |