12 files changed, 1243 insertions, 247 deletions
diff --git a/src/gallium/drivers/vc4/Makefile.sources b/src/gallium/drivers/vc4/Makefile.sources
index 294869fe99a..cf464b06315 100644
--- a/src/gallium/drivers/vc4/Makefile.sources
+++ b/src/gallium/drivers/vc4/Makefile.sources
@@ -5,6 +5,8 @@ C_SOURCES := \
 	vc4_draw.c \
 	vc4_emit.c \
 	vc4_program.c \
+	vc4_qir.c \
+	vc4_qpu_emit.c \
 	vc4_qpu.c \
 	vc4_qpu_disasm.c \
 	vc4_qpu_validate.c \
diff --git a/src/gallium/drivers/vc4/vc4_context.h b/src/gallium/drivers/vc4/vc4_context.h
index 0c906488756..36ad1bd2c8d 100644
--- a/src/gallium/drivers/vc4/vc4_context.h
+++ b/src/gallium/drivers/vc4/vc4_context.h
@@ -32,6 +32,7 @@
 #include "vc4_bufmgr.h"
 #include "vc4_resource.h"
 #include "vc4_cl.h"
+#include "vc4_qir.h"
 
 #define VC4_DIRTY_BLEND         (1 <<  0)
 #define VC4_DIRTY_RASTERIZER    (1 <<  1)
@@ -63,10 +64,18 @@ struct vc4_texture_stateobj {
         unsigned dirty_samplers;
 };
 
+struct vc4_shader_uniform_info {
+        enum quniform_contents *contents; /* kind of each uniform slot */
+        uint32_t *data; /* per-slot payload: constant bits or constant index */
+        uint32_t count; /* number of entries in contents[] and data[] */
+};
+
 struct vc4_shader_state {
         struct pipe_shader_state base;
         struct vc4_bo *bo;
 
+        struct vc4_shader_uniform_info uniforms[2]; /* [0] FS/VS, [1] CS */
+
         uint32_t coord_shader_offset;
 };
 
@@ -173,7 +182,14 @@ void vc4_simulator_flush(struct vc4_context *vc4,
                          struct vc4_surface *color_surf);
 void *vc4_simulator_alloc(struct vc4_screen *screen, uint32_t size);
 
+void vc4_get_uniform_bo(struct vc4_context *vc4,
+                        struct vc4_shader_state *shader,
+                        struct vc4_constbuf_stateobj *cb,
+                        int shader_index, struct vc4_bo **out_bo,
+                        uint32_t *out_offset);
+
 void vc4_flush(struct pipe_context *pctx);
 void vc4_emit_state(struct pipe_context *pctx);
+void vc4_generate_code(struct qcompile *c);
 
 #endif /* VC4_CONTEXT_H */
diff --git a/src/gallium/drivers/vc4/vc4_draw.c b/src/gallium/drivers/vc4/vc4_draw.c
index 34977bb6938..f3283cda432 100644
--- a/src/gallium/drivers/vc4/vc4_draw.c
+++ b/src/gallium/drivers/vc4/vc4_draw.c
@@ -90,21 +90,6 @@ vc4_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info)
         struct vc4_bo *ibo = get_ibo(vc4);
 
         struct vc4_bo *vbo = get_vbo(vc4, width, height);
-        static const uint32_t fs_uni[] = { 0 };
-        uint32_t vs_uni[] = {
-                fui(vc4->framebuffer.width * 16.0f / 2.0f),
-                fui(vc4->framebuffer.height * 16.0f / 2.0f),
-        };
-        uint32_t cs_uni[] = {
-                fui(vc4->framebuffer.width * 16.0f / 2.0f),
-                fui(vc4->framebuffer.height * 16.0f / 2.0f),
-        };
-        struct vc4_bo *fs_ubo = vc4_bo_alloc_mem(vc4->screen, fs_uni,
-                                                 sizeof(fs_uni), "fs_ubo");
-        struct vc4_bo *vs_ubo = vc4_bo_alloc_mem(vc4->screen, vs_uni,
-                                                 sizeof(vs_uni), "vs_ubo");
-        struct vc4_bo *cs_ubo = vc4_bo_alloc_mem(vc4->screen, cs_uni,
-                                                 sizeof(cs_uni), "cs_ubo");
 
         vc4->needs_flush = true;
 
@@ -149,25 +134,37 @@ vc4_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info)
 
 // Shader Record
 
+        struct vc4_bo *fs_ubo, *vs_ubo, *cs_ubo;
+        uint32_t fs_ubo_offset, vs_ubo_offset, cs_ubo_offset;
+        vc4_get_uniform_bo(vc4, vc4->prog.fs,
+                           &vc4->constbuf[PIPE_SHADER_FRAGMENT],
+                           0, &fs_ubo, &fs_ubo_offset);
+        vc4_get_uniform_bo(vc4, vc4->prog.vs,
+                           &vc4->constbuf[PIPE_SHADER_VERTEX],
+                           0, &vs_ubo, &vs_ubo_offset);
+        vc4_get_uniform_bo(vc4, vc4->prog.vs,
+                           &vc4->constbuf[PIPE_SHADER_VERTEX],
+                           1, &cs_ubo, &cs_ubo_offset);
+
         cl_start_shader_reloc(&vc4->shader_rec, 7);
         cl_u16(&vc4->shader_rec, VC4_SHADER_FLAG_ENABLE_CLIPPING);
         cl_u8(&vc4->shader_rec, 0); /* fs num uniforms (unused) */
         cl_u8(&vc4->shader_rec, 0); /* fs num varyings */
         cl_reloc(vc4, &vc4->shader_rec, vc4->prog.fs->bo, 0);
-        cl_reloc(vc4, &vc4->shader_rec, fs_ubo, 0);
+        cl_reloc(vc4, &vc4->shader_rec, fs_ubo, fs_ubo_offset);
 
         cl_u16(&vc4->shader_rec, 0); /* vs num uniforms */
         cl_u8(&vc4->shader_rec, 1); /* vs attribute array bitfield */
         cl_u8(&vc4->shader_rec, 16); /* vs total attribute size */
         cl_reloc(vc4, &vc4->shader_rec, vc4->prog.vs->bo, 0);
-        cl_reloc(vc4, &vc4->shader_rec, vs_ubo, 0);
+        cl_reloc(vc4, &vc4->shader_rec, vs_ubo, vs_ubo_offset);
 
         cl_u16(&vc4->shader_rec, 0); /* cs num uniforms */
         cl_u8(&vc4->shader_rec, 1); /* cs attribute array bitfield */
         cl_u8(&vc4->shader_rec, 16); /* vs total attribute size */
         cl_reloc(vc4, &vc4->shader_rec, vc4->prog.vs->bo,
                 vc4->prog.vs->coord_shader_offset);
-        cl_reloc(vc4, &vc4->shader_rec, cs_ubo, 0);
+        cl_reloc(vc4, &vc4->shader_rec, cs_ubo, cs_ubo_offset);
 
         cl_reloc(vc4, &vc4->shader_rec, vbo, 0);
         cl_u8(&vc4->shader_rec, 15); /* bytes - 1 in the attribute*/
diff --git a/src/gallium/drivers/vc4/vc4_program.c b/src/gallium/drivers/vc4/vc4_program.c
index 56fe37c5f5f..8a937359472 100644
--- a/src/gallium/drivers/vc4/vc4_program.c
+++ b/src/gallium/drivers/vc4/vc4_program.c
@@ -27,261 +27,513 @@
 #include "pipe/p_state.h"
 #include "util/u_memory.h"
 #include "tgsi/tgsi_parse.h"
+#include "tgsi/tgsi_dump.h"
 
 #include "vc4_context.h"
 #include "vc4_qpu.h"
+#include "vc4_qir.h"
+
+struct tgsi_to_qir {
+        struct tgsi_parse_context parser;
+        struct qcompile *c;
+        struct qreg *temps; /* TGSI temporaries, indexed Index * 4 + channel */
+        struct qreg *inputs; /* TGSI inputs, same indexing */
+        struct qreg *outputs; /* TGSI outputs, same indexing */
+        struct qreg *uniforms; /* register caching each uniform slot */
+        struct qreg *consts; /* TGSI immediates, one entry per component */
+        uint32_t num_consts;
+
+        uint32_t *uniform_data; /* per-slot constant value or constant index */
+        enum quniform_contents *uniform_contents; /* per-slot kind */
+        uint32_t num_uniforms; /* slots used in the uniform arrays */
+};
+
+/* Emits a MOV from uniform slot `uniform`, remembers the resulting register
+ * in trans->uniforms[] and returns it. */
+static struct qreg
+get_temp_for_uniform(struct tgsi_to_qir *trans, uint32_t uniform)
+{
+        struct qreg unif_reg = { QFILE_UNIF, uniform };
+
+        trans->uniforms[uniform] = qir_MOV(trans->c, unif_reg);
+        return trans->uniforms[uniform];
+}
+
+/* Finds the uniform slot matching (kind, v), appending one if none does. */
+static struct qreg
+add_uniform(struct tgsi_to_qir *trans, enum quniform_contents kind, uint32_t v)
+{
+        for (uint32_t i = 0; i < trans->num_uniforms; i++) {
+                if (trans->uniform_contents[i] == kind &&
+                    trans->uniform_data[i] == v)
+                        return trans->uniforms[i];
+        }
+        trans->uniform_contents[trans->num_uniforms] = kind;
+        trans->uniform_data[trans->num_uniforms] = v;
+        return get_temp_for_uniform(trans, trans->num_uniforms++);
+}
+
+/* Register for a uniform slot holding the literal 32-bit value `ui`. */
+static struct qreg
+qir_uniform_ui(struct tgsi_to_qir *trans, uint32_t ui)
+{
+        return add_uniform(trans, QUNIFORM_CONSTANT, ui);
+}
+
+static struct qreg
+qir_uniform_f(struct tgsi_to_qir *trans, float f)
+{
+        return qir_uniform_ui(trans, fui(f)); /* same, keyed on f's bits */
+}
+
+/* Register for a uniform slot referring to constant component `index`. */
+static struct qreg
+qir_uniform(struct tgsi_to_qir *trans, uint32_t index)
+{
+        return add_uniform(trans, QUNIFORM_UNIFORM, index);
+}
+
+/* Returns channel `i` of TGSI source `src` as a QIR register, applying the
+ * swizzle, then Absolute (FMAXABS r, r) and Negate (0.0 - r) if set. */
+static struct qreg
+get_src(struct tgsi_to_qir *trans, struct tgsi_src_register *src, int i)
+{
+        struct qcompile *c = trans->c;
+        struct qreg r = c->undef;
+        uint32_t s = i;
+
+        switch (i) {
+        case TGSI_SWIZZLE_X:
+                s = src->SwizzleX;
+                break;
+        case TGSI_SWIZZLE_Y:
+                s = src->SwizzleY;
+                break;
+        case TGSI_SWIZZLE_Z:
+                s = src->SwizzleZ;
+                break;
+        case TGSI_SWIZZLE_W:
+                s = src->SwizzleW;
+                break;
+        default:
+                abort();
+        }
+
+        assert(!src->Indirect);
+        switch (src->File) {
+        case TGSI_FILE_NULL:
+                return r;
+        case TGSI_FILE_TEMPORARY:
+                r = trans->temps[src->Index * 4 + s];
+                break;
+        case TGSI_FILE_IMMEDIATE:
+                r = trans->consts[src->Index * 4 + s];
+                break;
+        case TGSI_FILE_CONSTANT:
+                r = qir_uniform(trans, src->Index * 4 + s);
+                break;
+        case TGSI_FILE_INPUT:
+                r = trans->inputs[src->Index * 4 + s];
+                break;
+        default:
+                fprintf(stderr, "unknown src file %d\n", src->File);
+                abort();
+        }
+
+        if (src->Absolute)
+                r = qir_FMAXABS(c, r, r);
+        if (src->Negate)
+                r = qir_FSUB(c, qir_uniform_f(trans, 0), r);
+
+        return r;
+}
+
 
 static void
-vc4_dump_program(const uint64_t *insts, uint count)
+update_dst(struct tgsi_to_qir *trans, struct tgsi_full_instruction *tgsi_inst,
+           int i, struct qreg val)
 {
-        for (int i = 0; i < count; i++) {
-                fprintf(stderr, "0x%016"PRIx64" ", insts[i]);
-                vc4_qpu_disasm(&insts[i], 1);
-                fprintf(stderr, "\n");
+        struct tgsi_dst_register *tgsi_dst = &tgsi_inst->Dst[0].Register;
+
+        assert(!tgsi_dst->Indirect);
+
+        switch (tgsi_dst->File) {
+        case TGSI_FILE_TEMPORARY:
+                trans->temps[tgsi_dst->Index * 4 + i] = val;
+                break;
+        case TGSI_FILE_OUTPUT:
+                trans->outputs[tgsi_dst->Index * 4 + i] = val;
+                break;
+        default:
+                fprintf(stderr, "unknown dst file %d\n", tgsi_dst->File);
+                abort();
         }
+};
+
+/* Emits `op` on channel i of the first two sources; returns the new temp. */
+static struct qreg
+tgsi_to_qir_alu(struct tgsi_to_qir *trans,
+                struct tgsi_full_instruction *tgsi_inst,
+                enum qop op, struct qreg *src, int i)
+{
+        struct qreg result = qir_get_temp(trans->c);
+        qir_emit(trans->c, qir_inst(op, result, src[i], src[4 + i]));
+        return result;
 }
 
-static struct vc4_shader_state *
-vc4_shader_state_create(struct pipe_context *pctx,
-                        const struct pipe_shader_state *cso)
+/* MAD on channel i: an FMUL of src0 and src1 whose result feeds an FADD
+ * with src2.  `op` is unused. */
+static struct qreg
+tgsi_to_qir_mad(struct tgsi_to_qir *trans,
+                struct tgsi_full_instruction *tgsi_inst,
+                enum qop op, struct qreg *src, int i)
 {
-        struct vc4_shader_state *so = CALLOC_STRUCT(vc4_shader_state);
-        if (!so)
-                return NULL;
+        struct qcompile *c = trans->c;
+        struct qreg product = qir_FMUL(c, src[i], src[4 + i]);
+
+        return qir_FADD(c, product, src[8 + i]);
+}
 
-        so->base.tokens = tgsi_dup_tokens(cso->tokens);
+static struct qreg
+tgsi_to_qir_dp(struct tgsi_to_qir *trans,
+               struct tgsi_full_instruction *tgsi_inst,
+               int num, struct qreg *src, int i)
+{
+        struct qcompile *c = trans->c;
 
-        return so;
+        struct qreg *a = &src[0 * 4], *b = &src[1 * 4];
+        struct qreg acc = qir_FMUL(c, a[0], b[0]);
+
+        /* Accumulate the remaining channels' products; `i` is not used. */
+        for (int chan = 1; chan < num; chan++)
+                acc = qir_FADD(c, acc, qir_FMUL(c, a[chan], b[chan]));
+        return acc;
 }
 
-static void *
-vc4_fs_state_create(struct pipe_context *pctx,
-                    const struct pipe_shader_state *cso)
+static struct qreg
+tgsi_to_qir_dp2(struct tgsi_to_qir *trans,
+                 struct tgsi_full_instruction *tgsi_inst,
+                 enum qop op, struct qreg *src, int i)
 {
-        struct vc4_context *vc4 = vc4_context(pctx);
-        struct vc4_shader_state *so = vc4_shader_state_create(pctx, cso);
-        if (!so)
-                return NULL;
+        return tgsi_to_qir_dp(trans, tgsi_inst, 2, src, i); /* op ignored */
+}
 
-        uint64_t gen_fsc[100];
-        uint64_t cur_inst;
-        int gen_fsc_len = 0;
-#if 0
-        cur_inst = qpu_load_imm_f(qpu_r5(), 0.0f);
-        gen_fsc[gen_fsc_len++] = cur_inst;
-
-        cur_inst = qpu_inst(qpu_a_MOV(qpu_r0(), qpu_vary()),
-                            qpu_m_MOV(qpu_r3(), qpu_r5()));
-        cur_inst |= QPU_PM;
-        cur_inst |= QPU_SET_FIELD(QPU_PACK_MUL_8D, QPU_PACK);
-        gen_fsc[gen_fsc_len++] = cur_inst;
-
-        cur_inst = qpu_inst(qpu_a_FADD(qpu_r0(), qpu_r0(), qpu_r5()),
-                            qpu_m_MOV(qpu_r1(), qpu_vary()));
-        gen_fsc[gen_fsc_len++] = cur_inst;
-
-        cur_inst = qpu_inst(qpu_a_FADD(qpu_r1(), qpu_r1(), qpu_r5()),
-                            qpu_m_MOV(qpu_r2(), qpu_vary()));
-        cur_inst = (cur_inst & ~QPU_SIG_MASK) | QPU_SET_FIELD(QPU_SIG_WAIT_FOR_SCOREBOARD, QPU_SIG);
-        gen_fsc[gen_fsc_len++] = cur_inst;
-
-        cur_inst = qpu_inst(qpu_a_FADD(qpu_r2(), qpu_r2(), qpu_r5()),
-                            qpu_m_MOV(qpu_r3(), qpu_r0()));
-        cur_inst |= QPU_PM;
-        cur_inst |= QPU_SET_FIELD(QPU_PACK_MUL_8A, QPU_PACK);
-        gen_fsc[gen_fsc_len++] = cur_inst;
-
-        cur_inst = qpu_inst(qpu_a_NOP(),
-                            qpu_m_MOV(qpu_r3(), qpu_r1()));
-        cur_inst |= QPU_PM;
-        cur_inst |= QPU_SET_FIELD(QPU_PACK_MUL_8B, QPU_PACK);
-        gen_fsc[gen_fsc_len++] = cur_inst;
-
-        cur_inst = qpu_inst(qpu_a_NOP(),
-                            qpu_m_MOV(qpu_r3(), qpu_r2()));
-        cur_inst |= QPU_PM;
-        cur_inst |= QPU_SET_FIELD(QPU_PACK_MUL_8C, QPU_PACK);
-        gen_fsc[gen_fsc_len++] = cur_inst;
-
-        cur_inst = qpu_inst(qpu_a_MOV(qpu_tlbc(), qpu_r3()),
-                            qpu_m_NOP());
-        cur_inst = (cur_inst & ~QPU_SIG_MASK) | QPU_SET_FIELD(QPU_SIG_PROG_END, QPU_SIG);
-        gen_fsc[gen_fsc_len++] = cur_inst;
-
-        cur_inst = qpu_inst(qpu_a_NOP(), qpu_m_NOP());
-        gen_fsc[gen_fsc_len++] = cur_inst;
-
-        cur_inst = qpu_inst(qpu_a_NOP(), qpu_m_NOP());
-        cur_inst = (cur_inst & ~QPU_SIG_MASK) | QPU_SET_FIELD(QPU_SIG_SCOREBOARD_UNLOCK, QPU_SIG);
-        gen_fsc[gen_fsc_len++] = cur_inst;
-
-#else
-
-        /* drain the varyings. */
-        for (int i = 0; i < 3; i++) {
-                cur_inst = qpu_inst(qpu_a_MOV(qpu_ra(QPU_W_NOP), qpu_rb(QPU_R_NOP)),
-                                    qpu_m_NOP());
-                if (i == 1)
-                        cur_inst = (cur_inst & ~QPU_SIG_MASK) | QPU_SET_FIELD(QPU_SIG_WAIT_FOR_SCOREBOARD, QPU_SIG);
-                gen_fsc[gen_fsc_len++] = cur_inst;
-
-                cur_inst = qpu_inst(qpu_a_NOP(), qpu_m_NOP());
-                gen_fsc[gen_fsc_len++] = cur_inst;
+static struct qreg
+tgsi_to_qir_dp3(struct tgsi_to_qir *trans,
+                 struct tgsi_full_instruction *tgsi_inst,
+                 enum qop op, struct qreg *src, int i)
+{
+        return tgsi_to_qir_dp(trans, tgsi_inst, 3, src, i); /* op ignored */
+}
+
+static struct qreg
+tgsi_to_qir_dp4(struct tgsi_to_qir *trans,
+                 struct tgsi_full_instruction *tgsi_inst,
+                 enum qop op, struct qreg *src, int i)
+{
+        return tgsi_to_qir_dp(trans, tgsi_inst, 4, src, i); /* op ignored */
+}
+
+/* Translates one TGSI instruction into QIR: looks the opcode up in op_trans,
+ * runs its handler for every write-masked channel, then applies saturate. */
+static void
+emit_tgsi_instruction(struct tgsi_to_qir *trans,
+                      struct tgsi_full_instruction *tgsi_inst)
+{
+        struct qcompile *c = trans->c;
+        static const struct {
+                enum qop op;
+                struct qreg (*func)(struct tgsi_to_qir *trans,
+                                    struct tgsi_full_instruction *tgsi_inst,
+                                    enum qop op,
+                                    struct qreg *src, int i);
+        } op_trans[] = {
+                [TGSI_OPCODE_MOV] = { QOP_MOV, tgsi_to_qir_alu },
+                [TGSI_OPCODE_ABS] = { QOP_FMAXABS, tgsi_to_qir_alu },
+                [TGSI_OPCODE_MUL] = { QOP_FMUL, tgsi_to_qir_alu },
+                [TGSI_OPCODE_ADD] = { QOP_FADD, tgsi_to_qir_alu },
+                [TGSI_OPCODE_SUB] = { QOP_FSUB, tgsi_to_qir_alu },
+                [TGSI_OPCODE_MIN] = { QOP_FMIN, tgsi_to_qir_alu },
+                [TGSI_OPCODE_MAX] = { QOP_FMAX, tgsi_to_qir_alu },
+                [TGSI_OPCODE_RSQ] = { QOP_RSQ, tgsi_to_qir_alu },
+                [TGSI_OPCODE_MAD] = { 0, tgsi_to_qir_mad },
+                [TGSI_OPCODE_DP2] = { 0, tgsi_to_qir_dp2 },
+                [TGSI_OPCODE_DP3] = { 0, tgsi_to_qir_dp3 },
+                [TGSI_OPCODE_DP4] = { 0, tgsi_to_qir_dp4 },
+                [TGSI_OPCODE_LIT] = { QOP_MOV, tgsi_to_qir_alu }, /* XXX */
+        };
+        static int asdf = 0; /* running index passed to the TGSI dump */
+        uint32_t tgsi_op = tgsi_inst->Instruction.Opcode;
+
+        if (tgsi_op == TGSI_OPCODE_END)
+                return;
+
+        tgsi_dump_instruction(tgsi_inst, asdf++);
+        /* ARRAY_SIZE(op_trans) is itself out of range, hence >=. */
+        if (tgsi_op >= ARRAY_SIZE(op_trans) || !op_trans[tgsi_op].func) {
+                fprintf(stderr, "unknown tgsi inst: ");
+                tgsi_dump_instruction(tgsi_inst, asdf++);
+                fprintf(stderr, "\n");
+                abort();
+        }
+
+        struct qreg src_regs[12];
+        for (int s = 0; s < 3; s++) {
+                for (int i = 0; i < 4; i++) {
+                        src_regs[4 * s + i] =
+                                get_src(trans, &tgsi_inst->Src[s].Register, i);
+                }
         }
 
-        /* some colors */
-#if 1
         for (int i = 0; i < 4; i++) {
-                cur_inst = qpu_load_imm_f(qpu_rn(i), .2 + i / 4.0);
-                gen_fsc[gen_fsc_len++] = cur_inst;
+                if (!(tgsi_inst->Dst[0].Register.WriteMask & (1 << i)))
+                        continue;
+                struct qreg result =
+                        op_trans[tgsi_op].func(trans, tgsi_inst,
+                                               op_trans[tgsi_op].op,
+                                               src_regs, i);
+
+                if (tgsi_inst->Instruction.Saturate) {
+                        float low = (tgsi_inst->Instruction.Saturate ==
+                                     TGSI_SAT_MINUS_PLUS_ONE ? -1.0 : 0.0);
+                        result = qir_FMAX(c,
+                                          qir_FMIN(c,
+                                                   result,
+                                                   qir_uniform_f(trans, 1.0)),
+                                          qir_uniform_f(trans, low));
+                }
+
+                update_dst(trans, tgsi_inst, i, result);
         }
+}
 
+static void
+parse_tgsi_immediate(struct tgsi_to_qir *trans, struct tgsi_full_immediate *imm)
+{
         for (int i = 0; i < 4; i++) {
-                cur_inst = qpu_inst(qpu_a_NOP(),
-                                    qpu_m_FMUL(qpu_ra(1),
-                                               qpu_rn(i), qpu_rn(i)));
-                cur_inst |= QPU_PM;
-                cur_inst |= QPU_SET_FIELD(QPU_PACK_A_8A + i, QPU_PACK);
-                gen_fsc[gen_fsc_len++] = cur_inst;
+                unsigned n = trans->num_consts++; /* next free consts[] slot */
+                trans->consts[n] = qir_uniform_ui(trans, imm->u[i].Uint);
         }
-#else
-        cur_inst = qpu_load_imm_ui(qpu_ra(1), 0x22446688);
-        gen_fsc[gen_fsc_len++] = cur_inst;
-#endif
+}
 
-        cur_inst = qpu_inst(qpu_a_NOP(), qpu_m_NOP());
-        gen_fsc[gen_fsc_len++] = cur_inst;
+/* Placeholder FS inputs: inputs[i] is the constant uniform i / 4.0. */
+static void
+emit_frag_init(struct tgsi_to_qir *trans, struct vc4_shader_state *so)
+{
+        /* XXX: lols */
+        for (int i = 0; i < 4; i++)
+                trans->inputs[i] = qir_uniform_f(trans, i / 4.0);
 
-        cur_inst = qpu_inst(qpu_a_MOV(qpu_tlbc(), qpu_ra(1)),
-                            qpu_m_NOP());
-        cur_inst = (cur_inst & ~QPU_SIG_MASK) | QPU_SET_FIELD(QPU_SIG_PROG_END, QPU_SIG);
-        gen_fsc[gen_fsc_len++] = cur_inst;
+}
 
-        cur_inst = qpu_inst(qpu_a_NOP(), qpu_m_NOP());
-        gen_fsc[gen_fsc_len++] = cur_inst;
+static void
+emit_vert_init(struct tgsi_to_qir *trans, struct vc4_shader_state *so)
+{
+        struct qcompile *c = trans->c;
 
-        cur_inst = qpu_inst(qpu_a_NOP(), qpu_m_NOP());
-        cur_inst = (cur_inst & ~QPU_SIG_MASK) | QPU_SET_FIELD(QPU_SIG_SCOREBOARD_UNLOCK, QPU_SIG);
-        gen_fsc[gen_fsc_len++] = cur_inst;
-#endif
+        /* One QOP_VPM_READ per input component.  XXX: attribute type/size/count */
+        for (int chan = 0; chan < 4; chan++) {
+                struct qreg attr = qir_get_temp(c);
+                trans->inputs[chan] = attr;
+                qir_emit(c, qir_inst(QOP_VPM_READ, attr, c->undef, c->undef));
+        }
+}
 
+static void
+emit_coord_init(struct tgsi_to_qir *trans, struct vc4_shader_state *so)
+{
+        struct qcompile *c = trans->c;
 
-        if (1)
-                vc4_dump_program(gen_fsc, gen_fsc_len);
-        vc4_qpu_validate(gen_fsc, gen_fsc_len);
+        /* One QOP_VPM_READ per input component.  XXX: attribute type/size/count */
+        for (int chan = 0; chan < 4; chan++) {
+                struct qreg attr = qir_get_temp(c);
+                trans->inputs[chan] = attr;
+                qir_emit(c, qir_inst(QOP_VPM_READ, attr, c->undef, c->undef));
+        }
+}
 
-        so->bo = vc4_bo_alloc_mem(vc4->screen, gen_fsc,
-                                  gen_fsc_len * sizeof(uint64_t), "fs_code");
+/* Packs outputs[0..3] with QOP_PACK_COLORS into a temp and passes that temp
+ * to QOP_TLB_COLOR_WRITE. */
+static void
+emit_frag_end(struct tgsi_to_qir *trans, struct vc4_shader_state *so)
+{
+        struct qcompile *c = trans->c;
+        struct qreg *out = trans->outputs;
+        struct qreg packed = qir_get_temp(c);
+
+        qir_emit(c, qir_inst4(QOP_PACK_COLORS, packed,
+                              out[0], out[1], out[2], out[3]));
+
+        qir_emit(c, qir_inst(QOP_TLB_COLOR_WRITE, c->undef, packed, c->undef));
+}
 
-        return so;
+/* Multiplies outputs[0] and outputs[1] by new QUNIFORM_VIEWPORT_X_SCALE + i
+ * uniforms, converts with FTOI and VPM-writes the PACK_SCALED pair. */
+static void
+emit_scaled_viewport_write(struct tgsi_to_qir *trans)
+{
+        struct qcompile *c = trans->c;
+        struct qreg xyi[2];
+
+        for (int i = 0; i < 2; i++) {
+                uint32_t slot = trans->num_uniforms++;
+                struct qreg scale = { QFILE_UNIF, slot };
+                trans->uniform_contents[slot] = QUNIFORM_VIEWPORT_X_SCALE + i;
+                xyi[i] = qir_FTOI(c, qir_FMUL(c, trans->outputs[i], scale));
+        }
+        qir_VPM_WRITE(c, qir_PACK_SCALED(c, xyi[0], xyi[1]));
 }
 
-static int
-gen_vs_cs_code(uint64_t *gen, bool is_vs)
+static void
+emit_zs_write(struct tgsi_to_qir *trans)
 {
-        uint32_t count = 0;
-        uint64_t cur_inst;
-        struct qpu_reg x = qpu_ra(0);
-        struct qpu_reg y = qpu_ra(1);
-        struct qpu_reg z = qpu_ra(2);
-        struct qpu_reg w = qpu_ra(3);
-        struct qpu_reg xy = qpu_ra(10);
-        struct qpu_reg xs = qpu_ra(12);
-        struct qpu_reg ys = qpu_ra(13);
-        struct qpu_reg vpmread = qpu_ra(QPU_R_VPM);
-        struct qpu_reg vpm = qpu_ra(QPU_W_VPM);
+        struct qcompile *c = trans->c;
 
-        gen[count++] = qpu_load_imm_ui(qpu_vrsetup(), 0x00401a00);
-        gen[count++] = qpu_load_imm_ui(qpu_vwsetup(), 0x00001a00);
+        /* VPM-writes outputs[2] unmodified.  XXX: rescale */
+        qir_VPM_WRITE(c, trans->outputs[2]);
+}
 
-#if 1
-        gen[count++] = qpu_inst(qpu_a_MOV(x, vpmread), qpu_m_NOP());
-        gen[count++] = qpu_inst(qpu_a_MOV(y, vpmread), qpu_m_NOP());
-        gen[count++] = qpu_inst(qpu_a_MOV(z, vpmread), qpu_m_NOP());
-        gen[count++] = qpu_inst(qpu_a_MOV(w, vpmread), qpu_m_NOP());
-
-
-        gen[count++] = qpu_inst(qpu_a_NOP(), qpu_m_FMUL(xs, x,
-                                                        qpu_rb(QPU_R_UNIF)));
-        gen[count++] = qpu_inst(qpu_a_NOP(), qpu_m_FMUL(ys, y,
-                                                        qpu_rb(QPU_R_UNIF)));
-
-        cur_inst = qpu_inst(qpu_a_FTOI(xy, xs), qpu_m_NOP());
-        cur_inst |= QPU_SET_FIELD(QPU_PACK_A_16A, QPU_PACK);
-        gen[count++] = cur_inst;
-        cur_inst = qpu_inst(qpu_a_FTOI(xy, ys), qpu_m_NOP());
-        cur_inst |= QPU_SET_FIELD(QPU_PACK_A_16B, QPU_PACK);
-        gen[count++] = cur_inst;
-
-#else
-
-        struct qpu_reg t = qpu_ra(20);
-        struct qpu_reg hundred = qpu_rb(21);
-        gen[count++] = qpu_inst(qpu_a_NOP(),
-                                qpu_m_MUL24(t,
-                                            qpu_ra(QPU_R_ELEM_QPU),
-                                            qpu_ra(QPU_R_ELEM_QPU)));
-        gen[count++] = qpu_inst(qpu_a_NOP(), qpu_m_NOP());
-
-        gen[count++] = qpu_load_imm_ui(hundred, 400);
-        gen[count++] = qpu_inst(qpu_a_NOP(), qpu_m_NOP());
-
-        struct qpu_reg xm = qpu_ra(22), ym = qpu_ra(23);
-        gen[count++] = qpu_inst(qpu_a_NOP(),
-                                qpu_m_MUL24(xm, hundred, qpu_ra(QPU_R_ELEM_QPU)));
-        gen[count++] = qpu_inst(qpu_a_NOP(), qpu_m_NOP());
-
-        gen[count++] = qpu_inst(qpu_a_NOP(),
-                                qpu_m_MUL24(ym, hundred, t));
-        gen[count++] = qpu_inst(qpu_a_NOP(), qpu_m_NOP());
-
-        cur_inst = qpu_inst(qpu_a_MOV(xy, xm), qpu_m_NOP());
-        cur_inst |= QPU_SET_FIELD(QPU_PACK_A_16A, QPU_PACK);
-        gen[count++] = cur_inst;
-        cur_inst = qpu_inst(qpu_a_MOV(xy, ym), qpu_m_NOP());
-        cur_inst |= QPU_SET_FIELD(QPU_PACK_A_16B, QPU_PACK);
-        gen[count++] = cur_inst;
-#endif
+/* VPM-writes outputs[3] as-is; the XXX below marks the reciprocal that is
+ * not computed yet. */
+static void
+emit_1_wc_write(struct tgsi_to_qir *trans)
+{
+        /* XXX: RCP */
+        qir_VPM_WRITE(trans->c, trans->outputs[3]);
+}
 
-        gen[count++] = qpu_inst(qpu_a_NOP(), qpu_m_NOP());
+static void
+emit_vert_end(struct tgsi_to_qir *trans, struct vc4_shader_state *so)
+{
+        emit_scaled_viewport_write(trans); /* packed scaled outputs[0..1] */
+        emit_zs_write(trans); /* outputs[2] */
+        emit_1_wc_write(trans); /* outputs[3] */
+        /* XXX: write varyings */
+}
 
-        if (is_vs) {
-                gen[count++] = qpu_inst(qpu_a_MOV(vpm, xy), qpu_m_NOP());
+static void
+emit_coord_end(struct tgsi_to_qir *trans, struct vc4_shader_state *so)
+{
+        struct qcompile *c = trans->c;
 
-                /* XXX */
-                gen[count++] = qpu_inst(qpu_a_MOV(vpm, z), qpu_m_NOP());
-                gen[count++] = qpu_inst(qpu_a_MOV(vpm, w), qpu_m_NOP());
+        for (int i = 0; i < 4; i++) /* raw outputs[0..3] go out first */
+                qir_VPM_WRITE(c, trans->outputs[i]);
 
-        } else {
-                gen[count++] = qpu_inst(qpu_a_MOV(vpm, x), qpu_m_NOP());
-                gen[count++] = qpu_inst(qpu_a_MOV(vpm, y), qpu_m_NOP());
-                gen[count++] = qpu_inst(qpu_a_MOV(vpm, z), qpu_m_NOP());
-                gen[count++] = qpu_inst(qpu_a_MOV(vpm, w), qpu_m_NOP());
-                gen[count++] = qpu_inst(qpu_a_MOV(vpm, xy), qpu_m_NOP());
+        emit_scaled_viewport_write(trans); /* then the same tail as the VS */
+        emit_zs_write(trans);
+        emit_1_wc_write(trans);
+}
 
-                /* XXX */
-                gen[count++] = qpu_inst(qpu_a_MOV(vpm, z), qpu_m_NOP());
-                gen[count++] = qpu_inst(qpu_a_MOV(vpm, w), qpu_m_NOP());
+/* Translates so's TGSI tokens into QIR for `stage` and generates QPU code.
+ * The caller releases the result, its uniform_data/uniform_contents and ->c. */
+static struct tgsi_to_qir *
+vc4_shader_tgsi_to_qir(struct vc4_shader_state *so, enum qstage stage)
+{
+        struct tgsi_to_qir *trans = CALLOC_STRUCT(tgsi_to_qir);
+        struct qcompile *c = qir_compile_init();
+        int ret;
+
+        c->stage = stage;
+        /* XXX sizing */
+        trans->temps = calloc(sizeof(struct qreg), 1024);
+        trans->inputs = calloc(sizeof(struct qreg), 8 * 4);
+        trans->outputs = calloc(sizeof(struct qreg), 1024);
+        trans->uniforms = calloc(sizeof(struct qreg), 1024);
+        trans->consts = calloc(sizeof(struct qreg), 1024);
+        trans->uniform_data = calloc(sizeof(uint32_t), 1024);
+        trans->uniform_contents = calloc(sizeof(enum quniform_contents), 1024);
+
+        trans->c = c;
+        ret = tgsi_parse_init(&trans->parser, so->base.tokens);
+        assert(ret == TGSI_PARSE_OK);
+        fprintf(stderr, "TGSI:\n");
+        tgsi_dump(so->base.tokens, 0);
+
+        switch (stage) {
+        case QSTAGE_FRAG:
+                emit_frag_init(trans, so);
+                break;
+        case QSTAGE_VERT:
+                emit_vert_init(trans, so);
+                break;
+        case QSTAGE_COORD:
+                emit_coord_init(trans, so);
+                break;
         }
 
-        /* PROGRAM END */
-        cur_inst = qpu_inst(qpu_a_NOP(), qpu_m_NOP());
-        cur_inst = (cur_inst & ~QPU_SIG_MASK) | QPU_SET_FIELD(QPU_SIG_PROG_END, QPU_SIG);
-        gen[count++] = cur_inst;
+        while (!tgsi_parse_end_of_tokens(&trans->parser)) {
+                tgsi_parse_token(&trans->parser);
 
-        cur_inst = qpu_inst(qpu_a_NOP(), qpu_m_NOP());
-        gen[count++] = cur_inst;
+                switch (trans->parser.FullToken.Token.Type) {
+                case TGSI_TOKEN_TYPE_INSTRUCTION:
+                        emit_tgsi_instruction(trans,
+                                              &trans->parser.FullToken.FullInstruction);
+                        break;
 
-        cur_inst = qpu_inst(qpu_a_NOP(), qpu_m_NOP());
-        gen[count++] = cur_inst;
+                case TGSI_TOKEN_TYPE_IMMEDIATE:
+                        parse_tgsi_immediate(trans,
+                                             &trans->parser.FullToken.FullImmediate);
+                        break;
+                }
+        }
+
+        switch (stage) {
+        case QSTAGE_FRAG:
+                emit_frag_end(trans, so);
+                break;
+        case QSTAGE_VERT:
+                emit_vert_end(trans, so);
+                break;
+        case QSTAGE_COORD:
+                emit_coord_end(trans, so);
+                break;
+        }
+
+        qir_dump(c);
+        tgsi_parse_free(&trans->parser);
+        free(trans->temps);
+        free(trans->inputs);
+        free(trans->outputs);
+        free(trans->uniforms);
+        free(trans->consts);
 
-        vc4_qpu_validate(gen, count);
+        vc4_generate_code(c);
 
-        return count;
+        return trans;
+}
+
+/* Allocates a shader state holding a duplicate of cso's TGSI tokens. */
+static struct vc4_shader_state *
+vc4_shader_state_create(struct pipe_context *pctx,
+                        const struct pipe_shader_state *cso)
+{
+        struct vc4_shader_state *so = CALLOC_STRUCT(vc4_shader_state);
+
+        if (so)
+                so->base.tokens = tgsi_dup_tokens(cso->tokens);
+
+        return so;
+}
+
+/* Snapshots trans's uniform contents/data arrays into so->uniforms[index]. */
+static void
+copy_uniform_state_to_shader(struct vc4_shader_state *so,
+                             int shader_index,
+                             struct tgsi_to_qir *trans)
+{
+        struct vc4_shader_uniform_info *uinfo = &so->uniforms[shader_index];
+        size_t data_size = trans->num_uniforms * sizeof(*uinfo->data);
+        size_t contents_size = trans->num_uniforms * sizeof(*uinfo->contents);
+
+        uinfo->count = trans->num_uniforms;
+        uinfo->data = malloc(data_size);
+        memcpy(uinfo->data, trans->uniform_data, data_size);
+        uinfo->contents = malloc(contents_size);
+        memcpy(uinfo->contents, trans->uniform_contents, contents_size);
+}
 
 static void *
-vc4_vs_state_create(struct pipe_context *pctx,
+vc4_fs_state_create(struct pipe_context *pctx,
                     const struct pipe_shader_state *cso)
 {
         struct vc4_context *vc4 = vc4_context(pctx);
@@ -289,27 +541,57 @@ vc4_vs_state_create(struct pipe_context *pctx,
         if (!so)
                 return NULL;
 
-        uint64_t gen[100];
-        uint64_t count = 0;
-        uint64_t *vsc = gen;
+        struct tgsi_to_qir *trans = vc4_shader_tgsi_to_qir(so, QSTAGE_FRAG);
+        copy_uniform_state_to_shader(so, 0, trans);
 
-        /* VS */
-        count += gen_vs_cs_code(gen + count, true);
-        fprintf(stderr, "VS:\n");
-        vc4_dump_program(vsc, count);
+        so->bo = vc4_bo_alloc_mem(vc4->screen, trans->c->qpu_insts,
+                                  trans->c->num_qpu_insts * sizeof(uint64_t),
+                                  "fs_code");
 
-        /* CS */
+        qir_compile_destroy(trans->c);
+        free(trans);
 
-        /* XXX alignment? */
-        uint64_t *csc = gen + count;
-        so->coord_shader_offset = count * sizeof(uint64_t);
-        count += gen_vs_cs_code(gen + count, false);
+        return so;
+}
 
-        fprintf(stderr, "CS:\n");
-        vc4_dump_program(csc, count - (csc - gen));
+/**
+ * Compiles the shader tokens twice, once as a vertex shader and once as a
+ * coordinate shader, and packs both programs into one BO: the vertex shader
+ * at offset 0 and the coordinate shader at so->coord_shader_offset.
+ */
+static void *
+vc4_vs_state_create(struct pipe_context *pctx,
+                    const struct pipe_shader_state *cso)
+{
+        struct vc4_context *vc4 = vc4_context(pctx);
+        struct vc4_shader_state *so = vc4_shader_state_create(pctx, cso);
+        if (!so)
+                return NULL;
+
+        struct tgsi_to_qir *vs_trans = vc4_shader_tgsi_to_qir(so, QSTAGE_VERT);
+        copy_uniform_state_to_shader(so, 0, vs_trans);
+
+        struct tgsi_to_qir *cs_trans = vc4_shader_tgsi_to_qir(so, QSTAGE_COORD);
+        copy_uniform_state_to_shader(so, 1, cs_trans);
+
+        uint32_t vs_size = vs_trans->c->num_qpu_insts * sizeof(uint64_t);
+        uint32_t cs_size = cs_trans->c->num_qpu_insts * sizeof(uint64_t);
+        so->coord_shader_offset = vs_size; /* XXX: alignment? */
+        so->bo = vc4_bo_alloc(vc4->screen,
+                              so->coord_shader_offset + cs_size,
+                              "vs_code");
 
-        so->bo = vc4_bo_alloc_mem(vc4->screen, gen, count * sizeof(uint64_t),
-                                  "vs_code");
+        void *map = vc4_bo_map(so->bo);
+        memcpy(map, vs_trans->c->qpu_insts, vs_size);
+        memcpy(map + so->coord_shader_offset, cs_trans->c->qpu_insts, cs_size);
+
+        /* Release the compile results and the translator contexts, as the
+         * fragment shader path does; only the BO and uniform info live on.
+         */
+        qir_compile_destroy(vs_trans->c);
+        qir_compile_destroy(cs_trans->c);
+        free(vs_trans);
+        free(cs_trans);
 
         return so;
 }
@@ -323,6 +595,55 @@ vc4_shader_state_delete(struct pipe_context *pctx, void *hwcso)
         free(so);
 }
 
+/**
+ * Allocates a BO and fills it with the uniform values for one compiled
+ * program of a shader state object.
+ *
+ * Each slot is produced according to its recorded quniform_contents: the
+ * stored constant, a 32-bit word read from constant buffer 0's user buffer
+ * at the stored index, or a viewport scale computed from the framebuffer
+ * size.
+ *
+ * shader_index selects the entry of shader->uniforms[]; the VS create path
+ * stores the vertex program's uniforms at 0 and the coordinate program's
+ * at 1.  The new BO is returned in *out_bo and *out_offset is set to 0.
+ */
+void
+vc4_get_uniform_bo(struct vc4_context *vc4, struct vc4_shader_state *shader,
+                   struct vc4_constbuf_stateobj *cb,
+                   int shader_index, struct vc4_bo **out_bo,
+                   uint32_t *out_offset)
+{
+        struct vc4_shader_uniform_info *uinfo = &shader->uniforms[shader_index];
+        struct vc4_bo *ubo = vc4_bo_alloc(vc4->screen, uinfo->count * 4, "ubo");
+        uint32_t *map = vc4_bo_map(ubo);
+
+        for (int i = 0; i < uinfo->count; i++) {
+                switch (uinfo->contents[i]) {
+                case QUNIFORM_CONSTANT:
+                        map[i] = uinfo->data[i];
+                        break;
+                case QUNIFORM_UNIFORM:
+                        map[i] = ((uint32_t *)cb->cb[0].user_buffer)[uinfo->data[i]];
+                        break;
+                case QUNIFORM_VIEWPORT_X_SCALE:
+                        map[i] = fui(vc4->framebuffer.width * 16.0f / 2.0f);
+                        break;
+                case QUNIFORM_VIEWPORT_Y_SCALE:
+                        map[i] = fui(vc4->framebuffer.height * -16.0f / 2.0f);
+                        break;
+                }
+#if 1
+                /* %p requires a void pointer argument. */
+                fprintf(stderr, "%p/%d: %d: 0x%08x (%f)\n",
+                        (void *)shader, shader_index, i, map[i], uif(map[i]));
+#endif
+        }
+
+        *out_bo = ubo;
+        *out_offset = 0;
+}
+
 static void
 vc4_fp_state_bind(struct pipe_context *pctx, void *hwcso)
 {
diff --git a/src/gallium/drivers/vc4/vc4_qir.c b/src/gallium/drivers/vc4/vc4_qir.c
new file mode 100644
index 00000000000..a4bb6cd1fd1
--- /dev/null
+++ b/src/gallium/drivers/vc4/vc4_qir.c
@@ -0,0 +1,247 @@
+/*
+ * Copyright © 2014 Broadcom
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "util/u_memory.h"
+#include "util/u_simple_list.h"
+
+#include "vc4_qir.h"
+#include "vc4_qpu.h"
+
+/** Per-opcode metadata; qir_op_info[] is indexed by enum qop. */
+struct qir_op_info {
+        const char *name;
+        uint8_t ndst, nsrc;
+};
+
+/*
+ * Ops without an entry here (QOP_UNDEF, QOP_VW_SETUP, QOP_VR_SETUP) keep a
+ * NULL name, which the accessors below treat as an unknown op.
+ *
+ * The operand counts follow how the ops are built and consumed: EXP2 and
+ * LOG2 are emitted through QIR_ALU1 (a single source), and VPM_READ fills a
+ * destination without reading any source register.
+ */
+static const struct qir_op_info qir_op_info[] = {
+        [QOP_MOV] = { "mov", 1, 1 },
+        [QOP_FADD] = { "fadd", 1, 2 },
+        [QOP_FSUB] = { "fsub", 1, 2 },
+        [QOP_FMUL] = { "fmul", 1, 2 },
+        [QOP_FMIN] = { "fmin", 1, 2 },
+        [QOP_FMAX] = { "fmax", 1, 2 },
+        [QOP_FMINABS] = { "fminabs", 1, 2 },
+        [QOP_FMAXABS] = { "fmaxabs", 1, 2 },
+        [QOP_FTOI] = { "ftoi", 1, 1 },
+        [QOP_RCP] = { "rcp", 1, 1 },
+        [QOP_RSQ] = { "rsq", 1, 1 },
+        [QOP_EXP2] = { "exp2", 1, 1 },
+        [QOP_LOG2] = { "log2", 1, 1 },
+        [QOP_PACK_COLORS] = { "pack_colors", 1, 4 },
+        [QOP_PACK_SCALED] = { "pack_scaled", 1, 2 },
+        [QOP_VPM_WRITE] = { "vpm_write", 0, 1 },
+        [QOP_VPM_READ] = { "vpm_read", 1, 0 },
+        [QOP_TLB_COLOR_WRITE] = { "tlb_color", 0, 1 },
+};
+
+/** Returns the printable name of an op, or "???" if the table has none. */
+static const char *
+qir_get_op_name(enum qop qop)
+{
+        if (qop < ARRAY_SIZE(qir_op_info) && qir_op_info[qop].name)
+                return qir_op_info[qop].name;
+        else
+                return "???";
+}
+
+/**
+ * Returns how many entries of qinst::src the op reads.
+ *
+ * Aborts for ops that have no table entry.
+ */
+int
+qir_get_op_nsrc(enum qop qop)
+{
+        if (qop < ARRAY_SIZE(qir_op_info) && qir_op_info[qop].name)
+                return qir_op_info[qop].nsrc;
+        else
+                abort();
+}
+
+/** Prints a register to stderr as "null" or file-letter plus index. */
+static void
+qir_print_reg(struct qreg reg)
+{
+        static const char *const files[] = {
+                [QFILE_TEMP] = "t",
+                [QFILE_VARY] = "v",
+                [QFILE_UNIF] = "u",
+        };
+
+        if (reg.file == QFILE_NULL)
+                fprintf(stderr, "null");
+        else if (reg.file < ARRAY_SIZE(files) && files[reg.file])
+                fprintf(stderr, "%s%u", files[reg.file], reg.index);
+        else /* Corrupt file value: don't index past the name table. */
+                fprintf(stderr, "???%u", reg.index);
+}
+
+/** Prints one instruction to stderr as "op dst, src...", without a newline. */
+void
+qir_dump_inst(struct qinst *inst)
+{
+        fprintf(stderr, "%s ", qir_get_op_name(inst->op));
+
+        qir_print_reg(inst->dst);
+        for (int i = 0; i < qir_get_op_nsrc(inst->op); i++) {
+                fprintf(stderr, ", ");
+                qir_print_reg(inst->src[i]);
+        }
+}
+
+/** Prints every instruction of the program to stderr, one per line. */
+void
+qir_dump(struct qcompile *c)
+{
+        struct simple_node *node;
+
+        foreach(node, &c->instructions) {
+                struct qinst *inst = (struct qinst *)node;
+                qir_dump_inst(inst);
+                fprintf(stderr, "\n");
+        }
+}
+
+/** Hands out the next unused temporary register of the program. */
+struct qreg
+qir_get_temp(struct qcompile *c)
+{
+        struct qreg reg;
+
+        reg.file = QFILE_TEMP;
+        reg.index = c->num_temps++;
+
+        return reg;
+}
+
+/**
+ * Allocates an instruction with room for two sources.
+ *
+ * The instruction is not part of any program until passed to qir_emit().
+ * NOTE: allocation failure is not handled here.
+ */
+struct qinst *
+qir_inst(enum qop op, struct qreg dst, struct qreg src0, struct qreg src1)
+{
+        struct qinst *inst = CALLOC_STRUCT(qinst);
+
+        inst->op = op;
+        inst->dst = dst;
+        inst->src = calloc(2, sizeof(inst->src[0]));
+        inst->src[0] = src0;
+        inst->src[1] = src1;
+
+        return inst;
+}
+
+/** Same as qir_inst(), but allocates and fills four sources. */
+struct qinst *
+qir_inst4(enum qop op, struct qreg dst,
+          struct qreg a,
+          struct qreg b,
+          struct qreg c,
+          struct qreg d)
+{
+        struct qinst *inst = CALLOC_STRUCT(qinst);
+
+        inst->op = op;
+        inst->dst = dst;
+        inst->src = calloc(4, sizeof(*inst->src));
+        inst->src[0] = a;
+        inst->src[1] = b;
+        inst->src[2] = c;
+        inst->src[3] = d;
+
+        return inst;
+}
+
+/** Appends an instruction to the end of the program. */
+void
+qir_emit(struct qcompile *c, struct qinst *inst)
+{
+        insert_at_tail(&c->instructions, &inst->link);
+}
+
+/** Allocates a compile context (CALLOC_STRUCT) with an empty instruction list. */
+struct qcompile *
+qir_compile_init(void)
+{
+        struct qcompile *c = CALLOC_STRUCT(qcompile);
+
+        make_empty_list(&c->instructions);
+
+        return c;
+}
+
+/**
+ * Frees a compile context along with everything it owns: each instruction
+ * still on the list (and its source array) and the generated QPU code.
+ *
+ * Copy anything needed out of c->qpu_insts before calling this.
+ */
+void
+qir_compile_destroy(struct qcompile *c)
+{
+        if (!c)
+                return;
+
+        struct simple_node *node = c->instructions.next;
+        while (node != &c->instructions) {
+                struct simple_node *next = node->next;
+                struct qinst *inst = (struct qinst *)node;
+
+                free(inst->src);
+                free(inst);
+                node = next;
+        }
+
+        free(c->qpu_insts);
+        free(c);
+}
+
+/** Returns the short stage label ("CS", "VS" or "FS"), or "???". */
+const char *
+qir_get_stage_name(enum qstage stage)
+{
+        static const char *names[] = {
+                [QSTAGE_FRAG] = "FS",
+                [QSTAGE_VERT] = "VS",
+                [QSTAGE_COORD] = "CS",
+        };
+
+        if (stage < ARRAY_SIZE(names) && names[stage])
+                return names[stage];
+
+        return "???";
+}
diff --git a/src/gallium/drivers/vc4/vc4_qir.h b/src/gallium/drivers/vc4/vc4_qir.h
new file mode 100644
index 00000000000..ae9e1796b90
--- /dev/null
+++ b/src/gallium/drivers/vc4/vc4_qir.h
@@ -0,0 +1,182 @@
+/*
+ * Copyright © 2014 Broadcom
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifndef VC4_QIR_H
+#define VC4_QIR_H
+
+#include <stdint.h>
+
+#include "util/u_simple_list.h"
+
+enum qfile { /* Which kind of storage a struct qreg names. */
+        QFILE_NULL, /* No register; qir_print_reg() shows it as "null". */
+        QFILE_TEMP, /* Temporary from qir_get_temp(); printed as "t<index>". */
+        QFILE_VARY, /* Printed as "v<index>"; codegen reads it via qpu_vary(). */
+        QFILE_UNIF, /* Printed as "u<index>"; codegen reads it via qpu_unif(). */
+};
+
+struct qreg { /* One QIR operand: a register file plus an index in it. */
+        enum qfile file;
+        uint32_t index; /* For QFILE_TEMP this is the number from qir_get_temp(). */
+};
+
+enum qop { /* QIR opcodes, described by qir_op_info[] in vc4_qir.c. */
+        QOP_UNDEF, /* Value 0; has no qir_op_info[] entry. */
+        QOP_MOV,
+        QOP_FADD,
+        QOP_FSUB,
+        QOP_FMUL,
+        QOP_FMIN,
+        QOP_FMAX,
+        QOP_FMINABS,
+        QOP_FMAXABS,
+        QOP_FTOI,
+        QOP_RCP,
+        QOP_RSQ,
+        QOP_EXP2,
+        QOP_LOG2,
+        QOP_VW_SETUP, /* No qir_op_info[] entry: qir_get_op_nsrc() aborts. */
+        QOP_VR_SETUP, /* No qir_op_info[] entry: qir_get_op_nsrc() aborts. */
+        QOP_PACK_SCALED,
+        QOP_PACK_COLORS, /* Listed with four sources: build with qir_inst4(). */
+        QOP_VPM_WRITE,
+        QOP_VPM_READ,
+        QOP_TLB_COLOR_WRITE,
+};
+
+struct simple_node { /* List link: embedded in qinst, list head in qcompile. */
+        struct simple_node *next;
+        struct simple_node *prev;
+};
+
+struct qinst { /* One QIR instruction. */
+        struct simple_node link; /* Kept first: list nodes are cast to qinst. */
+
+        enum qop op;
+        struct qreg dst;
+        struct qreg *src; /* calloc'd: 2 entries (qir_inst) or 4 (qir_inst4). */
+};
+
+enum qstage { /* Stage a qcompile targets; see qir_get_stage_name(). */
+        /**
+         * Coordinate shader, runs during binning, before the VS, and just
+         * outputs position.
+         */
+        QSTAGE_COORD,
+        QSTAGE_VERT, /* Named "VS" by qir_get_stage_name(). */
+        QSTAGE_FRAG, /* Named "FS" by qir_get_stage_name(). */
+};
+
+enum quniform_contents { /* How vc4_get_uniform_bo() fills a uniform slot. */
+        /**
+         * Indicates that a constant 32-bit value is copied from the program's
+         * uniform contents.
+         */
+        QUNIFORM_CONSTANT,
+        /**
+         * Indicates that the program's uniform contents are used as an index
+         * into the GL uniform storage.
+         */
+        QUNIFORM_UNIFORM,
+
+        /** @{
+         * Scaling factors from clip coordinates to coordinates relative to the
+         * viewport center.
+         *
+         * This is used by the coordinate and vertex shaders to produce the
+         * 32-bit entry consisting of 2 16-bit fields with 12.4 signed fixed
+         * point offsets from the viewport center.
+         */
+        QUNIFORM_VIEWPORT_X_SCALE, /* Filled with framebuffer.width * 16 / 2. */
+        QUNIFORM_VIEWPORT_Y_SCALE, /* Filled with framebuffer.height * -16 / 2. */
+        /** @} */
+};
+
+struct qcompile { /* One shader compile: the QIR program and its QPU code. */
+        struct qreg undef; /* Filler operand passed by the qir_* emit helpers. */
+        enum qstage stage;
+        uint32_t num_temps; /* Temporaries handed out so far by qir_get_temp(). */
+        struct simple_node instructions; /* List head; qir_emit() appends to it. */
+        uint32_t immediates[1024];
+
+        uint64_t *qpu_insts; /* Set by vc4_generate_code(). */
+        uint32_t num_qpu_insts; /* Number of entries in qpu_insts. */
+};
+
+struct qcompile *qir_compile_init(void);
+void qir_compile_destroy(struct qcompile *c);
+struct qinst *qir_inst(enum qop op, struct qreg dst,
+                       struct qreg src0, struct qreg src1);
+struct qinst *qir_inst4(enum qop op, struct qreg dst,
+                        struct qreg a,
+                        struct qreg b,
+                        struct qreg c,
+                        struct qreg d);
+void qir_emit(struct qcompile *c, struct qinst *inst);
+struct qreg qir_get_temp(struct qcompile *c);
+int qir_get_op_nsrc(enum qop qop);
+
+void qir_dump(struct qcompile *c);
+void qir_dump_inst(struct qinst *inst);
+const char *qir_get_stage_name(enum qstage stage);
+
+#define QIR_ALU1(name) /* qir_<name>(c, a): emit the op into a new temp */ \
+static inline struct qreg                                                \
+qir_##name(struct qcompile *c, struct qreg a)                            \
+{                                                                        \
+        struct qreg t = qir_get_temp(c);                                 \
+        qir_emit(c, qir_inst(QOP_##name, t, a, c->undef));               \
+        return t;                                                        \
+}
+
+#define QIR_ALU2(name) /* qir_<name>(c, a, b): emit the op into a new temp */ \
+static inline struct qreg                                                \
+qir_##name(struct qcompile *c, struct qreg a, struct qreg b)             \
+{                                                                        \
+        struct qreg t = qir_get_temp(c);                                 \
+        qir_emit(c, qir_inst(QOP_##name, t, a, b));                      \
+        return t;                                                        \
+}
+
+QIR_ALU1(MOV)
+QIR_ALU2(FADD)
+QIR_ALU2(FSUB)
+QIR_ALU2(FMUL)
+QIR_ALU2(FMIN)
+QIR_ALU2(FMAX)
+QIR_ALU2(FMINABS)
+QIR_ALU2(FMAXABS)
+QIR_ALU1(FTOI)
+QIR_ALU1(RCP)
+QIR_ALU1(RSQ)
+QIR_ALU1(EXP2)
+QIR_ALU1(LOG2)
+QIR_ALU2(PACK_SCALED)
+
+static inline void /* Emits QOP_VPM_WRITE with 'a' as its only real operand. */
+qir_VPM_WRITE(struct qcompile *c, struct qreg a)
+{
+        qir_emit(c, qir_inst(QOP_VPM_WRITE, c->undef, a, c->undef)); /* no dst */
+}
+
+#endif /* VC4_QIR_H */
diff --git a/src/gallium/drivers/vc4/vc4_qpu.c b/src/gallium/drivers/vc4/vc4_qpu.c
index 18863f7eac1..de07f72bdd6 100644
--- a/src/gallium/drivers/vc4/vc4_qpu.c
+++ b/src/gallium/drivers/vc4/vc4_qpu.c
@@ -208,3 +208,19 @@ qpu_inst(uint64_t add, uint64_t mul)
 
         return merge;
 }
+
+/**
+ * Returns a copy of inst with its signal field set to sig.
+ *
+ * The instruction must not already carry a signal other than QPU_SIG_NONE.
+ */
+uint64_t
+qpu_set_sig(uint64_t inst, uint32_t sig)
+{
+        uint64_t sig_bits = QPU_SET_FIELD(sig, QPU_SIG);
+        uint64_t other_bits = inst & ~QPU_SIG_MASK;
+
+        assert(QPU_GET_FIELD(inst, QPU_SIG) == QPU_SIG_NONE);
+        return other_bits | sig_bits;
+}
+
diff --git a/src/gallium/drivers/vc4/vc4_qpu.h b/src/gallium/drivers/vc4/vc4_qpu.h
index 00aebf0a706..45aac0e135b 100644
--- a/src/gallium/drivers/vc4/vc4_qpu.h
+++ b/src/gallium/drivers/vc4/vc4_qpu.h
@@ -130,6 +130,7 @@ uint64_t qpu_m_alu2(enum qpu_op_mul op, struct qpu_reg dst,
                     struct qpu_reg src0, struct qpu_reg src1);
 uint64_t qpu_inst(uint64_t add, uint64_t mul);
 uint64_t qpu_load_imm_ui(struct qpu_reg dst, uint32_t val);
+uint64_t qpu_set_sig(uint64_t inst, uint32_t sig);
 
 static inline uint64_t
 qpu_load_imm_f(struct qpu_reg dst, float val)
@@ -163,8 +164,8 @@ A_ALU2(FADD)
 A_ALU2(FSUB)
 A_ALU2(FMIN)
 A_ALU2(FMAX)
-A_ALU2(MINABS)
-A_ALU2(MAXABS)
+A_ALU2(FMINABS)
+A_ALU2(FMAXABS)
 A_ALU1(FTOI)
 A_ALU1(ITOF)
 A_ALU2(ADD)
diff --git a/src/gallium/drivers/vc4/vc4_qpu_defines.h b/src/gallium/drivers/vc4/vc4_qpu_defines.h
index d066f278ab3..13c940c0f8e 100644
--- a/src/gallium/drivers/vc4/vc4_qpu_defines.h
+++ b/src/gallium/drivers/vc4/vc4_qpu_defines.h
@@ -34,8 +34,8 @@ enum qpu_op_add {
         QPU_A_FSUB,
         QPU_A_FMIN,
         QPU_A_FMAX,
-        QPU_A_MINABS,
-        QPU_A_MAXABS,
+        QPU_A_FMINABS,
+        QPU_A_FMAXABS,
         QPU_A_FTOI,
         QPU_A_ITOF,
         QPU_A_ADD = 12,
diff --git a/src/gallium/drivers/vc4/vc4_qpu_disasm.c b/src/gallium/drivers/vc4/vc4_qpu_disasm.c
index cf90cb2e768..0aea2970f68 100644
--- a/src/gallium/drivers/vc4/vc4_qpu_disasm.c
+++ b/src/gallium/drivers/vc4/vc4_qpu_disasm.c
@@ -33,8 +33,8 @@ static const char *qpu_add_opcodes[] = {
         [QPU_A_FSUB] = "fsub",
         [QPU_A_FMIN] = "fmin",
         [QPU_A_FMAX] = "fmax",
-        [QPU_A_MINABS] = "minabs",
-        [QPU_A_MAXABS] = "maxabs",
+        [QPU_A_FMINABS] = "fminabs",
+        [QPU_A_FMAXABS] = "fmaxabs",
         [QPU_A_FTOI] = "ftoi",
         [QPU_A_ITOF] = "itof",
         [QPU_A_ADD] = "add",
diff --git a/src/gallium/drivers/vc4/vc4_qpu_emit.c b/src/gallium/drivers/vc4/vc4_qpu_emit.c
new file mode 100644
index 00000000000..0f6f2c171a4
--- /dev/null
+++ b/src/gallium/drivers/vc4/vc4_qpu_emit.c
@@ -0,0 +1,309 @@
+/*
+ * Copyright © 2014 Broadcom
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include <stdio.h>
+#include <inttypes.h>
+
+#include "vc4_context.h"
+#include "vc4_qir.h"
+#include "vc4_qpu.h"
+
+/** Prints the stage name, then each QPU instruction as hex plus disassembly. */
+static void
+vc4_dump_program(struct qcompile *c)
+{
+        fprintf(stderr, "%s:\n", qir_get_stage_name(c->stage));
+
+        for (int i = 0; i < c->num_qpu_insts; i++) {
+                fprintf(stderr, "0x%016"PRIx64" ", c->qpu_insts[i]);
+                vc4_qpu_disasm(&c->qpu_insts[i], 1);
+                fprintf(stderr, "\n");
+        }
+}
+
+/**
+ * Translates the QIR program in @c into QPU instructions.
+ *
+ * Temporaries are mapped onto qpu_rn(0..3), qpu_ra(0..31) and qpu_rb(0..31)
+ * with a first-free scan; a register is released once every read and write
+ * counted for its temporary has been emitted.  The result is stored in
+ * c->qpu_insts / c->num_qpu_insts, dumped to stderr and passed to
+ * vc4_qpu_validate().
+ */
+void
+vc4_generate_code(struct qcompile *c)
+{
+        uint64_t *insts = malloc(sizeof(uint64_t) * 1024); /* XXX: sizing */
+        uint32_t ni = 0;
+        struct qpu_reg allocate_to_qpu_reg[4 + 32 + 32];
+        bool reg_in_use[ARRAY_SIZE(allocate_to_qpu_reg)];
+        int *reg_allocated = calloc(c->num_temps, sizeof(*reg_allocated));
+        int *reg_uses_remaining =
+                calloc(c->num_temps, sizeof(*reg_uses_remaining));
+
+        for (int i = 0; i < ARRAY_SIZE(reg_in_use); i++)
+                reg_in_use[i] = false;
+        for (int i = 0; i < c->num_temps; i++)
+                reg_allocated[i] = -1;
+        for (int i = 0; i < 4; i++)
+                allocate_to_qpu_reg[i] = qpu_rn(i);
+        for (int i = 0; i < 32; i++)
+                allocate_to_qpu_reg[i + 4] = qpu_ra(i);
+        for (int i = 0; i < 32; i++)
+                allocate_to_qpu_reg[i + 4 + 32] = qpu_rb(i);
+
+        /* Count every appearance (write or read) of each temporary. */
+        struct simple_node *node;
+        foreach(node, &c->instructions) {
+                struct qinst *qinst = (struct qinst *)node;
+
+                if (qinst->dst.file == QFILE_TEMP)
+                        reg_uses_remaining[qinst->dst.index]++;
+                for (int i = 0; i < qir_get_op_nsrc(qinst->op); i++) {
+                        if (qinst->src[i].file == QFILE_TEMP)
+                                reg_uses_remaining[qinst->src[i].index]++;
+                }
+        }
+
+        switch (c->stage) {
+        case QSTAGE_VERT:
+        case QSTAGE_COORD:
+                insts[ni++] = qpu_load_imm_ui(qpu_vrsetup(), 0x00401a00);
+                insts[ni++] = qpu_load_imm_ui(qpu_vwsetup(), 0x00001a00);
+                break;
+        case QSTAGE_FRAG:
+                break;
+        }
+
+        foreach(node, &c->instructions) {
+                struct qinst *qinst = (struct qinst *)node;
+
+#if 0
+                fprintf(stderr, "translating qinst to qpu: ");
+                qir_dump_inst(qinst);
+                fprintf(stderr, "\n");
+#endif
+
+                static const struct {
+                        uint32_t op;
+                        bool is_mul;
+                } translate[] = {
+#define A(name) [QOP_##name] = {QPU_A_##name, false}
+#define M(name) [QOP_##name] = {QPU_M_##name, true}
+                        A(FADD),
+                        A(FSUB),
+                        A(FMIN),
+                        A(FMAX),
+                        A(FMINABS),
+                        A(FMAXABS),
+                        A(FTOI),
+
+                        M(FMUL),
+                };
+
+                struct qpu_reg src[4];
+                for (int i = 0; i < qir_get_op_nsrc(qinst->op); i++) {
+                        int index = qinst->src[i].index;
+                        switch (qinst->src[i].file) {
+                        case QFILE_NULL:
+                                src[i] = qpu_rn(0);
+                                break;
+                        case QFILE_TEMP:
+                                assert(reg_allocated[index] != -1);
+                                src[i] = allocate_to_qpu_reg[reg_allocated[index]];
+                                reg_uses_remaining[index]--;
+                                if (reg_uses_remaining[index] == 0)
+                                        reg_in_use[reg_allocated[index]] = false;
+                                break;
+                        case QFILE_UNIF:
+                                src[i] = qpu_unif();
+                                break;
+                        case QFILE_VARY:
+                                src[i] = qpu_vary();
+                                break;
+                        }
+                }
+
+                struct qpu_reg dst;
+                switch (qinst->dst.file) {
+                case QFILE_NULL:
+                        dst = qpu_ra(QPU_W_NOP);
+                        break;
+
+                case QFILE_TEMP:
+                        if (reg_allocated[qinst->dst.index] == -1) {
+                                int alloc;
+                                for (alloc = 0;
+                                     alloc < ARRAY_SIZE(reg_in_use);
+                                     alloc++) {
+                                        /* The pack flags require an A-file register. */
+                                        if (qinst->op == QOP_PACK_SCALED &&
+                                            allocate_to_qpu_reg[alloc].mux != QPU_MUX_A) {
+                                                continue;
+                                        }
+
+                                        if (!reg_in_use[alloc])
+                                                break;
+                                }
+                                assert(alloc != ARRAY_SIZE(reg_in_use) && "need better reg alloc");
+                                reg_in_use[alloc] = true;
+                                reg_allocated[qinst->dst.index] = alloc;
+                        }
+
+                        dst = allocate_to_qpu_reg[reg_allocated[qinst->dst.index]];
+
+                        reg_uses_remaining[qinst->dst.index]--;
+                        if (reg_uses_remaining[qinst->dst.index] == 0) {
+                                reg_in_use[reg_allocated[qinst->dst.index]] =
+                                        false;
+                        }
+                        break;
+
+                case QFILE_VARY:
+                case QFILE_UNIF:
+                        assert(!"not reached");
+                        break;
+                }
+
+                switch (qinst->op) {
+                case QOP_MOV:
+                        /* Skip emitting the MOV if it's a no-op. */
+                        if (dst.mux == QPU_MUX_A || dst.mux == QPU_MUX_B ||
+                            dst.mux != src[0].mux || dst.addr != src[0].addr) {
+                                insts[ni++] = qpu_inst(qpu_a_MOV(dst, src[0]),
+                                                       qpu_m_NOP());
+                        }
+                        break;
+
+                case QOP_VPM_WRITE:
+                        insts[ni++] = qpu_inst(qpu_a_MOV(qpu_ra(QPU_W_VPM),
+                                                         src[0]),
+                                               qpu_m_NOP());
+                        break;
+
+                case QOP_VPM_READ:
+                        insts[ni++] = qpu_inst(qpu_a_MOV(dst,
+                                                         qpu_ra(QPU_R_VPM)),
+                                               qpu_m_NOP());
+                        break;
+
+                case QOP_PACK_COLORS:
+                        for (int i = 0; i < 4; i++) {
+                                insts[ni++] = qpu_inst(qpu_a_NOP(),
+                                                       qpu_m_MOV(qpu_r5(), src[i]));
+                                insts[ni - 1] |= QPU_PM;
+                                insts[ni - 1] |= QPU_SET_FIELD(QPU_PACK_MUL_8A + i,
+                                                               QPU_PACK);
+                        }
+
+                        insts[ni++] = qpu_inst(qpu_a_MOV(dst, qpu_r5()),
+                                               qpu_m_NOP());
+
+                        break;
+
+                case QOP_TLB_COLOR_WRITE:
+                        insts[ni++] = qpu_inst(qpu_a_MOV(qpu_tlbc(),
+                                                         src[0]),
+                                               qpu_m_NOP());
+                        break;
+
+                case QOP_PACK_SCALED:
+                        insts[ni++] = qpu_inst(qpu_a_MOV(dst, src[0]),
+                                               qpu_m_NOP());
+                        insts[ni - 1] |= QPU_SET_FIELD(QPU_PACK_A_16A, QPU_PACK);
+
+                        insts[ni++] = qpu_inst(qpu_a_MOV(dst, src[1]),
+                                               qpu_m_NOP());
+                        insts[ni - 1] |= QPU_SET_FIELD(QPU_PACK_A_16B, QPU_PACK);
+
+                        break;
+
+                default:
+                        assert(qinst->op < ARRAY_SIZE(translate));
+                        assert(translate[qinst->op].op != 0); /* NOPs */
+
+                        /* If we have only one source, put it in the second
+                         * argument slot as well so that we don't take up
+                         * another raddr just to get unused data.
+                         */
+                        if (qir_get_op_nsrc(qinst->op) == 1)
+                                src[1] = src[0];
+
+                        if ((src[0].mux == QPU_MUX_A || src[0].mux == QPU_MUX_B) &&
+                            (src[1].mux == QPU_MUX_A || src[1].mux == QPU_MUX_B) &&
+                            src[0].addr != src[1].addr) {
+                                insts[ni++] = qpu_inst(qpu_a_MOV(qpu_r5(), src[1]),
+                                                       qpu_m_NOP());
+                                src[1] = qpu_r5();
+                        }
+
+                        if (translate[qinst->op].is_mul) {
+                                insts[ni++] = qpu_inst(qpu_a_NOP(),
+                                                       qpu_m_alu2(translate[qinst->op].op,
+                                                                  dst, src[0], src[1]));
+                        } else {
+                                insts[ni++] = qpu_inst(qpu_a_alu2(translate[qinst->op].op,
+                                                                  dst, src[0], src[1]),
+                                                       qpu_m_NOP());
+                        }
+                        break;
+                }
+
+                /* After writing an A/B register-file entry, pad with a NOP
+                 * before whatever instruction comes next.
+                 */
+                if ((dst.mux == QPU_MUX_A || dst.mux == QPU_MUX_B) &&
+                    dst.addr < 32)
+                        insts[ni++] = qpu_inst(qpu_a_NOP(), qpu_m_NOP());
+        }
+
+        /* thread end can't have VPM write */
+        if (QPU_GET_FIELD(insts[ni - 1], QPU_WADDR_ADD) == QPU_W_VPM ||
+            QPU_GET_FIELD(insts[ni - 1], QPU_WADDR_MUL) == QPU_W_VPM)
+                insts[ni++] = qpu_inst(qpu_a_NOP(), qpu_m_NOP());
+
+        insts[ni - 1] = qpu_set_sig(insts[ni - 1], QPU_SIG_PROG_END);
+        insts[ni++] = qpu_inst(qpu_a_NOP(), qpu_m_NOP());
+        insts[ni++] = qpu_inst(qpu_a_NOP(), qpu_m_NOP());
+
+        switch (c->stage) {
+        case QSTAGE_VERT:
+        case QSTAGE_COORD:
+                break;
+        case QSTAGE_FRAG:
+                insts[2] = qpu_set_sig(insts[2], QPU_SIG_WAIT_FOR_SCOREBOARD);
+                insts[ni - 1] = qpu_set_sig(insts[ni - 1],
+                                            QPU_SIG_SCOREBOARD_UNLOCK);
+                break;
+        }
+
+        c->qpu_insts = insts;
+        c->num_qpu_insts = ni;
+
+        /* The allocator bookkeeping is only needed during translation. */
+        free(reg_allocated);
+        free(reg_uses_remaining);
+
+        vc4_dump_program(c);
+        vc4_qpu_validate(insts, ni);
+}
diff --git a/src/gallium/drivers/vc4/vc4_state.c b/src/gallium/drivers/vc4/vc4_state.c
index 6ad7eea8f24..ab1e8be6ee3 100644
--- a/src/gallium/drivers/vc4/vc4_state.c
+++ b/src/gallium/drivers/vc4/vc4_state.c
@@ -254,11 +254,10 @@ vc4_set_constant_buffer(struct pipe_context *pctx, uint shader, uint index,
         if (unlikely(!cb)) {
                 so->enabled_mask &= ~(1 << index);
                 so->dirty_mask &= ~(1 << index);
-                pipe_resource_reference(&so->cb[index].buffer, NULL);
                 return;
         }
 
-        pipe_resource_reference(&so->cb[index].buffer, cb->buffer);
+        assert(!cb->buffer);
         so->cb[index].buffer_offset = cb->buffer_offset;
         so->cb[index].buffer_size   = cb->buffer_size;
         so->cb[index].user_buffer   = cb->user_buffer;