14 files changed, 2781 insertions, 37 deletions
diff --git a/src/mesa/drivers/dri/i965/Makefile b/src/mesa/drivers/dri/i965/Makefile
index 44f28cd9d15..45a5350a383 100644
--- a/src/mesa/drivers/dri/i965/Makefile
+++ b/src/mesa/drivers/dri/i965/Makefile
@@ -124,7 +124,10 @@ CXX_SOURCES = \
 	brw_fs_reg_allocate.cpp \
 	brw_fs_schedule_instructions.cpp \
 	brw_fs_vector_splitting.cpp \
-	brw_shader.cpp
+	brw_shader.cpp \
+	brw_vec4_emit.cpp \
+	brw_vec4_reg_allocate.cpp \
+	brw_vec4_visitor.cpp
 
 ASM_SOURCES = 
 
diff --git a/src/mesa/drivers/dri/i965/brw_context.h b/src/mesa/drivers/dri/i965/brw_context.h
index cc11d06874d..7b6b64c1a5c 100644
--- a/src/mesa/drivers/dri/i965/brw_context.h
+++ b/src/mesa/drivers/dri/i965/brw_context.h
@@ -529,7 +529,7 @@ struct brw_context
        * the CURBE, the depth buffer, and a query BO.
        */
       drm_intel_bo *validated_bos[VERT_ATTRIB_MAX + BRW_WM_MAX_SURF + 16];
-      int validated_bo_count;
+      unsigned int validated_bo_count;
    } state;
 
    struct brw_cache cache;
diff --git a/src/mesa/drivers/dri/i965/brw_defines.h b/src/mesa/drivers/dri/i965/brw_defines.h
index da8d016da42..e3823c65d1a 100644
--- a/src/mesa/drivers/dri/i965/brw_defines.h
+++ b/src/mesa/drivers/dri/i965/brw_defines.h
@@ -638,6 +638,8 @@ enum opcode {
    FS_OPCODE_SPILL,
    FS_OPCODE_UNSPILL,
    FS_OPCODE_PULL_CONSTANT_LOAD,
+
+   VS_OPCODE_URB_WRITE,
 };
 
 #define BRW_PREDICATE_NONE             0
diff --git a/src/mesa/drivers/dri/i965/brw_eu.h b/src/mesa/drivers/dri/i965/brw_eu.h
index 72d50eadbce..38dd99b693d 100644
--- a/src/mesa/drivers/dri/i965/brw_eu.h
+++ b/src/mesa/drivers/dri/i965/brw_eu.h
@@ -44,6 +44,9 @@
 #define BRW_SWIZZLE_NOOP      BRW_SWIZZLE4(0,1,2,3)
 #define BRW_SWIZZLE_XYZW      BRW_SWIZZLE4(0,1,2,3)
 #define BRW_SWIZZLE_XXXX      BRW_SWIZZLE4(0,0,0,0)
+#define BRW_SWIZZLE_YYYY      BRW_SWIZZLE4(1,1,1,1)
+#define BRW_SWIZZLE_ZZZZ      BRW_SWIZZLE4(2,2,2,2)
+#define BRW_SWIZZLE_WWWW      BRW_SWIZZLE4(3,3,3,3)
 #define BRW_SWIZZLE_XYXY      BRW_SWIZZLE4(0,1,0,1)
 
 
diff --git a/src/mesa/drivers/dri/i965/brw_fs_emit.cpp b/src/mesa/drivers/dri/i965/brw_fs_emit.cpp
index 285ba46bd46..7367ccaa7e0 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_emit.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_emit.cpp
@@ -146,38 +146,7 @@ void
 fs_visitor::generate_math(fs_inst *inst,
 			  struct brw_reg dst, struct brw_reg *src)
 {
-   int op;
-
-   switch (inst->opcode) {
-   case SHADER_OPCODE_RCP:
-      op = BRW_MATH_FUNCTION_INV;
-      break;
-   case SHADER_OPCODE_RSQ:
-      op = BRW_MATH_FUNCTION_RSQ;
-      break;
-   case SHADER_OPCODE_SQRT:
-      op = BRW_MATH_FUNCTION_SQRT;
-      break;
-   case SHADER_OPCODE_EXP2:
-      op = BRW_MATH_FUNCTION_EXP;
-      break;
-   case SHADER_OPCODE_LOG2:
-      op = BRW_MATH_FUNCTION_LOG;
-      break;
-   case SHADER_OPCODE_POW:
-      op = BRW_MATH_FUNCTION_POW;
-      break;
-   case SHADER_OPCODE_SIN:
-      op = BRW_MATH_FUNCTION_SIN;
-      break;
-   case SHADER_OPCODE_COS:
-      op = BRW_MATH_FUNCTION_COS;
-      break;
-   default:
-      assert(!"not reached: unknown math function");
-      op = 0;
-      break;
-   }
+   int op = brw_math_function(inst->opcode);
 
    if (intel->gen >= 6) {
       assert(inst->mlen == 0);
diff --git a/src/mesa/drivers/dri/i965/brw_shader.cpp b/src/mesa/drivers/dri/i965/brw_shader.cpp
index f4005f80055..2eeeec25cac 100644
--- a/src/mesa/drivers/dri/i965/brw_shader.cpp
+++ b/src/mesa/drivers/dri/i965/brw_shader.cpp
@@ -199,3 +199,29 @@ brw_conditional_for_comparison(unsigned int op)
       return BRW_CONDITIONAL_NZ;
    }
 }
+
+uint32_t
+brw_math_function(enum opcode op)
+{
+   switch (op) {
+   case SHADER_OPCODE_RCP:
+      return BRW_MATH_FUNCTION_INV;
+   case SHADER_OPCODE_RSQ:
+      return BRW_MATH_FUNCTION_RSQ;
+   case SHADER_OPCODE_SQRT:
+      return BRW_MATH_FUNCTION_SQRT;
+   case SHADER_OPCODE_EXP2:
+      return BRW_MATH_FUNCTION_EXP;
+   case SHADER_OPCODE_LOG2:
+      return BRW_MATH_FUNCTION_LOG;
+   case SHADER_OPCODE_POW:
+      return BRW_MATH_FUNCTION_POW;
+   case SHADER_OPCODE_SIN:
+      return BRW_MATH_FUNCTION_SIN;
+   case SHADER_OPCODE_COS:
+      return BRW_MATH_FUNCTION_COS;
+   default:
+      assert(!"not reached: unknown math function");
+      return 0;
+   }
+}
diff --git a/src/mesa/drivers/dri/i965/brw_shader.h b/src/mesa/drivers/dri/i965/brw_shader.h
index 21671d1c8d6..1054d7a589e 100644
--- a/src/mesa/drivers/dri/i965/brw_shader.h
+++ b/src/mesa/drivers/dri/i965/brw_shader.h
@@ -22,8 +22,10 @@
  */
 
 #include <stdint.h>
+#include "brw_defines.h"
 
 #pragma once
 
 int brw_type_for_base_type(const struct glsl_type *type);
 uint32_t brw_conditional_for_comparison(unsigned int op);
+uint32_t brw_math_function(enum opcode op);
diff --git a/src/mesa/drivers/dri/i965/brw_vec4.h b/src/mesa/drivers/dri/i965/brw_vec4.h
new file mode 100644
index 00000000000..10168fc1cb0
--- /dev/null
+++ b/src/mesa/drivers/dri/i965/brw_vec4.h
@@ -0,0 +1,434 @@
+/*
+ * Copyright © 2011 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifndef BRW_VEC4_H
+#define BRW_VEC4_H
+
+#include <stdint.h>
+#include "brw_shader.h"
+#include "main/compiler.h"
+#include "program/hash_table.h"
+
+extern "C" {
+#include "brw_vs.h"
+#include "brw_context.h"
+#include "brw_eu.h"
+};
+
+#include "../glsl/ir.h"
+
+namespace brw {
+
+class dst_reg;
+
+/**
+ * Common helper for constructing swizzles.  When only a subset of
+ * channels of a vec4 are used, we don't want to reference the other
+ * channels, as that will tell optimization passes that those other
+ * channels are used.
+ */
+static int
+swizzle_for_size(int size)
+{
+   int size_swizzles[4] = {
+      BRW_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_X),
+      BRW_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y, SWIZZLE_Y, SWIZZLE_Y),
+      BRW_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_Z),
+      BRW_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_W),
+   };
+
+   assert((size >= 1) && (size <= 4));
+   return size_swizzles[size - 1];
+}
+
+enum register_file {
+   ARF = BRW_ARCHITECTURE_REGISTER_FILE,
+   GRF = BRW_GENERAL_REGISTER_FILE,
+   MRF = BRW_MESSAGE_REGISTER_FILE,
+   IMM = BRW_IMMEDIATE_VALUE,
+   HW_REG, /* a struct brw_reg */
+   ATTR,
+   UNIFORM, /* prog_data->params[hw_reg] */
+   BAD_FILE
+};
+
+class reg
+{
+public:
+   /** Register file: ARF, GRF, MRF, IMM. */
+   enum register_file file;
+   /** virtual register number.  0 = fixed hw reg */
+   int reg;
+   /** Offset within the virtual register. */
+   int reg_offset;
+   /** Register type.  BRW_REGISTER_TYPE_* */
+   int type;
+   bool sechalf;
+   struct brw_reg fixed_hw_reg;
+   int smear; /* -1, or a channel of the reg to smear to all channels. */
+
+   /** Value for file == BRW_IMMMEDIATE_FILE */
+   union {
+      int32_t i;
+      uint32_t u;
+      float f;
+   } imm;
+};
+
+class src_reg : public reg
+{
+public:
+   /* Callers of this ralloc-based new need not call delete. It's
+    * easier to just ralloc_free 'ctx' (or any of its ancestors). */
+   static void* operator new(size_t size, void *ctx)
+   {
+      void *node;
+
+      node = ralloc_size(ctx, size);
+      assert(node != NULL);
+
+      return node;
+   }
+
+   void init()
+   {
+      memset(this, 0, sizeof(*this));
+
+      this->file = BAD_FILE;
+   }
+
+   src_reg(register_file file, int reg, const glsl_type *type)
+   {
+      init();
+
+      this->file = file;
+      this->reg = reg;
+      if (type && (type->is_scalar() || type->is_vector() || type->is_matrix()))
+	 this->swizzle = swizzle_for_size(type->vector_elements);
+      else
+	 this->swizzle = SWIZZLE_XYZW;
+   }
+
+   /** Generic unset register constructor. */
+   src_reg()
+   {
+      init();
+   }
+
+   src_reg(float f)
+   {
+      init();
+
+      this->file = IMM;
+      this->type = BRW_REGISTER_TYPE_F;
+      this->imm.f = f;
+   }
+
+   src_reg(uint32_t u)
+   {
+      init();
+
+      this->file = IMM;
+      this->type = BRW_REGISTER_TYPE_UD;
+      this->imm.f = u;
+   }
+
+   src_reg(int32_t i)
+   {
+      init();
+
+      this->file = IMM;
+      this->type = BRW_REGISTER_TYPE_D;
+      this->imm.i = i;
+   }
+
+   src_reg(class vec4_visitor *v, const struct glsl_type *type);
+
+   explicit src_reg(dst_reg reg);
+
+   GLuint swizzle; /**< SWIZZLE_XYZW swizzles from Mesa. */
+   bool negate;
+   bool abs;
+};
+
+class dst_reg : public reg
+{
+public:
+   /* Callers of this ralloc-based new need not call delete. It's
+    * easier to just ralloc_free 'ctx' (or any of its ancestors). */
+   static void* operator new(size_t size, void *ctx)
+   {
+      void *node;
+
+      node = ralloc_size(ctx, size);
+      assert(node != NULL);
+
+      return node;
+   }
+
+   void init()
+   {
+      memset(this, 0, sizeof(*this));
+      this->file = BAD_FILE;
+      this->writemask = WRITEMASK_XYZW;
+   }
+
+   dst_reg()
+   {
+      init();
+   }
+
+   dst_reg(register_file file, int reg)
+   {
+      init();
+
+      this->file = file;
+      this->reg = reg;
+   }
+
+   dst_reg(struct brw_reg reg)
+   {
+      init();
+
+      this->file = HW_REG;
+      this->fixed_hw_reg = reg;
+   }
+
+   dst_reg(class vec4_visitor *v, const struct glsl_type *type);
+
+   explicit dst_reg(src_reg reg);
+
+   int writemask; /**< Bitfield of WRITEMASK_[XYZW] */
+};
+
+class vec4_instruction : public exec_node {
+public:
+   /* Callers of this ralloc-based new need not call delete. It's
+    * easier to just ralloc_free 'ctx' (or any of its ancestors). */
+   static void* operator new(size_t size, void *ctx)
+   {
+      void *node;
+
+      node = rzalloc_size(ctx, size);
+      assert(node != NULL);
+
+      return node;
+   }
+
+   struct brw_reg get_dst(void);
+   struct brw_reg get_src(int i);
+
+   enum opcode opcode; /* BRW_OPCODE_* or FS_OPCODE_* */
+   dst_reg dst;
+   src_reg src[3];
+
+   bool saturate;
+   bool predicate_inverse;
+   uint32_t predicate;
+
+   int conditional_mod; /**< BRW_CONDITIONAL_* */
+
+   int sampler;
+   int target; /**< MRT target. */
+   bool shadow_compare;
+
+   bool eot;
+   bool header_present;
+   int mlen; /**< SEND message length */
+   int base_mrf; /**< First MRF in the SEND message, if mlen is nonzero. */
+
+   uint32_t offset; /* spill/unspill offset */
+   /** @{
+    * Annotation for the generated IR.  One of the two can be set.
+    */
+   ir_instruction *ir;
+   const char *annotation;
+};
+
+class vec4_visitor : public ir_visitor
+{
+public:
+   vec4_visitor(struct brw_vs_compile *c,
+		struct gl_shader_program *prog, struct brw_shader *shader);
+   ~vec4_visitor();
+
+   dst_reg dst_null_f()
+   {
+      return dst_reg(brw_null_reg());
+   }
+
+   dst_reg dst_null_d()
+   {
+      return dst_reg(retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
+   }
+
+   dst_reg dst_null_cmp()
+   {
+      if (intel->gen > 4)
+	 return dst_null_d();
+      else
+	 return dst_null_f();
+   }
+
+   struct brw_context *brw;
+   const struct gl_vertex_program *vp;
+   struct intel_context *intel;
+   struct gl_context *ctx;
+   struct brw_vs_compile *c;
+   struct brw_vs_prog_data *prog_data;
+   struct brw_compile *p;
+   struct brw_shader *shader;
+   struct gl_shader_program *prog;
+   void *mem_ctx;
+   exec_list instructions;
+
+   char *fail_msg;
+   bool failed;
+
+   /**
+    * GLSL IR currently being processed, which is associated with our
+    * driver IR instructions for debugging purposes.
+    */
+   ir_instruction *base_ir;
+   const char *current_annotation;
+
+   int *virtual_grf_sizes;
+   int virtual_grf_count;
+   int virtual_grf_array_size;
+   int first_non_payload_grf;
+
+   dst_reg *variable_storage(ir_variable *var);
+
+   void reladdr_to_temp(ir_instruction *ir, src_reg *reg, int *num_reladdr);
+
+   src_reg src_reg_for_float(float val);
+
+   /**
+    * \name Visit methods
+    *
+    * As typical for the visitor pattern, there must be one \c visit method for
+    * each concrete subclass of \c ir_instruction.  Virtual base classes within
+    * the hierarchy should not have \c visit methods.
+    */
+   /*@{*/
+   virtual void visit(ir_variable *);
+   virtual void visit(ir_loop *);
+   virtual void visit(ir_loop_jump *);
+   virtual void visit(ir_function_signature *);
+   virtual void visit(ir_function *);
+   virtual void visit(ir_expression *);
+   virtual void visit(ir_swizzle *);
+   virtual void visit(ir_dereference_variable  *);
+   virtual void visit(ir_dereference_array *);
+   virtual void visit(ir_dereference_record *);
+   virtual void visit(ir_assignment *);
+   virtual void visit(ir_constant *);
+   virtual void visit(ir_call *);
+   virtual void visit(ir_return *);
+   virtual void visit(ir_discard *);
+   virtual void visit(ir_texture *);
+   virtual void visit(ir_if *);
+   /*@}*/
+
+   src_reg result;
+
+   /* Regs for vertex results.  Generated at ir_variable visiting time
+    * for the ir->location's used.
+    */
+   dst_reg output_reg[VERT_RESULT_MAX];
+
+   struct hash_table *variable_ht;
+
+   bool run(void);
+   void fail(const char *msg, ...);
+
+   int virtual_grf_alloc(int size);
+   int setup_attributes(int payload_reg);
+   void setup_payload();
+   void reg_allocate_trivial();
+   void reg_allocate();
+
+   vec4_instruction *emit(enum opcode opcode);
+
+   vec4_instruction *emit(enum opcode opcode, dst_reg dst, src_reg src0);
+
+   vec4_instruction *emit(enum opcode opcode, dst_reg dst,
+			  src_reg src0, src_reg src1);
+
+   vec4_instruction *emit(enum opcode opcode, dst_reg dst,
+			  src_reg src0, src_reg src1, src_reg src2);
+
+   /** Walks an exec_list of ir_instruction and sends it through this visitor. */
+   void visit_instructions(const exec_list *list);
+
+   void emit_bool_to_cond_code(ir_rvalue *ir);
+   void emit_bool_comparison(unsigned int op, dst_reg dst, src_reg src0, src_reg src1);
+   void emit_if_gen6(ir_if *ir);
+
+   void emit_block_move(ir_assignment *ir);
+
+   /**
+    * Emit the correct dot-product instruction for the type of arguments
+    */
+   void emit_dp(dst_reg dst, src_reg src0, src_reg src1, unsigned elements);
+
+   void emit_scalar(ir_instruction *ir, enum prog_opcode op,
+		    dst_reg dst, src_reg src0);
+
+   void emit_scalar(ir_instruction *ir, enum prog_opcode op,
+		    dst_reg dst, src_reg src0, src_reg src1);
+
+   void emit_scs(ir_instruction *ir, enum prog_opcode op,
+		 dst_reg dst, const src_reg &src);
+
+   void emit_math1_gen6(enum opcode opcode, dst_reg dst, src_reg src);
+   void emit_math1_gen4(enum opcode opcode, dst_reg dst, src_reg src);
+   void emit_math(enum opcode opcode, dst_reg dst, src_reg src);
+   void emit_math2_gen6(enum opcode opcode, dst_reg dst, src_reg src0, src_reg src1);
+   void emit_math2_gen4(enum opcode opcode, dst_reg dst, src_reg src0, src_reg src1);
+   void emit_math(enum opcode opcode, dst_reg dst, src_reg src0, src_reg src1);
+
+   int emit_vue_header_gen6(int header_mrf);
+   int emit_vue_header_gen4(int header_mrf);
+   void emit_urb_writes(void);
+
+   GLboolean try_emit_sat(ir_expression *ir);
+
+   bool process_move_condition(ir_rvalue *ir);
+
+   void generate_code();
+   void generate_vs_instruction(vec4_instruction *inst,
+				struct brw_reg dst,
+				struct brw_reg *src);
+   void generate_math1_gen4(vec4_instruction *inst,
+			    struct brw_reg dst,
+			    struct brw_reg src);
+   void generate_math1_gen6(vec4_instruction *inst,
+			    struct brw_reg dst,
+			    struct brw_reg src);
+   void generate_urb_write(vec4_instruction *inst);
+};
+
+} /* namespace brw */
+
+#endif /* BRW_VEC4_H */
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_emit.cpp b/src/mesa/drivers/dri/i965/brw_vec4_emit.cpp
new file mode 100644
index 00000000000..bdc7a79d83d
--- /dev/null
+++ b/src/mesa/drivers/dri/i965/brw_vec4_emit.cpp
@@ -0,0 +1,568 @@
+/*
+ * Copyright © 2011 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "brw_vec4.h"
+#include "../glsl/ir_print_visitor.h"
+
+extern "C" {
+#include "brw_eu.h"
+};
+
+using namespace brw;
+
+namespace brw {
+
+int
+vec4_visitor::setup_attributes(int payload_reg)
+{
+   int nr_attributes;
+   int attribute_map[VERT_ATTRIB_MAX];
+
+   nr_attributes = 0;
+   for (int i = 0; i < VERT_ATTRIB_MAX; i++) {
+      if (prog_data->inputs_read & BITFIELD64_BIT(i)) {
+	 attribute_map[i] = payload_reg + nr_attributes;
+	 nr_attributes++;
+      }
+   }
+
+   foreach_iter(exec_list_iterator, iter, this->instructions) {
+      vec4_instruction *inst = (vec4_instruction *)iter.get();
+
+      for (int i = 0; i < 3; i++) {
+	 if (inst->src[i].file != ATTR)
+	    continue;
+
+	 inst->src[i].file = HW_REG;
+	 inst->src[i].fixed_hw_reg = brw_vec8_grf(attribute_map[inst->src[i].reg], 0);
+	 inst->src[i].fixed_hw_reg.dw1.bits.swizzle = inst->src[i].swizzle;
+      }
+   }
+
+   /* The BSpec says we always have to read at least one thing from
+    * the VF, and it appears that the hardware wedges otherwise.
+    */
+   if (nr_attributes == 0)
+      nr_attributes = 1;
+
+   prog_data->urb_read_length = (nr_attributes + 1) / 2;
+
+   return nr_attributes;
+}
+
+void
+vec4_visitor::setup_payload(void)
+{
+   int reg = 0;
+
+   /* r0 is always reserved, as it contains the payload with the URB
+    * handles that are passed on to the URB write at the end of the
+    * thread.
+    */
+   reg++;
+
+   /* User clip planes from curbe:
+    */
+   if (c->key.nr_userclip) {
+      if (intel->gen >= 6) {
+	 for (int i = 0; i < c->key.nr_userclip; i++) {
+	    c->userplane[i] = stride(brw_vec4_grf(reg + i / 2,
+						  (i % 2) * 4), 0, 4, 1);
+	 }
+	 reg += ALIGN(c->key.nr_userclip, 2) / 2;
+      } else {
+	 for (int i = 0; i < c->key.nr_userclip; i++) {
+	    c->userplane[i] = stride(brw_vec4_grf(reg + (6 + i) / 2,
+						  (i % 2) * 4), 0, 4, 1);
+	 }
+	 reg += (ALIGN(6 + c->key.nr_userclip, 4) / 4) * 2;
+      }
+   }
+
+   /* FINISHME: push constants */
+   c->prog_data.curb_read_length = reg - 1;
+   c->prog_data.nr_params = 0;
+   /* XXX 0 causes a bug elsewhere... */
+   if (intel->gen < 6 && c->prog_data.nr_params == 0)
+      c->prog_data.nr_params = 4;
+
+   reg += setup_attributes(reg);
+
+   this->first_non_payload_grf = reg;
+}
+
+struct brw_reg
+vec4_instruction::get_dst(void)
+{
+   struct brw_reg brw_reg;
+
+   switch (dst.file) {
+   case GRF:
+      assert(dst.reg_offset == 0);
+      brw_reg = brw_vec8_grf(dst.reg + dst.reg_offset, 0);
+      brw_reg = retype(brw_reg, dst.type);
+      brw_reg.dw1.bits.writemask = dst.writemask;
+      break;
+
+   case HW_REG:
+      brw_reg = dst.fixed_hw_reg;
+      break;
+
+   case BAD_FILE:
+      brw_reg = brw_null_reg();
+      break;
+
+   default:
+      assert(!"not reached");
+      brw_reg = brw_null_reg();
+      break;
+   }
+   return brw_reg;
+}
+
+struct brw_reg
+vec4_instruction::get_src(int i)
+{
+   struct brw_reg brw_reg;
+
+   switch (src[i].file) {
+   case GRF:
+      brw_reg = brw_vec8_grf(src[i].reg + src[i].reg_offset, 0);
+      brw_reg = retype(brw_reg, src[i].type);
+      brw_reg.dw1.bits.swizzle = src[i].swizzle;
+      if (src[i].abs)
+	 brw_reg = brw_abs(brw_reg);
+      if (src[i].negate)
+	 brw_reg = negate(brw_reg);
+      break;
+
+   case IMM:
+      switch (src[i].type) {
+      case BRW_REGISTER_TYPE_F:
+	 brw_reg = brw_imm_f(src[i].imm.f);
+	 break;
+      case BRW_REGISTER_TYPE_D:
+	 brw_reg = brw_imm_d(src[i].imm.i);
+	 break;
+      case BRW_REGISTER_TYPE_UD:
+	 brw_reg = brw_imm_ud(src[i].imm.u);
+	 break;
+      default:
+	 assert(!"not reached");
+	 brw_reg = brw_null_reg();
+	 break;
+      }
+      break;
+
+   case HW_REG:
+      brw_reg = src[i].fixed_hw_reg;
+      break;
+
+   case BAD_FILE:
+      /* Probably unused. */
+      brw_reg = brw_null_reg();
+      break;
+   case ATTR:
+   default:
+      assert(!"not reached");
+      brw_reg = brw_null_reg();
+      break;
+   }
+
+   return brw_reg;
+}
+
+void
+vec4_visitor::generate_math1_gen4(vec4_instruction *inst,
+				  struct brw_reg dst,
+				  struct brw_reg src)
+{
+   brw_math(p,
+	    dst,
+	    brw_math_function(inst->opcode),
+	    BRW_MATH_SATURATE_NONE,
+	    inst->base_mrf,
+	    src,
+	    BRW_MATH_DATA_SCALAR,
+	    BRW_MATH_PRECISION_FULL);
+}
+
+void
+vec4_visitor::generate_math1_gen6(vec4_instruction *inst,
+				  struct brw_reg dst,
+				  struct brw_reg src)
+{
+   brw_math(p,
+	    dst,
+	    brw_math_function(inst->opcode),
+	    BRW_MATH_SATURATE_NONE,
+	    inst->base_mrf,
+	    src,
+	    BRW_MATH_DATA_SCALAR,
+	    BRW_MATH_PRECISION_FULL);
+}
+
+void
+vec4_visitor::generate_urb_write(vec4_instruction *inst)
+{
+   brw_urb_WRITE(p,
+		 brw_null_reg(), /* dest */
+		 inst->base_mrf, /* starting mrf reg nr */
+		 brw_vec8_grf(0, 0), /* src */
+		 false,		/* allocate */
+		 true,		/* used */
+		 inst->mlen,
+		 0,		/* response len */
+		 inst->eot,	/* eot */
+		 inst->eot,	/* writes complete */
+		 inst->offset,	/* urb destination offset */
+		 BRW_URB_SWIZZLE_INTERLEAVE);
+}
+
+void
+vec4_visitor::generate_vs_instruction(vec4_instruction *instruction,
+				      struct brw_reg dst,
+				      struct brw_reg *src)
+{
+   vec4_instruction *inst = (vec4_instruction *)instruction;
+
+   switch (inst->opcode) {
+   case SHADER_OPCODE_RCP:
+   case SHADER_OPCODE_RSQ:
+   case SHADER_OPCODE_SQRT:
+   case SHADER_OPCODE_EXP2:
+   case SHADER_OPCODE_LOG2:
+   case SHADER_OPCODE_SIN:
+   case SHADER_OPCODE_COS:
+      if (intel->gen >= 6) {
+	 generate_math1_gen6(inst, dst, src[0]);
+      } else {
+	 generate_math1_gen4(inst, dst, src[0]);
+      }
+      break;
+
+   case SHADER_OPCODE_POW:
+      assert(!"finishme");
+      break;
+
+   case VS_OPCODE_URB_WRITE:
+      generate_urb_write(inst);
+      break;
+
+   default:
+      if (inst->opcode < (int)ARRAY_SIZE(brw_opcodes)) {
+	 fail("unsupported opcode in `%s' in VS\n",
+	      brw_opcodes[inst->opcode].name);
+      } else {
+	 fail("Unsupported opcode %d in VS", inst->opcode);
+      }
+   }
+}
+
+bool
+vec4_visitor::run()
+{
+   /* Generate FS IR for main().  (the visitor only descends into
+    * functions called "main").
+    */
+   foreach_iter(exec_list_iterator, iter, *shader->ir) {
+      ir_instruction *ir = (ir_instruction *)iter.get();
+      base_ir = ir;
+      ir->accept(this);
+   }
+
+   emit_urb_writes();
+
+   if (failed)
+      return false;
+
+   setup_payload();
+   reg_allocate();
+
+   brw_set_access_mode(p, BRW_ALIGN_16);
+
+   generate_code();
+
+   return !failed;
+}
+
+void
+vec4_visitor::generate_code()
+{
+   int last_native_inst = p->nr_insn;
+   const char *last_annotation_string = NULL;
+   ir_instruction *last_annotation_ir = NULL;
+
+   int loop_stack_array_size = 16;
+   int loop_stack_depth = 0;
+   brw_instruction **loop_stack =
+      rzalloc_array(this->mem_ctx, brw_instruction *, loop_stack_array_size);
+   int *if_depth_in_loop =
+      rzalloc_array(this->mem_ctx, int, loop_stack_array_size);
+
+
+   if (unlikely(INTEL_DEBUG & DEBUG_VS)) {
+      printf("Native code for vertex shader %d:\n", prog->Name);
+   }
+
+   foreach_list(node, &this->instructions) {
+      vec4_instruction *inst = (vec4_instruction *)node;
+      struct brw_reg src[3], dst;
+
+      if (unlikely(INTEL_DEBUG & DEBUG_VS)) {
+	 if (last_annotation_ir != inst->ir) {
+	    last_annotation_ir = inst->ir;
+	    if (last_annotation_ir) {
+	       printf("   ");
+	       last_annotation_ir->print();
+	       printf("\n");
+	    }
+	 }
+	 if (last_annotation_string != inst->annotation) {
+	    last_annotation_string = inst->annotation;
+	    if (last_annotation_string)
+	       printf("   %s\n", last_annotation_string);
+	 }
+      }
+
+      for (unsigned int i = 0; i < 3; i++) {
+	 src[i] = inst->get_src(i);
+      }
+      dst = inst->get_dst();
+
+      brw_set_conditionalmod(p, inst->conditional_mod);
+      brw_set_predicate_control(p, inst->predicate);
+      brw_set_predicate_inverse(p, inst->predicate_inverse);
+      brw_set_saturate(p, inst->saturate);
+
+      switch (inst->opcode) {
+      case BRW_OPCODE_MOV:
+	 brw_MOV(p, dst, src[0]);
+	 break;
+      case BRW_OPCODE_ADD:
+	 brw_ADD(p, dst, src[0], src[1]);
+	 break;
+      case BRW_OPCODE_MUL:
+	 brw_MUL(p, dst, src[0], src[1]);
+	 break;
+
+      case BRW_OPCODE_FRC:
+	 brw_FRC(p, dst, src[0]);
+	 break;
+      case BRW_OPCODE_RNDD:
+	 brw_RNDD(p, dst, src[0]);
+	 break;
+      case BRW_OPCODE_RNDE:
+	 brw_RNDE(p, dst, src[0]);
+	 break;
+      case BRW_OPCODE_RNDZ:
+	 brw_RNDZ(p, dst, src[0]);
+	 break;
+
+      case BRW_OPCODE_AND:
+	 brw_AND(p, dst, src[0], src[1]);
+	 break;
+      case BRW_OPCODE_OR:
+	 brw_OR(p, dst, src[0], src[1]);
+	 break;
+      case BRW_OPCODE_XOR:
+	 brw_XOR(p, dst, src[0], src[1]);
+	 break;
+      case BRW_OPCODE_NOT:
+	 brw_NOT(p, dst, src[0]);
+	 break;
+      case BRW_OPCODE_ASR:
+	 brw_ASR(p, dst, src[0], src[1]);
+	 break;
+      case BRW_OPCODE_SHR:
+	 brw_SHR(p, dst, src[0], src[1]);
+	 break;
+      case BRW_OPCODE_SHL:
+	 brw_SHL(p, dst, src[0], src[1]);
+	 break;
+
+      case BRW_OPCODE_CMP:
+	 brw_CMP(p, dst, inst->conditional_mod, src[0], src[1]);
+	 break;
+      case BRW_OPCODE_SEL:
+	 brw_SEL(p, dst, src[0], src[1]);
+	 break;
+
+      case BRW_OPCODE_IF:
+	 if (inst->src[0].file != BAD_FILE) {
+	    /* The instruction has an embedded compare (only allowed on gen6) */
+	    assert(intel->gen == 6);
+	    gen6_IF(p, inst->conditional_mod, src[0], src[1]);
+	 } else {
+	    brw_IF(p, BRW_EXECUTE_8);
+	 }
+	 if_depth_in_loop[loop_stack_depth]++;
+	 break;
+
+      case BRW_OPCODE_ELSE:
+	 brw_ELSE(p);
+	 break;
+      case BRW_OPCODE_ENDIF:
+	 brw_ENDIF(p);
+	 if_depth_in_loop[loop_stack_depth]--;
+	 break;
+
+      case BRW_OPCODE_DO:
+	 loop_stack[loop_stack_depth++] = brw_DO(p, BRW_EXECUTE_8);
+	 if (loop_stack_array_size <= loop_stack_depth) {
+	    loop_stack_array_size *= 2;
+	    loop_stack = reralloc(this->mem_ctx, loop_stack, brw_instruction *,
+				  loop_stack_array_size);
+	    if_depth_in_loop = reralloc(this->mem_ctx, if_depth_in_loop, int,
+				        loop_stack_array_size);
+	 }
+	 if_depth_in_loop[loop_stack_depth] = 0;
+	 break;
+
+      case BRW_OPCODE_BREAK:
+	 brw_BREAK(p, if_depth_in_loop[loop_stack_depth]);
+	 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
+	 break;
+      case BRW_OPCODE_CONTINUE:
+	 /* FINISHME: We need to write the loop instruction support still. */
+	 if (intel->gen >= 6)
+	    gen6_CONT(p, loop_stack[loop_stack_depth - 1]);
+	 else
+	    brw_CONT(p, if_depth_in_loop[loop_stack_depth]);
+	 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
+	 break;
+
+      case BRW_OPCODE_WHILE: {
+	 struct brw_instruction *inst0, *inst1;
+	 GLuint br = 1;
+
+	 if (intel->gen >= 5)
+	    br = 2;
+
+	 assert(loop_stack_depth > 0);
+	 loop_stack_depth--;
+	 inst0 = inst1 = brw_WHILE(p, loop_stack[loop_stack_depth]);
+	 if (intel->gen < 6) {
+	    /* patch all the BREAK/CONT instructions from last BGNLOOP */
+	    while (inst0 > loop_stack[loop_stack_depth]) {
+	       inst0--;
+	       if (inst0->header.opcode == BRW_OPCODE_BREAK &&
+		   inst0->bits3.if_else.jump_count == 0) {
+		  inst0->bits3.if_else.jump_count = br * (inst1 - inst0 + 1);
+	    }
+	       else if (inst0->header.opcode == BRW_OPCODE_CONTINUE &&
+			inst0->bits3.if_else.jump_count == 0) {
+		  inst0->bits3.if_else.jump_count = br * (inst1 - inst0);
+	       }
+	    }
+	 }
+      }
+	 break;
+
+      default:
+	 generate_vs_instruction(inst, dst, src);
+	 break;
+      }
+
+      if (unlikely(INTEL_DEBUG & DEBUG_VS)) {
+	 for (unsigned int i = last_native_inst; i < p->nr_insn; i++) {
+	    if (0) {
+	       printf("0x%08x 0x%08x 0x%08x 0x%08x ",
+		      ((uint32_t *)&p->store[i])[3],
+		      ((uint32_t *)&p->store[i])[2],
+		      ((uint32_t *)&p->store[i])[1],
+		      ((uint32_t *)&p->store[i])[0]);
+	    }
+	    brw_disasm(stdout, &p->store[i], intel->gen);
+	 }
+      }
+
+      last_native_inst = p->nr_insn;
+   }
+
+   if (unlikely(INTEL_DEBUG & DEBUG_VS)) {
+      printf("\n");
+   }
+
+   ralloc_free(loop_stack);
+   ralloc_free(if_depth_in_loop);
+
+   brw_set_uip_jip(p);
+
+   /* OK, while the INTEL_DEBUG=vs above is very nice for debugging VS
+    * emit issues, it doesn't get the jump distances into the output,
+    * which is often something we want to debug.  So this is here in
+    * case you're doing that.
+    */
+   if (0) {
+      if (unlikely(INTEL_DEBUG & DEBUG_VS)) {
+	 for (unsigned int i = 0; i < p->nr_insn; i++) {
+	    printf("0x%08x 0x%08x 0x%08x 0x%08x ",
+		   ((uint32_t *)&p->store[i])[3],
+		   ((uint32_t *)&p->store[i])[2],
+		   ((uint32_t *)&p->store[i])[1],
+		   ((uint32_t *)&p->store[i])[0]);
+	    brw_disasm(stdout, &p->store[i], intel->gen);
+	 }
+      }
+   }
+}
+
+extern "C" {
+
+bool
+brw_vs_emit(struct brw_vs_compile *c)
+{
+   struct brw_compile *p = &c->func;
+   struct brw_context *brw = p->brw;
+   struct intel_context *intel = &brw->intel;
+   struct gl_context *ctx = &intel->ctx;
+   struct gl_shader_program *prog = ctx->Shader.CurrentVertexProgram;
+
+   if (!prog)
+      return false;
+
+   struct brw_shader *shader =
+     (brw_shader *) prog->_LinkedShaders[MESA_SHADER_VERTEX];
+   if (!shader)
+      return false;
+
+   if (unlikely(INTEL_DEBUG & DEBUG_VS)) {
+      printf("GLSL IR for native vertex shader %d:\n", prog->Name);
+      _mesa_print_ir(shader->ir, NULL);
+      printf("\n\n");
+   }
+
+   vec4_visitor v(c, prog, shader);
+   if (!v.run()) {
+      /* FINISHME: Cleanly fail, test at link time, etc. */
+      assert(!"not reached");
+      return false;
+   }
+
+   return true;
+}
+
+} /* extern "C" */
+
+} /* namespace brw */
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_reg_allocate.cpp b/src/mesa/drivers/dri/i965/brw_vec4_reg_allocate.cpp
new file mode 100644
index 00000000000..e7f6b28a536
--- /dev/null
+++ b/src/mesa/drivers/dri/i965/brw_vec4_reg_allocate.cpp
@@ -0,0 +1,77 @@
+/*
+ * Copyright © 2011 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "brw_vec4.h"
+#include "../glsl/ir_print_visitor.h"
+
+using namespace brw;
+
+namespace brw {
+
+static void
+assign(int *reg_hw_locations, reg *reg)
+{
+   if (reg->file == GRF) {
+      reg->reg = reg_hw_locations[reg->reg];
+   }
+}
+
+void
+vec4_visitor::reg_allocate_trivial()
+{
+   int last_grf = 0;
+   int hw_reg_mapping[this->virtual_grf_count];
+   int i;
+   int next;
+
+   /* Note that compressed instructions require alignment to 2 registers. */
+   hw_reg_mapping[0] = this->first_non_payload_grf;
+   next = hw_reg_mapping[0] + this->virtual_grf_sizes[0];
+   for (i = 1; i < this->virtual_grf_count; i++) {
+      hw_reg_mapping[i] = next;
+      next += this->virtual_grf_sizes[i];
+   }
+   prog_data->total_grf = next;
+
+   foreach_iter(exec_list_iterator, iter, this->instructions) {
+      vec4_instruction *inst = (vec4_instruction *)iter.get();
+
+      assign(hw_reg_mapping, &inst->dst);
+      assign(hw_reg_mapping, &inst->src[0]);
+      assign(hw_reg_mapping, &inst->src[1]);
+      assign(hw_reg_mapping, &inst->src[2]);
+   }
+
+   if (last_grf >= BRW_MAX_GRF) {
+      fail("Ran out of regs on trivial allocator (%d/%d)\n",
+	   last_grf, BRW_MAX_GRF);
+   }
+}
+
+void
+vec4_visitor::reg_allocate()
+{
+   reg_allocate_trivial();
+}
+
+} /* namespace brw */
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp b/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
new file mode 100644
index 00000000000..bba1d810f19
--- /dev/null
+++ b/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
@@ -0,0 +1,1649 @@
+/*
+ * Copyright © 2011 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "brw_vec4.h"
+#include "main/macros.h"
+
+namespace brw {
+
+src_reg::src_reg(dst_reg reg)
+{
+   init();
+
+   this->file = reg.file;
+   this->reg = reg.reg;
+   this->reg_offset = reg.reg_offset;
+   this->type = reg.type;
+
+   int swizzles[4];
+   int next_chan = 0;
+   int last = 0;
+
+   for (int i = 0; i < 4; i++) {
+      if (!(reg.writemask & (1 << i)))
+	 continue;
+
+      swizzles[next_chan++] = last = i;
+   }
+
+   for (; next_chan < 4; next_chan++) {
+      swizzles[next_chan] = last;
+   }
+
+   this->swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
+				swizzles[2], swizzles[3]);
+}
+
+dst_reg::dst_reg(src_reg reg)
+{
+   init();
+
+   this->file = reg.file;
+   this->reg = reg.reg;
+   this->reg_offset = reg.reg_offset;
+   this->type = reg.type;
+   this->writemask = WRITEMASK_XYZW;
+}
+
+vec4_instruction *
+vec4_visitor::emit(enum opcode opcode, dst_reg dst,
+		   src_reg src0, src_reg src1, src_reg src2)
+{
+   vec4_instruction *inst = new(mem_ctx) vec4_instruction();
+
+   inst->opcode = opcode;
+   inst->dst = dst;
+   inst->src[0] = src0;
+   inst->src[1] = src1;
+   inst->src[2] = src2;
+   inst->ir = this->base_ir;
+   inst->annotation = this->current_annotation;
+
+   this->instructions.push_tail(inst);
+
+   return inst;
+}
+
+
+vec4_instruction *
+vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0, src_reg src1)
+{
+   return emit(opcode, dst, src0, src1, src_reg());
+}
+
+vec4_instruction *
+vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0)
+{
+   assert(dst.writemask != 0);
+   return emit(opcode, dst, src0, src_reg(), src_reg());
+}
+
+vec4_instruction *
+vec4_visitor::emit(enum opcode opcode)
+{
+   return emit(opcode, dst_reg(), src_reg(), src_reg(), src_reg());
+}
+
+void
+vec4_visitor::emit_dp(dst_reg dst, src_reg src0, src_reg src1, unsigned elements)
+{
+   static enum opcode dot_opcodes[] = {
+      BRW_OPCODE_DP2, BRW_OPCODE_DP3, BRW_OPCODE_DP4
+   };
+
+   emit(dot_opcodes[elements - 2], dst, src0, src1);
+}
+
+void
+vec4_visitor::emit_math1_gen6(enum opcode opcode, dst_reg dst, src_reg src)
+{
+   /* The gen6 math instruction ignores the source modifiers --
+    * swizzle, abs, negate, and at least some parts of the register
+    * region description.  Move the source to the corresponding slots
+    * of the destination generally work.
+    */
+   src_reg expanded = src_reg(this, glsl_type::float_type);
+   emit(BRW_OPCODE_MOV, dst, src);
+   src = expanded;
+
+   emit(opcode, dst, src);
+}
+
+void
+vec4_visitor::emit_math1_gen4(enum opcode opcode, dst_reg dst, src_reg src)
+{
+   vec4_instruction *inst = emit(opcode, dst, src);
+   inst->base_mrf = 1;
+   inst->mlen = 1;
+}
+
+void
+vec4_visitor::emit_math(opcode opcode, dst_reg dst, src_reg src)
+{
+   switch (opcode) {
+   case SHADER_OPCODE_RCP:
+   case SHADER_OPCODE_RSQ:
+   case SHADER_OPCODE_SQRT:
+   case SHADER_OPCODE_EXP2:
+   case SHADER_OPCODE_LOG2:
+   case SHADER_OPCODE_SIN:
+   case SHADER_OPCODE_COS:
+      break;
+   default:
+      assert(!"not reached: bad math opcode");
+      return;
+   }
+
+   if (intel->gen >= 6) {
+      return emit_math1_gen6(opcode, dst, src);
+   } else {
+      return emit_math1_gen4(opcode, dst, src);
+   }
+}
+
+void
+vec4_visitor::emit_math2_gen6(enum opcode opcode,
+			      dst_reg dst, src_reg src0, src_reg src1)
+{
+   src_reg expanded;
+
+   /* The gen6 math instruction ignores the source modifiers --
+    * swizzle, abs, negate, and at least some parts of the register
+    * region description.  Move the sources to temporaries to make it
+    * generally work.
+    */
+
+   expanded = src_reg(this, glsl_type::vec4_type);
+   emit(BRW_OPCODE_MOV, dst, src0);
+   src0 = expanded;
+
+   expanded = src_reg(this, glsl_type::vec4_type);
+   emit(BRW_OPCODE_MOV, dst, src1);
+   src1 = expanded;
+
+   emit(opcode, dst, src0, src1);
+}
+
+void
+vec4_visitor::emit_math2_gen4(enum opcode opcode,
+			      dst_reg dst, src_reg src0, src_reg src1)
+{
+   vec4_instruction *inst = emit(opcode, dst, src0, src1);
+   inst->base_mrf = 1;
+   inst->mlen = 2;
+}
+
+void
+vec4_visitor::emit_math(enum opcode opcode,
+			dst_reg dst, src_reg src0, src_reg src1)
+{
+   assert(opcode == SHADER_OPCODE_POW);
+
+   if (intel->gen >= 6) {
+      return emit_math2_gen6(opcode, dst, src0, src1);
+   } else {
+      return emit_math2_gen4(opcode, dst, src0, src1);
+   }
+}
+
+void
+vec4_visitor::visit_instructions(const exec_list *list)
+{
+   foreach_iter(exec_list_iterator, iter, *list) {
+      ir_instruction *ir = (ir_instruction *)iter.get();
+
+      base_ir = ir;
+      ir->accept(this);
+   }
+}
+
+
+static int
+type_size(const struct glsl_type *type)
+{
+   unsigned int i;
+   int size;
+
+   switch (type->base_type) {
+   case GLSL_TYPE_UINT:
+   case GLSL_TYPE_INT:
+   case GLSL_TYPE_FLOAT:
+   case GLSL_TYPE_BOOL:
+      if (type->is_matrix()) {
+	 return type->matrix_columns;
+      } else {
+	 /* Regardless of size of vector, it gets a vec4. This is bad
+	  * packing for things like floats, but otherwise arrays become a
+	  * mess.  Hopefully a later pass over the code can pack scalars
+	  * down if appropriate.
+	  */
+	 return 1;
+      }
+   case GLSL_TYPE_ARRAY:
+      assert(type->length > 0);
+      return type_size(type->fields.array) * type->length;
+   case GLSL_TYPE_STRUCT:
+      size = 0;
+      for (i = 0; i < type->length; i++) {
+	 size += type_size(type->fields.structure[i].type);
+      }
+      return size;
+   case GLSL_TYPE_SAMPLER:
+      /* Samplers take up one slot in UNIFORMS[], but they're baked in
+       * at link time.
+       */
+      return 1;
+   default:
+      assert(0);
+      return 0;
+   }
+}
+
+int
+vec4_visitor::virtual_grf_alloc(int size)
+{
+   if (virtual_grf_array_size <= virtual_grf_count) {
+      if (virtual_grf_array_size == 0)
+	 virtual_grf_array_size = 16;
+      else
+	 virtual_grf_array_size *= 2;
+      virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
+				   virtual_grf_array_size);
+   }
+   virtual_grf_sizes[virtual_grf_count] = size;
+   return virtual_grf_count++;
+}
+
+src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
+{
+   init();
+
+   this->file = GRF;
+   this->reg = v->virtual_grf_alloc(type_size(type));
+
+   if (type->is_array() || type->is_record()) {
+      this->swizzle = BRW_SWIZZLE_NOOP;
+   } else {
+      this->swizzle = swizzle_for_size(type->vector_elements);
+   }
+
+   this->type = brw_type_for_base_type(type);
+}
+
+dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
+{
+   init();
+
+   this->file = GRF;
+   this->reg = v->virtual_grf_alloc(type_size(type));
+
+   if (type->is_array() || type->is_record()) {
+      this->writemask = WRITEMASK_XYZW;
+   } else {
+      this->writemask = (1 << type->vector_elements) - 1;
+   }
+
+   this->type = brw_type_for_base_type(type);
+}
+
+dst_reg *
+vec4_visitor::variable_storage(ir_variable *var)
+{
+   return (dst_reg *)hash_table_find(this->variable_ht, var);
+}
+
+void
+vec4_visitor::emit_bool_to_cond_code(ir_rvalue *ir)
+{
+   ir_expression *expr = ir->as_expression();
+
+   if (expr) {
+      src_reg op[2];
+      vec4_instruction *inst;
+
+      assert(expr->get_num_operands() <= 2);
+      for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
+	 assert(expr->operands[i]->type->is_scalar());
+
+	 expr->operands[i]->accept(this);
+	 op[i] = this->result;
+      }
+
+      switch (expr->operation) {
+      case ir_unop_logic_not:
+	 inst = emit(BRW_OPCODE_AND, dst_null_d(), op[0], src_reg(1));
+	 inst->conditional_mod = BRW_CONDITIONAL_Z;
+	 break;
+
+      case ir_binop_logic_xor:
+	 inst = emit(BRW_OPCODE_XOR, dst_null_d(), op[0], op[1]);
+	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
+	 break;
+
+      case ir_binop_logic_or:
+	 inst = emit(BRW_OPCODE_OR, dst_null_d(), op[0], op[1]);
+	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
+	 break;
+
+      case ir_binop_logic_and:
+	 inst = emit(BRW_OPCODE_AND, dst_null_d(), op[0], op[1]);
+	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
+	 break;
+
+      case ir_unop_f2b:
+	 if (intel->gen >= 6) {
+	    inst = emit(BRW_OPCODE_CMP, dst_null_d(), op[0], src_reg(0.0f));
+	 } else {
+	    inst = emit(BRW_OPCODE_MOV, dst_null_f(), op[0]);
+	 }
+	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
+	 break;
+
+      case ir_unop_i2b:
+	 if (intel->gen >= 6) {
+	    inst = emit(BRW_OPCODE_CMP, dst_null_d(), op[0], src_reg(0));
+	 } else {
+	    inst = emit(BRW_OPCODE_MOV, dst_null_d(), op[0]);
+	 }
+	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
+	 break;
+
+      case ir_binop_greater:
+      case ir_binop_gequal:
+      case ir_binop_less:
+      case ir_binop_lequal:
+      case ir_binop_equal:
+      case ir_binop_all_equal:
+      case ir_binop_nequal:
+      case ir_binop_any_nequal:
+	 inst = emit(BRW_OPCODE_CMP, dst_null_cmp(), op[0], op[1]);
+	 inst->conditional_mod =
+	    brw_conditional_for_comparison(expr->operation);
+	 break;
+
+      default:
+	 assert(!"not reached");
+	 break;
+      }
+      return;
+   }
+
+   ir->accept(this);
+
+   if (intel->gen >= 6) {
+      vec4_instruction *inst = emit(BRW_OPCODE_AND, dst_null_d(),
+			       this->result, src_reg(1));
+      inst->conditional_mod = BRW_CONDITIONAL_NZ;
+   } else {
+      vec4_instruction *inst = emit(BRW_OPCODE_MOV, dst_null_d(), this->result);
+      inst->conditional_mod = BRW_CONDITIONAL_NZ;
+   }
+}
+
+/**
+ * Emit a gen6 IF statement with the comparison folded into the IF
+ * instruction.
+ */
+void
+vec4_visitor::emit_if_gen6(ir_if *ir)
+{
+   ir_expression *expr = ir->condition->as_expression();
+
+   if (expr) {
+      src_reg op[2];
+      vec4_instruction *inst;
+      dst_reg temp;
+
+      assert(expr->get_num_operands() <= 2);
+      for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
+	 assert(expr->operands[i]->type->is_scalar());
+
+	 expr->operands[i]->accept(this);
+	 op[i] = this->result;
+      }
+
+      switch (expr->operation) {
+      case ir_unop_logic_not:
+	 inst = emit(BRW_OPCODE_IF, dst_null_d(), op[0], src_reg(0));
+	 inst->conditional_mod = BRW_CONDITIONAL_Z;
+	 return;
+
+      case ir_binop_logic_xor:
+	 inst = emit(BRW_OPCODE_IF, dst_null_d(), op[0], op[1]);
+	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
+	 return;
+
+      case ir_binop_logic_or:
+	 temp = dst_reg(this, glsl_type::bool_type);
+	 emit(BRW_OPCODE_OR, temp, op[0], op[1]);
+	 inst = emit(BRW_OPCODE_IF, dst_null_d(), src_reg(temp), src_reg(0));
+	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
+	 return;
+
+      case ir_binop_logic_and:
+	 temp = dst_reg(this, glsl_type::bool_type);
+	 emit(BRW_OPCODE_AND, temp, op[0], op[1]);
+	 inst = emit(BRW_OPCODE_IF, dst_null_d(), src_reg(temp), src_reg(0));
+	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
+	 return;
+
+      case ir_unop_f2b:
+	 inst = emit(BRW_OPCODE_IF, dst_null_f(), op[0], src_reg(0));
+	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
+	 return;
+
+      case ir_unop_i2b:
+	 inst = emit(BRW_OPCODE_IF, dst_null_d(), op[0], src_reg(0));
+	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
+	 return;
+
+      case ir_binop_greater:
+      case ir_binop_gequal:
+      case ir_binop_less:
+      case ir_binop_lequal:
+      case ir_binop_equal:
+      case ir_binop_all_equal:
+      case ir_binop_nequal:
+      case ir_binop_any_nequal:
+	 inst = emit(BRW_OPCODE_IF, dst_null_d(), op[0], op[1]);
+	 inst->conditional_mod =
+	    brw_conditional_for_comparison(expr->operation);
+	 return;
+      default:
+	 assert(!"not reached");
+	 inst = emit(BRW_OPCODE_IF, dst_null_d(), op[0], src_reg(0));
+	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
+	 return;
+      }
+      return;
+   }
+
+   ir->condition->accept(this);
+
+   vec4_instruction *inst = emit(BRW_OPCODE_IF, dst_null_d(),
+			    this->result, src_reg(0));
+   inst->conditional_mod = BRW_CONDITIONAL_NZ;
+}
+
+void
+vec4_visitor::visit(ir_variable *ir)
+{
+   dst_reg *reg = NULL;
+
+   if (variable_storage(ir))
+      return;
+
+   switch (ir->mode) {
+   case ir_var_in:
+      reg = new(mem_ctx) dst_reg(ATTR, ir->location);
+      reg->type = brw_type_for_base_type(ir->type);
+      hash_table_insert(this->variable_ht, reg, ir);
+      break;
+
+   case ir_var_out:
+      reg = new(mem_ctx) dst_reg(this, ir->type);
+      hash_table_insert(this->variable_ht, reg, ir);
+
+      for (int i = 0; i < type_size(ir->type); i++) {
+	 output_reg[ir->location + i] = *reg;
+	 output_reg[ir->location + i].reg_offset = i;
+      }
+      break;
+
+   case ir_var_temporary:
+      reg = new(mem_ctx) dst_reg(this, ir->type);
+      hash_table_insert(this->variable_ht, reg, ir);
+
+      break;
+
+   case ir_var_uniform:
+      /* FINISHME: uniforms */
+      break;
+   }
+}
+
+void
+vec4_visitor::visit(ir_loop *ir)
+{
+   ir_dereference_variable *counter = NULL;
+
+   /* We don't want debugging output to print the whole body of the
+    * loop as the annotation.
+    */
+   this->base_ir = NULL;
+
+   if (ir->counter != NULL)
+      counter = new(ir) ir_dereference_variable(ir->counter);
+
+   if (ir->from != NULL) {
+      assert(ir->counter != NULL);
+
+      ir_assignment *a = new(ir) ir_assignment(counter, ir->from, NULL);
+
+      a->accept(this);
+      delete a;
+   }
+
+   emit(BRW_OPCODE_DO);
+
+   if (ir->to) {
+      ir_expression *e =
+	 new(ir) ir_expression(ir->cmp, glsl_type::bool_type,
+			       counter, ir->to);
+      ir_if *if_stmt =  new(ir) ir_if(e);
+
+      ir_loop_jump *brk = new(ir) ir_loop_jump(ir_loop_jump::jump_break);
+
+      if_stmt->then_instructions.push_tail(brk);
+
+      if_stmt->accept(this);
+
+      delete if_stmt;
+      delete e;
+      delete brk;
+   }
+
+   visit_instructions(&ir->body_instructions);
+
+   if (ir->increment) {
+      ir_expression *e =
+	 new(ir) ir_expression(ir_binop_add, counter->type,
+			       counter, ir->increment);
+
+      ir_assignment *a = new(ir) ir_assignment(counter, e, NULL);
+
+      a->accept(this);
+      delete a;
+      delete e;
+   }
+
+   emit(BRW_OPCODE_WHILE);
+}
+
+void
+vec4_visitor::visit(ir_loop_jump *ir)
+{
+   switch (ir->mode) {
+   case ir_loop_jump::jump_break:
+      emit(BRW_OPCODE_BREAK);
+      break;
+   case ir_loop_jump::jump_continue:
+      emit(BRW_OPCODE_CONTINUE);
+      break;
+   }
+}
+
+
+void
+vec4_visitor::visit(ir_function_signature *ir)
+{
+   assert(0);
+   (void)ir;
+}
+
+void
+vec4_visitor::visit(ir_function *ir)
+{
+   /* Ignore function bodies other than main() -- we shouldn't see calls to
+    * them since they should all be inlined.
+    */
+   if (strcmp(ir->name, "main") == 0) {
+      const ir_function_signature *sig;
+      exec_list empty;
+
+      sig = ir->matching_signature(&empty);
+
+      assert(sig);
+
+      visit_instructions(&sig->body);
+   }
+}
+
+GLboolean
+vec4_visitor::try_emit_sat(ir_expression *ir)
+{
+   ir_rvalue *sat_src = ir->as_rvalue_to_saturate();
+   if (!sat_src)
+      return false;
+
+   sat_src->accept(this);
+   src_reg src = this->result;
+
+   this->result = src_reg(this, ir->type);
+   vec4_instruction *inst;
+   inst = emit(BRW_OPCODE_MOV, dst_reg(this->result), src);
+   inst->saturate = true;
+
+   return true;
+}
+
+void
+vec4_visitor::emit_bool_comparison(unsigned int op,
+				 dst_reg dst, src_reg src0, src_reg src1)
+{
+   /* original gen4 does destination conversion before comparison. */
+   if (intel->gen < 5)
+      dst.type = src0.type;
+
+   vec4_instruction *inst = emit(BRW_OPCODE_CMP, dst, src0, src1);
+   inst->conditional_mod = brw_conditional_for_comparison(op);
+
+   dst.type = BRW_REGISTER_TYPE_D;
+   emit(BRW_OPCODE_AND, dst, src_reg(dst), src_reg(0x1));
+}
+
+void
+vec4_visitor::visit(ir_expression *ir)
+{
+   unsigned int operand;
+   src_reg op[Elements(ir->operands)];
+   src_reg result_src;
+   dst_reg result_dst;
+   vec4_instruction *inst;
+
+   if (try_emit_sat(ir))
+      return;
+
+   for (operand = 0; operand < ir->get_num_operands(); operand++) {
+      this->result.file = BAD_FILE;
+      ir->operands[operand]->accept(this);
+      if (this->result.file == BAD_FILE) {
+	 printf("Failed to get tree for expression operand:\n");
+	 ir->operands[operand]->print();
+	 exit(1);
+      }
+      op[operand] = this->result;
+
+      /* Matrix expression operands should have been broken down to vector
+       * operations already.
+       */
+      assert(!ir->operands[operand]->type->is_matrix());
+   }
+
+   int vector_elements = ir->operands[0]->type->vector_elements;
+   if (ir->operands[1]) {
+      vector_elements = MAX2(vector_elements,
+			     ir->operands[1]->type->vector_elements);
+   }
+
+   this->result.file = BAD_FILE;
+
+   /* Storage for our result.  Ideally for an assignment we'd be using
+    * the actual storage for the result here, instead.
+    */
+   result_src = src_reg(this, ir->type);
+   /* convenience for the emit functions below. */
+   result_dst = dst_reg(result_src);
+   /* If nothing special happens, this is the result. */
+   this->result = result_src;
+   /* Limit writes to the channels that will be used by result_src later.
+    * This does limit this temp's use as a temporary for multi-instruction
+    * sequences.
+    */
+   result_dst.writemask = (1 << ir->type->vector_elements) - 1;
+
+   switch (ir->operation) {
+   case ir_unop_logic_not:
+      /* Note that BRW_OPCODE_NOT is not appropriate here, since it is
+       * ones complement of the whole register, not just bit 0.
+       */
+      emit(BRW_OPCODE_XOR, result_dst, op[0], src_reg(1));
+      break;
+   case ir_unop_neg:
+      op[0].negate = !op[0].negate;
+      this->result = op[0];
+      break;
+   case ir_unop_abs:
+      op[0].abs = true;
+      op[0].negate = false;
+      this->result = op[0];
+      break;
+
+   case ir_unop_sign:
+      emit(BRW_OPCODE_MOV, result_dst, src_reg(0.0f));
+
+      inst = emit(BRW_OPCODE_CMP, dst_null_f(), op[0], src_reg(0.0f));
+      inst->conditional_mod = BRW_CONDITIONAL_G;
+      inst = emit(BRW_OPCODE_MOV, result_dst, src_reg(1.0f));
+      inst->predicate = BRW_PREDICATE_NORMAL;
+
+      inst = emit(BRW_OPCODE_CMP, dst_null_f(), op[0], src_reg(0.0f));
+      inst->conditional_mod = BRW_CONDITIONAL_L;
+      inst = emit(BRW_OPCODE_MOV, result_dst, src_reg(-1.0f));
+      inst->predicate = BRW_PREDICATE_NORMAL;
+
+      break;
+
+   case ir_unop_rcp:
+      emit_math(SHADER_OPCODE_RCP, result_dst, op[0]);
+      break;
+
+   case ir_unop_exp2:
+      emit_math(SHADER_OPCODE_EXP2, result_dst, op[0]);
+      break;
+   case ir_unop_log2:
+      emit_math(SHADER_OPCODE_LOG2, result_dst, op[0]);
+      break;
+   case ir_unop_exp:
+   case ir_unop_log:
+      assert(!"not reached: should be handled by ir_explog_to_explog2");
+      break;
+   case ir_unop_sin:
+   case ir_unop_sin_reduced:
+      emit_math(SHADER_OPCODE_SIN, result_dst, op[0]);
+      break;
+   case ir_unop_cos:
+   case ir_unop_cos_reduced:
+      emit_math(SHADER_OPCODE_COS, result_dst, op[0]);
+      break;
+
+   case ir_unop_dFdx:
+   case ir_unop_dFdy:
+      assert(!"derivatives not valid in vertex shader");
+      break;
+
+   case ir_unop_noise:
+      assert(!"not reached: should be handled by lower_noise");
+      break;
+
+   case ir_binop_add:
+      emit(BRW_OPCODE_ADD, result_dst, op[0], op[1]);
+      break;
+   case ir_binop_sub:
+      assert(!"not reached: should be handled by ir_sub_to_add_neg");
+      break;
+
+   case ir_binop_mul:
+      emit(BRW_OPCODE_MUL, result_dst, op[0], op[1]);
+      break;
+   case ir_binop_div:
+      assert(!"not reached: should be handled by ir_div_to_mul_rcp");
+   case ir_binop_mod:
+      assert(!"ir_binop_mod should have been converted to b * fract(a/b)");
+      break;
+
+   case ir_binop_less:
+   case ir_binop_greater:
+   case ir_binop_lequal:
+   case ir_binop_gequal:
+   case ir_binop_equal:
+   case ir_binop_nequal: {
+      dst_reg temp = result_dst;
+      /* original gen4 does implicit conversion before comparison. */
+      if (intel->gen < 5)
+	 temp.type = op[0].type;
+
+      inst = emit(BRW_OPCODE_CMP, temp, op[0], op[1]);
+      inst->conditional_mod = brw_conditional_for_comparison(ir->operation);
+      emit(BRW_OPCODE_AND, result_dst, this->result, src_reg(0x1));
+      break;
+   }
+
+   case ir_binop_all_equal:
+      /* "==" operator producing a scalar boolean. */
+      if (ir->operands[0]->type->is_vector() ||
+	  ir->operands[1]->type->is_vector()) {
+	 inst = emit(BRW_OPCODE_CMP, dst_null_cmp(), op[0], op[1]);
+	 inst->conditional_mod = BRW_CONDITIONAL_Z;
+
+	 emit(BRW_OPCODE_MOV, result_dst, src_reg(0));
+	 inst = emit(BRW_OPCODE_MOV, result_dst, src_reg(1));
+	 inst->predicate = BRW_PREDICATE_ALIGN16_ALL4H;
+      } else {
+	 dst_reg temp = result_dst;
+	 /* original gen4 does implicit conversion before comparison. */
+	 if (intel->gen < 5)
+	    temp.type = op[0].type;
+
+	 inst = emit(BRW_OPCODE_CMP, temp, op[0], op[1]);
+	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
+	 emit(BRW_OPCODE_AND, result_dst, result_src, src_reg(0x1));
+      }
+      break;
+   case ir_binop_any_nequal:
+      /* "!=" operator producing a scalar boolean. */
+      if (ir->operands[0]->type->is_vector() ||
+	  ir->operands[1]->type->is_vector()) {
+	 inst = emit(BRW_OPCODE_CMP, dst_null_cmp(), op[0], op[1]);
+	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
+
+	 emit(BRW_OPCODE_MOV, result_dst, src_reg(0));
+	 inst = emit(BRW_OPCODE_MOV, result_dst, src_reg(1));
+	 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
+      } else {
+	 dst_reg temp = result_dst;
+	 /* original gen4 does implicit conversion before comparison. */
+	 if (intel->gen < 5)
+	    temp.type = op[0].type;
+
+	 inst = emit(BRW_OPCODE_CMP, temp, op[0], op[1]);
+	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
+	 emit(BRW_OPCODE_AND, result_dst, result_src, src_reg(0x1));
+      }
+      break;
+
+   case ir_unop_any:
+      emit(BRW_OPCODE_CMP, dst_null_d(), op[0], src_reg(0));
+      emit(BRW_OPCODE_MOV, result_dst, src_reg(0));
+
+      inst = emit(BRW_OPCODE_MOV, result_dst, src_reg(1));
+      inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
+      break;
+
+   case ir_binop_logic_xor:
+      emit(BRW_OPCODE_XOR, result_dst, op[0], op[1]);
+      break;
+
+   case ir_binop_logic_or:
+      emit(BRW_OPCODE_OR, result_dst, op[0], op[1]);
+      break;
+
+   case ir_binop_logic_and:
+      emit(BRW_OPCODE_AND, result_dst, op[0], op[1]);
+      break;
+
+   case ir_binop_dot:
+      assert(ir->operands[0]->type->is_vector());
+      assert(ir->operands[0]->type == ir->operands[1]->type);
+      emit_dp(result_dst, op[0], op[1], ir->operands[0]->type->vector_elements);
+      break;
+
+   case ir_unop_sqrt:
+      emit_math(SHADER_OPCODE_SQRT, result_dst, op[0]);
+      break;
+   case ir_unop_rsq:
+      emit_math(SHADER_OPCODE_RSQ, result_dst, op[0]);
+      break;
+   case ir_unop_i2f:
+   case ir_unop_i2u:
+   case ir_unop_u2i:
+   case ir_unop_u2f:
+   case ir_unop_b2f:
+   case ir_unop_b2i:
+   case ir_unop_f2i:
+      emit(BRW_OPCODE_MOV, result_dst, op[0]);
+      break;
+   case ir_unop_f2b:
+   case ir_unop_i2b: {
+      dst_reg temp = result_dst;
+      /* original gen4 does implicit conversion before comparison. */
+      if (intel->gen < 5)
+	 temp.type = op[0].type;
+
+      inst = emit(BRW_OPCODE_CMP, temp, op[0], src_reg(0.0f));
+      inst->conditional_mod = BRW_CONDITIONAL_NZ;
+      inst = emit(BRW_OPCODE_AND, result_dst, result_src, src_reg(1));
+      break;
+   }
+
+   case ir_unop_trunc:
+      emit(BRW_OPCODE_RNDZ, result_dst, op[0]);
+      break;
+   case ir_unop_ceil:
+      op[0].negate = !op[0].negate;
+      inst = emit(BRW_OPCODE_RNDD, result_dst, op[0]);
+      this->result.negate = true;
+      break;
+   case ir_unop_floor:
+      inst = emit(BRW_OPCODE_RNDD, result_dst, op[0]);
+      break;
+   case ir_unop_fract:
+      inst = emit(BRW_OPCODE_FRC, result_dst, op[0]);
+      break;
+   case ir_unop_round_even:
+      emit(BRW_OPCODE_RNDE, result_dst, op[0]);
+      break;
+
+   case ir_binop_min:
+      inst = emit(BRW_OPCODE_CMP, result_dst, op[0], op[1]);
+      inst->conditional_mod = BRW_CONDITIONAL_L;
+
+      inst = emit(BRW_OPCODE_SEL, result_dst, op[0], op[1]);
+      inst->predicate = BRW_PREDICATE_NORMAL;
+      break;
+   case ir_binop_max:
+      inst = emit(BRW_OPCODE_CMP, result_dst, op[0], op[1]);
+      inst->conditional_mod = BRW_CONDITIONAL_G;
+
+      inst = emit(BRW_OPCODE_SEL, result_dst, op[0], op[1]);
+      inst->predicate = BRW_PREDICATE_NORMAL;
+      break;
+
+   case ir_binop_pow:
+      emit_math(SHADER_OPCODE_POW, result_dst, op[0], op[1]);
+      break;
+
+   case ir_unop_bit_not:
+      inst = emit(BRW_OPCODE_NOT, result_dst, op[0]);
+      break;
+   case ir_binop_bit_and:
+      inst = emit(BRW_OPCODE_AND, result_dst, op[0], op[1]);
+      break;
+   case ir_binop_bit_xor:
+      inst = emit(BRW_OPCODE_XOR, result_dst, op[0], op[1]);
+      break;
+   case ir_binop_bit_or:
+      inst = emit(BRW_OPCODE_OR, result_dst, op[0], op[1]);
+      break;
+
+   case ir_binop_lshift:
+   case ir_binop_rshift:
+      assert(!"GLSL 1.30 features unsupported");
+      break;
+
+   case ir_quadop_vector:
+      assert(!"not reached: should be handled by lower_quadop_vector");
+      break;
+   }
+}
+
+
+void
+vec4_visitor::visit(ir_swizzle *ir)
+{
+   src_reg src;
+   int i = 0;
+   int swizzle[4];
+
+   /* Note that this is only swizzles in expressions, not those on the left
+    * hand side of an assignment, which do write masking.  See ir_assignment
+    * for that.
+    */
+
+   ir->val->accept(this);
+   src = this->result;
+   assert(src.file != BAD_FILE);
+
+   if (i < ir->type->vector_elements) {
+      switch (i) {
+      case 0:
+	 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.x);
+	 break;
+      case 1:
+	 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.y);
+	 break;
+      case 2:
+	 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.z);
+	 break;
+      case 3:
+	 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.w);
+	    break;
+      }
+   }
+   for (; i < 4; i++) {
+      /* Replicate the last channel out. */
+      swizzle[i] = swizzle[ir->type->vector_elements - 1];
+   }
+
+   src.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
+
+   this->result = src;
+}
+
+void
+vec4_visitor::visit(ir_dereference_variable *ir)
+{
+   dst_reg *reg = variable_storage(ir->var);
+
+   if (!reg) {
+      fail("Failed to find variable storage for %s\n", ir->var->name);
+      this->result = src_reg(brw_null_reg());
+      return;
+   }
+
+   this->result = src_reg(*reg);
+}
+
+void
+vec4_visitor::visit(ir_dereference_array *ir)
+{
+   ir_constant *constant_index;
+   src_reg src;
+   int element_size = type_size(ir->type);
+
+   constant_index = ir->array_index->constant_expression_value();
+
+   ir->array->accept(this);
+   src = this->result;
+
+   if (constant_index) {
+      src.reg_offset += constant_index->value.i[0] * element_size;
+   } else {
+#if 0 /* Variable array index */
+      /* Variable index array dereference.  It eats the "vec4" of the
+       * base of the array and an index that offsets the Mesa register
+       * index.
+       */
+      ir->array_index->accept(this);
+
+      src_reg index_reg;
+
+      if (element_size == 1) {
+	 index_reg = this->result;
+      } else {
+	 index_reg = src_reg(this, glsl_type::float_type);
+
+	 emit(BRW_OPCODE_MUL, dst_reg(index_reg),
+	      this->result, src_reg_for_float(element_size));
+      }
+
+      src.reladdr = ralloc(mem_ctx, src_reg);
+      memcpy(src.reladdr, &index_reg, sizeof(index_reg));
+#endif
+   }
+
+   /* If the type is smaller than a vec4, replicate the last channel out. */
+   if (ir->type->is_scalar() || ir->type->is_vector())
+      src.swizzle = swizzle_for_size(ir->type->vector_elements);
+   else
+      src.swizzle = BRW_SWIZZLE_NOOP;
+
+   this->result = src;
+}
+
+void
+vec4_visitor::visit(ir_dereference_record *ir)
+{
+   unsigned int i;
+   const glsl_type *struct_type = ir->record->type;
+   int offset = 0;
+
+   ir->record->accept(this);
+
+   for (i = 0; i < struct_type->length; i++) {
+      if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
+	 break;
+      offset += type_size(struct_type->fields.structure[i].type);
+   }
+
+   /* If the type is smaller than a vec4, replicate the last channel out. */
+   if (ir->type->is_scalar() || ir->type->is_vector())
+      this->result.swizzle = swizzle_for_size(ir->type->vector_elements);
+   else
+      this->result.swizzle = BRW_SWIZZLE_NOOP;
+
+   this->result.reg_offset += offset;
+}
+
+/**
+ * We want to be careful in assignment setup to hit the actual storage
+ * instead of potentially using a temporary like we might with the
+ * ir_dereference handler.
+ */
+static dst_reg
+get_assignment_lhs(ir_dereference *ir, vec4_visitor *v)
+{
+   /* The LHS must be a dereference.  If the LHS is a variable indexed array
+    * access of a vector, it must be separated into a series conditional moves
+    * before reaching this point (see ir_vec_index_to_cond_assign).
+    */
+   assert(ir->as_dereference());
+   ir_dereference_array *deref_array = ir->as_dereference_array();
+   if (deref_array) {
+      assert(!deref_array->array->type->is_vector());
+   }
+
+   /* Use the rvalue deref handler for the most part.  We'll ignore
+    * swizzles in it and write swizzles using writemask, though.
+    */
+   ir->accept(v);
+   return dst_reg(v->result);
+}
+
+void
+vec4_visitor::emit_block_move(ir_assignment *ir)
+{
+   ir->rhs->accept(this);
+   src_reg src = this->result;
+
+   dst_reg dst = get_assignment_lhs(ir->lhs, this);
+
+   /* FINISHME: This should really set to the correct maximal writemask for each
+    * FINISHME: component written (in the loops below).
+    */
+   dst.writemask = WRITEMASK_XYZW;
+
+   for (int i = 0; i < type_size(ir->lhs->type); i++) {
+      vec4_instruction *inst = emit(BRW_OPCODE_MOV, dst, src);
+      if (ir->condition)
+	 inst->predicate = BRW_PREDICATE_NORMAL;
+
+      dst.reg_offset++;
+      src.reg_offset++;
+   }
+}
+
+void
+vec4_visitor::visit(ir_assignment *ir)
+{
+   if (!ir->lhs->type->is_scalar() &&
+       !ir->lhs->type->is_vector()) {
+      emit_block_move(ir);
+      return;
+   }
+
+   /* Now we're down to just a scalar/vector with writemasks. */
+   int i;
+
+   ir->rhs->accept(this);
+   src_reg src = this->result;
+
+   dst_reg dst = get_assignment_lhs(ir->lhs, this);
+
+   int swizzles[4];
+   int first_enabled_chan = 0;
+   int src_chan = 0;
+
+   assert(ir->lhs->type->is_vector());
+   dst.writemask = ir->write_mask;
+
+   for (int i = 0; i < 4; i++) {
+      if (dst.writemask & (1 << i)) {
+	 first_enabled_chan = BRW_GET_SWZ(src.swizzle, i);
+	 break;
+      }
+   }
+
+   /* Swizzle a small RHS vector into the channels being written.
+    *
+    * glsl ir treats write_mask as dictating how many channels are
+    * present on the RHS while in our instructions we need to make
+    * those channels appear in the slots of the vec4 they're written to.
+    */
+   for (int i = 0; i < 4; i++) {
+      if (dst.writemask & (1 << i))
+	 swizzles[i] = BRW_GET_SWZ(src.swizzle, src_chan++);
+      else
+	 swizzles[i] = first_enabled_chan;
+   }
+   src.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
+			      swizzles[2], swizzles[3]);
+
+   if (ir->condition) {
+      emit_bool_to_cond_code(ir->condition);
+   }
+
+   for (i = 0; i < type_size(ir->lhs->type); i++) {
+      vec4_instruction *inst = emit(BRW_OPCODE_MOV, dst, src);
+
+      if (ir->condition)
+	 inst->predicate = BRW_PREDICATE_NORMAL;
+
+      dst.reg_offset++;
+      src.reg_offset++;
+   }
+}
+
+
+void
+vec4_visitor::visit(ir_constant *ir)
+{
+   if (ir->type->base_type == GLSL_TYPE_STRUCT) {
+      src_reg temp_base = src_reg(this, ir->type);
+      dst_reg temp = dst_reg(temp_base);
+
+      foreach_iter(exec_list_iterator, iter, ir->components) {
+	 ir_constant *field_value = (ir_constant *)iter.get();
+	 int size = type_size(field_value->type);
+
+	 assert(size > 0);
+
+	 field_value->accept(this);
+	 src_reg src = this->result;
+
+	 for (int i = 0; i < (unsigned int)size; i++) {
+	    emit(BRW_OPCODE_MOV, temp, src);
+
+	    src.reg_offset++;
+	    temp.reg_offset++;
+	 }
+      }
+      this->result = temp_base;
+      return;
+   }
+
+   if (ir->type->is_array()) {
+      src_reg temp_base = src_reg(this, ir->type);
+      dst_reg temp = dst_reg(temp_base);
+      int size = type_size(ir->type->fields.array);
+
+      assert(size > 0);
+
+      for (unsigned int i = 0; i < ir->type->length; i++) {
+	 ir->array_elements[i]->accept(this);
+	 src_reg src = this->result;
+	 for (int j = 0; j < size; j++) {
+	    emit(BRW_OPCODE_MOV, temp, src);
+
+	    src.reg_offset++;
+	    temp.reg_offset++;
+	 }
+      }
+      this->result = temp_base;
+      return;
+   }
+
+   if (ir->type->is_matrix()) {
+      this->result = src_reg(this, ir->type);
+      dst_reg dst = dst_reg(this->result);
+
+      assert(ir->type->base_type == GLSL_TYPE_FLOAT);
+
+      for (int i = 0; i < ir->type->matrix_columns; i++) {
+	 for (int j = 0; j < ir->type->vector_elements; j++) {
+	    dst.writemask = 1 << j;
+	    emit(BRW_OPCODE_MOV, dst,
+		 src_reg(ir->value.f[i * ir->type->vector_elements + j]));
+	 }
+	 dst.reg_offset++;
+      }
+      return;
+   }
+
+   for (int i = 0; i < ir->type->vector_elements; i++) {
+      this->result = src_reg(this, ir->type);
+      dst_reg dst = dst_reg(this->result);
+
+      dst.writemask = 1 << i;
+
+      switch (ir->type->base_type) {
+      case GLSL_TYPE_FLOAT:
+	 emit(BRW_OPCODE_MOV, dst, src_reg(ir->value.f[i]));
+	 break;
+      case GLSL_TYPE_INT:
+	 emit(BRW_OPCODE_MOV, dst, src_reg(ir->value.i[i]));
+	 break;
+      case GLSL_TYPE_UINT:
+	 emit(BRW_OPCODE_MOV, dst, src_reg(ir->value.u[i]));
+	 break;
+      case GLSL_TYPE_BOOL:
+	 emit(BRW_OPCODE_MOV, dst, src_reg(ir->value.b[i]));
+	 break;
+      default:
+	 assert(!"Non-float/uint/int/bool constant");
+	 break;
+      }
+   }
+}
+
+void
+vec4_visitor::visit(ir_call *ir)
+{
+   assert(!"not reached");
+}
+
+void
+vec4_visitor::visit(ir_texture *ir)
+{
+   assert(!"not reached");
+}
+
+void
+vec4_visitor::visit(ir_return *ir)
+{
+   assert(!"not reached");
+}
+
+void
+vec4_visitor::visit(ir_discard *ir)
+{
+   assert(!"not reached");
+}
+
+void
+vec4_visitor::visit(ir_if *ir)
+{
+   this->base_ir = ir->condition;
+   ir->condition->accept(this);
+   assert(this->result.file != BAD_FILE);
+
+   /* FINISHME: condcode */
+   emit(BRW_OPCODE_IF);
+
+   visit_instructions(&ir->then_instructions);
+
+   if (!ir->else_instructions.is_empty()) {
+      this->base_ir = ir->condition;
+      emit(BRW_OPCODE_ELSE);
+
+      visit_instructions(&ir->else_instructions);
+   }
+
+   this->base_ir = ir->condition;
+   emit(BRW_OPCODE_ENDIF);
+}
+
+int
+vec4_visitor::emit_vue_header_gen4(int header_mrf)
+{
+   /* Get the position */
+   src_reg pos = src_reg(output_reg[VERT_RESULT_HPOS]);
+
+   /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
+   dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
+
+   current_annotation = "NDC";
+   dst_reg ndc_w = ndc;
+   ndc_w.writemask = WRITEMASK_W;
+   src_reg pos_w = pos;
+   pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
+   emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);
+
+   dst_reg ndc_xyz = ndc;
+   ndc_xyz.writemask = WRITEMASK_XYZ;
+
+   emit(BRW_OPCODE_MUL, ndc_xyz, pos, src_reg(ndc_w));
+
+   if ((c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_PSIZ)) ||
+       c->key.nr_userclip || brw->has_negative_rhw_bug) {
+      dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
+      GLuint i;
+
+      emit(BRW_OPCODE_MOV, header1, 0u);
+
+      if (c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_PSIZ)) {
+	 assert(!"finishme: psiz");
+	 src_reg psiz;
+
+	 header1.writemask = WRITEMASK_W;
+	 emit(BRW_OPCODE_MUL, header1, psiz, 1u << 11);
+	 emit(BRW_OPCODE_AND, header1, src_reg(header1), 0x7ff << 8);
+      }
+
+      for (i = 0; i < c->key.nr_userclip; i++) {
+	 vec4_instruction *inst;
+
+	 inst = emit(BRW_OPCODE_DP4, dst_reg(brw_null_reg()),
+		     pos, src_reg(c->userplane[i]));
+	 inst->conditional_mod = BRW_CONDITIONAL_L;
+
+	 emit(BRW_OPCODE_OR, header1, src_reg(header1), 1u << i);
+	 inst->predicate = BRW_PREDICATE_NORMAL;
+      }
+
+      /* i965 clipping workaround:
+       * 1) Test for -ve rhw
+       * 2) If set,
+       *      set ndc = (0,0,0,0)
+       *      set ucp[6] = 1
+       *
+       * Later, clipping will detect ucp[6] and ensure the primitive is
+       * clipped against all fixed planes.
+       */
+      if (brw->has_negative_rhw_bug) {
+#if 0
+	 /* FINISHME */
+	 brw_CMP(p,
+		 vec8(brw_null_reg()),
+		 BRW_CONDITIONAL_L,
+		 brw_swizzle1(ndc, 3),
+		 brw_imm_f(0));
+
+	 brw_OR(p, brw_writemask(header1, WRITEMASK_W), header1, brw_imm_ud(1<<6));
+	 brw_MOV(p, ndc, brw_imm_f(0));
+	 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
+#endif
+      }
+
+      header1.writemask = WRITEMASK_XYZW;
+      emit(BRW_OPCODE_MOV, brw_message_reg(header_mrf++), src_reg(header1));
+   } else {
+      emit(BRW_OPCODE_MOV, retype(brw_message_reg(header_mrf++),
+				  BRW_REGISTER_TYPE_UD), 0u);
+   }
+
+   if (intel->gen == 5) {
+      /* There are 20 DWs (D0-D19) in VUE header on Ironlake:
+       * dword 0-3 (m1) of the header is indices, point width, clip flags.
+       * dword 4-7 (m2) is the ndc position (set above)
+       * dword 8-11 (m3) of the vertex header is the 4D space position
+       * dword 12-19 (m4,m5) of the vertex header is the user clip distance.
+       * m6 is a pad so that the vertex element data is aligned
+       * m7 is the first vertex data we fill, which is the vertex position.
+       */
+      current_annotation = "NDC";
+      emit(BRW_OPCODE_MOV, brw_message_reg(header_mrf++), src_reg(ndc));
+
+      current_annotation = "gl_Position";
+      emit(BRW_OPCODE_MOV, brw_message_reg(header_mrf++), pos);
+
+      /* user clip distance. */
+      header_mrf += 2;
+
+      /* Pad so that vertex element data (starts with position) is aligned. */
+      header_mrf++;
+   } else {
+      /* There are 8 dwords in VUE header pre-Ironlake:
+       * dword 0-3 (m1) is indices, point width, clip flags.
+       * dword 4-7 (m2) is ndc position (set above)
+       *
+       * dword 8-11 (m3) is the first vertex data, which we always have be the
+       * vertex position.
+       */
+      current_annotation = "NDC";
+      emit(BRW_OPCODE_MOV, brw_message_reg(header_mrf++), src_reg(ndc));
+
+      current_annotation = "gl_Position";
+      emit(BRW_OPCODE_MOV, brw_message_reg(header_mrf++), pos);
+   }
+
+   return header_mrf;
+}
+
+int
+vec4_visitor::emit_vue_header_gen6(int header_mrf)
+{
+   struct brw_reg reg;
+
+   /* There are 8 or 16 DWs (D0-D15) in VUE header on Sandybridge:
+    * dword 0-3 (m2) of the header is indices, point width, clip flags.
+    * dword 4-7 (m3) is the 4D space position
+    * dword 8-15 (m4,m5) of the vertex header is the user clip distance if
+    * enabled.
+    *
+    * m4 or 6 is the first vertex element data we fill, which is
+    * the vertex position.
+    */
+
+   current_annotation = "indices, point width, clip flags";
+   reg = brw_message_reg(header_mrf++);
+   emit(BRW_OPCODE_MOV, retype(reg, BRW_REGISTER_TYPE_D), src_reg(0));
+   if (c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_PSIZ)) {
+      emit(BRW_OPCODE_MOV, brw_writemask(reg, WRITEMASK_W),
+	   src_reg(output_reg[VERT_RESULT_PSIZ]));
+   }
+
+   current_annotation = "gl_Position";
+   emit(BRW_OPCODE_MOV,
+	brw_message_reg(header_mrf++), src_reg(output_reg[VERT_RESULT_HPOS]));
+
+   current_annotation = "user clip distances";
+   if (c->key.nr_userclip) {
+      for (int i = 0; i < c->key.nr_userclip; i++) {
+	 struct brw_reg m;
+	 if (i < 4)
+	    m = brw_message_reg(header_mrf);
+	 else
+	    m = brw_message_reg(header_mrf + 1);
+
+	 emit(BRW_OPCODE_DP4,
+	      dst_reg(brw_writemask(m, 1 << (i & 7))),
+	      src_reg(c->userplane[i]));
+      }
+      header_mrf += 2;
+   }
+
+   current_annotation = NULL;
+
+   return header_mrf;
+}
+
+static int
+align_interleaved_urb_mlen(struct brw_context *brw, int mlen)
+{
+   struct intel_context *intel = &brw->intel;
+
+   if (intel->gen >= 6) {
+      /* URB data written (does not include the message header reg) must
+       * be a multiple of 256 bits, or 2 VS registers.  See vol5c.5,
+       * section 5.4.3.2.2: URB_INTERLEAVED.
+       *
+       * URB entries are allocated on a multiple of 1024 bits, so an
+       * extra 128 bits written here to make the end align to 256 is
+       * no problem.
+       */
+      if ((mlen % 2) != 1)
+	 mlen++;
+   }
+
+   return mlen;
+}
+
+/**
+ * Generates the VUE payload plus the 1 or 2 URB write instructions to
+ * complete the VS thread.
+ *
+ * The VUE layout is documented in Volume 2a.
+ */
+void
+vec4_visitor::emit_urb_writes()
+{
+   int base_mrf = 1;
+   int mrf = base_mrf;
+   int urb_entry_size;
+
+   /* FINISHME: edgeflag */
+
+   /* First mrf is the g0-based message header containing URB handles and such,
+    * which is implied in VS_OPCODE_URB_WRITE.
+    */
+   mrf++;
+
+   if (intel->gen >= 6) {
+      mrf = emit_vue_header_gen6(mrf);
+   } else {
+      mrf = emit_vue_header_gen4(mrf);
+   }
+
+   int attr;
+   for (attr = 0; attr < VERT_RESULT_MAX; attr++) {
+      if (!(c->prog_data.outputs_written & BITFIELD64_BIT(attr)))
+	 continue;
+
+      /* This is loaded into the VUE header, and thus doesn't occupy
+       * an attribute slot.
+       */
+      if (attr == VERT_RESULT_PSIZ)
+	 continue;
+
+      emit(BRW_OPCODE_MOV, brw_message_reg(mrf++), src_reg(output_reg[attr]));
+
+      /* If this is MRF 15, we can't fit anything more into this URB
+       * WRITE.  Note that base_mrf of 1 means that MRF 15 is an
+       * even-numbered amount of URB write data, which will meet
+       * gen6's requirements for length alignment.
+       */
+      if (mrf == 15)
+	 break;
+   }
+
+   vec4_instruction *inst = emit(VS_OPCODE_URB_WRITE);
+   inst->base_mrf = base_mrf;
+   inst->mlen = align_interleaved_urb_mlen(brw, mrf - base_mrf);
+   inst->eot = true;
+
+   urb_entry_size = mrf - base_mrf;
+
+   for (; attr < VERT_RESULT_MAX; attr++) {
+      if (!(c->prog_data.outputs_written & BITFIELD64_BIT(attr)))
+	 continue;
+      fail("Second URB write not supported.\n");
+      break;
+   }
+
+   if (intel->gen == 6)
+      c->prog_data.urb_entry_size = ALIGN(urb_entry_size, 8) / 8;
+   else
+      c->prog_data.urb_entry_size = ALIGN(urb_entry_size, 4) / 4;
+}
+
+vec4_visitor::vec4_visitor(struct brw_vs_compile *c,
+			   struct gl_shader_program *prog,
+			   struct brw_shader *shader)
+{
+   this->c = c;
+   this->p = &c->func;
+   this->brw = p->brw;
+   this->intel = &brw->intel;
+   this->ctx = &intel->ctx;
+   this->prog = prog;
+   this->shader = shader;
+
+   this->mem_ctx = ralloc_context(NULL);
+   this->failed = false;
+
+   this->base_ir = NULL;
+   this->current_annotation = NULL;
+
+   this->c = c;
+   this->prog_data = &c->prog_data;
+
+   this->variable_ht = hash_table_ctor(0,
+				       hash_table_pointer_hash,
+				       hash_table_pointer_compare);
+
+   this->virtual_grf_sizes = NULL;
+   this->virtual_grf_count = 0;
+   this->virtual_grf_array_size = 0;
+}
+
+vec4_visitor::~vec4_visitor()
+{
+   hash_table_dtor(this->variable_ht);
+}
+
+
+void
+vec4_visitor::fail(const char *format, ...)
+{
+   va_list va;
+   char *msg;
+
+   if (failed)
+      return;
+
+   failed = true;
+
+   va_start(va, format);
+   msg = ralloc_vasprintf(mem_ctx, format, va);
+   va_end(va);
+   msg = ralloc_asprintf(mem_ctx, "VS compile failed: %s\n", msg);
+
+   this->fail_msg = msg;
+
+   if (INTEL_DEBUG & DEBUG_VS) {
+      fprintf(stderr, "%s",  msg);
+   }
+}
+
+} /* namespace brw */
diff --git a/src/mesa/drivers/dri/i965/brw_vs.c b/src/mesa/drivers/dri/i965/brw_vs.c
index a9ad5311fe3..bd0677db151 100644
--- a/src/mesa/drivers/dri/i965/brw_vs.c
+++ b/src/mesa/drivers/dri/i965/brw_vs.c
@@ -30,6 +30,7 @@
   */
            
 
+#include "main/compiler.h"
 #include "brw_context.h"
 #include "brw_vs.h"
 #include "brw_util.h"
@@ -50,6 +51,7 @@ static void do_vs_prog( struct brw_context *brw,
    void *mem_ctx;
    int aux_size;
    int i;
+   static int new_vs = -1;
 
    memset(&c, 0, sizeof(c));
    memcpy(&c.key, key, sizeof(*key));
@@ -85,7 +87,15 @@ static void do_vs_prog( struct brw_context *brw,
 
    /* Emit GEN4 code.
     */
-   brw_vs_emit(&c);
+   if (new_vs == -1)
+      new_vs = getenv("INTEL_NEW_VS") != NULL;
+
+   if (new_vs) {
+      if (!brw_vs_emit(&c))
+	 brw_old_vs_emit(&c);
+   } else {
+      brw_old_vs_emit(&c);
+   }
 
    /* get the program
     */
diff --git a/src/mesa/drivers/dri/i965/brw_vs.h b/src/mesa/drivers/dri/i965/brw_vs.h
index 432994a8534..9f9fed33970 100644
--- a/src/mesa/drivers/dri/i965/brw_vs.h
+++ b/src/mesa/drivers/dri/i965/brw_vs.h
@@ -92,6 +92,7 @@ struct brw_vs_compile {
    GLboolean needs_stack;
 };
 
-void brw_vs_emit( struct brw_vs_compile *c );
+bool brw_vs_emit(struct brw_vs_compile *c);
+void brw_old_vs_emit(struct brw_vs_compile *c);
 
 #endif
diff --git a/src/mesa/drivers/dri/i965/brw_vs_emit.c b/src/mesa/drivers/dri/i965/brw_vs_emit.c
index dbabb44e45c..a06a2bbec52 100644
--- a/src/mesa/drivers/dri/i965/brw_vs_emit.c
+++ b/src/mesa/drivers/dri/i965/brw_vs_emit.c
@@ -1903,7 +1903,7 @@ brw_vs_rescale_gl_fixed(struct brw_vs_compile *c)
 
 /* Emit the vertex program instructions here.
  */
-void brw_vs_emit(struct brw_vs_compile *c )
+void brw_old_vs_emit(struct brw_vs_compile *c )
 {
 #define MAX_IF_DEPTH 32
 #define MAX_LOOP_DEPTH 32