62 files changed, 3674 insertions, 3039 deletions
diff --git a/src/mesa/drivers/dri/Makefile.defines b/src/mesa/drivers/dri/Makefile.defines
new file mode 100644
index 00000000000..19b6de8b85a
--- /dev/null
+++ b/src/mesa/drivers/dri/Makefile.defines
@@ -0,0 +1,34 @@
+# -*-makefile-*-
+# Variable definitions shared by the DRI driver Makefiles that include this.
+COMMON_GALLIUM_SOURCES = \
+        ../common/utils.c \
+        ../common/vblank.c \
+        ../common/dri_util.c \
+        ../common/xmlconfig.c
+
+COMMON_SOURCES = $(COMMON_GALLIUM_SOURCES) \
+        ../../common/driverfuncs.c \
+        ../common/texmem.c \
+        ../common/drirenderbuffer.c
+# Keep '=' (deferred): SHARED_INCLUDES is only defined further down this file.
+INCLUDES = $(SHARED_INCLUDES) $(EXPAT_INCLUDES)
+# Keep '=' (deferred): the driver Makefile sets *_SOURCES after including this.
+OBJECTS = $(C_SOURCES:.c=.o) \
+	  $(CXX_SOURCES:.cpp=.o) \
+	  $(ASM_SOURCES:.S=.o) 
+
+
+### Include directories
+SHARED_INCLUDES = \
+	-I. \
+	-I$(TOP)/src/mesa/drivers/dri/common \
+	-Iserver \
+	-I$(TOP)/include \
+	-I$(TOP)/src/mapi \
+	-I$(TOP)/src/mesa \
+	-I$(TOP)/src/egl/main \
+	-I$(TOP)/src/egl/drivers/dri \
+	$(LIBDRM_CFLAGS)
+
+INCLUDES += $(API_DEFINES)
+CXXFLAGS += $(API_DEFINES)
diff --git a/src/mesa/drivers/dri/Makefile.template b/src/mesa/drivers/dri/Makefile.targets
index d1a119379ed..436b2a3c477 100644
--- a/src/mesa/drivers/dri/Makefile.template
+++ b/src/mesa/drivers/dri/Makefile.targets
@@ -1,38 +1,5 @@
 # -*-makefile-*-
 
-COMMON_GALLIUM_SOURCES = \
-        ../common/utils.c \
-        ../common/vblank.c \
-        ../common/dri_util.c \
-        ../common/xmlconfig.c
-
-COMMON_SOURCES = $(COMMON_GALLIUM_SOURCES) \
-        ../../common/driverfuncs.c \
-        ../common/texmem.c \
-        ../common/drirenderbuffer.c
-
-INCLUDES = $(SHARED_INCLUDES) $(EXPAT_INCLUDES)
-
-OBJECTS = $(C_SOURCES:.c=.o) \
-	  $(CXX_SOURCES:.cpp=.o) \
-	  $(ASM_SOURCES:.S=.o) 
-
-
-### Include directories
-SHARED_INCLUDES = \
-	-I. \
-	-I$(TOP)/src/mesa/drivers/dri/common \
-	-Iserver \
-	-I$(TOP)/include \
-	-I$(TOP)/src/mapi \
-	-I$(TOP)/src/mesa \
-	-I$(TOP)/src/egl/main \
-	-I$(TOP)/src/egl/drivers/dri \
-	$(LIBDRM_CFLAGS)
-
-CFLAGS += $(API_DEFINES)
-CXXFLAGS += $(API_DEFINES)
-
 ##### RULES #####
 
 .c.o:
@@ -45,7 +12,7 @@ CXXFLAGS += $(API_DEFINES)
 	$(CC) -c $(INCLUDES) $(DRI_CFLAGS) $(DRIVER_DEFINES) $< -o $@
 
 
-##### TARGETS #####
+##### TARGETS #####
 
 default: subdirs lib
 
@@ -55,7 +22,7 @@ lib: symlinks subdirs depend
 	@$(MAKE) $(LIBNAME) $(TOP)/$(LIB_DIR)/$(LIBNAME)
 
 $(LIBNAME): $(OBJECTS) $(EXTRA_MODULES) $(MESA_MODULES) Makefile \
-		$(TOP)/src/mesa/drivers/dri/Makefile.template $(TOP)/src/mesa/drivers/dri/common/dri_test.o
+		$(TOP)/src/mesa/drivers/dri/Makefile.targets $(TOP)/src/mesa/drivers/dri/common/dri_test.o
 	$(MKLIB) -o $@.tmp -noprefix -linker '$(CXX)' -ldflags '$(LDFLAGS)' \
 		$(OBJECTS) $(EXTRA_MODULES) $(DRI_LIB_DEPS)
 	$(CXX) $(CFLAGS) -o $@.test $(TOP)/src/mesa/drivers/dri/common/dri_test.o $@.tmp $(DRI_LIB_DEPS)
diff --git a/src/mesa/drivers/dri/i810/Makefile b/src/mesa/drivers/dri/i810/Makefile
index 54a837d5ea9..edc6dd21732 100644
--- a/src/mesa/drivers/dri/i810/Makefile
+++ b/src/mesa/drivers/dri/i810/Makefile
@@ -5,6 +5,8 @@ include $(TOP)/configs/current
 
 LIBNAME = i810_dri.so
 
+include ../Makefile.defines
+
 DRIVER_SOURCES = \
 	i810context.c \
 	i810ioctl.c \
@@ -24,5 +26,5 @@ C_SOURCES = \
 
 ASM_SOURCES = 
 
-include ../Makefile.template
+include ../Makefile.targets
 
diff --git a/src/mesa/drivers/dri/i915/Makefile b/src/mesa/drivers/dri/i915/Makefile
index 65fd658c047..79e03f2f1d1 100644
--- a/src/mesa/drivers/dri/i915/Makefile
+++ b/src/mesa/drivers/dri/i915/Makefile
@@ -4,6 +4,8 @@ include $(TOP)/configs/current
 
 LIBNAME = i915_dri.so
 
+include ../Makefile.defines
+
 DRIVER_SOURCES = \
 	i830_context.c \
 	i830_state.c \
@@ -63,7 +65,7 @@ DRIVER_DEFINES = -I../intel -DI915 \
 INCLUDES += $(INTEL_CFLAGS)
 DRI_LIB_DEPS += $(INTEL_LIBS)
 
-include ../Makefile.template
+include ../Makefile.targets
 
 intel_decode.o: ../intel/intel_decode.c
 
diff --git a/src/mesa/drivers/dri/i915/i915_vtbl.c b/src/mesa/drivers/dri/i915/i915_vtbl.c
index 89650b618e4..820feba04ba 100644
--- a/src/mesa/drivers/dri/i915/i915_vtbl.c
+++ b/src/mesa/drivers/dri/i915/i915_vtbl.c
@@ -687,6 +687,14 @@ i915_assert_not_dirty( struct intel_context *intel )
    (void) dirty;
 }
 
+/** Return false; i915 does not support HiZ. */
+static bool
+i915_is_hiz_depth_format(struct intel_context *intel,
+                         gl_format format)
+{
+   return false;
+}
+
 void
 i915InitVtbl(struct i915_context *i915)
 {
@@ -702,4 +710,5 @@ i915InitVtbl(struct i915_context *i915)
    i915->intel.vtbl.assert_not_dirty = i915_assert_not_dirty;
    i915->intel.vtbl.finish_batch = intel_finish_vb;
    i915->intel.vtbl.render_target_supported = i915_render_target_supported;
+   i915->intel.vtbl.is_hiz_depth_format = i915_is_hiz_depth_format;
 }
diff --git a/src/mesa/drivers/dri/i965/Makefile b/src/mesa/drivers/dri/i965/Makefile
index b96f42bfe88..44f28cd9d15 100644
--- a/src/mesa/drivers/dri/i965/Makefile
+++ b/src/mesa/drivers/dri/i965/Makefile
@@ -4,6 +4,8 @@ include $(TOP)/configs/current
 
 LIBNAME = i965_dri.so
 
+include ../Makefile.defines
+
 DRIVER_SOURCES = \
 	intel_batchbuffer.c \
 	intel_blit.c \
@@ -116,10 +118,13 @@ C_SOURCES = \
 CXX_SOURCES = \
 	brw_cubemap_normalize.cpp \
 	brw_fs.cpp \
+	brw_fs_emit.cpp \
+	brw_fs_visitor.cpp \
 	brw_fs_channel_expressions.cpp \
 	brw_fs_reg_allocate.cpp \
 	brw_fs_schedule_instructions.cpp \
-	brw_fs_vector_splitting.cpp
+	brw_fs_vector_splitting.cpp \
+	brw_shader.cpp
 
 ASM_SOURCES = 
 
@@ -128,7 +133,7 @@ DRIVER_DEFINES = -I../intel
 INCLUDES += $(INTEL_CFLAGS)
 DRI_LIB_DEPS += $(INTEL_LIBS)
 
-include ../Makefile.template
+include ../Makefile.targets
 
 intel_decode.o: ../intel/intel_decode.c
 intel_tex_layout.o: ../intel/intel_tex_layout.c
diff --git a/src/mesa/drivers/dri/i965/brw_context.c b/src/mesa/drivers/dri/i965/brw_context.c
index df753abed02..0256ab9061f 100644
--- a/src/mesa/drivers/dri/i965/brw_context.c
+++ b/src/mesa/drivers/dri/i965/brw_context.c
@@ -94,9 +94,9 @@ GLboolean brwCreateContext( int api,
       ctx->Const.MaxVertexTextureImageUnits +
       ctx->Const.MaxTextureImageUnits;
 
-   /* Mesa limits textures to 4kx4k; it would be nice to fix that someday
-    */
-   ctx->Const.MaxTextureLevels = 13;
+   ctx->Const.MaxTextureLevels = 14; /* 8192 */
+   if (ctx->Const.MaxTextureLevels > MAX_TEXTURE_LEVELS)
+	   ctx->Const.MaxTextureLevels = MAX_TEXTURE_LEVELS;
    ctx->Const.Max3DTextureLevels = 9;
    ctx->Const.MaxCubeTextureLevels = 12;
    ctx->Const.MaxTextureRectSize = (1<<12);
diff --git a/src/mesa/drivers/dri/i965/brw_context.h b/src/mesa/drivers/dri/i965/brw_context.h
index 1d2ef066db2..621b6f8990b 100644
--- a/src/mesa/drivers/dri/i965/brw_context.h
+++ b/src/mesa/drivers/dri/i965/brw_context.h
@@ -231,8 +231,8 @@ struct brw_wm_prog_data {
 
    GLuint first_curbe_grf;
    GLuint first_curbe_grf_16;
-   GLuint total_grf;
-   GLuint total_grf_16;
+   GLuint reg_blocks;
+   GLuint reg_blocks_16;
    GLuint total_scratch;
 
    GLuint nr_params;       /**< number of float params/constants */
@@ -863,6 +863,17 @@ float convert_param(enum param_conversion conversion, float param)
    }
 }
 
+/**
+ * Pre-gen6, the register file of the EUs was shared between threads,
+ * and each thread used some subset allocated on a 16-register block
+ * granularity.  The unit states wanted these block counts.
+ */
+static inline int
+brw_register_blocks(int reg_count)
+{
+   return ALIGN(reg_count, 16) / 16 - 1;
+}
+
 GLboolean brw_do_cubemap_normalize(struct exec_list *instructions);
 
 #endif
diff --git a/src/mesa/drivers/dri/i965/brw_defines.h b/src/mesa/drivers/dri/i965/brw_defines.h
index 5eb7892bb08..6d41b1e69d3 100644
--- a/src/mesa/drivers/dri/i965/brw_defines.h
+++ b/src/mesa/drivers/dri/i965/brw_defines.h
@@ -28,7 +28,10 @@
   * Authors:
   *   Keith Whitwell <[email protected]>
   */
- 
+/* Bit-field helpers.  1u: masks reaching bit 31 must not overflow signed int. */
+#define INTEL_MASK(high, low) (((1u<<((high)-(low)+1))-1)<<(low))
+#define SET_FIELD(value, field) (((value) << field ## _SHIFT) & field ## _MASK)
+#define GET_FIELD(word, field) (((word)  & field ## _MASK) >> field ## _SHIFT)
 
 #ifndef BRW_DEFINES_H
 #define BRW_DEFINES_H
@@ -243,8 +246,17 @@
 #define BRW_STENCILOP_DECR               6
 #define BRW_STENCILOP_INVERT             7
 
+/* Surface state DW0 */
+#define BRW_SURFACE_RC_READ_WRITE	(1 << 8)
+#define BRW_SURFACE_MIPLAYOUT_SHIFT	10
 #define BRW_SURFACE_MIPMAPLAYOUT_BELOW   0
 #define BRW_SURFACE_MIPMAPLAYOUT_RIGHT   1
+#define BRW_SURFACE_CUBEFACE_ENABLES	0x3f
+#define BRW_SURFACE_BLEND_ENABLED	(1 << 13)
+#define BRW_SURFACE_WRITEDISABLE_B_SHIFT	14
+#define BRW_SURFACE_WRITEDISABLE_G_SHIFT	15
+#define BRW_SURFACE_WRITEDISABLE_R_SHIFT	16
+#define BRW_SURFACE_WRITEDISABLE_A_SHIFT	17
 
 #define BRW_SURFACEFORMAT_R32G32B32A32_FLOAT             0x000 
 #define BRW_SURFACEFORMAT_R32G32B32A32_SINT              0x001 
@@ -403,10 +415,14 @@
 #define BRW_SURFACEFORMAT_R16G16B16_SNORM                0x19D 
 #define BRW_SURFACEFORMAT_R16G16B16_SSCALED              0x19E 
 #define BRW_SURFACEFORMAT_R16G16B16_USCALED              0x19F
+#define BRW_SURFACE_FORMAT_SHIFT	18
+#define BRW_SURFACE_FORMAT_MASK		INTEL_MASK(26, 18)
 
 #define BRW_SURFACERETURNFORMAT_FLOAT32  0
 #define BRW_SURFACERETURNFORMAT_S1       1
 
+#define BRW_SURFACE_TYPE_SHIFT		29
+#define BRW_SURFACE_TYPE_MASK		INTEL_MASK(31, 29)
 #define BRW_SURFACE_1D      0
 #define BRW_SURFACE_2D      1
 #define BRW_SURFACE_3D      2
@@ -414,6 +430,32 @@
 #define BRW_SURFACE_BUFFER  4
 #define BRW_SURFACE_NULL    7
 
+/* Surface state DW2 */
+#define BRW_SURFACE_HEIGHT_SHIFT	19
+#define BRW_SURFACE_HEIGHT_MASK		INTEL_MASK(31, 19)
+#define BRW_SURFACE_WIDTH_SHIFT		6
+#define BRW_SURFACE_WIDTH_MASK		INTEL_MASK(18, 6)
+#define BRW_SURFACE_LOD_SHIFT		2
+#define BRW_SURFACE_LOD_MASK		INTEL_MASK(5, 2)
+
+/* Surface state DW3 */
+#define BRW_SURFACE_DEPTH_SHIFT		21
+#define BRW_SURFACE_DEPTH_MASK		INTEL_MASK(31, 21)
+#define BRW_SURFACE_PITCH_SHIFT		3
+#define BRW_SURFACE_PITCH_MASK		INTEL_MASK(19, 3)
+#define BRW_SURFACE_TILED		(1 << 1)
+#define BRW_SURFACE_TILED_Y		(1 << 0)
+
+/* Surface state DW4 */
+#define BRW_SURFACE_MIN_LOD_SHIFT	28
+#define BRW_SURFACE_MIN_LOD_MASK	INTEL_MASK(31, 28)
+
+/* Surface state DW5 */
+#define BRW_SURFACE_X_OFFSET_SHIFT	25
+#define BRW_SURFACE_X_OFFSET_MASK	INTEL_MASK(31, 25)
+#define BRW_SURFACE_Y_OFFSET_SHIFT	20
+#define BRW_SURFACE_Y_OFFSET_MASK	INTEL_MASK(23, 20)
+
 #define BRW_TEXCOORDMODE_WRAP            0
 #define BRW_TEXCOORDMODE_MIRROR          1
 #define BRW_TEXCOORDMODE_CLAMP           2
diff --git a/src/mesa/drivers/dri/i965/brw_eu_emit.c b/src/mesa/drivers/dri/i965/brw_eu_emit.c
index 2d41302d15a..e7370f36064 100644
--- a/src/mesa/drivers/dri/i965/brw_eu_emit.c
+++ b/src/mesa/drivers/dri/i965/brw_eu_emit.c
@@ -93,8 +93,6 @@ static void brw_set_dest(struct brw_compile *p,
 			 struct brw_instruction *insn,
 			 struct brw_reg dest)
 {
-   struct intel_context *intel = &p->brw->intel;
-
    if (dest.file != BRW_ARCHITECTURE_REGISTER_FILE &&
        dest.file != BRW_MESSAGE_REGISTER_FILE)
       assert(dest.nr < 128);
@@ -1254,7 +1252,6 @@ struct brw_instruction *gen6_CONT(struct brw_compile *p,
 				  struct brw_instruction *do_insn)
 {
    struct brw_instruction *insn;
-   int br = 2;
 
    insn = next_insn(p, BRW_OPCODE_CONTINUE);
    brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp
index 6d545afab64..09033aecd7c 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
@@ -19,10 +19,13 @@
  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  * IN THE SOFTWARE.
+ */
+
+/** @file brw_fs.cpp
  *
- * Authors:
- *    Eric Anholt <[email protected]>
- *
+ * This file drives the GLSL IR -> LIR translation, contains the
+ * optimizations on the LIR, and drives the generation of native code
+ * from the LIR.
  */
 
 extern "C" {
@@ -34,7 +37,6 @@ extern "C" {
 #include "main/uniforms.h"
 #include "program/prog_parameter.h"
 #include "program/prog_print.h"
-#include "program/prog_optimize.h"
 #include "program/register_allocate.h"
 #include "program/sampler.h"
 #include "program/hash_table.h"
@@ -42,113 +44,15 @@ extern "C" {
 #include "brw_eu.h"
 #include "brw_wm.h"
 }
+#include "brw_shader.h"
 #include "brw_fs.h"
 #include "../glsl/glsl_types.h"
-#include "../glsl/ir_optimization.h"
 #include "../glsl/ir_print_visitor.h"
 
 #define MAX_INSTRUCTION (1 << 30)
-static struct brw_reg brw_reg_from_fs_reg(class fs_reg *reg);
-
-struct gl_shader *
-brw_new_shader(struct gl_context *ctx, GLuint name, GLuint type)
-{
-   struct brw_shader *shader;
-
-   shader = rzalloc(NULL, struct brw_shader);
-   if (shader) {
-      shader->base.Type = type;
-      shader->base.Name = name;
-      _mesa_init_shader(ctx, &shader->base);
-   }
-
-   return &shader->base;
-}
-
-struct gl_shader_program *
-brw_new_shader_program(struct gl_context *ctx, GLuint name)
-{
-   struct brw_shader_program *prog;
-   prog = rzalloc(NULL, struct brw_shader_program);
-   if (prog) {
-      prog->base.Name = name;
-      _mesa_init_shader_program(ctx, &prog->base);
-   }
-   return &prog->base;
-}
-
-GLboolean
-brw_link_shader(struct gl_context *ctx, struct gl_shader_program *prog)
-{
-   struct brw_context *brw = brw_context(ctx);
-   struct intel_context *intel = &brw->intel;
-
-   struct brw_shader *shader =
-      (struct brw_shader *)prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
-   if (shader != NULL) {
-      void *mem_ctx = ralloc_context(NULL);
-      bool progress;
-
-      if (shader->ir)
-	 ralloc_free(shader->ir);
-      shader->ir = new(shader) exec_list;
-      clone_ir_list(mem_ctx, shader->ir, shader->base.ir);
-
-      do_mat_op_to_vec(shader->ir);
-      lower_instructions(shader->ir,
-			 MOD_TO_FRACT |
-			 DIV_TO_MUL_RCP |
-			 SUB_TO_ADD_NEG |
-			 EXP_TO_EXP2 |
-			 LOG_TO_LOG2);
-
-      /* Pre-gen6 HW can only nest if-statements 16 deep.  Beyond this,
-       * if-statements need to be flattened.
-       */
-      if (intel->gen < 6)
-	 lower_if_to_cond_assign(shader->ir, 16);
-
-      do_lower_texture_projection(shader->ir);
-      do_vec_index_to_cond_assign(shader->ir);
-      brw_do_cubemap_normalize(shader->ir);
-      lower_noise(shader->ir);
-      lower_quadop_vector(shader->ir, false);
-      lower_variable_index_to_cond_assign(shader->ir,
-					  GL_TRUE, /* input */
-					  GL_TRUE, /* output */
-					  GL_TRUE, /* temp */
-					  GL_TRUE /* uniform */
-					  );
-
-      do {
-	 progress = false;
-
-	 brw_do_channel_expressions(shader->ir);
-	 brw_do_vector_splitting(shader->ir);
-
-	 progress = do_lower_jumps(shader->ir, true, true,
-				   true, /* main return */
-				   false, /* continue */
-				   false /* loops */
-				   ) || progress;
-
-	 progress = do_common_optimization(shader->ir, true, 32) || progress;
-      } while (progress);
-
-      validate_ir_tree(shader->ir);
 
-      reparent_ir(shader->ir, shader->ir);
-      ralloc_free(mem_ctx);
-   }
-
-   if (!_mesa_ir_link_shader(ctx, prog))
-      return GL_FALSE;
-
-   return GL_TRUE;
-}
-
-static int
-type_size(const struct glsl_type *type)
+int
+fs_visitor::type_size(const struct glsl_type *type)
 {
    unsigned int size, i;
 
@@ -180,17 +84,23 @@ type_size(const struct glsl_type *type)
 void
 fs_visitor::fail(const char *format, ...)
 {
-   if (!failed) {
-      failed = true;
+   va_list va;
+   char *msg;
 
-      if (INTEL_DEBUG & DEBUG_WM) {
-	 fprintf(stderr, "FS compile failed: ");
+   if (failed)
+      return;
 
-	 va_list va;
-	 va_start(va, format);
-	 vfprintf(stderr, format, va);
-	 va_end(va);
-      }
+   failed = true;
+   /* Format once so the text can be both stored in fail_msg and printed. */
+   va_start(va, format);
+   msg = ralloc_vasprintf(mem_ctx, format, va);
+   va_end(va);
+   msg = ralloc_asprintf(mem_ctx, "FS compile failed: %s\n", msg);
+
+   this->fail_msg = msg;
+   /* msg is already-formatted data: print via "%s", never as the format. */
+   if (INTEL_DEBUG & DEBUG_WM) {
+      fprintf(stderr, "%s", msg);
    }
 }
 
@@ -297,38 +207,13 @@ fs_reg::fs_reg(enum register_file file, int hw_reg, uint32_t type)
    this->type = type;
 }
 
-int
-brw_type_for_base_type(const struct glsl_type *type)
-{
-   switch (type->base_type) {
-   case GLSL_TYPE_FLOAT:
-      return BRW_REGISTER_TYPE_F;
-   case GLSL_TYPE_INT:
-   case GLSL_TYPE_BOOL:
-      return BRW_REGISTER_TYPE_D;
-   case GLSL_TYPE_UINT:
-      return BRW_REGISTER_TYPE_UD;
-   case GLSL_TYPE_ARRAY:
-   case GLSL_TYPE_STRUCT:
-   case GLSL_TYPE_SAMPLER:
-      /* These should be overridden with the type of the member when
-       * dereferenced into.  BRW_REGISTER_TYPE_UD seems like a likely
-       * way to trip up if we don't.
-       */
-      return BRW_REGISTER_TYPE_UD;
-   default:
-      assert(!"not reached");
-      return BRW_REGISTER_TYPE_F;
-   }
-}
-
 /** Automatic reg constructor. */
 fs_reg::fs_reg(class fs_visitor *v, const struct glsl_type *type)
 {
    init();
 
    this->file = GRF;
-   this->reg = v->virtual_grf_alloc(type_size(type));
+   this->reg = v->virtual_grf_alloc(v->type_size(type));
    this->reg_offset = 0;
    this->type = brw_type_for_base_type(type);
 }
@@ -584,7 +469,7 @@ fs_visitor::emit_general_interpolation(ir_variable *ir)
 	       attr.reg_offset++;
 	    }
 
-	    if (intel->gen < 6 && !(is_gl_Color && c->key.linear_color)) {
+	    if (intel->gen < 6) {
 	       attr.reg_offset -= type->vector_elements;
 	       for (unsigned int k = 0; k < type->vector_elements; k++) {
 		  emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
@@ -706,2134 +591,6 @@ fs_visitor::emit_math(fs_opcodes opcode, fs_reg dst, fs_reg src0, fs_reg src1)
    return inst;
 }
 
-void
-fs_visitor::visit(ir_variable *ir)
-{
-   fs_reg *reg = NULL;
-
-   if (variable_storage(ir))
-      return;
-
-   if (strcmp(ir->name, "gl_FragColor") == 0) {
-      this->frag_color = ir;
-   } else if (strcmp(ir->name, "gl_FragData") == 0) {
-      this->frag_data = ir;
-   } else if (strcmp(ir->name, "gl_FragDepth") == 0) {
-      this->frag_depth = ir;
-   }
-
-   if (ir->mode == ir_var_in) {
-      if (!strcmp(ir->name, "gl_FragCoord")) {
-	 reg = emit_fragcoord_interpolation(ir);
-      } else if (!strcmp(ir->name, "gl_FrontFacing")) {
-	 reg = emit_frontfacing_interpolation(ir);
-      } else {
-	 reg = emit_general_interpolation(ir);
-      }
-      assert(reg);
-      hash_table_insert(this->variable_ht, reg, ir);
-      return;
-   }
-
-   if (ir->mode == ir_var_uniform) {
-      int param_index = c->prog_data.nr_params;
-
-      if (c->dispatch_width == 16) {
-	 if (!variable_storage(ir)) {
-	    fail("Failed to find uniform '%s' in 16-wide\n", ir->name);
-	 }
-	 return;
-      }
-
-      if (!strncmp(ir->name, "gl_", 3)) {
-	 setup_builtin_uniform_values(ir);
-      } else {
-	 setup_uniform_values(ir->location, ir->type);
-      }
-
-      reg = new(this->mem_ctx) fs_reg(UNIFORM, param_index);
-      reg->type = brw_type_for_base_type(ir->type);
-   }
-
-   if (!reg)
-      reg = new(this->mem_ctx) fs_reg(this, ir->type);
-
-   hash_table_insert(this->variable_ht, reg, ir);
-}
-
-void
-fs_visitor::visit(ir_dereference_variable *ir)
-{
-   fs_reg *reg = variable_storage(ir->var);
-   this->result = *reg;
-}
-
-void
-fs_visitor::visit(ir_dereference_record *ir)
-{
-   const glsl_type *struct_type = ir->record->type;
-
-   ir->record->accept(this);
-
-   unsigned int offset = 0;
-   for (unsigned int i = 0; i < struct_type->length; i++) {
-      if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
-	 break;
-      offset += type_size(struct_type->fields.structure[i].type);
-   }
-   this->result.reg_offset += offset;
-   this->result.type = brw_type_for_base_type(ir->type);
-}
-
-void
-fs_visitor::visit(ir_dereference_array *ir)
-{
-   ir_constant *index;
-   int element_size;
-
-   ir->array->accept(this);
-   index = ir->array_index->as_constant();
-
-   element_size = type_size(ir->type);
-   this->result.type = brw_type_for_base_type(ir->type);
-
-   if (index) {
-      assert(this->result.file == UNIFORM ||
-	     (this->result.file == GRF &&
-	      this->result.reg != 0));
-      this->result.reg_offset += index->value.i[0] * element_size;
-   } else {
-      assert(!"FINISHME: non-constant array element");
-   }
-}
-
-/* Instruction selection: Produce a MOV.sat instead of
- * MIN(MAX(val, 0), 1) when possible.
- */
-bool
-fs_visitor::try_emit_saturate(ir_expression *ir)
-{
-   ir_rvalue *sat_val = ir->as_rvalue_to_saturate();
-
-   if (!sat_val)
-      return false;
-
-   sat_val->accept(this);
-   fs_reg src = this->result;
-
-   this->result = fs_reg(this, ir->type);
-   fs_inst *inst = emit(BRW_OPCODE_MOV, this->result, src);
-   inst->saturate = true;
-
-   return true;
-}
-
-static uint32_t
-brw_conditional_for_comparison(unsigned int op)
-{
-   switch (op) {
-   case ir_binop_less:
-      return BRW_CONDITIONAL_L;
-   case ir_binop_greater:
-      return BRW_CONDITIONAL_G;
-   case ir_binop_lequal:
-      return BRW_CONDITIONAL_LE;
-   case ir_binop_gequal:
-      return BRW_CONDITIONAL_GE;
-   case ir_binop_equal:
-   case ir_binop_all_equal: /* same as equal for scalars */
-      return BRW_CONDITIONAL_Z;
-   case ir_binop_nequal:
-   case ir_binop_any_nequal: /* same as nequal for scalars */
-      return BRW_CONDITIONAL_NZ;
-   default:
-      assert(!"not reached: bad operation for comparison");
-      return BRW_CONDITIONAL_NZ;
-   }
-}
-
-void
-fs_visitor::visit(ir_expression *ir)
-{
-   unsigned int operand;
-   fs_reg op[2], temp;
-   fs_inst *inst;
-
-   assert(ir->get_num_operands() <= 2);
-
-   if (try_emit_saturate(ir))
-      return;
-
-   for (operand = 0; operand < ir->get_num_operands(); operand++) {
-      ir->operands[operand]->accept(this);
-      if (this->result.file == BAD_FILE) {
-	 ir_print_visitor v;
-	 fail("Failed to get tree for expression operand:\n");
-	 ir->operands[operand]->accept(&v);
-      }
-      op[operand] = this->result;
-
-      /* Matrix expression operands should have been broken down to vector
-       * operations already.
-       */
-      assert(!ir->operands[operand]->type->is_matrix());
-      /* And then those vector operands should have been broken down to scalar.
-       */
-      assert(!ir->operands[operand]->type->is_vector());
-   }
-
-   /* Storage for our result.  If our result goes into an assignment, it will
-    * just get copy-propagated out, so no worries.
-    */
-   this->result = fs_reg(this, ir->type);
-
-   switch (ir->operation) {
-   case ir_unop_logic_not:
-      /* Note that BRW_OPCODE_NOT is not appropriate here, since it is
-       * ones complement of the whole register, not just bit 0.
-       */
-      emit(BRW_OPCODE_XOR, this->result, op[0], fs_reg(1));
-      break;
-   case ir_unop_neg:
-      op[0].negate = !op[0].negate;
-      this->result = op[0];
-      break;
-   case ir_unop_abs:
-      op[0].abs = true;
-      op[0].negate = false;
-      this->result = op[0];
-      break;
-   case ir_unop_sign:
-      temp = fs_reg(this, ir->type);
-
-      emit(BRW_OPCODE_MOV, this->result, fs_reg(0.0f));
-
-      inst = emit(BRW_OPCODE_CMP, reg_null_f, op[0], fs_reg(0.0f));
-      inst->conditional_mod = BRW_CONDITIONAL_G;
-      inst = emit(BRW_OPCODE_MOV, this->result, fs_reg(1.0f));
-      inst->predicated = true;
-
-      inst = emit(BRW_OPCODE_CMP, reg_null_f, op[0], fs_reg(0.0f));
-      inst->conditional_mod = BRW_CONDITIONAL_L;
-      inst = emit(BRW_OPCODE_MOV, this->result, fs_reg(-1.0f));
-      inst->predicated = true;
-
-      break;
-   case ir_unop_rcp:
-      emit_math(FS_OPCODE_RCP, this->result, op[0]);
-      break;
-
-   case ir_unop_exp2:
-      emit_math(FS_OPCODE_EXP2, this->result, op[0]);
-      break;
-   case ir_unop_log2:
-      emit_math(FS_OPCODE_LOG2, this->result, op[0]);
-      break;
-   case ir_unop_exp:
-   case ir_unop_log:
-      assert(!"not reached: should be handled by ir_explog_to_explog2");
-      break;
-   case ir_unop_sin:
-   case ir_unop_sin_reduced:
-      emit_math(FS_OPCODE_SIN, this->result, op[0]);
-      break;
-   case ir_unop_cos:
-   case ir_unop_cos_reduced:
-      emit_math(FS_OPCODE_COS, this->result, op[0]);
-      break;
-
-   case ir_unop_dFdx:
-      emit(FS_OPCODE_DDX, this->result, op[0]);
-      break;
-   case ir_unop_dFdy:
-      emit(FS_OPCODE_DDY, this->result, op[0]);
-      break;
-
-   case ir_binop_add:
-      emit(BRW_OPCODE_ADD, this->result, op[0], op[1]);
-      break;
-   case ir_binop_sub:
-      assert(!"not reached: should be handled by ir_sub_to_add_neg");
-      break;
-
-   case ir_binop_mul:
-      emit(BRW_OPCODE_MUL, this->result, op[0], op[1]);
-      break;
-   case ir_binop_div:
-      assert(!"not reached: should be handled by ir_div_to_mul_rcp");
-      break;
-   case ir_binop_mod:
-      assert(!"ir_binop_mod should have been converted to b * fract(a/b)");
-      break;
-
-   case ir_binop_less:
-   case ir_binop_greater:
-   case ir_binop_lequal:
-   case ir_binop_gequal:
-   case ir_binop_equal:
-   case ir_binop_all_equal:
-   case ir_binop_nequal:
-   case ir_binop_any_nequal:
-      temp = this->result;
-      /* original gen4 does implicit conversion before comparison. */
-      if (intel->gen < 5)
-	 temp.type = op[0].type;
-
-      inst = emit(BRW_OPCODE_CMP, temp, op[0], op[1]);
-      inst->conditional_mod = brw_conditional_for_comparison(ir->operation);
-      emit(BRW_OPCODE_AND, this->result, this->result, fs_reg(0x1));
-      break;
-
-   case ir_binop_logic_xor:
-      emit(BRW_OPCODE_XOR, this->result, op[0], op[1]);
-      break;
-
-   case ir_binop_logic_or:
-      emit(BRW_OPCODE_OR, this->result, op[0], op[1]);
-      break;
-
-   case ir_binop_logic_and:
-      emit(BRW_OPCODE_AND, this->result, op[0], op[1]);
-      break;
-
-   case ir_binop_dot:
-   case ir_unop_any:
-      assert(!"not reached: should be handled by brw_fs_channel_expressions");
-      break;
-
-   case ir_unop_noise:
-      assert(!"not reached: should be handled by lower_noise");
-      break;
-
-   case ir_quadop_vector:
-      assert(!"not reached: should be handled by lower_quadop_vector");
-      break;
-
-   case ir_unop_sqrt:
-      emit_math(FS_OPCODE_SQRT, this->result, op[0]);
-      break;
-
-   case ir_unop_rsq:
-      emit_math(FS_OPCODE_RSQ, this->result, op[0]);
-      break;
-
-   case ir_unop_i2f:
-   case ir_unop_b2f:
-   case ir_unop_b2i:
-   case ir_unop_f2i:
-      emit(BRW_OPCODE_MOV, this->result, op[0]);
-      break;
-   case ir_unop_f2b:
-   case ir_unop_i2b:
-      temp = this->result;
-      /* original gen4 does implicit conversion before comparison. */
-      if (intel->gen < 5)
-	 temp.type = op[0].type;
-
-      inst = emit(BRW_OPCODE_CMP, temp, op[0], fs_reg(0.0f));
-      inst->conditional_mod = BRW_CONDITIONAL_NZ;
-      inst = emit(BRW_OPCODE_AND, this->result, this->result, fs_reg(1));
-      break;
-
-   case ir_unop_trunc:
-      emit(BRW_OPCODE_RNDZ, this->result, op[0]);
-      break;
-   case ir_unop_ceil:
-      op[0].negate = !op[0].negate;
-      inst = emit(BRW_OPCODE_RNDD, this->result, op[0]);
-      this->result.negate = true;
-      break;
-   case ir_unop_floor:
-      inst = emit(BRW_OPCODE_RNDD, this->result, op[0]);
-      break;
-   case ir_unop_fract:
-      inst = emit(BRW_OPCODE_FRC, this->result, op[0]);
-      break;
-   case ir_unop_round_even:
-      emit(BRW_OPCODE_RNDE, this->result, op[0]);
-      break;
-
-   case ir_binop_min:
-      inst = emit(BRW_OPCODE_CMP, this->result, op[0], op[1]);
-      inst->conditional_mod = BRW_CONDITIONAL_L;
-
-      inst = emit(BRW_OPCODE_SEL, this->result, op[0], op[1]);
-      inst->predicated = true;
-      break;
-   case ir_binop_max:
-      inst = emit(BRW_OPCODE_CMP, this->result, op[0], op[1]);
-      inst->conditional_mod = BRW_CONDITIONAL_G;
-
-      inst = emit(BRW_OPCODE_SEL, this->result, op[0], op[1]);
-      inst->predicated = true;
-      break;
-
-   case ir_binop_pow:
-      emit_math(FS_OPCODE_POW, this->result, op[0], op[1]);
-      break;
-
-   case ir_unop_bit_not:
-      inst = emit(BRW_OPCODE_NOT, this->result, op[0]);
-      break;
-   case ir_binop_bit_and:
-      inst = emit(BRW_OPCODE_AND, this->result, op[0], op[1]);
-      break;
-   case ir_binop_bit_xor:
-      inst = emit(BRW_OPCODE_XOR, this->result, op[0], op[1]);
-      break;
-   case ir_binop_bit_or:
-      inst = emit(BRW_OPCODE_OR, this->result, op[0], op[1]);
-      break;
-
-   case ir_unop_u2f:
-   case ir_binop_lshift:
-   case ir_binop_rshift:
-      assert(!"GLSL 1.30 features unsupported");
-      break;
-   }
-}
-
-void
-fs_visitor::emit_assignment_writes(fs_reg &l, fs_reg &r,
-				   const glsl_type *type, bool predicated)
-{
-   switch (type->base_type) {
-   case GLSL_TYPE_FLOAT:
-   case GLSL_TYPE_UINT:
-   case GLSL_TYPE_INT:
-   case GLSL_TYPE_BOOL:
-      for (unsigned int i = 0; i < type->components(); i++) {
-	 l.type = brw_type_for_base_type(type);
-	 r.type = brw_type_for_base_type(type);
-
-	 fs_inst *inst = emit(BRW_OPCODE_MOV, l, r);
-	 inst->predicated = predicated;
-
-	 l.reg_offset++;
-	 r.reg_offset++;
-      }
-      break;
-   case GLSL_TYPE_ARRAY:
-      for (unsigned int i = 0; i < type->length; i++) {
-	 emit_assignment_writes(l, r, type->fields.array, predicated);
-      }
-      break;
-
-   case GLSL_TYPE_STRUCT:
-      for (unsigned int i = 0; i < type->length; i++) {
-	 emit_assignment_writes(l, r, type->fields.structure[i].type,
-				predicated);
-      }
-      break;
-
-   case GLSL_TYPE_SAMPLER:
-      break;
-
-   default:
-      assert(!"not reached");
-      break;
-   }
-}
-
-void
-fs_visitor::visit(ir_assignment *ir)
-{
-   struct fs_reg l, r;
-   fs_inst *inst;
-
-   /* FINISHME: arrays on the lhs */
-   ir->lhs->accept(this);
-   l = this->result;
-
-   ir->rhs->accept(this);
-   r = this->result;
-
-   assert(l.file != BAD_FILE);
-   assert(r.file != BAD_FILE);
-
-   if (ir->condition) {
-      emit_bool_to_cond_code(ir->condition);
-   }
-
-   if (ir->lhs->type->is_scalar() ||
-       ir->lhs->type->is_vector()) {
-      for (int i = 0; i < ir->lhs->type->vector_elements; i++) {
-	 if (ir->write_mask & (1 << i)) {
-	    inst = emit(BRW_OPCODE_MOV, l, r);
-	    if (ir->condition)
-	       inst->predicated = true;
-	    r.reg_offset++;
-	 }
-	 l.reg_offset++;
-      }
-   } else {
-      emit_assignment_writes(l, r, ir->lhs->type, ir->condition != NULL);
-   }
-}
-
-fs_inst *
-fs_visitor::emit_texture_gen4(ir_texture *ir, fs_reg dst, fs_reg coordinate,
-			      int sampler)
-{
-   int mlen;
-   int base_mrf = 1;
-   bool simd16 = false;
-   fs_reg orig_dst;
-
-   /* g0 header. */
-   mlen = 1;
-
-   if (ir->shadow_comparitor) {
-      for (int i = 0; i < ir->coordinate->type->vector_elements; i++) {
-	 fs_inst *inst = emit(BRW_OPCODE_MOV,
-			      fs_reg(MRF, base_mrf + mlen + i), coordinate);
-	 if (i < 3 && c->key.gl_clamp_mask[i] & (1 << sampler))
-	    inst->saturate = true;
-
-	 coordinate.reg_offset++;
-      }
-      /* gen4's SIMD8 sampler always has the slots for u,v,r present. */
-      mlen += 3;
-
-      if (ir->op == ir_tex) {
-	 /* There's no plain shadow compare message, so we use shadow
-	  * compare with a bias of 0.0.
-	  */
-	 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), fs_reg(0.0f));
-	 mlen++;
-      } else if (ir->op == ir_txb) {
-	 ir->lod_info.bias->accept(this);
-	 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), this->result);
-	 mlen++;
-      } else {
-	 assert(ir->op == ir_txl);
-	 ir->lod_info.lod->accept(this);
-	 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), this->result);
-	 mlen++;
-      }
-
-      ir->shadow_comparitor->accept(this);
-      emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), this->result);
-      mlen++;
-   } else if (ir->op == ir_tex) {
-      for (int i = 0; i < ir->coordinate->type->vector_elements; i++) {
-	 fs_inst *inst = emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen + i),
-			      coordinate);
-	 if (i < 3 && c->key.gl_clamp_mask[i] & (1 << sampler))
-	    inst->saturate = true;
-	 coordinate.reg_offset++;
-      }
-      /* gen4's SIMD8 sampler always has the slots for u,v,r present. */
-      mlen += 3;
-   } else if (ir->op == ir_txd) {
-      assert(!"TXD isn't supported on gen4 yet.");
-   } else {
-      /* Oh joy.  gen4 doesn't have SIMD8 non-shadow-compare bias/lod
-       * instructions.  We'll need to do SIMD16 here.
-       */
-      assert(ir->op == ir_txb || ir->op == ir_txl);
-
-      for (int i = 0; i < ir->coordinate->type->vector_elements; i++) {
-	 fs_inst *inst = emit(BRW_OPCODE_MOV, fs_reg(MRF,
-						     base_mrf + mlen + i * 2),
-			      coordinate);
-	 if (i < 3 && c->key.gl_clamp_mask[i] & (1 << sampler))
-	    inst->saturate = true;
-	 coordinate.reg_offset++;
-      }
-
-      /* lod/bias appears after u/v/r. */
-      mlen += 6;
-
-      if (ir->op == ir_txb) {
-	 ir->lod_info.bias->accept(this);
-	 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), this->result);
-	 mlen++;
-      } else {
-	 ir->lod_info.lod->accept(this);
-	 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), this->result);
-	 mlen++;
-      }
-
-      /* The unused upper half. */
-      mlen++;
-
-      /* Now, since we're doing simd16, the return is 2 interleaved
-       * vec4s where the odd-indexed ones are junk. We'll need to move
-       * this weirdness around to the expected layout.
-       */
-      simd16 = true;
-      orig_dst = dst;
-      dst = fs_reg(this, glsl_type::get_array_instance(glsl_type::vec4_type,
-						       2));
-      dst.type = BRW_REGISTER_TYPE_F;
-   }
-
-   fs_inst *inst = NULL;
-   switch (ir->op) {
-   case ir_tex:
-      inst = emit(FS_OPCODE_TEX, dst);
-      break;
-   case ir_txb:
-      inst = emit(FS_OPCODE_TXB, dst);
-      break;
-   case ir_txl:
-      inst = emit(FS_OPCODE_TXL, dst);
-      break;
-   case ir_txd:
-      inst = emit(FS_OPCODE_TXD, dst);
-      break;
-   case ir_txf:
-      assert(!"GLSL 1.30 features unsupported");
-      break;
-   }
-   inst->base_mrf = base_mrf;
-   inst->mlen = mlen;
-   inst->header_present = true;
-
-   if (simd16) {
-      for (int i = 0; i < 4; i++) {
-	 emit(BRW_OPCODE_MOV, orig_dst, dst);
-	 orig_dst.reg_offset++;
-	 dst.reg_offset += 2;
-      }
-   }
-
-   return inst;
-}
-
-/* gen5's sampler has slots for u, v, r, array index, then optional
- * parameters like shadow comparitor or LOD bias.  If optional
- * parameters aren't present, those base slots are optional and don't
- * need to be included in the message.
- *
- * We don't fill in the unnecessary slots regardless, which may look
- * surprising in the disassembly.
- */
-fs_inst *
-fs_visitor::emit_texture_gen5(ir_texture *ir, fs_reg dst, fs_reg coordinate,
-			      int sampler)
-{
-   int mlen = 0;
-   int base_mrf = 2;
-   int reg_width = c->dispatch_width / 8;
-   bool header_present = false;
-
-   if (ir->offset) {
-      /* The offsets set up by the ir_texture visitor are in the
-       * m1 header, so we can't go headerless.
-       */
-      header_present = true;
-      mlen++;
-      base_mrf--;
-   }
-
-   for (int i = 0; i < ir->coordinate->type->vector_elements; i++) {
-      fs_inst *inst = emit(BRW_OPCODE_MOV,
-			   fs_reg(MRF, base_mrf + mlen + i * reg_width),
-			   coordinate);
-      if (i < 3 && c->key.gl_clamp_mask[i] & (1 << sampler))
-	 inst->saturate = true;
-      coordinate.reg_offset++;
-   }
-   mlen += ir->coordinate->type->vector_elements * reg_width;
-
-   if (ir->shadow_comparitor) {
-      mlen = MAX2(mlen, header_present + 4 * reg_width);
-
-      ir->shadow_comparitor->accept(this);
-      emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), this->result);
-      mlen += reg_width;
-   }
-
-   fs_inst *inst = NULL;
-   switch (ir->op) {
-   case ir_tex:
-      inst = emit(FS_OPCODE_TEX, dst);
-      break;
-   case ir_txb:
-      ir->lod_info.bias->accept(this);
-      mlen = MAX2(mlen, header_present + 4 * reg_width);
-      emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), this->result);
-      mlen += reg_width;
-
-      inst = emit(FS_OPCODE_TXB, dst);
-
-      break;
-   case ir_txl:
-      ir->lod_info.lod->accept(this);
-      mlen = MAX2(mlen, header_present + 4 * reg_width);
-      emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), this->result);
-      mlen += reg_width;
-
-      inst = emit(FS_OPCODE_TXL, dst);
-      break;
-   case ir_txd:
-   case ir_txf:
-      assert(!"GLSL 1.30 features unsupported");
-      break;
-   }
-   inst->base_mrf = base_mrf;
-   inst->mlen = mlen;
-   inst->header_present = header_present;
-
-   if (mlen > 11) {
-      fail("Message length >11 disallowed by hardware\n");
-   }
-
-   return inst;
-}
-
-fs_inst *
-fs_visitor::emit_texture_gen7(ir_texture *ir, fs_reg dst, fs_reg coordinate,
-			      int sampler)
-{
-   int mlen = 0;
-   int base_mrf = 2;
-   int reg_width = c->dispatch_width / 8;
-   bool header_present = false;
-
-   if (ir->offset) {
-      /* The offsets set up by the ir_texture visitor are in the
-       * m1 header, so we can't go headerless.
-       */
-      header_present = true;
-      mlen++;
-      base_mrf--;
-   }
-
-   if (ir->shadow_comparitor) {
-      ir->shadow_comparitor->accept(this);
-      emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), this->result);
-      mlen += reg_width;
-   }
-
-   /* Set up the LOD info */
-   switch (ir->op) {
-   case ir_tex:
-      break;
-   case ir_txb:
-      ir->lod_info.bias->accept(this);
-      emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), this->result);
-      mlen += reg_width;
-      break;
-   case ir_txl:
-      ir->lod_info.lod->accept(this);
-      emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), this->result);
-      mlen += reg_width;
-      break;
-   case ir_txd:
-   case ir_txf:
-      assert(!"GLSL 1.30 features unsupported");
-      break;
-   }
-
-   /* Set up the coordinate */
-   for (int i = 0; i < ir->coordinate->type->vector_elements; i++) {
-      fs_inst *inst = emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen),
-			   coordinate);
-      if (i < 3 && c->key.gl_clamp_mask[i] & (1 << sampler))
-	 inst->saturate = true;
-      coordinate.reg_offset++;
-      mlen += reg_width;
-   }
-
-   /* Generate the SEND */
-   fs_inst *inst = NULL;
-   switch (ir->op) {
-   case ir_tex: inst = emit(FS_OPCODE_TEX, dst); break;
-   case ir_txb: inst = emit(FS_OPCODE_TXB, dst); break;
-   case ir_txl: inst = emit(FS_OPCODE_TXL, dst); break;
-   case ir_txd: inst = emit(FS_OPCODE_TXD, dst); break;
-   case ir_txf: assert(!"TXF unsupported.");
-   }
-   inst->base_mrf = base_mrf;
-   inst->mlen = mlen;
-   inst->header_present = header_present;
-
-   if (mlen > 11) {
-      fail("Message length >11 disallowed by hardware\n");
-   }
-
-   return inst;
-}
-
-void
-fs_visitor::visit(ir_texture *ir)
-{
-   int sampler;
-   fs_inst *inst = NULL;
-
-   ir->coordinate->accept(this);
-   fs_reg coordinate = this->result;
-
-   if (ir->offset != NULL) {
-      ir_constant *offset = ir->offset->as_constant();
-      assert(offset != NULL);
-
-      signed char offsets[3];
-      for (unsigned i = 0; i < ir->offset->type->vector_elements; i++)
-	 offsets[i] = (signed char) offset->value.i[i];
-
-      /* Combine all three offsets into a single unsigned dword:
-       *
-       *    bits 11:8 - U Offset (X component)
-       *    bits  7:4 - V Offset (Y component)
-       *    bits  3:0 - R Offset (Z component)
-       */
-      unsigned offset_bits = 0;
-      for (unsigned i = 0; i < ir->offset->type->vector_elements; i++) {
-	 const unsigned shift = 4 * (2 - i);
-	 offset_bits |= (offsets[i] << shift) & (0xF << shift);
-      }
-
-      /* Explicitly set up the message header by copying g0 to msg reg m1. */
-      emit(BRW_OPCODE_MOV, fs_reg(MRF, 1, BRW_REGISTER_TYPE_UD),
-	   fs_reg(GRF, 0, BRW_REGISTER_TYPE_UD));
-
-      /* Then set the offset bits in DWord 2 of the message header. */
-      emit(BRW_OPCODE_MOV,
-	   fs_reg(retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE, 1, 2),
-			 BRW_REGISTER_TYPE_UD)),
-	   fs_reg(brw_imm_uw(offset_bits)));
-   }
-
-   /* Should be lowered by do_lower_texture_projection */
-   assert(!ir->projector);
-
-   sampler = _mesa_get_sampler_uniform_value(ir->sampler,
-					     ctx->Shader.CurrentFragmentProgram,
-					     &brw->fragment_program->Base);
-   sampler = c->fp->program.Base.SamplerUnits[sampler];
-
-   /* The 965 requires the EU to do the normalization of GL rectangle
-    * texture coordinates.  We use the program parameter state
-    * tracking to get the scaling factor.
-    */
-   if (ir->sampler->type->sampler_dimensionality == GLSL_SAMPLER_DIM_RECT) {
-      struct gl_program_parameter_list *params = c->fp->program.Base.Parameters;
-      int tokens[STATE_LENGTH] = {
-	 STATE_INTERNAL,
-	 STATE_TEXRECT_SCALE,
-	 sampler,
-	 0,
-	 0
-      };
-
-      if (c->dispatch_width == 16) {
-	 fail("rectangle scale uniform setup not supported on 16-wide\n");
-	 this->result = fs_reg(this, ir->type);
-	 return;
-      }
-
-      c->prog_data.param_convert[c->prog_data.nr_params] =
-	 PARAM_NO_CONVERT;
-      c->prog_data.param_convert[c->prog_data.nr_params + 1] =
-	 PARAM_NO_CONVERT;
-
-      fs_reg scale_x = fs_reg(UNIFORM, c->prog_data.nr_params);
-      fs_reg scale_y = fs_reg(UNIFORM, c->prog_data.nr_params + 1);
-      GLuint index = _mesa_add_state_reference(params,
-					       (gl_state_index *)tokens);
-
-      this->param_index[c->prog_data.nr_params] = index;
-      this->param_offset[c->prog_data.nr_params] = 0;
-      c->prog_data.nr_params++;
-      this->param_index[c->prog_data.nr_params] = index;
-      this->param_offset[c->prog_data.nr_params] = 1;
-      c->prog_data.nr_params++;
-
-      fs_reg dst = fs_reg(this, ir->coordinate->type);
-      fs_reg src = coordinate;
-      coordinate = dst;
-
-      emit(BRW_OPCODE_MUL, dst, src, scale_x);
-      dst.reg_offset++;
-      src.reg_offset++;
-      emit(BRW_OPCODE_MUL, dst, src, scale_y);
-   }
-
-   /* Writemasking doesn't eliminate channels on SIMD8 texture
-    * samples, so don't worry about them.
-    */
-   fs_reg dst = fs_reg(this, glsl_type::vec4_type);
-
-   if (intel->gen >= 7) {
-      inst = emit_texture_gen7(ir, dst, coordinate, sampler);
-   } else if (intel->gen >= 5) {
-      inst = emit_texture_gen5(ir, dst, coordinate, sampler);
-   } else {
-      inst = emit_texture_gen4(ir, dst, coordinate, sampler);
-   }
-
-   /* If there's an offset, we already set up m1.  To avoid the implied move,
-    * use the null register.  Otherwise, we want an implied move from g0.
-    */
-   if (ir->offset != NULL || !inst->header_present)
-      inst->src[0] = fs_reg(brw_null_reg());
-   else
-      inst->src[0] = fs_reg(retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW));
-
-   inst->sampler = sampler;
-
-   this->result = dst;
-
-   if (ir->shadow_comparitor)
-      inst->shadow_compare = true;
-
-   if (ir->type == glsl_type::float_type) {
-      /* Ignore DEPTH_TEXTURE_MODE swizzling. */
-      assert(ir->sampler->type->sampler_shadow);
-   } else if (c->key.tex_swizzles[inst->sampler] != SWIZZLE_NOOP) {
-      fs_reg swizzle_dst = fs_reg(this, glsl_type::vec4_type);
-
-      for (int i = 0; i < 4; i++) {
-	 int swiz = GET_SWZ(c->key.tex_swizzles[inst->sampler], i);
-	 fs_reg l = swizzle_dst;
-	 l.reg_offset += i;
-
-	 if (swiz == SWIZZLE_ZERO) {
-	    emit(BRW_OPCODE_MOV, l, fs_reg(0.0f));
-	 } else if (swiz == SWIZZLE_ONE) {
-	    emit(BRW_OPCODE_MOV, l, fs_reg(1.0f));
-	 } else {
-	    fs_reg r = dst;
-	    r.reg_offset += GET_SWZ(c->key.tex_swizzles[inst->sampler], i);
-	    emit(BRW_OPCODE_MOV, l, r);
-	 }
-      }
-      this->result = swizzle_dst;
-   }
-}
-
-void
-fs_visitor::visit(ir_swizzle *ir)
-{
-   ir->val->accept(this);
-   fs_reg val = this->result;
-
-   if (ir->type->vector_elements == 1) {
-      this->result.reg_offset += ir->mask.x;
-      return;
-   }
-
-   fs_reg result = fs_reg(this, ir->type);
-   this->result = result;
-
-   for (unsigned int i = 0; i < ir->type->vector_elements; i++) {
-      fs_reg channel = val;
-      int swiz = 0;
-
-      switch (i) {
-      case 0:
-	 swiz = ir->mask.x;
-	 break;
-      case 1:
-	 swiz = ir->mask.y;
-	 break;
-      case 2:
-	 swiz = ir->mask.z;
-	 break;
-      case 3:
-	 swiz = ir->mask.w;
-	 break;
-      }
-
-      channel.reg_offset += swiz;
-      emit(BRW_OPCODE_MOV, result, channel);
-      result.reg_offset++;
-   }
-}
-
-void
-fs_visitor::visit(ir_discard *ir)
-{
-   assert(ir->condition == NULL); /* FINISHME */
-
-   emit(FS_OPCODE_DISCARD);
-   kill_emitted = true;
-}
-
-void
-fs_visitor::visit(ir_constant *ir)
-{
-   /* Set this->result to reg at the bottom of the function because some code
-    * paths will cause this visitor to be applied to other fields.  This will
-    * cause the value stored in this->result to be modified.
-    *
-    * Make reg constant so that it doesn't get accidentally modified along the
-    * way.  Yes, I actually had this problem. :(
-    */
-   const fs_reg reg(this, ir->type);
-   fs_reg dst_reg = reg;
-
-   if (ir->type->is_array()) {
-      const unsigned size = type_size(ir->type->fields.array);
-
-      for (unsigned i = 0; i < ir->type->length; i++) {
-	 ir->array_elements[i]->accept(this);
-	 fs_reg src_reg = this->result;
-
-	 dst_reg.type = src_reg.type;
-	 for (unsigned j = 0; j < size; j++) {
-	    emit(BRW_OPCODE_MOV, dst_reg, src_reg);
-	    src_reg.reg_offset++;
-	    dst_reg.reg_offset++;
-	 }
-      }
-   } else if (ir->type->is_record()) {
-      foreach_list(node, &ir->components) {
-	 ir_instruction *const field = (ir_instruction *) node;
-	 const unsigned size = type_size(field->type);
-
-	 field->accept(this);
-	 fs_reg src_reg = this->result;
-
-	 dst_reg.type = src_reg.type;
-	 for (unsigned j = 0; j < size; j++) {
-	    emit(BRW_OPCODE_MOV, dst_reg, src_reg);
-	    src_reg.reg_offset++;
-	    dst_reg.reg_offset++;
-	 }
-      }
-   } else {
-      const unsigned size = type_size(ir->type);
-
-      for (unsigned i = 0; i < size; i++) {
-	 switch (ir->type->base_type) {
-	 case GLSL_TYPE_FLOAT:
-	    emit(BRW_OPCODE_MOV, dst_reg, fs_reg(ir->value.f[i]));
-	    break;
-	 case GLSL_TYPE_UINT:
-	    emit(BRW_OPCODE_MOV, dst_reg, fs_reg(ir->value.u[i]));
-	    break;
-	 case GLSL_TYPE_INT:
-	    emit(BRW_OPCODE_MOV, dst_reg, fs_reg(ir->value.i[i]));
-	    break;
-	 case GLSL_TYPE_BOOL:
-	    emit(BRW_OPCODE_MOV, dst_reg, fs_reg((int)ir->value.b[i]));
-	    break;
-	 default:
-	    assert(!"Non-float/uint/int/bool constant");
-	 }
-	 dst_reg.reg_offset++;
-      }
-   }
-
-   this->result = reg;
-}
-
-void
-fs_visitor::emit_bool_to_cond_code(ir_rvalue *ir)
-{
-   ir_expression *expr = ir->as_expression();
-
-   if (expr) {
-      fs_reg op[2];
-      fs_inst *inst;
-
-      assert(expr->get_num_operands() <= 2);
-      for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
-	 assert(expr->operands[i]->type->is_scalar());
-
-	 expr->operands[i]->accept(this);
-	 op[i] = this->result;
-      }
-
-      switch (expr->operation) {
-      case ir_unop_logic_not:
-	 inst = emit(BRW_OPCODE_AND, reg_null_d, op[0], fs_reg(1));
-	 inst->conditional_mod = BRW_CONDITIONAL_Z;
-	 break;
-
-      case ir_binop_logic_xor:
-	 inst = emit(BRW_OPCODE_XOR, reg_null_d, op[0], op[1]);
-	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
-	 break;
-
-      case ir_binop_logic_or:
-	 inst = emit(BRW_OPCODE_OR, reg_null_d, op[0], op[1]);
-	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
-	 break;
-
-      case ir_binop_logic_and:
-	 inst = emit(BRW_OPCODE_AND, reg_null_d, op[0], op[1]);
-	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
-	 break;
-
-      case ir_unop_f2b:
-	 if (intel->gen >= 6) {
-	    inst = emit(BRW_OPCODE_CMP, reg_null_d, op[0], fs_reg(0.0f));
-	 } else {
-	    inst = emit(BRW_OPCODE_MOV, reg_null_f, op[0]);
-	 }
-	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
-	 break;
-
-      case ir_unop_i2b:
-	 if (intel->gen >= 6) {
-	    inst = emit(BRW_OPCODE_CMP, reg_null_d, op[0], fs_reg(0));
-	 } else {
-	    inst = emit(BRW_OPCODE_MOV, reg_null_d, op[0]);
-	 }
-	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
-	 break;
-
-      case ir_binop_greater:
-      case ir_binop_gequal:
-      case ir_binop_less:
-      case ir_binop_lequal:
-      case ir_binop_equal:
-      case ir_binop_all_equal:
-      case ir_binop_nequal:
-      case ir_binop_any_nequal:
-	 inst = emit(BRW_OPCODE_CMP, reg_null_cmp, op[0], op[1]);
-	 inst->conditional_mod =
-	    brw_conditional_for_comparison(expr->operation);
-	 break;
-
-      default:
-	 assert(!"not reached");
-	 fail("bad cond code\n");
-	 break;
-      }
-      return;
-   }
-
-   ir->accept(this);
-
-   if (intel->gen >= 6) {
-      fs_inst *inst = emit(BRW_OPCODE_AND, reg_null_d, this->result, fs_reg(1));
-      inst->conditional_mod = BRW_CONDITIONAL_NZ;
-   } else {
-      fs_inst *inst = emit(BRW_OPCODE_MOV, reg_null_d, this->result);
-      inst->conditional_mod = BRW_CONDITIONAL_NZ;
-   }
-}
-
-/**
- * Emit a gen6 IF statement with the comparison folded into the IF
- * instruction.
- */
-void
-fs_visitor::emit_if_gen6(ir_if *ir)
-{
-   ir_expression *expr = ir->condition->as_expression();
-
-   if (expr) {
-      fs_reg op[2];
-      fs_inst *inst;
-      fs_reg temp;
-
-      assert(expr->get_num_operands() <= 2);
-      for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
-	 assert(expr->operands[i]->type->is_scalar());
-
-	 expr->operands[i]->accept(this);
-	 op[i] = this->result;
-      }
-
-      switch (expr->operation) {
-      case ir_unop_logic_not:
-	 inst = emit(BRW_OPCODE_IF, temp, op[0], fs_reg(0));
-	 inst->conditional_mod = BRW_CONDITIONAL_Z;
-	 return;
-
-      case ir_binop_logic_xor:
-	 inst = emit(BRW_OPCODE_IF, reg_null_d, op[0], op[1]);
-	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
-	 return;
-
-      case ir_binop_logic_or:
-	 temp = fs_reg(this, glsl_type::bool_type);
-	 emit(BRW_OPCODE_OR, temp, op[0], op[1]);
-	 inst = emit(BRW_OPCODE_IF, reg_null_d, temp, fs_reg(0));
-	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
-	 return;
-
-      case ir_binop_logic_and:
-	 temp = fs_reg(this, glsl_type::bool_type);
-	 emit(BRW_OPCODE_AND, temp, op[0], op[1]);
-	 inst = emit(BRW_OPCODE_IF, reg_null_d, temp, fs_reg(0));
-	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
-	 return;
-
-      case ir_unop_f2b:
-	 inst = emit(BRW_OPCODE_IF, reg_null_f, op[0], fs_reg(0));
-	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
-	 return;
-
-      case ir_unop_i2b:
-	 inst = emit(BRW_OPCODE_IF, reg_null_d, op[0], fs_reg(0));
-	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
-	 return;
-
-      case ir_binop_greater:
-      case ir_binop_gequal:
-      case ir_binop_less:
-      case ir_binop_lequal:
-      case ir_binop_equal:
-      case ir_binop_all_equal:
-      case ir_binop_nequal:
-      case ir_binop_any_nequal:
-	 inst = emit(BRW_OPCODE_IF, reg_null_d, op[0], op[1]);
-	 inst->conditional_mod =
-	    brw_conditional_for_comparison(expr->operation);
-	 return;
-      default:
-	 assert(!"not reached");
-	 inst = emit(BRW_OPCODE_IF, reg_null_d, op[0], fs_reg(0));
-	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
-	 fail("bad condition\n");
-	 return;
-      }
-      return;
-   }
-
-   ir->condition->accept(this);
-
-   fs_inst *inst = emit(BRW_OPCODE_IF, reg_null_d, this->result, fs_reg(0));
-   inst->conditional_mod = BRW_CONDITIONAL_NZ;
-}
-
-void
-fs_visitor::visit(ir_if *ir)
-{
-   fs_inst *inst;
-
-   if (intel->gen != 6 && c->dispatch_width == 16) {
-      fail("Can't support (non-uniform) control flow on 16-wide\n");
-   }
-
-   /* Don't point the annotation at the if statement, because then it plus
-    * the then and else blocks get printed.
-    */
-   this->base_ir = ir->condition;
-
-   if (intel->gen == 6) {
-      emit_if_gen6(ir);
-   } else {
-      emit_bool_to_cond_code(ir->condition);
-
-      inst = emit(BRW_OPCODE_IF);
-      inst->predicated = true;
-   }
-
-   foreach_iter(exec_list_iterator, iter, ir->then_instructions) {
-      ir_instruction *ir = (ir_instruction *)iter.get();
-      this->base_ir = ir;
-
-      ir->accept(this);
-   }
-
-   if (!ir->else_instructions.is_empty()) {
-      emit(BRW_OPCODE_ELSE);
-
-      foreach_iter(exec_list_iterator, iter, ir->else_instructions) {
-	 ir_instruction *ir = (ir_instruction *)iter.get();
-	 this->base_ir = ir;
-
-	 ir->accept(this);
-      }
-   }
-
-   emit(BRW_OPCODE_ENDIF);
-}
-
-void
-fs_visitor::visit(ir_loop *ir)
-{
-   fs_reg counter = reg_undef;
-
-   if (c->dispatch_width == 16) {
-      fail("Can't support (non-uniform) control flow on 16-wide\n");
-   }
-
-   if (ir->counter) {
-      this->base_ir = ir->counter;
-      ir->counter->accept(this);
-      counter = *(variable_storage(ir->counter));
-
-      if (ir->from) {
-	 this->base_ir = ir->from;
-	 ir->from->accept(this);
-
-	 emit(BRW_OPCODE_MOV, counter, this->result);
-      }
-   }
-
-   emit(BRW_OPCODE_DO);
-
-   if (ir->to) {
-      this->base_ir = ir->to;
-      ir->to->accept(this);
-
-      fs_inst *inst = emit(BRW_OPCODE_CMP, reg_null_cmp, counter, this->result);
-      inst->conditional_mod = brw_conditional_for_comparison(ir->cmp);
-
-      inst = emit(BRW_OPCODE_BREAK);
-      inst->predicated = true;
-   }
-
-   foreach_iter(exec_list_iterator, iter, ir->body_instructions) {
-      ir_instruction *ir = (ir_instruction *)iter.get();
-
-      this->base_ir = ir;
-      ir->accept(this);
-   }
-
-   if (ir->increment) {
-      this->base_ir = ir->increment;
-      ir->increment->accept(this);
-      emit(BRW_OPCODE_ADD, counter, counter, this->result);
-   }
-
-   emit(BRW_OPCODE_WHILE);
-}
-
-void
-fs_visitor::visit(ir_loop_jump *ir)
-{
-   switch (ir->mode) {
-   case ir_loop_jump::jump_break:
-      emit(BRW_OPCODE_BREAK);
-      break;
-   case ir_loop_jump::jump_continue:
-      emit(BRW_OPCODE_CONTINUE);
-      break;
-   }
-}
-
-void
-fs_visitor::visit(ir_call *ir)
-{
-   assert(!"FINISHME");
-}
-
-void
-fs_visitor::visit(ir_return *ir)
-{
-   assert(!"FINISHME");
-}
-
-void
-fs_visitor::visit(ir_function *ir)
-{
-   /* Ignore function bodies other than main() -- we shouldn't see calls to
-    * them since they should all be inlined before we get to ir_to_mesa.
-    */
-   if (strcmp(ir->name, "main") == 0) {
-      const ir_function_signature *sig;
-      exec_list empty;
-
-      sig = ir->matching_signature(&empty);
-
-      assert(sig);
-
-      foreach_iter(exec_list_iterator, iter, sig->body) {
-	 ir_instruction *ir = (ir_instruction *)iter.get();
-	 this->base_ir = ir;
-
-	 ir->accept(this);
-      }
-   }
-}
-
-void
-fs_visitor::visit(ir_function_signature *ir)
-{
-   assert(!"not reached");
-   (void)ir;
-}
-
-fs_inst *
-fs_visitor::emit(fs_inst inst)
-{
-   fs_inst *list_inst = new(mem_ctx) fs_inst;
-   *list_inst = inst;
-
-   if (force_uncompressed_stack > 0)
-      list_inst->force_uncompressed = true;
-   else if (force_sechalf_stack > 0)
-      list_inst->force_sechalf = true;
-
-   list_inst->annotation = this->current_annotation;
-   list_inst->ir = this->base_ir;
-
-   this->instructions.push_tail(list_inst);
-
-   return list_inst;
-}
-
-/** Emits a dummy fragment shader consisting of magenta for bringup purposes. */
-void
-fs_visitor::emit_dummy_fs()
-{
-   /* Everyone's favorite color. */
-   emit(BRW_OPCODE_MOV, fs_reg(MRF, 2), fs_reg(1.0f));
-   emit(BRW_OPCODE_MOV, fs_reg(MRF, 3), fs_reg(0.0f));
-   emit(BRW_OPCODE_MOV, fs_reg(MRF, 4), fs_reg(1.0f));
-   emit(BRW_OPCODE_MOV, fs_reg(MRF, 5), fs_reg(0.0f));
-
-   fs_inst *write;
-   write = emit(FS_OPCODE_FB_WRITE, fs_reg(0), fs_reg(0));
-   write->base_mrf = 0;
-}
-
-/* The register location here is relative to the start of the URB
- * data.  It will get adjusted to be a real location before
- * generate_code() time.
- */
-struct brw_reg
-fs_visitor::interp_reg(int location, int channel)
-{
-   int regnr = urb_setup[location] * 2 + channel / 2;
-   int stride = (channel & 1) * 4;
-
-   assert(urb_setup[location] != -1);
-
-   return brw_vec1_grf(regnr, stride);
-}
-
-/** Emits the interpolation for the varying inputs. */
-void
-fs_visitor::emit_interpolation_setup_gen4()
-{
-   this->current_annotation = "compute pixel centers";
-   this->pixel_x = fs_reg(this, glsl_type::uint_type);
-   this->pixel_y = fs_reg(this, glsl_type::uint_type);
-   this->pixel_x.type = BRW_REGISTER_TYPE_UW;
-   this->pixel_y.type = BRW_REGISTER_TYPE_UW;
-
-   emit(FS_OPCODE_PIXEL_X, this->pixel_x);
-   emit(FS_OPCODE_PIXEL_Y, this->pixel_y);
-
-   this->current_annotation = "compute pixel deltas from v0";
-   if (brw->has_pln) {
-      this->delta_x = fs_reg(this, glsl_type::vec2_type);
-      this->delta_y = this->delta_x;
-      this->delta_y.reg_offset++;
-   } else {
-      this->delta_x = fs_reg(this, glsl_type::float_type);
-      this->delta_y = fs_reg(this, glsl_type::float_type);
-   }
-   emit(BRW_OPCODE_ADD, this->delta_x,
-	this->pixel_x, fs_reg(negate(brw_vec1_grf(1, 0))));
-   emit(BRW_OPCODE_ADD, this->delta_y,
-	this->pixel_y, fs_reg(negate(brw_vec1_grf(1, 1))));
-
-   this->current_annotation = "compute pos.w and 1/pos.w";
-   /* Compute wpos.w.  It's always in our setup, since it's needed to
-    * interpolate the other attributes.
-    */
-   this->wpos_w = fs_reg(this, glsl_type::float_type);
-   emit(FS_OPCODE_LINTERP, wpos_w, this->delta_x, this->delta_y,
-	interp_reg(FRAG_ATTRIB_WPOS, 3));
-   /* Compute the pixel 1/W value from wpos.w. */
-   this->pixel_w = fs_reg(this, glsl_type::float_type);
-   emit_math(FS_OPCODE_RCP, this->pixel_w, wpos_w);
-   this->current_annotation = NULL;
-}
-
-/** Emits the interpolation for the varying inputs. */
-void
-fs_visitor::emit_interpolation_setup_gen6()
-{
-   struct brw_reg g1_uw = retype(brw_vec1_grf(1, 0), BRW_REGISTER_TYPE_UW);
-
-   /* If the pixel centers end up used, the setup is the same as for gen4. */
-   this->current_annotation = "compute pixel centers";
-   fs_reg int_pixel_x = fs_reg(this, glsl_type::uint_type);
-   fs_reg int_pixel_y = fs_reg(this, glsl_type::uint_type);
-   int_pixel_x.type = BRW_REGISTER_TYPE_UW;
-   int_pixel_y.type = BRW_REGISTER_TYPE_UW;
-   emit(BRW_OPCODE_ADD,
-	int_pixel_x,
-	fs_reg(stride(suboffset(g1_uw, 4), 2, 4, 0)),
-	fs_reg(brw_imm_v(0x10101010)));
-   emit(BRW_OPCODE_ADD,
-	int_pixel_y,
-	fs_reg(stride(suboffset(g1_uw, 5), 2, 4, 0)),
-	fs_reg(brw_imm_v(0x11001100)));
-
-   /* As of gen6, we can no longer mix float and int sources.  We have
-    * to turn the integer pixel centers into floats for their actual
-    * use.
-    */
-   this->pixel_x = fs_reg(this, glsl_type::float_type);
-   this->pixel_y = fs_reg(this, glsl_type::float_type);
-   emit(BRW_OPCODE_MOV, this->pixel_x, int_pixel_x);
-   emit(BRW_OPCODE_MOV, this->pixel_y, int_pixel_y);
-
-   this->current_annotation = "compute pos.w";
-   this->pixel_w = fs_reg(brw_vec8_grf(c->source_w_reg, 0));
-   this->wpos_w = fs_reg(this, glsl_type::float_type);
-   emit_math(FS_OPCODE_RCP, this->wpos_w, this->pixel_w);
-
-   this->delta_x = fs_reg(brw_vec8_grf(2, 0));
-   this->delta_y = fs_reg(brw_vec8_grf(3, 0));
-
-   this->current_annotation = NULL;
-}
-
-void
-fs_visitor::emit_color_write(int index, int first_color_mrf, fs_reg color)
-{
-   int reg_width = c->dispatch_width / 8;
-
-   if (c->dispatch_width == 8 || intel->gen == 6) {
-      /* SIMD8 write looks like:
-       * m + 0: r0
-       * m + 1: r1
-       * m + 2: g0
-       * m + 3: g1
-       *
-       * gen6 SIMD16 DP write looks like:
-       * m + 0: r0
-       * m + 1: r1
-       * m + 2: g0
-       * m + 3: g1
-       * m + 4: b0
-       * m + 5: b1
-       * m + 6: a0
-       * m + 7: a1
-       */
-      emit(BRW_OPCODE_MOV, fs_reg(MRF, first_color_mrf + index * reg_width),
-	   color);
-   } else {
-      /* pre-gen6 SIMD16 single source DP write looks like:
-       * m + 0: r0
-       * m + 1: g0
-       * m + 2: b0
-       * m + 3: a0
-       * m + 4: r1
-       * m + 5: g1
-       * m + 6: b1
-       * m + 7: a1
-       */
-      if (brw->has_compr4) {
-	 /* By setting the high bit of the MRF register number, we
-	  * indicate that we want COMPR4 mode - instead of doing the
-	  * usual destination + 1 for the second half we get
-	  * destination + 4.
-	  */
-	 emit(BRW_OPCODE_MOV,
-	      fs_reg(MRF, BRW_MRF_COMPR4 + first_color_mrf + index), color);
-      } else {
-	 push_force_uncompressed();
-	 emit(BRW_OPCODE_MOV, fs_reg(MRF, first_color_mrf + index), color);
-	 pop_force_uncompressed();
-
-	 push_force_sechalf();
-	 color.sechalf = true;
-	 emit(BRW_OPCODE_MOV, fs_reg(MRF, first_color_mrf + index + 4), color);
-	 pop_force_sechalf();
-	 color.sechalf = false;
-      }
-   }
-}
-
-void
-fs_visitor::emit_fb_writes()
-{
-   this->current_annotation = "FB write header";
-   GLboolean header_present = GL_TRUE;
-   int nr = 0;
-   int reg_width = c->dispatch_width / 8;
-
-   if (intel->gen >= 6 &&
-       !this->kill_emitted &&
-       c->key.nr_color_regions == 1) {
-      header_present = false;
-   }
-
-   if (header_present) {
-      /* m0, m1 header */
-      nr += 2;
-   }
-
-   if (c->aa_dest_stencil_reg) {
-      push_force_uncompressed();
-      emit(BRW_OPCODE_MOV, fs_reg(MRF, nr++),
-	   fs_reg(brw_vec8_grf(c->aa_dest_stencil_reg, 0)));
-      pop_force_uncompressed();
-   }
-
-   /* Reserve space for color. It'll be filled in per MRT below. */
-   int color_mrf = nr;
-   nr += 4 * reg_width;
-
-   if (c->source_depth_to_render_target) {
-      if (intel->gen == 6 && c->dispatch_width == 16) {
-	 /* For outputting oDepth on gen6, SIMD8 writes have to be
-	  * used.  This would require 8-wide moves of each half to
-	  * message regs, kind of like pre-gen5 SIMD16 FB writes.
-	  * Just bail on doing so for now.
-	  */
-	 fail("Missing support for simd16 depth writes on gen6\n");
-      }
-
-      if (c->computes_depth) {
-	 /* Hand over gl_FragDepth. */
-	 assert(this->frag_depth);
-	 fs_reg depth = *(variable_storage(this->frag_depth));
-
-	 emit(BRW_OPCODE_MOV, fs_reg(MRF, nr), depth);
-      } else {
-	 /* Pass through the payload depth. */
-	 emit(BRW_OPCODE_MOV, fs_reg(MRF, nr),
-	      fs_reg(brw_vec8_grf(c->source_depth_reg, 0)));
-      }
-      nr += reg_width;
-   }
-
-   if (c->dest_depth_reg) {
-      emit(BRW_OPCODE_MOV, fs_reg(MRF, nr),
-	   fs_reg(brw_vec8_grf(c->dest_depth_reg, 0)));
-      nr += reg_width;
-   }
-
-   fs_reg color = reg_undef;
-   if (this->frag_color)
-      color = *(variable_storage(this->frag_color));
-   else if (this->frag_data) {
-      color = *(variable_storage(this->frag_data));
-      color.type = BRW_REGISTER_TYPE_F;
-   }
-
-   for (int target = 0; target < c->key.nr_color_regions; target++) {
-      this->current_annotation = ralloc_asprintf(this->mem_ctx,
-						 "FB write target %d",
-						 target);
-      if (this->frag_color || this->frag_data) {
-	 for (int i = 0; i < 4; i++) {
-	    emit_color_write(i, color_mrf, color);
-	    color.reg_offset++;
-	 }
-      }
-
-      if (this->frag_color)
-	 color.reg_offset -= 4;
-
-      fs_inst *inst = emit(FS_OPCODE_FB_WRITE);
-      inst->target = target;
-      inst->base_mrf = 0;
-      inst->mlen = nr;
-      if (target == c->key.nr_color_regions - 1)
-	 inst->eot = true;
-      inst->header_present = header_present;
-   }
-
-   if (c->key.nr_color_regions == 0) {
-      if (c->key.alpha_test && (this->frag_color || this->frag_data)) {
-	 /* If the alpha test is enabled but there's no color buffer,
-	  * we still need to send alpha out the pipeline to our null
-	  * renderbuffer.
-	  */
-	 color.reg_offset += 3;
-	 emit_color_write(3, color_mrf, color);
-      }
-
-      fs_inst *inst = emit(FS_OPCODE_FB_WRITE);
-      inst->base_mrf = 0;
-      inst->mlen = nr;
-      inst->eot = true;
-      inst->header_present = header_present;
-   }
-
-   this->current_annotation = NULL;
-}
-
-void
-fs_visitor::generate_fb_write(fs_inst *inst)
-{
-   GLboolean eot = inst->eot;
-   struct brw_reg implied_header;
-
-   /* Header is 2 regs, g0 and g1 are the contents. g0 will be implied
-    * move, here's g1.
-    */
-   brw_push_insn_state(p);
-   brw_set_mask_control(p, BRW_MASK_DISABLE);
-   brw_set_compression_control(p, BRW_COMPRESSION_NONE);
-
-   if (inst->header_present) {
-      if (intel->gen >= 6) {
-	 brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED);
-	 brw_MOV(p,
-		 retype(brw_message_reg(inst->base_mrf), BRW_REGISTER_TYPE_UD),
-		 retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
-	 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
-
-	 if (inst->target > 0) {
-	    /* Set the render target index for choosing BLEND_STATE. */
-	    brw_MOV(p, retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE, 0, 2),
-			      BRW_REGISTER_TYPE_UD),
-		    brw_imm_ud(inst->target));
-	 }
-
-	 implied_header = brw_null_reg();
-      } else {
-	 implied_header = retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW);
-
-	 brw_MOV(p,
-		 brw_message_reg(inst->base_mrf + 1),
-		 brw_vec8_grf(1, 0));
-      }
-   } else {
-      implied_header = brw_null_reg();
-   }
-
-   brw_pop_insn_state(p);
-
-   brw_fb_WRITE(p,
-		c->dispatch_width,
-		inst->base_mrf,
-		implied_header,
-		inst->target,
-		inst->mlen,
-		0,
-		eot,
-		inst->header_present);
-}
-
-/* Computes the integer pixel x,y values from the origin.
- *
- * This is the basis of gl_FragCoord computation, but is also used
- * pre-gen6 for computing the deltas from v0 for computing
- * interpolation.
- */
-void
-fs_visitor::generate_pixel_xy(struct brw_reg dst, bool is_x)
-{
-   struct brw_reg g1_uw = retype(brw_vec1_grf(1, 0), BRW_REGISTER_TYPE_UW);
-   struct brw_reg src;
-   struct brw_reg deltas;
-
-   if (is_x) {
-      src = stride(suboffset(g1_uw, 4), 2, 4, 0);
-      deltas = brw_imm_v(0x10101010);
-   } else {
-      src = stride(suboffset(g1_uw, 5), 2, 4, 0);
-      deltas = brw_imm_v(0x11001100);
-   }
-
-   if (c->dispatch_width == 16) {
-      dst = vec16(dst);
-   }
-
-   /* We do this 8 or 16-wide, but since the destination is UW we
-    * don't do compression in the 16-wide case.
-    */
-   brw_push_insn_state(p);
-   brw_set_compression_control(p, BRW_COMPRESSION_NONE);
-   brw_ADD(p, dst, src, deltas);
-   brw_pop_insn_state(p);
-}
-
-void
-fs_visitor::generate_linterp(fs_inst *inst,
-			     struct brw_reg dst, struct brw_reg *src)
-{
-   struct brw_reg delta_x = src[0];
-   struct brw_reg delta_y = src[1];
-   struct brw_reg interp = src[2];
-
-   if (brw->has_pln &&
-       delta_y.nr == delta_x.nr + 1 &&
-       (intel->gen >= 6 || (delta_x.nr & 1) == 0)) {
-      brw_PLN(p, dst, interp, delta_x);
-   } else {
-      brw_LINE(p, brw_null_reg(), interp, delta_x);
-      brw_MAC(p, dst, suboffset(interp, 1), delta_y);
-   }
-}
-
-void
-fs_visitor::generate_math(fs_inst *inst,
-			  struct brw_reg dst, struct brw_reg *src)
-{
-   int op;
-
-   switch (inst->opcode) {
-   case FS_OPCODE_RCP:
-      op = BRW_MATH_FUNCTION_INV;
-      break;
-   case FS_OPCODE_RSQ:
-      op = BRW_MATH_FUNCTION_RSQ;
-      break;
-   case FS_OPCODE_SQRT:
-      op = BRW_MATH_FUNCTION_SQRT;
-      break;
-   case FS_OPCODE_EXP2:
-      op = BRW_MATH_FUNCTION_EXP;
-      break;
-   case FS_OPCODE_LOG2:
-      op = BRW_MATH_FUNCTION_LOG;
-      break;
-   case FS_OPCODE_POW:
-      op = BRW_MATH_FUNCTION_POW;
-      break;
-   case FS_OPCODE_SIN:
-      op = BRW_MATH_FUNCTION_SIN;
-      break;
-   case FS_OPCODE_COS:
-      op = BRW_MATH_FUNCTION_COS;
-      break;
-   default:
-      assert(!"not reached: unknown math function");
-      op = 0;
-      break;
-   }
-
-   if (intel->gen >= 6) {
-      assert(inst->mlen == 0);
-
-      if (inst->opcode == FS_OPCODE_POW) {
-	 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
-	 brw_math2(p, dst, op, src[0], src[1]);
-
-	 if (c->dispatch_width == 16) {
-	    brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
-	    brw_math2(p, sechalf(dst), op, sechalf(src[0]), sechalf(src[1]));
-	    brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED);
-	 }
-      } else {
-	 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
-	 brw_math(p, dst,
-		  op,
-		  inst->saturate ? BRW_MATH_SATURATE_SATURATE :
-		  BRW_MATH_SATURATE_NONE,
-		  0, src[0],
-		  BRW_MATH_DATA_VECTOR,
-		  BRW_MATH_PRECISION_FULL);
-
-	 if (c->dispatch_width == 16) {
-	    brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
-	    brw_math(p, sechalf(dst),
-		     op,
-		     inst->saturate ? BRW_MATH_SATURATE_SATURATE :
-		     BRW_MATH_SATURATE_NONE,
-		     0, sechalf(src[0]),
-		     BRW_MATH_DATA_VECTOR,
-		     BRW_MATH_PRECISION_FULL);
-	    brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED);
-	 }
-      }
-   } else /* gen <= 5 */{
-      assert(inst->mlen >= 1);
-
-      brw_set_compression_control(p, BRW_COMPRESSION_NONE);
-      brw_math(p, dst,
-	       op,
-	       inst->saturate ? BRW_MATH_SATURATE_SATURATE :
-	       BRW_MATH_SATURATE_NONE,
-	       inst->base_mrf, src[0],
-	       BRW_MATH_DATA_VECTOR,
-	       BRW_MATH_PRECISION_FULL);
-
-      if (c->dispatch_width == 16) {
-	 brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
-	 brw_math(p, sechalf(dst),
-		  op,
-		  inst->saturate ? BRW_MATH_SATURATE_SATURATE :
-		  BRW_MATH_SATURATE_NONE,
-		  inst->base_mrf + 1, sechalf(src[0]),
-		  BRW_MATH_DATA_VECTOR,
-		  BRW_MATH_PRECISION_FULL);
-
-	 brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED);
-      }
-   }
-}
-
-void
-fs_visitor::generate_tex(fs_inst *inst, struct brw_reg dst, struct brw_reg src)
-{
-   int msg_type = -1;
-   int rlen = 4;
-   uint32_t simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD8;
-
-   if (c->dispatch_width == 16)
-      simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
-
-   if (intel->gen >= 5) {
-      switch (inst->opcode) {
-      case FS_OPCODE_TEX:
-	 if (inst->shadow_compare) {
-	    msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_COMPARE;
-	 } else {
-	    msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE;
-	 }
-	 break;
-      case FS_OPCODE_TXB:
-	 if (inst->shadow_compare) {
-	    msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_BIAS_COMPARE;
-	 } else {
-	    msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_BIAS;
-	 }
-	 break;
-      case FS_OPCODE_TXL:
-	 if (inst->shadow_compare) {
-	    msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LOD_COMPARE;
-	 } else {
-	    msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LOD;
-	 }
-	 break;
-      case FS_OPCODE_TXD:
-	 assert(!"TXD isn't supported on gen5+ yet.");
-	 break;
-      }
-   } else {
-      switch (inst->opcode) {
-      case FS_OPCODE_TEX:
-	 /* Note that G45 and older determines shadow compare and dispatch width
-	  * from message length for most messages.
-	  */
-	 assert(c->dispatch_width == 8);
-	 msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE;
-	 if (inst->shadow_compare) {
-	    assert(inst->mlen == 6);
-	 } else {
-	    assert(inst->mlen <= 4);
-	 }
-	 break;
-      case FS_OPCODE_TXB:
-	 if (inst->shadow_compare) {
-	    assert(inst->mlen == 6);
-	    msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_BIAS_COMPARE;
-	 } else {
-	    assert(inst->mlen == 9);
-	    msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_BIAS;
-	    simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
-	 }
-	 break;
-      case FS_OPCODE_TXL:
-	 if (inst->shadow_compare) {
-	    assert(inst->mlen == 6);
-	    msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_LOD_COMPARE;
-	 } else {
-	    assert(inst->mlen == 9);
-	    msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_LOD;
-	    simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
-	 }
-	 break;
-      case FS_OPCODE_TXD:
-	 assert(!"TXD isn't supported on gen4 yet.");
-	 break;
-      }
-   }
-   assert(msg_type != -1);
-
-   if (simd_mode == BRW_SAMPLER_SIMD_MODE_SIMD16) {
-      rlen = 8;
-      dst = vec16(dst);
-   }
-
-   brw_SAMPLE(p,
-	      retype(dst, BRW_REGISTER_TYPE_UW),
-	      inst->base_mrf,
-	      src,
-              SURF_INDEX_TEXTURE(inst->sampler),
-	      inst->sampler,
-	      WRITEMASK_XYZW,
-	      msg_type,
-	      rlen,
-	      inst->mlen,
-	      0,
-	      inst->header_present,
-	      simd_mode);
-}
-
-
-/* For OPCODE_DDX and OPCODE_DDY, per channel of output we've got input
- * looking like:
- *
- * arg0: ss0.tl ss0.tr ss0.bl ss0.br ss1.tl ss1.tr ss1.bl ss1.br
- *
- * and we're trying to produce:
- *
- *           DDX                     DDY
- * dst: (ss0.tr - ss0.tl)     (ss0.tl - ss0.bl)
- *      (ss0.tr - ss0.tl)     (ss0.tr - ss0.br)
- *      (ss0.br - ss0.bl)     (ss0.tl - ss0.bl)
- *      (ss0.br - ss0.bl)     (ss0.tr - ss0.br)
- *      (ss1.tr - ss1.tl)     (ss1.tl - ss1.bl)
- *      (ss1.tr - ss1.tl)     (ss1.tr - ss1.br)
- *      (ss1.br - ss1.bl)     (ss1.tl - ss1.bl)
- *      (ss1.br - ss1.bl)     (ss1.tr - ss1.br)
- *
- * and add another set of two more subspans if in 16-pixel dispatch mode.
- *
- * For DDX, it ends up being easy: width = 2, horiz=0 gets us the same result
- * for each pair, and vertstride = 2 jumps us 2 elements after processing a
- * pair. But for DDY, it's harder, as we want to produce the pairs swizzled
- * between each other.  We could probably do it like ddx and swizzle the right
- * order later, but bail for now and just produce
- * ((ss0.tl - ss0.bl)x4 (ss1.tl - ss1.bl)x4)
- */
-void
-fs_visitor::generate_ddx(fs_inst *inst, struct brw_reg dst, struct brw_reg src)
-{
-   struct brw_reg src0 = brw_reg(src.file, src.nr, 1,
-				 BRW_REGISTER_TYPE_F,
-				 BRW_VERTICAL_STRIDE_2,
-				 BRW_WIDTH_2,
-				 BRW_HORIZONTAL_STRIDE_0,
-				 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
-   struct brw_reg src1 = brw_reg(src.file, src.nr, 0,
-				 BRW_REGISTER_TYPE_F,
-				 BRW_VERTICAL_STRIDE_2,
-				 BRW_WIDTH_2,
-				 BRW_HORIZONTAL_STRIDE_0,
-				 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
-   brw_ADD(p, dst, src0, negate(src1));
-}
-
-void
-fs_visitor::generate_ddy(fs_inst *inst, struct brw_reg dst, struct brw_reg src)
-{
-   struct brw_reg src0 = brw_reg(src.file, src.nr, 0,
-				 BRW_REGISTER_TYPE_F,
-				 BRW_VERTICAL_STRIDE_4,
-				 BRW_WIDTH_4,
-				 BRW_HORIZONTAL_STRIDE_0,
-				 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
-   struct brw_reg src1 = brw_reg(src.file, src.nr, 2,
-				 BRW_REGISTER_TYPE_F,
-				 BRW_VERTICAL_STRIDE_4,
-				 BRW_WIDTH_4,
-				 BRW_HORIZONTAL_STRIDE_0,
-				 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
-   brw_ADD(p, dst, src0, negate(src1));
-}
-
-void
-fs_visitor::generate_discard(fs_inst *inst)
-{
-   struct brw_reg f0 = brw_flag_reg();
-
-   if (intel->gen >= 6) {
-      struct brw_reg g1 = retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_UW);
-      struct brw_reg some_register;
-
-      /* As of gen6, we no longer have the mask register to look at,
-       * so life gets a bit more complicated.
-       */
-
-      /* Load the flag register with all ones. */
-      brw_push_insn_state(p);
-      brw_set_mask_control(p, BRW_MASK_DISABLE);
-      brw_MOV(p, f0, brw_imm_uw(0xffff));
-      brw_pop_insn_state(p);
-
-      /* Do a comparison that should always fail, to produce 0s in the flag
-       * reg where we have active channels.
-       */
-      some_register = retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW);
-      brw_CMP(p, retype(brw_null_reg(), BRW_REGISTER_TYPE_UD),
-	      BRW_CONDITIONAL_NZ, some_register, some_register);
-
-      /* Undo CMP's whacking of predication*/
-      brw_set_predicate_control(p, BRW_PREDICATE_NONE);
-
-      brw_push_insn_state(p);
-      brw_set_mask_control(p, BRW_MASK_DISABLE);
-      brw_AND(p, g1, f0, g1);
-      brw_pop_insn_state(p);
-   } else {
-      struct brw_reg g0 = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW);
-      struct brw_reg mask = brw_uw1_reg(mask.file, mask.nr, 0);
-
-      brw_push_insn_state(p);
-      brw_set_mask_control(p, BRW_MASK_DISABLE);
-      brw_set_compression_control(p, BRW_COMPRESSION_NONE);
-
-      /* Unlike the 965, we have the mask reg, so we just need
-       * somewhere to invert that (containing channels to be disabled)
-       * so it can be ANDed with the mask of pixels still to be
-       * written. Use the flag reg for consistency with gen6+.
-       */
-      brw_NOT(p, f0, brw_mask_reg(1)); /* IMASK */
-      brw_AND(p, g0, f0, g0);
-
-      brw_pop_insn_state(p);
-   }
-}
-
-void
-fs_visitor::generate_spill(fs_inst *inst, struct brw_reg src)
-{
-   assert(inst->mlen != 0);
-
-   brw_MOV(p,
-	   retype(brw_message_reg(inst->base_mrf + 1), BRW_REGISTER_TYPE_UD),
-	   retype(src, BRW_REGISTER_TYPE_UD));
-   brw_oword_block_write_scratch(p, brw_message_reg(inst->base_mrf), 1,
-				 inst->offset);
-}
-
-void
-fs_visitor::generate_unspill(fs_inst *inst, struct brw_reg dst)
-{
-   assert(inst->mlen != 0);
-
-   /* Clear any post destination dependencies that would be ignored by
-    * the block read.  See the B-Spec for pre-gen5 send instruction.
-    *
-    * This could use a better solution, since texture sampling and
-    * math reads could potentially run into it as well -- anywhere
-    * that we have a SEND with a destination that is a register that
-    * was written but not read within the last N instructions (what's
-    * N?  unsure).  This is rare because of dead code elimination, but
-    * not impossible.
-    */
-   if (intel->gen == 4 && !intel->is_g4x)
-      brw_MOV(p, brw_null_reg(), dst);
-
-   brw_oword_block_read_scratch(p, dst, brw_message_reg(inst->base_mrf), 1,
-				inst->offset);
-
-   if (intel->gen == 4 && !intel->is_g4x) {
-      /* gen4 errata: destination from a send can't be used as a
-       * destination until it's been read.  Just read it so we don't
-       * have to worry.
-       */
-      brw_MOV(p, brw_null_reg(), dst);
-   }
-}
-
-
-void
-fs_visitor::generate_pull_constant_load(fs_inst *inst, struct brw_reg dst)
-{
-   assert(inst->mlen != 0);
-
-   /* Clear any post destination dependencies that would be ignored by
-    * the block read.  See the B-Spec for pre-gen5 send instruction.
-    *
-    * This could use a better solution, since texture sampling and
-    * math reads could potentially run into it as well -- anywhere
-    * that we have a SEND with a destination that is a register that
-    * was written but not read within the last N instructions (what's
-    * N?  unsure).  This is rare because of dead code elimination, but
-    * not impossible.
-    */
-   if (intel->gen == 4 && !intel->is_g4x)
-      brw_MOV(p, brw_null_reg(), dst);
-
-   brw_oword_block_read(p, dst, brw_message_reg(inst->base_mrf),
-			inst->offset, SURF_INDEX_FRAG_CONST_BUFFER);
-
-   if (intel->gen == 4 && !intel->is_g4x) {
-      /* gen4 errata: destination from a send can't be used as a
-       * destination until it's been read.  Just read it so we don't
-       * have to worry.
-       */
-      brw_MOV(p, brw_null_reg(), dst);
-   }
-}
-
 /**
  * To be called after the last _mesa_add_state_reference() call, to
  * set up prog_data.param[] for assign_curb_setup() and
@@ -2892,7 +649,7 @@ fs_visitor::calculate_urb_setup()
    /* Figure out where each of the incoming setup attributes lands. */
    if (intel->gen >= 6) {
       for (unsigned int i = 0; i < FRAG_ATTRIB_MAX; i++) {
-	 if (brw->fragment_program->Base.InputsRead & BITFIELD64_BIT(i)) {
+	 if (fp->Base.InputsRead & BITFIELD64_BIT(i)) {
 	    urb_setup[i] = urb_next++;
 	 }
       }
@@ -3276,12 +1033,16 @@ fs_visitor::propagate_constants()
 		  scan_inst->src[i] = inst->src[0];
 		  progress = true;
 	       } else if (i == 0 && scan_inst->src[1].file != IMM) {
-		  /* Fit this constant in by swapping the operands and
-		   * flipping the predicate
-		   */
 		  scan_inst->src[0] = scan_inst->src[1];
 		  scan_inst->src[1] = inst->src[0];
-		  scan_inst->predicate_inverse = !scan_inst->predicate_inverse;
+
+		  /* If this was predicated, flipping operands means
+		   * we also need to flip the predicate.
+		   */
+		  if (scan_inst->conditional_mod == BRW_CONDITIONAL_NONE) {
+		     scan_inst->predicate_inverse =
+			!scan_inst->predicate_inverse;
+		  }
 		  progress = true;
 	       }
 	       break;
@@ -3734,355 +1495,6 @@ fs_visitor::virtual_grf_interferes(int a, int b)
    return start < end;
 }
 
-static struct brw_reg brw_reg_from_fs_reg(fs_reg *reg)
-{
-   struct brw_reg brw_reg;
-
-   switch (reg->file) {
-   case GRF:
-   case ARF:
-   case MRF:
-      if (reg->smear == -1) {
-	 brw_reg = brw_vec8_reg(reg->file,
-				reg->hw_reg, 0);
-      } else {
-	 brw_reg = brw_vec1_reg(reg->file,
-				reg->hw_reg, reg->smear);
-      }
-      brw_reg = retype(brw_reg, reg->type);
-      if (reg->sechalf)
-	 brw_reg = sechalf(brw_reg);
-      break;
-   case IMM:
-      switch (reg->type) {
-      case BRW_REGISTER_TYPE_F:
-	 brw_reg = brw_imm_f(reg->imm.f);
-	 break;
-      case BRW_REGISTER_TYPE_D:
-	 brw_reg = brw_imm_d(reg->imm.i);
-	 break;
-      case BRW_REGISTER_TYPE_UD:
-	 brw_reg = brw_imm_ud(reg->imm.u);
-	 break;
-      default:
-	 assert(!"not reached");
-	 brw_reg = brw_null_reg();
-	 break;
-      }
-      break;
-   case FIXED_HW_REG:
-      brw_reg = reg->fixed_hw_reg;
-      break;
-   case BAD_FILE:
-      /* Probably unused. */
-      brw_reg = brw_null_reg();
-      break;
-   case UNIFORM:
-      assert(!"not reached");
-      brw_reg = brw_null_reg();
-      break;
-   default:
-      assert(!"not reached");
-      brw_reg = brw_null_reg();
-      break;
-   }
-   if (reg->abs)
-      brw_reg = brw_abs(brw_reg);
-   if (reg->negate)
-      brw_reg = negate(brw_reg);
-
-   return brw_reg;
-}
-
-void
-fs_visitor::generate_code()
-{
-   int last_native_inst = p->nr_insn;
-   const char *last_annotation_string = NULL;
-   ir_instruction *last_annotation_ir = NULL;
-
-   int loop_stack_array_size = 16;
-   int loop_stack_depth = 0;
-   brw_instruction **loop_stack =
-      rzalloc_array(this->mem_ctx, brw_instruction *, loop_stack_array_size);
-   int *if_depth_in_loop =
-      rzalloc_array(this->mem_ctx, int, loop_stack_array_size);
-
-
-   if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
-      printf("Native code for fragment shader %d (%d-wide dispatch):\n",
-	     ctx->Shader.CurrentFragmentProgram->Name, c->dispatch_width);
-   }
-
-   foreach_iter(exec_list_iterator, iter, this->instructions) {
-      fs_inst *inst = (fs_inst *)iter.get();
-      struct brw_reg src[3], dst;
-
-      if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
-	 if (last_annotation_ir != inst->ir) {
-	    last_annotation_ir = inst->ir;
-	    if (last_annotation_ir) {
-	       printf("   ");
-	       last_annotation_ir->print();
-	       printf("\n");
-	    }
-	 }
-	 if (last_annotation_string != inst->annotation) {
-	    last_annotation_string = inst->annotation;
-	    if (last_annotation_string)
-	       printf("   %s\n", last_annotation_string);
-	 }
-      }
-
-      for (unsigned int i = 0; i < 3; i++) {
-	 src[i] = brw_reg_from_fs_reg(&inst->src[i]);
-      }
-      dst = brw_reg_from_fs_reg(&inst->dst);
-
-      brw_set_conditionalmod(p, inst->conditional_mod);
-      brw_set_predicate_control(p, inst->predicated);
-      brw_set_predicate_inverse(p, inst->predicate_inverse);
-      brw_set_saturate(p, inst->saturate);
-
-      if (inst->force_uncompressed || c->dispatch_width == 8) {
-	 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
-      } else if (inst->force_sechalf) {
-	 brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
-      } else {
-	 brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED);
-      }
-
-      switch (inst->opcode) {
-      case BRW_OPCODE_MOV:
-	 brw_MOV(p, dst, src[0]);
-	 break;
-      case BRW_OPCODE_ADD:
-	 brw_ADD(p, dst, src[0], src[1]);
-	 break;
-      case BRW_OPCODE_MUL:
-	 brw_MUL(p, dst, src[0], src[1]);
-	 break;
-
-      case BRW_OPCODE_FRC:
-	 brw_FRC(p, dst, src[0]);
-	 break;
-      case BRW_OPCODE_RNDD:
-	 brw_RNDD(p, dst, src[0]);
-	 break;
-      case BRW_OPCODE_RNDE:
-	 brw_RNDE(p, dst, src[0]);
-	 break;
-      case BRW_OPCODE_RNDZ:
-	 brw_RNDZ(p, dst, src[0]);
-	 break;
-
-      case BRW_OPCODE_AND:
-	 brw_AND(p, dst, src[0], src[1]);
-	 break;
-      case BRW_OPCODE_OR:
-	 brw_OR(p, dst, src[0], src[1]);
-	 break;
-      case BRW_OPCODE_XOR:
-	 brw_XOR(p, dst, src[0], src[1]);
-	 break;
-      case BRW_OPCODE_NOT:
-	 brw_NOT(p, dst, src[0]);
-	 break;
-      case BRW_OPCODE_ASR:
-	 brw_ASR(p, dst, src[0], src[1]);
-	 break;
-      case BRW_OPCODE_SHR:
-	 brw_SHR(p, dst, src[0], src[1]);
-	 break;
-      case BRW_OPCODE_SHL:
-	 brw_SHL(p, dst, src[0], src[1]);
-	 break;
-
-      case BRW_OPCODE_CMP:
-	 brw_CMP(p, dst, inst->conditional_mod, src[0], src[1]);
-	 break;
-      case BRW_OPCODE_SEL:
-	 brw_SEL(p, dst, src[0], src[1]);
-	 break;
-
-      case BRW_OPCODE_IF:
-	 if (inst->src[0].file != BAD_FILE) {
-	    /* The instruction has an embedded compare (only allowed on gen6) */
-	    assert(intel->gen == 6);
-	    gen6_IF(p, inst->conditional_mod, src[0], src[1]);
-	 } else {
-	    brw_IF(p, c->dispatch_width == 16 ? BRW_EXECUTE_16 : BRW_EXECUTE_8);
-	 }
-	 if_depth_in_loop[loop_stack_depth]++;
-	 break;
-
-      case BRW_OPCODE_ELSE:
-	 brw_ELSE(p);
-	 break;
-      case BRW_OPCODE_ENDIF:
-	 brw_ENDIF(p);
-	 if_depth_in_loop[loop_stack_depth]--;
-	 break;
-
-      case BRW_OPCODE_DO:
-	 loop_stack[loop_stack_depth++] = brw_DO(p, BRW_EXECUTE_8);
-	 if (loop_stack_array_size <= loop_stack_depth) {
-	    loop_stack_array_size *= 2;
-	    loop_stack = reralloc(this->mem_ctx, loop_stack, brw_instruction *,
-				  loop_stack_array_size);
-	    if_depth_in_loop = reralloc(this->mem_ctx, if_depth_in_loop, int,
-				        loop_stack_array_size);
-	 }
-	 if_depth_in_loop[loop_stack_depth] = 0;
-	 break;
-
-      case BRW_OPCODE_BREAK:
-	 brw_BREAK(p, if_depth_in_loop[loop_stack_depth]);
-	 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
-	 break;
-      case BRW_OPCODE_CONTINUE:
-	 /* FINISHME: We need to write the loop instruction support still. */
-	 if (intel->gen >= 6)
-	    gen6_CONT(p, loop_stack[loop_stack_depth - 1]);
-	 else
-	    brw_CONT(p, if_depth_in_loop[loop_stack_depth]);
-	 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
-	 break;
-
-      case BRW_OPCODE_WHILE: {
-	 struct brw_instruction *inst0, *inst1;
-	 GLuint br = 1;
-
-	 if (intel->gen >= 5)
-	    br = 2;
-
-	 assert(loop_stack_depth > 0);
-	 loop_stack_depth--;
-	 inst0 = inst1 = brw_WHILE(p, loop_stack[loop_stack_depth]);
-	 if (intel->gen < 6) {
-	    /* patch all the BREAK/CONT instructions from last BGNLOOP */
-	    while (inst0 > loop_stack[loop_stack_depth]) {
-	       inst0--;
-	       if (inst0->header.opcode == BRW_OPCODE_BREAK &&
-		   inst0->bits3.if_else.jump_count == 0) {
-		  inst0->bits3.if_else.jump_count = br * (inst1 - inst0 + 1);
-	    }
-	       else if (inst0->header.opcode == BRW_OPCODE_CONTINUE &&
-			inst0->bits3.if_else.jump_count == 0) {
-		  inst0->bits3.if_else.jump_count = br * (inst1 - inst0);
-	       }
-	    }
-	 }
-      }
-	 break;
-
-      case FS_OPCODE_RCP:
-      case FS_OPCODE_RSQ:
-      case FS_OPCODE_SQRT:
-      case FS_OPCODE_EXP2:
-      case FS_OPCODE_LOG2:
-      case FS_OPCODE_POW:
-      case FS_OPCODE_SIN:
-      case FS_OPCODE_COS:
-	 generate_math(inst, dst, src);
-	 break;
-      case FS_OPCODE_PIXEL_X:
-	 generate_pixel_xy(dst, true);
-	 break;
-      case FS_OPCODE_PIXEL_Y:
-	 generate_pixel_xy(dst, false);
-	 break;
-      case FS_OPCODE_CINTERP:
-	 brw_MOV(p, dst, src[0]);
-	 break;
-      case FS_OPCODE_LINTERP:
-	 generate_linterp(inst, dst, src);
-	 break;
-      case FS_OPCODE_TEX:
-      case FS_OPCODE_TXB:
-      case FS_OPCODE_TXD:
-      case FS_OPCODE_TXL:
-	 generate_tex(inst, dst, src[0]);
-	 break;
-      case FS_OPCODE_DISCARD:
-	 generate_discard(inst);
-	 break;
-      case FS_OPCODE_DDX:
-	 generate_ddx(inst, dst, src[0]);
-	 break;
-      case FS_OPCODE_DDY:
-	 generate_ddy(inst, dst, src[0]);
-	 break;
-
-      case FS_OPCODE_SPILL:
-	 generate_spill(inst, src[0]);
-	 break;
-
-      case FS_OPCODE_UNSPILL:
-	 generate_unspill(inst, dst);
-	 break;
-
-      case FS_OPCODE_PULL_CONSTANT_LOAD:
-	 generate_pull_constant_load(inst, dst);
-	 break;
-
-      case FS_OPCODE_FB_WRITE:
-	 generate_fb_write(inst);
-	 break;
-      default:
-	 if (inst->opcode < (int)ARRAY_SIZE(brw_opcodes)) {
-	    _mesa_problem(ctx, "Unsupported opcode `%s' in FS",
-			  brw_opcodes[inst->opcode].name);
-	 } else {
-	    _mesa_problem(ctx, "Unsupported opcode %d in FS", inst->opcode);
-	 }
-	 fail("unsupported opcode in FS\n");
-      }
-
-      if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
-	 for (unsigned int i = last_native_inst; i < p->nr_insn; i++) {
-	    if (0) {
-	       printf("0x%08x 0x%08x 0x%08x 0x%08x ",
-		      ((uint32_t *)&p->store[i])[3],
-		      ((uint32_t *)&p->store[i])[2],
-		      ((uint32_t *)&p->store[i])[1],
-		      ((uint32_t *)&p->store[i])[0]);
-	    }
-	    brw_disasm(stdout, &p->store[i], intel->gen);
-	 }
-      }
-
-      last_native_inst = p->nr_insn;
-   }
-
-   if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
-      printf("\n");
-   }
-
-   ralloc_free(loop_stack);
-   ralloc_free(if_depth_in_loop);
-
-   brw_set_uip_jip(p);
-
-   /* OK, while the INTEL_DEBUG=wm above is very nice for debugging FS
-    * emit issues, it doesn't get the jump distances into the output,
-    * which is often something we want to debug.  So this is here in
-    * case you're doing that.
-    */
-   if (0) {
-      if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
-	 for (unsigned int i = 0; i < p->nr_insn; i++) {
-	    printf("0x%08x 0x%08x 0x%08x 0x%08x ",
-		   ((uint32_t *)&p->store[i])[3],
-		   ((uint32_t *)&p->store[i])[2],
-		   ((uint32_t *)&p->store[i])[1],
-		   ((uint32_t *)&p->store[i])[0]);
-	    brw_disasm(stdout, &p->store[i], intel->gen);
-	 }
-      }
-   }
-}
-
 bool
 fs_visitor::run()
 {
@@ -4118,6 +1530,7 @@ fs_visitor::run()
       foreach_iter(exec_list_iterator, iter, *shader->ir) {
 	 ir_instruction *ir = (ir_instruction *)iter.get();
 	 base_ir = ir;
+	 this->result = reg_undef;
 	 ir->accept(this);
       }
 
@@ -4171,9 +1584,9 @@ fs_visitor::run()
    generate_code();
 
    if (c->dispatch_width == 8) {
-      c->prog_data.total_grf = grf_used;
+      c->prog_data.reg_blocks = brw_register_blocks(grf_used);
    } else {
-      c->prog_data.total_grf_16 = grf_used;
+      c->prog_data.reg_blocks_16 = brw_register_blocks(grf_used);
       c->prog_data.prog_offset_16 = prog_offset_16;
 
       /* Make sure we didn't try to sneak in an extra uniform */
@@ -4184,11 +1597,10 @@ fs_visitor::run()
 }
 
 bool
-brw_wm_fs_emit(struct brw_context *brw, struct brw_wm_compile *c)
+brw_wm_fs_emit(struct brw_context *brw, struct brw_wm_compile *c,
+	       struct gl_shader_program *prog)
 {
    struct intel_context *intel = &brw->intel;
-   struct gl_context *ctx = &intel->ctx;
-   struct gl_shader_program *prog = ctx->Shader.CurrentFragmentProgram;
 
    if (!prog)
       return false;
@@ -4208,16 +1620,17 @@ brw_wm_fs_emit(struct brw_context *brw, struct brw_wm_compile *c)
     */
    c->dispatch_width = 8;
 
-   fs_visitor v(c, shader);
+   fs_visitor v(c, prog, shader);
    if (!v.run()) {
-      /* FINISHME: Cleanly fail, test at link time, etc. */
-      assert(!"not reached");
+      prog->LinkStatus = GL_FALSE;
+      prog->InfoLog = ralloc_strdup(prog, v.fail_msg);
+
       return false;
    }
 
    if (intel->gen >= 5 && c->prog_data.nr_pull_params == 0) {
       c->dispatch_width = 16;
-      fs_visitor v2(c, shader);
+      fs_visitor v2(c, prog, shader);
       v2.import_uniforms(v.variable_ht);
       v2.run();
    }
@@ -4226,3 +1639,79 @@ brw_wm_fs_emit(struct brw_context *brw, struct brw_wm_compile *c)
 
    return true;
 }
+
+/**
+ * Compile prog's fragment program with a guessed WM key (depth test and
+ * write enabled, one color region, identity texture swizzles) by calling
+ * do_wm_prog(), then put back the brw->wm.prog_bo and prog_data that were
+ * bound on entry.  Returns true when prog has no fragment program,
+ * otherwise whatever do_wm_prog() returned.
+ */
+bool
+brw_fs_precompile(struct gl_context *ctx, struct gl_shader_program *prog)
+{
+   struct brw_context *brw = brw_context(ctx);
+   struct brw_wm_prog_key key;
+   struct gl_fragment_program *fp = prog->FragmentProgram;
+   struct brw_fragment_program *bfp = brw_fragment_program(fp);
+
+   if (!fp)
+      return true;
+
+   memset(&key, 0, sizeof(key));
+
+   if (fp->UsesKill)
+      key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;
+
+   if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
+      key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;
+
+   /* Just assume depth testing. */
+   key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT | IZ_DEPTH_WRITE_ENABLE_BIT;
+
+   key.vp_outputs_written |= BITFIELD64_BIT(FRAG_ATTRIB_WPOS);
+   for (int i = 0; i < FRAG_ATTRIB_MAX; i++) {
+      int vp_index = -1;
+
+      if (!(fp->Base.InputsRead & BITFIELD64_BIT(i)))
+	 continue;
+
+      /* Unsigned literal: a signed 1 << 31 would be undefined behavior. */
+      key.proj_attrib_mask |= 1u << i;
+
+      if (i <= FRAG_ATTRIB_TEX7)
+	 vp_index = i;
+      else if (i >= FRAG_ATTRIB_VAR0)
+	 vp_index = i - FRAG_ATTRIB_VAR0 + VERT_RESULT_VAR0;
+
+      if (vp_index >= 0)
+	 key.vp_outputs_written |= BITFIELD64_BIT(vp_index);
+   }
+
+   key.clamp_fragment_color = true;
+   key.nr_color_regions = 1;
+   key.program_string_id = bfp->id;
+
+   for (int i = 0; i < BRW_MAX_TEX_UNIT; i++) {
+      /* FINISHME: depth compares might use (0,0,0,W) for example */
+      key.tex_swizzles[i] = SWIZZLE_XYZW;
+   }
+
+   if (fp->Base.InputsRead & FRAG_BIT_WPOS) {
+      key.drawable_height = ctx->DrawBuffer->Height;
+      key.render_to_fbo = ctx->DrawBuffer->Name != 0;
+   }
+
+   /* Save the bound WM program; restored below once the precompile is done. */
+   drm_intel_bo *old_prog_bo = brw->wm.prog_bo;
+   struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;
+   brw->wm.prog_bo = NULL;
+
+   bool success = do_wm_prog(brw, prog, bfp, &key);
+
+   drm_intel_bo_unreference(brw->wm.prog_bo);
+   brw->wm.prog_bo = old_prog_bo;
+   brw->wm.prog_data = old_prog_data;
+
+   return success;
+}
diff --git a/src/mesa/drivers/dri/i965/brw_fs.h b/src/mesa/drivers/dri/i965/brw_fs.h
index 4b355c979eb..7570dda1024 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.h
+++ b/src/mesa/drivers/dri/i965/brw_fs.h
@@ -359,12 +359,14 @@ class fs_visitor : public ir_visitor
 {
 public:
 
-   fs_visitor(struct brw_wm_compile *c, struct brw_shader *shader)
+   fs_visitor(struct brw_wm_compile *c, struct gl_shader_program *prog,
+	      struct brw_shader *shader)
    {
       this->c = c;
       this->p = &c->func;
       this->brw = p->brw;
-      this->fp = brw->fragment_program;
+      this->fp = prog->FragmentProgram;
+      this->prog = prog;
       this->intel = &brw->intel;
       this->ctx = &intel->ctx;
       this->mem_ctx = ralloc_context(NULL);
@@ -466,6 +468,8 @@ public:
       return emit(fs_inst(opcode, dst, src0, src1, src2));
    }
 
+   int type_size(const struct glsl_type *type);
+
    bool run();
    void setup_paramvalues_refs();
    void assign_curb_setup();
@@ -542,6 +546,7 @@ public:
    struct brw_wm_compile *c;
    struct brw_compile *p;
    struct brw_shader *shader;
+   struct gl_shader_program *prog;
    void *mem_ctx;
    exec_list instructions;
 
@@ -570,8 +575,12 @@ public:
    /** @} */
 
    bool failed;
+   char *fail_msg;
 
-   /* Result of last visit() method. */
+   /* On entry to a visit() method, this is the storage for the
+    * result.  On exit, the visit() method that was called may have
+    * changed it, in which case the parent must use the new storage.
+    */
    fs_reg result;
 
    fs_reg pixel_x;
@@ -590,3 +599,4 @@ public:
 
 GLboolean brw_do_channel_expressions(struct exec_list *instructions);
 GLboolean brw_do_vector_splitting(struct exec_list *instructions);
+bool brw_fs_precompile(struct gl_context *ctx, struct gl_shader_program *prog);
diff --git a/src/mesa/drivers/dri/i965/brw_fs_emit.cpp b/src/mesa/drivers/dri/i965/brw_fs_emit.cpp
new file mode 100644
index 00000000000..6b7c434949c
--- /dev/null
+++ b/src/mesa/drivers/dri/i965/brw_fs_emit.cpp
@@ -0,0 +1,875 @@
+/*
+ * Copyright © 2010 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+/** @file brw_fs_emit.cpp
+ *
+ * This file supports emitting code from the FS LIR to the actual
+ * native instructions.
+ */
+
+extern "C" {
+#include "main/macros.h"
+#include "brw_context.h"
+#include "brw_eu.h"
+} /* extern "C" */
+
+#include "brw_fs.h"
+#include "../glsl/ir_print_visitor.h"
+
+void
+fs_visitor::generate_fb_write(fs_inst *inst)
+{
+   GLboolean eot = inst->eot;
+   struct brw_reg implied_header;
+
+   /* With a header, gen6+ copies g0 and g1 into the message at base_mrf;
+    * older gens send g0 as the implied header and we move g1 ourselves.
+    */
+   brw_push_insn_state(p);
+   brw_set_mask_control(p, BRW_MASK_DISABLE);
+   brw_set_compression_control(p, BRW_COMPRESSION_NONE);
+
+   if (inst->header_present) {
+      if (intel->gen >= 6) {
+	 brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED);
+	 brw_MOV(p,
+		 retype(brw_message_reg(inst->base_mrf), BRW_REGISTER_TYPE_UD),
+		 retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
+	 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
+
+	 if (inst->target > 0) {
+	    /* Set the render target index for choosing BLEND_STATE. */
+	    brw_MOV(p, retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
+					   inst->base_mrf, 2),
+			      BRW_REGISTER_TYPE_UD), brw_imm_ud(inst->target));
+	 }
+
+	 implied_header = brw_null_reg();
+      } else {
+	 implied_header = retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW);
+
+	 brw_MOV(p,
+		 brw_message_reg(inst->base_mrf + 1),
+		 brw_vec8_grf(1, 0));
+      }
+   } else {
+      implied_header = brw_null_reg();
+   }
+
+   brw_pop_insn_state(p);
+
+   brw_fb_WRITE(p,
+		c->dispatch_width,
+		inst->base_mrf,
+		implied_header,
+		inst->target,
+		inst->mlen,
+		0,
+		eot,
+		inst->header_present);
+}
+
+/* Produces the integer x or y pixel coordinates, relative to the origin.
+ *
+ * gl_FragCoord is built on top of these values, and before gen6 they
+ * are also what the deltas from v0 used for interpolation are
+ * computed from.
+ */
+void
+fs_visitor::generate_pixel_xy(struct brw_reg dst, bool is_x)
+{
+   /* Both axes are read out of g1 as UW: X starting at element 4 and Y
+    * starting at element 5.  Each axis then gets its own immediate
+    * vector of deltas added on.
+    */
+   const unsigned int first_uw = is_x ? 4 : 5;
+   const unsigned int delta_bits = is_x ? 0x10101010 : 0x11001100;
+
+   struct brw_reg g1_as_uw = retype(brw_vec1_grf(1, 0), BRW_REGISTER_TYPE_UW);
+   struct brw_reg coords = stride(suboffset(g1_as_uw, first_uw), 2, 4, 0);
+   struct brw_reg offsets = brw_imm_v(delta_bits);
+
+   /* 16-wide dispatch needs the destination widened to match. */
+   if (c->dispatch_width == 16)
+      dst = vec16(dst);
+
+   /* This runs 8 or 16 wide, but because the destination is UW the
+    * 16-wide case is not compressed either.  The caller's instruction
+    * state is restored afterwards.
+    */
+   brw_push_insn_state(p);
+   brw_set_compression_control(p, BRW_COMPRESSION_NONE);
+   brw_ADD(p, dst, coords, offsets);
+   brw_pop_insn_state(p);
+}
+
+void
+fs_visitor::generate_linterp(fs_inst *inst,
+			     struct brw_reg dst, struct brw_reg *src)
+{
+   struct brw_reg dx = src[0], dy = src[1], setup = src[2];
+
+   /* Use PLN only if dy follows dx, with dx even-numbered before gen6. */
+   bool use_pln = brw->has_pln && dy.nr == dx.nr + 1 &&
+      (intel->gen >= 6 || (dx.nr & 1) == 0);
+
+   if (!use_pln) {
+      brw_LINE(p, brw_null_reg(), setup, dx);
+      brw_MAC(p, dst, suboffset(setup, 1), dy);
+      return;
+   }
+   brw_PLN(p, dst, setup, dx);
+}
+
+void
+fs_visitor::generate_math(fs_inst *inst,
+			  struct brw_reg dst, struct brw_reg *src)
+{
+   int op; /* BRW_MATH_FUNCTION_* code chosen from inst->opcode */
+
+   switch (inst->opcode) {
+   case FS_OPCODE_RCP:
+      op = BRW_MATH_FUNCTION_INV;
+      break;
+   case FS_OPCODE_RSQ:
+      op = BRW_MATH_FUNCTION_RSQ;
+      break;
+   case FS_OPCODE_SQRT:
+      op = BRW_MATH_FUNCTION_SQRT;
+      break;
+   case FS_OPCODE_EXP2:
+      op = BRW_MATH_FUNCTION_EXP;
+      break;
+   case FS_OPCODE_LOG2:
+      op = BRW_MATH_FUNCTION_LOG;
+      break;
+   case FS_OPCODE_POW:
+      op = BRW_MATH_FUNCTION_POW;
+      break;
+   case FS_OPCODE_SIN:
+      op = BRW_MATH_FUNCTION_SIN;
+      break;
+   case FS_OPCODE_COS:
+      op = BRW_MATH_FUNCTION_COS;
+      break;
+   default:
+      assert(!"not reached: unknown math function");
+      op = 0; /* presumably so op stays defined when asserts are off */
+      break;
+   }
+
+   if (intel->gen >= 6) {
+      assert(inst->mlen == 0); /* no message payload on this path */
+
+      if (inst->opcode == FS_OPCODE_POW) {
+	 brw_set_compression_control(p, BRW_COMPRESSION_NONE); /* first half */
+	 brw_math2(p, dst, op, src[0], src[1]);
+
+	 if (c->dispatch_width == 16) {
+	    brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
+	    brw_math2(p, sechalf(dst), op, sechalf(src[0]), sechalf(src[1]));
+	    brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED);
+	 }
+      } else {
+	 brw_set_compression_control(p, BRW_COMPRESSION_NONE); /* first half */
+	 brw_math(p, dst,
+		  op,
+		  inst->saturate ? BRW_MATH_SATURATE_SATURATE :
+		  BRW_MATH_SATURATE_NONE,
+		  0, src[0], /* the pre-gen6 path passes base_mrf here */
+		  BRW_MATH_DATA_VECTOR,
+		  BRW_MATH_PRECISION_FULL);
+
+	 if (c->dispatch_width == 16) {
+	    brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
+	    brw_math(p, sechalf(dst),
+		     op,
+		     inst->saturate ? BRW_MATH_SATURATE_SATURATE :
+		     BRW_MATH_SATURATE_NONE,
+		     0, sechalf(src[0]),
+		     BRW_MATH_DATA_VECTOR,
+		     BRW_MATH_PRECISION_FULL);
+	    brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED);
+	 }
+      }
+   } else /* gen <= 5 */{
+      assert(inst->mlen >= 1); /* this path requires a message payload */
+
+      brw_set_compression_control(p, BRW_COMPRESSION_NONE); /* first half */
+      brw_math(p, dst,
+	       op,
+	       inst->saturate ? BRW_MATH_SATURATE_SATURATE :
+	       BRW_MATH_SATURATE_NONE,
+	       inst->base_mrf, src[0],
+	       BRW_MATH_DATA_VECTOR,
+	       BRW_MATH_PRECISION_FULL);
+
+      if (c->dispatch_width == 16) {
+	 brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
+	 brw_math(p, sechalf(dst),
+		  op,
+		  inst->saturate ? BRW_MATH_SATURATE_SATURATE :
+		  BRW_MATH_SATURATE_NONE,
+		  inst->base_mrf + 1, sechalf(src[0]), /* second half: next MRF */
+		  BRW_MATH_DATA_VECTOR,
+		  BRW_MATH_PRECISION_FULL);
+
+	 brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED);
+      }
+   }
+}
+
+void
+fs_visitor::generate_tex(fs_inst *inst, struct brw_reg dst, struct brw_reg src)
+{
+   int msg_type = -1; /* stays -1 until an opcode case picks a message */
+   int rlen = 4; /* raised to 8 below when the SIMD16 mode is used */
+   uint32_t simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD8;
+
+   if (c->dispatch_width == 16)
+      simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
+
+   if (intel->gen >= 5) {
+      switch (inst->opcode) {
+      case FS_OPCODE_TEX:
+	 if (inst->shadow_compare) {
+	    msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_COMPARE;
+	 } else {
+	    msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE;
+	 }
+	 break;
+      case FS_OPCODE_TXB:
+	 if (inst->shadow_compare) {
+	    msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_BIAS_COMPARE;
+	 } else {
+	    msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_BIAS;
+	 }
+	 break;
+      case FS_OPCODE_TXL:
+	 if (inst->shadow_compare) {
+	    msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LOD_COMPARE;
+	 } else {
+	    msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LOD;
+	 }
+	 break;
+      case FS_OPCODE_TXD:
+	 assert(!"TXD isn't supported on gen5+ yet.");
+	 break;
+      }
+   } else {
+      switch (inst->opcode) {
+      case FS_OPCODE_TEX:
+	 /* Note that G45 and older determine shadow compare and dispatch
+	  * width from the message length for most messages.
+	  */
+	 assert(c->dispatch_width == 8);
+	 msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE;
+	 if (inst->shadow_compare) {
+	    assert(inst->mlen == 6);
+	 } else {
+	    assert(inst->mlen <= 4);
+	 }
+	 break;
+      case FS_OPCODE_TXB:
+	 if (inst->shadow_compare) {
+	    assert(inst->mlen == 6);
+	    msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_BIAS_COMPARE;
+	 } else {
+	    assert(inst->mlen == 9);
+	    msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_BIAS;
+	    simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16; /* whatever the dispatch width */
+	 }
+	 break;
+      case FS_OPCODE_TXL:
+	 if (inst->shadow_compare) {
+	    assert(inst->mlen == 6);
+	    msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_LOD_COMPARE;
+	 } else {
+	    assert(inst->mlen == 9);
+	    msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_LOD;
+	    simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16; /* whatever the dispatch width */
+	 }
+	 break;
+      case FS_OPCODE_TXD:
+	 assert(!"TXD isn't supported on gen4 yet.");
+	 break;
+      }
+   }
+   assert(msg_type != -1); /* TXD or an unlisted opcode leaves it unset */
+
+   if (simd_mode == BRW_SAMPLER_SIMD_MODE_SIMD16) {
+      rlen = 8;
+      dst = vec16(dst);
+   }
+
+   brw_SAMPLE(p,
+	      retype(dst, BRW_REGISTER_TYPE_UW),
+	      inst->base_mrf,
+	      src,
+              SURF_INDEX_TEXTURE(inst->sampler),
+	      inst->sampler,
+	      WRITEMASK_XYZW,
+	      msg_type,
+	      rlen,
+	      inst->mlen,
+	      0,
+	      inst->header_present,
+	      simd_mode);
+}
+
+
+/* For OPCODE_DDX and OPCODE_DDY, per channel of output we've got input
+ * looking like:
+ *
+ * arg0: ss0.tl ss0.tr ss0.bl ss0.br ss1.tl ss1.tr ss1.bl ss1.br
+ *
+ * and we're trying to produce:
+ *
+ *           DDX                     DDY
+ * dst: (ss0.tr - ss0.tl)     (ss0.tl - ss0.bl)
+ *      (ss0.tr - ss0.tl)     (ss0.tr - ss0.br)
+ *      (ss0.br - ss0.bl)     (ss0.tl - ss0.bl)
+ *      (ss0.br - ss0.bl)     (ss0.tr - ss0.br)
+ *      (ss1.tr - ss1.tl)     (ss1.tl - ss1.bl)
+ *      (ss1.tr - ss1.tl)     (ss1.tr - ss1.br)
+ *      (ss1.br - ss1.bl)     (ss1.tl - ss1.bl)
+ *      (ss1.br - ss1.bl)     (ss1.tr - ss1.br)
+ *
+ * and add another set of two more subspans if in 16-pixel dispatch mode.
+ *
+ * For DDX, it ends up being easy: width = 2, horiz=0 gets us the same result
+ * for each pair, and vertstride = 2 jumps us 2 elements after processing a
+ * pair. But for DDY, it's harder, as we want to produce the pairs swizzled
+ * between each other.  We could probably do it like ddx and swizzle the right
+ * order later, but bail for now and just produce
+ * ((ss0.tl - ss0.bl)x4 (ss1.tl - ss1.bl)x4)
+ */
+void
+fs_visitor::generate_ddx(fs_inst *inst, struct brw_reg dst, struct brw_reg src)
+{
+   /* Vertical stride 2, width 2, horizontal stride 0 repeats one element
+    * per pair: element 1 (tr) for the minuend, element 0 (tl) for the
+    * subtrahend.
+    */
+   struct brw_reg minuend =
+      brw_reg(src.file, src.nr, 1, BRW_REGISTER_TYPE_F,
+	      BRW_VERTICAL_STRIDE_2, BRW_WIDTH_2, BRW_HORIZONTAL_STRIDE_0,
+	      BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
+   struct brw_reg subtrahend =
+      brw_reg(src.file, src.nr, 0, BRW_REGISTER_TYPE_F,
+	      BRW_VERTICAL_STRIDE_2, BRW_WIDTH_2, BRW_HORIZONTAL_STRIDE_0,
+	      BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
+   brw_ADD(p, dst, minuend, negate(subtrahend));
+}
+
+void
+fs_visitor::generate_ddy(fs_inst *inst, struct brw_reg dst, struct brw_reg src)
+{
+   /* Vertical stride 4, width 4, horizontal stride 0 repeats one element
+    * per subspan: element 0 (tl) for the minuend, element 2 (bl) for the
+    * subtrahend.
+    */
+   struct brw_reg minuend =
+      brw_reg(src.file, src.nr, 0, BRW_REGISTER_TYPE_F,
+	      BRW_VERTICAL_STRIDE_4, BRW_WIDTH_4, BRW_HORIZONTAL_STRIDE_0,
+	      BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
+   struct brw_reg subtrahend =
+      brw_reg(src.file, src.nr, 2, BRW_REGISTER_TYPE_F,
+	      BRW_VERTICAL_STRIDE_4, BRW_WIDTH_4, BRW_HORIZONTAL_STRIDE_0,
+	      BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
+   brw_ADD(p, dst, minuend, negate(subtrahend));
+}
+
+void
+fs_visitor::generate_discard(fs_inst *inst)
+{
+   struct brw_reg f0 = brw_flag_reg(); /* scratch for the channel mask on both paths */
+
+   if (intel->gen >= 6) {
+      struct brw_reg g1 = retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_UW);
+      struct brw_reg some_register;
+
+      /* As of gen6, we no longer have the mask register to look at,
+       * so life gets a bit more complicated.
+       */
+
+      /* Load the flag register with all ones. */
+      brw_push_insn_state(p);
+      brw_set_mask_control(p, BRW_MASK_DISABLE);
+      brw_MOV(p, f0, brw_imm_uw(0xffff));
+      brw_pop_insn_state(p);
+
+      /* Do a comparison that should always fail, to produce 0s in the flag
+       * reg where we have active channels.
+       */
+      some_register = retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW);
+      brw_CMP(p, retype(brw_null_reg(), BRW_REGISTER_TYPE_UD),
+	      BRW_CONDITIONAL_NZ, some_register, some_register);
+
+      /* Undo CMP's whacking of predication. */
+      brw_set_predicate_control(p, BRW_PREDICATE_NONE);
+
+      brw_push_insn_state(p);
+      brw_set_mask_control(p, BRW_MASK_DISABLE);
+      brw_AND(p, g1, f0, g1); /* fold the flag result into the g1 word above */
+      brw_pop_insn_state(p);
+   } else {
+      struct brw_reg g0 = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW);
+
+      brw_push_insn_state(p);
+      brw_set_mask_control(p, BRW_MASK_DISABLE);
+      brw_set_compression_control(p, BRW_COMPRESSION_NONE);
+
+      /* Unlike gen6+, we have the mask reg here, so we just need
+       * somewhere to invert that (containing channels to be disabled)
+       * so it can be ANDed with the mask of pixels still to be
+       * written. Use the flag reg for consistency with gen6+.
+       */
+      brw_NOT(p, f0, brw_mask_reg(1)); /* IMASK */
+      brw_AND(p, g0, f0, g0);
+
+      brw_pop_insn_state(p);
+   }
+}
+
+void
+fs_visitor::generate_spill(fs_inst *inst, struct brw_reg src)
+{
+   assert(inst->mlen != 0);
+   /* src is copied as UD into the MRF after base_mrf before the write. */
+   struct brw_reg data = brw_message_reg(inst->base_mrf + 1);
+   brw_MOV(p, retype(data, BRW_REGISTER_TYPE_UD),
+	   retype(src, BRW_REGISTER_TYPE_UD));
+   brw_oword_block_write_scratch(p, brw_message_reg(inst->base_mrf), 1,
+				 inst->offset);
+}
+
+void
+fs_visitor::generate_unspill(fs_inst *inst, struct brw_reg dst)
+{
+   assert(inst->mlen != 0);
+
+   /* The MOVs to null are only emitted on gen4 parts other than G4X. */
+   const bool read_dst_around_send = intel->gen == 4 && !intel->is_g4x;
+
+   /* Reading dst first clears any post destination dependencies that
+    * the block read would otherwise ignore.  See the B-Spec for the
+    * pre-gen5 send instruction.
+    *
+    * A better fix is still needed: texture sampling and math reads
+    * could hit this too, anywhere a SEND's destination was written but
+    * not read within the last N instructions (N is unknown).  Dead
+    * code elimination makes that rare, but not impossible.
+    */
+   if (read_dst_around_send)
+      brw_MOV(p, brw_null_reg(), dst);
+
+   brw_oword_block_read_scratch(p, dst, brw_message_reg(inst->base_mrf), 1,
+				inst->offset);
+
+   /* gen4 errata: the destination of a send can't be used as a
+    * destination again until it has been read, so read it right away.
+    */
+   if (read_dst_around_send)
+      brw_MOV(p, brw_null_reg(), dst);
+}
+
+void
+fs_visitor::generate_pull_constant_load(fs_inst *inst, struct brw_reg dst)
+{
+   assert(inst->mlen != 0);
+
+   /* The MOVs to null are only emitted on gen4 parts other than G4X. */
+   const bool read_dst_around_send = intel->gen == 4 && !intel->is_g4x;
+
+   /* Reading dst first clears any post destination dependencies that
+    * the block read would otherwise ignore.  See the B-Spec for the
+    * pre-gen5 send instruction.
+    *
+    * A better fix is still needed: texture sampling and math reads
+    * could hit this too, anywhere a SEND's destination was written but
+    * not read within the last N instructions (N is unknown).  Dead
+    * code elimination makes that rare, but not impossible.
+    */
+   if (read_dst_around_send)
+      brw_MOV(p, brw_null_reg(), dst);
+
+   brw_oword_block_read(p, dst, brw_message_reg(inst->base_mrf),
+			inst->offset, SURF_INDEX_FRAG_CONST_BUFFER);
+
+   /* gen4 errata: the destination of a send can't be used as a
+    * destination again until it has been read, so read it right away.
+    */
+   if (read_dst_around_send)
+      brw_MOV(p, brw_null_reg(), dst);
+}
+
+static struct brw_reg
+brw_reg_from_fs_reg(fs_reg *reg)
+{
+   struct brw_reg hw;
+
+   switch (reg->file) {
+   case GRF:
+   case ARF:
+   case MRF:
+      /* A smear of -1 selects the whole 8-wide register; any other
+       * value picks that single element of it.
+       */
+      if (reg->smear != -1)
+	 hw = brw_vec1_reg(reg->file, reg->hw_reg, reg->smear);
+      else
+	 hw = brw_vec8_reg(reg->file, reg->hw_reg, 0);
+      hw = retype(hw, reg->type);
+      if (reg->sechalf)
+	 hw = sechalf(hw);
+      break;
+   case IMM:
+      /* Only float, signed and unsigned dword immediates are handled. */
+      switch (reg->type) {
+      case BRW_REGISTER_TYPE_F:
+	 hw = brw_imm_f(reg->imm.f);
+	 break;
+      case BRW_REGISTER_TYPE_D:
+	 hw = brw_imm_d(reg->imm.i);
+	 break;
+      case BRW_REGISTER_TYPE_UD:
+	 hw = brw_imm_ud(reg->imm.u);
+	 break;
+      default:
+	 assert(!"not reached");
+	 hw = brw_null_reg();
+	 break;
+      }
+      break;
+   case FIXED_HW_REG:
+      hw = reg->fixed_hw_reg;
+      break;
+   case BAD_FILE:
+      /* Probably unused. */
+      hw = brw_null_reg();
+      break;
+   case UNIFORM:
+   default:
+      /* Uniforms must not reach this point, nor any unknown file. */
+      assert(!"not reached");
+      hw = brw_null_reg();
+      break;
+   }
+   /* Source modifiers: abs is applied first, negate on top of it. */
+   if (reg->abs)
+      hw = brw_abs(hw);
+   if (reg->negate)
+      hw = negate(hw);
+
+   return hw;
+}
+
+void
+fs_visitor::generate_code()
+{
+   int last_native_inst = p->nr_insn; /* first native insn not yet disassembled */
+   const char *last_annotation_string = NULL;
+   ir_instruction *last_annotation_ir = NULL;
+
+   int loop_stack_array_size = 16; /* doubled in the DO case when full */
+   int loop_stack_depth = 0;
+   brw_instruction **loop_stack =
+      rzalloc_array(this->mem_ctx, brw_instruction *, loop_stack_array_size);
+   int *if_depth_in_loop = /* open IFs at each loop nesting level */
+      rzalloc_array(this->mem_ctx, int, loop_stack_array_size);
+
+   /* Emit native code for each FS LIR instruction, in list order. */
+   if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
+      printf("Native code for fragment shader %d (%d-wide dispatch):\n",
+	     prog->Name, c->dispatch_width);
+   }
+
+   foreach_iter(exec_list_iterator, iter, this->instructions) {
+      fs_inst *inst = (fs_inst *)iter.get();
+      struct brw_reg src[3], dst;
+
+      if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
+	 if (last_annotation_ir != inst->ir) {
+	    last_annotation_ir = inst->ir;
+	    if (last_annotation_ir) {
+	       printf("   ");
+	       last_annotation_ir->print();
+	       printf("\n");
+	    }
+	 }
+	 if (last_annotation_string != inst->annotation) {
+	    last_annotation_string = inst->annotation;
+	    if (last_annotation_string)
+	       printf("   %s\n", last_annotation_string);
+	 }
+      }
+
+      for (unsigned int i = 0; i < 3; i++) {
+	 src[i] = brw_reg_from_fs_reg(&inst->src[i]);
+      }
+      dst = brw_reg_from_fs_reg(&inst->dst);
+
+      brw_set_conditionalmod(p, inst->conditional_mod); /* per-instruction state */
+      brw_set_predicate_control(p, inst->predicated);
+      brw_set_predicate_inverse(p, inst->predicate_inverse);
+      brw_set_saturate(p, inst->saturate);
+
+      if (inst->force_uncompressed || c->dispatch_width == 8) {
+	 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
+      } else if (inst->force_sechalf) {
+	 brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
+      } else {
+	 brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED);
+      }
+
+      switch (inst->opcode) {
+      case BRW_OPCODE_MOV:
+	 brw_MOV(p, dst, src[0]);
+	 break;
+      case BRW_OPCODE_ADD:
+	 brw_ADD(p, dst, src[0], src[1]);
+	 break;
+      case BRW_OPCODE_MUL:
+	 brw_MUL(p, dst, src[0], src[1]);
+	 break;
+
+      case BRW_OPCODE_FRC:
+	 brw_FRC(p, dst, src[0]);
+	 break;
+      case BRW_OPCODE_RNDD:
+	 brw_RNDD(p, dst, src[0]);
+	 break;
+      case BRW_OPCODE_RNDE:
+	 brw_RNDE(p, dst, src[0]);
+	 break;
+      case BRW_OPCODE_RNDZ:
+	 brw_RNDZ(p, dst, src[0]);
+	 break;
+
+      case BRW_OPCODE_AND:
+	 brw_AND(p, dst, src[0], src[1]);
+	 break;
+      case BRW_OPCODE_OR:
+	 brw_OR(p, dst, src[0], src[1]);
+	 break;
+      case BRW_OPCODE_XOR:
+	 brw_XOR(p, dst, src[0], src[1]);
+	 break;
+      case BRW_OPCODE_NOT:
+	 brw_NOT(p, dst, src[0]);
+	 break;
+      case BRW_OPCODE_ASR:
+	 brw_ASR(p, dst, src[0], src[1]);
+	 break;
+      case BRW_OPCODE_SHR:
+	 brw_SHR(p, dst, src[0], src[1]);
+	 break;
+      case BRW_OPCODE_SHL:
+	 brw_SHL(p, dst, src[0], src[1]);
+	 break;
+
+      case BRW_OPCODE_CMP:
+	 brw_CMP(p, dst, inst->conditional_mod, src[0], src[1]);
+	 break;
+      case BRW_OPCODE_SEL:
+	 brw_SEL(p, dst, src[0], src[1]);
+	 break;
+
+      case BRW_OPCODE_IF:
+	 if (inst->src[0].file != BAD_FILE) {
+	    /* The instruction has an embedded compare (only allowed on gen6) */
+	    assert(intel->gen == 6);
+	    gen6_IF(p, inst->conditional_mod, src[0], src[1]);
+	 } else {
+	    brw_IF(p, c->dispatch_width == 16 ? BRW_EXECUTE_16 : BRW_EXECUTE_8);
+	 }
+	 if_depth_in_loop[loop_stack_depth]++;
+	 break;
+
+      case BRW_OPCODE_ELSE:
+	 brw_ELSE(p);
+	 break;
+      case BRW_OPCODE_ENDIF:
+	 brw_ENDIF(p);
+	 if_depth_in_loop[loop_stack_depth]--;
+	 break;
+
+      case BRW_OPCODE_DO:
+	 loop_stack[loop_stack_depth++] = brw_DO(p, BRW_EXECUTE_8);
+	 if (loop_stack_array_size <= loop_stack_depth) { /* grow both arrays */
+	    loop_stack_array_size *= 2;
+	    loop_stack = reralloc(this->mem_ctx, loop_stack, brw_instruction *,
+				  loop_stack_array_size);
+	    if_depth_in_loop = reralloc(this->mem_ctx, if_depth_in_loop, int,
+				        loop_stack_array_size);
+	 }
+	 if_depth_in_loop[loop_stack_depth] = 0;
+	 break;
+
+      case BRW_OPCODE_BREAK:
+	 brw_BREAK(p, if_depth_in_loop[loop_stack_depth]);
+	 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
+	 break;
+      case BRW_OPCODE_CONTINUE:
+	 /* FINISHME: We need to write the loop instruction support still. */
+	 if (intel->gen >= 6)
+	    gen6_CONT(p, loop_stack[loop_stack_depth - 1]);
+	 else
+	    brw_CONT(p, if_depth_in_loop[loop_stack_depth]);
+	 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
+	 break;
+
+      case BRW_OPCODE_WHILE: {
+	 struct brw_instruction *inst0, *inst1;
+	 GLuint br = 1; /* jump count multiplier; 2 on gen5+ */
+
+	 if (intel->gen >= 5)
+	    br = 2;
+
+	 assert(loop_stack_depth > 0);
+	 loop_stack_depth--;
+	 inst0 = inst1 = brw_WHILE(p, loop_stack[loop_stack_depth]);
+	 if (intel->gen < 6) {
+	    /* patch every BREAK/CONT since the DO whose jump count is 0 */
+	    while (inst0 > loop_stack[loop_stack_depth]) {
+	       inst0--;
+	       if (inst0->header.opcode == BRW_OPCODE_BREAK &&
+		   inst0->bits3.if_else.jump_count == 0) {
+		  inst0->bits3.if_else.jump_count = br * (inst1 - inst0 + 1);
+	    }
+	       else if (inst0->header.opcode == BRW_OPCODE_CONTINUE &&
+			inst0->bits3.if_else.jump_count == 0) {
+		  inst0->bits3.if_else.jump_count = br * (inst1 - inst0);
+	       }
+	    }
+	 }
+      }
+	 break;
+
+      case FS_OPCODE_RCP:
+      case FS_OPCODE_RSQ:
+      case FS_OPCODE_SQRT:
+      case FS_OPCODE_EXP2:
+      case FS_OPCODE_LOG2:
+      case FS_OPCODE_POW:
+      case FS_OPCODE_SIN:
+      case FS_OPCODE_COS:
+	 generate_math(inst, dst, src);
+	 break;
+      case FS_OPCODE_PIXEL_X:
+	 generate_pixel_xy(dst, true);
+	 break;
+      case FS_OPCODE_PIXEL_Y:
+	 generate_pixel_xy(dst, false);
+	 break;
+      case FS_OPCODE_CINTERP:
+	 brw_MOV(p, dst, src[0]);
+	 break;
+      case FS_OPCODE_LINTERP:
+	 generate_linterp(inst, dst, src);
+	 break;
+      case FS_OPCODE_TEX:
+      case FS_OPCODE_TXB:
+      case FS_OPCODE_TXD:
+      case FS_OPCODE_TXL:
+	 generate_tex(inst, dst, src[0]);
+	 break;
+      case FS_OPCODE_DISCARD:
+	 generate_discard(inst);
+	 break;
+      case FS_OPCODE_DDX:
+	 generate_ddx(inst, dst, src[0]);
+	 break;
+      case FS_OPCODE_DDY:
+	 generate_ddy(inst, dst, src[0]);
+	 break;
+
+      case FS_OPCODE_SPILL:
+	 generate_spill(inst, src[0]);
+	 break;
+
+      case FS_OPCODE_UNSPILL:
+	 generate_unspill(inst, dst);
+	 break;
+
+      case FS_OPCODE_PULL_CONSTANT_LOAD:
+	 generate_pull_constant_load(inst, dst);
+	 break;
+
+      case FS_OPCODE_FB_WRITE:
+	 generate_fb_write(inst);
+	 break;
+      default:
+	 if (inst->opcode < (int)ARRAY_SIZE(brw_opcodes)) {
+	    _mesa_problem(ctx, "Unsupported opcode `%s' in FS",
+			  brw_opcodes[inst->opcode].name);
+	 } else {
+	    _mesa_problem(ctx, "Unsupported opcode %d in FS", inst->opcode);
+	 }
+	 fail("unsupported opcode in FS\n");
+      }
+
+      if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
+	 for (unsigned int i = last_native_inst; i < p->nr_insn; i++) {
+	    if (0) { /* change to 1 to also print the raw dwords */
+	       printf("0x%08x 0x%08x 0x%08x 0x%08x ",
+		      ((uint32_t *)&p->store[i])[3],
+		      ((uint32_t *)&p->store[i])[2],
+		      ((uint32_t *)&p->store[i])[1],
+		      ((uint32_t *)&p->store[i])[0]);
+	    }
+	    brw_disasm(stdout, &p->store[i], intel->gen);
+	 }
+      }
+
+      last_native_inst = p->nr_insn;
+   }
+
+   if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
+      printf("\n");
+   }
+
+   ralloc_free(loop_stack);
+   ralloc_free(if_depth_in_loop);
+
+   brw_set_uip_jip(p);
+
+   /* OK, while the INTEL_DEBUG=wm above is very nice for debugging FS
+    * emit issues, it doesn't get the jump distances into the output,
+    * which is often something we want to debug.  So this is here in
+    * case you're doing that.
+    */
+   if (0) {
+      if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
+	 for (unsigned int i = 0; i < p->nr_insn; i++) {
+	    printf("0x%08x 0x%08x 0x%08x 0x%08x ",
+		   ((uint32_t *)&p->store[i])[3],
+		   ((uint32_t *)&p->store[i])[2],
+		   ((uint32_t *)&p->store[i])[1],
+		   ((uint32_t *)&p->store[i])[0]);
+	    brw_disasm(stdout, &p->store[i], intel->gen);
+	 }
+      }
+   }
+}
diff --git a/src/mesa/drivers/dri/i965/brw_fs_schedule_instructions.cpp b/src/mesa/drivers/dri/i965/brw_fs_schedule_instructions.cpp
index fb1192c810a..d8218c26edb 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_schedule_instructions.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_schedule_instructions.cpp
@@ -140,6 +140,7 @@ public:
    }
    void add_barrier_deps(schedule_node *n);
    void add_dep(schedule_node *before, schedule_node *after, int latency);
+   void add_dep(schedule_node *before, schedule_node *after);
 
    void add_inst(fs_inst *inst);
    void calculate_deps();
@@ -210,6 +211,15 @@ instruction_scheduler::add_dep(schedule_node *before, schedule_node *after,
    after->parent_count++;
 }
 
+void
+instruction_scheduler::add_dep(schedule_node *before, schedule_node *after)
+{
+   /* Depend with before's latency; a NULL node means nothing to track. */
+   if (!before || !after)
+      return;
+   add_dep(before, after, before->latency);
+}
+
 /**
  * Sometimes we really want this node to execute after everything that
  * was before it and before everything that followed it.  This adds
@@ -253,6 +263,12 @@ instruction_scheduler::calculate_deps()
    schedule_node *last_grf_write[virtual_grf_count];
    schedule_node *last_mrf_write[BRW_MAX_MRF];
    schedule_node *last_conditional_mod = NULL;
+   /* Fixed HW registers are assumed to be separate from the virtual
+    * GRFs, so they can be tracked separately.  We don't really write
+    * to fixed GRFs much, so don't bother tracking them on a more
+    * granular level.
+    */
+   schedule_node *last_fixed_grf_write = NULL;
 
    /* The last instruction always needs to still be the last
     * instruction.  Either it's flow control (IF, ELSE, ENDIF, DO,
@@ -274,10 +290,11 @@ instruction_scheduler::calculate_deps()
       /* read-after-write deps. */
       for (int i = 0; i < 3; i++) {
 	 if (inst->src[i].file == GRF) {
-	    if (last_grf_write[inst->src[i].reg]) {
-	       add_dep(last_grf_write[inst->src[i].reg], n,
-		       last_grf_write[inst->src[i].reg]->latency);
-	    }
+	    add_dep(last_grf_write[inst->src[i].reg], n);
+	 } else if (inst->src[i].file == FIXED_HW_REG &&
+		    (inst->src[i].fixed_hw_reg.file ==
+		     BRW_GENERAL_REGISTER_FILE)) {
+	    add_dep(last_fixed_grf_write, n);
 	 } else if (inst->src[i].file != BAD_FILE &&
 		    inst->src[i].file != IMM &&
 		    inst->src[i].file != UNIFORM) {
@@ -291,53 +308,41 @@ instruction_scheduler::calculate_deps()
 	  * instruction once it's sent, not when the result comes
 	  * back.
 	  */
-	 if (last_mrf_write[inst->base_mrf + i]) {
-	    add_dep(last_mrf_write[inst->base_mrf + i], n,
-		    last_mrf_write[inst->base_mrf + i]->latency);
-	 }
+	 add_dep(last_mrf_write[inst->base_mrf + i], n);
       }
 
       if (inst->predicated) {
 	 assert(last_conditional_mod);
-	 add_dep(last_conditional_mod, n, last_conditional_mod->latency);
+	 add_dep(last_conditional_mod, n);
       }
 
       /* write-after-write deps. */
       if (inst->dst.file == GRF) {
-	 if (last_grf_write[inst->dst.reg]) {
-	    add_dep(last_grf_write[inst->dst.reg], n,
-		    last_grf_write[inst->dst.reg]->latency);
-	 }
+	 add_dep(last_grf_write[inst->dst.reg], n);
 	 last_grf_write[inst->dst.reg] = n;
       } else if (inst->dst.file == MRF) {
 	 int reg = inst->dst.hw_reg & ~BRW_MRF_COMPR4;
 
-	 if (last_mrf_write[reg]) {
-	    add_dep(last_mrf_write[reg], n,
-		    last_mrf_write[reg]->latency);
-	 }
+	 add_dep(last_mrf_write[reg], n);
 	 last_mrf_write[reg] = n;
 	 if (is_compressed(inst)) {
 	    if (inst->dst.hw_reg & BRW_MRF_COMPR4)
 	       reg += 4;
 	    else
 	       reg++;
-	    if (last_mrf_write[reg]) {
-	       add_dep(last_mrf_write[reg], n,
-		       last_mrf_write[reg]->latency);
-	    }
+	    add_dep(last_mrf_write[reg], n);
 	    last_mrf_write[reg] = n;
 	 }
+      } else if (inst->dst.file == FIXED_HW_REG &&
+		 inst->dst.fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
+	 last_fixed_grf_write = n;
       } else if (inst->dst.file != BAD_FILE) {
 	 add_barrier_deps(n);
       }
 
       if (inst->mlen > 0) {
 	 for (int i = 0; i < v->implied_mrf_writes(inst); i++) {
-	    if (last_mrf_write[inst->base_mrf + i]) {
-	       add_dep(last_mrf_write[inst->base_mrf + i], n,
-		       last_mrf_write[inst->base_mrf + i]->latency);
-	    }
+	    add_dep(last_mrf_write[inst->base_mrf + i], n);
 	    last_mrf_write[inst->base_mrf + i] = n;
 	 }
       }
@@ -352,6 +357,7 @@ instruction_scheduler::calculate_deps()
    memset(last_grf_write, 0, sizeof(last_grf_write));
    memset(last_mrf_write, 0, sizeof(last_mrf_write));
    last_conditional_mod = NULL;
+   last_fixed_grf_write = NULL;
 
    exec_node *node;
    exec_node *prev;
@@ -364,9 +370,11 @@ instruction_scheduler::calculate_deps()
       /* write-after-read deps. */
       for (int i = 0; i < 3; i++) {
 	 if (inst->src[i].file == GRF) {
-	    if (last_grf_write[inst->src[i].reg]) {
-	       add_dep(n, last_grf_write[inst->src[i].reg], n->latency);
-	    }
+	    add_dep(n, last_grf_write[inst->src[i].reg]);
+	 } else if (inst->src[i].file == FIXED_HW_REG &&
+		    (inst->src[i].fixed_hw_reg.file ==
+		     BRW_GENERAL_REGISTER_FILE)) {
+	    add_dep(n, last_fixed_grf_write);
 	 } else if (inst->src[i].file != BAD_FILE &&
 		    inst->src[i].file != IMM &&
 		    inst->src[i].file != UNIFORM) {
@@ -384,9 +392,7 @@ instruction_scheduler::calculate_deps()
       }
 
       if (inst->predicated) {
-	 if (last_conditional_mod) {
-	    add_dep(n, last_conditional_mod, n->latency);
-	 }
+	 add_dep(n, last_conditional_mod);
       }
 
       /* Update the things this instruction wrote, so earlier reads
@@ -407,6 +413,9 @@ instruction_scheduler::calculate_deps()
 
 	    last_mrf_write[reg] = n;
 	 }
+      } else if (inst->dst.file == FIXED_HW_REG &&
+		 inst->dst.fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
+	 last_fixed_grf_write = n;
       } else if (inst->dst.file != BAD_FILE) {
 	 add_barrier_deps(n);
       }
diff --git a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
new file mode 100644
index 00000000000..b4857871c78
--- /dev/null
+++ b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
@@ -0,0 +1,1734 @@
+/*
+ * Copyright © 2010 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+/** @file brw_fs_visitor.cpp
+ *
+ * This file supports generating the FS LIR from the GLSL IR.  The LIR
+ * makes it easier to do backend-specific optimizations than doing so
+ * in the GLSL IR or in the native code.
+ */
+extern "C" {
+
+#include <sys/types.h>
+
+#include "main/macros.h"
+#include "main/shaderobj.h"
+#include "main/uniforms.h"
+#include "program/prog_parameter.h"
+#include "program/prog_print.h"
+#include "program/prog_optimize.h"
+#include "program/register_allocate.h"
+#include "program/sampler.h"
+#include "program/hash_table.h"
+#include "brw_context.h"
+#include "brw_eu.h"
+#include "brw_wm.h"
+}
+#include "brw_shader.h"
+#include "brw_fs.h"
+#include "../glsl/glsl_types.h"
+#include "../glsl/ir_optimization.h"
+#include "../glsl/ir_print_visitor.h"
+
+/**
+ * Give a variable declaration its backing storage and register that storage
+ * in variable_ht.  Variables that already have storage are left untouched.
+ *
+ * Inputs get their storage from the interpolation emitters, uniforms get a
+ * UNIFORM-file register, and everything else gets a freshly allocated
+ * register sized by the variable's type.
+ */
+void
+fs_visitor::visit(ir_variable *ir)
+{
+   if (variable_storage(ir) != NULL)
+      return;
+
+   /* Keep track of the fragment output built-ins as they are declared. */
+   if (!strcmp(ir->name, "gl_FragColor")) {
+      this->frag_color = ir;
+   } else if (!strcmp(ir->name, "gl_FragData")) {
+      this->frag_data = ir;
+   } else if (!strcmp(ir->name, "gl_FragDepth")) {
+      this->frag_depth = ir;
+   }
+
+   fs_reg *storage = NULL;
+
+   if (ir->mode == ir_var_in) {
+      if (strcmp(ir->name, "gl_FragCoord") == 0) {
+	 storage = emit_fragcoord_interpolation(ir);
+      } else if (strcmp(ir->name, "gl_FrontFacing") == 0) {
+	 storage = emit_frontfacing_interpolation(ir);
+      } else {
+	 storage = emit_general_interpolation(ir);
+      }
+      assert(storage);
+      hash_table_insert(this->variable_ht, storage, ir);
+      return;
+   }
+
+   if (ir->mode == ir_var_uniform) {
+      /* Captured before the setup calls below run; presumably they append
+       * this uniform's values after this index -- confirm against
+       * setup_uniform_values().
+       */
+      const int first_param = c->prog_data.nr_params;
+
+      if (c->dispatch_width == 16) {
+	 if (variable_storage(ir) == NULL) {
+	    fail("Failed to find uniform '%s' in 16-wide\n", ir->name);
+	 }
+	 return;
+      }
+
+      if (strncmp(ir->name, "gl_", 3) == 0) {
+	 setup_builtin_uniform_values(ir);
+      } else {
+	 setup_uniform_values(ir->location, ir->type);
+      }
+
+      storage = new(this->mem_ctx) fs_reg(UNIFORM, first_param);
+      storage->type = brw_type_for_base_type(ir->type);
+   }
+
+   /* Anything not handled above lives in an ordinary allocated register. */
+   if (storage == NULL)
+      storage = new(this->mem_ctx) fs_reg(this, ir->type);
+
+   hash_table_insert(this->variable_ht, storage, ir);
+}
+
+/**
+ * Resolve a variable reference to a copy of the storage recorded for the
+ * variable.  The variable must already have storage.
+ */
+void
+fs_visitor::visit(ir_dereference_variable *ir)
+{
+   const fs_reg *storage = variable_storage(ir->var);
+
+   this->result = *storage;
+}
+
+/**
+ * Resolve a structure field access: evaluate the record, then advance the
+ * result past every field declared before the named one and retype it to
+ * the field's type.
+ */
+void
+fs_visitor::visit(ir_dereference_record *ir)
+{
+   const glsl_type *record_type = ir->record->type;
+
+   ir->record->accept(this);
+
+   /* Add up the sizes of the fields that precede the one being accessed. */
+   unsigned int skipped = 0;
+   unsigned int f = 0;
+   while (f < record_type->length &&
+	  strcmp(record_type->fields.structure[f].name, ir->field) != 0) {
+      skipped += type_size(record_type->fields.structure[f].type);
+      f++;
+   }
+
+   this->result.reg_offset += skipped;
+   this->result.type = brw_type_for_base_type(ir->type);
+}
+
+/**
+ * Resolve an array element access.
+ *
+ * The array is evaluated first; for a constant index the result is advanced
+ * by index * element size and retyped to the element type.  Non-constant
+ * indices are not implemented: besides the debug assertion, the compile is
+ * flagged through fail() so that builds with assertions disabled do not
+ * silently treat the access as element 0.
+ */
+void
+fs_visitor::visit(ir_dereference_array *ir)
+{
+   ir_constant *index;
+   int element_size;
+
+   ir->array->accept(this);
+   index = ir->array_index->as_constant();
+
+   element_size = type_size(ir->type);
+   this->result.type = brw_type_for_base_type(ir->type);
+
+   if (index) {
+      assert(this->result.file == UNIFORM ||
+	     (this->result.file == GRF &&
+	      this->result.reg != 0));
+      this->result.reg_offset += index->value.i[0] * element_size;
+   } else {
+      assert(!"FINISHME: non-constant array element");
+      fail("FINISHME: non-constant array element\n");
+   }
+}
+
+/* Instruction selection: when the expression is a clamp of a value to
+ * [0, 1] (MIN(MAX(val, 0), 1)), emit a single saturating MOV instead.
+ *
+ * Returns true if the MOV.sat was emitted and this->result holds the
+ * clamped value; returns false, having emitted nothing, otherwise.
+ */
+bool
+fs_visitor::try_emit_saturate(ir_expression *ir)
+{
+   ir_rvalue *clamped = ir->as_rvalue_to_saturate();
+   if (clamped == NULL)
+      return false;
+
+   /* Evaluate the value being clamped into storage of its own. */
+   this->result = reg_undef;
+   clamped->accept(this);
+   const fs_reg unclamped = this->result;
+
+   this->result = fs_reg(this, ir->type);
+   emit(BRW_OPCODE_MOV, this->result, unclamped)->saturate = true;
+
+   return true;
+}
+
+/**
+ * Emit LIR for a scalar expression with at most two operands.
+ *
+ * Each operand is visited with this->result reset to reg_undef.  If the
+ * caller left a register in this->result on entry, most operations write
+ * their value there; otherwise a temporary is allocated.  Negate and abs
+ * return the (modified) operand itself, and sign/min/max allocate a fresh
+ * destination so that it cannot alias an operand.  Vector and matrix
+ * operands must already have been broken down to scalars.
+ */
+void
+fs_visitor::visit(ir_expression *ir)
+{
+   unsigned int operand;
+   fs_reg op[2], temp;
+   fs_inst *inst;
+
+   assert(ir->get_num_operands() <= 2);
+
+   if (try_emit_saturate(ir))
+      return;
+
+   /* This is where our caller would like us to put the result, if possible. */
+   fs_reg saved_result_storage = this->result;
+
+   for (operand = 0; operand < ir->get_num_operands(); operand++) {
+      this->result = reg_undef;
+      ir->operands[operand]->accept(this);
+      if (this->result.file == BAD_FILE) {
+	 ir_print_visitor v;
+	 fail("Failed to get tree for expression operand:\n");
+	 ir->operands[operand]->accept(&v);
+      }
+      op[operand] = this->result;
+
+      /* Matrix expression operands should have been broken down to vector
+       * operations already.
+       */
+      assert(!ir->operands[operand]->type->is_matrix());
+      /* And then those vector operands should have been broken down to scalar.
+       */
+      assert(!ir->operands[operand]->type->is_vector());
+   }
+
+   /* Inherit storage from our parent if possible, and otherwise we
+    * alloc a temporary.
+    */
+   if (saved_result_storage.file == BAD_FILE) {
+      this->result = fs_reg(this, ir->type);
+   } else {
+      this->result = saved_result_storage;
+   }
+
+   switch (ir->operation) {
+   case ir_unop_logic_not:
+      /* Note that BRW_OPCODE_NOT is not appropriate here, since it is
+       * ones complement of the whole register, not just bit 0.
+       */
+      emit(BRW_OPCODE_XOR, this->result, op[0], fs_reg(1));
+      break;
+   case ir_unop_neg:
+      op[0].negate = !op[0].negate;
+      this->result = op[0];
+      break;
+   case ir_unop_abs:
+      op[0].abs = true;
+      op[0].negate = false;
+      this->result = op[0];
+      break;
+   case ir_unop_sign:
+      /* Unalias the destination.  (imagine a = sign(a)) */
+      this->result = fs_reg(this, ir->type);
+
+      /* Start at 0.0, then overwrite with +1.0 or -1.0 under the predicate
+       * set by each comparison against zero.
+       */
+      emit(BRW_OPCODE_MOV, this->result, fs_reg(0.0f));
+
+      inst = emit(BRW_OPCODE_CMP, reg_null_f, op[0], fs_reg(0.0f));
+      inst->conditional_mod = BRW_CONDITIONAL_G;
+      inst = emit(BRW_OPCODE_MOV, this->result, fs_reg(1.0f));
+      inst->predicated = true;
+
+      inst = emit(BRW_OPCODE_CMP, reg_null_f, op[0], fs_reg(0.0f));
+      inst->conditional_mod = BRW_CONDITIONAL_L;
+      inst = emit(BRW_OPCODE_MOV, this->result, fs_reg(-1.0f));
+      inst->predicated = true;
+
+      break;
+   case ir_unop_rcp:
+      emit_math(FS_OPCODE_RCP, this->result, op[0]);
+      break;
+
+   case ir_unop_exp2:
+      emit_math(FS_OPCODE_EXP2, this->result, op[0]);
+      break;
+   case ir_unop_log2:
+      emit_math(FS_OPCODE_LOG2, this->result, op[0]);
+      break;
+   case ir_unop_exp:
+   case ir_unop_log:
+      assert(!"not reached: should be handled by ir_explog_to_explog2");
+      break;
+   case ir_unop_sin:
+   case ir_unop_sin_reduced:
+      emit_math(FS_OPCODE_SIN, this->result, op[0]);
+      break;
+   case ir_unop_cos:
+   case ir_unop_cos_reduced:
+      emit_math(FS_OPCODE_COS, this->result, op[0]);
+      break;
+
+   case ir_unop_dFdx:
+      emit(FS_OPCODE_DDX, this->result, op[0]);
+      break;
+   case ir_unop_dFdy:
+      emit(FS_OPCODE_DDY, this->result, op[0]);
+      break;
+
+   case ir_binop_add:
+      emit(BRW_OPCODE_ADD, this->result, op[0], op[1]);
+      break;
+   case ir_binop_sub:
+      assert(!"not reached: should be handled by ir_sub_to_add_neg");
+      break;
+
+   case ir_binop_mul:
+      emit(BRW_OPCODE_MUL, this->result, op[0], op[1]);
+      break;
+   case ir_binop_div:
+      assert(!"not reached: should be handled by ir_div_to_mul_rcp");
+      break;
+   case ir_binop_mod:
+      assert(!"ir_binop_mod should have been converted to b * fract(a/b)");
+      break;
+
+   case ir_binop_less:
+   case ir_binop_greater:
+   case ir_binop_lequal:
+   case ir_binop_gequal:
+   case ir_binop_equal:
+   case ir_binop_all_equal:
+   case ir_binop_nequal:
+   case ir_binop_any_nequal:
+      temp = this->result;
+      /* original gen4 does implicit conversion before comparison. */
+      if (intel->gen < 5)
+	 temp.type = op[0].type;
+
+      inst = emit(BRW_OPCODE_CMP, temp, op[0], op[1]);
+      inst->conditional_mod = brw_conditional_for_comparison(ir->operation);
+      /* Keep only bit 0 of the comparison result. */
+      emit(BRW_OPCODE_AND, this->result, this->result, fs_reg(0x1));
+      break;
+
+   case ir_binop_logic_xor:
+      emit(BRW_OPCODE_XOR, this->result, op[0], op[1]);
+      break;
+
+   case ir_binop_logic_or:
+      emit(BRW_OPCODE_OR, this->result, op[0], op[1]);
+      break;
+
+   case ir_binop_logic_and:
+      emit(BRW_OPCODE_AND, this->result, op[0], op[1]);
+      break;
+
+   case ir_binop_dot:
+   case ir_unop_any:
+      assert(!"not reached: should be handled by brw_fs_channel_expressions");
+      break;
+
+   case ir_unop_noise:
+      assert(!"not reached: should be handled by lower_noise");
+      break;
+
+   case ir_quadop_vector:
+      assert(!"not reached: should be handled by lower_quadop_vector");
+      break;
+
+   case ir_unop_sqrt:
+      emit_math(FS_OPCODE_SQRT, this->result, op[0]);
+      break;
+
+   case ir_unop_rsq:
+      emit_math(FS_OPCODE_RSQ, this->result, op[0]);
+      break;
+
+   case ir_unop_i2f:
+   case ir_unop_b2f:
+   case ir_unop_b2i:
+   case ir_unop_f2i:
+      emit(BRW_OPCODE_MOV, this->result, op[0]);
+      break;
+   case ir_unop_f2b:
+   case ir_unop_i2b:
+      temp = this->result;
+      /* original gen4 does implicit conversion before comparison. */
+      if (intel->gen < 5)
+	 temp.type = op[0].type;
+
+      inst = emit(BRW_OPCODE_CMP, temp, op[0], fs_reg(0.0f));
+      inst->conditional_mod = BRW_CONDITIONAL_NZ;
+      inst = emit(BRW_OPCODE_AND, this->result, this->result, fs_reg(1));
+      break;
+
+   case ir_unop_trunc:
+      emit(BRW_OPCODE_RNDZ, this->result, op[0]);
+      break;
+   case ir_unop_ceil:
+      /* ceil(x) is computed as -floor(-x). */
+      op[0].negate = !op[0].negate;
+      inst = emit(BRW_OPCODE_RNDD, this->result, op[0]);
+      this->result.negate = true;
+      break;
+   case ir_unop_floor:
+      inst = emit(BRW_OPCODE_RNDD, this->result, op[0]);
+      break;
+   case ir_unop_fract:
+      inst = emit(BRW_OPCODE_FRC, this->result, op[0]);
+      break;
+   case ir_unop_round_even:
+      emit(BRW_OPCODE_RNDE, this->result, op[0]);
+      break;
+
+   case ir_binop_min:
+      if (intel->gen >= 6) {
+	 inst = emit(BRW_OPCODE_SEL, this->result, op[0], op[1]);
+	 inst->conditional_mod = BRW_CONDITIONAL_L;
+      } else {
+	 /* Unalias the destination */
+	 this->result = fs_reg(this, ir->type);
+
+	 inst = emit(BRW_OPCODE_CMP, this->result, op[0], op[1]);
+	 inst->conditional_mod = BRW_CONDITIONAL_L;
+
+	 inst = emit(BRW_OPCODE_SEL, this->result, op[0], op[1]);
+	 inst->predicated = true;
+      }
+      break;
+   case ir_binop_max:
+      if (intel->gen >= 6) {
+	 inst = emit(BRW_OPCODE_SEL, this->result, op[0], op[1]);
+	 inst->conditional_mod = BRW_CONDITIONAL_GE;
+      } else {
+	 /* Unalias the destination */
+	 this->result = fs_reg(this, ir->type);
+
+	 inst = emit(BRW_OPCODE_CMP, this->result, op[0], op[1]);
+	 inst->conditional_mod = BRW_CONDITIONAL_G;
+
+	 inst = emit(BRW_OPCODE_SEL, this->result, op[0], op[1]);
+	 inst->predicated = true;
+      }
+      break;
+
+   case ir_binop_pow:
+      emit_math(FS_OPCODE_POW, this->result, op[0], op[1]);
+      break;
+
+   case ir_unop_bit_not:
+      inst = emit(BRW_OPCODE_NOT, this->result, op[0]);
+      break;
+   case ir_binop_bit_and:
+      inst = emit(BRW_OPCODE_AND, this->result, op[0], op[1]);
+      break;
+   case ir_binop_bit_xor:
+      inst = emit(BRW_OPCODE_XOR, this->result, op[0], op[1]);
+      break;
+   case ir_binop_bit_or:
+      inst = emit(BRW_OPCODE_OR, this->result, op[0], op[1]);
+      break;
+
+   case ir_unop_u2f:
+   case ir_binop_lshift:
+   case ir_binop_rshift:
+      assert(!"GLSL 1.30 features unsupported");
+      break;
+   }
+}
+
+/**
+ * Copy a value of the given type from r to l, one register at a time,
+ * recursing through arrays and structures.
+ *
+ * Both registers are passed by reference and are advanced past the data
+ * that was copied, which is what lets the recursive calls for successive
+ * array elements / structure fields continue where the previous one stopped.
+ * When not predicated, a MOV is skipped if source and destination are equal.
+ */
+void
+fs_visitor::emit_assignment_writes(fs_reg &l, fs_reg &r,
+				   const glsl_type *type, bool predicated)
+{
+   switch (type->base_type) {
+   case GLSL_TYPE_FLOAT:
+   case GLSL_TYPE_UINT:
+   case GLSL_TYPE_INT:
+   case GLSL_TYPE_BOOL:
+      for (unsigned int chan = 0; chan < type->components(); chan++) {
+	 l.type = brw_type_for_base_type(type);
+	 r.type = brw_type_for_base_type(type);
+
+	 /* An unpredicated copy of a register onto itself does nothing. */
+	 const bool is_noop = !predicated && l.equals(&r);
+	 if (!is_noop)
+	    emit(BRW_OPCODE_MOV, l, r)->predicated = predicated;
+
+	 l.reg_offset++;
+	 r.reg_offset++;
+      }
+      break;
+
+   case GLSL_TYPE_ARRAY:
+      for (unsigned int elem = 0; elem < type->length; elem++)
+	 emit_assignment_writes(l, r, type->fields.array, predicated);
+      break;
+
+   case GLSL_TYPE_STRUCT:
+      for (unsigned int field = 0; field < type->length; field++) {
+	 const glsl_type *field_type = type->fields.structure[field].type;
+
+	 emit_assignment_writes(l, r, field_type, predicated);
+      }
+      break;
+
+   case GLSL_TYPE_SAMPLER:
+      /* Nothing is copied for samplers. */
+      break;
+
+   default:
+      assert(!"not reached");
+      break;
+   }
+}
+
+/**
+ * Emit the writes for an assignment.
+ *
+ * The LHS is evaluated first.  For an unconditional write of a scalar or of
+ * every channel of a vector, the LHS register is left in this->result so an
+ * RHS expression can write straight into it; otherwise the RHS is told to
+ * use its own storage.  Conditional assignments load the condition into the
+ * flag and predicate every MOV.
+ */
+void
+fs_visitor::visit(ir_assignment *ir)
+{
+   const glsl_type *lhs_type = ir->lhs->type;
+   const bool conditional = ir->condition != NULL;
+
+   /* FINISHME: arrays on the lhs */
+   this->result = reg_undef;
+   ir->lhs->accept(this);
+   fs_reg l = this->result;
+
+   /* Only a direct, complete write may let the RHS drop its value right
+    * into our destination.
+    */
+   const bool whole_vector =
+      lhs_type->is_vector() &&
+      ir->write_mask == (1 << lhs_type->vector_elements) - 1;
+   const bool direct = !conditional &&
+      (lhs_type->is_scalar() || whole_vector);
+   if (!direct)
+      this->result = reg_undef;
+
+   ir->rhs->accept(this);
+   fs_reg r = this->result;
+
+   assert(l.file != BAD_FILE);
+   assert(r.file != BAD_FILE);
+
+   if (conditional)
+      emit_bool_to_cond_code(ir->condition);
+
+   if (!lhs_type->is_scalar() && !lhs_type->is_vector()) {
+      emit_assignment_writes(l, r, lhs_type, conditional);
+      return;
+   }
+
+   /* The destination advances for every channel; the source only advances
+    * for channels that are actually written.
+    */
+   for (int chan = 0; chan < lhs_type->vector_elements;
+	chan++, l.reg_offset++) {
+      if (!(ir->write_mask & (1 << chan)))
+	 continue;
+
+      if (conditional || !l.equals(&r)) {
+	 fs_inst *mov = emit(BRW_OPCODE_MOV, l, r);
+	 if (conditional)
+	    mov->predicated = true;
+      }
+
+      r.reg_offset++;
+   }
+}
+
+/**
+ * Build the sampler message for original gen4 and emit the texture
+ * instruction.
+ *
+ * The message starts at MRF base_mrf (1) with a one-register g0 header,
+ * followed by the parameters.  Shadow compares and plain ir_tex use the
+ * SIMD8 layout; non-shadow ir_txb/ir_txl have to use a SIMD16 message, in
+ * which case the result is sampled into a temporary and the useful half is
+ * copied back into the caller's dst.
+ *
+ * \param ir          texture operation being compiled
+ * \param dst         register receiving the four result channels
+ * \param coordinate  first component of the evaluated coordinate
+ * \param sampler     sampler number, tested against c->key.gl_clamp_mask
+ * \return the emitted texture instruction
+ */
+fs_inst *
+fs_visitor::emit_texture_gen4(ir_texture *ir, fs_reg dst, fs_reg coordinate,
+			      int sampler)
+{
+   int mlen;
+   int base_mrf = 1;
+   bool simd16 = false;
+   fs_reg orig_dst;
+
+   /* g0 header. */
+   mlen = 1;
+
+   if (ir->shadow_comparitor) {
+      for (int i = 0; i < ir->coordinate->type->vector_elements; i++) {
+	 fs_inst *inst = emit(BRW_OPCODE_MOV,
+			      fs_reg(MRF, base_mrf + mlen + i), coordinate);
+	 if (i < 3 && c->key.gl_clamp_mask[i] & (1 << sampler))
+	    inst->saturate = true;
+
+	 coordinate.reg_offset++;
+      }
+      /* gen4's SIMD8 sampler always has the slots for u,v,r present. */
+      mlen += 3;
+
+      if (ir->op == ir_tex) {
+	 /* There's no plain shadow compare message, so we use shadow
+	  * compare with a bias of 0.0.
+	  */
+	 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), fs_reg(0.0f));
+	 mlen++;
+      } else if (ir->op == ir_txb) {
+	 this->result = reg_undef;
+	 ir->lod_info.bias->accept(this);
+	 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), this->result);
+	 mlen++;
+      } else {
+	 assert(ir->op == ir_txl);
+	 this->result = reg_undef;
+	 ir->lod_info.lod->accept(this);
+	 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), this->result);
+	 mlen++;
+      }
+
+      /* The reference value goes last, after the bias/LOD slot. */
+      this->result = reg_undef;
+      ir->shadow_comparitor->accept(this);
+      emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), this->result);
+      mlen++;
+   } else if (ir->op == ir_tex) {
+      for (int i = 0; i < ir->coordinate->type->vector_elements; i++) {
+	 fs_inst *inst = emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen + i),
+			      coordinate);
+	 if (i < 3 && c->key.gl_clamp_mask[i] & (1 << sampler))
+	    inst->saturate = true;
+	 coordinate.reg_offset++;
+      }
+      /* gen4's SIMD8 sampler always has the slots for u,v,r present. */
+      mlen += 3;
+   } else if (ir->op == ir_txd) {
+      /* NOTE(review): with assertions compiled out, execution continues and
+       * a TXD with only the header (mlen == 1) is emitted below.
+       */
+      assert(!"TXD isn't supported on gen4 yet.");
+   } else {
+      /* Oh joy.  gen4 doesn't have SIMD8 non-shadow-compare bias/lod
+       * instructions.  We'll need to do SIMD16 here.
+       */
+      assert(ir->op == ir_txb || ir->op == ir_txl);
+
+      /* Each SIMD16 parameter spans two MRFs; only the first of each pair
+       * is written here.
+       */
+      for (int i = 0; i < ir->coordinate->type->vector_elements; i++) {
+	 fs_inst *inst = emit(BRW_OPCODE_MOV, fs_reg(MRF,
+						     base_mrf + mlen + i * 2),
+			      coordinate);
+	 if (i < 3 && c->key.gl_clamp_mask[i] & (1 << sampler))
+	    inst->saturate = true;
+	 coordinate.reg_offset++;
+      }
+
+      /* lod/bias appears after u/v/r. */
+      mlen += 6;
+
+      if (ir->op == ir_txb) {
+	 this->result = reg_undef;
+	 ir->lod_info.bias->accept(this);
+	 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), this->result);
+	 mlen++;
+      } else {
+	 this->result = reg_undef;
+	 ir->lod_info.lod->accept(this);
+	 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), this->result);
+	 mlen++;
+      }
+
+      /* The unused upper half. */
+      mlen++;
+
+      /* Now, since we're doing simd16, the return is 2 interleaved
+       * vec4s where the odd-indexed ones are junk. We'll need to move
+       * this weirdness around to the expected layout.
+       */
+      simd16 = true;
+      orig_dst = dst;
+      dst = fs_reg(this, glsl_type::get_array_instance(glsl_type::vec4_type,
+						       2));
+      dst.type = BRW_REGISTER_TYPE_F;
+   }
+
+   /* NOTE(review): for ir_txf, inst stays NULL and is dereferenced right
+    * after the switch when assertions are compiled out.
+    */
+   fs_inst *inst = NULL;
+   switch (ir->op) {
+   case ir_tex:
+      inst = emit(FS_OPCODE_TEX, dst);
+      break;
+   case ir_txb:
+      inst = emit(FS_OPCODE_TXB, dst);
+      break;
+   case ir_txl:
+      inst = emit(FS_OPCODE_TXL, dst);
+      break;
+   case ir_txd:
+      inst = emit(FS_OPCODE_TXD, dst);
+      break;
+   case ir_txf:
+      assert(!"GLSL 1.30 features unsupported");
+      break;
+   }
+   inst->base_mrf = base_mrf;
+   inst->mlen = mlen;
+   inst->header_present = true;
+
+   if (simd16) {
+      /* Copy every second register of the temporary (the even-indexed,
+       * valid ones) into the four registers of the caller's destination.
+       */
+      for (int i = 0; i < 4; i++) {
+	 emit(BRW_OPCODE_MOV, orig_dst, dst);
+	 orig_dst.reg_offset++;
+	 dst.reg_offset += 2;
+      }
+   }
+
+   return inst;
+}
+
+/* gen5's sampler has slots for u, v, r, array index, then optional
+ * parameters like shadow comparitor or LOD bias.  If optional
+ * parameters aren't present, those base slots are optional and don't
+ * need to be included in the message.
+ *
+ * We don't fill in the unnecessary slots regardless, which may look
+ * surprising in the disassembly.
+ *
+ * The message is headerless unless ir->offset is set, in which case the
+ * header takes one MRF and the message starts one register earlier.  Each
+ * parameter occupies reg_width MRFs (dispatch_width / 8).
+ *
+ * Returns the emitted texture instruction; fail() is called if the message
+ * ends up longer than 11 registers.
+ */
+fs_inst *
+fs_visitor::emit_texture_gen5(ir_texture *ir, fs_reg dst, fs_reg coordinate,
+			      int sampler)
+{
+   int mlen = 0;
+   int base_mrf = 2;
+   int reg_width = c->dispatch_width / 8;
+   bool header_present = false;
+
+   if (ir->offset) {
+      /* The offsets set up by the ir_texture visitor are in the
+       * m1 header, so we can't go headerless.
+       */
+      header_present = true;
+      mlen++;
+      base_mrf--;
+   }
+
+   for (int i = 0; i < ir->coordinate->type->vector_elements; i++) {
+      fs_inst *inst = emit(BRW_OPCODE_MOV,
+			   fs_reg(MRF, base_mrf + mlen + i * reg_width),
+			   coordinate);
+      if (i < 3 && c->key.gl_clamp_mask[i] & (1 << sampler))
+	 inst->saturate = true;
+      coordinate.reg_offset++;
+   }
+   mlen += ir->coordinate->type->vector_elements * reg_width;
+
+   if (ir->shadow_comparitor) {
+      /* Skip ahead so the optional parameter lands after the four base
+       * slots (plus the header, if any).
+       */
+      mlen = MAX2(mlen, header_present + 4 * reg_width);
+
+      this->result = reg_undef;
+      ir->shadow_comparitor->accept(this);
+      emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), this->result);
+      mlen += reg_width;
+   }
+
+   /* NOTE(review): for ir_txd and ir_txf, inst stays NULL and is
+    * dereferenced right after the switch when assertions are compiled out.
+    */
+   fs_inst *inst = NULL;
+   switch (ir->op) {
+   case ir_tex:
+      inst = emit(FS_OPCODE_TEX, dst);
+      break;
+   case ir_txb:
+      this->result = reg_undef;
+      ir->lod_info.bias->accept(this);
+      mlen = MAX2(mlen, header_present + 4 * reg_width);
+      emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), this->result);
+      mlen += reg_width;
+
+      inst = emit(FS_OPCODE_TXB, dst);
+
+      break;
+   case ir_txl:
+      this->result = reg_undef;
+      ir->lod_info.lod->accept(this);
+      mlen = MAX2(mlen, header_present + 4 * reg_width);
+      emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), this->result);
+      mlen += reg_width;
+
+      inst = emit(FS_OPCODE_TXL, dst);
+      break;
+   case ir_txd:
+   case ir_txf:
+      assert(!"GLSL 1.30 features unsupported");
+      break;
+   }
+   inst->base_mrf = base_mrf;
+   inst->mlen = mlen;
+   inst->header_present = header_present;
+
+   if (mlen > 11) {
+      fail("Message length >11 disallowed by hardware\n");
+   }
+
+   return inst;
+}
+
+/**
+ * Build the sampler message for gen7 and emit the texture instruction.
+ *
+ * Payload order as written here: optional header (only when ir->offset is
+ * set), shadow comparitor, LOD bias or explicit LOD, then the coordinate
+ * components.  Each parameter occupies reg_width MRFs (dispatch_width / 8).
+ *
+ * Because the coordinate is copied into the message last, every rvalue
+ * visited before that point must be visited with this->result reset to
+ * reg_undef.  The expression visitor treats a valid this->result as the
+ * place to write its value, and on entry this->result may still name the
+ * coordinate's register.
+ *
+ * \param ir          texture operation being compiled
+ * \param dst         register receiving the sampler result
+ * \param coordinate  first component of the evaluated coordinate
+ * \param sampler     sampler number, tested against c->key.gl_clamp_mask
+ * \return the emitted texture instruction
+ */
+fs_inst *
+fs_visitor::emit_texture_gen7(ir_texture *ir, fs_reg dst, fs_reg coordinate,
+			      int sampler)
+{
+   int mlen = 0;
+   int base_mrf = 2;
+   int reg_width = c->dispatch_width / 8;
+   bool header_present = false;
+
+   if (ir->offset) {
+      /* The offsets set up by the ir_texture visitor are in the
+       * m1 header, so we can't go headerless.
+       */
+      header_present = true;
+      mlen++;
+      base_mrf--;
+   }
+
+   if (ir->shadow_comparitor) {
+      /* Don't let the comparitor be computed into a stale register. */
+      this->result = reg_undef;
+      ir->shadow_comparitor->accept(this);
+      emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), this->result);
+      mlen += reg_width;
+   }
+
+   /* Set up the LOD info */
+   switch (ir->op) {
+   case ir_tex:
+      break;
+   case ir_txb:
+      this->result = reg_undef;
+      ir->lod_info.bias->accept(this);
+      emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), this->result);
+      mlen += reg_width;
+      break;
+   case ir_txl:
+      this->result = reg_undef;
+      ir->lod_info.lod->accept(this);
+      emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), this->result);
+      mlen += reg_width;
+      break;
+   case ir_txd:
+   case ir_txf:
+      assert(!"GLSL 1.30 features unsupported");
+      break;
+   }
+
+   /* Set up the coordinate */
+   for (int i = 0; i < ir->coordinate->type->vector_elements; i++) {
+      fs_inst *inst = emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen),
+			   coordinate);
+      if (i < 3 && c->key.gl_clamp_mask[i] & (1 << sampler))
+	 inst->saturate = true;
+      coordinate.reg_offset++;
+      mlen += reg_width;
+   }
+
+   /* Generate the SEND */
+   /* NOTE(review): for ir_txf, inst stays NULL and is dereferenced right
+    * after the switch when assertions are compiled out.
+    */
+   fs_inst *inst = NULL;
+   switch (ir->op) {
+   case ir_tex: inst = emit(FS_OPCODE_TEX, dst); break;
+   case ir_txb: inst = emit(FS_OPCODE_TXB, dst); break;
+   case ir_txl: inst = emit(FS_OPCODE_TXL, dst); break;
+   case ir_txd: inst = emit(FS_OPCODE_TXD, dst); break;
+   case ir_txf: assert(!"TXF unsupported.");
+   }
+   inst->base_mrf = base_mrf;
+   inst->mlen = mlen;
+   inst->header_present = header_present;
+
+   if (mlen > 11) {
+      fail("Message length >11 disallowed by hardware\n");
+   }
+
+   return inst;
+}
+
+/**
+ * Emit a texture sample.
+ *
+ * Steps, in order: evaluate the coordinate; if there is a constant texel
+ * offset, build the message header in m1 by hand; look up the sampler unit;
+ * scale rectangle-texture coordinates by a state-tracked uniform; hand off
+ * to the per-generation message builder; and finally apply the texture
+ * swizzle from the program key, if any.  this->result is left naming the
+ * (possibly swizzled) vec4 result.
+ */
+void
+fs_visitor::visit(ir_texture *ir)
+{
+   int sampler;
+   fs_inst *inst = NULL;
+
+   this->result = reg_undef;
+   ir->coordinate->accept(this);
+   fs_reg coordinate = this->result;
+
+   if (ir->offset != NULL) {
+      ir_constant *offset = ir->offset->as_constant();
+      assert(offset != NULL);
+
+      signed char offsets[3];
+      for (unsigned i = 0; i < ir->offset->type->vector_elements; i++)
+	 offsets[i] = (signed char) offset->value.i[i];
+
+      /* Combine all three offsets into a single unsigned dword:
+       *
+       *    bits 11:8 - U Offset (X component)
+       *    bits  7:4 - V Offset (Y component)
+       *    bits  3:0 - R Offset (Z component)
+       */
+      unsigned offset_bits = 0;
+      for (unsigned i = 0; i < ir->offset->type->vector_elements; i++) {
+	 const unsigned shift = 4 * (2 - i);
+	 offset_bits |= (offsets[i] << shift) & (0xF << shift);
+      }
+
+      /* Explicitly set up the message header by copying g0 to msg reg m1. */
+      emit(BRW_OPCODE_MOV, fs_reg(MRF, 1, BRW_REGISTER_TYPE_UD),
+	   fs_reg(GRF, 0, BRW_REGISTER_TYPE_UD));
+
+      /* Then set the offset bits in DWord 2 of the message header. */
+      emit(BRW_OPCODE_MOV,
+	   fs_reg(retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE, 1, 2),
+			 BRW_REGISTER_TYPE_UD)),
+	   fs_reg(brw_imm_uw(offset_bits)));
+   }
+
+   /* Should be lowered by do_lower_texture_projection */
+   assert(!ir->projector);
+
+   /* Map the sampler uniform to its slot, then to the sampler unit. */
+   sampler = _mesa_get_sampler_uniform_value(ir->sampler,
+					     prog,
+					     &fp->Base);
+   sampler = fp->Base.SamplerUnits[sampler];
+
+   /* The 965 requires the EU to do the normalization of GL rectangle
+    * texture coordinates.  We use the program parameter state
+    * tracking to get the scaling factor.
+    */
+   if (ir->sampler->type->sampler_dimensionality == GLSL_SAMPLER_DIM_RECT) {
+      struct gl_program_parameter_list *params = c->fp->program.Base.Parameters;
+      int tokens[STATE_LENGTH] = {
+	 STATE_INTERNAL,
+	 STATE_TEXRECT_SCALE,
+	 sampler,
+	 0,
+	 0
+      };
+
+      if (c->dispatch_width == 16) {
+	 fail("rectangle scale uniform setup not supported on 16-wide\n");
+	 this->result = fs_reg(this, ir->type);
+	 return;
+      }
+
+      c->prog_data.param_convert[c->prog_data.nr_params] =
+	 PARAM_NO_CONVERT;
+      c->prog_data.param_convert[c->prog_data.nr_params + 1] =
+	 PARAM_NO_CONVERT;
+
+      /* Two new uniform params: components 0 and 1 of the state value. */
+      fs_reg scale_x = fs_reg(UNIFORM, c->prog_data.nr_params);
+      fs_reg scale_y = fs_reg(UNIFORM, c->prog_data.nr_params + 1);
+      GLuint index = _mesa_add_state_reference(params,
+					       (gl_state_index *)tokens);
+
+      this->param_index[c->prog_data.nr_params] = index;
+      this->param_offset[c->prog_data.nr_params] = 0;
+      c->prog_data.nr_params++;
+      this->param_index[c->prog_data.nr_params] = index;
+      this->param_offset[c->prog_data.nr_params] = 1;
+      c->prog_data.nr_params++;
+
+      /* Only the first two coordinate components are scaled; the scaled
+       * copy replaces the coordinate passed on to the message builder.
+       */
+      fs_reg dst = fs_reg(this, ir->coordinate->type);
+      fs_reg src = coordinate;
+      coordinate = dst;
+
+      emit(BRW_OPCODE_MUL, dst, src, scale_x);
+      dst.reg_offset++;
+      src.reg_offset++;
+      emit(BRW_OPCODE_MUL, dst, src, scale_y);
+   }
+
+   /* Writemasking doesn't eliminate channels on SIMD8 texture
+    * samples, so don't worry about them.
+    */
+   fs_reg dst = fs_reg(this, glsl_type::vec4_type);
+
+   if (intel->gen >= 7) {
+      inst = emit_texture_gen7(ir, dst, coordinate, sampler);
+   } else if (intel->gen >= 5) {
+      inst = emit_texture_gen5(ir, dst, coordinate, sampler);
+   } else {
+      inst = emit_texture_gen4(ir, dst, coordinate, sampler);
+   }
+
+   /* If there's an offset, we already set up m1.  To avoid the implied move,
+    * use the null register.  Otherwise, we want an implied move from g0.
+    */
+   if (ir->offset != NULL || !inst->header_present)
+      inst->src[0] = reg_undef;
+   else
+      inst->src[0] = fs_reg(retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW));
+
+   inst->sampler = sampler;
+
+   this->result = dst;
+
+   if (ir->shadow_comparitor)
+      inst->shadow_compare = true;
+
+   if (ir->type == glsl_type::float_type) {
+      /* Ignore DEPTH_TEXTURE_MODE swizzling. */
+      assert(ir->sampler->type->sampler_shadow);
+   } else if (c->key.tex_swizzles[inst->sampler] != SWIZZLE_NOOP) {
+      /* Build the swizzled result channel by channel in a new vec4. */
+      fs_reg swizzle_dst = fs_reg(this, glsl_type::vec4_type);
+
+      for (int i = 0; i < 4; i++) {
+	 int swiz = GET_SWZ(c->key.tex_swizzles[inst->sampler], i);
+	 fs_reg l = swizzle_dst;
+	 l.reg_offset += i;
+
+	 if (swiz == SWIZZLE_ZERO) {
+	    emit(BRW_OPCODE_MOV, l, fs_reg(0.0f));
+	 } else if (swiz == SWIZZLE_ONE) {
+	    emit(BRW_OPCODE_MOV, l, fs_reg(1.0f));
+	 } else {
+	    fs_reg r = dst;
+	    r.reg_offset += GET_SWZ(c->key.tex_swizzles[inst->sampler], i);
+	    emit(BRW_OPCODE_MOV, l, r);
+	 }
+      }
+      this->result = swizzle_dst;
+   }
+}
+
+/**
+ * Emit a swizzle.
+ *
+ * A single-component swizzle just offsets the operand's register and emits
+ * nothing.  Otherwise a new register is allocated and each selected source
+ * component is copied into it with a MOV.
+ */
+void
+fs_visitor::visit(ir_swizzle *ir)
+{
+   this->result = reg_undef;
+   ir->val->accept(this);
+   const fs_reg operand = this->result;
+
+   if (ir->type->vector_elements == 1) {
+      this->result.reg_offset += ir->mask.x;
+      return;
+   }
+
+   /* Source component chosen for each destination channel. */
+   int selected[4];
+   selected[0] = ir->mask.x;
+   selected[1] = ir->mask.y;
+   selected[2] = ir->mask.z;
+   selected[3] = ir->mask.w;
+
+   fs_reg out = fs_reg(this, ir->type);
+   this->result = out;
+
+   for (unsigned int chan = 0; chan < ir->type->vector_elements; chan++) {
+      fs_reg source = operand;
+
+      source.reg_offset += (chan < 4) ? selected[chan] : 0;
+      emit(BRW_OPCODE_MOV, out, source);
+      out.reg_offset++;
+   }
+}
+
+/**
+ * Emit an unconditional discard and set kill_emitted.
+ *
+ * Conditional discards are not handled yet; they trip the assertion below.
+ */
+void
+fs_visitor::visit(ir_discard *ir)
+{
+   assert(ir->condition == NULL); /* FINISHME */
+
+   emit(FS_OPCODE_DISCARD);
+   kill_emitted = true;
+}
+
+/**
+ * Materialize a constant into a newly allocated register, one MOV per
+ * scalar slot.  Arrays and records are built by visiting each element or
+ * field and copying its registers into place.
+ */
+void
+fs_visitor::visit(ir_constant *ir)
+{
+   /* this->result is only set at the very end: visiting array elements and
+    * record fields re-enters the visitor, which overwrites this->result
+    * along the way.
+    *
+    * The base register is const so that it cannot be modified by accident
+    * while the write cursor below walks through it.
+    */
+   const fs_reg base(this, ir->type);
+   fs_reg cursor = base;
+
+   if (ir->type->is_array()) {
+      const unsigned elem_regs = type_size(ir->type->fields.array);
+
+      for (unsigned e = 0; e < ir->type->length; e++) {
+	 this->result = reg_undef;
+	 ir->array_elements[e]->accept(this);
+	 fs_reg elem = this->result;
+
+	 cursor.type = elem.type;
+	 for (unsigned left = elem_regs; left > 0; left--) {
+	    emit(BRW_OPCODE_MOV, cursor, elem);
+	    elem.reg_offset++;
+	    cursor.reg_offset++;
+	 }
+      }
+   } else if (ir->type->is_record()) {
+      foreach_list(node, &ir->components) {
+	 ir_instruction *const member = (ir_instruction *) node;
+	 const unsigned member_regs = type_size(member->type);
+
+	 this->result = reg_undef;
+	 member->accept(this);
+	 fs_reg value = this->result;
+
+	 cursor.type = value.type;
+	 for (unsigned left = member_regs; left > 0; left--) {
+	    emit(BRW_OPCODE_MOV, cursor, value);
+	    value.reg_offset++;
+	    cursor.reg_offset++;
+	 }
+      }
+   } else {
+      const unsigned slots = type_size(ir->type);
+
+      for (unsigned s = 0; s < slots; s++, cursor.reg_offset++) {
+	 if (ir->type->base_type == GLSL_TYPE_FLOAT) {
+	    emit(BRW_OPCODE_MOV, cursor, fs_reg(ir->value.f[s]));
+	 } else if (ir->type->base_type == GLSL_TYPE_UINT) {
+	    emit(BRW_OPCODE_MOV, cursor, fs_reg(ir->value.u[s]));
+	 } else if (ir->type->base_type == GLSL_TYPE_INT) {
+	    emit(BRW_OPCODE_MOV, cursor, fs_reg(ir->value.i[s]));
+	 } else if (ir->type->base_type == GLSL_TYPE_BOOL) {
+	    emit(BRW_OPCODE_MOV, cursor, fs_reg((int)ir->value.b[s]));
+	 } else {
+	    assert(!"Non-float/uint/int/bool constant");
+	 }
+      }
+   }
+
+   this->result = base;
+}
+/** Evaluates the boolean \c ir with an instruction that carries a conditional mod. */
+void
+fs_visitor::emit_bool_to_cond_code(ir_rvalue *ir)
+{
+   ir_expression *expr = ir->as_expression();
+
+   if (expr) { /* fold the expression's own operation into the conditional mod */
+      fs_reg op[2];
+      fs_inst *inst;
+
+      assert(expr->get_num_operands() <= 2);
+      for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
+	 assert(expr->operands[i]->type->is_scalar());
+
+	 this->result = reg_undef;
+	 expr->operands[i]->accept(this);
+	 op[i] = this->result;
+      }
+
+      switch (expr->operation) {
+      case ir_unop_logic_not:
+	 inst = emit(BRW_OPCODE_AND, reg_null_d, op[0], fs_reg(1)); /* !x: test (x & 1) for zero */
+	 inst->conditional_mod = BRW_CONDITIONAL_Z;
+	 break;
+
+      case ir_binop_logic_xor:
+	 inst = emit(BRW_OPCODE_XOR, reg_null_d, op[0], op[1]);
+	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
+	 break;
+
+      case ir_binop_logic_or:
+	 inst = emit(BRW_OPCODE_OR, reg_null_d, op[0], op[1]);
+	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
+	 break;
+
+      case ir_binop_logic_and:
+	 inst = emit(BRW_OPCODE_AND, reg_null_d, op[0], op[1]);
+	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
+	 break;
+
+      case ir_unop_f2b:
+	 if (intel->gen >= 6) {
+	    inst = emit(BRW_OPCODE_CMP, reg_null_d, op[0], fs_reg(0.0f));
+	 } else {
+	    inst = emit(BRW_OPCODE_MOV, reg_null_f, op[0]);
+	 }
+	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
+	 break;
+
+      case ir_unop_i2b:
+	 if (intel->gen >= 6) {
+	    inst = emit(BRW_OPCODE_CMP, reg_null_d, op[0], fs_reg(0));
+	 } else {
+	    inst = emit(BRW_OPCODE_MOV, reg_null_d, op[0]);
+	 }
+	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
+	 break;
+
+      case ir_binop_greater:
+      case ir_binop_gequal:
+      case ir_binop_less:
+      case ir_binop_lequal:
+      case ir_binop_equal:
+      case ir_binop_all_equal:
+      case ir_binop_nequal:
+      case ir_binop_any_nequal:
+	 inst = emit(BRW_OPCODE_CMP, reg_null_cmp, op[0], op[1]);
+	 inst->conditional_mod =
+	    brw_conditional_for_comparison(expr->operation);
+	 break;
+
+      default:
+	 assert(!"not reached");
+	 fail("bad cond code\n");
+	 break;
+      }
+      return;
+   }
+
+   this->result = reg_undef; /* not an expression: evaluate it and test the resulting value */
+   ir->accept(this);
+
+   if (intel->gen >= 6) {
+      fs_inst *inst = emit(BRW_OPCODE_AND, reg_null_d, this->result, fs_reg(1));
+      inst->conditional_mod = BRW_CONDITIONAL_NZ;
+   } else {
+      fs_inst *inst = emit(BRW_OPCODE_MOV, reg_null_d, this->result);
+      inst->conditional_mod = BRW_CONDITIONAL_NZ;
+   }
+}
+
+/**
+ * Emit a gen6 IF statement with the comparison folded into the IF
+ * instruction: the IF itself takes two sources and a conditional mod.
+ */
+void
+fs_visitor::emit_if_gen6(ir_if *ir)
+{
+   ir_expression *expr = ir->condition->as_expression();
+
+   if (expr) {
+      fs_reg op[2];
+      fs_inst *inst;
+      fs_reg temp;
+
+      assert(expr->get_num_operands() <= 2);
+      for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
+	 assert(expr->operands[i]->type->is_scalar());
+
+	 this->result = reg_undef;
+	 expr->operands[i]->accept(this);
+	 op[i] = this->result;
+      }
+
+      switch (expr->operation) {
+      case ir_unop_logic_not: /* NOTE(review): dst is the still-unassigned temp, not reg_null_d -- confirm */
+	 inst = emit(BRW_OPCODE_IF, temp, op[0], fs_reg(0));
+	 inst->conditional_mod = BRW_CONDITIONAL_Z;
+	 return;
+
+      case ir_binop_logic_xor:
+	 inst = emit(BRW_OPCODE_IF, reg_null_d, op[0], op[1]);
+	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
+	 return;
+
+      case ir_binop_logic_or: /* OR into a temporary, then IF (temp != 0) */
+	 temp = fs_reg(this, glsl_type::bool_type);
+	 emit(BRW_OPCODE_OR, temp, op[0], op[1]);
+	 inst = emit(BRW_OPCODE_IF, reg_null_d, temp, fs_reg(0));
+	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
+	 return;
+
+      case ir_binop_logic_and: /* AND into a temporary, then IF (temp != 0) */
+	 temp = fs_reg(this, glsl_type::bool_type);
+	 emit(BRW_OPCODE_AND, temp, op[0], op[1]);
+	 inst = emit(BRW_OPCODE_IF, reg_null_d, temp, fs_reg(0));
+	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
+	 return;
+
+      case ir_unop_f2b:
+	 inst = emit(BRW_OPCODE_IF, reg_null_f, op[0], fs_reg(0));
+	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
+	 return;
+
+      case ir_unop_i2b:
+	 inst = emit(BRW_OPCODE_IF, reg_null_d, op[0], fs_reg(0));
+	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
+	 return;
+
+      case ir_binop_greater:
+      case ir_binop_gequal:
+      case ir_binop_less:
+      case ir_binop_lequal:
+      case ir_binop_equal:
+      case ir_binop_all_equal:
+      case ir_binop_nequal:
+      case ir_binop_any_nequal:
+	 inst = emit(BRW_OPCODE_IF, reg_null_d, op[0], op[1]);
+	 inst->conditional_mod =
+	    brw_conditional_for_comparison(expr->operation);
+	 return;
+      default:
+	 assert(!"not reached");
+	 inst = emit(BRW_OPCODE_IF, reg_null_d, op[0], fs_reg(0)); /* still emit an IF so later ELSE/ENDIF pair up */
+	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
+	 fail("bad condition\n");
+	 return;
+      }
+      return; /* not reached: every case above returns */
+   }
+
+   this->result = reg_undef; /* condition is not an expression: evaluate it, then IF (value != 0) */
+   ir->condition->accept(this);
+
+   fs_inst *inst = emit(BRW_OPCODE_IF, reg_null_d, this->result, fs_reg(0));
+   inst->conditional_mod = BRW_CONDITIONAL_NZ;
+}
+/** Emits IF, the then-list, an optional ELSE with the else-list, and ENDIF for \c ir. */
+void
+fs_visitor::visit(ir_if *ir)
+{
+   fs_inst *inst;
+
+   if (intel->gen != 6 && c->dispatch_width == 16) {
+      fail("Can't support (non-uniform) control flow on 16-wide\n");
+   }
+
+   /* Don't point the annotation at the if statement, because then it plus
+    * the then and else blocks get printed.
+    */
+   this->base_ir = ir->condition;
+
+   if (intel->gen == 6) {
+      emit_if_gen6(ir);
+   } else {
+      emit_bool_to_cond_code(ir->condition); /* separate compare, consumed by the predicated IF below */
+
+      inst = emit(BRW_OPCODE_IF);
+      inst->predicated = true;
+   }
+
+   foreach_iter(exec_list_iterator, iter, ir->then_instructions) {
+      ir_instruction *ir = (ir_instruction *)iter.get(); /* shadows the ir_if parameter in this loop body */
+      this->base_ir = ir;
+      this->result = reg_undef;
+      ir->accept(this);
+   }
+
+   if (!ir->else_instructions.is_empty()) { /* ELSE is only emitted when there is something to put in it */
+      emit(BRW_OPCODE_ELSE);
+
+      foreach_iter(exec_list_iterator, iter, ir->else_instructions) {
+	 ir_instruction *ir = (ir_instruction *)iter.get();
+	 this->base_ir = ir;
+	 this->result = reg_undef;
+	 ir->accept(this);
+      }
+   }
+
+   emit(BRW_OPCODE_ENDIF);
+}
+/** Emits DO ... WHILE with the optional counter setup, bound test and increment of \c ir. */
+void
+fs_visitor::visit(ir_loop *ir)
+{
+   fs_reg counter = reg_undef;
+
+   if (c->dispatch_width == 16) {
+      fail("Can't support (non-uniform) control flow on 16-wide\n");
+   }
+
+   if (ir->counter) {
+      this->base_ir = ir->counter;
+      ir->counter->accept(this);
+      counter = *(variable_storage(ir->counter));
+
+      if (ir->from) {
+	 this->result = counter; /* NOTE(review): duplicated by the identical assignment two lines below */
+
+	 this->base_ir = ir->from;
+	 this->result = counter;
+	 ir->from->accept(this);
+
+	 if (!this->result.equals(&counter)) /* skip the MOV when the value already landed in counter */
+	    emit(BRW_OPCODE_MOV, counter, this->result);
+      }
+   }
+
+   emit(BRW_OPCODE_DO);
+
+   if (ir->to) { /* bound test at the top of each iteration */
+      this->base_ir = ir->to;
+      this->result = reg_undef;
+      ir->to->accept(this);
+
+      fs_inst *inst = emit(BRW_OPCODE_CMP, reg_null_cmp, counter, this->result);
+      inst->conditional_mod = brw_conditional_for_comparison(ir->cmp);
+
+      inst = emit(BRW_OPCODE_BREAK); /* predicated on the CMP just emitted */
+      inst->predicated = true;
+   }
+
+   foreach_iter(exec_list_iterator, iter, ir->body_instructions) {
+      ir_instruction *ir = (ir_instruction *)iter.get(); /* shadows the ir_loop parameter in this loop body */
+
+      this->base_ir = ir;
+      this->result = reg_undef;
+      ir->accept(this);
+   }
+
+   if (ir->increment) { /* counter += increment, after the body */
+      this->base_ir = ir->increment;
+      this->result = reg_undef;
+      ir->increment->accept(this);
+      emit(BRW_OPCODE_ADD, counter, counter, this->result);
+   }
+
+   emit(BRW_OPCODE_WHILE);
+}
+/** Emits an unpredicated BREAK or CONTINUE, chosen by the jump mode. */
+void
+fs_visitor::visit(ir_loop_jump *ir)
+{
+   switch (ir->mode) {
+   case ir_loop_jump::jump_break:
+      emit(BRW_OPCODE_BREAK);
+      break;
+   case ir_loop_jump::jump_continue:
+      emit(BRW_OPCODE_CONTINUE);
+      break;
+   }
+}
+/** Function calls are not implemented in this visitor; trips an assert. */
+void
+fs_visitor::visit(ir_call *ir)
+{
+   assert(!"FINISHME");
+}
+/** Return statements are not implemented in this visitor; trips an assert. */
+void
+fs_visitor::visit(ir_return *ir)
+{
+   assert(!"FINISHME");
+}
+/** Visits each instruction in the body of main(); any other function is skipped. */
+void
+fs_visitor::visit(ir_function *ir)
+{
+   /* Ignore function bodies other than main() -- we shouldn't see calls to
+    * them since they should all be inlined before we get to ir_to_mesa.
+    */
+   if (strcmp(ir->name, "main") == 0) {
+      const ir_function_signature *sig;
+      exec_list empty; /* empty parameter list: selects the signature of main() that takes no arguments */
+
+      sig = ir->matching_signature(&empty);
+
+      assert(sig);
+
+      foreach_iter(exec_list_iterator, iter, sig->body) {
+	 ir_instruction *ir = (ir_instruction *)iter.get(); /* shadows the ir_function parameter */
+	 this->base_ir = ir;
+	 this->result = reg_undef;
+	 ir->accept(this);
+      }
+   }
+}
+/** Not expected to be called: visit(ir_function) walks the signature body itself. */
+void
+fs_visitor::visit(ir_function_signature *ir)
+{
+   assert(!"not reached");
+   (void)ir;
+}
+/** Copies \c inst into a new mem_ctx allocation, tags it, appends it and returns the copy. */
+fs_inst *
+fs_visitor::emit(fs_inst inst)
+{
+   fs_inst *list_inst = new(mem_ctx) fs_inst;
+   *list_inst = inst;
+
+   if (force_uncompressed_stack > 0) /* uncompressed takes priority if both stacks are active */
+      list_inst->force_uncompressed = true;
+   else if (force_sechalf_stack > 0)
+      list_inst->force_sechalf = true;
+
+   list_inst->annotation = this->current_annotation; /* record the current annotation and source IR */
+   list_inst->ir = this->base_ir;
+
+   this->instructions.push_tail(list_inst);
+
+   return list_inst;
+}
+
+/** Emits a dummy fragment shader consisting of magenta for bringup purposes. */
+void
+fs_visitor::emit_dummy_fs()
+{
+   /* Everyone's favorite color: (1, 0, 1, 0) loaded into MRFs 2 through 5. */
+   emit(BRW_OPCODE_MOV, fs_reg(MRF, 2), fs_reg(1.0f));
+   emit(BRW_OPCODE_MOV, fs_reg(MRF, 3), fs_reg(0.0f));
+   emit(BRW_OPCODE_MOV, fs_reg(MRF, 4), fs_reg(1.0f));
+   emit(BRW_OPCODE_MOV, fs_reg(MRF, 5), fs_reg(0.0f));
+
+   fs_inst *write;
+   write = emit(FS_OPCODE_FB_WRITE, fs_reg(0), fs_reg(0));
+   write->base_mrf = 0; /* NOTE(review): mlen and eot are not set here, unlike emit_fb_writes() -- confirm */
+}
+
+/* The register location here is relative to the start of the URB
+ * data.  It will get adjusted to be a real location before
+ * generate_code() time.
+ */
+struct brw_reg
+fs_visitor::interp_reg(int location, int channel)
+{
+   int regnr = urb_setup[location] * 2 + channel / 2; /* two registers per slot, two channels per register */
+   int stride = (channel & 1) * 4; /* 0 for even channels, 4 for odd; used as brw_vec1_grf()'s 2nd argument */
+
+   assert(urb_setup[location] != -1); /* presumably -1 marks a location with no setup data -- confirm */
+
+   return brw_vec1_grf(regnr, stride);
+}
+
+/** Emits pixel centers, deltas from v0, and pos.w / 1/pos.w for interpolating varyings (gen4 variant). */
+void
+fs_visitor::emit_interpolation_setup_gen4()
+{
+   this->current_annotation = "compute pixel centers";
+   this->pixel_x = fs_reg(this, glsl_type::uint_type);
+   this->pixel_y = fs_reg(this, glsl_type::uint_type);
+   this->pixel_x.type = BRW_REGISTER_TYPE_UW; /* allocated as uint, then retyped to UW */
+   this->pixel_y.type = BRW_REGISTER_TYPE_UW;
+
+   emit(FS_OPCODE_PIXEL_X, this->pixel_x);
+   emit(FS_OPCODE_PIXEL_Y, this->pixel_y);
+
+   this->current_annotation = "compute pixel deltas from v0";
+   if (brw->has_pln) { /* with PLN, delta_x and delta_y are consecutive offsets of one vec2 register */
+      this->delta_x = fs_reg(this, glsl_type::vec2_type);
+      this->delta_y = this->delta_x;
+      this->delta_y.reg_offset++;
+   } else {
+      this->delta_x = fs_reg(this, glsl_type::float_type);
+      this->delta_y = fs_reg(this, glsl_type::float_type);
+   }
+   emit(BRW_OPCODE_ADD, this->delta_x,
+	this->pixel_x, fs_reg(negate(brw_vec1_grf(1, 0))));
+   emit(BRW_OPCODE_ADD, this->delta_y,
+	this->pixel_y, fs_reg(negate(brw_vec1_grf(1, 1))));
+
+   this->current_annotation = "compute pos.w and 1/pos.w";
+   /* Compute wpos.w.  It's always in our setup, since it's needed to
+    * interpolate the other attributes.
+    */
+   this->wpos_w = fs_reg(this, glsl_type::float_type);
+   emit(FS_OPCODE_LINTERP, wpos_w, this->delta_x, this->delta_y,
+	interp_reg(FRAG_ATTRIB_WPOS, 3));
+   /* Compute the pixel 1/W value from wpos.w. */
+   this->pixel_w = fs_reg(this, glsl_type::float_type);
+   emit_math(FS_OPCODE_RCP, this->pixel_w, wpos_w);
+   this->current_annotation = NULL;
+}
+
+/** Emits pixel centers and pos.w, and points the deltas at fixed GRFs 2 and 3 (gen6 variant). */
+void
+fs_visitor::emit_interpolation_setup_gen6()
+{
+   struct brw_reg g1_uw = retype(brw_vec1_grf(1, 0), BRW_REGISTER_TYPE_UW);
+
+   /* If the pixel centers end up used, the setup is the same as for gen4. */
+   this->current_annotation = "compute pixel centers";
+   fs_reg int_pixel_x = fs_reg(this, glsl_type::uint_type);
+   fs_reg int_pixel_y = fs_reg(this, glsl_type::uint_type);
+   int_pixel_x.type = BRW_REGISTER_TYPE_UW;
+   int_pixel_y.type = BRW_REGISTER_TYPE_UW;
+   emit(BRW_OPCODE_ADD,
+	int_pixel_x,
+	fs_reg(stride(suboffset(g1_uw, 4), 2, 4, 0)),
+	fs_reg(brw_imm_v(0x10101010)));
+   emit(BRW_OPCODE_ADD,
+	int_pixel_y,
+	fs_reg(stride(suboffset(g1_uw, 5), 2, 4, 0)),
+	fs_reg(brw_imm_v(0x11001100)));
+
+   /* As of gen6, we can no longer mix float and int sources.  We have
+    * to turn the integer pixel centers into floats for their actual
+    * use.
+    */
+   this->pixel_x = fs_reg(this, glsl_type::float_type);
+   this->pixel_y = fs_reg(this, glsl_type::float_type);
+   emit(BRW_OPCODE_MOV, this->pixel_x, int_pixel_x);
+   emit(BRW_OPCODE_MOV, this->pixel_y, int_pixel_y);
+
+   this->current_annotation = "compute pos.w";
+   this->pixel_w = fs_reg(brw_vec8_grf(c->source_w_reg, 0)); /* pixel_w is read directly from c->source_w_reg */
+   this->wpos_w = fs_reg(this, glsl_type::float_type);
+   emit_math(FS_OPCODE_RCP, this->wpos_w, this->pixel_w); /* wpos_w = RCP(pixel_w) */
+
+   this->delta_x = fs_reg(brw_vec8_grf(2, 0)); /* deltas are not computed here; fixed registers are used */
+   this->delta_y = fs_reg(brw_vec8_grf(3, 0));
+
+   this->current_annotation = NULL;
+}
+/** Moves one color channel (\c index) of \c color into its MRF slot for the FB write message. */
+void
+fs_visitor::emit_color_write(int index, int first_color_mrf, fs_reg color)
+{
+   int reg_width = c->dispatch_width / 8;
+
+   if (c->dispatch_width == 8 || intel->gen == 6) {
+      /* SIMD8 write looks like:
+       * m + 0: r0
+       * m + 1: g0
+       * m + 2: b0
+       * m + 3: a0
+       *
+       * gen6 SIMD16 DP write looks like:
+       * m + 0: r0
+       * m + 1: r1
+       * m + 2: g0
+       * m + 3: g1
+       * m + 4: b0
+       * m + 5: b1
+       * m + 6: a0
+       * m + 7: a1
+       */
+      emit(BRW_OPCODE_MOV, fs_reg(MRF, first_color_mrf + index * reg_width),
+	   color);
+   } else {
+      /* pre-gen6 SIMD16 single source DP write looks like:
+       * m + 0: r0
+       * m + 1: g0
+       * m + 2: b0
+       * m + 3: a0
+       * m + 4: r1
+       * m + 5: g1
+       * m + 6: b1
+       * m + 7: a1
+       */
+      if (brw->has_compr4) {
+	 /* By setting the high bit of the MRF register number, we
+	  * indicate that we want COMPR4 mode - instead of doing the
+	  * usual destination + 1 for the second half we get
+	  * destination + 4.
+	  */
+	 emit(BRW_OPCODE_MOV,
+	      fs_reg(MRF, BRW_MRF_COMPR4 + first_color_mrf + index), color);
+      } else {
+	 push_force_uncompressed(); /* without COMPR4: write each half with its own MOV */
+	 emit(BRW_OPCODE_MOV, fs_reg(MRF, first_color_mrf + index), color);
+	 pop_force_uncompressed();
+
+	 push_force_sechalf();
+	 color.sechalf = true;
+	 emit(BRW_OPCODE_MOV, fs_reg(MRF, first_color_mrf + index + 4), color);
+	 pop_force_sechalf();
+	 color.sechalf = false; /* color is a by-value parameter, so this only resets the local copy */
+      }
+   }
+}
+/** Builds the FB write message payload in MRFs and emits the FS_OPCODE_FB_WRITE instruction(s). */
+void
+fs_visitor::emit_fb_writes()
+{
+   this->current_annotation = "FB write header";
+   GLboolean header_present = GL_TRUE;
+   int nr = 0; /* running MRF index; ends up as the message length */
+   int reg_width = c->dispatch_width / 8;
+
+   if (intel->gen >= 6 &&
+       !this->kill_emitted &&
+       c->key.nr_color_regions == 1) {
+      header_present = false;
+   }
+
+   if (header_present) {
+      /* m0, m1 header */
+      nr += 2;
+   }
+
+   if (c->aa_dest_stencil_reg) {
+      push_force_uncompressed();
+      emit(BRW_OPCODE_MOV, fs_reg(MRF, nr++),
+	   fs_reg(brw_vec8_grf(c->aa_dest_stencil_reg, 0)));
+      pop_force_uncompressed();
+   }
+
+   /* Reserve space for color. It'll be filled in per MRT below. */
+   int color_mrf = nr;
+   nr += 4 * reg_width;
+
+   if (c->source_depth_to_render_target) {
+      if (intel->gen == 6 && c->dispatch_width == 16) {
+	 /* For outputting oDepth on gen6, SIMD8 writes have to be
+	  * used.  This would require 8-wide moves of each half to
+	  * message regs, kind of like pre-gen5 SIMD16 FB writes.
+	  * Just bail on doing so for now.
+	  */
+	 fail("Missing support for simd16 depth writes on gen6\n");
+      }
+
+      if (c->computes_depth) {
+	 /* Hand over gl_FragDepth. */
+	 assert(this->frag_depth);
+	 fs_reg depth = *(variable_storage(this->frag_depth));
+
+	 emit(BRW_OPCODE_MOV, fs_reg(MRF, nr), depth);
+      } else {
+	 /* Pass through the payload depth. */
+	 emit(BRW_OPCODE_MOV, fs_reg(MRF, nr),
+	      fs_reg(brw_vec8_grf(c->source_depth_reg, 0)));
+      }
+      nr += reg_width;
+   }
+
+   if (c->dest_depth_reg) {
+      emit(BRW_OPCODE_MOV, fs_reg(MRF, nr),
+	   fs_reg(brw_vec8_grf(c->dest_depth_reg, 0)));
+      nr += reg_width;
+   }
+
+   fs_reg color = reg_undef; /* stays undefined when the shader wrote neither frag_color nor frag_data */
+   if (this->frag_color)
+      color = *(variable_storage(this->frag_color));
+   else if (this->frag_data) {
+      color = *(variable_storage(this->frag_data));
+      color.type = BRW_REGISTER_TYPE_F;
+   }
+
+   for (int target = 0; target < c->key.nr_color_regions; target++) {
+      this->current_annotation = ralloc_asprintf(this->mem_ctx,
+						 "FB write target %d",
+						 target);
+      if (this->frag_color || this->frag_data) {
+	 for (int i = 0; i < 4; i++) {
+	    emit_color_write(i, color_mrf, color);
+	    color.reg_offset++;
+	 }
+      }
+
+      if (this->frag_color)
+	 color.reg_offset -= 4; /* rewind: the same frag_color registers feed every target */
+
+      fs_inst *inst = emit(FS_OPCODE_FB_WRITE);
+      inst->target = target;
+      inst->base_mrf = 0;
+      inst->mlen = nr;
+      if (target == c->key.nr_color_regions - 1) /* only the last write is flagged eot */
+	 inst->eot = true;
+      inst->header_present = header_present;
+   }
+
+   if (c->key.nr_color_regions == 0) { /* the loop above emitted nothing; one write is still issued */
+      if (c->key.alpha_test && (this->frag_color || this->frag_data)) {
+	 /* If the alpha test is enabled but there's no color buffer,
+	  * we still need to send alpha out the pipeline to our null
+	  * renderbuffer.
+	  */
+	 color.reg_offset += 3;
+	 emit_color_write(3, color_mrf, color);
+      }
+
+      fs_inst *inst = emit(FS_OPCODE_FB_WRITE);
+      inst->base_mrf = 0;
+      inst->mlen = nr;
+      inst->eot = true;
+      inst->header_present = header_present;
+   }
+
+   this->current_annotation = NULL;
+}
diff --git a/src/mesa/drivers/dri/i965/brw_sf.c b/src/mesa/drivers/dri/i965/brw_sf.c
index 9b82fe159f9..c2227777cfb 100644
--- a/src/mesa/drivers/dri/i965/brw_sf.c
+++ b/src/mesa/drivers/dri/i965/brw_sf.c
@@ -181,9 +181,6 @@ static void upload_sf_prog(struct brw_context *brw)
    key.do_flat_shading = (ctx->Light.ShadeModel == GL_FLAT);
    key.do_twoside_color = (ctx->Light.Enabled && ctx->Light.Model.TwoSide);
 
-   /* _NEW_HINT */
-   key.linear_color = (ctx->Hint.PerspectiveCorrection == GL_FASTEST);
-
    /* _NEW_POLYGON */
    if (key.do_twoside_color) {
       /* If we're rendering to a FBO, we have to invert the polygon
diff --git a/src/mesa/drivers/dri/i965/brw_sf.h b/src/mesa/drivers/dri/i965/brw_sf.h
index e525c730d3f..be32085c697 100644
--- a/src/mesa/drivers/dri/i965/brw_sf.h
+++ b/src/mesa/drivers/dri/i965/brw_sf.h
@@ -52,7 +52,6 @@ struct brw_sf_prog_key {
    GLuint do_flat_shading:1;
    GLuint frontface_ccw:1;
    GLuint do_point_sprite:1;
-   GLuint linear_color:1;  /**< linear interp vs. perspective interp */
    GLuint sprite_origin_lower_left:1;
    GLuint pad:24;
 };
diff --git a/src/mesa/drivers/dri/i965/brw_sf_emit.c b/src/mesa/drivers/dri/i965/brw_sf_emit.c
index 4b2e26cbed7..52a3fb3893d 100644
--- a/src/mesa/drivers/dri/i965/brw_sf_emit.c
+++ b/src/mesa/drivers/dri/i965/brw_sf_emit.c
@@ -316,7 +316,7 @@ static GLboolean calculate_masks( struct brw_sf_compile *c,
    GLbitfield64 persp_mask;
    GLbitfield64 linear_mask;
 
-   if (c->key.do_flat_shading || c->key.linear_color)
+   if (c->key.do_flat_shading)
       persp_mask = c->key.attrs & ~(FRAG_BIT_WPOS |
                                     FRAG_BIT_COL0 |
                                     FRAG_BIT_COL1);
diff --git a/src/mesa/drivers/dri/i965/brw_shader.cpp b/src/mesa/drivers/dri/i965/brw_shader.cpp
new file mode 100644
index 00000000000..9471883fb2b
--- /dev/null
+++ b/src/mesa/drivers/dri/i965/brw_shader.cpp
@@ -0,0 +1,194 @@
+/*
+ * Copyright © 2010 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+extern "C" {
+#include "main/macros.h"
+#include "brw_context.h"
+}
+#include "brw_fs.h"
+#include "../glsl/ir_optimization.h"
+#include "../glsl/ir_print_visitor.h"
+/** Allocates a zeroed brw_shader, sets its type and name, and returns its gl_shader base. */
+struct gl_shader *
+brw_new_shader(struct gl_context *ctx, GLuint name, GLuint type)
+{
+   struct brw_shader *shader;
+
+   shader = rzalloc(NULL, struct brw_shader);
+   if (shader) {
+      shader->base.Type = type;
+      shader->base.Name = name;
+      _mesa_init_shader(ctx, &shader->base);
+   }
+
+   return &shader->base; /* NOTE(review): also evaluated when rzalloc() returned NULL -- confirm callers cope */
+}
+/** Allocates a zeroed brw_shader_program, sets its name, and returns its gl_shader_program base. */
+struct gl_shader_program *
+brw_new_shader_program(struct gl_context *ctx, GLuint name)
+{
+   struct brw_shader_program *prog;
+   prog = rzalloc(NULL, struct brw_shader_program);
+   if (prog) {
+      prog->base.Name = name;
+      _mesa_init_shader_program(ctx, &prog->base);
+   }
+   return &prog->base; /* NOTE(review): also evaluated when rzalloc() returned NULL -- confirm callers cope */
+}
+
+/**
+ * Performs a compile of the shader stages even when we don't know
+ * what non-orthogonal state will be set, in the hope that it reflects
+ * the eventual NOS used, and thus allows us to produce link failures.
+ */
+bool
+brw_shader_precompile(struct gl_context *ctx, struct gl_shader_program *prog)
+{
+   if (!brw_fs_precompile(ctx, prog)) /* only the fragment stage is precompiled here */
+      return false;
+
+   return true;
+}
+/** Lowers a private copy of the fragment IR, then calls _mesa_ir_link_shader() and the precompile. */
+GLboolean
+brw_link_shader(struct gl_context *ctx, struct gl_shader_program *prog)
+{
+   struct brw_context *brw = brw_context(ctx);
+   struct intel_context *intel = &brw->intel;
+
+   struct brw_shader *shader =
+      (struct brw_shader *)prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
+   if (shader != NULL) { /* nothing to lower when the program has no fragment shader */
+      void *mem_ctx = ralloc_context(NULL);
+      bool progress;
+
+      if (shader->ir) /* drop the IR left over from a previous link */
+	 ralloc_free(shader->ir);
+      shader->ir = new(shader) exec_list;
+      clone_ir_list(mem_ctx, shader->ir, shader->base.ir); /* work on a clone; shader->base.ir is left alone */
+
+      do_mat_op_to_vec(shader->ir);
+      lower_instructions(shader->ir,
+			 MOD_TO_FRACT |
+			 DIV_TO_MUL_RCP |
+			 SUB_TO_ADD_NEG |
+			 EXP_TO_EXP2 |
+			 LOG_TO_LOG2);
+
+      /* Pre-gen6 HW can only nest if-statements 16 deep.  Beyond this,
+       * if-statements need to be flattened.
+       */
+      if (intel->gen < 6)
+	 lower_if_to_cond_assign(shader->ir, 16);
+
+      do_lower_texture_projection(shader->ir);
+      do_vec_index_to_cond_assign(shader->ir);
+      brw_do_cubemap_normalize(shader->ir);
+      lower_noise(shader->ir);
+      lower_quadop_vector(shader->ir, false);
+      lower_variable_index_to_cond_assign(shader->ir,
+					  GL_TRUE, /* input */
+					  GL_TRUE, /* output */
+					  GL_TRUE, /* temp */
+					  GL_TRUE /* uniform */
+					  );
+
+      do { /* repeat until neither jump lowering nor the common optimizations report progress */
+	 progress = false;
+
+	 brw_do_channel_expressions(shader->ir);
+	 brw_do_vector_splitting(shader->ir);
+
+	 progress = do_lower_jumps(shader->ir, true, true,
+				   true, /* main return */
+				   false, /* continue */
+				   false /* loops */
+				   ) || progress;
+
+	 progress = do_common_optimization(shader->ir, true, 32) || progress;
+      } while (progress);
+
+      validate_ir_tree(shader->ir);
+
+      reparent_ir(shader->ir, shader->ir); /* presumably moves the surviving IR off mem_ctx before it is freed */
+      ralloc_free(mem_ctx);
+   }
+
+   if (!_mesa_ir_link_shader(ctx, prog))
+      return GL_FALSE;
+
+   if (!brw_shader_precompile(ctx, prog))
+      return GL_FALSE;
+
+   return GL_TRUE;
+}
+
+/** Maps a GLSL base type to the BRW_REGISTER_TYPE_* used for registers of that type. */
+int
+brw_type_for_base_type(const struct glsl_type *type)
+{
+   switch (type->base_type) {
+   case GLSL_TYPE_FLOAT:
+      return BRW_REGISTER_TYPE_F;
+   case GLSL_TYPE_INT:
+   case GLSL_TYPE_BOOL: /* booleans share the signed dword type with ints */
+      return BRW_REGISTER_TYPE_D;
+   case GLSL_TYPE_UINT:
+      return BRW_REGISTER_TYPE_UD;
+   case GLSL_TYPE_ARRAY:
+   case GLSL_TYPE_STRUCT:
+   case GLSL_TYPE_SAMPLER:
+      /* These should be overridden with the type of the member when
+       * dereferenced into.  BRW_REGISTER_TYPE_UD seems like a likely
+       * way to trip up if we don't.
+       */
+      return BRW_REGISTER_TYPE_UD;
+   default:
+      assert(!"not reached");
+      return BRW_REGISTER_TYPE_F; /* fallback when asserts are compiled out */
+   }
+}
+/** Maps an IR comparison operation to the matching BRW_CONDITIONAL_* modifier. */
+uint32_t
+brw_conditional_for_comparison(unsigned int op)
+{
+   switch (op) {
+   case ir_binop_less:
+      return BRW_CONDITIONAL_L;
+   case ir_binop_greater:
+      return BRW_CONDITIONAL_G;
+   case ir_binop_lequal:
+      return BRW_CONDITIONAL_LE;
+   case ir_binop_gequal:
+      return BRW_CONDITIONAL_GE;
+   case ir_binop_equal:
+   case ir_binop_all_equal: /* same as equal for scalars */
+      return BRW_CONDITIONAL_Z;
+   case ir_binop_nequal:
+   case ir_binop_any_nequal: /* same as nequal for scalars */
+      return BRW_CONDITIONAL_NZ;
+   default:
+      assert(!"not reached: bad operation for comparison");
+      return BRW_CONDITIONAL_NZ; /* fallback when asserts are compiled out */
+   }
+}
diff --git a/src/mesa/drivers/dri/i965/brw_shader.h b/src/mesa/drivers/dri/i965/brw_shader.h
new file mode 100644
index 00000000000..4c568a26caa
--- /dev/null
+++ b/src/mesa/drivers/dri/i965/brw_shader.h
@@ -0,0 +1,25 @@
+/*
+ * Copyright © 2010 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+int brw_type_for_base_type(const struct glsl_type *type); /* GLSL base type -> BRW_REGISTER_TYPE_* */
+uint32_t brw_conditional_for_comparison(unsigned int op); /* IR comparison op -> BRW_CONDITIONAL_* */
diff --git a/src/mesa/drivers/dri/i965/brw_state_dump.c b/src/mesa/drivers/dri/i965/brw_state_dump.c
index 3a3aa8c0346..ef58619702d 100644
--- a/src/mesa/drivers/dri/i965/brw_state_dump.c
+++ b/src/mesa/drivers/dri/i965/brw_state_dump.c
@@ -110,7 +110,7 @@ static void dump_wm_surface_state(struct brw_context *brw)
 
    for (i = 0; i < brw->wm.nr_surfaces; i++) {
       unsigned int surfoff;
-      struct brw_surface_state *surf;
+      uint32_t *surf;
       char name[20];
 
       if (brw->wm.surf_offset[i] == 0) {
@@ -118,21 +118,26 @@ static void dump_wm_surface_state(struct brw_context *brw)
 	 continue;
       }
       surfoff = bo->offset + brw->wm.surf_offset[i];
-      surf = (struct brw_surface_state *)(base + brw->wm.surf_offset[i]);
+      surf = (uint32_t *)(base + brw->wm.surf_offset[i]);
 
       sprintf(name, "WM SURF%d", i);
       state_out(name, surf, surfoff, 0, "%s %s\n",
-		get_965_surfacetype(surf->ss0.surface_type),
-		get_965_surface_format(surf->ss0.surface_format));
+		get_965_surfacetype(GET_FIELD(surf[0], BRW_SURFACE_TYPE)),
+		get_965_surface_format(GET_FIELD(surf[0], BRW_SURFACE_FORMAT)));
       state_out(name, surf, surfoff, 1, "offset\n");
       state_out(name, surf, surfoff, 2, "%dx%d size, %d mips\n",
-		surf->ss2.width + 1, surf->ss2.height + 1, surf->ss2.mip_count);
-      state_out(name, surf, surfoff, 3, "pitch %d, %stiled\n",
-		surf->ss3.pitch + 1, surf->ss3.tiled_surface ? "" : "not ");
+		GET_FIELD(surf[2], BRW_SURFACE_WIDTH) + 1,
+		GET_FIELD(surf[2], BRW_SURFACE_HEIGHT) + 1,
+		(surf[2] >> 2) & 0xf /* mip count: bits 5:2, formerly ss2.mip_count */);
+      state_out(name, surf, surfoff, 3, "pitch %d, %s tiled\n",
+		GET_FIELD(surf[3], BRW_SURFACE_PITCH) + 1,
+		(surf[3] & BRW_SURFACE_TILED) ?
+		((surf[3] & BRW_SURFACE_TILED_Y) ? "Y" : "X") : "not");
       state_out(name, surf, surfoff, 4, "mip base %d\n",
-		surf->ss4.min_lod);
+		GET_FIELD(surf[4], BRW_SURFACE_MIN_LOD));
       state_out(name, surf, surfoff, 5, "x,y offset: %d,%d\n",
-		surf->ss5.x_offset, surf->ss5.y_offset);
+		GET_FIELD(surf[5], BRW_SURFACE_X_OFFSET),
+		GET_FIELD(surf[5], BRW_SURFACE_Y_OFFSET));
    }
    drm_intel_bo_unmap(bo);
 }
diff --git a/src/mesa/drivers/dri/i965/brw_structs.h b/src/mesa/drivers/dri/i965/brw_structs.h
index ad31222e9ec..7b9cdba4cbf 100644
--- a/src/mesa/drivers/dri/i965/brw_structs.h
+++ b/src/mesa/drivers/dri/i965/brw_structs.h
@@ -1225,80 +1225,6 @@ struct gen7_sf_clip_viewport {
    GLfloat pad1[4];
 };
 
-/* Documented in the subsystem/shared-functions/sampler chapter...
- *
- * vol5c Shared Functions - 1.13.4.1.1
- */
-struct brw_surface_state
-{
-   struct {
-      GLuint cube_pos_z:1; 
-      GLuint cube_neg_z:1; 
-      GLuint cube_pos_y:1; 
-      GLuint cube_neg_y:1; 
-      GLuint cube_pos_x:1; 
-      GLuint cube_neg_x:1; 
-      GLuint pad:2;
-      /* Required on gen6 for surfaces accessed through render cache messages.
-       */
-      GLuint render_cache_read_write:1;
-      /* Ironlake and newer: instead of replicating one of the texels */
-      GLuint cube_corner_average:1;
-      GLuint mipmap_layout_mode:1; 
-      GLuint vert_line_stride_ofs:1; 
-      GLuint vert_line_stride:1; 
-      GLuint color_blend:1; 
-      GLuint writedisable_blue:1; 
-      GLuint writedisable_green:1; 
-      GLuint writedisable_red:1; 
-      GLuint writedisable_alpha:1; 
-      GLuint surface_format:9;     /**< BRW_SURFACEFORMAT_x */
-      GLuint data_return_format:1; 
-      GLuint pad0:1;
-      GLuint surface_type:3;       /**< BRW_SURFACE_1D/2D/3D/CUBE */
-   } ss0;
-   
-   struct {
-      GLuint base_addr;  
-   } ss1;
-   
-   struct {
-      GLuint pad:2;
-      GLuint mip_count:4; 
-      GLuint width:13; 
-      GLuint height:13; 
-   } ss2;
-
-   struct {
-      GLuint tile_walk:1; 
-      GLuint tiled_surface:1; 
-      GLuint pad:1; 
-      GLuint pitch:18; 
-      GLuint depth:11; 
-   } ss3;
-   
-   struct {
-      GLuint multisample_position_palette_index:3;
-      GLuint pad1:1;
-      GLuint num_multisamples:3;
-      GLuint pad0:1;
-      GLuint render_target_view_extent:9;
-      GLuint min_array_elt:11;
-      GLuint min_lod:4; 
-   } ss4;
-
-   struct {
-      GLuint pad1:16;
-      GLuint cache_control:2;
-      GLuint gfdt:1;
-      GLuint encrypt:1;
-      GLuint y_offset:4;
-      GLuint pad0:1;
-      GLuint x_offset:7;
-   } ss5;   /* New in G4X */
-
-};
-
 /* volume 5c Shared Functions - 1.13.4.1.2 */
 struct gen7_surface_state
 {
diff --git a/src/mesa/drivers/dri/i965/brw_tex_layout.c b/src/mesa/drivers/dri/i965/brw_tex_layout.c
index 9ac0713a1d3..4a3a2bfada2 100644
--- a/src/mesa/drivers/dri/i965/brw_tex_layout.c
+++ b/src/mesa/drivers/dri/i965/brw_tex_layout.c
@@ -61,7 +61,7 @@ GLboolean brw_miptree_layout(struct intel_context *intel,
 	   */
 	  h0 = ALIGN(mt->height0, align_h);
 	  h1 = ALIGN(minify(h0), align_h);
-	  qpitch = (h0 + h1 + 11 * align_h);
+	  qpitch = (h0 + h1 + (intel->gen >= 7 ? 12 : 11) * align_h);
           if (mt->compressed)
 	     qpitch /= 4;
 
@@ -152,9 +152,6 @@ GLboolean brw_miptree_layout(struct intel_context *intel,
        * in the texture surfaces run, so they may be "vertical" through
        * memory.  As a result, the docs say in Surface Padding Requirements:
        * Sampling Engine Surfaces that two extra rows of padding are required.
-       * We don't know of similar requirements for pre-965, but given that
-       * those docs are silent on padding requirements in general, let's play
-       * it safe.
        */
       if (mt->target == GL_TEXTURE_CUBE_MAP)
 	 mt->total_height += 2;
diff --git a/src/mesa/drivers/dri/i965/brw_vs_constval.c b/src/mesa/drivers/dri/i965/brw_vs_constval.c
index 47cc0a7da7a..9fdfebe9f76 100644
--- a/src/mesa/drivers/dri/i965/brw_vs_constval.c
+++ b/src/mesa/drivers/dri/i965/brw_vs_constval.c
@@ -194,11 +194,19 @@ static void calc_wm_input_sizes( struct brw_context *brw )
    /* BRW_NEW_VERTEX_PROGRAM */
    const struct brw_vertex_program *vp =
       brw_vertex_program_const(brw->vertex_program);
+   /* BRW_NEW_FRAGMENT_PROGRAM */
+   struct gl_shader_program *prog = ctx->Shader.CurrentFragmentProgram;
    /* BRW_NEW_INPUT_DIMENSIONS */
    struct tracker t;
    GLuint insn;
    GLuint i;
 
+   /* If we're going to go through brw_fs.cpp, we don't end up using
+    * brw->wm.input_size_masks.
+    */
+   if (prog && prog->_LinkedShaders[MESA_SHADER_FRAGMENT])
+      return;
+
    memset(&t, 0, sizeof(t));
 
    /* _NEW_LIGHT */
@@ -238,7 +246,9 @@ static void calc_wm_input_sizes( struct brw_context *brw )
 const struct brw_tracked_state brw_wm_input_sizes = {
    .dirty = {
       .mesa  = _NEW_LIGHT,
-      .brw   = BRW_NEW_VERTEX_PROGRAM | BRW_NEW_INPUT_DIMENSIONS,
+      .brw   = (BRW_NEW_FRAGMENT_PROGRAM |
+		BRW_NEW_VERTEX_PROGRAM |
+		BRW_NEW_INPUT_DIMENSIONS),
       .cache = 0
    },
    .prepare = calc_wm_input_sizes
diff --git a/src/mesa/drivers/dri/i965/brw_vtbl.c b/src/mesa/drivers/dri/i965/brw_vtbl.c
index 9f99ef57214..69650e1df77 100644
--- a/src/mesa/drivers/dri/i965/brw_vtbl.c
+++ b/src/mesa/drivers/dri/i965/brw_vtbl.c
@@ -136,6 +136,16 @@ static void brw_invalidate_state( struct intel_context *intel, GLuint new_state
    /* nothing */
 }
 
+/**
+ * \see intel_context.vtbl.is_hiz_depth_format
+ */
+static bool brw_is_hiz_depth_format(struct intel_context *intel,
+                                    gl_format format)
+{
+   /* Only X8_Z24 today; in the future, this will support Z_FLOAT32. */
+   return intel->has_hiz && (format == MESA_FORMAT_X8_Z24);
+}
+
 
 void brwInitVtbl( struct brw_context *brw )
 {
@@ -152,4 +162,5 @@ void brwInitVtbl( struct brw_context *brw )
    brw->intel.vtbl.set_draw_region = brw_set_draw_region;
    brw->intel.vtbl.debug_batch = brw_debug_batch;
    brw->intel.vtbl.render_target_supported = brw_render_target_supported;
+   brw->intel.vtbl.is_hiz_depth_format = brw_is_hiz_depth_format;
 }
diff --git a/src/mesa/drivers/dri/i965/brw_wm.c b/src/mesa/drivers/dri/i965/brw_wm.c
index 907976295ab..1aebd12df49 100644
--- a/src/mesa/drivers/dri/i965/brw_wm.c
+++ b/src/mesa/drivers/dri/i965/brw_wm.c
@@ -34,6 +34,7 @@
 #include "brw_state.h"
 #include "main/formats.h"
 #include "main/samplerobj.h"
+#include "program/prog_parameter.h"
 
 #include "../glsl/ralloc.h"
 
@@ -115,7 +116,7 @@ brw_wm_non_glsl_emit(struct brw_context *brw, struct brw_wm_compile *c)
    brw_wm_pass2(c);
 
    /* how many general-purpose registers are used */
-   c->prog_data.total_grf = c->max_wm_grf;
+   c->prog_data.reg_blocks = brw_register_blocks(c->max_wm_grf);
 
    /* Emit GEN4 code.
     */
@@ -184,9 +185,10 @@ brw_wm_payload_setup(struct brw_context *brw,
  * Depending on the instructions used (i.e. flow control instructions)
  * we'll use one of two code generators.
  */
-static void do_wm_prog( struct brw_context *brw,
-			struct brw_fragment_program *fp, 
-			struct brw_wm_prog_key *key)
+bool do_wm_prog(struct brw_context *brw,
+		struct gl_shader_program *prog,
+		struct brw_fragment_program *fp,
+		struct brw_wm_prog_key *key)
 {
    struct intel_context *intel = &brw->intel;
    struct brw_wm_compile *c;
@@ -202,7 +204,7 @@ static void do_wm_prog( struct brw_context *brw,
           * without triggering a segfault, no way to signal,
           * so just return.
           */
-         return;
+         return false;
       }
       c->instruction = rzalloc_array(c, struct brw_wm_instruction, BRW_WM_MAX_INSN);
       c->prog_instructions = rzalloc_array(c, struct prog_instruction, BRW_WM_MAX_INSN);
@@ -226,7 +228,10 @@ static void do_wm_prog( struct brw_context *brw,
 
    brw_init_compile(brw, &c->func, c);
 
-   if (!brw_wm_fs_emit(brw, c)) {
+   if (prog && prog->FragmentProgram) {
+      if (!brw_wm_fs_emit(brw, c, prog))
+	 return false;
+   } else {
       /* Fallback for fixed function and ARB_fp shaders. */
       c->dispatch_width = 16;
       brw_wm_payload_setup(brw, c);
@@ -274,6 +279,8 @@ static void do_wm_prog( struct brw_context *brw,
 				      program, program_size,
 				      &c->prog_data, sizeof(c->prog_data),
 				      &brw->wm.prog_data);
+
+   return true;
 }
 
 
@@ -355,9 +362,6 @@ static void brw_wm_populate_key( struct brw_context *brw,
    /* _NEW_LIGHT */
    key->flat_shade = (ctx->Light.ShadeModel == GL_FLAT);
 
-   /* _NEW_HINT */
-   key->linear_color = (ctx->Hint.PerspectiveCorrection == GL_FASTEST);
-
    /* _NEW_FRAG_CLAMP | _NEW_BUFFERS */
    key->clamp_fragment_color = ctx->Color._ClampFragmentColor;
 
@@ -426,9 +430,6 @@ static void brw_wm_populate_key( struct brw_context *brw,
       }
    }
 
-   /* Shadow */
-   key->shadowtex_mask = fp->program.Base.ShadowSamplers;
-
    /* _NEW_BUFFERS */
    /*
     * Include the draw buffer origin and height so that we can calculate
@@ -468,6 +469,8 @@ static void brw_wm_populate_key( struct brw_context *brw,
 
 static void brw_prepare_wm_prog(struct brw_context *brw)
 {
+   struct intel_context *intel = &brw->intel;
+   struct gl_context *ctx = &intel->ctx;
    struct brw_wm_prog_key key;
    struct brw_fragment_program *fp = (struct brw_fragment_program *)
       brw->fragment_program;
@@ -480,8 +483,11 @@ static void brw_prepare_wm_prog(struct brw_context *brw)
    brw->wm.prog_bo = brw_search_cache(&brw->cache, BRW_WM_PROG,
 				      &key, sizeof(key),
 				      &brw->wm.prog_data);
-   if (brw->wm.prog_bo == NULL)
-      do_wm_prog(brw, fp, &key);
+   if (brw->wm.prog_bo == NULL) {
+      bool success = do_wm_prog(brw, ctx->Shader.CurrentFragmentProgram, fp,
+				&key);
+      assert(success);
+   }
 }
 
 
@@ -489,7 +495,6 @@ const struct brw_tracked_state brw_wm_prog = {
    .dirty = {
       .mesa  = (_NEW_COLOR |
 		_NEW_DEPTH |
-                _NEW_HINT |
 		_NEW_STENCIL |
 		_NEW_POLYGON |
 		_NEW_LINE |
diff --git a/src/mesa/drivers/dri/i965/brw_wm.h b/src/mesa/drivers/dri/i965/brw_wm.h
index 8ab531bdf87..e244b55a083 100644
--- a/src/mesa/drivers/dri/i965/brw_wm.h
+++ b/src/mesa/drivers/dri/i965/brw_wm.h
@@ -59,16 +59,16 @@
 #define AA_ALWAYS    2
 
 struct brw_wm_prog_key {
+   uint8_t iz_lookup;
    GLuint stats_wm:1;
    GLuint flat_shade:1;
-   GLuint linear_color:1;  /**< linear interpolation vs perspective interp */
    GLuint nr_color_regions:5;
    GLuint render_to_fbo:1;
    GLuint alpha_test:1;
    GLuint clamp_fragment_color:1;
+   GLuint line_aa:2;
 
    GLbitfield proj_attrib_mask; /**< one bit per fragment program attribute */
-   GLuint shadowtex_mask:16;
    GLuint yuvtex_mask:16;
    GLuint yuvtex_swap_mask:16;	/* UV swaped */
    uint16_t gl_clamp_mask[3];
@@ -76,8 +76,6 @@ struct brw_wm_prog_key {
    GLushort tex_swizzles[BRW_MAX_TEX_UNIT];
    GLushort drawable_height;
    GLbitfield64 vp_outputs_written;
-   GLuint iz_lookup;
-   GLuint line_aa;
    GLuint program_string_id:32;
 };
 
@@ -314,7 +312,8 @@ void brw_wm_print_program( struct brw_wm_compile *c,
 void brw_wm_lookup_iz(struct intel_context *intel,
 		      struct brw_wm_compile *c);
 
-bool brw_wm_fs_emit(struct brw_context *brw, struct brw_wm_compile *c);
+bool brw_wm_fs_emit(struct brw_context *brw, struct brw_wm_compile *c,
+		    struct gl_shader_program *prog);
 
 /* brw_wm_emit.c */
 void emit_alu1(struct brw_compile *p,
@@ -476,5 +475,9 @@ bool brw_color_buffer_write_enabled(struct brw_context *brw);
 bool brw_render_target_supported(gl_format format);
 void brw_wm_payload_setup(struct brw_context *brw,
 			  struct brw_wm_compile *c);
+bool do_wm_prog(struct brw_context *brw,
+		struct gl_shader_program *prog,
+		struct brw_fragment_program *fp,
+		struct brw_wm_prog_key *key);
 
 #endif
diff --git a/src/mesa/drivers/dri/i965/brw_wm_fp.c b/src/mesa/drivers/dri/i965/brw_wm_fp.c
index 9ddbee2edf4..59dcda7b414 100644
--- a/src/mesa/drivers/dri/i965/brw_wm_fp.c
+++ b/src/mesa/drivers/dri/i965/brw_wm_fp.c
@@ -417,25 +417,14 @@ static void emit_interp( struct brw_wm_compile *c,
 		 src_undef());
       }
       else {
-         if (c->key.linear_color) {
-            emit_op(c,
-                    WM_LINTERP,
-                    dst,
-                    0,
-                    interp,
-                    deltas,
-                    src_undef());
-         }
-         else {
-            /* perspective-corrected color interpolation */
-            emit_op(c,
-                    WM_PINTERP,
-                    dst,
-                    0,
-                    interp,
-                    deltas,
-                    get_pixel_w(c));
-         }
+	 /* perspective-corrected color interpolation */
+	 emit_op(c,
+		 WM_PINTERP,
+		 dst,
+		 0,
+		 interp,
+		 deltas,
+		 get_pixel_w(c));
       }
       break;
    case FRAG_ATTRIB_FOGC:
diff --git a/src/mesa/drivers/dri/i965/brw_wm_state.c b/src/mesa/drivers/dri/i965/brw_wm_state.c
index a356711470a..ef98f8126dc 100644
--- a/src/mesa/drivers/dri/i965/brw_wm_state.c
+++ b/src/mesa/drivers/dri/i965/brw_wm_state.c
@@ -91,8 +91,8 @@ brw_prepare_wm_unit(struct brw_context *brw)
    }
 
    /* CACHE_NEW_WM_PROG */
-   wm->thread0.grf_reg_count = ALIGN(brw->wm.prog_data->total_grf, 16) / 16 - 1;
-   wm->wm9.grf_reg_count_2 = ALIGN(brw->wm.prog_data->total_grf_16, 16) / 16 - 1;
+   wm->thread0.grf_reg_count = brw->wm.prog_data->reg_blocks;
+   wm->wm9.grf_reg_count_2 = brw->wm.prog_data->reg_blocks_16;
    wm->thread0.kernel_start_pointer = brw->wm.prog_bo->offset >> 6; /* reloc */
    /* reloc */
    wm->wm9.kernel_start_pointer_2 = (brw->wm.prog_bo->offset +
diff --git a/src/mesa/drivers/dri/i965/brw_wm_surface_state.c b/src/mesa/drivers/dri/i965/brw_wm_surface_state.c
index de1953ed600..6c1eba69d4b 100644
--- a/src/mesa/drivers/dri/i965/brw_wm_surface_state.c
+++ b/src/mesa/drivers/dri/i965/brw_wm_surface_state.c
@@ -200,22 +200,16 @@ translate_tex_format(gl_format mesa_format,
    }
 }
 
-static void
-brw_set_surface_tiling(struct brw_surface_state *surf, uint32_t tiling)
+static uint32_t
+brw_get_surface_tiling_bits(uint32_t tiling)
 {
    switch (tiling) {
-   case I915_TILING_NONE:
-      surf->ss3.tiled_surface = 0;
-      surf->ss3.tile_walk = 0;
-      break;
    case I915_TILING_X:
-      surf->ss3.tiled_surface = 1;
-      surf->ss3.tile_walk = BRW_TILEWALK_XMAJOR;
-      break;
+      return BRW_SURFACE_TILED; /* tiled surface, X-major tile walk */
    case I915_TILING_Y:
-      surf->ss3.tiled_surface = 1;
-      surf->ss3.tile_walk = BRW_TILEWALK_YMAJOR;
-      break;
+      return BRW_SURFACE_TILED | BRW_SURFACE_TILED_Y; /* tiled, Y-major walk */
+   default:
+      return 0; /* I915_TILING_NONE (or anything else): untiled */
    }
 }
 
@@ -228,46 +222,36 @@ brw_update_texture_surface( struct gl_context *ctx, GLuint unit )
    struct gl_texture_image *firstImage = tObj->Image[0][tObj->BaseLevel];
    struct gl_sampler_object *sampler = _mesa_get_samplerobj(ctx, unit);
    const GLuint surf_index = SURF_INDEX_TEXTURE(unit);
-   struct brw_surface_state *surf;
+   uint32_t *surf;
 
-   surf = brw_state_batch(brw, sizeof(*surf), 32,
-			 &brw->wm.surf_offset[surf_index]);
-   memset(surf, 0, sizeof(*surf));
+   surf = brw_state_batch(brw, 6 * 4, 32, &brw->wm.surf_offset[surf_index]);
 
-   surf->ss0.mipmap_layout_mode = BRW_SURFACE_MIPMAPLAYOUT_BELOW;
-   surf->ss0.surface_type = translate_tex_target(tObj->Target);
-   surf->ss0.surface_format = translate_tex_format(firstImage->TexFormat,
-                                                   firstImage->InternalFormat,
-                                                   sampler->DepthMode,
-                                                   sampler->sRGBDecode);
+   surf[0] = (translate_tex_target(tObj->Target) << BRW_SURFACE_TYPE_SHIFT |
+	      BRW_SURFACE_MIPMAPLAYOUT_BELOW << BRW_SURFACE_MIPLAYOUT_SHIFT |
+	      BRW_SURFACE_CUBEFACE_ENABLES |
+	      (translate_tex_format(firstImage->TexFormat,
+				    firstImage->InternalFormat,
+				    sampler->DepthMode,
+				    sampler->sRGBDecode) <<
+	       BRW_SURFACE_FORMAT_SHIFT));
 
-   /* This is ok for all textures with channel width 8bit or less:
-    */
-/*    surf->ss0.data_return_format = BRW_SURFACERETURNFORMAT_S1; */
-   surf->ss1.base_addr = intelObj->mt->region->buffer->offset; /* reloc */
+   surf[1] = intelObj->mt->region->buffer->offset; /* reloc */
 
-   surf->ss2.mip_count = intelObj->_MaxLevel - tObj->BaseLevel;
-   surf->ss2.width = firstImage->Width - 1;
-   surf->ss2.height = firstImage->Height - 1;
-   brw_set_surface_tiling(surf, intelObj->mt->region->tiling);
-   surf->ss3.pitch = (intelObj->mt->region->pitch * intelObj->mt->cpp) - 1;
-   surf->ss3.depth = firstImage->Depth - 1;
+   surf[2] = ((intelObj->_MaxLevel - tObj->BaseLevel) << BRW_SURFACE_LOD_SHIFT |
+	      (firstImage->Width - 1) << BRW_SURFACE_WIDTH_SHIFT |
+	      (firstImage->Height - 1) << BRW_SURFACE_HEIGHT_SHIFT);
 
-   surf->ss4.min_lod = 0;
- 
-   if (tObj->Target == GL_TEXTURE_CUBE_MAP) {
-      surf->ss0.cube_pos_x = 1;
-      surf->ss0.cube_pos_y = 1;
-      surf->ss0.cube_pos_z = 1;
-      surf->ss0.cube_neg_x = 1;
-      surf->ss0.cube_neg_y = 1;
-      surf->ss0.cube_neg_z = 1;
-   }
+   surf[3] = (brw_get_surface_tiling_bits(intelObj->mt->region->tiling) |
+	      (firstImage->Depth - 1) << BRW_SURFACE_DEPTH_SHIFT |
+	      ((intelObj->mt->region->pitch * intelObj->mt->cpp) - 1) <<
+	      BRW_SURFACE_PITCH_SHIFT);
+
+   surf[4] = 0;
+   surf[5] = 0;
 
    /* Emit relocation to surface contents */
    drm_intel_bo_emit_reloc(brw->intel.batch.bo,
-			   brw->wm.surf_offset[surf_index] +
-			   offsetof(struct brw_surface_state, ss1),
+			   brw->wm.surf_offset[surf_index] + 4,
 			   intelObj->mt->region->buffer, 0,
 			   I915_GEM_DOMAIN_SAMPLER, 0);
 }
@@ -284,34 +268,34 @@ brw_create_constant_surface(struct brw_context *brw,
 {
    struct intel_context *intel = &brw->intel;
    const GLint w = width - 1;
-   struct brw_surface_state *surf;
+   uint32_t *surf;
 
-   surf = brw_state_batch(brw, sizeof(*surf), 32, out_offset);
-   memset(surf, 0, sizeof(*surf));
+   surf = brw_state_batch(brw, 6 * 4, 32, out_offset);
 
-   surf->ss0.mipmap_layout_mode = BRW_SURFACE_MIPMAPLAYOUT_BELOW;
-   surf->ss0.surface_type = BRW_SURFACE_BUFFER;
-   surf->ss0.surface_format = BRW_SURFACEFORMAT_R32G32B32A32_FLOAT;
+   surf[0] = (BRW_SURFACE_BUFFER << BRW_SURFACE_TYPE_SHIFT |
+	      BRW_SURFACE_MIPMAPLAYOUT_BELOW << BRW_SURFACE_MIPLAYOUT_SHIFT |
+	      BRW_SURFACEFORMAT_R32G32B32A32_FLOAT << BRW_SURFACE_FORMAT_SHIFT);
 
    if (intel->gen >= 6)
-      surf->ss0.render_cache_read_write = 1;
+      surf[0] |= BRW_SURFACE_RC_READ_WRITE;
+
+   surf[1] = bo->offset; /* reloc */
 
-   assert(bo);
-   surf->ss1.base_addr = bo->offset; /* reloc */
+   surf[2] = ((w & 0x7f) << BRW_SURFACE_WIDTH_SHIFT | /* w is width - 1; bits 6:0 */
+	      ((w >> 7) & 0x1fff) << BRW_SURFACE_HEIGHT_SHIFT); /* bits 19:7 of w */
 
-   surf->ss2.width = w & 0x7f;            /* bits 6:0 of size or width */
-   surf->ss2.height = (w >> 7) & 0x1fff;  /* bits 19:7 of size or width */
-   surf->ss3.depth = (w >> 20) & 0x7f;    /* bits 26:20 of size or width */
-   surf->ss3.pitch = (width * 16) - 1; /* ignored?? */
-   brw_set_surface_tiling(surf, I915_TILING_NONE); /* tiling now allowed */
+   surf[3] = (((w >> 20) & 0x7f) << BRW_SURFACE_DEPTH_SHIFT | /* bits 26:20 of w */
+	      (width * 16 - 1) << BRW_SURFACE_PITCH_SHIFT);
+
+   surf[4] = 0;
+   surf[5] = 0;
 
    /* Emit relocation to surface contents.  Section 5.1.1 of the gen4
     * bspec ("Data Cache") says that the data cache does not exist as
     * a separate cache and is just the sampler cache.
     */
    drm_intel_bo_emit_reloc(brw->intel.batch.bo,
-			   (*out_offset +
-			    offsetof(struct brw_surface_state, ss1)),
+			   *out_offset + 4,
 			   bo, 0,
 			   I915_GEM_DOMAIN_SAMPLER, 0);
 }
@@ -416,23 +400,23 @@ static void
 brw_update_null_renderbuffer_surface(struct brw_context *brw, unsigned int unit)
 {
    struct intel_context *intel = &brw->intel;
-   struct brw_surface_state *surf;
-
-   surf = brw_state_batch(brw, sizeof(*surf), 32,
-			 &brw->wm.surf_offset[unit]);
-   memset(surf, 0, sizeof(*surf));
+   uint32_t *surf;
 
-   surf->ss0.surface_type = BRW_SURFACE_NULL;
-   surf->ss0.surface_format = BRW_SURFACEFORMAT_B8G8R8A8_UNORM;
+   surf = brw_state_batch(brw, 6 * 4, 32, &brw->wm.surf_offset[unit]);
 
+   surf[0] = (BRW_SURFACE_NULL << BRW_SURFACE_TYPE_SHIFT |
+	      BRW_SURFACEFORMAT_B8G8R8A8_UNORM << BRW_SURFACE_FORMAT_SHIFT);
    if (intel->gen < 6) {
-      /* _NEW_COLOR */
-      surf->ss0.color_blend = 0;
-      surf->ss0.writedisable_red =   1;
-      surf->ss0.writedisable_green = 1;
-      surf->ss0.writedisable_blue =  1;
-      surf->ss0.writedisable_alpha = 1;
+      surf[0] |= (1 << BRW_SURFACE_WRITEDISABLE_R_SHIFT |
+		  1 << BRW_SURFACE_WRITEDISABLE_G_SHIFT |
+		  1 << BRW_SURFACE_WRITEDISABLE_B_SHIFT |
+		  1 << BRW_SURFACE_WRITEDISABLE_A_SHIFT);
    }
+   surf[1] = 0;
+   surf[2] = 0;
+   surf[3] = 0;
+   surf[4] = 0;
+   surf[5] = 0;
 }
 
 /**
@@ -449,12 +433,11 @@ brw_update_renderbuffer_surface(struct brw_context *brw,
    struct gl_context *ctx = &intel->ctx;
    struct intel_renderbuffer *irb = intel_renderbuffer(rb);
    struct intel_region *region = irb->region;
-   struct brw_surface_state *surf;
+   uint32_t *surf;
    uint32_t tile_x, tile_y;
+   uint32_t format = 0;
 
-   surf = brw_state_batch(brw, sizeof(*surf), 32,
-			  &brw->wm.surf_offset[unit]);
-   memset(surf, 0, sizeof(*surf));
+   surf = brw_state_batch(brw, 6 * 4, 32, &brw->wm.surf_offset[unit]);
 
    switch (irb->Base.Format) {
    case MESA_FORMAT_XRGB8888:
@@ -465,7 +448,7 @@ brw_update_renderbuffer_surface(struct brw_context *brw,
        * cases where GL_DST_ALPHA (or GL_ONE_MINUS_DST_ALPHA) is
        * used.
        */
-      surf->ss0.surface_format = BRW_SURFACEFORMAT_B8G8R8A8_UNORM;
+      format = BRW_SURFACEFORMAT_B8G8R8A8_UNORM;
       break;
    case MESA_FORMAT_INTENSITY_FLOAT32:
    case MESA_FORMAT_LUMINANCE_FLOAT32:
@@ -473,25 +456,35 @@ brw_update_renderbuffer_surface(struct brw_context *brw,
        * channel into R, which is to say that we just treat them as
        * GL_RED.
        */
-      surf->ss0.surface_format = BRW_SURFACEFORMAT_R32_FLOAT;
+      format = BRW_SURFACEFORMAT_R32_FLOAT;
       break;
    case MESA_FORMAT_SARGB8:
       /* without GL_EXT_framebuffer_sRGB we shouldn't bind sRGB
 	 surfaces to the blend/update as sRGB */
       if (ctx->Color.sRGBEnabled)
-	 surf->ss0.surface_format = brw_format_for_mesa_format(irb->Base.Format);
+	 format = brw_format_for_mesa_format(irb->Base.Format);
       else
-	 surf->ss0.surface_format = BRW_SURFACEFORMAT_B8G8R8A8_UNORM;
+	 format = BRW_SURFACEFORMAT_B8G8R8A8_UNORM;
       break;
    default:
       assert(brw_render_target_supported(irb->Base.Format));
-      surf->ss0.surface_format = brw_format_for_mesa_format(irb->Base.Format);
+      format = brw_format_for_mesa_format(irb->Base.Format);
    }
 
-   surf->ss0.surface_type = BRW_SURFACE_2D;
+   surf[0] = (BRW_SURFACE_2D << BRW_SURFACE_TYPE_SHIFT |
+	      format << BRW_SURFACE_FORMAT_SHIFT);
+
    /* reloc */
-   surf->ss1.base_addr = intel_region_tile_offsets(region, &tile_x, &tile_y);
-   surf->ss1.base_addr += region->buffer->offset; /* reloc */
+   surf[1] = (intel_region_tile_offsets(region, &tile_x, &tile_y) +
+	      region->buffer->offset);
+
+   surf[2] = ((rb->Width - 1) << BRW_SURFACE_WIDTH_SHIFT |
+	      (rb->Height - 1) << BRW_SURFACE_HEIGHT_SHIFT);
+
+   surf[3] = (brw_get_surface_tiling_bits(region->tiling) |
+	      ((region->pitch * region->cpp) - 1) << BRW_SURFACE_PITCH_SHIFT);
+
+   surf[4] = 0;
 
    assert(brw->has_surface_tile_offset || (tile_x == 0 && tile_y == 0));
    /* Note that the low bits of these fields are missing, so
@@ -499,35 +492,35 @@ brw_update_renderbuffer_surface(struct brw_context *brw,
     */
    assert(tile_x % 4 == 0);
    assert(tile_y % 2 == 0);
-   surf->ss5.x_offset = tile_x / 4;
-   surf->ss5.y_offset = tile_y / 2;
-
-   surf->ss2.width = rb->Width - 1;
-   surf->ss2.height = rb->Height - 1;
-   brw_set_surface_tiling(surf, region->tiling);
-   surf->ss3.pitch = (region->pitch * region->cpp) - 1;
+   surf[5] = ((tile_x / 4) << BRW_SURFACE_X_OFFSET_SHIFT |
+	      (tile_y / 2) << BRW_SURFACE_Y_OFFSET_SHIFT);
 
    if (intel->gen < 6) {
       /* _NEW_COLOR */
-      surf->ss0.color_blend = (!ctx->Color._LogicOpEnabled &&
-			      (ctx->Color.BlendEnabled & (1 << unit)));
-      surf->ss0.writedisable_red =   !ctx->Color.ColorMask[unit][0];
-      surf->ss0.writedisable_green = !ctx->Color.ColorMask[unit][1];
-      surf->ss0.writedisable_blue =  !ctx->Color.ColorMask[unit][2];
+      if (!ctx->Color._LogicOpEnabled &&
+	  (ctx->Color.BlendEnabled & (1 << unit)))
+	 surf[0] |= BRW_SURFACE_BLEND_ENABLED;
+
+      if (!ctx->Color.ColorMask[unit][0])
+	 surf[0] |= 1 << BRW_SURFACE_WRITEDISABLE_R_SHIFT;
+      if (!ctx->Color.ColorMask[unit][1])
+	 surf[0] |= 1 << BRW_SURFACE_WRITEDISABLE_G_SHIFT;
+      if (!ctx->Color.ColorMask[unit][2])
+	 surf[0] |= 1 << BRW_SURFACE_WRITEDISABLE_B_SHIFT;
+
       /* As mentioned above, disable writes to the alpha component when the
        * renderbuffer is XRGB.
        */
-      if (ctx->DrawBuffer->Visual.alphaBits == 0)
-	 surf->ss0.writedisable_alpha = 1;
-      else
-	 surf->ss0.writedisable_alpha = !ctx->Color.ColorMask[unit][3];
+      if (ctx->DrawBuffer->Visual.alphaBits == 0 ||
+	  !ctx->Color.ColorMask[unit][3]) {
+	 surf[0] |= 1 << BRW_SURFACE_WRITEDISABLE_A_SHIFT;
+      }
    }
 
    drm_intel_bo_emit_reloc(brw->intel.batch.bo,
-			   brw->wm.surf_offset[unit] +
-			   offsetof(struct brw_surface_state, ss1),
+			   brw->wm.surf_offset[unit] + 4,
 			   region->buffer,
-			   surf->ss1.base_addr - region->buffer->offset,
+			   surf[1] - region->buffer->offset,
 			   I915_GEM_DOMAIN_RENDER,
 			   I915_GEM_DOMAIN_RENDER);
 }
@@ -539,16 +532,14 @@ prepare_wm_surfaces(struct brw_context *brw)
    int i;
    int nr_surfaces = 0;
 
-   if (ctx->DrawBuffer->_NumColorDrawBuffers >= 1) {
-      for (i = 0; i < ctx->DrawBuffer->_NumColorDrawBuffers; i++) {
-	 struct gl_renderbuffer *rb = ctx->DrawBuffer->_ColorDrawBuffers[i];
-	 struct intel_renderbuffer *irb = intel_renderbuffer(rb);
-	 struct intel_region *region = irb ? irb->region : NULL;
+   for (i = 0; i < ctx->DrawBuffer->_NumColorDrawBuffers; i++) {
+      struct gl_renderbuffer *rb = ctx->DrawBuffer->_ColorDrawBuffers[i];
+      struct intel_renderbuffer *irb = intel_renderbuffer(rb);
+      struct intel_region *region = irb ? irb->region : NULL;
 
-	 if (region)
-	    brw_add_validated_bo(brw, region->buffer);
-	 nr_surfaces = SURF_INDEX_DRAW(i) + 1;
-      }
+      if (region)
+	 brw_add_validated_bo(brw, region->buffer);
+      nr_surfaces = SURF_INDEX_DRAW(i) + 1;
    }
 
    if (brw->wm.const_bo) {
@@ -558,10 +549,11 @@ prepare_wm_surfaces(struct brw_context *brw)
 
    for (i = 0; i < BRW_MAX_TEX_UNIT; i++) {
       const struct gl_texture_unit *texUnit = &ctx->Texture.Unit[i];
-      struct gl_texture_object *tObj = texUnit->_Current;
-      struct intel_texture_object *intelObj = intel_texture_object(tObj);
 
       if (texUnit->_ReallyEnabled) {
+	 struct gl_texture_object *tObj = texUnit->_Current;
+	 struct intel_texture_object *intelObj = intel_texture_object(tObj);
+
 	 brw_add_validated_bo(brw, intelObj->mt->region->buffer);
 	 nr_surfaces = SURF_INDEX_TEXTURE(i) + 1;
       }
diff --git a/src/mesa/drivers/dri/i965/gen7_vs_state.c b/src/mesa/drivers/dri/i965/gen7_vs_state.c
index 6a7add8e562..ae7a1d6c35c 100644
--- a/src/mesa/drivers/dri/i965/gen7_vs_state.c
+++ b/src/mesa/drivers/dri/i965/gen7_vs_state.c
@@ -33,7 +33,6 @@ static void
 upload_vs_state(struct brw_context *brw)
 {
    struct intel_context *intel = &brw->intel;
-   struct gl_context *ctx = &intel->ctx;
 
    BEGIN_BATCH(2);
    OUT_BATCH(_3DSTATE_BINDING_TABLE_POINTERS_VS << 16 | (2 - 2));
diff --git a/src/mesa/drivers/dri/intel/intel_buffers.c b/src/mesa/drivers/dri/intel/intel_buffers.c
index ee551ef60d4..7eb50edc6b4 100644
--- a/src/mesa/drivers/dri/intel/intel_buffers.c
+++ b/src/mesa/drivers/dri/intel/intel_buffers.c
@@ -93,6 +93,7 @@ intel_draw_buffer(struct gl_context * ctx, struct gl_framebuffer *fb)
    struct intel_context *intel = intel_context(ctx);
    struct intel_region *colorRegions[MAX_DRAW_BUFFERS], *depthRegion = NULL;
    struct intel_renderbuffer *irbDepth = NULL, *irbStencil = NULL;
+   bool fb_has_hiz = intel_framebuffer_has_hiz(fb);
 
    if (!fb) {
       /* this can happen during the initial context initialization */
@@ -166,11 +167,11 @@ intel_draw_buffer(struct gl_context * ctx, struct gl_framebuffer *fb)
 
    /***
     *** Get depth buffer region and check if we need a software fallback.
-    *** Note that the depth buffer is usually a DEPTH_STENCIL buffer.
     ***/
    if (fb->_DepthBuffer && fb->_DepthBuffer->Wrapped) {
       irbDepth = intel_renderbuffer(fb->_DepthBuffer->Wrapped);
       if (irbDepth && irbDepth->region) {
+	 assert(!fb_has_hiz || irbDepth->Base.Format != MESA_FORMAT_S8_Z24);
          FALLBACK(intel, INTEL_FALLBACK_DEPTH_BUFFER, GL_FALSE);
          depthRegion = irbDepth->region;
       }
@@ -187,13 +188,16 @@ intel_draw_buffer(struct gl_context * ctx, struct gl_framebuffer *fb)
 
    /***
     *** Stencil buffer
-    *** This can only be hardware accelerated if we're using a
-    *** combined DEPTH_STENCIL buffer.
     ***/
    if (fb->_StencilBuffer && fb->_StencilBuffer->Wrapped) {
       irbStencil = intel_renderbuffer(fb->_StencilBuffer->Wrapped);
       if (irbStencil && irbStencil->region) {
-         ASSERT(irbStencil->Base.Format == MESA_FORMAT_S8_Z24);
+	 if (!intel->has_separate_stencil)
+	    assert(irbStencil->Base.Format == MESA_FORMAT_S8_Z24);
+	 if (fb_has_hiz || intel->must_use_separate_stencil)
+	    assert(irbStencil->Base.Format == MESA_FORMAT_S8);
+	 if (irbStencil->Base.Format == MESA_FORMAT_S8)
+	    assert(intel->has_separate_stencil);
          FALLBACK(intel, INTEL_FALLBACK_STENCIL_BUFFER, GL_FALSE);
       }
       else {
@@ -208,8 +212,10 @@ intel_draw_buffer(struct gl_context * ctx, struct gl_framebuffer *fb)
    /* If we have a (packed) stencil buffer attached but no depth buffer,
     * we still need to set up the shared depth/stencil state so we can use it.
     */
-   if (depthRegion == NULL && irbStencil && irbStencil->region)
+   if (depthRegion == NULL && irbStencil && irbStencil->region
+       && irbStencil->Base.Format == MESA_FORMAT_S8_Z24) {
       depthRegion = irbStencil->region;
+   }
 
    /*
     * Update depth and stencil test state
@@ -302,18 +308,6 @@ intelReadBuffer(struct gl_context * ctx, GLenum mode)
       if (!was_front_buffer_reading && intel->is_front_buffer_reading)
 	 dri2InvalidateDrawable(intel->driContext->driReadablePriv);
    }
-
-   if (ctx->ReadBuffer == ctx->DrawBuffer) {
-      /* This will update FBO completeness status.
-       * A framebuffer will be incomplete if the GL_READ_BUFFER setting
-       * refers to a missing renderbuffer.  Calling glReadBuffer can set
-       * that straight and can make the drawing buffer complete.
-       */
-      intel_draw_buffer(ctx, ctx->DrawBuffer);
-   }
-   /* Generally, functions which read pixels (glReadPixels, glCopyPixels, etc)
-    * reference ctx->ReadBuffer and do appropriate state checks.
-    */
 }
 
 
diff --git a/src/mesa/drivers/dri/intel/intel_context.c b/src/mesa/drivers/dri/intel/intel_context.c
index 4516db20ffc..2ea52c26106 100644
--- a/src/mesa/drivers/dri/intel/intel_context.c
+++ b/src/mesa/drivers/dri/intel/intel_context.c
@@ -620,6 +620,53 @@ intelInitDriverFunctions(struct dd_function_table *functions)
    intel_init_syncobj_functions(functions);
 }
 
+/**
+ * Override intel->has_hiz with environment variable INTEL_HIZ.
+ *
+ * Valid values for INTEL_HIZ are "0" and "1". If an invalid value is
+ * encountered, a warning is emitted and INTEL_HIZ is ignored.
+ */
+static void
+intel_override_hiz(struct intel_context *intel)
+{
+   const char *s = getenv("INTEL_HIZ");
+   if (!s) {
+      return; /* variable unset: leave intel->has_hiz untouched */
+   } else if (!strncmp("0", s, 2)) { /* n == 2 includes the NUL: exact match */
+      intel->has_hiz = false;
+   } else if (!strncmp("1", s, 2)) {
+      intel->has_hiz = true;
+   } else {
+      _mesa_warning(&intel->ctx,
+                    "env variable INTEL_HIZ=\"%s\" has invalid value and "
+                    "is ignored", s);
+   }
+}
+
+/**
+ * Override intel->has_separate_stencil with environment variable
+ * INTEL_SEPARATE_STENCIL.
+ *
+ * Valid values for INTEL_SEPARATE_STENCIL are "0" and "1". If an invalid
+ * value is encountered, a warning is emitted and INTEL_SEPARATE_STENCIL is
+ * ignored.
+ */
+static void
+intel_override_separate_stencil(struct intel_context *intel)
+{
+   const char *s = getenv("INTEL_SEPARATE_STENCIL");
+   if (!s) {
+      return; /* variable unset: leave intel->has_separate_stencil untouched */
+   } else if (!strncmp("0", s, 2)) { /* n == 2 includes the NUL: exact match */
+      intel->has_separate_stencil = false;
+   } else if (!strncmp("1", s, 2)) {
+      intel->has_separate_stencil = true;
+   } else {
+      _mesa_warning(&intel->ctx,
+                    "env variable INTEL_SEPARATE_STENCIL=\"%s\" has invalid "
+                    "value and is ignored", s);
+   }
+}
 
 GLboolean
 intelInitContext(struct intel_context *intel,
@@ -667,9 +714,14 @@ intelInitContext(struct intel_context *intel,
    if (IS_GEN7(intel->intelScreen->deviceID)) {
       intel->needs_ff_sync = GL_TRUE;
       intel->has_luminance_srgb = GL_TRUE;
+      /* FINISHME: Enable intel->has_separate_stencil on Gen7. */
+      /* FINISHME: Enable intel->must_use_separate_stencil on Gen7. */
+      /* FINISHME: Enable intel->has_hiz on Gen7. */
    } else if (IS_GEN6(intel->intelScreen->deviceID)) {
       intel->needs_ff_sync = GL_TRUE;
       intel->has_luminance_srgb = GL_TRUE;
+      /* FINISHME: Enable intel->has_separate_stencil on Gen6. */
+      /* FINISHME: Enable intel->has_hiz on Gen6. */
    } else if (IS_GEN5(intel->intelScreen->deviceID)) {
       intel->needs_ff_sync = GL_TRUE;
       intel->has_luminance_srgb = GL_TRUE;
@@ -689,6 +741,9 @@ intelInitContext(struct intel_context *intel,
       }
    }
 
+   intel_override_hiz(intel);
+   intel_override_separate_stencil(intel);
+
    memset(&ctx->TextureFormatSupported, 0,
 	  sizeof(ctx->TextureFormatSupported));
    ctx->TextureFormatSupported[MESA_FORMAT_ARGB8888] = GL_TRUE;
@@ -703,7 +758,12 @@ intelInitContext(struct intel_context *intel,
    ctx->TextureFormatSupported[MESA_FORMAT_AL88] = GL_TRUE;
    if (intel->gen >= 4)
       ctx->TextureFormatSupported[MESA_FORMAT_AL1616] = GL_TRUE;
-   ctx->TextureFormatSupported[MESA_FORMAT_S8_Z24] = GL_TRUE;
+
+   /* Depth and stencil */
+   ctx->TextureFormatSupported[MESA_FORMAT_S8_Z24] = !intel->must_use_separate_stencil;
+   ctx->TextureFormatSupported[MESA_FORMAT_X8_Z24] = intel->has_separate_stencil;
+   ctx->TextureFormatSupported[MESA_FORMAT_S8] = intel->has_separate_stencil;
+
    /*
     * This was disabled in initial FBO enabling to avoid combinations
     * of depth+stencil that wouldn't work together.  We since decided
diff --git a/src/mesa/drivers/dri/intel/intel_context.h b/src/mesa/drivers/dri/intel/intel_context.h
index d3a8a659caa..f599861cba8 100644
--- a/src/mesa/drivers/dri/intel/intel_context.h
+++ b/src/mesa/drivers/dri/intel/intel_context.h
@@ -149,6 +149,10 @@ struct intel_context
 
       void (*debug_batch)(struct intel_context *intel);
       bool (*render_target_supported)(gl_format format);
+
+      /** Can HiZ be enabled on a depthbuffer of the given format? */
+      bool (*is_hiz_depth_format)(struct intel_context *intel,
+	                          gl_format format);
    } vtbl;
 
    GLbitfield Fallback;  /**< mask of INTEL_FALLBACK_x bits */
@@ -166,6 +170,9 @@ struct intel_context
    GLboolean is_945;
    GLboolean has_luminance_srgb;
    GLboolean has_xrgb_textures;
+   GLboolean has_separate_stencil;
+   GLboolean must_use_separate_stencil;
+   GLboolean has_hiz;
 
    int urb_size;
 
diff --git a/src/mesa/drivers/dri/intel/intel_fbo.c b/src/mesa/drivers/dri/intel/intel_fbo.c
index ad2468a3237..7434e0efff6 100644
--- a/src/mesa/drivers/dri/intel/intel_fbo.c
+++ b/src/mesa/drivers/dri/intel/intel_fbo.c
@@ -79,6 +79,9 @@ intel_delete_renderbuffer(struct gl_renderbuffer *rb)
    if (intel && irb->region) {
       intel_region_release(&irb->region);
    }
+   if (intel && irb->hiz_region) {
+      intel_region_release(&irb->hiz_region);
+   }
 
    free(irb);
 }
@@ -129,7 +132,12 @@ intel_alloc_renderbuffer_storage(struct gl_context * ctx, struct gl_renderbuffer
    case GL_STENCIL_INDEX8_EXT:
    case GL_STENCIL_INDEX16_EXT:
       /* These aren't actual texture formats, so force them here. */
-      rb->Format = MESA_FORMAT_S8_Z24;
+      if (intel->has_separate_stencil) {
+	 rb->Format = MESA_FORMAT_S8;
+      } else {
+	 assert(!intel->must_use_separate_stencil);
+	 rb->Format = MESA_FORMAT_S8_Z24;
+      }
       break;
    }
 
@@ -143,6 +151,9 @@ intel_alloc_renderbuffer_storage(struct gl_context * ctx, struct gl_renderbuffer
    if (irb->region) {
       intel_region_release(&irb->region);
    }
+   if (irb->hiz_region) {
+      intel_region_release(&irb->hiz_region);
+   }
 
    /* allocate new memory region/renderbuffer */
 
@@ -154,19 +165,54 @@ intel_alloc_renderbuffer_storage(struct gl_context * ctx, struct gl_renderbuffer
       GLenum base_format = _mesa_get_format_base_format(rb->Format);
 
       if (intel->gen >= 4 && (base_format == GL_DEPTH_COMPONENT ||
+			      base_format == GL_STENCIL_INDEX ||
 			      base_format == GL_DEPTH_STENCIL))
 	 tiling = I915_TILING_Y;
       else
 	 tiling = I915_TILING_X;
    }
 
-   irb->region = intel_region_alloc(intel->intelScreen, tiling, cpp,
-				    width, height, GL_TRUE);
+   if (irb->Base.Format == MESA_FORMAT_S8) {
+      /*
+       * The stencil buffer has quirky pitch requirements.  From Vol 2a,
+       * 11.5.6.2.1 3DSTATE_STENCIL_BUFFER, field "Surface Pitch":
+       *    The pitch must be set to 2x the value computed based on width, as
+       *    the stencil buffer is stored with two rows interleaved.
+       * To accomplish this, we resort to the nasty hack of doubling the drm
+       * region's cpp and halving its height.  The halved height is rounded
+       * up so that the last row of an odd-height buffer still has storage.
+       * If we neglect to double the pitch, then drm_intel_gem_bo_map_gtt()
+       * maps the memory incorrectly.
+       */
+      irb->region = intel_region_alloc(intel->intelScreen,
+				       I915_TILING_Y,
+				       cpp * 2,
+				       width,
+				       (height + 1) / 2,
+				       GL_TRUE);
+   } else {
+      irb->region = intel_region_alloc(intel->intelScreen, tiling, cpp,
+				       width, height, GL_TRUE);
+   }
+
    if (!irb->region)
       return GL_FALSE;       /* out of memory? */
 
    ASSERT(irb->region->buffer);
 
+   if (intel->vtbl.is_hiz_depth_format(intel, rb->Format)) {
+      irb->hiz_region = intel_region_alloc(intel->intelScreen,
+                                           I915_TILING_Y,
+                                           irb->region->cpp,
+                                           irb->region->width,
+                                           irb->region->height,
+                                           GL_TRUE);
+      if (!irb->hiz_region) {
+         intel_region_release(&irb->region);
+         return GL_FALSE;
+      }
+   }
+
    rb->Width = width;
    rb->Height = height;
 
@@ -374,6 +420,9 @@ static GLboolean
 intel_update_wrapper(struct gl_context *ctx, struct intel_renderbuffer *irb, 
 		     struct gl_texture_image *texImage)
 {
+   struct intel_context *intel = intel_context(ctx);
+   struct intel_texture_image *intel_image = intel_texture_image(texImage);
+
    if (!intel_span_supports_format(texImage->TexFormat)) {
       DBG("Render to texture BAD FORMAT %s\n",
 	  _mesa_get_format_name(texImage->TexFormat));
@@ -392,6 +441,32 @@ intel_update_wrapper(struct gl_context *ctx, struct intel_renderbuffer *irb,
    irb->Base.Delete = intel_delete_renderbuffer;
    irb->Base.AllocStorage = intel_nop_alloc_storage;
 
+   /* Point the renderbuffer's region to the texture's region. */
+   if (irb->region != intel_image->mt->region) {
+      intel_region_release(&irb->region);
+      intel_region_reference(&irb->region, intel_image->mt->region);
+   }
+
+   /* Allocate the texture's hiz region if necessary. */
+   if (intel->vtbl.is_hiz_depth_format(intel, texImage->TexFormat)
+       && !intel_image->mt->hiz_region) {
+      intel_image->mt->hiz_region =
+         intel_region_alloc(intel->intelScreen,
+                            I915_TILING_Y,
+                            _mesa_get_format_bytes(texImage->TexFormat),
+                            texImage->Width,
+                            texImage->Height,
+                            GL_TRUE);
+      if (!intel_image->mt->hiz_region)
+         return GL_FALSE;
+   }
+
+   /* Point the renderbuffer's hiz region to the texture's hiz region. */
+   if (irb->hiz_region != intel_image->mt->hiz_region) {
+      intel_region_release(&irb->hiz_region);
+      intel_region_reference(&irb->hiz_region, intel_image->mt->hiz_region);
+   }
+
    return GL_TRUE;
 }
 
@@ -497,13 +572,6 @@ intel_render_texture(struct gl_context * ctx,
        att->Texture->Name, newImage->Width, newImage->Height,
        irb->Base.RefCount);
 
-   /* point the renderbufer's region to the texture image region */
-   if (irb->region != intel_image->mt->region) {
-      if (irb->region)
-	 intel_region_release(&irb->region);
-      intel_region_reference(&irb->region, intel_image->mt->region);
-   }
-
    intel_set_draw_offset_for_image(intel_image, att->Zoffset);
    intel_image->used_as_render_target = GL_TRUE;
 
@@ -597,21 +665,33 @@ intel_validate_framebuffer(struct gl_context *ctx, struct gl_framebuffer *fb)
       intel_get_renderbuffer(fb, BUFFER_STENCIL);
    int i;
 
-   if (depthRb && stencilRb && stencilRb != depthRb) {
-      if (fb->Attachment[BUFFER_DEPTH].Type == GL_TEXTURE &&
-	  fb->Attachment[BUFFER_STENCIL].Type == GL_TEXTURE &&
-	  (fb->Attachment[BUFFER_DEPTH].Texture->Name ==
-	   fb->Attachment[BUFFER_STENCIL].Texture->Name)) {
-	 /* OK */
-      } else {
-	 /* we only support combined depth/stencil buffers, not separate
-	  * stencil buffers.
-	  */
-	 DBG("Only supports combined depth/stencil (found %s, %s)\n",
-	     depthRb ? _mesa_get_format_name(depthRb->Base.Format): "NULL",
-	     stencilRb ? _mesa_get_format_name(stencilRb->Base.Format): "NULL");
-	 fb->_Status = GL_FRAMEBUFFER_UNSUPPORTED_EXT;
-      }
+   /*
+    * The depth and stencil renderbuffers are the same renderbuffer or wrap
+    * the same texture.
+    */
+   bool depth_stencil_are_same;
+   if (depthRb && stencilRb && depthRb == stencilRb)
+      depth_stencil_are_same = true;
+   else if (depthRb && stencilRb && depthRb != stencilRb
+	    && (fb->Attachment[BUFFER_DEPTH].Type == GL_TEXTURE)
+	    && (fb->Attachment[BUFFER_STENCIL].Type == GL_TEXTURE)
+	    && (fb->Attachment[BUFFER_DEPTH].Texture->Name
+		== fb->Attachment[BUFFER_STENCIL].Texture->Name))
+      depth_stencil_are_same = true;
+   else
+      depth_stencil_are_same = false;
+
+   bool fb_has_combined_depth_stencil_format =
+     (depthRb && depthRb->Base.Format == MESA_FORMAT_S8_Z24) ||
+     (stencilRb && stencilRb->Base.Format == MESA_FORMAT_S8_Z24);
+
+   bool fb_has_hiz = intel_framebuffer_has_hiz(fb);
+
+   if ((intel->must_use_separate_stencil || fb_has_hiz)
+	 && (depth_stencil_are_same || fb_has_combined_depth_stencil_format)) {
+      fb->_Status = GL_FRAMEBUFFER_UNSUPPORTED_EXT;
+   } else if (!intel->has_separate_stencil && depthRb && stencilRb && !depth_stencil_are_same) {
+      fb->_Status = GL_FRAMEBUFFER_UNSUPPORTED_EXT;
    }
 
    for (i = 0; i < Elements(fb->Attachment); i++) {
diff --git a/src/mesa/drivers/dri/intel/intel_fbo.h b/src/mesa/drivers/dri/intel/intel_fbo.h
index 028f657d12d..212dd9aadc8 100644
--- a/src/mesa/drivers/dri/intel/intel_fbo.h
+++ b/src/mesa/drivers/dri/intel/intel_fbo.h
@@ -28,6 +28,7 @@
 #ifndef INTEL_FBO_H
 #define INTEL_FBO_H
 
+#include <stdbool.h>
 #include "main/formats.h"
 #include "intel_screen.h"
 
@@ -40,6 +41,9 @@ struct intel_renderbuffer
 {
    struct gl_renderbuffer Base;
    struct intel_region *region;
+
+   /** Only used by depth renderbuffers for which HiZ is enabled. */
+   struct intel_region *hiz_region;
 };
 
 
@@ -80,6 +84,29 @@ intel_get_renderbuffer(struct gl_framebuffer *fb, int attIndex)
       return NULL;
 }
 
+/**
+ * Return the HiZ region of the framebuffer's depth attachment, or NULL when
+ * \c fb is NULL, no depth renderbuffer is found, or it has no HiZ region.
+ */
+static INLINE struct intel_region*
+intel_framebuffer_get_hiz_region(struct gl_framebuffer *fb)
+{
+   struct intel_renderbuffer *depth_irb;
+
+   if (!fb)
+      return NULL;
+
+   depth_irb = intel_get_renderbuffer(fb, BUFFER_DEPTH);
+   return depth_irb ? depth_irb->hiz_region : NULL;
+}
+
+/** True when the framebuffer's depth attachment carries a HiZ region. */
+static INLINE bool
+intel_framebuffer_has_hiz(struct gl_framebuffer *fb)
+{
+   return intel_framebuffer_get_hiz_region(fb) != NULL;
+}
+
 
 extern void
 intel_renderbuffer_set_region(struct intel_context *intel,
diff --git a/src/mesa/drivers/dri/intel/intel_mipmap_tree.c b/src/mesa/drivers/dri/intel/intel_mipmap_tree.c
index a3409274fb7..e62905de7c3 100644
--- a/src/mesa/drivers/dri/intel/intel_mipmap_tree.c
+++ b/src/mesa/drivers/dri/intel/intel_mipmap_tree.c
@@ -200,6 +200,7 @@ intel_miptree_release(struct intel_context *intel,
       DBG("%s deleting %p\n", __FUNCTION__, *mt);
 
       intel_region_release(&((*mt)->region));
+      intel_region_release(&((*mt)->hiz_region));
 
       for (i = 0; i < MAX_TEXTURE_LEVELS; i++) {
 	 free((*mt)->level[i].x_offset);
diff --git a/src/mesa/drivers/dri/intel/intel_mipmap_tree.h b/src/mesa/drivers/dri/intel/intel_mipmap_tree.h
index 760a8bce601..325e3916981 100644
--- a/src/mesa/drivers/dri/intel/intel_mipmap_tree.h
+++ b/src/mesa/drivers/dri/intel/intel_mipmap_tree.h
@@ -113,6 +113,20 @@ struct intel_mipmap_tree
     */
    struct intel_region *region;
 
+   /**
+    * This points to an auxiliary hiz region if all of the following hold:
+    *     1. The texture has been attached to an FBO as a depthbuffer.
+    *     2. The texture format is hiz compatible.
+    *     3. The intel context supports hiz.
+    *
+    * When a texture is attached to multiple FBOs, a separate renderbuffer
+    * wrapper is created for each attachment. This necessitates storing the
+    * hiz region in the texture itself instead of the renderbuffer wrapper.
+    *
+    * \see intel_fbo.c:intel_wrap_texture()
+    */
+   struct intel_region *hiz_region;
+
    /* These are also refcounted:
     */
    GLuint refcount;
diff --git a/src/mesa/drivers/dri/intel/intel_span.c b/src/mesa/drivers/dri/intel/intel_span.c
index 16bce20317e..5290342c3e1 100644
--- a/src/mesa/drivers/dri/intel/intel_span.c
+++ b/src/mesa/drivers/dri/intel/intel_span.c
@@ -70,9 +70,6 @@ intel_set_span_functions(struct intel_context *intel,
 
 #define HW_UNLOCK()
 
-/* Convenience macros to avoid typing the address argument over and over */
-#define NO_TILE(_X, _Y) (((_Y) * irb->region->pitch + (_X)) * irb->region->cpp)
-
 /* r5g6b5 color span and pixel functions */
 #define SPANTMP_PIXEL_FMT GL_RGB
 #define SPANTMP_PIXEL_TYPE GL_UNSIGNED_SHORT_5_6_5
diff --git a/src/mesa/drivers/dri/intel/intel_tex_format.c b/src/mesa/drivers/dri/intel/intel_tex_format.c
index befa615d1e6..6890a690ab1 100644
--- a/src/mesa/drivers/dri/intel/intel_tex_format.c
+++ b/src/mesa/drivers/dri/intel/intel_tex_format.c
@@ -22,6 +22,7 @@ intel_mesa_format_to_rb_datatype(gl_format format)
    case MESA_FORMAT_RGB565:
    case MESA_FORMAT_ARGB1555:
    case MESA_FORMAT_ARGB4444:
+   case MESA_FORMAT_S8:
       return GL_UNSIGNED_BYTE;
    case MESA_FORMAT_R16:
    case MESA_FORMAT_RG1616:
diff --git a/src/mesa/drivers/dri/mach64/Makefile b/src/mesa/drivers/dri/mach64/Makefile
index c20fdece297..0474c1a165f 100644
--- a/src/mesa/drivers/dri/mach64/Makefile
+++ b/src/mesa/drivers/dri/mach64/Makefile
@@ -5,6 +5,8 @@ include $(TOP)/configs/current
 
 LIBNAME = mach64_dri.so
 
+include ../Makefile.defines
+
 DRIVER_SOURCES = \
 	mach64_context.c \
 	mach64_ioctl.c \
@@ -25,5 +27,5 @@ C_SOURCES = \
 
 ASM_SOURCES = 
 
-include ../Makefile.template
+include ../Makefile.targets
 
diff --git a/src/mesa/drivers/dri/mga/Makefile b/src/mesa/drivers/dri/mga/Makefile
index 92533bccc29..9948ee767ac 100644
--- a/src/mesa/drivers/dri/mga/Makefile
+++ b/src/mesa/drivers/dri/mga/Makefile
@@ -5,6 +5,8 @@ include $(TOP)/configs/current
 
 LIBNAME = mga_dri.so
 
+include ../Makefile.defines
+
 DRIVER_SOURCES = \
 	mgadd.c \
 	mgaioctl.c \
@@ -27,5 +29,5 @@ C_SOURCES = \
 
 ASM_SOURCES = 
 
-include ../Makefile.template
+include ../Makefile.targets
 
diff --git a/src/mesa/drivers/dri/nouveau/Makefile b/src/mesa/drivers/dri/nouveau/Makefile
index 3b506a91ffa..e485a98761f 100644
--- a/src/mesa/drivers/dri/nouveau/Makefile
+++ b/src/mesa/drivers/dri/nouveau/Makefile
@@ -3,11 +3,10 @@
 TOP = ../../../../..
 include $(TOP)/configs/current
 
-CFLAGS += $(shell pkg-config libdrm libdrm_nouveau --cflags)
-DRI_LIB_DEPS += $(shell pkg-config libdrm_nouveau --libs)
-
 LIBNAME = nouveau_vieux_dri.so
 
+include ../Makefile.defines
+
 DRIVER_SOURCES = \
 	nouveau_screen.c \
 	nouveau_context.c \
@@ -51,7 +50,9 @@ C_SOURCES = \
 
 ASM_SOURCES =
 
+INCLUDES += $(NOUVEAU_CFLAGS)
+DRI_LIB_DEPS += $(NOUVEAU_LIBS)
 
-include ../Makefile.template
+include ../Makefile.targets
 
 symlinks:
diff --git a/src/mesa/drivers/dri/r128/Makefile b/src/mesa/drivers/dri/r128/Makefile
index 8144c9b43ff..8b23ccc8cbe 100644
--- a/src/mesa/drivers/dri/r128/Makefile
+++ b/src/mesa/drivers/dri/r128/Makefile
@@ -5,6 +5,8 @@ include $(TOP)/configs/current
 
 LIBNAME = r128_dri.so
 
+include ../Makefile.defines
+
 DRIVER_SOURCES = \
 	r128_context.c \
 	r128_lock.c \
@@ -25,5 +27,5 @@ C_SOURCES = \
 ASM_SOURCES = 
 
 
-include ../Makefile.template
+include ../Makefile.targets
 
diff --git a/src/mesa/drivers/dri/r200/Makefile b/src/mesa/drivers/dri/r200/Makefile
index 8013768e9fb..4547f7e2ee0 100644
--- a/src/mesa/drivers/dri/r200/Makefile
+++ b/src/mesa/drivers/dri/r200/Makefile
@@ -3,10 +3,10 @@
 TOP = ../../../../..
 include $(TOP)/configs/current
 
-CFLAGS += $(RADEON_CFLAGS)
-
 LIBNAME = r200_dri.so
 
+include ../Makefile.defines
+
 ifeq ($(RADEON_LDFLAGS),)
 CS_SOURCES = radeon_cs_space_drm.c radeon_bo.c radeon_cs.c
 endif
@@ -55,12 +55,13 @@ X86_SOURCES =
 
 DRIVER_DEFINES = -DRADEON_R200
 
+INCLUDES += $(RADEON_CFLAGS)
 DRI_LIB_DEPS += $(RADEON_LDFLAGS)
 
 ##### TARGETS #####
 
 
-include ../Makefile.template
+include ../Makefile.targets
 
 #INCLUDES += -I../radeon/server
 
diff --git a/src/mesa/drivers/dri/r300/Makefile b/src/mesa/drivers/dri/r300/Makefile
index 2245998c952..9f23a8496aa 100644
--- a/src/mesa/drivers/dri/r300/Makefile
+++ b/src/mesa/drivers/dri/r300/Makefile
@@ -3,17 +3,17 @@
 TOP = ../../../../..
 include $(TOP)/configs/current
 
-CFLAGS += $(RADEON_CFLAGS)
-
 LIBNAME = r300_dri.so
 
+include ../Makefile.defines
+
 ifeq ($(RADEON_LDFLAGS),)
 CS_SOURCES = radeon_cs_space_drm.c radeon_bo.c radeon_cs.c
 endif
 
 COMMON_SOURCES = \
 	../../common/driverfuncs.c \
-	../common/mm.c \
+	../common/drirenderbuffer.c \
 	../common/utils.c \
 	../common/texmem.c \
 	../common/vblank.c \
@@ -64,6 +64,7 @@ C_SOURCES = $(COMMON_SOURCES) $(DRIVER_SOURCES)
 DRIVER_DEFINES = -DRADEON_R300
 #	-DRADEON_BO_TRACK \
 
+INCLUDES += $(RADEON_CFLAGS)
 DRI_LIB_DEPS += $(RADEON_LDFLAGS)
 
 SUBDIRS = compiler
@@ -73,5 +74,5 @@ EXTRA_MODULES = compiler/libr300compiler.a
 
 ##### TARGETS #####
 
-include ../Makefile.template
+include ../Makefile.targets
 
diff --git a/src/mesa/drivers/dri/r300/compiler/Makefile b/src/mesa/drivers/dri/r300/compiler/Makefile
index 4bedfacd632..5aa13329ac2 100644
--- a/src/mesa/drivers/dri/r300/compiler/Makefile
+++ b/src/mesa/drivers/dri/r300/compiler/Makefile
@@ -38,7 +38,9 @@ C_SOURCES = \
 		r3xx_vertprog.c \
 		r3xx_vertprog_dump.c \
 		\
-		memory_pool.c
+		memory_pool.c \
+		$(TOP)/src/glsl/ralloc.c \
+		$(TOP)/src/mesa/program/register_allocate.c
 
 
 ### Basic defines ###
@@ -52,6 +54,7 @@ INCLUDES = \
 	-I$(TOP)/include \
 	-I$(TOP)/src/mesa \
 	-I$(TOP)/src/glsl \
+	-I$(TOP)/src/mapi
 
 
 ##### TARGETS #####
diff --git a/src/mesa/drivers/dri/r300/compiler/SConscript b/src/mesa/drivers/dri/r300/compiler/SConscript
index 9931537492e..2c748b6e214 100755
--- a/src/mesa/drivers/dri/r300/compiler/SConscript
+++ b/src/mesa/drivers/dri/r300/compiler/SConscript
@@ -4,6 +4,7 @@ env = env.Clone()
 env.Append(CPPPATH = '#/include')
 env.Append(CPPPATH = '#/src/mesa')
 env.Append(CPPPATH = '#/src/glsl')
+env.Append(CPPPATH = '#/src/mapi')
 
 # temporary fix
 env['CFLAGS'] = str(env['CFLAGS']).replace('-Werror=declaration-after-statement', '')
@@ -43,6 +44,8 @@ r300compiler = env.ConvenienceLibrary(
         'r3xx_vertprog.c',
         'r3xx_vertprog_dump.c',
         'memory_pool.c',
+	'#/src/glsl/ralloc.c',
+	'#/src/mesa/program/register_allocate.c'
     ])
 
 Return('r300compiler')
diff --git a/src/mesa/drivers/dri/r300/r300_state.c b/src/mesa/drivers/dri/r300/r300_state.c
index da6c8b602e1..93a0dd168f4 100644
--- a/src/mesa/drivers/dri/r300/r300_state.c
+++ b/src/mesa/drivers/dri/r300/r300_state.c
@@ -1619,7 +1619,6 @@ static void r500SetupRSUnit(struct gl_context * ctx)
 		WARN_ONCE("Don't know how to satisfy InputsRead=0x%08x\n", InputsRead);
 }
 
-#define MIN3(a, b, c)	((a) < (b) ? MIN2(a, c) : MIN2(b, c))
 
 void r300VapCntl(r300ContextPtr rmesa, GLuint input_count,
 			GLuint output_count, GLuint temp_count)
diff --git a/src/mesa/drivers/dri/r600/Makefile b/src/mesa/drivers/dri/r600/Makefile
index 200bc20f647..bec0b5a53fe 100644
--- a/src/mesa/drivers/dri/r600/Makefile
+++ b/src/mesa/drivers/dri/r600/Makefile
@@ -3,17 +3,17 @@
 TOP = ../../../../..
 include $(TOP)/configs/current
 
-CFLAGS += $(RADEON_CFLAGS)
-
 LIBNAME = r600_dri.so
 
+include ../Makefile.defines
+
 ifeq ($(RADEON_LDFLAGS),)
 CS_SOURCES = radeon_cs_space_drm.c radeon_bo.c radeon_cs.c
 endif
 
 COMMON_SOURCES = \
 	../../common/driverfuncs.c \
-	../common/mm.c \
+	../common/drirenderbuffer.c \
 	../common/utils.c \
 	../common/texmem.c \
 	../common/vblank.c \
@@ -78,9 +78,10 @@ C_SOURCES = $(COMMON_SOURCES) $(DRIVER_SOURCES)
 DRIVER_DEFINES = -DRADEON_R600
 #	-DRADEON_BO_TRACK \
 
+INCLUDES += $(RADEON_CFLAGS)
 DRI_LIB_DEPS += $(RADEON_LDFLAGS)
 
 ##### TARGETS #####
 
-include ../Makefile.template
+include ../Makefile.targets
 
diff --git a/src/mesa/drivers/dri/r600/evergreen_blit.c b/src/mesa/drivers/dri/r600/evergreen_blit.c
index 0e4da5499ba..95ac4aee5ce 100644
--- a/src/mesa/drivers/dri/r600/evergreen_blit.c
+++ b/src/mesa/drivers/dri/r600/evergreen_blit.c
@@ -1525,6 +1525,48 @@ eg_set_default_state(context_t *context)
 	    num_hs_stack_entries = 42;
 	    num_ls_stack_entries = 42;
 	    break;
+    case CHIP_FAMILY_SUMO:
+	    num_ps_gprs = 93;
+	    num_vs_gprs = 46;
+	    num_temp_gprs = 4;
+	    num_gs_gprs = 31;
+	    num_es_gprs = 31;
+	    num_hs_gprs = 23;
+	    num_ls_gprs = 23;
+	    num_ps_threads = 96;
+	    num_vs_threads = 25;
+	    num_gs_threads = 25;
+	    num_es_threads = 25;
+	    num_hs_threads = 25;
+	    num_ls_threads = 25;
+	    num_ps_stack_entries = 42;
+	    num_vs_stack_entries = 42;
+	    num_gs_stack_entries = 42;
+	    num_es_stack_entries = 42;
+	    num_hs_stack_entries = 42;
+	    num_ls_stack_entries = 42;
+	    break;
+    case CHIP_FAMILY_SUMO2:
+	    num_ps_gprs = 93;
+	    num_vs_gprs = 46;
+	    num_temp_gprs = 4;
+	    num_gs_gprs = 31;
+	    num_es_gprs = 31;
+	    num_hs_gprs = 23;
+	    num_ls_gprs = 23;
+	    num_ps_threads = 96;
+	    num_vs_threads = 25;
+	    num_gs_threads = 25;
+	    num_es_threads = 25;
+	    num_hs_threads = 25;
+	    num_ls_threads = 25;
+	    num_ps_stack_entries = 85;
+	    num_vs_stack_entries = 85;
+	    num_gs_stack_entries = 85;
+	    num_es_stack_entries = 85;
+	    num_hs_stack_entries = 85;
+	    num_ls_stack_entries = 85;
+	    break;
     case CHIP_FAMILY_BARTS:
 	    num_ps_gprs = 93;
 	    num_vs_gprs = 46;
@@ -1592,6 +1634,8 @@ eg_set_default_state(context_t *context)
 
     if ((context->radeon.radeonScreen->chip_family == CHIP_FAMILY_CEDAR) ||
 	(context->radeon.radeonScreen->chip_family == CHIP_FAMILY_PALM) ||
+	(context->radeon.radeonScreen->chip_family == CHIP_FAMILY_SUMO) ||
+	(context->radeon.radeonScreen->chip_family == CHIP_FAMILY_SUMO2) ||
 	(context->radeon.radeonScreen->chip_family == CHIP_FAMILY_CAICOS))
 	    CLEARbit(sq_config, EG_SQ_CONFIG__VC_ENABLE_bit);
     else
diff --git a/src/mesa/drivers/dri/r600/evergreen_chip.c b/src/mesa/drivers/dri/r600/evergreen_chip.c
index 42566e537a5..388a96ff067 100644
--- a/src/mesa/drivers/dri/r600/evergreen_chip.c
+++ b/src/mesa/drivers/dri/r600/evergreen_chip.c
@@ -287,7 +287,9 @@ static void evergreenSetupVTXConstants(struct gl_context  * ctx,
 	    return;
 
     if ((context->radeon.radeonScreen->chip_family == CHIP_FAMILY_CEDAR) ||
-	(context->radeon.radeonScreen->chip_family == CHIP_FAMILY_PALM))
+	(context->radeon.radeonScreen->chip_family == CHIP_FAMILY_PALM) ||
+	(context->radeon.radeonScreen->chip_family == CHIP_FAMILY_SUMO) ||
+	(context->radeon.radeonScreen->chip_family == CHIP_FAMILY_SUMO2))
 	    r700SyncSurf(context, paos->bo, RADEON_GEM_DOMAIN_GTT, 0, TC_ACTION_ENA_bit);
     else
 	    r700SyncSurf(context, paos->bo, RADEON_GEM_DOMAIN_GTT, 0, VC_ACTION_ENA_bit);
diff --git a/src/mesa/drivers/dri/r600/evergreen_state.c b/src/mesa/drivers/dri/r600/evergreen_state.c
index 309c93fe088..81bf1172dc3 100644
--- a/src/mesa/drivers/dri/r600/evergreen_state.c
+++ b/src/mesa/drivers/dri/r600/evergreen_state.c
@@ -1470,6 +1470,22 @@ static void evergreenInitSQConfig(struct gl_context * ctx)
         uMaxThreads = 192;
         uMaxStackEntries = 256;
 	    break;
+    case CHIP_FAMILY_SUMO:
+	    uSqNumCfInsts       = 2;
+        bVC_ENABLE = GL_FALSE;
+        uMaxGPRs = 256;
+        uPSThreadCount = 96;
+        uMaxThreads = 248;
+        uMaxStackEntries = 256;
+	    break;
+    case CHIP_FAMILY_SUMO2:
+	    uSqNumCfInsts       = 2;
+        bVC_ENABLE = GL_FALSE;
+        uMaxGPRs = 256;
+        uPSThreadCount = 96;
+        uMaxThreads = 248;
+        uMaxStackEntries = 512;
+	    break;
     case CHIP_FAMILY_BARTS:
 	    uSqNumCfInsts       = 2;
         bVC_ENABLE = GL_TRUE;
diff --git a/src/mesa/drivers/dri/radeon/Makefile b/src/mesa/drivers/dri/radeon/Makefile
index 93219e40afd..6b5d3335452 100644
--- a/src/mesa/drivers/dri/radeon/Makefile
+++ b/src/mesa/drivers/dri/radeon/Makefile
@@ -4,10 +4,10 @@
 TOP = ../../../../..
 include $(TOP)/configs/current
 
-CFLAGS += $(RADEON_CFLAGS)
-
 LIBNAME = radeon_dri.so
 
+include ../Makefile.defines
+
 ifeq ($(RADEON_LDFLAGS),)
 CS_SOURCES = radeon_cs_space_drm.c radeon_bo.c radeon_cs.c
 endif
@@ -53,9 +53,10 @@ C_SOURCES = \
 
 DRIVER_DEFINES = -DRADEON_R100
 
+INCLUDES += $(RADEON_CFLAGS)
 DRI_LIB_DEPS += $(RADEON_LDFLAGS)
 
 X86_SOURCES = 
 
-include ../Makefile.template
+include ../Makefile.targets
 
diff --git a/src/mesa/drivers/dri/radeon/radeon_chipset.h b/src/mesa/drivers/dri/radeon/radeon_chipset.h
index 9145023826e..bd236625122 100644
--- a/src/mesa/drivers/dri/radeon/radeon_chipset.h
+++ b/src/mesa/drivers/dri/radeon/radeon_chipset.h
@@ -450,6 +450,18 @@
 #define PCI_CHIP_PALM_9806              0x9806
 #define PCI_CHIP_PALM_9807              0x9807
 
+#define PCI_CHIP_SUMO_9640              0x9640
+#define PCI_CHIP_SUMO_9641              0x9641
+#define PCI_CHIP_SUMO2_9642             0x9642
+#define PCI_CHIP_SUMO2_9643             0x9643
+#define PCI_CHIP_SUMO2_9644             0x9644
+#define PCI_CHIP_SUMO2_9645             0x9645
+#define PCI_CHIP_SUMO_9647              0x9647
+#define PCI_CHIP_SUMO_9648              0x9648
+#define PCI_CHIP_SUMO_964A              0x964a
+#define PCI_CHIP_SUMO_964E              0x964e
+#define PCI_CHIP_SUMO_964F              0x964f
+
 #define PCI_CHIP_BARTS_6720             0x6720
 #define PCI_CHIP_BARTS_6721             0x6721
 #define PCI_CHIP_BARTS_6722             0x6722
@@ -534,6 +546,8 @@ enum {
    CHIP_FAMILY_CYPRESS,
    CHIP_FAMILY_HEMLOCK,
    CHIP_FAMILY_PALM,
+   CHIP_FAMILY_SUMO,
+   CHIP_FAMILY_SUMO2,
    CHIP_FAMILY_BARTS,
    CHIP_FAMILY_TURKS,
    CHIP_FAMILY_CAICOS,
diff --git a/src/mesa/drivers/dri/radeon/radeon_common_context.c b/src/mesa/drivers/dri/radeon/radeon_common_context.c
index 5d7b3973d57..bf8925f61d0 100644
--- a/src/mesa/drivers/dri/radeon/radeon_common_context.c
+++ b/src/mesa/drivers/dri/radeon/radeon_common_context.c
@@ -98,6 +98,8 @@ static const char* get_chip_family_name(int chip_family)
 	case CHIP_FAMILY_CYPRESS: return "CYPRESS";
 	case CHIP_FAMILY_HEMLOCK: return "HEMLOCK";
 	case CHIP_FAMILY_PALM: return "PALM";
+	case CHIP_FAMILY_SUMO: return "SUMO";
+	case CHIP_FAMILY_SUMO2: return "SUMO2";
 	case CHIP_FAMILY_BARTS: return "BARTS";
 	case CHIP_FAMILY_TURKS: return "TURKS";
 	case CHIP_FAMILY_CAICOS: return "CAICOS";
diff --git a/src/mesa/drivers/dri/radeon/radeon_screen.c b/src/mesa/drivers/dri/radeon/radeon_screen.c
index 6449229e088..6cf843406f9 100644
--- a/src/mesa/drivers/dri/radeon/radeon_screen.c
+++ b/src/mesa/drivers/dri/radeon/radeon_screen.c
@@ -1168,6 +1168,25 @@ static int radeon_set_screen_flags(radeonScreenPtr screen, int device_id)
        screen->chip_flags = RADEON_CHIPSET_TCL;
        break;
 
+    case PCI_CHIP_SUMO_9640:
+    case PCI_CHIP_SUMO_9641:
+    case PCI_CHIP_SUMO_9647:
+    case PCI_CHIP_SUMO_9648:
+    case PCI_CHIP_SUMO_964A:
+    case PCI_CHIP_SUMO_964E:
+    case PCI_CHIP_SUMO_964F:
+       screen->chip_family = CHIP_FAMILY_SUMO;
+       screen->chip_flags = RADEON_CHIPSET_TCL;
+       break;
+
+    case PCI_CHIP_SUMO2_9642:
+    case PCI_CHIP_SUMO2_9643:
+    case PCI_CHIP_SUMO2_9644:
+    case PCI_CHIP_SUMO2_9645:
+       screen->chip_family = CHIP_FAMILY_SUMO2;
+       screen->chip_flags = RADEON_CHIPSET_TCL;
+       break;
+
    case PCI_CHIP_BARTS_6720:
    case PCI_CHIP_BARTS_6721:
    case PCI_CHIP_BARTS_6722:
diff --git a/src/mesa/drivers/dri/savage/Makefile b/src/mesa/drivers/dri/savage/Makefile
index 53511552c6d..03be3468da9 100644
--- a/src/mesa/drivers/dri/savage/Makefile
+++ b/src/mesa/drivers/dri/savage/Makefile
@@ -5,6 +5,8 @@ include $(TOP)/configs/current
 
 LIBNAME = savage_dri.so
 
+include ../Makefile.defines
+
 DRIVER_SOURCES = \
 	savage_xmesa.c \
 	savagedd.c \
@@ -22,5 +24,5 @@ C_SOURCES = \
 ASM_SOURCES = 
 
 
-include ../Makefile.template
+include ../Makefile.targets
 
diff --git a/src/mesa/drivers/dri/sis/Makefile b/src/mesa/drivers/dri/sis/Makefile
index 6b4f938bab3..0e0bf0d4eac 100644
--- a/src/mesa/drivers/dri/sis/Makefile
+++ b/src/mesa/drivers/dri/sis/Makefile
@@ -5,6 +5,8 @@ include $(TOP)/configs/current
 
 LIBNAME = sis_dri.so
 
+include ../Makefile.defines
+
 DRIVER_SOURCES = \
 	sis6326_state.c \
 	sis6326_clear.c \
@@ -28,5 +30,5 @@ C_SOURCES = \
 
 ASM_SOURCES = 
 
-include ../Makefile.template
+include ../Makefile.targets
 
diff --git a/src/mesa/drivers/dri/swrast/Makefile b/src/mesa/drivers/dri/swrast/Makefile
index d2cf6dbc55b..4cb99fd0eb7 100644
--- a/src/mesa/drivers/dri/swrast/Makefile
+++ b/src/mesa/drivers/dri/swrast/Makefile
@@ -5,6 +5,8 @@ include $(TOP)/configs/current
 
 LIBNAME = swrast_dri.so
 
+include ../Makefile.defines
+
 DRIVER_DEFINES = -D__NOT_HAVE_DRM_H
 
 DRIVER_SOURCES = \
@@ -22,5 +24,5 @@ SWRAST_COMMON_SOURCES = \
 	../common/utils.c \
 	../common/drisw_util.c
 
-include ../Makefile.template
+include ../Makefile.targets
 
diff --git a/src/mesa/drivers/dri/tdfx/Makefile b/src/mesa/drivers/dri/tdfx/Makefile
index 96bd8f8202f..ed84df20925 100644
--- a/src/mesa/drivers/dri/tdfx/Makefile
+++ b/src/mesa/drivers/dri/tdfx/Makefile
@@ -5,6 +5,8 @@ include $(TOP)/configs/current
 
 LIBNAME = tdfx_dri.so
 
+include ../Makefile.defines
+
 DRIVER_SOURCES = \
 	tdfx_context.c \
 	tdfx_dd.c \
@@ -27,6 +29,6 @@ C_SOURCES = \
 
 ASM_SOURCES = 
 
-include ../Makefile.template
+include ../Makefile.targets
 
 
diff --git a/src/mesa/drivers/dri/unichrome/Makefile b/src/mesa/drivers/dri/unichrome/Makefile
index 14cf9f30386..373da6016e4 100644
--- a/src/mesa/drivers/dri/unichrome/Makefile
+++ b/src/mesa/drivers/dri/unichrome/Makefile
@@ -5,6 +5,8 @@ include $(TOP)/configs/current
 
 LIBNAME = unichrome_dri.so
 
+include ../Makefile.defines
+
 DRIVER_SOURCES = \
 	via_context.c \
 	via_fb.c \
@@ -25,5 +27,5 @@ C_SOURCES = \
 ASM_SOURCES = 
 
 
-include ../Makefile.template
+include ../Makefile.targets