103 files changed, 5891 insertions, 1460 deletions
diff --git a/src/mesa/drivers/common/driverfuncs.c b/src/mesa/drivers/common/driverfuncs.c
index 8ab129dd73d..a6174ee2f56 100644
--- a/src/mesa/drivers/common/driverfuncs.c
+++ b/src/mesa/drivers/common/driverfuncs.c
@@ -95,8 +95,6 @@ _mesa_init_driver_functions(struct dd_function_table *driver)
    driver->TexSubImage2D = _mesa_store_texsubimage2d;
    driver->TexSubImage3D = _mesa_store_texsubimage3d;
    driver->GetTexImage = _mesa_get_teximage;
-   driver->CopyTexImage1D = _mesa_meta_CopyTexImage1D;
-   driver->CopyTexImage2D = _mesa_meta_CopyTexImage2D;
    driver->CopyTexSubImage1D = _mesa_meta_CopyTexSubImage1D;
    driver->CopyTexSubImage2D = _mesa_meta_CopyTexSubImage2D;
    driver->CopyTexSubImage3D = _mesa_meta_CopyTexSubImage3D;
@@ -250,10 +248,10 @@ _mesa_init_driver_state(struct gl_context *ctx)
       GLuint i;
       for (i = 0; i < ctx->Const.MaxDrawBuffers; i++) {
          ctx->Driver.ColorMaskIndexed(ctx, i,
-                                      ctx->Color.ColorMask[0][RCOMP],
-                                      ctx->Color.ColorMask[0][GCOMP],
-                                      ctx->Color.ColorMask[0][BCOMP],
-                                      ctx->Color.ColorMask[0][ACOMP]);
+                                      ctx->Color.ColorMask[i][RCOMP],
+                                      ctx->Color.ColorMask[i][GCOMP],
+                                      ctx->Color.ColorMask[i][BCOMP],
+                                      ctx->Color.ColorMask[i][ACOMP]);
       }
    }
    else {
@@ -288,7 +286,10 @@ _mesa_init_driver_state(struct gl_context *ctx)
    ctx->Driver.Enable(ctx, GL_TEXTURE_CUBE_MAP, GL_FALSE);
 
    ctx->Driver.Fogfv(ctx, GL_FOG_COLOR, ctx->Fog.Color);
-   ctx->Driver.Fogfv(ctx, GL_FOG_MODE, 0);
+   {
+      GLfloat mode = (GLfloat) ctx->Fog.Mode;
+      ctx->Driver.Fogfv(ctx, GL_FOG_MODE, &mode);
+   }
    ctx->Driver.Fogfv(ctx, GL_FOG_DENSITY, &ctx->Fog.Density);
    ctx->Driver.Fogfv(ctx, GL_FOG_START, &ctx->Fog.Start);
    ctx->Driver.Fogfv(ctx, GL_FOG_END, &ctx->Fog.End);
diff --git a/src/mesa/drivers/common/meta.c b/src/mesa/drivers/common/meta.c
index 0e58aeca3f5..291d912121b 100644
--- a/src/mesa/drivers/common/meta.c
+++ b/src/mesa/drivers/common/meta.c
@@ -62,6 +62,7 @@
 #include "main/teximage.h"
 #include "main/texparam.h"
 #include "main/texstate.h"
+#include "main/uniforms.h"
 #include "main/varray.h"
 #include "main/viewport.h"
 #include "program/program.h"
@@ -72,63 +73,36 @@
 /** Return offset in bytes of the field within a vertex struct */
 #define OFFSET(FIELD) ((void *) offsetof(struct vertex, FIELD))
 
-
-/**
- * Flags passed to _mesa_meta_begin().
- */
-/*@{*/
-#define META_ALL              ~0x0
-#define META_ALPHA_TEST        0x1
-#define META_BLEND             0x2  /**< includes logicop */
-#define META_COLOR_MASK        0x4
-#define META_DEPTH_TEST        0x8
-#define META_FOG              0x10
-#define META_PIXEL_STORE      0x20
-#define META_PIXEL_TRANSFER   0x40
-#define META_RASTERIZATION    0x80
-#define META_SCISSOR         0x100
-#define META_SHADER          0x200
-#define META_STENCIL_TEST    0x400
-#define META_TRANSFORM       0x800 /**< modelview, projection, clip planes */
-#define META_TEXTURE        0x1000
-#define META_VERTEX         0x2000
-#define META_VIEWPORT       0x4000
-#define META_CLAMP_FRAGMENT_COLOR 0x8000
-#define META_CLAMP_VERTEX_COLOR 0x10000
-#define META_CONDITIONAL_RENDER 0x20000
-/*@}*/
-
-
 /**
  * State which we may save/restore across meta ops.
  * XXX this may be incomplete...
  */
 struct save_state
 {
-   GLbitfield SavedState;  /**< bitmask of META_* flags */
+   GLbitfield SavedState;  /**< bitmask of MESA_META_* flags */
 
-   /** META_ALPHA_TEST */
+   /** MESA_META_ALPHA_TEST */
    GLboolean AlphaEnabled;
    GLenum AlphaFunc;
    GLclampf AlphaRef;
 
-   /** META_BLEND */
+   /** MESA_META_BLEND */
    GLbitfield BlendEnabled;
    GLboolean ColorLogicOpEnabled;
 
-   /** META_COLOR_MASK */
+   /** MESA_META_COLOR_MASK */
    GLubyte ColorMask[MAX_DRAW_BUFFERS][4];
 
-   /** META_DEPTH_TEST */
+   /** MESA_META_DEPTH_TEST */
    struct gl_depthbuffer_attrib Depth;
 
-   /** META_FOG */
+   /** MESA_META_FOG */
    GLboolean Fog;
 
-   /** META_PIXEL_STORE */
+   /** MESA_META_PIXEL_STORE */
    struct gl_pixelstore_attrib Pack, Unpack;
 
-   /** META_PIXEL_TRANSFER */
+   /** MESA_META_PIXEL_TRANSFER */
    GLfloat RedBias, RedScale;
    GLfloat GreenBias, GreenScale;
    GLfloat BlueBias, BlueScale;
@@ -136,17 +110,17 @@ struct save_state
    GLfloat DepthBias, DepthScale;
    GLboolean MapColorFlag;
 
-   /** META_RASTERIZATION */
+   /** MESA_META_RASTERIZATION */
    GLenum FrontPolygonMode, BackPolygonMode;
    GLboolean PolygonOffset;
    GLboolean PolygonSmooth;
    GLboolean PolygonStipple;
    GLboolean PolygonCull;
 
-   /** META_SCISSOR */
+   /** MESA_META_SCISSOR */
    struct gl_scissor_attrib Scissor;
 
-   /** META_SHADER */
+   /** MESA_META_SHADER */
    GLboolean VertexProgramEnabled;
    struct gl_vertex_program *VertexProgram;
    GLboolean FragmentProgramEnabled;
@@ -156,17 +130,19 @@ struct save_state
    struct gl_shader_program *FragmentShader;
    struct gl_shader_program *ActiveShader;
 
-   /** META_STENCIL_TEST */
+   /** MESA_META_STENCIL_TEST */
    struct gl_stencil_attrib Stencil;
 
-   /** META_TRANSFORM */
+   /** MESA_META_TRANSFORM */
    GLenum MatrixMode;
    GLfloat ModelviewMatrix[16];
    GLfloat ProjectionMatrix[16];
    GLfloat TextureMatrix[16];
+
+   /** MESA_META_CLIP */
    GLbitfield ClipPlanesEnabled;
 
-   /** META_TEXTURE */
+   /** MESA_META_TEXTURE */
    GLuint ActiveUnit;
    GLuint ClientActiveUnit;
    /** for unit[0] only */
@@ -176,21 +152,21 @@ struct save_state
    GLbitfield TexGenEnabled[MAX_TEXTURE_UNITS];
    GLuint EnvMode;  /* unit[0] only */
 
-   /** META_VERTEX */
+   /** MESA_META_VERTEX */
    struct gl_array_object *ArrayObj;
    struct gl_buffer_object *ArrayBufferObj;
 
-   /** META_VIEWPORT */
+   /** MESA_META_VIEWPORT */
    GLint ViewportX, ViewportY, ViewportW, ViewportH;
    GLclampd DepthNear, DepthFar;
 
-   /** META_CLAMP_FRAGMENT_COLOR */
+   /** MESA_META_CLAMP_FRAGMENT_COLOR */
    GLenum ClampFragmentColor;
 
-   /** META_CLAMP_VERTEX_COLOR */
+   /** MESA_META_CLAMP_VERTEX_COLOR */
    GLenum ClampVertexColor;
 
-   /** META_CONDITIONAL_RENDER */
+   /** MESA_META_CONDITIONAL_RENDER */
    struct gl_query_object *CondRenderQuery;
    GLenum CondRenderMode;
 
@@ -235,6 +211,8 @@ struct clear_state
 {
    GLuint ArrayObj;
    GLuint VBO;
+   GLuint ShaderProg;
+   GLint ColorLocation;
 };
 
 
@@ -336,10 +314,10 @@ _mesa_meta_free(struct gl_context *ctx)
  * Enter meta state.  This is like a light-weight version of glPushAttrib
  * but it also resets most GL state back to default values.
  *
- * \param state  bitmask of META_* flags indicating which attribute groups
+ * \param state  bitmask of MESA_META_* flags indicating which attribute groups
  *               to save and reset to their defaults
  */
-static void
+void
 _mesa_meta_begin(struct gl_context *ctx, GLbitfield state)
 {
    struct save_state *save;
@@ -351,7 +329,7 @@ _mesa_meta_begin(struct gl_context *ctx, GLbitfield state)
    memset(save, 0, sizeof(*save));
    save->SavedState = state;
 
-   if (state & META_ALPHA_TEST) {
+   if (state & MESA_META_ALPHA_TEST) {
       save->AlphaEnabled = ctx->Color.AlphaEnabled;
       save->AlphaFunc = ctx->Color.AlphaFunc;
       save->AlphaRef = ctx->Color.AlphaRef;
@@ -359,7 +337,7 @@ _mesa_meta_begin(struct gl_context *ctx, GLbitfield state)
          _mesa_set_enable(ctx, GL_ALPHA_TEST, GL_FALSE);
    }
 
-   if (state & META_BLEND) {
+   if (state & MESA_META_BLEND) {
       save->BlendEnabled = ctx->Color.BlendEnabled;
       if (ctx->Color.BlendEnabled) {
          if (ctx->Extensions.EXT_draw_buffers2) {
@@ -377,7 +355,7 @@ _mesa_meta_begin(struct gl_context *ctx, GLbitfield state)
          _mesa_set_enable(ctx, GL_COLOR_LOGIC_OP, GL_FALSE);
    }
 
-   if (state & META_COLOR_MASK) {
+   if (state & MESA_META_COLOR_MASK) {
       memcpy(save->ColorMask, ctx->Color.ColorMask,
              sizeof(ctx->Color.ColorMask));
       if (!ctx->Color.ColorMask[0][0] ||
@@ -387,26 +365,26 @@ _mesa_meta_begin(struct gl_context *ctx, GLbitfield state)
          _mesa_ColorMask(GL_TRUE, GL_TRUE, GL_TRUE, GL_TRUE);
    }
 
-   if (state & META_DEPTH_TEST) {
+   if (state & MESA_META_DEPTH_TEST) {
       save->Depth = ctx->Depth; /* struct copy */
       if (ctx->Depth.Test)
          _mesa_set_enable(ctx, GL_DEPTH_TEST, GL_FALSE);
    }
 
-   if (state & META_FOG) {
+   if (state & MESA_META_FOG) {
       save->Fog = ctx->Fog.Enabled;
       if (ctx->Fog.Enabled)
          _mesa_set_enable(ctx, GL_FOG, GL_FALSE);
    }
 
-   if (state & META_PIXEL_STORE) {
+   if (state & MESA_META_PIXEL_STORE) {
       save->Pack = ctx->Pack;
       save->Unpack = ctx->Unpack;
       ctx->Pack = ctx->DefaultPacking;
       ctx->Unpack = ctx->DefaultPacking;
    }
 
-   if (state & META_PIXEL_TRANSFER) {
+   if (state & MESA_META_PIXEL_TRANSFER) {
       save->RedScale = ctx->Pixel.RedScale;
       save->RedBias = ctx->Pixel.RedBias;
       save->GreenScale = ctx->Pixel.GreenScale;
@@ -429,7 +407,7 @@ _mesa_meta_begin(struct gl_context *ctx, GLbitfield state)
       ctx->NewState |=_NEW_PIXEL;
    }
 
-   if (state & META_RASTERIZATION) {
+   if (state & MESA_META_RASTERIZATION) {
       save->FrontPolygonMode = ctx->Polygon.FrontMode;
       save->BackPolygonMode = ctx->Polygon.BackMode;
       save->PolygonOffset = ctx->Polygon.OffsetFill;
@@ -443,12 +421,12 @@ _mesa_meta_begin(struct gl_context *ctx, GLbitfield state)
       _mesa_set_enable(ctx, GL_CULL_FACE, GL_FALSE);
    }
 
-   if (state & META_SCISSOR) {
+   if (state & MESA_META_SCISSOR) {
       save->Scissor = ctx->Scissor; /* struct copy */
       _mesa_set_enable(ctx, GL_SCISSOR_TEST, GL_FALSE);
    }
 
-   if (state & META_SHADER) {
+   if (state & MESA_META_SHADER) {
       if (ctx->Extensions.ARB_vertex_program) {
          save->VertexProgramEnabled = ctx->VertexProgram.Enabled;
          _mesa_reference_vertprog(ctx, &save->VertexProgram,
@@ -477,14 +455,14 @@ _mesa_meta_begin(struct gl_context *ctx, GLbitfield state)
       }
    }
 
-   if (state & META_STENCIL_TEST) {
+   if (state & MESA_META_STENCIL_TEST) {
       save->Stencil = ctx->Stencil; /* struct copy */
       if (ctx->Stencil.Enabled)
          _mesa_set_enable(ctx, GL_STENCIL_TEST, GL_FALSE);
       /* NOTE: other stencil state not reset */
    }
 
-   if (state & META_TEXTURE) {
+   if (state & MESA_META_TEXTURE) {
       GLuint u, tgt;
 
       save->ActiveUnit = ctx->Texture.CurrentUnit;
@@ -523,7 +501,7 @@ _mesa_meta_begin(struct gl_context *ctx, GLbitfield state)
       _mesa_TexEnvi(GL_TEXTURE_ENV, GL_TEXTURE_ENV_MODE, GL_REPLACE);
    }
 
-   if (state & META_TRANSFORM) {
+   if (state & MESA_META_TRANSFORM) {
       GLuint activeTexture = ctx->Texture.CurrentUnit;
       memcpy(save->ModelviewMatrix, ctx->ModelviewMatrixStack.Top->m,
              16 * sizeof(GLfloat));
@@ -544,6 +522,9 @@ _mesa_meta_begin(struct gl_context *ctx, GLbitfield state)
       _mesa_Ortho(0.0, ctx->DrawBuffer->Width,
                   0.0, ctx->DrawBuffer->Height,
                   -1.0, 1.0);
+   }
+
+   if (state & MESA_META_CLIP) {
       save->ClipPlanesEnabled = ctx->Transform.ClipPlanesEnabled;
       if (ctx->Transform.ClipPlanesEnabled) {
          GLuint i;
@@ -553,7 +534,7 @@ _mesa_meta_begin(struct gl_context *ctx, GLbitfield state)
       }
    }
 
-   if (state & META_VERTEX) {
+   if (state & MESA_META_VERTEX) {
       /* save vertex array object state */
       _mesa_reference_array_object(ctx, &save->ArrayObj,
                                    ctx->Array.ArrayObj);
@@ -562,7 +543,7 @@ _mesa_meta_begin(struct gl_context *ctx, GLbitfield state)
       /* set some default state? */
    }
 
-   if (state & META_VIEWPORT) {
+   if (state & MESA_META_VIEWPORT) {
       /* save viewport state */
       save->ViewportX = ctx->Viewport.X;
       save->ViewportY = ctx->Viewport.Y;
@@ -583,7 +564,7 @@ _mesa_meta_begin(struct gl_context *ctx, GLbitfield state)
       _mesa_DepthRange(0.0, 1.0);
    }
 
-   if (state & META_CLAMP_FRAGMENT_COLOR) {
+   if (state & MESA_META_CLAMP_FRAGMENT_COLOR) {
       save->ClampFragmentColor = ctx->Color.ClampFragmentColor;
 
       /* Generally in here we want to do clamping according to whether
@@ -594,7 +575,7 @@ _mesa_meta_begin(struct gl_context *ctx, GLbitfield state)
 	 _mesa_ClampColorARB(GL_CLAMP_FRAGMENT_COLOR, GL_FALSE);
    }
 
-   if (state & META_CLAMP_VERTEX_COLOR) {
+   if (state & MESA_META_CLAMP_VERTEX_COLOR) {
       save->ClampVertexColor = ctx->Light.ClampVertexColor;
 
       /* Generally in here we never want vertex color clamping --
@@ -603,7 +584,7 @@ _mesa_meta_begin(struct gl_context *ctx, GLbitfield state)
       _mesa_ClampColorARB(GL_CLAMP_VERTEX_COLOR, GL_FALSE);
    }
 
-   if (state & META_CONDITIONAL_RENDER) {
+   if (state & MESA_META_CONDITIONAL_RENDER) {
       save->CondRenderQuery = ctx->Query.CondRenderQuery;
       save->CondRenderMode = ctx->Query.CondRenderMode;
 
@@ -623,19 +604,19 @@ _mesa_meta_begin(struct gl_context *ctx, GLbitfield state)
 /**
  * Leave meta state.  This is like a light-weight version of glPopAttrib().
  */
-static void
+void
 _mesa_meta_end(struct gl_context *ctx)
 {
    struct save_state *save = &ctx->Meta->Save[--ctx->Meta->SaveStackDepth];
    const GLbitfield state = save->SavedState;
 
-   if (state & META_ALPHA_TEST) {
+   if (state & MESA_META_ALPHA_TEST) {
       if (ctx->Color.AlphaEnabled != save->AlphaEnabled)
          _mesa_set_enable(ctx, GL_ALPHA_TEST, save->AlphaEnabled);
       _mesa_AlphaFunc(save->AlphaFunc, save->AlphaRef);
    }
 
-   if (state & META_BLEND) {
+   if (state & MESA_META_BLEND) {
       if (ctx->Color.BlendEnabled != save->BlendEnabled) {
          if (ctx->Extensions.EXT_draw_buffers2) {
             GLuint i;
@@ -651,7 +632,7 @@ _mesa_meta_end(struct gl_context *ctx)
          _mesa_set_enable(ctx, GL_COLOR_LOGIC_OP, save->ColorLogicOpEnabled);
    }
 
-   if (state & META_COLOR_MASK) {
+   if (state & MESA_META_COLOR_MASK) {
       GLuint i;
       for (i = 0; i < ctx->Const.MaxDrawBuffers; i++) {
          if (!TEST_EQ_4V(ctx->Color.ColorMask[i], save->ColorMask[i])) {
@@ -670,23 +651,23 @@ _mesa_meta_end(struct gl_context *ctx)
       }
    }
 
-   if (state & META_DEPTH_TEST) {
+   if (state & MESA_META_DEPTH_TEST) {
       if (ctx->Depth.Test != save->Depth.Test)
          _mesa_set_enable(ctx, GL_DEPTH_TEST, save->Depth.Test);
       _mesa_DepthFunc(save->Depth.Func);
       _mesa_DepthMask(save->Depth.Mask);
    }
 
-   if (state & META_FOG) {
+   if (state & MESA_META_FOG) {
       _mesa_set_enable(ctx, GL_FOG, save->Fog);
    }
 
-   if (state & META_PIXEL_STORE) {
+   if (state & MESA_META_PIXEL_STORE) {
       ctx->Pack = save->Pack;
       ctx->Unpack = save->Unpack;
    }
 
-   if (state & META_PIXEL_TRANSFER) {
+   if (state & MESA_META_PIXEL_TRANSFER) {
       ctx->Pixel.RedScale = save->RedScale;
       ctx->Pixel.RedBias = save->RedBias;
       ctx->Pixel.GreenScale = save->GreenScale;
@@ -700,7 +681,7 @@ _mesa_meta_end(struct gl_context *ctx)
       ctx->NewState |=_NEW_PIXEL;
    }
 
-   if (state & META_RASTERIZATION) {
+   if (state & MESA_META_RASTERIZATION) {
       _mesa_PolygonMode(GL_FRONT, save->FrontPolygonMode);
       _mesa_PolygonMode(GL_BACK, save->BackPolygonMode);
       _mesa_set_enable(ctx, GL_POLYGON_STIPPLE, save->PolygonStipple);
@@ -709,13 +690,13 @@ _mesa_meta_end(struct gl_context *ctx)
       _mesa_set_enable(ctx, GL_CULL_FACE, save->PolygonCull);
    }
 
-   if (state & META_SCISSOR) {
+   if (state & MESA_META_SCISSOR) {
       _mesa_set_enable(ctx, GL_SCISSOR_TEST, save->Scissor.Enabled);
       _mesa_Scissor(save->Scissor.X, save->Scissor.Y,
                     save->Scissor.Width, save->Scissor.Height);
    }
 
-   if (state & META_SHADER) {
+   if (state & MESA_META_SHADER) {
       if (ctx->Extensions.ARB_vertex_program) {
          _mesa_set_enable(ctx, GL_VERTEX_PROGRAM_ARB,
                           save->VertexProgramEnabled);
@@ -747,7 +728,7 @@ _mesa_meta_end(struct gl_context *ctx)
 				     save->ActiveShader);
    }
 
-   if (state & META_STENCIL_TEST) {
+   if (state & MESA_META_STENCIL_TEST) {
       const struct gl_stencil_attrib *stencil = &save->Stencil;
 
       _mesa_set_enable(ctx, GL_STENCIL_TEST, stencil->Enabled);
@@ -778,7 +759,7 @@ _mesa_meta_end(struct gl_context *ctx)
                               stencil->ZPassFunc[1]);
    }
 
-   if (state & META_TEXTURE) {
+   if (state & MESA_META_TEXTURE) {
       GLuint u, tgt;
 
       ASSERT(ctx->Texture.CurrentUnit == 0);
@@ -829,7 +810,7 @@ _mesa_meta_end(struct gl_context *ctx)
       _mesa_ClientActiveTextureARB(GL_TEXTURE0 + save->ClientActiveUnit);
    }
 
-   if (state & META_TRANSFORM) {
+   if (state & MESA_META_TRANSFORM) {
       GLuint activeTexture = ctx->Texture.CurrentUnit;
       _mesa_ActiveTextureARB(GL_TEXTURE0);
       _mesa_MatrixMode(GL_TEXTURE);
@@ -843,7 +824,9 @@ _mesa_meta_end(struct gl_context *ctx)
       _mesa_LoadMatrixf(save->ProjectionMatrix);
 
       _mesa_MatrixMode(save->MatrixMode);
+   }
 
+   if (state & MESA_META_CLIP) {
       if (save->ClipPlanesEnabled) {
          GLuint i;
          for (i = 0; i < ctx->Const.MaxClipPlanes; i++) {
@@ -854,7 +837,7 @@ _mesa_meta_end(struct gl_context *ctx)
       }
    }
 
-   if (state & META_VERTEX) {
+   if (state & MESA_META_VERTEX) {
       /* restore vertex buffer object */
       _mesa_BindBufferARB(GL_ARRAY_BUFFER_ARB, save->ArrayBufferObj->Name);
       _mesa_reference_buffer_object(ctx, &save->ArrayBufferObj, NULL);
@@ -864,7 +847,7 @@ _mesa_meta_end(struct gl_context *ctx)
       _mesa_reference_array_object(ctx, &save->ArrayObj, NULL);
    }
 
-   if (state & META_VIEWPORT) {
+   if (state & MESA_META_VIEWPORT) {
       if (save->ViewportX != ctx->Viewport.X ||
           save->ViewportY != ctx->Viewport.Y ||
           save->ViewportW != ctx->Viewport.Width ||
@@ -875,15 +858,15 @@ _mesa_meta_end(struct gl_context *ctx)
       _mesa_DepthRange(save->DepthNear, save->DepthFar);
    }
 
-   if (state & META_CLAMP_FRAGMENT_COLOR) {
+   if (state & MESA_META_CLAMP_FRAGMENT_COLOR) {
       _mesa_ClampColorARB(GL_CLAMP_FRAGMENT_COLOR, save->ClampFragmentColor);
    }
 
-   if (state & META_CLAMP_VERTEX_COLOR) {
+   if (state & MESA_META_CLAMP_VERTEX_COLOR) {
       _mesa_ClampColorARB(GL_CLAMP_VERTEX_COLOR, save->ClampVertexColor);
    }
 
-   if (state & META_CONDITIONAL_RENDER) {
+   if (state & MESA_META_CONDITIONAL_RENDER) {
       if (save->CondRenderQuery)
 	 _mesa_BeginConditionalRender(save->CondRenderQuery->Id,
 				      save->CondRenderMode);
@@ -1349,7 +1332,7 @@ _mesa_meta_BlitFramebuffer(struct gl_context *ctx,
    }
 
    /* only scissor effects blit so save/clear all other relevant state */
-   _mesa_meta_begin(ctx, ~META_SCISSOR);
+   _mesa_meta_begin(ctx, ~MESA_META_SCISSOR);
 
    if (blit->ArrayObj == 0) {
       /* one-time setup */
@@ -1478,15 +1461,15 @@ _mesa_meta_Clear(struct gl_context *ctx, GLbitfield buffers)
    };
    struct vertex verts[4];
    /* save all state but scissor, pixel pack/unpack */
-   GLbitfield metaSave = (META_ALL -
-			  META_SCISSOR -
-			  META_PIXEL_STORE -
-			  META_CONDITIONAL_RENDER);
+   GLbitfield metaSave = (MESA_META_ALL -
+			  MESA_META_SCISSOR -
+			  MESA_META_PIXEL_STORE -
+			  MESA_META_CONDITIONAL_RENDER);
    const GLuint stencilMax = (1 << ctx->DrawBuffer->Visual.stencilBits) - 1;
 
    if (buffers & BUFFER_BITS_COLOR) {
       /* if clearing color buffers, don't save/restore colormask */
-      metaSave -= META_COLOR_MASK;
+      metaSave -= MESA_META_COLOR_MASK;
    }
 
    _mesa_meta_begin(ctx, metaSave);
@@ -1521,7 +1504,7 @@ _mesa_meta_Clear(struct gl_context *ctx, GLbitfield buffers)
       _mesa_ClampColorARB(GL_CLAMP_FRAGMENT_COLOR, GL_FALSE);
    }
    else {
-      ASSERT(metaSave & META_COLOR_MASK);
+      ASSERT(metaSave & MESA_META_COLOR_MASK);
       _mesa_ColorMask(GL_FALSE, GL_FALSE, GL_FALSE, GL_FALSE);
    }
 
@@ -1589,10 +1572,166 @@ _mesa_meta_Clear(struct gl_context *ctx, GLbitfield buffers)
    _mesa_meta_end(ctx);
 }
 
+static void
+meta_glsl_clear_init(struct gl_context *ctx, struct clear_state *clear)
+{
+   const char *vs_source =
+      "attribute vec4 position;\n"
+      "void main()\n"
+      "{\n"
+      "   gl_Position = position;\n"
+      "}\n";
+   const char *fs_source =
+      "uniform vec4 color;\n"
+      "void main()\n"
+      "{\n"
+      "   gl_FragColor = color;\n"
+      "}\n";
+   GLuint vs, fs;
+
+   if (clear->ArrayObj != 0)
+      return;
+
+   /* create vertex array object */
+   _mesa_GenVertexArrays(1, &clear->ArrayObj);
+   _mesa_BindVertexArray(clear->ArrayObj);
+
+   /* create vertex array buffer */
+   _mesa_GenBuffersARB(1, &clear->VBO);
+   _mesa_BindBufferARB(GL_ARRAY_BUFFER_ARB, clear->VBO);
+
+   /* setup vertex arrays */
+   _mesa_VertexAttribPointerARB(0, 3, GL_FLOAT, GL_FALSE, 0, (void *)0);
+   _mesa_EnableVertexAttribArrayARB(0);
+
+   vs = _mesa_CreateShaderObjectARB(GL_VERTEX_SHADER);
+   _mesa_ShaderSourceARB(vs, 1, &vs_source, NULL);
+   _mesa_CompileShaderARB(vs);
+
+   fs = _mesa_CreateShaderObjectARB(GL_FRAGMENT_SHADER);
+   _mesa_ShaderSourceARB(fs, 1, &fs_source, NULL);
+   _mesa_CompileShaderARB(fs);
+
+   clear->ShaderProg = _mesa_CreateProgramObjectARB();
+   _mesa_AttachShader(clear->ShaderProg, fs);
+   _mesa_AttachShader(clear->ShaderProg, vs);
+   _mesa_BindAttribLocationARB(clear->ShaderProg, 0, "position");
+   _mesa_LinkProgramARB(clear->ShaderProg);
+
+   clear->ColorLocation = _mesa_GetUniformLocationARB(clear->ShaderProg,
+						      "color");
+}
+
+/**
+ * Meta implementation of ctx->Driver.Clear() in terms of polygon rendering.
+ */
+void
+_mesa_meta_glsl_Clear(struct gl_context *ctx, GLbitfield buffers)
+{
+   struct clear_state *clear = &ctx->Meta->Clear;
+   GLbitfield metaSave;
+   const GLuint stencilMax = (1 << ctx->DrawBuffer->Visual.stencilBits) - 1;
+   struct gl_framebuffer *fb = ctx->DrawBuffer;
+   const float x0 = ((float)fb->_Xmin / fb->Width)  * 2.0f - 1.0f;
+   const float y0 = ((float)fb->_Ymin / fb->Height) * 2.0f - 1.0f;
+   const float x1 = ((float)fb->_Xmax / fb->Width)  * 2.0f - 1.0f;
+   const float y1 = ((float)fb->_Ymax / fb->Height) * 2.0f - 1.0f;
+   const float z = -invert_z(ctx->Depth.Clear);
+   struct vertex {
+      GLfloat x, y, z;
+   } verts[4];
+
+   metaSave = (MESA_META_ALPHA_TEST |
+	       MESA_META_BLEND |
+	       MESA_META_DEPTH_TEST |
+	       MESA_META_RASTERIZATION |
+	       MESA_META_SHADER |
+	       MESA_META_STENCIL_TEST |
+	       MESA_META_VERTEX |
+	       MESA_META_VIEWPORT |
+	       MESA_META_CLIP |
+	       MESA_META_CLAMP_FRAGMENT_COLOR);
+
+   if (!(buffers & BUFFER_BITS_COLOR)) {
+      /* We'll use colormask to disable color writes.  Otherwise,
+       * respect color mask
+       */
+      metaSave |= MESA_META_COLOR_MASK;
+   }
+
+   _mesa_meta_begin(ctx, metaSave);
+
+   meta_glsl_clear_init(ctx, clear);
+
+   _mesa_UseProgramObjectARB(clear->ShaderProg);
+   _mesa_Uniform4fvARB(clear->ColorLocation, 1,
+		       ctx->Color.ClearColorUnclamped);
+
+   _mesa_BindVertexArray(clear->ArrayObj);
+   _mesa_BindBufferARB(GL_ARRAY_BUFFER_ARB, clear->VBO);
+
+   /* GL_COLOR_BUFFER_BIT */
+   if (buffers & BUFFER_BITS_COLOR) {
+      /* leave colormask, glDrawBuffer state as-is */
+
+      /* Clears never have the color clamped. */
+      _mesa_ClampColorARB(GL_CLAMP_FRAGMENT_COLOR, GL_FALSE);
+   }
+   else {
+      ASSERT(metaSave & MESA_META_COLOR_MASK);
+      _mesa_ColorMask(GL_FALSE, GL_FALSE, GL_FALSE, GL_FALSE);
+   }
+
+   /* GL_DEPTH_BUFFER_BIT */
+   if (buffers & BUFFER_BIT_DEPTH) {
+      _mesa_set_enable(ctx, GL_DEPTH_TEST, GL_TRUE);
+      _mesa_DepthFunc(GL_ALWAYS);
+      _mesa_DepthMask(GL_TRUE);
+   }
+   else {
+      assert(!ctx->Depth.Test);
+   }
+
+   /* GL_STENCIL_BUFFER_BIT */
+   if (buffers & BUFFER_BIT_STENCIL) {
+      _mesa_set_enable(ctx, GL_STENCIL_TEST, GL_TRUE);
+      _mesa_StencilOpSeparate(GL_FRONT_AND_BACK,
+                              GL_REPLACE, GL_REPLACE, GL_REPLACE);
+      _mesa_StencilFuncSeparate(GL_FRONT_AND_BACK, GL_ALWAYS,
+                                ctx->Stencil.Clear & stencilMax,
+                                ctx->Stencil.WriteMask[0]);
+   }
+   else {
+      assert(!ctx->Stencil.Enabled);
+   }
+
+   /* vertex positions */
+   verts[0].x = x0;
+   verts[0].y = y0;
+   verts[0].z = z;
+   verts[1].x = x1;
+   verts[1].y = y0;
+   verts[1].z = z;
+   verts[2].x = x1;
+   verts[2].y = y1;
+   verts[2].z = z;
+   verts[3].x = x0;
+   verts[3].y = y1;
+   verts[3].z = z;
+
+   /* upload new vertex data */
+   _mesa_BufferDataARB(GL_ARRAY_BUFFER_ARB, sizeof(verts), verts,
+		       GL_DYNAMIC_DRAW_ARB);
+
+   /* draw quad */
+   _mesa_DrawArrays(GL_TRIANGLE_FAN, 0, 4);
+
+   _mesa_meta_end(ctx);
+}
 
 /**
  * Meta implementation of ctx->Driver.CopyPixels() in terms
- * of texture mapping and polygon rendering.
+ * of texture mapping and polygon rendering and GLSL shaders.
  */
 void
 _mesa_meta_CopyPixels(struct gl_context *ctx, GLint srcX, GLint srcY,
@@ -1621,12 +1760,13 @@ _mesa_meta_CopyPixels(struct gl_context *ctx, GLint srcX, GLint srcY,
    /* Most GL state applies to glCopyPixels, but a there's a few things
     * we need to override:
     */
-   _mesa_meta_begin(ctx, (META_RASTERIZATION |
-                          META_SHADER |
-                          META_TEXTURE |
-                          META_TRANSFORM |
-                          META_VERTEX |
-                          META_VIEWPORT));
+   _mesa_meta_begin(ctx, (MESA_META_RASTERIZATION |
+                          MESA_META_SHADER |
+                          MESA_META_TEXTURE |
+                          MESA_META_TRANSFORM |
+                          MESA_META_CLIP |
+                          MESA_META_VERTEX |
+                          MESA_META_VIEWPORT));
 
    if (copypix->ArrayObj == 0) {
       /* one-time setup */
@@ -1901,10 +2041,10 @@ _mesa_meta_DrawPixels(struct gl_context *ctx,
           * in [0,1].
           */
          texIntFormat = GL_ALPHA;
-         metaExtraSave = (META_COLOR_MASK |
-                          META_DEPTH_TEST |
-                          META_SHADER |
-                          META_STENCIL_TEST);
+         metaExtraSave = (MESA_META_COLOR_MASK |
+                          MESA_META_DEPTH_TEST |
+                          MESA_META_SHADER |
+                          MESA_META_STENCIL_TEST);
       }
       else {
          fallback = GL_TRUE;
@@ -1914,7 +2054,7 @@ _mesa_meta_DrawPixels(struct gl_context *ctx,
       if (ctx->Extensions.ARB_depth_texture &&
           ctx->Extensions.ARB_fragment_program) {
          texIntFormat = GL_DEPTH_COMPONENT;
-         metaExtraSave = (META_SHADER);
+         metaExtraSave = (MESA_META_SHADER);
       }
       else {
          fallback = GL_TRUE;
@@ -1942,13 +2082,14 @@ _mesa_meta_DrawPixels(struct gl_context *ctx,
    /* Most GL state applies to glDrawPixels (like blending, stencil, etc),
     * but a there's a few things we need to override:
     */
-   _mesa_meta_begin(ctx, (META_RASTERIZATION |
-                          META_SHADER |
-                          META_TEXTURE |
-                          META_TRANSFORM |
-                          META_VERTEX |
-                          META_VIEWPORT |
-			  META_CLAMP_FRAGMENT_COLOR |
+   _mesa_meta_begin(ctx, (MESA_META_RASTERIZATION |
+                          MESA_META_SHADER |
+                          MESA_META_TEXTURE |
+                          MESA_META_TRANSFORM |
+                          MESA_META_CLIP |
+                          MESA_META_VERTEX |
+                          MESA_META_VIEWPORT |
+			  MESA_META_CLAMP_FRAGMENT_COLOR |
                           metaExtraSave));
 
    newTex = alloc_texture(tex, width, height, texIntFormat);
@@ -2149,14 +2290,15 @@ _mesa_meta_Bitmap(struct gl_context *ctx,
    /* Most GL state applies to glBitmap (like blending, stencil, etc),
     * but a there's a few things we need to override:
     */
-   _mesa_meta_begin(ctx, (META_ALPHA_TEST |
-                          META_PIXEL_STORE |
-                          META_RASTERIZATION |
-                          META_SHADER |
-                          META_TEXTURE |
-                          META_TRANSFORM |
-                          META_VERTEX |
-                          META_VIEWPORT));
+   _mesa_meta_begin(ctx, (MESA_META_ALPHA_TEST |
+                          MESA_META_PIXEL_STORE |
+                          MESA_META_RASTERIZATION |
+                          MESA_META_SHADER |
+                          MESA_META_TEXTURE |
+                          MESA_META_TRANSFORM |
+                          MESA_META_CLIP |
+                          MESA_META_VERTEX |
+                          MESA_META_VIEWPORT));
 
    if (bitmap->ArrayObj == 0) {
       /* one-time setup */
@@ -2282,7 +2424,9 @@ _mesa_meta_check_generate_mipmap_fallback(struct gl_context *ctx, GLenum target,
 
    /* check for fallbacks */
    if (!ctx->Extensions.EXT_framebuffer_object ||
-       target == GL_TEXTURE_3D) {
+       target == GL_TEXTURE_3D ||
+       target == GL_TEXTURE_1D_ARRAY ||
+       target == GL_TEXTURE_2D_ARRAY) {
       return GL_TRUE;
    }
 
@@ -2334,7 +2478,8 @@ _mesa_meta_check_generate_mipmap_fallback(struct gl_context *ctx, GLenum target,
 
 /**
  * Called via ctx->Driver.GenerateMipmap()
- * Note: texture borders and 3D texture support not yet complete.
+ * Note: We don't yet support 3D textures, 1D/2D array textures or texture
+ * borders.
  */
 void
 _mesa_meta_GenerateMipmap(struct gl_context *ctx, GLenum target,
@@ -2374,7 +2519,7 @@ _mesa_meta_GenerateMipmap(struct gl_context *ctx, GLenum target,
       faceTarget = target;
    }
 
-   _mesa_meta_begin(ctx, META_ALL);
+   _mesa_meta_begin(ctx, MESA_META_ALL);
 
    if (original_active_unit != 0)
       _mesa_BindTexture(target, texObj->Name);
@@ -2678,119 +2823,6 @@ get_temp_image_type(struct gl_context *ctx, GLenum baseFormat)
 
 
 /**
- * Helper for _mesa_meta_CopyTexImage1/2D() functions.
- * Have to be careful with locking and meta state for pixel transfer.
- */
-static void
-copy_tex_image(struct gl_context *ctx, GLuint dims, GLenum target, GLint level,
-               GLenum internalFormat, GLint x, GLint y,
-               GLsizei width, GLsizei height, GLint border)
-{
-   struct gl_texture_object *texObj;
-   struct gl_texture_image *texImage;
-   GLenum format, type;
-   GLint bpp;
-   void *buf;
-   struct gl_renderbuffer *read_rb = ctx->ReadBuffer->_ColorReadBuffer;
-
-   texObj = _mesa_get_current_tex_object(ctx, target);
-   texImage = _mesa_get_tex_image(ctx, texObj, target, level);
-
-   /* Choose format/type for temporary image buffer */
-   format = _mesa_base_tex_format(ctx, internalFormat);
-
-   if (format == GL_LUMINANCE &&
-       _mesa_get_format_base_format(read_rb->Format) != GL_LUMINANCE) {
-      /* The glReadPixels() path will convert RGB to luminance by
-       * summing R+G+B.  glCopyTexImage() is supposed to behave as
-       * glCopyPixels, which doesn't do that change, and instead
-       * leaves it up to glTexImage which converts RGB to luminance by
-       * just taking the R channel.  To avoid glReadPixels() trashing
-       * our data, use RGBA for our temporary image.
-       */
-      format = GL_RGBA;
-   }
-
-   type = get_temp_image_type(ctx, format);
-   bpp = _mesa_bytes_per_pixel(format, type);
-   if (bpp <= 0) {
-      _mesa_problem(ctx, "Bad bpp in meta copy_tex_image()");
-      return;
-   }
-
-   /*
-    * Alloc image buffer (XXX could use a PBO)
-    */
-   buf = malloc(width * height * bpp);
-   if (!buf) {
-      _mesa_error(ctx, GL_OUT_OF_MEMORY, "glCopyTexImage%uD", dims);
-      return;
-   }
-
-   _mesa_unlock_texture(ctx, texObj); /* need to unlock first */
-
-   /*
-    * Read image from framebuffer (disable pixel transfer ops)
-    */
-   _mesa_meta_begin(ctx, META_PIXEL_STORE | META_PIXEL_TRANSFER);
-   ctx->Driver.ReadPixels(ctx, x, y, width, height,
-			  format, type, &ctx->Pack, buf);
-   _mesa_meta_end(ctx);
-
-   if (texImage->Data) {
-      ctx->Driver.FreeTexImageData(ctx, texImage);
-   }
-
-   /* The texture's format was already chosen in _mesa_CopyTexImage() */
-   ASSERT(texImage->TexFormat != MESA_FORMAT_NONE);
-
-   /*
-    * Store texture data (with pixel transfer ops)
-    */
-   _mesa_meta_begin(ctx, META_PIXEL_STORE);
-
-   _mesa_update_state(ctx); /* to update pixel transfer state */
-
-   if (target == GL_TEXTURE_1D) {
-      ctx->Driver.TexImage1D(ctx, target, level, internalFormat,
-                             width, border, format, type,
-                             buf, &ctx->Unpack, texObj, texImage);
-   }
-   else {
-      ctx->Driver.TexImage2D(ctx, target, level, internalFormat,
-                             width, height, border, format, type,
-                             buf, &ctx->Unpack, texObj, texImage);
-   }
-   _mesa_meta_end(ctx);
-
-   _mesa_lock_texture(ctx, texObj); /* re-lock */
-
-   free(buf);
-}
-
-
-void
-_mesa_meta_CopyTexImage1D(struct gl_context *ctx, GLenum target, GLint level,
-                          GLenum internalFormat, GLint x, GLint y,
-                          GLsizei width, GLint border)
-{
-   copy_tex_image(ctx, 1, target, level, internalFormat, x, y,
-                  width, 1, border);
-}
-
-
-void
-_mesa_meta_CopyTexImage2D(struct gl_context *ctx, GLenum target, GLint level,
-                          GLenum internalFormat, GLint x, GLint y,
-                          GLsizei width, GLsizei height, GLint border)
-{
-   copy_tex_image(ctx, 2, target, level, internalFormat, x, y,
-                  width, height, border);
-}
-
-
-
-/**
  * Helper for _mesa_meta_CopyTexSubImage1/2/3D() functions.
  * Have to be careful with locking and meta state for pixel transfer.
  */
@@ -2812,6 +2844,16 @@ copy_tex_sub_image(struct gl_context *ctx,
 
    /* Choose format/type for temporary image buffer */
    format = _mesa_get_format_base_format(texImage->TexFormat);
+   if (format == GL_LUMINANCE ||
+       format == GL_LUMINANCE_ALPHA ||
+       format == GL_INTENSITY) {
+      /* We don't want to use GL_LUMINANCE, GL_INTENSITY, etc. for the
+       * temp image buffer because glReadPixels will do L=R+G+B which is
+       * not what we want (should be L=R).
+       */
+      format = GL_RGBA;
+   }
+
    type = get_temp_image_type(ctx, format);
    bpp = _mesa_bytes_per_pixel(format, type);
    if (bpp <= 0) {
@@ -2833,7 +2875,7 @@ copy_tex_sub_image(struct gl_context *ctx,
    /*
     * Read image from framebuffer (disable pixel transfer ops)
     */
-   _mesa_meta_begin(ctx, META_PIXEL_STORE | META_PIXEL_TRANSFER);
+   _mesa_meta_begin(ctx, MESA_META_PIXEL_STORE | MESA_META_PIXEL_TRANSFER);
    ctx->Driver.ReadPixels(ctx, x, y, width, height,
 			  format, type, &ctx->Pack, buf);
    _mesa_meta_end(ctx);
@@ -2843,7 +2885,7 @@ copy_tex_sub_image(struct gl_context *ctx,
    /*
     * Store texture data (with pixel transfer ops)
     */
-   _mesa_meta_begin(ctx, META_PIXEL_STORE);
+   _mesa_meta_begin(ctx, MESA_META_PIXEL_STORE);
    if (target == GL_TEXTURE_1D) {
       ctx->Driver.TexSubImage1D(ctx, target, level, xoffset,
                                 width, format, type, buf,
@@ -2915,7 +2957,7 @@ _mesa_meta_CopyColorTable(struct gl_context *ctx,
    /*
     * Read image from framebuffer (disable pixel transfer ops)
     */
-   _mesa_meta_begin(ctx, META_PIXEL_STORE | META_PIXEL_TRANSFER);
+   _mesa_meta_begin(ctx, MESA_META_PIXEL_STORE | MESA_META_PIXEL_TRANSFER);
    ctx->Driver.ReadPixels(ctx, x, y, width, 1,
                           GL_RGBA, GL_FLOAT, &ctx->Pack, buf);
 
@@ -2942,7 +2984,7 @@ _mesa_meta_CopyColorSubTable(struct gl_context *ctx,GLenum target, GLsizei start
    /*
     * Read image from framebuffer (disable pixel transfer ops)
     */
-   _mesa_meta_begin(ctx, META_PIXEL_STORE | META_PIXEL_TRANSFER);
+   _mesa_meta_begin(ctx, MESA_META_PIXEL_STORE | MESA_META_PIXEL_TRANSFER);
    ctx->Driver.ReadPixels(ctx, x, y, width, 1,
                           GL_RGBA, GL_FLOAT, &ctx->Pack, buf);
 
diff --git a/src/mesa/drivers/common/meta.h b/src/mesa/drivers/common/meta.h
index b0797d3d91a..ac20e370eb8 100644
--- a/src/mesa/drivers/common/meta.h
+++ b/src/mesa/drivers/common/meta.h
@@ -26,6 +26,33 @@
 #ifndef META_H
 #define META_H
 
+/**
+ * \name Flags for meta operations
+ * \{
+ *
+ * These flags are passed to _mesa_meta_begin().
+ */
+#define MESA_META_ALL                      ~0x0
+#define MESA_META_ALPHA_TEST                0x1
+#define MESA_META_BLEND                     0x2  /**< includes logicop */
+#define MESA_META_COLOR_MASK                0x4
+#define MESA_META_DEPTH_TEST                0x8
+#define MESA_META_FOG                      0x10
+#define MESA_META_PIXEL_STORE              0x20
+#define MESA_META_PIXEL_TRANSFER           0x40
+#define MESA_META_RASTERIZATION            0x80
+#define MESA_META_SCISSOR                 0x100
+#define MESA_META_SHADER                  0x200
+#define MESA_META_STENCIL_TEST            0x400
+#define MESA_META_TRANSFORM               0x800 /**< modelview/projection matrix state */
+#define MESA_META_TEXTURE                0x1000
+#define MESA_META_VERTEX                 0x2000
+#define MESA_META_VIEWPORT               0x4000
+#define MESA_META_CLAMP_FRAGMENT_COLOR   0x8000
+#define MESA_META_CLAMP_VERTEX_COLOR    0x10000
+#define MESA_META_CONDITIONAL_RENDER    0x20000
+#define MESA_META_CLIP                  0x40000
+/**\}*/
 
 extern void
 _mesa_meta_init(struct gl_context *ctx);
@@ -34,6 +61,12 @@ extern void
 _mesa_meta_free(struct gl_context *ctx);
 
 extern void
+_mesa_meta_begin(struct gl_context *ctx, GLbitfield state);
+
+extern void
+_mesa_meta_end(struct gl_context *ctx);
+
+extern void
 _mesa_meta_BlitFramebuffer(struct gl_context *ctx,
                            GLint srcX0, GLint srcY0, GLint srcX1, GLint srcY1,
                            GLint dstX0, GLint dstY0, GLint dstX1, GLint dstY1,
@@ -43,6 +76,9 @@ extern void
 _mesa_meta_Clear(struct gl_context *ctx, GLbitfield buffers);
 
 extern void
+_mesa_meta_glsl_Clear(struct gl_context *ctx, GLbitfield buffers);
+
+extern void
 _mesa_meta_CopyPixels(struct gl_context *ctx, GLint srcx, GLint srcy,
                       GLsizei width, GLsizei height,
                       GLint dstx, GLint dsty, GLenum type);
@@ -69,16 +105,6 @@ _mesa_meta_GenerateMipmap(struct gl_context *ctx, GLenum target,
                           struct gl_texture_object *texObj);
 
 extern void
-_mesa_meta_CopyTexImage1D(struct gl_context *ctx, GLenum target, GLint level,
-                          GLenum internalFormat, GLint x, GLint y,
-                          GLsizei width, GLint border);
-
-extern void
-_mesa_meta_CopyTexImage2D(struct gl_context *ctx, GLenum target, GLint level,
-                          GLenum internalFormat, GLint x, GLint y,
-                          GLsizei width, GLsizei height, GLint border);
-
-extern void
 _mesa_meta_CopyTexSubImage1D(struct gl_context *ctx, GLenum target, GLint level,
                              GLint xoffset,
                              GLint x, GLint y, GLsizei width);
diff --git a/src/mesa/drivers/dri/common/xmlconfig.c b/src/mesa/drivers/dri/common/xmlconfig.c
index 77967ac2a43..12dd31bb162 100644
--- a/src/mesa/drivers/dri/common/xmlconfig.c
+++ b/src/mesa/drivers/dri/common/xmlconfig.c
@@ -567,7 +567,7 @@ static void parseOptInfoAttr (struct OptInfoData *data, const XML_Char **attr) {
     } else
 	defaultVal = attrVal[OA_DEFAULT];
     if (!parseValue (&cache->values[opt], cache->info[opt].type, defaultVal))
-	XML_FATAL ("illegal default value: %s.", defaultVal);
+	XML_FATAL ("illegal default value for %s: %s.", cache->info[opt].name, defaultVal);
 
     if (attrVal[OA_VALID]) {
 	if (cache->info[opt].type == DRI_BOOL)
diff --git a/src/mesa/drivers/dri/common/xmlpool.h b/src/mesa/drivers/dri/common/xmlpool.h
index 587517ea10a..ffea430024d 100644
--- a/src/mesa/drivers/dri/common/xmlpool.h
+++ b/src/mesa/drivers/dri/common/xmlpool.h
@@ -60,7 +60,7 @@
 #define DRI_CONF_OPT_BEGIN(name,type,def) \
 "<option name=\""#name"\" type=\""#type"\" default=\""#def"\">\n"
 
-/** \brief Begin an option definition with qouted default value */
+/** \brief Begin an option definition with quoted default value */
 #define DRI_CONF_OPT_BEGIN_Q(name,type,def) \
 "<option name=\""#name"\" type=\""#type"\" default="#def">\n"
 
diff --git a/src/mesa/drivers/dri/common/xmlpool/options.h b/src/mesa/drivers/dri/common/xmlpool/options.h
index d76595578c7..1e584ba086a 100644
--- a/src/mesa/drivers/dri/common/xmlpool/options.h
+++ b/src/mesa/drivers/dri/common/xmlpool/options.h
@@ -425,6 +425,66 @@ DRI_CONF_OPT_BEGIN(hyperz,bool,def) \
         DRI_CONF_DESC(sv,"Använd HyperZ för att maximera prestandan") \
 DRI_CONF_OPT_END
 
+#define DRI_CONF_PP_CELSHADE(def) \
+DRI_CONF_OPT_BEGIN_V(pp_celshade,enum,def,"0:1") \
+        DRI_CONF_DESC(en,"A post-processing filter to cel-shade the output") \
+        DRI_CONF_DESC(de,"A post-processing filter to cel-shade the output") \
+        DRI_CONF_DESC(es,"A post-processing filter to cel-shade the output") \
+        DRI_CONF_DESC(nl,"A post-processing filter to cel-shade the output") \
+        DRI_CONF_DESC(fr,"A post-processing filter to cel-shade the output") \
+        DRI_CONF_DESC(sv,"A post-processing filter to cel-shade the output") \
+DRI_CONF_OPT_END
+
+#define DRI_CONF_PP_NORED(def) \
+DRI_CONF_OPT_BEGIN_V(pp_nored,enum,def,"0:1") \
+        DRI_CONF_DESC(en,"A post-processing filter to remove the red channel") \
+        DRI_CONF_DESC(de,"A post-processing filter to remove the red channel") \
+        DRI_CONF_DESC(es,"A post-processing filter to remove the red channel") \
+        DRI_CONF_DESC(nl,"A post-processing filter to remove the red channel") \
+        DRI_CONF_DESC(fr,"A post-processing filter to remove the red channel") \
+        DRI_CONF_DESC(sv,"A post-processing filter to remove the red channel") \
+DRI_CONF_OPT_END
+
+#define DRI_CONF_PP_NOGREEN(def) \
+DRI_CONF_OPT_BEGIN_V(pp_nogreen,enum,def,"0:1") \
+        DRI_CONF_DESC(en,"A post-processing filter to remove the green channel") \
+        DRI_CONF_DESC(de,"A post-processing filter to remove the green channel") \
+        DRI_CONF_DESC(es,"A post-processing filter to remove the green channel") \
+        DRI_CONF_DESC(nl,"A post-processing filter to remove the green channel") \
+        DRI_CONF_DESC(fr,"A post-processing filter to remove the green channel") \
+        DRI_CONF_DESC(sv,"A post-processing filter to remove the green channel") \
+DRI_CONF_OPT_END
+
+#define DRI_CONF_PP_NOBLUE(def) \
+DRI_CONF_OPT_BEGIN_V(pp_noblue,enum,def,"0:1") \
+        DRI_CONF_DESC(en,"A post-processing filter to remove the blue channel") \
+        DRI_CONF_DESC(de,"A post-processing filter to remove the blue channel") \
+        DRI_CONF_DESC(es,"A post-processing filter to remove the blue channel") \
+        DRI_CONF_DESC(nl,"A post-processing filter to remove the blue channel") \
+        DRI_CONF_DESC(fr,"A post-processing filter to remove the blue channel") \
+        DRI_CONF_DESC(sv,"A post-processing filter to remove the blue channel") \
+DRI_CONF_OPT_END
+
+#define DRI_CONF_PP_JIMENEZMLAA(def,min,max) \
+DRI_CONF_OPT_BEGIN_V(pp_jimenezmlaa,int,def, # min ":" # max ) \
+        DRI_CONF_DESC(en,"Morphological anti-aliasing based on Jimenez\' MLAA. 0 to disable, 8 for default quality") \
+        DRI_CONF_DESC(de,"Morphological anti-aliasing based on Jimenez\' MLAA. 0 to disable, 8 for default quality") \
+        DRI_CONF_DESC(es,"Morphological anti-aliasing based on Jimenez\' MLAA. 0 to disable, 8 for default quality") \
+        DRI_CONF_DESC(nl,"Morphological anti-aliasing based on Jimenez\' MLAA. 0 to disable, 8 for default quality") \
+        DRI_CONF_DESC(fr,"Morphological anti-aliasing based on Jimenez\' MLAA. 0 to disable, 8 for default quality") \
+        DRI_CONF_DESC(sv,"Morphological anti-aliasing based on Jimenez\' MLAA. 0 to disable, 8 for default quality") \
+DRI_CONF_OPT_END
+
+#define DRI_CONF_PP_JIMENEZMLAA_COLOR(def,min,max) \
+DRI_CONF_OPT_BEGIN_V(pp_jimenezmlaa_color,int,def, # min ":" # max ) \
+        DRI_CONF_DESC(en,"Morphological anti-aliasing based on Jimenez\' MLAA. 0 to disable, 8 for default quality. Color version, usable with 2d GL apps") \
+        DRI_CONF_DESC(de,"Morphological anti-aliasing based on Jimenez\' MLAA. 0 to disable, 8 for default quality. Color version, usable with 2d GL apps") \
+        DRI_CONF_DESC(es,"Morphological anti-aliasing based on Jimenez\' MLAA. 0 to disable, 8 for default quality. Color version, usable with 2d GL apps") \
+        DRI_CONF_DESC(nl,"Morphological anti-aliasing based on Jimenez\' MLAA. 0 to disable, 8 for default quality. Color version, usable with 2d GL apps") \
+        DRI_CONF_DESC(fr,"Morphological anti-aliasing based on Jimenez\' MLAA. 0 to disable, 8 for default quality. Color version, usable with 2d GL apps") \
+        DRI_CONF_DESC(sv,"Morphological anti-aliasing based on Jimenez\' MLAA. 0 to disable, 8 for default quality. Color version, usable with 2d GL apps") \
+DRI_CONF_OPT_END
+
 #define DRI_CONF_MAX_TEXTURE_UNITS(def,min,max) \
 DRI_CONF_OPT_BEGIN_V(texture_units,int,def, # min ":" # max ) \
         DRI_CONF_DESC(en,"Number of texture units used") \
diff --git a/src/mesa/drivers/dri/common/xmlpool/t_options.h b/src/mesa/drivers/dri/common/xmlpool/t_options.h
index 5fd6ec65bf8..2427aa77f5b 100644
--- a/src/mesa/drivers/dri/common/xmlpool/t_options.h
+++ b/src/mesa/drivers/dri/common/xmlpool/t_options.h
@@ -191,6 +191,36 @@ DRI_CONF_OPT_BEGIN(hyperz,bool,def) \
         DRI_CONF_DESC(en,gettext("Use HyperZ to boost performance")) \
 DRI_CONF_OPT_END
 
+#define DRI_CONF_PP_CELSHADE(def) \
+DRI_CONF_OPT_BEGIN_V(pp_celshade,enum,def,"0:1") \
+        DRI_CONF_DESC(en,gettext("A post-processing filter to cel-shade the output")) \
+DRI_CONF_OPT_END
+
+#define DRI_CONF_PP_NORED(def) \
+DRI_CONF_OPT_BEGIN_V(pp_nored,enum,def,"0:1") \
+        DRI_CONF_DESC(en,gettext("A post-processing filter to remove the red channel")) \
+DRI_CONF_OPT_END
+
+#define DRI_CONF_PP_NOGREEN(def) \
+DRI_CONF_OPT_BEGIN_V(pp_nogreen,enum,def,"0:1") \
+        DRI_CONF_DESC(en,gettext("A post-processing filter to remove the green channel")) \
+DRI_CONF_OPT_END
+
+#define DRI_CONF_PP_NOBLUE(def) \
+DRI_CONF_OPT_BEGIN_V(pp_noblue,enum,def,"0:1") \
+        DRI_CONF_DESC(en,gettext("A post-processing filter to remove the blue channel")) \
+DRI_CONF_OPT_END
+
+#define DRI_CONF_PP_JIMENEZMLAA(def,min,max) \
+DRI_CONF_OPT_BEGIN_V(pp_jimenezmlaa,int,def, # min ":" # max ) \
+        DRI_CONF_DESC(en,gettext("Morphological anti-aliasing based on Jimenez\\\' MLAA. 0 to disable, 8 for default quality")) \
+DRI_CONF_OPT_END
+
+#define DRI_CONF_PP_JIMENEZMLAA_COLOR(def,min,max) \
+DRI_CONF_OPT_BEGIN_V(pp_jimenezmlaa_color,int,def, # min ":" # max ) \
+        DRI_CONF_DESC(en,gettext("Morphological anti-aliasing based on Jimenez\\\' MLAA. 0 to disable, 8 for default quality. Color version, usable with 2d GL apps")) \
+DRI_CONF_OPT_END
+
 #define DRI_CONF_MAX_TEXTURE_UNITS(def,min,max) \
 DRI_CONF_OPT_BEGIN_V(texture_units,int,def, # min ":" # max ) \
         DRI_CONF_DESC(en,gettext("Number of texture units used")) \
diff --git a/src/mesa/drivers/dri/i915/i830_vtbl.c b/src/mesa/drivers/dri/i915/i830_vtbl.c
index 6d43726beb1..ed5286fd7d9 100644
--- a/src/mesa/drivers/dri/i915/i830_vtbl.c
+++ b/src/mesa/drivers/dri/i915/i830_vtbl.c
@@ -881,6 +881,12 @@ i830_invalidate_state(struct intel_context *intel, GLuint new_state)
       i830_update_provoking_vertex(&intel->ctx);
 }
 
+static bool
+i830_is_hiz_depth_format(struct intel_context *intel, gl_format format)
+{
+   return false;
+}
+
 void
 i830InitVtbl(struct i830_context *i830)
 {
@@ -898,4 +904,5 @@ i830InitVtbl(struct i830_context *i830)
    i830->intel.vtbl.finish_batch = intel_finish_vb;
    i830->intel.vtbl.invalidate_state = i830_invalidate_state;
    i830->intel.vtbl.render_target_supported = i830_render_target_supported;
+   i830->intel.vtbl.is_hiz_depth_format = i830_is_hiz_depth_format;
 }
diff --git a/src/mesa/drivers/dri/i915/i915_fragprog.c b/src/mesa/drivers/dri/i915/i915_fragprog.c
index 6e1d7092237..d155b85ffca 100644
--- a/src/mesa/drivers/dri/i915/i915_fragprog.c
+++ b/src/mesa/drivers/dri/i915/i915_fragprog.c
@@ -175,10 +175,8 @@ src_vector(struct i915_fragment_program *p,
    case PROGRAM_STATE_VAR:
    case PROGRAM_NAMED_PARAM:
    case PROGRAM_UNIFORM:
-      src =
-         i915_emit_param4fv(p,
-                            program->Base.Parameters->ParameterValues[source->
-                                                                      Index]);
+      src = i915_emit_param4fv(p,
+	 &program->Base.Parameters->ParameterValues[source->Index][0].f);
       break;
 
    default:
@@ -303,7 +301,7 @@ do {									\
 /* 
  * TODO: consider moving this into core 
  */
-static void calc_live_regs( struct i915_fragment_program *p )
+static bool calc_live_regs( struct i915_fragment_program *p )
 {
     const struct gl_fragment_program *program = &p->FragProg;
     GLuint regsUsed = 0xffff0000;
@@ -317,6 +315,9 @@ static void calc_live_regs( struct i915_fragment_program *p )
 
         /* Register is written to: unmark as live for this and preceeding ops */ 
         if (inst->DstReg.File == PROGRAM_TEMPORARY) {
+	    if (inst->DstReg.Index > 16)
+	       return false;
+
             live_components[inst->DstReg.Index] &= ~inst->DstReg.WriteMask;
             if (live_components[inst->DstReg.Index] == 0)
                 regsUsed &= ~(1 << inst->DstReg.Index);
@@ -327,6 +328,9 @@ static void calc_live_regs( struct i915_fragment_program *p )
             if (inst->SrcReg[a].File == PROGRAM_TEMPORARY) {
                 unsigned c;
 
+		if (inst->SrcReg[a].Index > 16)
+		   return false;
+
                 regsUsed |= 1 << inst->SrcReg[a].Index;
 
                 for (c = 0; c < 4; c++) {
@@ -340,6 +344,8 @@ static void calc_live_regs( struct i915_fragment_program *p )
 
         p->usedRegs[i] = regsUsed;
     }
+
+    return true;
 }
 
 static GLuint get_live_regs( struct i915_fragment_program *p, 
@@ -394,7 +400,10 @@ upload_program(struct i915_fragment_program *p)
 
    /* Not always needed:
     */
-   calc_live_regs(p);
+   if (!calc_live_regs(p)) {
+      i915_program_error(p, "Could not allocate registers");
+      return;
+   }
 
    while (1) {
       GLuint src0, src1, src2, flags;
diff --git a/src/mesa/drivers/dri/i915/i915_program.c b/src/mesa/drivers/dri/i915/i915_program.c
index ca1949b223e..0a600d30bef 100644
--- a/src/mesa/drivers/dri/i915/i915_program.c
+++ b/src/mesa/drivers/dri/i915/i915_program.c
@@ -442,14 +442,16 @@ i915_emit_param4fv(struct i915_fragment_program * p, const GLfloat * values)
 void
 i915_program_error(struct i915_fragment_program *p, const char *fmt, ...)
 {
-   va_list args;
+   if (unlikely((INTEL_DEBUG & (DEBUG_WM | DEBUG_FALLBACKS)) != 0)) {
+      va_list args;
 
-   fprintf(stderr, "i915_program_error: ");
-   va_start(args, fmt);
-   vfprintf(stderr, fmt, args);
-   va_end(args);
+      fprintf(stderr, "i915_program_error: ");
+      va_start(args, fmt);
+      vfprintf(stderr, fmt, args);
+      va_end(args);
 
-   fprintf(stderr, "\n");
+      fprintf(stderr, "\n");
+   }
    p->error = 1;
 }
 
diff --git a/src/mesa/drivers/dri/i965/Makefile b/src/mesa/drivers/dri/i965/Makefile
index 44f28cd9d15..d9c885da65b 100644
--- a/src/mesa/drivers/dri/i965/Makefile
+++ b/src/mesa/drivers/dri/i965/Makefile
@@ -124,7 +124,11 @@ CXX_SOURCES = \
 	brw_fs_reg_allocate.cpp \
 	brw_fs_schedule_instructions.cpp \
 	brw_fs_vector_splitting.cpp \
-	brw_shader.cpp
+	brw_shader.cpp \
+	brw_vec4.cpp \
+	brw_vec4_emit.cpp \
+	brw_vec4_reg_allocate.cpp \
+	brw_vec4_visitor.cpp
 
 ASM_SOURCES = 
 
diff --git a/src/mesa/drivers/dri/i965/brw_context.h b/src/mesa/drivers/dri/i965/brw_context.h
index 471015cf9d0..df63fe1d52c 100644
--- a/src/mesa/drivers/dri/i965/brw_context.h
+++ b/src/mesa/drivers/dri/i965/brw_context.h
@@ -212,6 +212,7 @@ enum state_struct_type {
    AUB_TRACE_BINDING_TABLE =		0x101,
    AUB_TRACE_SURFACE_STATE =		0x102,
    AUB_TRACE_VS_CONSTANTS =		0x103,
+   AUB_TRACE_WM_CONSTANTS =		0x104,
 };
 
 /** Subclass of Mesa vertex program */
@@ -247,6 +248,7 @@ enum param_conversion {
    PARAM_CONVERT_F2I,
    PARAM_CONVERT_F2U,
    PARAM_CONVERT_F2B,
+   PARAM_CONVERT_ZERO,
 };
 
 /* Data about a particular attempt to compile a program.  Note that
@@ -310,12 +312,20 @@ struct brw_vs_prog_data {
    GLuint total_grf;
    GLbitfield64 outputs_written;
    GLuint nr_params;       /**< number of float params/constants */
+   GLuint total_scratch;
 
    GLuint inputs_read;
 
    /* Used for calculating urb partitions:
     */
    GLuint urb_entry_size;
+
+   const float *param[MAX_UNIFORMS * 4]; /* should be: BRW_MAX_CURBE */
+   enum param_conversion param_convert[MAX_UNIFORMS * 4];
+   const float *pull_param[MAX_UNIFORMS * 4];
+   enum param_conversion pull_param_convert[MAX_UNIFORMS * 4];
+
+   bool uses_new_param_layout;
 };
 
 
@@ -528,7 +538,7 @@ struct brw_context
        * the CURBE, the depth buffer, and a query BO.
        */
       drm_intel_bo *validated_bos[VERT_ATTRIB_MAX + BRW_WM_MAX_SURF + 16];
-      int validated_bo_count;
+      unsigned int validated_bo_count;
    } state;
 
    struct brw_cache cache;
@@ -662,6 +672,7 @@ struct brw_context
       struct brw_vs_prog_data *prog_data;
       int8_t *constant_map; /* variable array following prog_data */
 
+      drm_intel_bo *scratch_bo;
       drm_intel_bo *const_bo;
       /** Offset in the program cache to the VS program */
       uint32_t prog_offset;
@@ -674,6 +685,23 @@ struct brw_context
 
       uint32_t push_const_offset; /* Offset in the batchbuffer */
       int push_const_size; /* in 256-bit register increments */
+
+      /** @{ register allocator */
+
+      struct ra_regs *regs;
+
+      /**
+       * Array of the ra classes for the unaligned contiguous register
+       * block sizes used.
+       */
+      int *classes;
+
+      /**
+       * Mapping for register-allocated objects in *regs to the first
+       * GRF for that object.
+      */
+      uint8_t *ra_reg_to_grf;
+      /** @} */
    } vs;
 
    struct {
@@ -726,7 +754,6 @@ struct brw_context
       GLuint render_surf;
       GLuint nr_surfaces;      
 
-      GLuint max_threads;
       drm_intel_bo *scratch_bo;
 
       GLuint sampler_count;
@@ -747,6 +774,29 @@ struct brw_context
        * Pre-gen6, push constants live in the CURBE.
        */
       uint32_t push_const_offset;
+
+      /** @{ register allocator */
+
+      struct ra_regs *regs;
+
+      /** Array of the ra classes for the unaligned contiguous
+       * register block sizes used.
+       */
+      int *classes;
+
+      /**
+       * Mapping for register-allocated objects in *regs to the first
+       * GRF for that object.
+      */
+      uint8_t *ra_reg_to_grf;
+
+      /**
+       * ra class for the aligned pairs we use for PLN, which doesn't
+       * appear in *classes.
+       */
+      int aligned_pairs_class;
+
+      /** @} */
    } wm;
 
 
@@ -827,6 +877,10 @@ void brw_validate_textures( struct brw_context *brw );
  */
 void brwInitFragProgFuncs( struct dd_function_table *functions );
 
+int brw_get_scratch_size(int size);
+void brw_get_scratch_bo(struct intel_context *intel,
+			drm_intel_bo **scratch_bo, int size);
+
 
 /* brw_urb.c
  */
@@ -874,7 +928,7 @@ brw_fragment_program_const(const struct gl_fragment_program *p)
 }
 
 static inline
-float convert_param(enum param_conversion conversion, float param)
+float convert_param(enum param_conversion conversion, const float *param)
 {
    union {
       float f;
@@ -884,21 +938,23 @@ float convert_param(enum param_conversion conversion, float param)
 
    switch (conversion) {
    case PARAM_NO_CONVERT:
-      return param;
+      return *param;
    case PARAM_CONVERT_F2I:
-      fi.i = param;
+      fi.i = *param;
       return fi.f;
    case PARAM_CONVERT_F2U:
-      fi.u = param;
+      fi.u = *param;
       return fi.f;
    case PARAM_CONVERT_F2B:
-      if (param != 0.0)
+      if (*param != 0.0)
 	 fi.i = 1;
       else
 	 fi.i = 0;
       return fi.f;
+   case PARAM_CONVERT_ZERO:
+      return 0.0;
    default:
-      return param;
+      return *param;
    }
 }
 
diff --git a/src/mesa/drivers/dri/i965/brw_curbe.c b/src/mesa/drivers/dri/i965/brw_curbe.c
index ae11c487a2c..960be10006e 100644
--- a/src/mesa/drivers/dri/i965/brw_curbe.c
+++ b/src/mesa/drivers/dri/i965/brw_curbe.c
@@ -203,7 +203,7 @@ static void prepare_constant_buffer(struct brw_context *brw)
       /* copy float constants */
       for (i = 0; i < brw->wm.prog_data->nr_params; i++) {
 	 buf[offset + i] = convert_param(brw->wm.prog_data->param_convert[i],
-					 *brw->wm.prog_data->param[i]);
+					 brw->wm.prog_data->param[i]);
       }
    }
 
@@ -244,15 +244,22 @@ static void prepare_constant_buffer(struct brw_context *brw)
       GLuint offset = brw->curbe.vs_start * 16;
       GLuint nr = brw->vs.prog_data->nr_params / 4;
 
-      /* Load the subset of push constants that will get used when
-       * we also have a pull constant buffer.
-       */
-      for (i = 0; i < vp->program.Base.Parameters->NumParameters; i++) {
-	 if (brw->vs.constant_map[i] != -1) {
-	    assert(brw->vs.constant_map[i] <= nr);
-	    memcpy(buf + offset + brw->vs.constant_map[i] * 4,
-		   vp->program.Base.Parameters->ParameterValues[i],
-		   4 * sizeof(float));
+      if (brw->vs.prog_data->uses_new_param_layout) {
+	 for (i = 0; i < brw->vs.prog_data->nr_params; i++) {
+	    buf[offset + i] = convert_param(brw->vs.prog_data->param_convert[i],
+					    brw->vs.prog_data->param[i]);
+	 }
+      } else {
+	 /* Load the subset of push constants that will get used when
+	  * we also have a pull constant buffer.
+	  */
+	 for (i = 0; i < vp->program.Base.Parameters->NumParameters; i++) {
+	    if (brw->vs.constant_map[i] != -1) {
+	       assert(brw->vs.constant_map[i] <= nr);
+	       memcpy(buf + offset + brw->vs.constant_map[i] * 4,
+		      vp->program.Base.Parameters->ParameterValues[i],
+		      4 * sizeof(float));
+	    }
 	 }
       }
    }
diff --git a/src/mesa/drivers/dri/i965/brw_defines.h b/src/mesa/drivers/dri/i965/brw_defines.h
index 0a3027d04ad..d1799c0ab4f 100644
--- a/src/mesa/drivers/dri/i965/brw_defines.h
+++ b/src/mesa/drivers/dri/i965/brw_defines.h
@@ -557,58 +557,93 @@
 #define BRW_WE_ALL		1
 /** @} */
 
-#define BRW_OPCODE_MOV        1
-#define BRW_OPCODE_SEL        2
-#define BRW_OPCODE_NOT        4
-#define BRW_OPCODE_AND        5
-#define BRW_OPCODE_OR         6
-#define BRW_OPCODE_XOR        7
-#define BRW_OPCODE_SHR        8
-#define BRW_OPCODE_SHL        9
-#define BRW_OPCODE_RSR        10
-#define BRW_OPCODE_RSL        11
-#define BRW_OPCODE_ASR        12
-#define BRW_OPCODE_CMP        16
-#define BRW_OPCODE_CMPN       17
-#define BRW_OPCODE_JMPI       32
-#define BRW_OPCODE_IF         34
-#define BRW_OPCODE_IFF        35
-#define BRW_OPCODE_ELSE       36
-#define BRW_OPCODE_ENDIF      37
-#define BRW_OPCODE_DO         38
-#define BRW_OPCODE_WHILE      39
-#define BRW_OPCODE_BREAK      40
-#define BRW_OPCODE_CONTINUE   41
-#define BRW_OPCODE_HALT       42
-#define BRW_OPCODE_MSAVE      44
-#define BRW_OPCODE_MRESTORE   45
-#define BRW_OPCODE_PUSH       46
-#define BRW_OPCODE_POP        47
-#define BRW_OPCODE_WAIT       48
-#define BRW_OPCODE_SEND       49
-#define BRW_OPCODE_SENDC      50
-#define BRW_OPCODE_MATH       56
-#define BRW_OPCODE_ADD        64
-#define BRW_OPCODE_MUL        65
-#define BRW_OPCODE_AVG        66
-#define BRW_OPCODE_FRC        67
-#define BRW_OPCODE_RNDU       68
-#define BRW_OPCODE_RNDD       69
-#define BRW_OPCODE_RNDE       70
-#define BRW_OPCODE_RNDZ       71
-#define BRW_OPCODE_MAC        72
-#define BRW_OPCODE_MACH       73
-#define BRW_OPCODE_LZD        74
-#define BRW_OPCODE_SAD2       80
-#define BRW_OPCODE_SADA2      81
-#define BRW_OPCODE_DP4        84
-#define BRW_OPCODE_DPH        85
-#define BRW_OPCODE_DP3        86
-#define BRW_OPCODE_DP2        87
-#define BRW_OPCODE_DPA2       88
-#define BRW_OPCODE_LINE       89
-#define BRW_OPCODE_PLN        90
-#define BRW_OPCODE_NOP        126
+enum opcode {
+   /* These are the actual hardware opcodes. */
+   BRW_OPCODE_MOV =	1,
+   BRW_OPCODE_SEL =	2,
+   BRW_OPCODE_NOT =	4,
+   BRW_OPCODE_AND =	5,
+   BRW_OPCODE_OR =	6,
+   BRW_OPCODE_XOR =	7,
+   BRW_OPCODE_SHR =	8,
+   BRW_OPCODE_SHL =	9,
+   BRW_OPCODE_RSR =	10,
+   BRW_OPCODE_RSL =	11,
+   BRW_OPCODE_ASR =	12,
+   BRW_OPCODE_CMP =	16,
+   BRW_OPCODE_CMPN =	17,
+   BRW_OPCODE_JMPI =	32,
+   BRW_OPCODE_IF =	34,
+   BRW_OPCODE_IFF =	35,
+   BRW_OPCODE_ELSE =	36,
+   BRW_OPCODE_ENDIF =	37,
+   BRW_OPCODE_DO =	38,
+   BRW_OPCODE_WHILE =	39,
+   BRW_OPCODE_BREAK =	40,
+   BRW_OPCODE_CONTINUE = 41,
+   BRW_OPCODE_HALT =	42,
+   BRW_OPCODE_MSAVE =	44,
+   BRW_OPCODE_MRESTORE = 45,
+   BRW_OPCODE_PUSH =	46,
+   BRW_OPCODE_POP =	47,
+   BRW_OPCODE_WAIT =	48,
+   BRW_OPCODE_SEND =	49,
+   BRW_OPCODE_SENDC =	50,
+   BRW_OPCODE_MATH =	56,
+   BRW_OPCODE_ADD =	64,
+   BRW_OPCODE_MUL =	65,
+   BRW_OPCODE_AVG =	66,
+   BRW_OPCODE_FRC =	67,
+   BRW_OPCODE_RNDU =	68,
+   BRW_OPCODE_RNDD =	69,
+   BRW_OPCODE_RNDE =	70,
+   BRW_OPCODE_RNDZ =	71,
+   BRW_OPCODE_MAC =	72,
+   BRW_OPCODE_MACH =	73,
+   BRW_OPCODE_LZD =	74,
+   BRW_OPCODE_SAD2 =	80,
+   BRW_OPCODE_SADA2 =	81,
+   BRW_OPCODE_DP4 =	84,
+   BRW_OPCODE_DPH =	85,
+   BRW_OPCODE_DP3 =	86,
+   BRW_OPCODE_DP2 =	87,
+   BRW_OPCODE_DPA2 =	88,
+   BRW_OPCODE_LINE =	89,
+   BRW_OPCODE_PLN =	90,
+   BRW_OPCODE_NOP =	126,
+
+   /* These are compiler backend opcodes that get translated into other
+    * instructions.
+    */
+   FS_OPCODE_FB_WRITE = 128,
+   SHADER_OPCODE_RCP,
+   SHADER_OPCODE_RSQ,
+   SHADER_OPCODE_SQRT,
+   SHADER_OPCODE_EXP2,
+   SHADER_OPCODE_LOG2,
+   SHADER_OPCODE_POW,
+   SHADER_OPCODE_SIN,
+   SHADER_OPCODE_COS,
+   FS_OPCODE_DDX,
+   FS_OPCODE_DDY,
+   FS_OPCODE_PIXEL_X,
+   FS_OPCODE_PIXEL_Y,
+   FS_OPCODE_CINTERP,
+   FS_OPCODE_LINTERP,
+   FS_OPCODE_TEX,
+   FS_OPCODE_TXB,
+   FS_OPCODE_TXD,
+   FS_OPCODE_TXL,
+   FS_OPCODE_TXS,
+   FS_OPCODE_DISCARD,
+   FS_OPCODE_SPILL,
+   FS_OPCODE_UNSPILL,
+   FS_OPCODE_PULL_CONSTANT_LOAD,
+
+   VS_OPCODE_URB_WRITE,
+   VS_OPCODE_SCRATCH_READ,
+   VS_OPCODE_SCRATCH_WRITE,
+};
 
 #define BRW_PREDICATE_NONE             0
 #define BRW_PREDICATE_NORMAL           1
@@ -734,7 +769,6 @@
 #define BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_BIAS_COMPARE 0
 #define BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_LOD_COMPARE  1
 #define BRW_SAMPLER_MESSAGE_SIMD4X2_RESINFO           2
-#define BRW_SAMPLER_MESSAGE_SIMD8_RESINFO             2
 #define BRW_SAMPLER_MESSAGE_SIMD16_RESINFO            2
 #define BRW_SAMPLER_MESSAGE_SIMD4X2_LD                3
 #define BRW_SAMPLER_MESSAGE_SIMD8_LD                  3
@@ -747,6 +781,7 @@
 #define GEN5_SAMPLER_MESSAGE_SAMPLE_DERIVS       4
 #define GEN5_SAMPLER_MESSAGE_SAMPLE_BIAS_COMPARE 5
 #define GEN5_SAMPLER_MESSAGE_SAMPLE_LOD_COMPARE  6
+#define GEN5_SAMPLER_MESSAGE_SAMPLE_RESINFO      10
 
 /* for GEN5 only */
 #define BRW_SAMPLER_SIMD_MODE_SIMD4X2                   0
diff --git a/src/mesa/drivers/dri/i965/brw_disasm.c b/src/mesa/drivers/dri/i965/brw_disasm.c
index af41c848308..927b0b4acc9 100644
--- a/src/mesa/drivers/dri/i965/brw_disasm.c
+++ b/src/mesa/drivers/dri/i965/brw_disasm.c
@@ -309,6 +309,35 @@ char *target_function[16] = {
     [BRW_MESSAGE_TARGET_THREAD_SPAWNER] = "thread_spawner"
 };
 
+char *target_function_gen6[16] = {
+    [BRW_MESSAGE_TARGET_NULL] = "null",
+    [BRW_MESSAGE_TARGET_MATH] = "math",
+    [BRW_MESSAGE_TARGET_SAMPLER] = "sampler",
+    [BRW_MESSAGE_TARGET_GATEWAY] = "gateway",
+    [GEN6_MESSAGE_TARGET_DP_SAMPLER_CACHE] = "sampler",
+    [GEN6_MESSAGE_TARGET_DP_RENDER_CACHE] = "render",
+    [GEN6_MESSAGE_TARGET_DP_CONST_CACHE] = "const",
+    [BRW_MESSAGE_TARGET_URB] = "urb",
+    [BRW_MESSAGE_TARGET_THREAD_SPAWNER] = "thread_spawner"
+};
+
+char *dp_rc_msg_type_gen6[16] = {
+    [BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ] = "OWORD block read",
+    [GEN6_DATAPORT_READ_MESSAGE_RENDER_UNORM_READ] = "RT UNORM read",
+    [GEN6_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ] = "OWORD dual block read",
+    [GEN6_DATAPORT_READ_MESSAGE_MEDIA_BLOCK_READ] = "media block read",
+    [GEN6_DATAPORT_READ_MESSAGE_OWORD_UNALIGN_BLOCK_READ] = "OWORD unaligned block read",
+    [GEN6_DATAPORT_READ_MESSAGE_DWORD_SCATTERED_READ] = "DWORD scattered read",
+    [GEN6_DATAPORT_WRITE_MESSAGE_DWORD_ATOMIC_WRITE] = "DWORD atomic write",
+    [GEN6_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE] = "OWORD block write",
+    [GEN6_DATAPORT_WRITE_MESSAGE_OWORD_DUAL_BLOCK_WRITE] = "OWORD dual block write",
+    [GEN6_DATAPORT_WRITE_MESSAGE_MEDIA_BLOCK_WRITE] = "media block write",
+    [GEN6_DATAPORT_WRITE_MESSAGE_DWORD_SCATTERED_WRITE] = "DWORD scattered write",
+    [GEN6_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE] = "RT write",
+    [GEN6_DATAPORT_WRITE_MESSAGE_STREAMED_VB_WRITE] = "streamed VB write",
+    [GEN6_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_UNORM_WRITE] = "RT UNORMc write",
+};
+
 char *math_function[16] = {
     [BRW_MATH_FUNCTION_INV] = "inv",
     [BRW_MATH_FUNCTION_LOG] = "log",
@@ -927,8 +956,14 @@ int brw_disasm (FILE *file, struct brw_instruction *inst, int gen)
 	newline (file);
 	pad (file, 16);
 	space = 0;
-	err |= control (file, "target function", target_function,
-			target, &space);
+
+	if (gen >= 6) {
+	   err |= control (file, "target function", target_function_gen6,
+			   target, &space);
+	} else {
+	   err |= control (file, "target function", target_function,
+			   target, &space);
+	}
 
 	switch (target) {
 	case BRW_MESSAGE_TARGET_MATH:
@@ -985,9 +1020,16 @@ int brw_disasm (FILE *file, struct brw_instruction *inst, int gen)
 			inst->bits3.dp_read.msg_type);
 	    }
 	    break;
+
 	case BRW_MESSAGE_TARGET_DATAPORT_WRITE:
 	    if (gen >= 6) {
-		format (file, " (%d, %d, %d, %d, %d, %d)",
+		format (file, " (");
+
+		err |= control (file, "DP rc message type",
+				dp_rc_msg_type_gen6,
+				inst->bits3.gen6_dp.msg_type, &space);
+
+		format (file, ", %d, %d, %d, %d, %d, %d)",
 			inst->bits3.gen6_dp.binding_table_index,
 			inst->bits3.gen6_dp.msg_control,
 			inst->bits3.gen6_dp.msg_type,
@@ -1003,6 +1045,7 @@ int brw_disasm (FILE *file, struct brw_instruction *inst, int gen)
 			inst->bits3.dp_write.send_commit_msg);
 	    }
 	    break;
+
 	case BRW_MESSAGE_TARGET_URB:
 	    if (gen >= 5) {
 		format (file, " %d", inst->bits3.urb_gen5.offset);
diff --git a/src/mesa/drivers/dri/i965/brw_draw_upload.c b/src/mesa/drivers/dri/i965/brw_draw_upload.c
index 56a46ced6e3..7bc69c612e3 100644
--- a/src/mesa/drivers/dri/i965/brw_draw_upload.c
+++ b/src/mesa/drivers/dri/i965/brw_draw_upload.c
@@ -689,17 +689,17 @@ static void brw_prepare_indices(struct brw_context *brw)
        * rebase it into a temporary.
        */
        if ((get_size(index_buffer->type) - 1) & offset) {
-           GLubyte *map = ctx->Driver.MapBuffer(ctx,
-                                                GL_ELEMENT_ARRAY_BUFFER_ARB,
-                                                GL_DYNAMIC_DRAW_ARB,
-                                                bufferobj);
-           map += offset;
+           GLubyte *map = ctx->Driver.MapBufferRange(ctx,
+						     offset,
+						     ib_size,
+						     GL_MAP_WRITE_BIT,
+						     bufferobj);
 
 	   intel_upload_data(&brw->intel, map, ib_size, ib_type_size,
 			     &bo, &offset);
 	   brw->ib.start_vertex_offset = offset / ib_type_size;
 
-           ctx->Driver.UnmapBuffer(ctx, GL_ELEMENT_ARRAY_BUFFER_ARB, bufferobj);
+           ctx->Driver.UnmapBuffer(ctx, bufferobj);
        } else {
 	  /* Use CMD_3D_PRIM's start_vertex_offset to avoid re-uploading
 	   * the index buffer state when we're just moving the start index
diff --git a/src/mesa/drivers/dri/i965/brw_eu.h b/src/mesa/drivers/dri/i965/brw_eu.h
index 72d50eadbce..af50305fc2b 100644
--- a/src/mesa/drivers/dri/i965/brw_eu.h
+++ b/src/mesa/drivers/dri/i965/brw_eu.h
@@ -44,6 +44,9 @@
 #define BRW_SWIZZLE_NOOP      BRW_SWIZZLE4(0,1,2,3)
 #define BRW_SWIZZLE_XYZW      BRW_SWIZZLE4(0,1,2,3)
 #define BRW_SWIZZLE_XXXX      BRW_SWIZZLE4(0,0,0,0)
+#define BRW_SWIZZLE_YYYY      BRW_SWIZZLE4(1,1,1,1)
+#define BRW_SWIZZLE_ZZZZ      BRW_SWIZZLE4(2,2,2,2)
+#define BRW_SWIZZLE_WWWW      BRW_SWIZZLE4(3,3,3,3)
 #define BRW_SWIZZLE_XYXY      BRW_SWIZZLE4(0,1,0,1)
 
 
@@ -798,6 +801,12 @@ void brw_init_compile(struct brw_context *, struct brw_compile *p,
 		      void *mem_ctx);
 const GLuint *brw_get_program( struct brw_compile *p, GLuint *sz );
 
+struct brw_instruction *brw_next_insn(struct brw_compile *p, GLuint opcode);
+void brw_set_dest(struct brw_compile *p, struct brw_instruction *insn,
+		  struct brw_reg dest);
+void brw_set_src0(struct brw_compile *p, struct brw_instruction *insn,
+		  struct brw_reg reg);
+
 
 /* Helpers for regular instructions:
  */
@@ -852,6 +861,27 @@ ROUND(RNDE)
 
 /* Helpers for SEND instruction:
  */
+void brw_set_dp_read_message(struct brw_compile *p,
+			     struct brw_instruction *insn,
+			     GLuint binding_table_index,
+			     GLuint msg_control,
+			     GLuint msg_type,
+			     GLuint target_cache,
+			     GLuint msg_length,
+			     GLuint response_length);
+
+void brw_set_dp_write_message(struct brw_compile *p,
+			      struct brw_instruction *insn,
+			      GLuint binding_table_index,
+			      GLuint msg_control,
+			      GLuint msg_type,
+			      GLuint msg_length,
+			      GLboolean header_present,
+			      GLuint pixel_scoreboard_clear,
+			      GLuint response_length,
+			      GLuint end_of_thread,
+			      GLuint send_commit_msg);
+
 void brw_urb_WRITE(struct brw_compile *p,
 		   struct brw_reg dest,
 		   GLuint msg_reg_nr,
diff --git a/src/mesa/drivers/dri/i965/brw_eu_emit.c b/src/mesa/drivers/dri/i965/brw_eu_emit.c
index e7370f36064..c5013de7ec1 100644
--- a/src/mesa/drivers/dri/i965/brw_eu_emit.c
+++ b/src/mesa/drivers/dri/i965/brw_eu_emit.c
@@ -89,9 +89,9 @@ gen7_convert_mrf_to_grf(struct brw_compile *p, struct brw_reg *reg)
 }
 
 
-static void brw_set_dest(struct brw_compile *p,
-			 struct brw_instruction *insn,
-			 struct brw_reg dest)
+void
+brw_set_dest(struct brw_compile *p, struct brw_instruction *insn,
+	     struct brw_reg dest)
 {
    if (dest.file != BRW_ARCHITECTURE_REGISTER_FILE &&
        dest.file != BRW_MESSAGE_REGISTER_FILE)
@@ -221,9 +221,9 @@ validate_reg(struct brw_instruction *insn, struct brw_reg reg)
    /* 10. Check destination issues. */
 }
 
-static void brw_set_src0(struct brw_compile *p,
-			 struct brw_instruction *insn,
-			 struct brw_reg reg)
+void
+brw_set_src0(struct brw_compile *p, struct brw_instruction *insn,
+	     struct brw_reg reg)
 {
    if (reg.type != BRW_ARCHITECTURE_REGISTER_FILE)
       assert(reg.nr < 128);
@@ -504,17 +504,18 @@ static void brw_set_urb_message( struct brw_compile *p,
     }
 }
 
-static void brw_set_dp_write_message( struct brw_compile *p,
-				      struct brw_instruction *insn,
-				      GLuint binding_table_index,
-				      GLuint msg_control,
-				      GLuint msg_type,
-				      GLuint msg_length,
-				      GLboolean header_present,
-				      GLuint pixel_scoreboard_clear,
-				      GLuint response_length,
-				      GLuint end_of_thread,
-				      GLuint send_commit_msg)
+void
+brw_set_dp_write_message(struct brw_compile *p,
+			 struct brw_instruction *insn,
+			 GLuint binding_table_index,
+			 GLuint msg_control,
+			 GLuint msg_type,
+			 GLuint msg_length,
+			 GLboolean header_present,
+			 GLuint pixel_scoreboard_clear,
+			 GLuint response_length,
+			 GLuint end_of_thread,
+			 GLuint send_commit_msg)
 {
    struct brw_context *brw = p->brw;
    struct intel_context *intel = &brw->intel;
@@ -570,7 +571,7 @@ static void brw_set_dp_write_message( struct brw_compile *p,
    }
 }
 
-static void
+void
 brw_set_dp_read_message(struct brw_compile *p,
 			struct brw_instruction *insn,
 			GLuint binding_table_index,
@@ -709,9 +710,9 @@ static void brw_set_sampler_message(struct brw_compile *p,
 }
 
 
-
-static struct brw_instruction *next_insn( struct brw_compile *p, 
-					  GLuint opcode )
+#define next_insn brw_next_insn
+struct brw_instruction *
+brw_next_insn(struct brw_compile *p, GLuint opcode)
 {
    struct brw_instruction *insn;
 
@@ -732,7 +733,6 @@ static struct brw_instruction *next_insn( struct brw_compile *p,
    return insn;
 }
 
-
 static struct brw_instruction *brw_alu1( struct brw_compile *p,
 					 GLuint opcode,
 					 struct brw_reg dest,
@@ -1341,8 +1341,7 @@ struct brw_instruction *brw_WHILE(struct brw_compile *p,
       brw_set_src1(p, insn, brw_imm_ud(0));
       insn->bits3.break_cont.jip = br * (do_insn - insn);
 
-      insn->header.execution_size = do_insn->header.execution_size;
-      assert(insn->header.execution_size == BRW_EXECUTE_8);
+      insn->header.execution_size = BRW_EXECUTE_8;
    } else if (intel->gen == 6) {
       insn = next_insn(p, BRW_OPCODE_WHILE);
 
@@ -1351,8 +1350,7 @@ struct brw_instruction *brw_WHILE(struct brw_compile *p,
       brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
       brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
 
-      insn->header.execution_size = do_insn->header.execution_size;
-      assert(insn->header.execution_size == BRW_EXECUTE_8);
+      insn->header.execution_size = BRW_EXECUTE_8;
    } else {
       if (p->single_program_flow) {
 	 insn = next_insn(p, BRW_OPCODE_ADD);
@@ -2246,10 +2244,13 @@ void brw_urb_WRITE(struct brw_compile *p,
 
    if (intel->gen == 7) {
       /* Enable Channel Masks in the URB_WRITE_HWORD message header */
+      brw_push_insn_state(p);
+      brw_set_access_mode(p, BRW_ALIGN_1);
       brw_OR(p, retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE, msg_reg_nr, 5),
 		       BRW_REGISTER_TYPE_UD),
 	        retype(brw_vec1_grf(0, 5), BRW_REGISTER_TYPE_UD),
 		brw_imm_ud(0xff00));
+      brw_pop_insn_state(p);
    }
 
    insn = next_insn(p, BRW_OPCODE_SEND);
@@ -2311,7 +2312,7 @@ brw_find_loop_end(struct brw_compile *p, int start)
       if (insn->header.opcode == BRW_OPCODE_WHILE) {
 	 int jip = intel->gen == 6 ? insn->bits1.branch_gen6.jump_count
 				   : insn->bits3.break_cont.jip;
-	 if (ip + jip / br < start)
+	 if (ip + jip / br <= start)
 	    return ip;
       }
    }
diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp
index b5ea943387d..0b0445ea142 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
@@ -143,20 +143,21 @@ fs_visitor::implied_mrf_writes(fs_inst *inst)
       return 0;
 
    switch (inst->opcode) {
-   case FS_OPCODE_RCP:
-   case FS_OPCODE_RSQ:
-   case FS_OPCODE_SQRT:
-   case FS_OPCODE_EXP2:
-   case FS_OPCODE_LOG2:
-   case FS_OPCODE_SIN:
-   case FS_OPCODE_COS:
+   case SHADER_OPCODE_RCP:
+   case SHADER_OPCODE_RSQ:
+   case SHADER_OPCODE_SQRT:
+   case SHADER_OPCODE_EXP2:
+   case SHADER_OPCODE_LOG2:
+   case SHADER_OPCODE_SIN:
+   case SHADER_OPCODE_COS:
       return 1 * c->dispatch_width / 8;
-   case FS_OPCODE_POW:
+   case SHADER_OPCODE_POW:
       return 2 * c->dispatch_width / 8;
    case FS_OPCODE_TEX:
    case FS_OPCODE_TXB:
    case FS_OPCODE_TXD:
    case FS_OPCODE_TXL:
+   case FS_OPCODE_TXS:
       return 1;
    case FS_OPCODE_FB_WRITE:
       return 2;
@@ -181,29 +182,26 @@ fs_visitor::virtual_grf_alloc(int size)
 	 virtual_grf_array_size *= 2;
       virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
 				   virtual_grf_array_size);
-
-      /* This slot is always unused. */
-      virtual_grf_sizes[0] = 0;
    }
    virtual_grf_sizes[virtual_grf_next] = size;
    return virtual_grf_next++;
 }
 
 /** Fixed HW reg constructor. */
-fs_reg::fs_reg(enum register_file file, int hw_reg)
+fs_reg::fs_reg(enum register_file file, int reg)
 {
    init();
    this->file = file;
-   this->hw_reg = hw_reg;
+   this->reg = reg;
    this->type = BRW_REGISTER_TYPE_F;
 }
 
 /** Fixed HW reg constructor. */
-fs_reg::fs_reg(enum register_file file, int hw_reg, uint32_t type)
+fs_reg::fs_reg(enum register_file file, int reg, uint32_t type)
 {
    init();
    this->file = file;
-   this->hw_reg = hw_reg;
+   this->reg = reg;
    this->type = type;
 }
 
@@ -242,11 +240,12 @@ import_uniforms_callback(const void *key,
  * This brings in those uniform definitions
  */
 void
-fs_visitor::import_uniforms(struct hash_table *src_variable_ht)
+fs_visitor::import_uniforms(fs_visitor *v)
 {
-   hash_table_call_foreach(src_variable_ht,
+   hash_table_call_foreach(v->variable_ht,
 			   import_uniforms_callback,
 			   variable_ht);
+   this->params_remap = v->params_remap;
 }
 
 /* Our support for uniforms is piggy-backed on the struct
@@ -281,23 +280,27 @@ fs_visitor::setup_uniform_values(int loc, const glsl_type *type)
 
 	 assert(param < ARRAY_SIZE(c->prog_data.param));
 
-	 switch (type->base_type) {
-	 case GLSL_TYPE_FLOAT:
+	 if (ctx->Const.NativeIntegers) {
 	    c->prog_data.param_convert[param] = PARAM_NO_CONVERT;
-	    break;
-	 case GLSL_TYPE_UINT:
-	    c->prog_data.param_convert[param] = PARAM_CONVERT_F2U;
-	    break;
-	 case GLSL_TYPE_INT:
-	    c->prog_data.param_convert[param] = PARAM_CONVERT_F2I;
-	    break;
-	 case GLSL_TYPE_BOOL:
-	    c->prog_data.param_convert[param] = PARAM_CONVERT_F2B;
-	    break;
-	 default:
-	    assert(!"not reached");
-	    c->prog_data.param_convert[param] = PARAM_NO_CONVERT;
-	    break;
+	 } else {
+	    switch (type->base_type) {
+	    case GLSL_TYPE_FLOAT:
+	       c->prog_data.param_convert[param] = PARAM_NO_CONVERT;
+	       break;
+	    case GLSL_TYPE_UINT:
+	       c->prog_data.param_convert[param] = PARAM_CONVERT_F2U;
+	       break;
+	    case GLSL_TYPE_INT:
+	       c->prog_data.param_convert[param] = PARAM_CONVERT_F2I;
+	       break;
+	    case GLSL_TYPE_BOOL:
+	       c->prog_data.param_convert[param] = PARAM_CONVERT_F2B;
+	       break;
+	    default:
+	       assert(!"not reached");
+	       c->prog_data.param_convert[param] = PARAM_NO_CONVERT;
+	       break;
+	    }
 	 }
 	 this->param_index[param] = loc;
 	 this->param_offset[param] = i;
@@ -463,9 +466,21 @@ fs_visitor::emit_general_interpolation(ir_variable *ir)
 	 } else {
 	    /* Perspective interpolation case. */
 	    for (unsigned int k = 0; k < type->vector_elements; k++) {
-	       struct brw_reg interp = interp_reg(location, k);
-	       emit(FS_OPCODE_LINTERP, attr,
-		    this->delta_x, this->delta_y, fs_reg(interp));
+	       /* FINISHME: At some point we probably want to push
+		* this farther by giving similar treatment to the
+		* other potentially constant components of the
+		* attribute, as well as making brw_vs_constval.c
+		* handle varyings other than gl_TexCoord.
+		*/
+	       if (location >= FRAG_ATTRIB_TEX0 &&
+		   location <= FRAG_ATTRIB_TEX7 &&
+		   k == 3 && !(c->key.proj_attrib_mask & (1 << location))) {
+		  emit(BRW_OPCODE_MOV, attr, fs_reg(1.0f));
+	       } else {
+		  struct brw_reg interp = interp_reg(location, k);
+		  emit(FS_OPCODE_LINTERP, attr,
+		       this->delta_x, this->delta_y, fs_reg(interp));
+	       }
 	       attr.reg_offset++;
 	    }
 
@@ -512,16 +527,16 @@ fs_visitor::emit_frontfacing_interpolation(ir_variable *ir)
 }
 
 fs_inst *
-fs_visitor::emit_math(fs_opcodes opcode, fs_reg dst, fs_reg src)
+fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src)
 {
    switch (opcode) {
-   case FS_OPCODE_RCP:
-   case FS_OPCODE_RSQ:
-   case FS_OPCODE_SQRT:
-   case FS_OPCODE_EXP2:
-   case FS_OPCODE_LOG2:
-   case FS_OPCODE_SIN:
-   case FS_OPCODE_COS:
+   case SHADER_OPCODE_RCP:
+   case SHADER_OPCODE_RSQ:
+   case SHADER_OPCODE_SQRT:
+   case SHADER_OPCODE_EXP2:
+   case SHADER_OPCODE_LOG2:
+   case SHADER_OPCODE_SIN:
+   case SHADER_OPCODE_COS:
       break;
    default:
       assert(!"not reached: bad math opcode");
@@ -555,12 +570,12 @@ fs_visitor::emit_math(fs_opcodes opcode, fs_reg dst, fs_reg src)
 }
 
 fs_inst *
-fs_visitor::emit_math(fs_opcodes opcode, fs_reg dst, fs_reg src0, fs_reg src1)
+fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
 {
    int base_mrf = 2;
    fs_inst *inst;
 
-   assert(opcode == FS_OPCODE_POW);
+   assert(opcode == SHADER_OPCODE_POW);
 
    if (intel->gen >= 6) {
       /* Can't do hstride == 0 args to gen6 math, so expand it out.
@@ -605,7 +620,7 @@ fs_visitor::setup_paramvalues_refs()
    /* Set up the pointers to ParamValues now that that array is finalized. */
    for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
       c->prog_data.param[i] =
-	 fp->Base.Parameters->ParameterValues[this->param_index[i]] +
+	 (const float *)fp->Base.Parameters->ParameterValues[this->param_index[i]] +
 	 this->param_offset[i];
    }
 }
@@ -621,12 +636,12 @@ fs_visitor::assign_curb_setup()
    }
 
    /* Map the offsets in the UNIFORM file to fixed HW regs. */
-   foreach_iter(exec_list_iterator, iter, this->instructions) {
-      fs_inst *inst = (fs_inst *)iter.get();
+   foreach_list(node, &this->instructions) {
+      fs_inst *inst = (fs_inst *)node;
 
       for (unsigned int i = 0; i < 3; i++) {
 	 if (inst->src[i].file == UNIFORM) {
-	    int constant_nr = inst->src[i].hw_reg + inst->src[i].reg_offset;
+	    int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
 	    struct brw_reg brw_reg = brw_vec1_grf(c->nr_payload_regs +
 						  constant_nr / 8,
 						  constant_nr % 8);
@@ -684,8 +699,8 @@ fs_visitor::assign_urb_setup()
    /* Offset all the urb_setup[] index by the actual position of the
     * setup regs, now that the location of the constants has been chosen.
     */
-   foreach_iter(exec_list_iterator, iter, this->instructions) {
-      fs_inst *inst = (fs_inst *)iter.get();
+   foreach_list(node, &this->instructions) {
+      fs_inst *inst = (fs_inst *)node;
 
       if (inst->opcode == FS_OPCODE_LINTERP) {
 	 assert(inst->src[2].file == FIXED_HW_REG);
@@ -739,8 +754,8 @@ fs_visitor::split_virtual_grfs()
       split_grf[this->delta_x.reg] = false;
    }
 
-   foreach_iter(exec_list_iterator, iter, this->instructions) {
-      fs_inst *inst = (fs_inst *)iter.get();
+   foreach_list(node, &this->instructions) {
+      fs_inst *inst = (fs_inst *)node;
 
       /* Texturing produces 4 contiguous registers, so no splitting. */
       if (inst->is_tex()) {
@@ -763,8 +778,8 @@ fs_visitor::split_virtual_grfs()
       }
    }
 
-   foreach_iter(exec_list_iterator, iter, this->instructions) {
-      fs_inst *inst = (fs_inst *)iter.get();
+   foreach_list(node, &this->instructions) {
+      fs_inst *inst = (fs_inst *)node;
 
       if (inst->dst.file == GRF &&
 	  split_grf[inst->dst.reg] &&
@@ -786,6 +801,86 @@ fs_visitor::split_virtual_grfs()
    this->live_intervals_valid = false;
 }
 
+bool
+fs_visitor::remove_dead_constants()
+{
+   if (c->dispatch_width == 8) {
+      this->params_remap = ralloc_array(mem_ctx, int, c->prog_data.nr_params);
+
+      for (unsigned int i = 0; i < c->prog_data.nr_params; i++)
+	 this->params_remap[i] = -1;
+
+      /* Find which params are still in use. */
+      foreach_list(node, &this->instructions) {
+	 fs_inst *inst = (fs_inst *)node;
+
+	 for (int i = 0; i < 3; i++) {
+	    int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
+
+	    if (inst->src[i].file != UNIFORM)
+	       continue;
+
+	    assert(constant_nr < (int)c->prog_data.nr_params);
+
+	    /* For now, set this to non-negative.  We'll give it the
+	     * actual new number in a moment, in order to keep the
+	     * register numbers nicely ordered.
+	     */
+	    this->params_remap[constant_nr] = 0;
+	 }
+      }
+
+      /* Figure out what the new numbers for the params will be.  At some
+       * point when we're doing uniform array access, we're going to want
+       * to keep the distinction between .reg and .reg_offset, but for
+       * now we don't care.
+       */
+      unsigned int new_nr_params = 0;
+      for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
+	 if (this->params_remap[i] != -1) {
+	    this->params_remap[i] = new_nr_params++;
+	 }
+      }
+
+      /* Update the list of params to be uploaded to match our new numbering. */
+      for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
+	 int remapped = this->params_remap[i];
+
+	 if (remapped == -1)
+	    continue;
+
+	 /* We've already done setup_paramvalues_refs() so no need to worry
+	  * about param_index and param_offset.
+	  */
+	 c->prog_data.param[remapped] = c->prog_data.param[i];
+	 c->prog_data.param_convert[remapped] = c->prog_data.param_convert[i];
+      }
+
+      c->prog_data.nr_params = new_nr_params;
+   } else {
+      /* This should have been generated in the 8-wide pass already. */
+      assert(this->params_remap);
+   }
+
+   /* Now do the renumbering of the shader to remove unused params. */
+   foreach_list(node, &this->instructions) {
+      fs_inst *inst = (fs_inst *)node;
+
+      for (int i = 0; i < 3; i++) {
+	 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
+
+	 if (inst->src[i].file != UNIFORM)
+	    continue;
+
+	 assert(this->params_remap[constant_nr] != -1);
+	 inst->src[i].reg = this->params_remap[constant_nr];
+	 inst->src[i].reg_offset = 0;
+      }
+   }
+
+   return true;
+}
+
 /**
  * Choose accesses from the UNIFORM file to demote to using the pull
  * constant buffer.
@@ -815,14 +910,14 @@ fs_visitor::setup_pull_constants()
    int pull_uniform_base = max_uniform_components;
    int pull_uniform_count = c->prog_data.nr_params - pull_uniform_base;
 
-   foreach_iter(exec_list_iterator, iter, this->instructions) {
-      fs_inst *inst = (fs_inst *)iter.get();
+   foreach_list(node, &this->instructions) {
+      fs_inst *inst = (fs_inst *)node;
 
       for (int i = 0; i < 3; i++) {
 	 if (inst->src[i].file != UNIFORM)
 	    continue;
 
-	 int uniform_nr = inst->src[i].hw_reg + inst->src[i].reg_offset;
+	 int uniform_nr = inst->src[i].reg + inst->src[i].reg_offset;
 	 if (uniform_nr < pull_uniform_base)
 	    continue;
 
@@ -871,8 +966,8 @@ fs_visitor::calculate_live_intervals()
    }
 
    int ip = 0;
-   foreach_iter(exec_list_iterator, iter, this->instructions) {
-      fs_inst *inst = (fs_inst *)iter.get();
+   foreach_list(node, &this->instructions) {
+      fs_inst *inst = (fs_inst *)node;
 
       if (inst->opcode == BRW_OPCODE_DO) {
 	 if (loop_depth++ == 0)
@@ -892,7 +987,7 @@ fs_visitor::calculate_live_intervals()
 	 }
       } else {
 	 for (unsigned int i = 0; i < 3; i++) {
-	    if (inst->src[i].file == GRF && inst->src[i].reg != 0) {
+	    if (inst->src[i].file == GRF) {
 	       int reg = inst->src[i].reg;
 
 	       if (!loop_depth) {
@@ -908,7 +1003,7 @@ fs_visitor::calculate_live_intervals()
 	       }
 	    }
 	 }
-	 if (inst->dst.file == GRF && inst->dst.reg != 0) {
+	 if (inst->dst.file == GRF) {
 	    int reg = inst->dst.reg;
 
 	    if (!loop_depth) {
@@ -945,8 +1040,8 @@ fs_visitor::propagate_constants()
 
    calculate_live_intervals();
 
-   foreach_iter(exec_list_iterator, iter, this->instructions) {
-      fs_inst *inst = (fs_inst *)iter.get();
+   foreach_list(node, &this->instructions) {
+      fs_inst *inst = (fs_inst *)node;
 
       if (inst->opcode != BRW_OPCODE_MOV ||
 	  inst->predicated ||
@@ -965,11 +1060,9 @@ fs_visitor::propagate_constants()
       /* Found a move of a constant to a GRF.  Find anything else using the GRF
        * before it's written, and replace it with the constant if we can.
        */
-      exec_list_iterator scan_iter = iter;
-      scan_iter.next();
-      for (; scan_iter.has_next(); scan_iter.next()) {
-	 fs_inst *scan_inst = (fs_inst *)scan_iter.get();
-
+      for (fs_inst *scan_inst = (fs_inst *)inst->next;
+	   !scan_inst->is_tail_sentinel();
+	   scan_inst = (fs_inst *)scan_inst->next) {
 	 if (scan_inst->opcode == BRW_OPCODE_DO ||
 	     scan_inst->opcode == BRW_OPCODE_WHILE ||
 	     scan_inst->opcode == BRW_OPCODE_ELSE ||
@@ -1046,6 +1139,24 @@ fs_visitor::propagate_constants()
 		  progress = true;
 	       }
 	       break;
+
+	    case SHADER_OPCODE_RCP:
+	       /* The hardware doesn't do math on immediate values
+		* (because why are you doing that, seriously?), but
+		* the correct answer is to just constant fold it
+		* anyway.
+		*/
+	       assert(i == 0);
+	       if (inst->src[0].imm.f != 0.0f) {
+		  scan_inst->opcode = BRW_OPCODE_MOV;
+		  scan_inst->src[0] = inst->src[0];
+		  scan_inst->src[0].imm.f = 1.0f / scan_inst->src[0].imm.f;
+		  progress = true;
+	       }
+	       break;
+
+	    default:
+	       break;
 	    }
 	 }
 
@@ -1063,6 +1174,49 @@ fs_visitor::propagate_constants()
 
    return progress;
 }
+
+
+/**
+ * Attempts to move immediate constants into the immediate
+ * constant slot of following instructions.
+ *
+ * Immediate constants are a bit tricky -- they have to be in the last
+ * operand slot, you can't do abs/negate on them,
+ */
+
+bool
+fs_visitor::opt_algebraic()
+{
+   bool progress = false;
+
+   calculate_live_intervals();
+
+   foreach_list(node, &this->instructions) {
+      fs_inst *inst = (fs_inst *)node;
+
+      switch (inst->opcode) {
+      case BRW_OPCODE_MUL:
+	 if (inst->src[1].file != IMM)
+	    continue;
+
+	 /* a * 1.0 = a */
+	 if (inst->src[1].type == BRW_REGISTER_TYPE_F &&
+	     inst->src[1].imm.f == 1.0) {
+	    inst->opcode = BRW_OPCODE_MOV;
+	    inst->src[1] = reg_undef;
+	    progress = true;
+	    break;
+	 }
+
+	 break;
+      default:
+	 break;
+      }
+   }
+
+   return progress;
+}
+
 /**
  * Must be called after calculate_live_intervales() to remove unused
  * writes to registers -- register allocation will fail otherwise
@@ -1077,8 +1231,8 @@ fs_visitor::dead_code_eliminate()
 
    calculate_live_intervals();
 
-   foreach_iter(exec_list_iterator, iter, this->instructions) {
-      fs_inst *inst = (fs_inst *)iter.get();
+   foreach_list_safe(node, &this->instructions) {
+      fs_inst *inst = (fs_inst *)node;
 
       if (inst->dst.file == GRF && this->virtual_grf_use[inst->dst.reg] <= pc) {
 	 inst->remove();
@@ -1101,8 +1255,8 @@ fs_visitor::register_coalesce()
    int if_depth = 0;
    int loop_depth = 0;
 
-   foreach_iter(exec_list_iterator, iter, this->instructions) {
-      fs_inst *inst = (fs_inst *)iter.get();
+   foreach_list_safe(node, &this->instructions) {
+      fs_inst *inst = (fs_inst *)node;
 
       /* Make sure that we dominate the instructions we're going to
        * scan for interfering with our coalescing, or we won't have
@@ -1123,6 +1277,8 @@ fs_visitor::register_coalesce()
       case BRW_OPCODE_ENDIF:
 	 if_depth--;
 	 break;
+      default:
+	 break;
       }
       if (loop_depth || if_depth)
 	 continue;
@@ -1130,7 +1286,8 @@ fs_visitor::register_coalesce()
       if (inst->opcode != BRW_OPCODE_MOV ||
 	  inst->predicated ||
 	  inst->saturate ||
-	  inst->dst.file != GRF || inst->src[0].file != GRF ||
+	  inst->dst.file != GRF || (inst->src[0].file != GRF &&
+				    inst->src[0].file != UNIFORM)||
 	  inst->dst.type != inst->src[0].type)
 	 continue;
 
@@ -1141,11 +1298,10 @@ fs_visitor::register_coalesce()
        * program.
        */
       bool interfered = false;
-      exec_list_iterator scan_iter = iter;
-      scan_iter.next();
-      for (; scan_iter.has_next(); scan_iter.next()) {
-	 fs_inst *scan_inst = (fs_inst *)scan_iter.get();
 
+      for (fs_inst *scan_inst = (fs_inst *)inst->next;
+	   !scan_inst->is_tail_sentinel();
+	   scan_inst = (fs_inst *)scan_inst->next) {
 	 if (scan_inst->dst.file == GRF) {
 	    if (scan_inst->dst.reg == inst->dst.reg &&
 		(scan_inst->dst.reg_offset == inst->dst.reg_offset ||
@@ -1153,7 +1309,8 @@ fs_visitor::register_coalesce()
 	       interfered = true;
 	       break;
 	    }
-	    if (scan_inst->dst.reg == inst->src[0].reg &&
+	    if (inst->src[0].file == GRF &&
+		scan_inst->dst.reg == inst->src[0].reg &&
 		(scan_inst->dst.reg_offset == inst->src[0].reg_offset ||
 		 scan_inst->is_tex())) {
 	       interfered = true;
@@ -1161,10 +1318,13 @@ fs_visitor::register_coalesce()
 	    }
 	 }
 
-	 /* The gen6 MATH instruction can't handle source modifiers, so avoid
-	  * coalescing those for now.  We should do something more specific.
+	 /* The gen6 MATH instruction can't handle source modifiers or
+	  * unusual register regions, so avoid coalescing those for
+	  * now.  We should do something more specific.
 	  */
-	 if (intel->gen >= 6 && scan_inst->is_math() && has_source_modifiers) {
+	 if (intel->gen >= 6 &&
+	     scan_inst->is_math() &&
+	     (has_source_modifiers || inst->src[0].file == UNIFORM)) {
 	    interfered = true;
 	    break;
 	 }
@@ -1176,19 +1336,17 @@ fs_visitor::register_coalesce()
       /* Rewrite the later usage to point at the source of the move to
        * be removed.
        */
-      for (exec_list_iterator scan_iter = iter; scan_iter.has_next();
-	   scan_iter.next()) {
-	 fs_inst *scan_inst = (fs_inst *)scan_iter.get();
-
+      for (fs_inst *scan_inst = inst;
+	   !scan_inst->is_tail_sentinel();
+	   scan_inst = (fs_inst *)scan_inst->next) {
 	 for (int i = 0; i < 3; i++) {
 	    if (scan_inst->src[i].file == GRF &&
 		scan_inst->src[i].reg == inst->dst.reg &&
 		scan_inst->src[i].reg_offset == inst->dst.reg_offset) {
-	       scan_inst->src[i].reg = inst->src[0].reg;
-	       scan_inst->src[i].reg_offset = inst->src[0].reg_offset;
-	       scan_inst->src[i].abs |= inst->src[0].abs;
-	       scan_inst->src[i].negate ^= inst->src[0].negate;
-	       scan_inst->src[i].smear = inst->src[0].smear;
+	       fs_reg new_src = inst->src[0];
+	       new_src.negate ^= scan_inst->src[i].negate;
+	       new_src.abs |= scan_inst->src[i].abs;
+	       scan_inst->src[i] = new_src;
 	    }
 	 }
       }
@@ -1212,8 +1370,8 @@ fs_visitor::compute_to_mrf()
 
    calculate_live_intervals();
 
-   foreach_iter(exec_list_iterator, iter, this->instructions) {
-      fs_inst *inst = (fs_inst *)iter.get();
+   foreach_list_safe(node, &this->instructions) {
+      fs_inst *inst = (fs_inst *)node;
 
       int ip = next_ip;
       next_ip++;
@@ -1228,9 +1386,9 @@ fs_visitor::compute_to_mrf()
       /* Work out which hardware MRF registers are written by this
        * instruction.
        */
-      int mrf_low = inst->dst.hw_reg & ~BRW_MRF_COMPR4;
+      int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4;
       int mrf_high;
-      if (inst->dst.hw_reg & BRW_MRF_COMPR4) {
+      if (inst->dst.reg & BRW_MRF_COMPR4) {
 	 mrf_high = mrf_low + 4;
       } else if (c->dispatch_width == 16 &&
 		 (!inst->force_uncompressed && !inst->force_sechalf)) {
@@ -1297,7 +1455,7 @@ fs_visitor::compute_to_mrf()
 	    if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
 	       /* Found the creator of our MRF's source value. */
 	       scan_inst->dst.file = MRF;
-	       scan_inst->dst.hw_reg = inst->dst.hw_reg;
+	       scan_inst->dst.reg = inst->dst.reg;
 	       scan_inst->saturate |= inst->saturate;
 	       inst->remove();
 	       progress = true;
@@ -1334,10 +1492,10 @@ fs_visitor::compute_to_mrf()
 	    /* If somebody else writes our MRF here, we can't
 	     * compute-to-MRF before that.
 	     */
-	    int scan_mrf_low = scan_inst->dst.hw_reg & ~BRW_MRF_COMPR4;
+	    int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4;
 	    int scan_mrf_high;
 
-	    if (scan_inst->dst.hw_reg & BRW_MRF_COMPR4) {
+	    if (scan_inst->dst.reg & BRW_MRF_COMPR4) {
 	       scan_mrf_high = scan_mrf_low + 4;
 	    } else if (c->dispatch_width == 16 &&
 		       (!scan_inst->force_uncompressed &&
@@ -1392,8 +1550,8 @@ fs_visitor::remove_duplicate_mrf_writes()
 
    memset(last_mrf_move, 0, sizeof(last_mrf_move));
 
-   foreach_iter(exec_list_iterator, iter, this->instructions) {
-      fs_inst *inst = (fs_inst *)iter.get();
+   foreach_list_safe(node, &this->instructions) {
+      fs_inst *inst = (fs_inst *)node;
 
       switch (inst->opcode) {
       case BRW_OPCODE_DO:
@@ -1409,7 +1567,7 @@ fs_visitor::remove_duplicate_mrf_writes()
 
       if (inst->opcode == BRW_OPCODE_MOV &&
 	  inst->dst.file == MRF) {
-	 fs_inst *prev_inst = last_mrf_move[inst->dst.hw_reg];
+	 fs_inst *prev_inst = last_mrf_move[inst->dst.reg];
 	 if (prev_inst && inst->equals(prev_inst)) {
 	    inst->remove();
 	    progress = true;
@@ -1419,7 +1577,7 @@ fs_visitor::remove_duplicate_mrf_writes()
 
       /* Clear out the last-write records for MRFs that were overwritten. */
       if (inst->dst.file == MRF) {
-	 last_mrf_move[inst->dst.hw_reg] = NULL;
+	 last_mrf_move[inst->dst.reg] = NULL;
       }
 
       if (inst->mlen > 0) {
@@ -1445,7 +1603,7 @@ fs_visitor::remove_duplicate_mrf_writes()
 	  inst->dst.file == MRF &&
 	  inst->src[0].file == GRF &&
 	  !inst->predicated) {
-	 last_mrf_move[inst->dst.hw_reg] = inst;
+	 last_mrf_move[inst->dst.reg] = inst;
       }
    }
 
@@ -1527,8 +1685,8 @@ fs_visitor::run()
       /* Generate FS IR for main().  (the visitor only descends into
        * functions called "main").
        */
-      foreach_iter(exec_list_iterator, iter, *shader->ir) {
-	 ir_instruction *ir = (ir_instruction *)iter.get();
+      foreach_list(node, &*shader->ir) {
+	 ir_instruction *ir = (ir_instruction *)node;
 	 base_ir = ir;
 	 this->result = reg_undef;
 	 ir->accept(this);
@@ -1550,11 +1708,14 @@ fs_visitor::run()
 	 progress = remove_duplicate_mrf_writes() || progress;
 
 	 progress = propagate_constants() || progress;
+	 progress = opt_algebraic() || progress;
 	 progress = register_coalesce() || progress;
 	 progress = compute_to_mrf() || progress;
 	 progress = dead_code_eliminate() || progress;
       } while (progress);
 
+      remove_dead_constants();
+
       schedule_instructions();
 
       assign_curb_setup();
@@ -1563,7 +1724,7 @@ fs_visitor::run()
       if (0) {
 	 /* Debug of register spilling: Go spill everything. */
 	 int virtual_grf_count = virtual_grf_next;
-	 for (int i = 1; i < virtual_grf_count; i++) {
+	 for (int i = 0; i < virtual_grf_count; i++) {
 	    spill_reg(i);
 	 }
       }
@@ -1625,7 +1786,7 @@ brw_wm_fs_emit(struct brw_context *brw, struct brw_wm_compile *c,
    fs_visitor v(c, prog, shader);
    if (!v.run()) {
       prog->LinkStatus = GL_FALSE;
-      prog->InfoLog = ralloc_strdup(prog, v.fail_msg);
+      ralloc_strcat(&prog->InfoLog, v.fail_msg);
 
       return false;
    }
@@ -1633,7 +1794,7 @@ brw_wm_fs_emit(struct brw_context *brw, struct brw_wm_compile *c,
    if (intel->gen >= 5 && c->prog_data.nr_pull_params == 0) {
       c->dispatch_width = 16;
       fs_visitor v2(c, prog, shader);
-      v2.import_uniforms(v.variable_ht);
+      v2.import_uniforms(&v);
       v2.run();
    }
 
diff --git a/src/mesa/drivers/dri/i965/brw_fs.h b/src/mesa/drivers/dri/i965/brw_fs.h
index 2bf850e5dea..10f45f30fe9 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.h
+++ b/src/mesa/drivers/dri/i965/brw_fs.h
@@ -25,6 +25,8 @@
  *
  */
 
+#include "brw_shader.h"
+
 extern "C" {
 
 #include <sys/types.h>
@@ -51,37 +53,10 @@ enum register_file {
    MRF = BRW_MESSAGE_REGISTER_FILE,
    IMM = BRW_IMMEDIATE_VALUE,
    FIXED_HW_REG, /* a struct brw_reg */
-   UNIFORM, /* prog_data->params[hw_reg] */
+   UNIFORM, /* prog_data->params[reg] */
    BAD_FILE
 };
 
-enum fs_opcodes {
-   FS_OPCODE_FB_WRITE = 256,
-   FS_OPCODE_RCP,
-   FS_OPCODE_RSQ,
-   FS_OPCODE_SQRT,
-   FS_OPCODE_EXP2,
-   FS_OPCODE_LOG2,
-   FS_OPCODE_POW,
-   FS_OPCODE_SIN,
-   FS_OPCODE_COS,
-   FS_OPCODE_DDX,
-   FS_OPCODE_DDY,
-   FS_OPCODE_PIXEL_X,
-   FS_OPCODE_PIXEL_Y,
-   FS_OPCODE_CINTERP,
-   FS_OPCODE_LINTERP,
-   FS_OPCODE_TEX,
-   FS_OPCODE_TXB,
-   FS_OPCODE_TXD,
-   FS_OPCODE_TXL,
-   FS_OPCODE_DISCARD,
-   FS_OPCODE_SPILL,
-   FS_OPCODE_UNSPILL,
-   FS_OPCODE_PULL_CONSTANT_LOAD,
-};
-
-
 class fs_reg {
 public:
    /* Callers of this ralloc-based new need not call delete. It's
@@ -99,7 +74,6 @@ public:
    void init()
    {
       memset(this, 0, sizeof(*this));
-      this->hw_reg = -1;
       this->smear = -1;
    }
 
@@ -146,8 +120,8 @@ public:
       this->type = fixed_hw_reg.type;
    }
 
-   fs_reg(enum register_file file, int hw_reg);
-   fs_reg(enum register_file file, int hw_reg, uint32_t type);
+   fs_reg(enum register_file file, int reg);
+   fs_reg(enum register_file file, int reg, uint32_t type);
    fs_reg(class fs_visitor *v, const struct glsl_type *type);
 
    bool equals(fs_reg *r)
@@ -155,7 +129,6 @@ public:
       return (file == r->file &&
 	      reg == r->reg &&
 	      reg_offset == r->reg_offset &&
-	      hw_reg == r->hw_reg &&
 	      type == r->type &&
 	      negate == r->negate &&
 	      abs == r->abs &&
@@ -167,12 +140,17 @@ public:
 
    /** Register file: ARF, GRF, MRF, IMM. */
    enum register_file file;
-   /** virtual register number.  0 = fixed hw reg */
+   /**
+    * Register number.  For ARF/MRF, it's the hardware register.  For
+    * GRF, it's a virtual register number until register allocation
+    */
    int reg;
-   /** Offset within the virtual register. */
+   /**
+    * For virtual registers, this is a hardware register offset from
+    * the start of the register block (for example, a constant index
+    * in an array access).
+    */
    int reg_offset;
-   /** HW register number.  Generally unset until register allocation. */
-   int hw_reg;
    /** Register type.  BRW_REGISTER_TYPE_* */
    int type;
    bool negate;
@@ -224,13 +202,13 @@ public:
       init();
    }
 
-   fs_inst(int opcode)
+   fs_inst(enum opcode opcode)
    {
       init();
       this->opcode = opcode;
    }
 
-   fs_inst(int opcode, fs_reg dst)
+   fs_inst(enum opcode opcode, fs_reg dst)
    {
       init();
       this->opcode = opcode;
@@ -240,7 +218,7 @@ public:
 	 assert(dst.reg_offset >= 0);
    }
 
-   fs_inst(int opcode, fs_reg dst, fs_reg src0)
+   fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0)
    {
       init();
       this->opcode = opcode;
@@ -253,7 +231,7 @@ public:
 	 assert(src[0].reg_offset >= 0);
    }
 
-   fs_inst(int opcode, fs_reg dst, fs_reg src0, fs_reg src1)
+   fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
    {
       init();
       this->opcode = opcode;
@@ -269,7 +247,7 @@ public:
 	 assert(src[1].reg_offset >= 0);
    }
 
-   fs_inst(int opcode, fs_reg dst, fs_reg src0, fs_reg src1, fs_reg src2)
+   fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1, fs_reg src2)
    {
       init();
       this->opcode = opcode;
@@ -313,22 +291,23 @@ public:
       return (opcode == FS_OPCODE_TEX ||
 	      opcode == FS_OPCODE_TXB ||
 	      opcode == FS_OPCODE_TXD ||
-	      opcode == FS_OPCODE_TXL);
+	      opcode == FS_OPCODE_TXL ||
+	      opcode == FS_OPCODE_TXS);
    }
 
    bool is_math()
    {
-      return (opcode == FS_OPCODE_RCP ||
-	      opcode == FS_OPCODE_RSQ ||
-	      opcode == FS_OPCODE_SQRT ||
-	      opcode == FS_OPCODE_EXP2 ||
-	      opcode == FS_OPCODE_LOG2 ||
-	      opcode == FS_OPCODE_SIN ||
-	      opcode == FS_OPCODE_COS ||
-	      opcode == FS_OPCODE_POW);
+      return (opcode == SHADER_OPCODE_RCP ||
+	      opcode == SHADER_OPCODE_RSQ ||
+	      opcode == SHADER_OPCODE_SQRT ||
+	      opcode == SHADER_OPCODE_EXP2 ||
+	      opcode == SHADER_OPCODE_LOG2 ||
+	      opcode == SHADER_OPCODE_SIN ||
+	      opcode == SHADER_OPCODE_COS ||
+	      opcode == SHADER_OPCODE_POW);
    }
 
-   int opcode; /* BRW_OPCODE_* or FS_OPCODE_* */
+   enum opcode opcode; /* BRW_OPCODE_* or FS_OPCODE_* */
    fs_reg dst;
    fs_reg src[3];
    bool saturate;
@@ -402,7 +381,7 @@ public:
       this->base_ir = NULL;
 
       this->virtual_grf_sizes = NULL;
-      this->virtual_grf_next = 1;
+      this->virtual_grf_next = 0;
       this->virtual_grf_array_size = 0;
       this->virtual_grf_def = NULL;
       this->virtual_grf_use = NULL;
@@ -421,7 +400,7 @@ public:
 
    fs_reg *variable_storage(ir_variable *var);
    int virtual_grf_alloc(int size);
-   void import_uniforms(struct hash_table *src_variable_ht);
+   void import_uniforms(fs_visitor *v);
 
    void visit(ir_variable *ir);
    void visit(ir_assignment *ir);
@@ -445,27 +424,28 @@ public:
 
    fs_inst *emit(fs_inst inst);
 
-   fs_inst *emit(int opcode)
+   fs_inst *emit(enum opcode opcode)
    {
       return emit(fs_inst(opcode));
    }
 
-   fs_inst *emit(int opcode, fs_reg dst)
+   fs_inst *emit(enum opcode opcode, fs_reg dst)
    {
       return emit(fs_inst(opcode, dst));
    }
 
-   fs_inst *emit(int opcode, fs_reg dst, fs_reg src0)
+   fs_inst *emit(enum opcode opcode, fs_reg dst, fs_reg src0)
    {
       return emit(fs_inst(opcode, dst, src0));
    }
 
-   fs_inst *emit(int opcode, fs_reg dst, fs_reg src0, fs_reg src1)
+   fs_inst *emit(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
    {
       return emit(fs_inst(opcode, dst, src0, src1));
    }
 
-   fs_inst *emit(int opcode, fs_reg dst, fs_reg src0, fs_reg src1, fs_reg src2)
+   fs_inst *emit(enum opcode opcode, fs_reg dst,
+		 fs_reg src0, fs_reg src1, fs_reg src2)
    {
       return emit(fs_inst(opcode, dst, src0, src1, src2));
    }
@@ -485,9 +465,11 @@ public:
    void setup_pull_constants();
    void calculate_live_intervals();
    bool propagate_constants();
+   bool opt_algebraic();
    bool register_coalesce();
    bool compute_to_mrf();
    bool dead_code_eliminate();
+   bool remove_dead_constants();
    bool remove_duplicate_mrf_writes();
    bool virtual_grf_interferes(int a, int b);
    void schedule_instructions();
@@ -524,8 +506,8 @@ public:
 			      int sampler);
    fs_inst *emit_texture_gen7(ir_texture *ir, fs_reg dst, fs_reg coordinate,
 			      int sampler);
-   fs_inst *emit_math(fs_opcodes op, fs_reg dst, fs_reg src0);
-   fs_inst *emit_math(fs_opcodes op, fs_reg dst, fs_reg src0, fs_reg src1);
+   fs_inst *emit_math(enum opcode op, fs_reg dst, fs_reg src0);
+   fs_inst *emit_math(enum opcode op, fs_reg dst, fs_reg src0, fs_reg src1);
    bool try_emit_saturate(ir_expression *ir);
    void emit_bool_to_cond_code(ir_rvalue *condition);
    void emit_if_gen6(ir_if *ir);
@@ -565,6 +547,13 @@ public:
    int *virtual_grf_use;
    bool live_intervals_valid;
 
+   /* This is the map from UNIFORM hw_reg + reg_offset as generated by
+    * the visitor to the packed uniform number after
+    * remove_dead_constants() that represents the actual uploaded
+    * uniform index.
+    */
+   int *params_remap;
+
    struct hash_table *variable_ht;
    ir_variable *frag_color, *frag_data, *frag_depth;
    int first_non_payload_grf;
diff --git a/src/mesa/drivers/dri/i965/brw_fs_emit.cpp b/src/mesa/drivers/dri/i965/brw_fs_emit.cpp
index 1d89b8f1d11..28efbd3605f 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_emit.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_emit.cpp
@@ -59,7 +59,8 @@ fs_visitor::generate_fb_write(fs_inst *inst)
 
 	 if (inst->target > 0) {
 	    /* Set the render target index for choosing BLEND_STATE. */
-	    brw_MOV(p, retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE, 0, 2),
+	    brw_MOV(p, retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
+					   inst->base_mrf, 2),
 			      BRW_REGISTER_TYPE_UD),
 		    brw_imm_ud(inst->target));
 	 }
@@ -145,43 +146,12 @@ void
 fs_visitor::generate_math(fs_inst *inst,
 			  struct brw_reg dst, struct brw_reg *src)
 {
-   int op;
-
-   switch (inst->opcode) {
-   case FS_OPCODE_RCP:
-      op = BRW_MATH_FUNCTION_INV;
-      break;
-   case FS_OPCODE_RSQ:
-      op = BRW_MATH_FUNCTION_RSQ;
-      break;
-   case FS_OPCODE_SQRT:
-      op = BRW_MATH_FUNCTION_SQRT;
-      break;
-   case FS_OPCODE_EXP2:
-      op = BRW_MATH_FUNCTION_EXP;
-      break;
-   case FS_OPCODE_LOG2:
-      op = BRW_MATH_FUNCTION_LOG;
-      break;
-   case FS_OPCODE_POW:
-      op = BRW_MATH_FUNCTION_POW;
-      break;
-   case FS_OPCODE_SIN:
-      op = BRW_MATH_FUNCTION_SIN;
-      break;
-   case FS_OPCODE_COS:
-      op = BRW_MATH_FUNCTION_COS;
-      break;
-   default:
-      assert(!"not reached: unknown math function");
-      op = 0;
-      break;
-   }
+   int op = brw_math_function(inst->opcode);
 
    if (intel->gen >= 6) {
       assert(inst->mlen == 0);
 
-      if (inst->opcode == FS_OPCODE_POW) {
+      if (inst->opcode == SHADER_OPCODE_POW) {
 	 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
 	 brw_math2(p, dst, op, src[0], src[1]);
 
@@ -272,10 +242,16 @@ fs_visitor::generate_tex(fs_inst *inst, struct brw_reg dst, struct brw_reg src)
 	    msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LOD;
 	 }
 	 break;
+      case FS_OPCODE_TXS:
+	 msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_RESINFO;
+	 break;
       case FS_OPCODE_TXD:
 	 /* There is no sample_d_c message; comparisons are done manually */
 	 msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_DERIVS;
 	 break;
+      default:
+	 assert(!"not reached");
+	 break;
       }
    } else {
       switch (inst->opcode) {
@@ -316,6 +292,14 @@ fs_visitor::generate_tex(fs_inst *inst, struct brw_reg dst, struct brw_reg src)
 	 assert(inst->mlen == 7 || inst->mlen == 10);
 	 msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_GRADIENTS;
 	 break;
+      case FS_OPCODE_TXS:
+	 assert(inst->mlen == 3);
+	 msg_type = BRW_SAMPLER_MESSAGE_SIMD16_RESINFO;
+	 simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
+	 break;
+      default:
+	 assert(!"not reached");
+	 break;
       }
    }
    assert(msg_type != -1);
@@ -537,11 +521,9 @@ brw_reg_from_fs_reg(fs_reg *reg)
    case ARF:
    case MRF:
       if (reg->smear == -1) {
-	 brw_reg = brw_vec8_reg(reg->file,
-				reg->hw_reg, 0);
+	 brw_reg = brw_vec8_reg(reg->file, reg->reg, 0);
       } else {
-	 brw_reg = brw_vec1_reg(reg->file,
-				reg->hw_reg, reg->smear);
+	 brw_reg = brw_vec1_reg(reg->file, reg->reg, reg->smear);
       }
       brw_reg = retype(brw_reg, reg->type);
       if (reg->sechalf)
@@ -608,8 +590,8 @@ fs_visitor::generate_code()
 	     prog->Name, c->dispatch_width);
    }
 
-   foreach_iter(exec_list_iterator, iter, this->instructions) {
-      fs_inst *inst = (fs_inst *)iter.get();
+   foreach_list(node, &this->instructions) {
+      fs_inst *inst = (fs_inst *)node;
       struct brw_reg src[3], dst;
 
       if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
@@ -656,6 +638,11 @@ fs_visitor::generate_code()
       case BRW_OPCODE_MUL:
 	 brw_MUL(p, dst, src[0], src[1]);
 	 break;
+      case BRW_OPCODE_MACH:
+	 brw_set_acc_write_control(p, 1);
+	 brw_MACH(p, dst, src[0], src[1]);
+	 brw_set_acc_write_control(p, 0);
+	 break;
 
       case BRW_OPCODE_FRC:
 	 brw_FRC(p, dst, src[0]);
@@ -770,14 +757,14 @@ fs_visitor::generate_code()
       }
 	 break;
 
-      case FS_OPCODE_RCP:
-      case FS_OPCODE_RSQ:
-      case FS_OPCODE_SQRT:
-      case FS_OPCODE_EXP2:
-      case FS_OPCODE_LOG2:
-      case FS_OPCODE_POW:
-      case FS_OPCODE_SIN:
-      case FS_OPCODE_COS:
+      case SHADER_OPCODE_RCP:
+      case SHADER_OPCODE_RSQ:
+      case SHADER_OPCODE_SQRT:
+      case SHADER_OPCODE_EXP2:
+      case SHADER_OPCODE_LOG2:
+      case SHADER_OPCODE_POW:
+      case SHADER_OPCODE_SIN:
+      case SHADER_OPCODE_COS:
 	 generate_math(inst, dst, src);
 	 break;
       case FS_OPCODE_PIXEL_X:
@@ -796,6 +783,7 @@ fs_visitor::generate_code()
       case FS_OPCODE_TXB:
       case FS_OPCODE_TXD:
       case FS_OPCODE_TXL:
+      case FS_OPCODE_TXS:
 	 generate_tex(inst, dst, src[0]);
 	 break;
       case FS_OPCODE_DISCARD:
diff --git a/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp b/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp
index b4689d2c293..7c5414ac26c 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp
@@ -25,23 +25,6 @@
  *
  */
 
-extern "C" {
-
-#include <sys/types.h>
-
-#include "main/macros.h"
-#include "main/shaderobj.h"
-#include "main/uniforms.h"
-#include "program/prog_parameter.h"
-#include "program/prog_print.h"
-#include "program/prog_optimize.h"
-#include "program/register_allocate.h"
-#include "program/sampler.h"
-#include "program/hash_table.h"
-#include "brw_context.h"
-#include "brw_eu.h"
-#include "brw_wm.h"
-}
 #include "brw_fs.h"
 #include "../glsl/glsl_types.h"
 #include "../glsl/ir_optimization.h"
@@ -50,45 +33,115 @@ extern "C" {
 static void
 assign_reg(int *reg_hw_locations, fs_reg *reg, int reg_width)
 {
-   if (reg->file == GRF && reg->reg != 0) {
+   if (reg->file == GRF) {
       assert(reg->reg_offset >= 0);
-      reg->hw_reg = reg_hw_locations[reg->reg] + reg->reg_offset * reg_width;
-      reg->reg = 0;
+      reg->reg = reg_hw_locations[reg->reg] + reg->reg_offset * reg_width;
+      reg->reg_offset = 0;
    }
 }
 
 void
 fs_visitor::assign_regs_trivial()
 {
-   int last_grf = 0;
-   int hw_reg_mapping[this->virtual_grf_next];
+   int hw_reg_mapping[this->virtual_grf_next + 1];
    int i;
    int reg_width = c->dispatch_width / 8;
 
-   hw_reg_mapping[0] = 0;
    /* Note that compressed instructions require alignment to 2 registers. */
-   hw_reg_mapping[1] = ALIGN(this->first_non_payload_grf, reg_width);
-   for (i = 2; i < this->virtual_grf_next; i++) {
+   hw_reg_mapping[0] = ALIGN(this->first_non_payload_grf, reg_width);
+   for (i = 1; i <= this->virtual_grf_next; i++) {
       hw_reg_mapping[i] = (hw_reg_mapping[i - 1] +
 			   this->virtual_grf_sizes[i - 1] * reg_width);
    }
-   last_grf = hw_reg_mapping[i - 1] + (this->virtual_grf_sizes[i - 1] *
-				       reg_width);
+   this->grf_used = hw_reg_mapping[this->virtual_grf_next];
 
-   foreach_iter(exec_list_iterator, iter, this->instructions) {
-      fs_inst *inst = (fs_inst *)iter.get();
+   foreach_list(node, &this->instructions) {
+      fs_inst *inst = (fs_inst *)node;
 
       assign_reg(hw_reg_mapping, &inst->dst, reg_width);
       assign_reg(hw_reg_mapping, &inst->src[0], reg_width);
       assign_reg(hw_reg_mapping, &inst->src[1], reg_width);
    }
 
-   if (last_grf >= BRW_MAX_GRF) {
+   if (this->grf_used >= BRW_MAX_GRF) {
       fail("Ran out of regs on trivial allocator (%d/%d)\n",
-	   last_grf, BRW_MAX_GRF);
+	   this->grf_used, BRW_MAX_GRF);
+   }
+
+}
+
+static void
+brw_alloc_reg_set_for_classes(struct brw_context *brw,
+			      int *class_sizes,
+			      int class_count,
+			      int reg_width,
+			      int base_reg_count)
+{
+   struct intel_context *intel = &brw->intel;
+
+   /* Compute the total number of registers across all classes. */
+   int ra_reg_count = 0;
+   for (int i = 0; i < class_count; i++) {
+      ra_reg_count += base_reg_count - (class_sizes[i] - 1);
+   }
+
+   ralloc_free(brw->wm.ra_reg_to_grf);
+   brw->wm.ra_reg_to_grf = ralloc_array(brw, uint8_t, ra_reg_count);
+   ralloc_free(brw->wm.regs);
+   brw->wm.regs = ra_alloc_reg_set(ra_reg_count);
+   ralloc_free(brw->wm.classes);
+   brw->wm.classes = ralloc_array(brw, int, class_count + 1);
+
+   brw->wm.aligned_pairs_class = -1;
+
+   /* Now, add the registers to their classes, and add the conflicts
+    * between them and the base GRF registers (and also each other).
+    */
+   int reg = 0;
+   int pairs_base_reg = 0;
+   int pairs_reg_count = 0;
+   for (int i = 0; i < class_count; i++) {
+      int class_reg_count = base_reg_count - (class_sizes[i] - 1);
+      brw->wm.classes[i] = ra_alloc_reg_class(brw->wm.regs);
+
+      /* Save this off for the aligned pair class at the end. */
+      if (class_sizes[i] == 2) {
+	 pairs_base_reg = reg;
+	 pairs_reg_count = class_reg_count;
+      }
+
+      for (int j = 0; j < class_reg_count; j++) {
+	 ra_class_add_reg(brw->wm.regs, brw->wm.classes[i], reg);
+
+	 brw->wm.ra_reg_to_grf[reg] = j;
+
+	 for (int base_reg = j;
+	      base_reg < j + class_sizes[i];
+	      base_reg++) {
+	    ra_add_transitive_reg_conflict(brw->wm.regs, base_reg, reg);
+	 }
+
+	 reg++;
+      }
+   }
+   assert(reg == ra_reg_count);
+
+   /* Add a special class for aligned pairs, which we'll put delta_x/y
+    * in on gen5 so that we can do PLN.
+    */
+   if (brw->has_pln && reg_width == 1 && intel->gen < 6) {
+      brw->wm.aligned_pairs_class = ra_alloc_reg_class(brw->wm.regs);
+
+      for (int i = 0; i < pairs_reg_count; i++) {
+	 if ((brw->wm.ra_reg_to_grf[pairs_base_reg + i] & 1) == 0) {
+	    ra_class_add_reg(brw->wm.regs, brw->wm.aligned_pairs_class,
+			     pairs_base_reg + i);
+	 }
+      }
+      class_count++;
    }
 
-   this->grf_used = last_grf + reg_width;
+   ra_set_finalize(brw->wm.regs);
 }
 
 bool
@@ -101,12 +154,11 @@ fs_visitor::assign_regs()
     * for reg_width == 2.
     */
    int reg_width = c->dispatch_width / 8;
-   int hw_reg_mapping[this->virtual_grf_next + 1];
+   int hw_reg_mapping[this->virtual_grf_next];
    int first_assigned_grf = ALIGN(this->first_non_payload_grf, reg_width);
    int base_reg_count = (BRW_MAX_GRF - first_assigned_grf) / reg_width;
    int class_sizes[base_reg_count];
    int class_count = 0;
-   int aligned_pair_class = -1;
 
    calculate_live_intervals();
 
@@ -125,7 +177,7 @@ fs_visitor::assign_regs()
        */
       class_sizes[class_count++] = 2;
    }
-   for (int r = 1; r < this->virtual_grf_next; r++) {
+   for (int r = 0; r < this->virtual_grf_next; r++) {
       int i;
 
       for (i = 0; i < class_count; i++) {
@@ -141,94 +193,26 @@ fs_visitor::assign_regs()
       }
    }
 
-   int ra_reg_count = 0;
-   int class_base_reg[class_count];
-   int class_reg_count[class_count];
-   int classes[class_count + 1];
-
-   for (int i = 0; i < class_count; i++) {
-      class_base_reg[i] = ra_reg_count;
-      class_reg_count[i] = base_reg_count - (class_sizes[i] - 1);
-      ra_reg_count += class_reg_count[i];
-   }
-
-   struct ra_regs *regs = ra_alloc_reg_set(ra_reg_count);
-   for (int i = 0; i < class_count; i++) {
-      classes[i] = ra_alloc_reg_class(regs);
-
-      for (int i_r = 0; i_r < class_reg_count[i]; i_r++) {
-	 ra_class_add_reg(regs, classes[i], class_base_reg[i] + i_r);
-      }
+   brw_alloc_reg_set_for_classes(brw, class_sizes, class_count,
+				 reg_width, base_reg_count);
 
-      /* Add conflicts between our contiguous registers aliasing
-       * base regs and other register classes' contiguous registers
-       * that alias base regs, or the base regs themselves for classes[0].
-       */
-      for (int c = 0; c <= i; c++) {
-	 for (int i_r = 0; i_r < class_reg_count[i]; i_r++) {
-	    for (int c_r = MAX2(0, i_r - (class_sizes[c] - 1));
-		 c_r < MIN2(class_reg_count[c], i_r + class_sizes[i]);
-		 c_r++) {
-
-	       if (0) {
-		  printf("%d/%d conflicts %d/%d\n",
-			 class_sizes[i], first_assigned_grf + i_r,
-			 class_sizes[c], first_assigned_grf + c_r);
-	       }
-
-	       ra_add_reg_conflict(regs,
-				   class_base_reg[i] + i_r,
-				   class_base_reg[c] + c_r);
-	    }
-	 }
-      }
-   }
-
-   /* Add a special class for aligned pairs, which we'll put delta_x/y
-    * in on gen5 so that we can do PLN.
-    */
-   if (brw->has_pln && reg_width == 1 && intel->gen < 6) {
-      int reg_count = (base_reg_count - 1) / 2;
-      int unaligned_pair_class = 1;
-      assert(class_sizes[unaligned_pair_class] == 2);
-
-      aligned_pair_class = class_count;
-      classes[aligned_pair_class] = ra_alloc_reg_class(regs);
-      class_sizes[aligned_pair_class] = 2;
-      class_base_reg[aligned_pair_class] = 0;
-      class_reg_count[aligned_pair_class] = 0;
-      int start = (first_assigned_grf & 1) ? 1 : 0;
-
-      for (int i = 0; i < reg_count; i++) {
-	 ra_class_add_reg(regs, classes[aligned_pair_class],
-			  class_base_reg[unaligned_pair_class] + i * 2 + start);
-      }
-      class_count++;
-   }
-
-   ra_set_finalize(regs);
-
-   struct ra_graph *g = ra_alloc_interference_graph(regs,
+   struct ra_graph *g = ra_alloc_interference_graph(brw->wm.regs,
 						    this->virtual_grf_next);
-   /* Node 0 is just a placeholder to keep virtual_grf[] mapping 1:1
-    * with nodes.
-    */
-   ra_set_node_class(g, 0, classes[0]);
 
-   for (int i = 1; i < this->virtual_grf_next; i++) {
+   for (int i = 0; i < this->virtual_grf_next; i++) {
       for (int c = 0; c < class_count; c++) {
 	 if (class_sizes[c] == this->virtual_grf_sizes[i]) {
-	    if (aligned_pair_class >= 0 &&
+	    if (brw->wm.aligned_pairs_class >= 0 &&
 		this->delta_x.reg == i) {
-	       ra_set_node_class(g, i, classes[aligned_pair_class]);
+	       ra_set_node_class(g, i, brw->wm.aligned_pairs_class);
 	    } else {
-	       ra_set_node_class(g, i, classes[c]);
+	       ra_set_node_class(g, i, brw->wm.classes[c]);
 	    }
 	    break;
 	 }
       }
 
-      for (int j = 1; j < i; j++) {
+      for (int j = 0; j < i; j++) {
 	 if (virtual_grf_interferes(i, j)) {
 	    ra_add_node_interference(g, i, j);
 	 }
@@ -253,7 +237,6 @@ fs_visitor::assign_regs()
 
 
       ralloc_free(g);
-      ralloc_free(regs);
 
       return false;
    }
@@ -263,28 +246,18 @@ fs_visitor::assign_regs()
     * numbers.
     */
    this->grf_used = first_assigned_grf;
-   hw_reg_mapping[0] = 0; /* unused */
-   for (int i = 1; i < this->virtual_grf_next; i++) {
+   for (int i = 0; i < this->virtual_grf_next; i++) {
       int reg = ra_get_node_reg(g, i);
-      int hw_reg = -1;
-
-      for (int c = 0; c < class_count; c++) {
-	 if (reg >= class_base_reg[c] &&
-	     reg < class_base_reg[c] + class_reg_count[c]) {
-	    hw_reg = reg - class_base_reg[c];
-	    break;
-	 }
-      }
 
-      assert(hw_reg >= 0);
-      hw_reg_mapping[i] = first_assigned_grf + hw_reg * reg_width;
+      hw_reg_mapping[i] = (first_assigned_grf +
+			   brw->wm.ra_reg_to_grf[reg] * reg_width);
       this->grf_used = MAX2(this->grf_used,
 			    hw_reg_mapping[i] + this->virtual_grf_sizes[i] *
 			    reg_width);
    }
 
-   foreach_iter(exec_list_iterator, iter, this->instructions) {
-      fs_inst *inst = (fs_inst *)iter.get();
+   foreach_list(node, &this->instructions) {
+      fs_inst *inst = (fs_inst *)node;
 
       assign_reg(hw_reg_mapping, &inst->dst, reg_width);
       assign_reg(hw_reg_mapping, &inst->src[0], reg_width);
@@ -292,7 +265,6 @@ fs_visitor::assign_regs()
    }
 
    ralloc_free(g);
-   ralloc_free(regs);
 
    return true;
 }
@@ -336,8 +308,8 @@ fs_visitor::choose_spill_reg(struct ra_graph *g)
     * spill/unspill we'll have to do, and guess that the insides of
     * loops run 10 times.
     */
-   foreach_iter(exec_list_iterator, iter, this->instructions) {
-      fs_inst *inst = (fs_inst *)iter.get();
+   foreach_list(node, &this->instructions) {
+      fs_inst *inst = (fs_inst *)node;
 
       for (unsigned int i = 0; i < 3; i++) {
 	 if (inst->src[i].file == GRF) {
@@ -370,6 +342,9 @@ fs_visitor::choose_spill_reg(struct ra_graph *g)
 	 if (inst->dst.file == GRF)
 	    no_spill[inst->dst.reg] = true;
 	 break;
+
+      default:
+	 break;
       }
    }
 
@@ -394,8 +369,8 @@ fs_visitor::spill_reg(int spill_reg)
     * virtual grf of the same size.  For most instructions, though, we
     * could just spill/unspill the GRF being accessed.
     */
-   foreach_iter(exec_list_iterator, iter, this->instructions) {
-      fs_inst *inst = (fs_inst *)iter.get();
+   foreach_list(node, &this->instructions) {
+      fs_inst *inst = (fs_inst *)node;
 
       for (unsigned int i = 0; i < 3; i++) {
 	 if (inst->src[i].file == GRF &&
diff --git a/src/mesa/drivers/dri/i965/brw_fs_schedule_instructions.cpp b/src/mesa/drivers/dri/i965/brw_fs_schedule_instructions.cpp
index d8218c26edb..0ea4e5c36f0 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_schedule_instructions.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_schedule_instructions.cpp
@@ -25,21 +25,6 @@
  *
  */
 
-extern "C" {
-
-#include <sys/types.h>
-
-#include "main/macros.h"
-#include "main/shaderobj.h"
-#include "main/uniforms.h"
-#include "program/prog_optimize.h"
-#include "program/register_allocate.h"
-#include "program/sampler.h"
-#include "program/hash_table.h"
-#include "brw_context.h"
-#include "brw_eu.h"
-#include "brw_wm.h"
-}
 #include "brw_fs.h"
 #include "../glsl/glsl_types.h"
 #include "../glsl/ir_optimization.h"
@@ -84,26 +69,26 @@ public:
       int math_latency = 22;
 
       switch (inst->opcode) {
-      case FS_OPCODE_RCP:
+      case SHADER_OPCODE_RCP:
 	 this->latency = 1 * chans * math_latency;
 	 break;
-      case FS_OPCODE_RSQ:
+      case SHADER_OPCODE_RSQ:
 	 this->latency = 2 * chans * math_latency;
 	 break;
-      case FS_OPCODE_SQRT:
-      case FS_OPCODE_LOG2:
+      case SHADER_OPCODE_SQRT:
+      case SHADER_OPCODE_LOG2:
 	 /* full precision log.  partial is 2. */
 	 this->latency = 3 * chans * math_latency;
 	 break;
-      case FS_OPCODE_EXP2:
+      case SHADER_OPCODE_EXP2:
 	 /* full precision.  partial is 3, same throughput. */
 	 this->latency = 4 * chans * math_latency;
 	 break;
-      case FS_OPCODE_POW:
+      case SHADER_OPCODE_POW:
 	 this->latency = 8 * chans * math_latency;
 	 break;
-      case FS_OPCODE_SIN:
-      case FS_OPCODE_COS:
+      case SHADER_OPCODE_SIN:
+      case SHADER_OPCODE_COS:
 	 /* minimum latency, max is 12 rounds. */
 	 this->latency = 5 * chans * math_latency;
 	 break;
@@ -283,8 +268,8 @@ instruction_scheduler::calculate_deps()
    memset(last_mrf_write, 0, sizeof(last_mrf_write));
 
    /* top-to-bottom dependencies: RAW and WAW. */
-   foreach_iter(exec_list_iterator, iter, instructions) {
-      schedule_node *n = (schedule_node *)iter.get();
+   foreach_list(node, &instructions) {
+      schedule_node *n = (schedule_node *)node;
       fs_inst *inst = n->inst;
 
       /* read-after-write deps. */
@@ -321,12 +306,12 @@ instruction_scheduler::calculate_deps()
 	 add_dep(last_grf_write[inst->dst.reg], n);
 	 last_grf_write[inst->dst.reg] = n;
       } else if (inst->dst.file == MRF) {
-	 int reg = inst->dst.hw_reg & ~BRW_MRF_COMPR4;
+	 int reg = inst->dst.reg & ~BRW_MRF_COMPR4;
 
 	 add_dep(last_mrf_write[reg], n);
 	 last_mrf_write[reg] = n;
 	 if (is_compressed(inst)) {
-	    if (inst->dst.hw_reg & BRW_MRF_COMPR4)
+	    if (inst->dst.reg & BRW_MRF_COMPR4)
 	       reg += 4;
 	    else
 	       reg++;
@@ -401,12 +386,12 @@ instruction_scheduler::calculate_deps()
       if (inst->dst.file == GRF) {
 	 last_grf_write[inst->dst.reg] = n;
       } else if (inst->dst.file == MRF) {
-	 int reg = inst->dst.hw_reg & ~BRW_MRF_COMPR4;
+	 int reg = inst->dst.reg & ~BRW_MRF_COMPR4;
 
 	 last_mrf_write[reg] = n;
 
 	 if (is_compressed(inst)) {
-	    if (inst->dst.hw_reg & BRW_MRF_COMPR4)
+	    if (inst->dst.reg & BRW_MRF_COMPR4)
 	       reg += 4;
 	    else
 	       reg++;
@@ -437,8 +422,8 @@ instruction_scheduler::schedule_instructions(fs_inst *next_block_header)
    int time = 0;
 
    /* Remove non-DAG heads from the list. */
-   foreach_iter(exec_list_iterator, iter, instructions) {
-      schedule_node *n = (schedule_node *)iter.get();
+   foreach_list_safe(node, &instructions) {
+      schedule_node *n = (schedule_node *)node;
       if (n->parent_count != 0)
 	 n->remove();
    }
@@ -447,8 +432,8 @@ instruction_scheduler::schedule_instructions(fs_inst *next_block_header)
       schedule_node *chosen = NULL;
       int chosen_time = 0;
 
-      foreach_iter(exec_list_iterator, iter, instructions) {
-	 schedule_node *n = (schedule_node *)iter.get();
+      foreach_list(node, &instructions) {
+	 schedule_node *n = (schedule_node *)node;
 
 	 if (!chosen || n->unblocked_time < chosen_time) {
 	    chosen = n;
@@ -490,8 +475,8 @@ instruction_scheduler::schedule_instructions(fs_inst *next_block_header)
        * progress until the first is done.
        */
       if (chosen->inst->is_math()) {
-	 foreach_iter(exec_list_iterator, iter, instructions) {
-	    schedule_node *n = (schedule_node *)iter.get();
+	 foreach_list(node, &instructions) {
+	    schedule_node *n = (schedule_node *)node;
 
 	    if (n->inst->is_math())
 	       n->unblocked_time = MAX2(n->unblocked_time,
diff --git a/src/mesa/drivers/dri/i965/brw_fs_vector_splitting.cpp b/src/mesa/drivers/dri/i965/brw_fs_vector_splitting.cpp
index 530ffa26580..a9a60c2fd8a 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_vector_splitting.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_vector_splitting.cpp
@@ -122,8 +122,8 @@ ir_vector_reference_visitor::get_variable_entry(ir_variable *var)
       break;
    }
 
-   foreach_iter(exec_list_iterator, iter, this->variable_list) {
-      variable_entry *entry = (variable_entry *)iter.get();
+   foreach_list(node, &this->variable_list) {
+      variable_entry *entry = (variable_entry *)node;
       if (entry->var == var)
 	 return entry;
    }
@@ -222,8 +222,8 @@ ir_vector_splitting_visitor::get_splitting_entry(ir_variable *var)
    if (!var->type->is_vector())
       return NULL;
 
-   foreach_iter(exec_list_iterator, iter, *this->variable_list) {
-      variable_entry *entry = (variable_entry *)iter.get();
+   foreach_list(node, &*this->variable_list) {
+      variable_entry *entry = (variable_entry *)node;
       if (entry->var == var) {
 	 return entry;
       }
@@ -341,8 +341,8 @@ brw_do_vector_splitting(exec_list *instructions)
    visit_list_elements(&refs, instructions);
 
    /* Trim out variables we can't split. */
-   foreach_iter(exec_list_iterator, iter, refs.variable_list) {
-      variable_entry *entry = (variable_entry *)iter.get();
+   foreach_list_safe(node, &refs.variable_list) {
+      variable_entry *entry = (variable_entry *)node;
 
       if (debug) {
 	 printf("vector %s@%p: decl %d, whole_access %d\n",
@@ -363,8 +363,8 @@ brw_do_vector_splitting(exec_list *instructions)
    /* Replace the decls of the vectors to be split with their split
     * components.
     */
-   foreach_iter(exec_list_iterator, iter, refs.variable_list) {
-      variable_entry *entry = (variable_entry *)iter.get();
+   foreach_list(node, &refs.variable_list) {
+      variable_entry *entry = (variable_entry *)node;
       const struct glsl_type *type;
       type = glsl_type::get_instance(entry->var->type->base_type, 1, 1);
 
diff --git a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
index cbe5cf428c5..cdaf543c88b 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
@@ -142,9 +142,7 @@ fs_visitor::visit(ir_dereference_array *ir)
    this->result.type = brw_type_for_base_type(ir->type);
 
    if (index) {
-      assert(this->result.file == UNIFORM ||
-	     (this->result.file == GRF &&
-	      this->result.reg != 0));
+      assert(this->result.file == UNIFORM || this->result.file == GRF);
       this->result.reg_offset += index->value.i[0] * element_size;
    } else {
       assert(!"FINISHME: non-constant array element");
@@ -252,14 +250,14 @@ fs_visitor::visit(ir_expression *ir)
 
       break;
    case ir_unop_rcp:
-      emit_math(FS_OPCODE_RCP, this->result, op[0]);
+      emit_math(SHADER_OPCODE_RCP, this->result, op[0]);
       break;
 
    case ir_unop_exp2:
-      emit_math(FS_OPCODE_EXP2, this->result, op[0]);
+      emit_math(SHADER_OPCODE_EXP2, this->result, op[0]);
       break;
    case ir_unop_log2:
-      emit_math(FS_OPCODE_LOG2, this->result, op[0]);
+      emit_math(SHADER_OPCODE_LOG2, this->result, op[0]);
       break;
    case ir_unop_exp:
    case ir_unop_log:
@@ -267,11 +265,11 @@ fs_visitor::visit(ir_expression *ir)
       break;
    case ir_unop_sin:
    case ir_unop_sin_reduced:
-      emit_math(FS_OPCODE_SIN, this->result, op[0]);
+      emit_math(SHADER_OPCODE_SIN, this->result, op[0]);
       break;
    case ir_unop_cos:
    case ir_unop_cos_reduced:
-      emit_math(FS_OPCODE_COS, this->result, op[0]);
+      emit_math(SHADER_OPCODE_COS, this->result, op[0]);
       break;
 
    case ir_unop_dFdx:
@@ -289,7 +287,23 @@ fs_visitor::visit(ir_expression *ir)
       break;
 
    case ir_binop_mul:
-      emit(BRW_OPCODE_MUL, this->result, op[0], op[1]);
+      if (ir->type->is_integer()) {
+	 /* For integer multiplication, the MUL uses the low 16 bits
+	  * of one of the operands (src0 on gen6, src1 on gen7).  The
+	  * MACH accumulates in the contribution of the upper 16 bits
+	  * of that operand.
+	  *
+	  * FINISHME: Emit just the MUL if we know an operand is small
+	  * enough.
+	  */
+	 struct brw_reg acc = retype(brw_acc_reg(), BRW_REGISTER_TYPE_D);
+
+	 emit(BRW_OPCODE_MUL, acc, op[0], op[1]);
+	 emit(BRW_OPCODE_MACH, reg_null_d, op[0], op[1]);
+	 emit(BRW_OPCODE_MOV, this->result, fs_reg(acc));
+      } else {
+	 emit(BRW_OPCODE_MUL, this->result, op[0], op[1]);
+      }
       break;
    case ir_binop_div:
       assert(!"not reached: should be handled by ir_div_to_mul_rcp");
@@ -342,11 +356,11 @@ fs_visitor::visit(ir_expression *ir)
       break;
 
    case ir_unop_sqrt:
-      emit_math(FS_OPCODE_SQRT, this->result, op[0]);
+      emit_math(SHADER_OPCODE_SQRT, this->result, op[0]);
       break;
 
    case ir_unop_rsq:
-      emit_math(FS_OPCODE_RSQ, this->result, op[0]);
+      emit_math(SHADER_OPCODE_RSQ, this->result, op[0]);
       break;
 
    case ir_unop_i2u:
@@ -425,7 +439,7 @@ fs_visitor::visit(ir_expression *ir)
       break;
 
    case ir_binop_pow:
-      emit_math(FS_OPCODE_POW, this->result, op[0], op[1]);
+      emit_math(SHADER_OPCODE_POW, this->result, op[0], op[1]);
       break;
 
    case ir_unop_bit_not:
@@ -496,7 +510,7 @@ fs_visitor::emit_assignment_writes(fs_reg &l, fs_reg &r,
 void
 fs_visitor::visit(ir_assignment *ir)
 {
-   struct fs_reg l, r;
+   fs_reg l, r;
    fs_inst *inst;
 
    /* FINISHME: arrays on the lhs */
@@ -603,9 +617,11 @@ fs_visitor::emit_texture_gen4(ir_texture *ir, fs_reg dst, fs_reg coordinate,
       /* gen4's SIMD8 sampler always has the slots for u,v,r present. */
       mlen += 3;
    } else if (ir->op == ir_txd) {
+      this->result = reg_undef;
       ir->lod_info.grad.dPdx->accept(this);
       fs_reg dPdx = this->result;
 
+      this->result = reg_undef;
       ir->lod_info.grad.dPdy->accept(this);
       fs_reg dPdy = this->result;
 
@@ -620,6 +636,8 @@ fs_visitor::emit_texture_gen4(ir_texture *ir, fs_reg dst, fs_reg coordinate,
        * dPdx = dudx, dvdx, drdx
        * dPdy = dudy, dvdy, drdy
        *
+       * 1-arg: Does not exist.
+       *
        * 2-arg: dudx   dvdx   dudy   dvdy
        *        dPdx.x dPdx.y dPdy.x dPdy.y
        *        m4     m5     m6     m7
@@ -631,18 +649,26 @@ fs_visitor::emit_texture_gen4(ir_texture *ir, fs_reg dst, fs_reg coordinate,
       for (int i = 0; i < ir->lod_info.grad.dPdx->type->vector_elements; i++) {
 	 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), dPdx);
 	 dPdx.reg_offset++;
-	 mlen++;
       }
+      mlen += MAX2(ir->lod_info.grad.dPdx->type->vector_elements, 2);
 
       for (int i = 0; i < ir->lod_info.grad.dPdy->type->vector_elements; i++) {
 	 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), dPdy);
 	 dPdy.reg_offset++;
-	 mlen++;
       }
+      mlen += MAX2(ir->lod_info.grad.dPdy->type->vector_elements, 2);
+   } else if (ir->op == ir_txs) {
+      /* There's no SIMD8 resinfo message on Gen4.  Use SIMD16 instead. */
+      simd16 = true;
+      this->result = reg_undef;
+      ir->lod_info.lod->accept(this);
+      emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen, BRW_REGISTER_TYPE_UD), this->result);
+      mlen += 2;
    } else {
       /* Oh joy.  gen4 doesn't have SIMD8 non-shadow-compare bias/lod
        * instructions.  We'll need to do SIMD16 here.
        */
+      simd16 = true;
       assert(ir->op == ir_txb || ir->op == ir_txl);
 
       for (int i = 0; i < ir->coordinate->type->vector_elements; i++) {
@@ -671,16 +697,19 @@ fs_visitor::emit_texture_gen4(ir_texture *ir, fs_reg dst, fs_reg coordinate,
 
       /* The unused upper half. */
       mlen++;
+   }
 
+   if (simd16) {
       /* Now, since we're doing simd16, the return is 2 interleaved
        * vec4s where the odd-indexed ones are junk. We'll need to move
        * this weirdness around to the expected layout.
        */
-      simd16 = true;
       orig_dst = dst;
-      dst = fs_reg(this, glsl_type::get_array_instance(glsl_type::vec4_type,
-						       2));
-      dst.type = BRW_REGISTER_TYPE_F;
+      const glsl_type *vec_type =
+	 glsl_type::get_instance(ir->type->base_type, 4, 1);
+      dst = fs_reg(this, glsl_type::get_array_instance(vec_type, 2));
+      dst.type = intel->is_g4x ? brw_type_for_base_type(ir->type)
+			       : BRW_REGISTER_TYPE_F;
    }
 
    fs_inst *inst = NULL;
@@ -697,6 +726,9 @@ fs_visitor::emit_texture_gen4(ir_texture *ir, fs_reg dst, fs_reg coordinate,
    case ir_txd:
       inst = emit(FS_OPCODE_TXD, dst);
       break;
+   case ir_txs:
+      inst = emit(FS_OPCODE_TXS, dst);
+      break;
    case ir_txf:
       assert(!"GLSL 1.30 features unsupported");
       break;
@@ -732,6 +764,8 @@ fs_visitor::emit_texture_gen5(ir_texture *ir, fs_reg dst, fs_reg coordinate,
    int base_mrf = 2;
    int reg_width = c->dispatch_width / 8;
    bool header_present = false;
+   const int vector_elements =
+      ir->coordinate ? ir->coordinate->type->vector_elements : 0;
 
    if (ir->offset) {
       /* The offsets set up by the ir_texture visitor are in the
@@ -742,7 +776,7 @@ fs_visitor::emit_texture_gen5(ir_texture *ir, fs_reg dst, fs_reg coordinate,
       base_mrf--;
    }
 
-   for (int i = 0; i < ir->coordinate->type->vector_elements; i++) {
+   for (int i = 0; i < vector_elements; i++) {
       fs_inst *inst = emit(BRW_OPCODE_MOV,
 			   fs_reg(MRF, base_mrf + mlen + i * reg_width),
 			   coordinate);
@@ -750,7 +784,7 @@ fs_visitor::emit_texture_gen5(ir_texture *ir, fs_reg dst, fs_reg coordinate,
 	 inst->saturate = true;
       coordinate.reg_offset++;
    }
-   mlen += ir->coordinate->type->vector_elements * reg_width;
+   mlen += vector_elements * reg_width;
 
    if (ir->shadow_comparitor && ir->op != ir_txd) {
       mlen = MAX2(mlen, header_present + 4 * reg_width);
@@ -786,9 +820,11 @@ fs_visitor::emit_texture_gen5(ir_texture *ir, fs_reg dst, fs_reg coordinate,
       inst = emit(FS_OPCODE_TXL, dst);
       break;
    case ir_txd: {
+      this->result = reg_undef;
       ir->lod_info.grad.dPdx->accept(this);
       fs_reg dPdx = this->result;
 
+      this->result = reg_undef;
       ir->lod_info.grad.dPdy->accept(this);
       fs_reg dPdy = this->result;
 
@@ -816,6 +852,13 @@ fs_visitor::emit_texture_gen5(ir_texture *ir, fs_reg dst, fs_reg coordinate,
       inst = emit(FS_OPCODE_TXD, dst);
       break;
    }
+   case ir_txs:
+      this->result = reg_undef;
+      ir->lod_info.lod->accept(this);
+      emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen, BRW_REGISTER_TYPE_UD), this->result);
+      mlen += reg_width;
+      inst = emit(FS_OPCODE_TXS, dst);
+      break;
    case ir_txf:
       assert(!"GLSL 1.30 features unsupported");
       break;
@@ -850,6 +893,7 @@ fs_visitor::emit_texture_gen7(ir_texture *ir, fs_reg dst, fs_reg coordinate,
    }
 
    if (ir->shadow_comparitor && ir->op != ir_txd) {
+      this->result = reg_undef;
       ir->shadow_comparitor->accept(this);
       emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), this->result);
       mlen += reg_width;
@@ -860,11 +904,13 @@ fs_visitor::emit_texture_gen7(ir_texture *ir, fs_reg dst, fs_reg coordinate,
    case ir_tex:
       break;
    case ir_txb:
+      this->result = reg_undef;
       ir->lod_info.bias->accept(this);
       emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), this->result);
       mlen += reg_width;
       break;
    case ir_txl:
+      this->result = reg_undef;
       ir->lod_info.lod->accept(this);
       emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), this->result);
       mlen += reg_width;
@@ -873,9 +919,11 @@ fs_visitor::emit_texture_gen7(ir_texture *ir, fs_reg dst, fs_reg coordinate,
       if (c->dispatch_width == 16)
 	 fail("Gen7 does not support sample_d/sample_d_c in SIMD16 mode.");
 
+      this->result = reg_undef;
       ir->lod_info.grad.dPdx->accept(this);
       fs_reg dPdx = this->result;
 
+      this->result = reg_undef;
       ir->lod_info.grad.dPdy->accept(this);
       fs_reg dPdy = this->result;
 
@@ -900,13 +948,19 @@ fs_visitor::emit_texture_gen7(ir_texture *ir, fs_reg dst, fs_reg coordinate,
       }
       break;
    }
+   case ir_txs:
+      this->result = reg_undef;
+      ir->lod_info.lod->accept(this);
+      emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen, BRW_REGISTER_TYPE_UD), this->result);
+      mlen += reg_width;
+      break;
    case ir_txf:
       assert(!"GLSL 1.30 features unsupported");
       break;
    }
 
    /* Set up the coordinate (except for TXD where it was done earlier) */
-   if (ir->op != ir_txd) {
+   if (ir->op != ir_txd && ir->op != ir_txs) {
       for (int i = 0; i < ir->coordinate->type->vector_elements; i++) {
 	 fs_inst *inst = emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen),
 			      coordinate);
@@ -924,7 +978,8 @@ fs_visitor::emit_texture_gen7(ir_texture *ir, fs_reg dst, fs_reg coordinate,
    case ir_txb: inst = emit(FS_OPCODE_TXB, dst); break;
    case ir_txl: inst = emit(FS_OPCODE_TXL, dst); break;
    case ir_txd: inst = emit(FS_OPCODE_TXD, dst); break;
-   case ir_txf: assert(!"TXF unsupported.");
+   case ir_txf: assert(!"TXF unsupported."); break;
+   case ir_txs: inst = emit(FS_OPCODE_TXS, dst); break;
    }
    inst->base_mrf = base_mrf;
    inst->mlen = mlen;
@@ -959,7 +1014,8 @@ fs_visitor::visit(ir_texture *ir)
    }
 
    this->result = reg_undef;
-   ir->coordinate->accept(this);
+   if (ir->coordinate)
+      ir->coordinate->accept(this);
    fs_reg coordinate = this->result;
 
    if (ir->offset != NULL) {
@@ -1000,7 +1056,8 @@ fs_visitor::visit(ir_texture *ir)
     * texture coordinates.  We use the program parameter state
     * tracking to get the scaling factor.
     */
-   if (ir->sampler->type->sampler_dimensionality == GLSL_SAMPLER_DIM_RECT) {
+   if (intel->gen < 6 &&
+       ir->sampler->type->sampler_dimensionality == GLSL_SAMPLER_DIM_RECT) {
       struct gl_program_parameter_list *params = c->fp->program.Base.Parameters;
       int tokens[STATE_LENGTH] = {
 	 STATE_INTERNAL,
@@ -1046,7 +1103,7 @@ fs_visitor::visit(ir_texture *ir)
    /* Writemasking doesn't eliminate channels on SIMD8 texture
     * samples, so don't worry about them.
     */
-   fs_reg dst = fs_reg(this, glsl_type::vec4_type);
+   fs_reg dst = fs_reg(this, glsl_type::get_instance(ir->type->base_type, 4, 1));
 
    if (intel->gen >= 7) {
       inst = emit_texture_gen7(ir, dst, coordinate, sampler);
@@ -1070,6 +1127,7 @@ fs_visitor::visit(ir_texture *ir)
       if (hw_compare_supported) {
 	 inst->shadow_compare = true;
       } else {
+	 this->result = reg_undef;
 	 ir->shadow_comparitor->accept(this);
 	 fs_reg ref = this->result;
 
@@ -1465,8 +1523,8 @@ fs_visitor::visit(ir_if *ir)
       inst->predicated = true;
    }
 
-   foreach_iter(exec_list_iterator, iter, ir->then_instructions) {
-      ir_instruction *ir = (ir_instruction *)iter.get();
+   foreach_list(node, &ir->then_instructions) {
+      ir_instruction *ir = (ir_instruction *)node;
       this->base_ir = ir;
       this->result = reg_undef;
       ir->accept(this);
@@ -1475,8 +1533,8 @@ fs_visitor::visit(ir_if *ir)
    if (!ir->else_instructions.is_empty()) {
       emit(BRW_OPCODE_ELSE);
 
-      foreach_iter(exec_list_iterator, iter, ir->else_instructions) {
-	 ir_instruction *ir = (ir_instruction *)iter.get();
+      foreach_list(node, &ir->else_instructions) {
+	 ir_instruction *ir = (ir_instruction *)node;
 	 this->base_ir = ir;
 	 this->result = reg_undef;
 	 ir->accept(this);
@@ -1526,8 +1584,8 @@ fs_visitor::visit(ir_loop *ir)
       inst->predicated = true;
    }
 
-   foreach_iter(exec_list_iterator, iter, ir->body_instructions) {
-      ir_instruction *ir = (ir_instruction *)iter.get();
+   foreach_list(node, &ir->body_instructions) {
+      ir_instruction *ir = (ir_instruction *)node;
 
       this->base_ir = ir;
       this->result = reg_undef;
@@ -1583,8 +1641,8 @@ fs_visitor::visit(ir_function *ir)
 
       assert(sig);
 
-      foreach_iter(exec_list_iterator, iter, sig->body) {
-	 ir_instruction *ir = (ir_instruction *)iter.get();
+      foreach_list(node, &sig->body) {
+	 ir_instruction *ir = (ir_instruction *)node;
 	 this->base_ir = ir;
 	 this->result = reg_undef;
 	 ir->accept(this);
@@ -1684,7 +1742,7 @@ fs_visitor::emit_interpolation_setup_gen4()
 	interp_reg(FRAG_ATTRIB_WPOS, 3));
    /* Compute the pixel 1/W value from wpos.w. */
    this->pixel_w = fs_reg(this, glsl_type::float_type);
-   emit_math(FS_OPCODE_RCP, this->pixel_w, wpos_w);
+   emit_math(SHADER_OPCODE_RCP, this->pixel_w, wpos_w);
    this->current_annotation = NULL;
 }
 
@@ -1721,7 +1779,7 @@ fs_visitor::emit_interpolation_setup_gen6()
    this->current_annotation = "compute pos.w";
    this->pixel_w = fs_reg(brw_vec8_grf(c->source_w_reg, 0));
    this->wpos_w = fs_reg(this, glsl_type::float_type);
-   emit_math(FS_OPCODE_RCP, this->wpos_w, this->pixel_w);
+   emit_math(SHADER_OPCODE_RCP, this->wpos_w, this->pixel_w);
 
    this->delta_x = fs_reg(brw_vec8_grf(2, 0));
    this->delta_y = fs_reg(brw_vec8_grf(3, 0));
@@ -1733,6 +1791,7 @@ void
 fs_visitor::emit_color_write(int index, int first_color_mrf, fs_reg color)
 {
    int reg_width = c->dispatch_width / 8;
+   fs_inst *inst;
 
    if (c->dispatch_width == 8 || intel->gen == 6) {
       /* SIMD8 write looks like:
@@ -1751,8 +1810,10 @@ fs_visitor::emit_color_write(int index, int first_color_mrf, fs_reg color)
        * m + 6: a0
        * m + 7: a1
        */
-      emit(BRW_OPCODE_MOV, fs_reg(MRF, first_color_mrf + index * reg_width),
-	   color);
+      inst = emit(BRW_OPCODE_MOV,
+		  fs_reg(MRF, first_color_mrf + index * reg_width),
+		  color);
+      inst->saturate = c->key.clamp_fragment_color;
    } else {
       /* pre-gen6 SIMD16 single source DP write looks like:
        * m + 0: r0
@@ -1770,16 +1831,22 @@ fs_visitor::emit_color_write(int index, int first_color_mrf, fs_reg color)
 	  * usual destination + 1 for the second half we get
 	  * destination + 4.
 	  */
-	 emit(BRW_OPCODE_MOV,
-	      fs_reg(MRF, BRW_MRF_COMPR4 + first_color_mrf + index), color);
+	 inst = emit(BRW_OPCODE_MOV,
+		     fs_reg(MRF, BRW_MRF_COMPR4 + first_color_mrf + index),
+		     color);
+	 inst->saturate = c->key.clamp_fragment_color;
       } else {
 	 push_force_uncompressed();
-	 emit(BRW_OPCODE_MOV, fs_reg(MRF, first_color_mrf + index), color);
+	 inst = emit(BRW_OPCODE_MOV, fs_reg(MRF, first_color_mrf + index),
+		     color);
+	 inst->saturate = c->key.clamp_fragment_color;
 	 pop_force_uncompressed();
 
 	 push_force_sechalf();
 	 color.sechalf = true;
-	 emit(BRW_OPCODE_MOV, fs_reg(MRF, first_color_mrf + index + 4), color);
+	 inst = emit(BRW_OPCODE_MOV, fs_reg(MRF, first_color_mrf + index + 4),
+		     color);
+	 inst->saturate = c->key.clamp_fragment_color;
 	 pop_force_sechalf();
 	 color.sechalf = false;
       }
diff --git a/src/mesa/drivers/dri/i965/brw_misc_state.c b/src/mesa/drivers/dri/i965/brw_misc_state.c
index 03cebbb824b..f7e6e7c81d1 100644
--- a/src/mesa/drivers/dri/i965/brw_misc_state.c
+++ b/src/mesa/drivers/dri/i965/brw_misc_state.c
@@ -46,7 +46,7 @@ static void upload_drawing_rect(struct brw_context *brw)
    struct gl_context *ctx = &intel->ctx;
 
    BEGIN_BATCH(4);
-   OUT_BATCH(_3DSTATE_DRAWING_RECTANGLE);
+   OUT_BATCH(_3DSTATE_DRAWING_RECTANGLE << 16 | (4 - 2));
    OUT_BATCH(0); /* xmin, ymin */
    OUT_BATCH(((ctx->DrawBuffer->Width - 1) & 0xffff) |
 	    ((ctx->DrawBuffer->Height - 1) << 16));
diff --git a/src/mesa/drivers/dri/i965/brw_program.c b/src/mesa/drivers/dri/i965/brw_program.c
index 6674f1640c8..09b5be4c96e 100644
--- a/src/mesa/drivers/dri/i965/brw_program.c
+++ b/src/mesa/drivers/dri/i965/brw_program.c
@@ -226,6 +226,34 @@ static GLboolean brwProgramStringNotify( struct gl_context *ctx,
    return GL_TRUE;
 }
 
+/* Per-thread scratch space is a power-of-two multiple of 1KB. */
+int
+brw_get_scratch_size(int size)
+{
+   int i;
+
+   for (i = 1024; i < size; i *= 2)
+      ;
+
+   return i;
+}
+
+void
+brw_get_scratch_bo(struct intel_context *intel,
+		   drm_intel_bo **scratch_bo, int size)
+{
+   drm_intel_bo *old_bo = *scratch_bo;
+
+   if (old_bo && old_bo->size < size) {
+      drm_intel_bo_unreference(old_bo);
+      old_bo = NULL;
+   }
+
+   if (!old_bo) {
+      *scratch_bo = drm_intel_bo_alloc(intel->bufmgr, "scratch bo", size, 4096);
+   }
+}
+
 void brwInitFragProgFuncs( struct dd_function_table *functions )
 {
    assert(functions->ProgramStringNotify == _tnl_program_string); 
diff --git a/src/mesa/drivers/dri/i965/brw_shader.cpp b/src/mesa/drivers/dri/i965/brw_shader.cpp
index 9471883fb2b..3ff6bbaed47 100644
--- a/src/mesa/drivers/dri/i965/brw_shader.cpp
+++ b/src/mesa/drivers/dri/i965/brw_shader.cpp
@@ -24,6 +24,7 @@
 extern "C" {
 #include "main/macros.h"
 #include "brw_context.h"
+#include "brw_vs.h"
 }
 #include "brw_fs.h"
 #include "../glsl/ir_optimization.h"
@@ -67,6 +68,9 @@ brw_shader_precompile(struct gl_context *ctx, struct gl_shader_program *prog)
    if (!brw_fs_precompile(ctx, prog))
       return false;
 
+   if (!brw_vs_precompile(ctx, prog))
+      return false;
+
    return true;
 }
 
@@ -75,10 +79,15 @@ brw_link_shader(struct gl_context *ctx, struct gl_shader_program *prog)
 {
    struct brw_context *brw = brw_context(ctx);
    struct intel_context *intel = &brw->intel;
+   unsigned int stage;
+
+   for (stage = 0; stage < ARRAY_SIZE(prog->_LinkedShaders); stage++) {
+      struct brw_shader *shader =
+	 (struct brw_shader *)prog->_LinkedShaders[stage];
+
+      if (!shader)
+	 continue;
 
-   struct brw_shader *shader =
-      (struct brw_shader *)prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
-   if (shader != NULL) {
       void *mem_ctx = ralloc_context(NULL);
       bool progress;
 
@@ -106,18 +115,22 @@ brw_link_shader(struct gl_context *ctx, struct gl_shader_program *prog)
       brw_do_cubemap_normalize(shader->ir);
       lower_noise(shader->ir);
       lower_quadop_vector(shader->ir, false);
+
+      bool input = true;
+      bool output = stage == MESA_SHADER_FRAGMENT;
+      bool temp = stage == MESA_SHADER_FRAGMENT;
+      bool uniform = true;
+
       lower_variable_index_to_cond_assign(shader->ir,
-					  GL_TRUE, /* input */
-					  GL_TRUE, /* output */
-					  GL_TRUE, /* temp */
-					  GL_TRUE /* uniform */
-					  );
+					  input, output, temp, uniform);
 
       do {
 	 progress = false;
 
-	 brw_do_channel_expressions(shader->ir);
-	 brw_do_vector_splitting(shader->ir);
+	 if (stage == MESA_SHADER_FRAGMENT) {
+	    brw_do_channel_expressions(shader->ir);
+	    brw_do_vector_splitting(shader->ir);
+	 }
 
 	 progress = do_lower_jumps(shader->ir, true, true,
 				   true, /* main return */
@@ -192,3 +205,29 @@ brw_conditional_for_comparison(unsigned int op)
       return BRW_CONDITIONAL_NZ;
    }
 }
+
+uint32_t
+brw_math_function(enum opcode op)
+{
+   switch (op) {
+   case SHADER_OPCODE_RCP:
+      return BRW_MATH_FUNCTION_INV;
+   case SHADER_OPCODE_RSQ:
+      return BRW_MATH_FUNCTION_RSQ;
+   case SHADER_OPCODE_SQRT:
+      return BRW_MATH_FUNCTION_SQRT;
+   case SHADER_OPCODE_EXP2:
+      return BRW_MATH_FUNCTION_EXP;
+   case SHADER_OPCODE_LOG2:
+      return BRW_MATH_FUNCTION_LOG;
+   case SHADER_OPCODE_POW:
+      return BRW_MATH_FUNCTION_POW;
+   case SHADER_OPCODE_SIN:
+      return BRW_MATH_FUNCTION_SIN;
+   case SHADER_OPCODE_COS:
+      return BRW_MATH_FUNCTION_COS;
+   default:
+      assert(!"not reached: unknown math function");
+      return 0;
+   }
+}
diff --git a/src/mesa/drivers/dri/i965/brw_shader.h b/src/mesa/drivers/dri/i965/brw_shader.h
index 4c568a26caa..1054d7a589e 100644
--- a/src/mesa/drivers/dri/i965/brw_shader.h
+++ b/src/mesa/drivers/dri/i965/brw_shader.h
@@ -21,5 +21,11 @@
  * IN THE SOFTWARE.
  */
 
+#include <stdint.h>
+#include "brw_defines.h"
+
+#pragma once
+
 int brw_type_for_base_type(const struct glsl_type *type);
 uint32_t brw_conditional_for_comparison(unsigned int op);
+uint32_t brw_math_function(enum opcode op);
diff --git a/src/mesa/drivers/dri/i965/brw_state_dump.c b/src/mesa/drivers/dri/i965/brw_state_dump.c
index b9e5cc1a534..cb7a3ef73d3 100644
--- a/src/mesa/drivers/dri/i965/brw_state_dump.c
+++ b/src/mesa/drivers/dri/i965/brw_state_dump.c
@@ -455,6 +455,23 @@ dump_vs_constants(struct brw_context *brw, uint32_t offset, uint32_t size)
    }
 }
 
+static void
+dump_wm_constants(struct brw_context *brw, uint32_t offset, uint32_t size)
+{
+   const char *name = "WM_CONST";
+   struct intel_context *intel = &brw->intel;
+   uint32_t *as_uint = intel->batch.bo->virtual + offset;
+   float *as_float = intel->batch.bo->virtual + offset;
+   int i;
+
+   for (i = 0; i < size / 4; i += 4) {
+      batch_out(brw, name, offset, i, "%3d: (% f % f % f % f) (0x%08x 0x%08x 0x%08x 0x%08x)\n",
+		i / 4,
+		as_float[i], as_float[i + 1], as_float[i + 2], as_float[i + 3],
+		as_uint[i], as_uint[i + 1], as_uint[i + 2], as_uint[i + 3]);
+   }
+}
+
 static void dump_binding_table(struct brw_context *brw, uint32_t offset,
 			       uint32_t size)
 {
@@ -602,6 +619,9 @@ dump_state_batch(struct brw_context *brw)
       case AUB_TRACE_VS_CONSTANTS:
 	 dump_vs_constants(brw, offset, size);
 	 break;
+      case AUB_TRACE_WM_CONSTANTS:
+	 dump_wm_constants(brw, offset, size);
+	 break;
       default:
 	 break;
       }
diff --git a/src/mesa/drivers/dri/i965/brw_tex_layout.c b/src/mesa/drivers/dri/i965/brw_tex_layout.c
index f462f32b19a..46a417a08ed 100644
--- a/src/mesa/drivers/dri/i965/brw_tex_layout.c
+++ b/src/mesa/drivers/dri/i965/brw_tex_layout.c
@@ -60,7 +60,7 @@ GLboolean brw_miptree_layout(struct intel_context *intel,
 	   * given in Volume 1 of the BSpec.
 	   */
 	  h0 = ALIGN(mt->height0, align_h);
-	  h1 = ALIGN(minify(h0), align_h);
+	  h1 = ALIGN(minify(mt->height0), align_h);
 	  qpitch = (h0 + h1 + (intel->gen >= 7 ? 12 : 11) * align_h);
           if (mt->compressed)
 	     qpitch /= 4;
diff --git a/src/mesa/drivers/dri/i965/brw_vec4.cpp b/src/mesa/drivers/dri/i965/brw_vec4.cpp
new file mode 100644
index 00000000000..760bc1f7acd
--- /dev/null
+++ b/src/mesa/drivers/dri/i965/brw_vec4.cpp
@@ -0,0 +1,161 @@
+/*
+ * Copyright © 2011 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "brw_vec4.h"
+extern "C" {
+#include "main/macros.h"
+#include "program/prog_parameter.h"
+}
+
+#define MAX_INSTRUCTION (1 << 30)
+
+namespace brw {
+
+void
+vec4_visitor::calculate_live_intervals()
+{
+   int *def = ralloc_array(mem_ctx, int, virtual_grf_count);
+   int *use = ralloc_array(mem_ctx, int, virtual_grf_count);
+   int loop_depth = 0;
+   int loop_start = 0;
+
+   if (this->live_intervals_valid)
+      return;
+
+   for (int i = 0; i < virtual_grf_count; i++) {
+      def[i] = MAX_INSTRUCTION;
+      use[i] = -1;
+   }
+
+   int ip = 0;
+   foreach_list(node, &this->instructions) {
+      vec4_instruction *inst = (vec4_instruction *)node;
+
+      if (inst->opcode == BRW_OPCODE_DO) {
+	 if (loop_depth++ == 0)
+	    loop_start = ip;
+      } else if (inst->opcode == BRW_OPCODE_WHILE) {
+	 loop_depth--;
+
+	 if (loop_depth == 0) {
+	    /* Patches up the use of vars marked for being live across
+	     * the whole loop.
+	     */
+	    for (int i = 0; i < virtual_grf_count; i++) {
+	       if (use[i] == loop_start) {
+		  use[i] = ip;
+	       }
+	    }
+	 }
+      } else {
+	 for (unsigned int i = 0; i < 3; i++) {
+	    if (inst->src[i].file == GRF) {
+	       int reg = inst->src[i].reg;
+
+	       if (!loop_depth) {
+		  use[reg] = ip;
+	       } else {
+		  def[reg] = MIN2(loop_start, def[reg]);
+		  use[reg] = loop_start;
+
+		  /* Nobody else is going to go smash our start to
+		   * later in the loop now, because def[reg] now
+		   * points before the bb header.
+		   */
+	       }
+	    }
+	 }
+	 if (inst->dst.file == GRF) {
+	    int reg = inst->dst.reg;
+
+	    if (!loop_depth) {
+	       def[reg] = MIN2(def[reg], ip);
+	    } else {
+	       def[reg] = MIN2(def[reg], loop_start);
+	    }
+	 }
+      }
+
+      ip++;
+   }
+
+   ralloc_free(this->virtual_grf_def);
+   ralloc_free(this->virtual_grf_use);
+   this->virtual_grf_def = def;
+   this->virtual_grf_use = use;
+
+   this->live_intervals_valid = true;
+}
+
+bool
+vec4_visitor::virtual_grf_interferes(int a, int b)
+{
+   int start = MAX2(this->virtual_grf_def[a], this->virtual_grf_def[b]);
+   int end = MIN2(this->virtual_grf_use[a], this->virtual_grf_use[b]);
+
+   /* We can't handle dead register writes here, without iterating
+    * over the whole instruction stream to find every single dead
+    * write to that register to compare to the live interval of the
+    * other register.  Just assert that dead_code_eliminate() has been
+    * called.
+    */
+   assert((this->virtual_grf_use[a] != -1 ||
+	   this->virtual_grf_def[a] == MAX_INSTRUCTION) &&
+	  (this->virtual_grf_use[b] != -1 ||
+	   this->virtual_grf_def[b] == MAX_INSTRUCTION));
+
+   return start < end;
+}
+
+/**
+ * Must be called after calculate_live_intervales() to remove unused
+ * writes to registers -- register allocation will fail otherwise
+ * because something deffed but not used won't be considered to
+ * interfere with other regs.
+ */
+bool
+vec4_visitor::dead_code_eliminate()
+{
+   bool progress = false;
+   int pc = 0;
+
+   calculate_live_intervals();
+
+   foreach_list_safe(node, &this->instructions) {
+      vec4_instruction *inst = (vec4_instruction *)node;
+
+      if (inst->dst.file == GRF && this->virtual_grf_use[inst->dst.reg] <= pc) {
+	 inst->remove();
+	 progress = true;
+      }
+
+      pc++;
+   }
+
+   if (progress)
+      live_intervals_valid = false;
+
+   return progress;
+}
+
+} /* namespace brw */
diff --git a/src/mesa/drivers/dri/i965/brw_vec4.h b/src/mesa/drivers/dri/i965/brw_vec4.h
new file mode 100644
index 00000000000..1db910e2b99
--- /dev/null
+++ b/src/mesa/drivers/dri/i965/brw_vec4.h
@@ -0,0 +1,489 @@
+/*
+ * Copyright © 2011 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifndef BRW_VEC4_H
+#define BRW_VEC4_H
+
+#include <stdint.h>
+#include "brw_shader.h"
+#include "main/compiler.h"
+#include "program/hash_table.h"
+
+extern "C" {
+#include "brw_vs.h"
+#include "brw_context.h"
+#include "brw_eu.h"
+};
+
+#include "../glsl/ir.h"
+
+namespace brw {
+
+class dst_reg;
+
+/**
+ * Common helper for constructing swizzles.  When only a subset of
+ * channels of a vec4 are used, we don't want to reference the other
+ * channels, as that will tell optimization passes that those other
+ * channels are used.
+ */
+static int
+swizzle_for_size(int size)
+{
+   int size_swizzles[4] = {
+      BRW_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_X),
+      BRW_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y, SWIZZLE_Y, SWIZZLE_Y),
+      BRW_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_Z),
+      BRW_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_W),
+   };
+
+   assert((size >= 1) && (size <= 4));
+   return size_swizzles[size - 1];
+}
+
+enum register_file {
+   ARF = BRW_ARCHITECTURE_REGISTER_FILE,
+   GRF = BRW_GENERAL_REGISTER_FILE,
+   MRF = BRW_MESSAGE_REGISTER_FILE,
+   IMM = BRW_IMMEDIATE_VALUE,
+   HW_REG, /* a struct brw_reg */
+   ATTR,
+   UNIFORM, /* prog_data->params[hw_reg] */
+   BAD_FILE
+};
+
+class reg
+{
+public:
+   /** Register file: ARF, GRF, MRF, IMM. */
+   enum register_file file;
+   /** virtual register number.  0 = fixed hw reg */
+   int reg;
+   /** Offset within the virtual register. */
+   int reg_offset;
+   /** Register type.  BRW_REGISTER_TYPE_* */
+   int type;
+   bool sechalf;
+   struct brw_reg fixed_hw_reg;
+   int smear; /* -1, or a channel of the reg to smear to all channels. */
+
+   /** Value for file == BRW_IMMMEDIATE_FILE */
+   union {
+      int32_t i;
+      uint32_t u;
+      float f;
+   } imm;
+};
+
+class src_reg : public reg
+{
+public:
+   /* Callers of this ralloc-based new need not call delete. It's
+    * easier to just ralloc_free 'ctx' (or any of its ancestors). */
+   static void* operator new(size_t size, void *ctx)
+   {
+      void *node;
+
+      node = ralloc_size(ctx, size);
+      assert(node != NULL);
+
+      return node;
+   }
+
+   void init()
+   {
+      memset(this, 0, sizeof(*this));
+
+      this->file = BAD_FILE;
+   }
+
+   src_reg(register_file file, int reg, const glsl_type *type)
+   {
+      init();
+
+      this->file = file;
+      this->reg = reg;
+      if (type && (type->is_scalar() || type->is_vector() || type->is_matrix()))
+	 this->swizzle = swizzle_for_size(type->vector_elements);
+      else
+	 this->swizzle = SWIZZLE_XYZW;
+   }
+
+   /** Generic unset register constructor. */
+   src_reg()
+   {
+      init();
+   }
+
+   src_reg(float f)
+   {
+      init();
+
+      this->file = IMM;
+      this->type = BRW_REGISTER_TYPE_F;
+      this->imm.f = f;
+   }
+
+   src_reg(uint32_t u)
+   {
+      init();
+
+      this->file = IMM;
+      this->type = BRW_REGISTER_TYPE_UD;
+      this->imm.f = u;
+   }
+
+   src_reg(int32_t i)
+   {
+      init();
+
+      this->file = IMM;
+      this->type = BRW_REGISTER_TYPE_D;
+      this->imm.i = i;
+   }
+
+   src_reg(class vec4_visitor *v, const struct glsl_type *type);
+
+   explicit src_reg(dst_reg reg);
+
+   GLuint swizzle; /**< SWIZZLE_XYZW swizzles from Mesa. */
+   bool negate;
+   bool abs;
+
+   src_reg *reladdr;
+};
+
+class dst_reg : public reg
+{
+public:
+   /* Callers of this ralloc-based new need not call delete. It's
+    * easier to just ralloc_free 'ctx' (or any of its ancestors). */
+   static void* operator new(size_t size, void *ctx)
+   {
+      void *node;
+
+      node = ralloc_size(ctx, size);
+      assert(node != NULL);
+
+      return node;
+   }
+
+   void init()
+   {
+      memset(this, 0, sizeof(*this));
+      this->file = BAD_FILE;
+      this->writemask = WRITEMASK_XYZW;
+   }
+
+   dst_reg()
+   {
+      init();
+   }
+
+   dst_reg(register_file file, int reg)
+   {
+      init();
+
+      this->file = file;
+      this->reg = reg;
+   }
+
+   dst_reg(struct brw_reg reg)
+   {
+      init();
+
+      this->file = HW_REG;
+      this->fixed_hw_reg = reg;
+   }
+
+   dst_reg(class vec4_visitor *v, const struct glsl_type *type);
+
+   explicit dst_reg(src_reg reg);
+
+   int writemask; /**< Bitfield of WRITEMASK_[XYZW] */
+
+   src_reg *reladdr;
+};
+
+class vec4_instruction : public exec_node {
+public:
+   /* Callers of this ralloc-based new need not call delete. It's
+    * easier to just ralloc_free 'ctx' (or any of its ancestors). */
+   static void* operator new(size_t size, void *ctx)
+   {
+      void *node;
+
+      node = rzalloc_size(ctx, size);
+      assert(node != NULL);
+
+      return node;
+   }
+
+   struct brw_reg get_dst(void);
+   struct brw_reg get_src(int i);
+
+   enum opcode opcode; /* BRW_OPCODE_* or FS_OPCODE_* */
+   dst_reg dst;
+   src_reg src[3];
+
+   bool saturate;
+   bool predicate_inverse;
+   uint32_t predicate;
+
+   int conditional_mod; /**< BRW_CONDITIONAL_* */
+
+   int sampler;
+   int target; /**< MRT target. */
+   bool shadow_compare;
+
+   bool eot;
+   bool header_present;
+   int mlen; /**< SEND message length */
+   int base_mrf; /**< First MRF in the SEND message, if mlen is nonzero. */
+
+   uint32_t offset; /* spill/unspill offset */
+   /** @{
+    * Annotation for the generated IR.  One of the two can be set.
+    */
+   ir_instruction *ir;
+   const char *annotation;
+};
+
+class vec4_visitor : public ir_visitor
+{
+public:
+   vec4_visitor(struct brw_vs_compile *c,
+		struct gl_shader_program *prog, struct brw_shader *shader);
+   ~vec4_visitor();
+
+   dst_reg dst_null_f()
+   {
+      return dst_reg(brw_null_reg());
+   }
+
+   dst_reg dst_null_d()
+   {
+      return dst_reg(retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
+   }
+
+   dst_reg dst_null_cmp()
+   {
+      if (intel->gen > 4)
+	 return dst_null_d();
+      else
+	 return dst_null_f();
+   }
+
+   struct brw_context *brw;
+   const struct gl_vertex_program *vp;
+   struct intel_context *intel;
+   struct gl_context *ctx;
+   struct brw_vs_compile *c;
+   struct brw_vs_prog_data *prog_data;
+   struct brw_compile *p;
+   struct brw_shader *shader;
+   struct gl_shader_program *prog;
+   void *mem_ctx;
+   exec_list instructions;
+
+   char *fail_msg;
+   bool failed;
+
+   /**
+    * GLSL IR currently being processed, which is associated with our
+    * driver IR instructions for debugging purposes.
+    */
+   ir_instruction *base_ir;
+   const char *current_annotation;
+
+   int *virtual_grf_sizes;
+   int virtual_grf_count;
+   int virtual_grf_array_size;
+   int first_non_payload_grf;
+   int *virtual_grf_def;
+   int *virtual_grf_use;
+   bool live_intervals_valid;
+
+   dst_reg *variable_storage(ir_variable *var);
+
+   void reladdr_to_temp(ir_instruction *ir, src_reg *reg, int *num_reladdr);
+
+   src_reg src_reg_for_float(float val);
+
+   /**
+    * \name Visit methods
+    *
+    * As typical for the visitor pattern, there must be one \c visit method for
+    * each concrete subclass of \c ir_instruction.  Virtual base classes within
+    * the hierarchy should not have \c visit methods.
+    */
+   /*@{*/
+   virtual void visit(ir_variable *);
+   virtual void visit(ir_loop *);
+   virtual void visit(ir_loop_jump *);
+   virtual void visit(ir_function_signature *);
+   virtual void visit(ir_function *);
+   virtual void visit(ir_expression *);
+   virtual void visit(ir_swizzle *);
+   virtual void visit(ir_dereference_variable  *);
+   virtual void visit(ir_dereference_array *);
+   virtual void visit(ir_dereference_record *);
+   virtual void visit(ir_assignment *);
+   virtual void visit(ir_constant *);
+   virtual void visit(ir_call *);
+   virtual void visit(ir_return *);
+   virtual void visit(ir_discard *);
+   virtual void visit(ir_texture *);
+   virtual void visit(ir_if *);
+   /*@}*/
+
+   src_reg result;
+
+   /* Regs for vertex results.  Generated at ir_variable visiting time
+    * for the ir->location's used.
+    */
+   dst_reg output_reg[VERT_RESULT_MAX];
+   int uniform_size[MAX_UNIFORMS];
+   int uniforms;
+
+   struct hash_table *variable_ht;
+
+   bool run(void);
+   void fail(const char *msg, ...);
+
+   int virtual_grf_alloc(int size);
+   int setup_uniform_values(int loc, const glsl_type *type);
+   void setup_builtin_uniform_values(ir_variable *ir);
+   int setup_attributes(int payload_reg);
+   int setup_uniforms(int payload_reg);
+   void setup_payload();
+   void reg_allocate_trivial();
+   void reg_allocate();
+   void move_grf_array_access_to_scratch();
+   void calculate_live_intervals();
+   bool dead_code_eliminate();
+   bool virtual_grf_interferes(int a, int b);
+
+   vec4_instruction *emit(enum opcode opcode);
+
+   vec4_instruction *emit(enum opcode opcode, dst_reg dst, src_reg src0);
+
+   vec4_instruction *emit(enum opcode opcode, dst_reg dst,
+			  src_reg src0, src_reg src1);
+
+   vec4_instruction *emit(enum opcode opcode, dst_reg dst,
+			  src_reg src0, src_reg src1, src_reg src2);
+
+   bool try_rewrite_rhs_to_dst(ir_assignment *ir,
+			       dst_reg dst,
+			       src_reg src,
+			       vec4_instruction *pre_rhs_inst,
+			       vec4_instruction *last_rhs_inst);
+
+   /** Walks an exec_list of ir_instruction and sends it through this visitor. */
+   void visit_instructions(const exec_list *list);
+
+   void emit_bool_to_cond_code(ir_rvalue *ir);
+   void emit_bool_comparison(unsigned int op, dst_reg dst, src_reg src0, src_reg src1);
+   void emit_if_gen6(ir_if *ir);
+
+   void emit_block_move(dst_reg *dst, src_reg *src,
+			const struct glsl_type *type, bool predicated);
+
+   void emit_constant_values(dst_reg *dst, ir_constant *value);
+
+   /**
+    * Emit the correct dot-product instruction for the type of arguments
+    */
+   void emit_dp(dst_reg dst, src_reg src0, src_reg src1, unsigned elements);
+
+   void emit_scalar(ir_instruction *ir, enum prog_opcode op,
+		    dst_reg dst, src_reg src0);
+
+   void emit_scalar(ir_instruction *ir, enum prog_opcode op,
+		    dst_reg dst, src_reg src0, src_reg src1);
+
+   void emit_scs(ir_instruction *ir, enum prog_opcode op,
+		 dst_reg dst, const src_reg &src);
+
+   void emit_math1_gen6(enum opcode opcode, dst_reg dst, src_reg src);
+   void emit_math1_gen4(enum opcode opcode, dst_reg dst, src_reg src);
+   void emit_math(enum opcode opcode, dst_reg dst, src_reg src);
+   void emit_math2_gen6(enum opcode opcode, dst_reg dst, src_reg src0, src_reg src1);
+   void emit_math2_gen4(enum opcode opcode, dst_reg dst, src_reg src0, src_reg src1);
+   void emit_math(enum opcode opcode, dst_reg dst, src_reg src0, src_reg src1);
+
+   int emit_vue_header_gen6(int header_mrf);
+   int emit_vue_header_gen4(int header_mrf);
+   void emit_urb_writes(void);
+
+   src_reg get_scratch_offset(vec4_instruction *inst,
+			      src_reg *reladdr, int reg_offset);
+   void emit_scratch_read(vec4_instruction *inst,
+			  dst_reg dst,
+			  src_reg orig_src,
+			  int base_offset);
+   void emit_scratch_write(vec4_instruction *inst,
+			   src_reg temp,
+			   dst_reg orig_dst,
+			   int base_offset);
+
+   GLboolean try_emit_sat(ir_expression *ir);
+
+   bool process_move_condition(ir_rvalue *ir);
+
+   void generate_code();
+   void generate_vs_instruction(vec4_instruction *inst,
+				struct brw_reg dst,
+				struct brw_reg *src);
+
+   void generate_math1_gen4(vec4_instruction *inst,
+			    struct brw_reg dst,
+			    struct brw_reg src);
+   void generate_math1_gen6(vec4_instruction *inst,
+			    struct brw_reg dst,
+			    struct brw_reg src);
+   void generate_math2_gen4(vec4_instruction *inst,
+			    struct brw_reg dst,
+			    struct brw_reg src0,
+			    struct brw_reg src1);
+   void generate_math2_gen6(vec4_instruction *inst,
+			    struct brw_reg dst,
+			    struct brw_reg src0,
+			    struct brw_reg src1);
+
+   void generate_urb_write(vec4_instruction *inst);
+   void generate_oword_dual_block_offsets(struct brw_reg m1,
+					  struct brw_reg index);
+   void generate_scratch_write(vec4_instruction *inst,
+			       struct brw_reg dst,
+			       struct brw_reg src,
+			       struct brw_reg index);
+   void generate_scratch_read(vec4_instruction *inst,
+			      struct brw_reg dst,
+			      struct brw_reg index);
+};
+
+} /* namespace brw */
+
+#endif /* BRW_VEC4_H */
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_emit.cpp b/src/mesa/drivers/dri/i965/brw_vec4_emit.cpp
new file mode 100644
index 00000000000..65ac7d9dc09
--- /dev/null
+++ b/src/mesa/drivers/dri/i965/brw_vec4_emit.cpp
@@ -0,0 +1,854 @@
+/* Copyright © 2011 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "brw_vec4.h"
+#include "../glsl/ir_print_visitor.h"
+
+extern "C" {
+#include "brw_eu.h"
+};
+
+using namespace brw;
+
+namespace brw {
+
+int
+vec4_visitor::setup_attributes(int payload_reg)
+{
+   int nr_attributes;
+   int attribute_map[VERT_ATTRIB_MAX];
+
+   nr_attributes = 0;
+   for (int i = 0; i < VERT_ATTRIB_MAX; i++) {
+      if (prog_data->inputs_read & BITFIELD64_BIT(i)) {
+	 attribute_map[i] = payload_reg + nr_attributes;
+	 nr_attributes++;
+
+	 /* Do GL_FIXED rescaling for GLES2.0.  Our GL_FIXED
+	  * attributes come in as floating point conversions of the
+	  * integer values.
+	  */
+	 if (c->key.gl_fixed_input_size[i] != 0) {
+	    struct brw_reg reg = brw_vec8_grf(attribute_map[i], 0);
+
+	    brw_MUL(p,
+		    brw_writemask(reg, (1 << c->key.gl_fixed_input_size[i]) - 1),
+		    reg, brw_imm_f(1.0 / 65536.0));
+	 }
+      }
+   }
+
+   foreach_list(node, &this->instructions) {
+      vec4_instruction *inst = (vec4_instruction *)node;
+
+      for (int i = 0; i < 3; i++) {
+	 if (inst->src[i].file != ATTR)
+	    continue;
+
+	 int grf = attribute_map[inst->src[i].reg + inst->src[i].reg_offset];
+
+	 struct brw_reg reg = brw_vec8_grf(grf, 0);
+	 reg.dw1.bits.swizzle = inst->src[i].swizzle;
+	 if (inst->src[i].abs)
+	    reg = brw_abs(reg);
+	 if (inst->src[i].negate)
+	    reg = negate(reg);
+
+	 inst->src[i].file = HW_REG;
+	 inst->src[i].fixed_hw_reg = reg;
+      }
+   }
+
+   /* The BSpec says we always have to read at least one thing from
+    * the VF, and it appears that the hardware wedges otherwise.
+    */
+   if (nr_attributes == 0)
+      nr_attributes = 1;
+
+   prog_data->urb_read_length = (nr_attributes + 1) / 2;
+
+   return payload_reg + nr_attributes;
+}
+
+int
+vec4_visitor::setup_uniforms(int reg)
+{
+   /* User clip planes from curbe:
+    */
+   if (c->key.nr_userclip) {
+      if (intel->gen >= 6) {
+	 for (int i = 0; i < c->key.nr_userclip; i++) {
+	    c->userplane[i] = stride(brw_vec4_grf(reg + i / 2,
+						  (i % 2) * 4), 0, 4, 1);
+	 }
+	 reg += ALIGN(c->key.nr_userclip, 2) / 2;
+      } else {
+	 for (int i = 0; i < c->key.nr_userclip; i++) {
+	    c->userplane[i] = stride(brw_vec4_grf(reg + (6 + i) / 2,
+						  (i % 2) * 4), 0, 4, 1);
+	 }
+	 reg += (ALIGN(6 + c->key.nr_userclip, 4) / 4) * 2;
+      }
+   }
+
+   /* The pre-gen6 VS requires that some push constants get loaded no
+    * matter what, or the GPU would hang.
+    */
+   if (intel->gen < 6 && this->uniforms == 0) {
+      this->uniform_size[this->uniforms] = 1;
+
+      for (unsigned int i = 0; i < 4; i++) {
+	 unsigned int slot = this->uniforms * 4 + i;
+
+	 c->prog_data.param[slot] = NULL;
+	 c->prog_data.param_convert[slot] = PARAM_CONVERT_ZERO;
+      }
+
+      this->uniforms++;
+      reg++;
+   } else {
+      reg += ALIGN(uniforms, 2) / 2;
+   }
+
+   /* for now, we are not doing any elimination of unused slots, nor
+    * are we packing our uniforms.
+    */
+   c->prog_data.nr_params = this->uniforms * 4;
+
+   c->prog_data.curb_read_length = reg - 1;
+   c->prog_data.uses_new_param_layout = true;
+
+   return reg;
+}
+
+void
+vec4_visitor::setup_payload(void)
+{
+   int reg = 0;
+
+   /* The payload always contains important data in g0, which contains
+    * the URB handles that are passed on to the URB write at the end
+    * of the thread.  So, we always start push constants at g1.
+    */
+   reg++;
+
+   reg = setup_uniforms(reg);
+
+   reg = setup_attributes(reg);
+
+   this->first_non_payload_grf = reg;
+}
+
+struct brw_reg
+vec4_instruction::get_dst(void)
+{
+   struct brw_reg brw_reg;
+
+   switch (dst.file) {
+   case GRF:
+      brw_reg = brw_vec8_grf(dst.reg + dst.reg_offset, 0);
+      brw_reg = retype(brw_reg, dst.type);
+      brw_reg.dw1.bits.writemask = dst.writemask;
+      break;
+
+   case HW_REG:
+      brw_reg = dst.fixed_hw_reg;
+      break;
+
+   case BAD_FILE:
+      brw_reg = brw_null_reg();
+      break;
+
+   default:
+      assert(!"not reached");
+      brw_reg = brw_null_reg();
+      break;
+   }
+   return brw_reg;
+}
+
+struct brw_reg
+vec4_instruction::get_src(int i)
+{
+   struct brw_reg brw_reg;
+
+   switch (src[i].file) {
+   case GRF:
+      brw_reg = brw_vec8_grf(src[i].reg + src[i].reg_offset, 0);
+      brw_reg = retype(brw_reg, src[i].type);
+      brw_reg.dw1.bits.swizzle = src[i].swizzle;
+      if (src[i].abs)
+	 brw_reg = brw_abs(brw_reg);
+      if (src[i].negate)
+	 brw_reg = negate(brw_reg);
+      break;
+
+   case IMM:
+      switch (src[i].type) {
+      case BRW_REGISTER_TYPE_F:
+	 brw_reg = brw_imm_f(src[i].imm.f);
+	 break;
+      case BRW_REGISTER_TYPE_D:
+	 brw_reg = brw_imm_d(src[i].imm.i);
+	 break;
+      case BRW_REGISTER_TYPE_UD:
+	 brw_reg = brw_imm_ud(src[i].imm.u);
+	 break;
+      default:
+	 assert(!"not reached");
+	 brw_reg = brw_null_reg();
+	 break;
+      }
+      break;
+
+   case UNIFORM:
+      brw_reg = stride(brw_vec4_grf(1 + (src[i].reg + src[i].reg_offset) / 2,
+				    ((src[i].reg + src[i].reg_offset) % 2) * 4),
+		       0, 4, 1);
+      brw_reg = retype(brw_reg, src[i].type);
+      brw_reg.dw1.bits.swizzle = src[i].swizzle;
+      if (src[i].abs)
+	 brw_reg = brw_abs(brw_reg);
+      if (src[i].negate)
+	 brw_reg = negate(brw_reg);
+      break;
+
+   case HW_REG:
+      brw_reg = src[i].fixed_hw_reg;
+      break;
+
+   case BAD_FILE:
+      /* Probably unused. */
+      brw_reg = brw_null_reg();
+      break;
+   case ATTR:
+   default:
+      assert(!"not reached");
+      brw_reg = brw_null_reg();
+      break;
+   }
+
+   return brw_reg;
+}
+
+void
+vec4_visitor::generate_math1_gen4(vec4_instruction *inst,
+				  struct brw_reg dst,
+				  struct brw_reg src)
+{
+   brw_math(p,
+	    dst,
+	    brw_math_function(inst->opcode),
+	    BRW_MATH_SATURATE_NONE,
+	    inst->base_mrf,
+	    src,
+	    BRW_MATH_DATA_SCALAR,
+	    BRW_MATH_PRECISION_FULL);
+}
+
+static void
+check_gen6_math_src_arg(struct brw_reg src)
+{
+   /* Source swizzles are ignored. */
+   assert(!src.abs);
+   assert(!src.negate);
+   assert(src.dw1.bits.swizzle = BRW_SWIZZLE_XYZW);
+}
+
+void
+vec4_visitor::generate_math1_gen6(vec4_instruction *inst,
+				  struct brw_reg dst,
+				  struct brw_reg src)
+{
+   /* Can't do writemask because math can't be align16. */
+   assert(dst.dw1.bits.writemask == WRITEMASK_XYZW);
+   check_gen6_math_src_arg(src);
+
+   brw_set_access_mode(p, BRW_ALIGN_1);
+   brw_math(p,
+	    dst,
+	    brw_math_function(inst->opcode),
+	    BRW_MATH_SATURATE_NONE,
+	    inst->base_mrf,
+	    src,
+	    BRW_MATH_DATA_SCALAR,
+	    BRW_MATH_PRECISION_FULL);
+   brw_set_access_mode(p, BRW_ALIGN_16);
+}
+
+void
+vec4_visitor::generate_math2_gen6(vec4_instruction *inst,
+				  struct brw_reg dst,
+				  struct brw_reg src0,
+				  struct brw_reg src1)
+{
+   /* Can't do writemask because math can't be align16. */
+   assert(dst.dw1.bits.writemask == WRITEMASK_XYZW);
+   /* Source swizzles are ignored. */
+   check_gen6_math_src_arg(src0);
+   check_gen6_math_src_arg(src1);
+
+   brw_set_access_mode(p, BRW_ALIGN_1);
+   brw_math2(p,
+	     dst,
+	     brw_math_function(inst->opcode),
+	     src0, src1);
+   brw_set_access_mode(p, BRW_ALIGN_16);
+}
+
+void
+vec4_visitor::generate_math2_gen4(vec4_instruction *inst,
+				  struct brw_reg dst,
+				  struct brw_reg src0,
+				  struct brw_reg src1)
+{
+   /* Can't do writemask because math can't be align16. */
+   assert(dst.dw1.bits.writemask == WRITEMASK_XYZW);
+
+   brw_MOV(p, brw_message_reg(inst->base_mrf + 1), src1);
+
+   brw_set_access_mode(p, BRW_ALIGN_1);
+   brw_math(p,
+	    dst,
+	    brw_math_function(inst->opcode),
+	    BRW_MATH_SATURATE_NONE,
+	    inst->base_mrf,
+	    src0,
+	    BRW_MATH_DATA_VECTOR,
+	    BRW_MATH_PRECISION_FULL);
+   brw_set_access_mode(p, BRW_ALIGN_16);
+}
+
+void
+vec4_visitor::generate_urb_write(vec4_instruction *inst)
+{
+   brw_urb_WRITE(p,
+		 brw_null_reg(), /* dest */
+		 inst->base_mrf, /* starting mrf reg nr */
+		 brw_vec8_grf(0, 0), /* src */
+		 false,		/* allocate */
+		 true,		/* used */
+		 inst->mlen,
+		 0,		/* response len */
+		 inst->eot,	/* eot */
+		 inst->eot,	/* writes complete */
+		 inst->offset,	/* urb destination offset */
+		 BRW_URB_SWIZZLE_INTERLEAVE);
+}
+
+void
+vec4_visitor::generate_oword_dual_block_offsets(struct brw_reg m1,
+						struct brw_reg index)
+{
+   int second_vertex_offset;
+
+   if (intel->gen >= 6)
+      second_vertex_offset = 1;
+   else
+      second_vertex_offset = 16;
+
+   m1 = retype(m1, BRW_REGISTER_TYPE_D);
+
+   /* Set up M1 (message payload).  Only the block offsets in M1.0 and
+    * M1.4 are used, and the rest are ignored.
+    */
+   struct brw_reg m1_0 = suboffset(vec1(m1), 0);
+   struct brw_reg m1_4 = suboffset(vec1(m1), 4);
+   struct brw_reg index_0 = suboffset(vec1(index), 0);
+   struct brw_reg index_4 = suboffset(vec1(index), 4);
+
+   brw_push_insn_state(p);
+   brw_set_mask_control(p, BRW_MASK_DISABLE);
+   brw_set_access_mode(p, BRW_ALIGN_1);
+
+   brw_MOV(p, m1_0, index_0);
+
+   brw_set_predicate_inverse(p, true);
+   if (index.file == BRW_IMMEDIATE_VALUE) {
+      index_4.dw1.ud++;
+      brw_MOV(p, m1_4, index_4);
+   } else {
+      brw_ADD(p, m1_4, index_4, brw_imm_d(second_vertex_offset));
+   }
+
+   brw_pop_insn_state(p);
+}
+
+void
+vec4_visitor::generate_scratch_read(vec4_instruction *inst,
+				    struct brw_reg dst,
+				    struct brw_reg index)
+{
+   if (intel->gen >= 6) {
+      brw_push_insn_state(p);
+      brw_set_mask_control(p, BRW_MASK_DISABLE);
+      brw_MOV(p,
+	      retype(brw_message_reg(inst->base_mrf), BRW_REGISTER_TYPE_D),
+	      retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_D));
+      brw_pop_insn_state(p);
+   }
+
+   generate_oword_dual_block_offsets(brw_message_reg(inst->base_mrf + 1),
+				     index);
+
+   uint32_t msg_type;
+
+   if (intel->gen >= 6)
+      msg_type = GEN6_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ;
+   else if (intel->gen == 5 || intel->is_g4x)
+      msg_type = G45_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ;
+   else
+      msg_type = BRW_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ;
+
+   /* Each of the 8 channel enables is considered for whether each
+    * dword is written.
+    */
+   struct brw_instruction *send = brw_next_insn(p, BRW_OPCODE_SEND);
+   brw_set_dest(p, send, dst);
+   brw_set_src0(p, send, brw_message_reg(inst->base_mrf));
+   brw_set_dp_read_message(p, send,
+			   255, /* binding table index: stateless access */
+			   BRW_DATAPORT_OWORD_DUAL_BLOCK_1OWORD,
+			   msg_type,
+			   BRW_DATAPORT_READ_TARGET_RENDER_CACHE,
+			   2, /* mlen */
+			   1 /* rlen */);
+}
+
+void
+vec4_visitor::generate_scratch_write(vec4_instruction *inst,
+				     struct brw_reg dst,
+				     struct brw_reg src,
+				     struct brw_reg index)
+{
+   /* If the instruction is predicated, we'll predicate the send, not
+    * the header setup.
+    */
+   brw_set_predicate_control(p, false);
+
+   if (intel->gen >= 6) {
+      brw_push_insn_state(p);
+      brw_set_mask_control(p, BRW_MASK_DISABLE);
+      brw_MOV(p,
+	      retype(brw_message_reg(inst->base_mrf), BRW_REGISTER_TYPE_D),
+	      retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_D));
+      brw_pop_insn_state(p);
+   }
+
+   generate_oword_dual_block_offsets(brw_message_reg(inst->base_mrf + 1),
+				     index);
+
+   brw_MOV(p,
+	   retype(brw_message_reg(inst->base_mrf + 2), BRW_REGISTER_TYPE_D),
+	   retype(src, BRW_REGISTER_TYPE_D));
+
+   uint32_t msg_type;
+
+   if (intel->gen >= 6)
+      msg_type = GEN6_DATAPORT_WRITE_MESSAGE_OWORD_DUAL_BLOCK_WRITE;
+   else
+      msg_type = BRW_DATAPORT_WRITE_MESSAGE_OWORD_DUAL_BLOCK_WRITE;
+
+   brw_set_predicate_control(p, inst->predicate);
+
+   /* Each of the 8 channel enables is considered for whether each
+    * dword is written.
+    */
+   struct brw_instruction *send = brw_next_insn(p, BRW_OPCODE_SEND);
+   brw_set_dest(p, send, dst);
+   brw_set_src0(p, send, brw_message_reg(inst->base_mrf));
+   brw_set_dp_write_message(p, send,
+			    255, /* binding table index: stateless access */
+			    BRW_DATAPORT_OWORD_DUAL_BLOCK_1OWORD,
+			    msg_type,
+			    3, /* mlen */
+			    true, /* header present */
+			    false, /* pixel scoreboard */
+			    0, /* rlen */
+			    false, /* eot */
+			    false /* commit */);
+}
+
+void
+vec4_visitor::generate_vs_instruction(vec4_instruction *instruction,
+				      struct brw_reg dst,
+				      struct brw_reg *src)
+{
+   vec4_instruction *inst = (vec4_instruction *)instruction;
+
+   switch (inst->opcode) {
+   case SHADER_OPCODE_RCP:
+   case SHADER_OPCODE_RSQ:
+   case SHADER_OPCODE_SQRT:
+   case SHADER_OPCODE_EXP2:
+   case SHADER_OPCODE_LOG2:
+   case SHADER_OPCODE_SIN:
+   case SHADER_OPCODE_COS:
+      if (intel->gen >= 6) {
+	 generate_math1_gen6(inst, dst, src[0]);
+      } else {
+	 generate_math1_gen4(inst, dst, src[0]);
+      }
+      break;
+
+   case SHADER_OPCODE_POW:
+      if (intel->gen >= 6) {
+	 generate_math2_gen6(inst, dst, src[0], src[1]);
+      } else {
+	 generate_math2_gen4(inst, dst, src[0], src[1]);
+      }
+      break;
+
+   case VS_OPCODE_URB_WRITE:
+      generate_urb_write(inst);
+      break;
+
+   case VS_OPCODE_SCRATCH_READ:
+      generate_scratch_read(inst, dst, src[0]);
+      break;
+
+   case VS_OPCODE_SCRATCH_WRITE:
+      generate_scratch_write(inst, dst, src[0], src[1]);
+      break;
+
+   default:
+      if (inst->opcode < (int)ARRAY_SIZE(brw_opcodes)) {
+	 fail("unsupported opcode in `%s' in VS\n",
+	      brw_opcodes[inst->opcode].name);
+      } else {
+	 fail("Unsupported opcode %d in VS", inst->opcode);
+      }
+   }
+}
+
+bool
+vec4_visitor::run()
+{
+   /* Generate VS IR for main().  (the visitor only descends into
+    * functions called "main").
+    */
+   visit_instructions(shader->ir);
+
+   emit_urb_writes();
+
+   /* Before any optimization, push array accesses out to scratch
+    * space where we need them to be.  This pass may allocate new
+    * virtual GRFs, so we want to do it early.  It also makes sure
+    * that we have reladdr computations available for CSE, since we'll
+    * often do repeated subexpressions for those.
+    */
+   move_grf_array_access_to_scratch();
+
+   bool progress;
+   do {
+      progress = false;
+      progress = dead_code_eliminate() || progress;
+   } while (progress);
+
+   if (failed)
+      return false;
+
+   setup_payload();
+   reg_allocate();
+
+   if (failed)
+      return false;
+
+   brw_set_access_mode(p, BRW_ALIGN_16);
+
+   generate_code();
+
+   return !failed;
+}
+
+void
+vec4_visitor::generate_code()
+{
+   int last_native_inst = p->nr_insn;
+   const char *last_annotation_string = NULL;
+   ir_instruction *last_annotation_ir = NULL;
+
+   int loop_stack_array_size = 16;
+   int loop_stack_depth = 0;
+   brw_instruction **loop_stack =
+      rzalloc_array(this->mem_ctx, brw_instruction *, loop_stack_array_size);
+   int *if_depth_in_loop =
+      rzalloc_array(this->mem_ctx, int, loop_stack_array_size);
+
+
+   if (unlikely(INTEL_DEBUG & DEBUG_VS)) {
+      printf("Native code for vertex shader %d:\n", prog->Name);
+   }
+
+   foreach_list(node, &this->instructions) {
+      vec4_instruction *inst = (vec4_instruction *)node;
+      struct brw_reg src[3], dst;
+
+      if (unlikely(INTEL_DEBUG & DEBUG_VS)) {
+	 if (last_annotation_ir != inst->ir) {
+	    last_annotation_ir = inst->ir;
+	    if (last_annotation_ir) {
+	       printf("   ");
+	       last_annotation_ir->print();
+	       printf("\n");
+	    }
+	 }
+	 if (last_annotation_string != inst->annotation) {
+	    last_annotation_string = inst->annotation;
+	    if (last_annotation_string)
+	       printf("   %s\n", last_annotation_string);
+	 }
+      }
+
+      for (unsigned int i = 0; i < 3; i++) {
+	 src[i] = inst->get_src(i);
+      }
+      dst = inst->get_dst();
+
+      brw_set_conditionalmod(p, inst->conditional_mod);
+      brw_set_predicate_control(p, inst->predicate);
+      brw_set_predicate_inverse(p, inst->predicate_inverse);
+      brw_set_saturate(p, inst->saturate);
+
+      switch (inst->opcode) {
+      case BRW_OPCODE_MOV:
+	 brw_MOV(p, dst, src[0]);
+	 break;
+      case BRW_OPCODE_ADD:
+	 brw_ADD(p, dst, src[0], src[1]);
+	 break;
+      case BRW_OPCODE_MUL:
+	 brw_MUL(p, dst, src[0], src[1]);
+	 break;
+      case BRW_OPCODE_MACH:
+	 brw_set_acc_write_control(p, 1);
+	 brw_MACH(p, dst, src[0], src[1]);
+	 brw_set_acc_write_control(p, 0);
+	 break;
+
+      case BRW_OPCODE_FRC:
+	 brw_FRC(p, dst, src[0]);
+	 break;
+      case BRW_OPCODE_RNDD:
+	 brw_RNDD(p, dst, src[0]);
+	 break;
+      case BRW_OPCODE_RNDE:
+	 brw_RNDE(p, dst, src[0]);
+	 break;
+      case BRW_OPCODE_RNDZ:
+	 brw_RNDZ(p, dst, src[0]);
+	 break;
+
+      case BRW_OPCODE_AND:
+	 brw_AND(p, dst, src[0], src[1]);
+	 break;
+      case BRW_OPCODE_OR:
+	 brw_OR(p, dst, src[0], src[1]);
+	 break;
+      case BRW_OPCODE_XOR:
+	 brw_XOR(p, dst, src[0], src[1]);
+	 break;
+      case BRW_OPCODE_NOT:
+	 brw_NOT(p, dst, src[0]);
+	 break;
+      case BRW_OPCODE_ASR:
+	 brw_ASR(p, dst, src[0], src[1]);
+	 break;
+      case BRW_OPCODE_SHR:
+	 brw_SHR(p, dst, src[0], src[1]);
+	 break;
+      case BRW_OPCODE_SHL:
+	 brw_SHL(p, dst, src[0], src[1]);
+	 break;
+
+      case BRW_OPCODE_CMP:
+	 brw_CMP(p, dst, inst->conditional_mod, src[0], src[1]);
+	 break;
+      case BRW_OPCODE_SEL:
+	 brw_SEL(p, dst, src[0], src[1]);
+	 break;
+
+      case BRW_OPCODE_DP4:
+	 brw_DP4(p, dst, src[0], src[1]);
+	 break;
+
+      case BRW_OPCODE_DP3:
+	 brw_DP3(p, dst, src[0], src[1]);
+	 break;
+
+      case BRW_OPCODE_DP2:
+	 brw_DP2(p, dst, src[0], src[1]);
+	 break;
+
+      case BRW_OPCODE_IF:
+	 if (inst->src[0].file != BAD_FILE) {
+	    /* The instruction has an embedded compare (only allowed on gen6) */
+	    assert(intel->gen == 6);
+	    gen6_IF(p, inst->conditional_mod, src[0], src[1]);
+	 } else {
+	    struct brw_instruction *brw_inst = brw_IF(p, BRW_EXECUTE_8);
+	    brw_inst->header.predicate_control = inst->predicate;
+	 }
+	 if_depth_in_loop[loop_stack_depth]++;
+	 break;
+
+      case BRW_OPCODE_ELSE:
+	 brw_ELSE(p);
+	 break;
+      case BRW_OPCODE_ENDIF:
+	 brw_ENDIF(p);
+	 if_depth_in_loop[loop_stack_depth]--;
+	 break;
+
+      case BRW_OPCODE_DO:
+	 loop_stack[loop_stack_depth++] = brw_DO(p, BRW_EXECUTE_8);
+	 if (loop_stack_array_size <= loop_stack_depth) {
+	    loop_stack_array_size *= 2;
+	    loop_stack = reralloc(this->mem_ctx, loop_stack, brw_instruction *,
+				  loop_stack_array_size);
+	    if_depth_in_loop = reralloc(this->mem_ctx, if_depth_in_loop, int,
+				        loop_stack_array_size);
+	 }
+	 if_depth_in_loop[loop_stack_depth] = 0;
+	 break;
+
+      case BRW_OPCODE_BREAK:
+	 brw_BREAK(p, if_depth_in_loop[loop_stack_depth]);
+	 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
+	 break;
+      case BRW_OPCODE_CONTINUE:
+	 /* FINISHME: We need to write the loop instruction support still. */
+	 if (intel->gen >= 6)
+	    gen6_CONT(p, loop_stack[loop_stack_depth - 1]);
+	 else
+	    brw_CONT(p, if_depth_in_loop[loop_stack_depth]);
+	 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
+	 break;
+
+      case BRW_OPCODE_WHILE: {
+	 struct brw_instruction *inst0, *inst1;
+	 GLuint br = 1;
+
+	 if (intel->gen >= 5)
+	    br = 2;
+
+	 assert(loop_stack_depth > 0);
+	 loop_stack_depth--;
+	 inst0 = inst1 = brw_WHILE(p, loop_stack[loop_stack_depth]);
+	 if (intel->gen < 6) {
+	    /* patch all the BREAK/CONT instructions from last BGNLOOP */
+	    while (inst0 > loop_stack[loop_stack_depth]) {
+	       inst0--;
+	       if (inst0->header.opcode == BRW_OPCODE_BREAK &&
+		   inst0->bits3.if_else.jump_count == 0) {
+		  inst0->bits3.if_else.jump_count = br * (inst1 - inst0 + 1);
+	    }
+	       else if (inst0->header.opcode == BRW_OPCODE_CONTINUE &&
+			inst0->bits3.if_else.jump_count == 0) {
+		  inst0->bits3.if_else.jump_count = br * (inst1 - inst0);
+	       }
+	    }
+	 }
+      }
+	 break;
+
+      default:
+	 generate_vs_instruction(inst, dst, src);
+	 break;
+      }
+
+      if (unlikely(INTEL_DEBUG & DEBUG_VS)) {
+	 for (unsigned int i = last_native_inst; i < p->nr_insn; i++) {
+	    if (0) {
+	       printf("0x%08x 0x%08x 0x%08x 0x%08x ",
+		      ((uint32_t *)&p->store[i])[3],
+		      ((uint32_t *)&p->store[i])[2],
+		      ((uint32_t *)&p->store[i])[1],
+		      ((uint32_t *)&p->store[i])[0]);
+	    }
+	    brw_disasm(stdout, &p->store[i], intel->gen);
+	 }
+      }
+
+      last_native_inst = p->nr_insn;
+   }
+
+   if (unlikely(INTEL_DEBUG & DEBUG_VS)) {
+      printf("\n");
+   }
+
+   ralloc_free(loop_stack);
+   ralloc_free(if_depth_in_loop);
+
+   brw_set_uip_jip(p);
+
+   /* OK, while the INTEL_DEBUG=vs above is very nice for debugging VS
+    * emit issues, it doesn't get the jump distances into the output,
+    * which is often something we want to debug.  So this is here in
+    * case you're doing that.
+    */
+   if (0) {
+      if (unlikely(INTEL_DEBUG & DEBUG_VS)) {
+	 for (unsigned int i = 0; i < p->nr_insn; i++) {
+	    printf("0x%08x 0x%08x 0x%08x 0x%08x ",
+		   ((uint32_t *)&p->store[i])[3],
+		   ((uint32_t *)&p->store[i])[2],
+		   ((uint32_t *)&p->store[i])[1],
+		   ((uint32_t *)&p->store[i])[0]);
+	    brw_disasm(stdout, &p->store[i], intel->gen);
+	 }
+      }
+   }
+}
+
+extern "C" {
+
+bool
+brw_vs_emit(struct gl_shader_program *prog, struct brw_vs_compile *c)
+{
+   if (!prog)
+      return false;
+
+   struct brw_shader *shader =
+     (brw_shader *) prog->_LinkedShaders[MESA_SHADER_VERTEX];
+   if (!shader)
+      return false;
+
+   if (unlikely(INTEL_DEBUG & DEBUG_VS)) {
+      printf("GLSL IR for native vertex shader %d:\n", prog->Name);
+      _mesa_print_ir(shader->ir, NULL);
+      printf("\n\n");
+   }
+
+   vec4_visitor v(c, prog, shader);
+   if (!v.run()) {
+      prog->LinkStatus = GL_FALSE;
+      ralloc_strcat(&prog->InfoLog, v.fail_msg);
+      return false;
+   }
+
+   return true;
+}
+
+} /* extern "C" */
+
+} /* namespace brw */
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_reg_allocate.cpp b/src/mesa/drivers/dri/i965/brw_vec4_reg_allocate.cpp
new file mode 100644
index 00000000000..3f052ff64cf
--- /dev/null
+++ b/src/mesa/drivers/dri/i965/brw_vec4_reg_allocate.cpp
@@ -0,0 +1,234 @@
+/*
+ * Copyright © 2011 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+extern "C" {
+#include "main/macros.h"
+#include "program/register_allocate.h"
+} /* extern "C" */
+
+#include "brw_vec4.h"
+#include "../glsl/ir_print_visitor.h"
+
+using namespace brw;
+
+namespace brw {
+
+static void
+assign(int *reg_hw_locations, reg *reg)
+{
+   if (reg->file == GRF) {
+      reg->reg = reg_hw_locations[reg->reg];
+   }
+}
+
+void
+vec4_visitor::reg_allocate_trivial()
+{
+   int hw_reg_mapping[this->virtual_grf_count];
+   bool virtual_grf_used[this->virtual_grf_count];
+   int i;
+   int next;
+
+   /* Calculate which virtual GRFs are actually in use after whatever
+    * optimization passes have occurred.
+    */
+   for (int i = 0; i < this->virtual_grf_count; i++) {
+      virtual_grf_used[i] = false;
+   }
+
+   foreach_iter(exec_list_iterator, iter, this->instructions) {
+      vec4_instruction *inst = (vec4_instruction *)iter.get();
+
+      if (inst->dst.file == GRF)
+	 virtual_grf_used[inst->dst.reg] = true;
+
+      for (int i = 0; i < 3; i++) {
+	 if (inst->src[i].file == GRF)
+	    virtual_grf_used[inst->src[i].reg] = true;
+      }
+   }
+
+   hw_reg_mapping[0] = this->first_non_payload_grf;
+   next = hw_reg_mapping[0] + this->virtual_grf_sizes[0];
+   for (i = 1; i < this->virtual_grf_count; i++) {
+      if (virtual_grf_used[i]) {
+	 hw_reg_mapping[i] = next;
+	 next += this->virtual_grf_sizes[i];
+      }
+   }
+   prog_data->total_grf = next;
+
+   foreach_iter(exec_list_iterator, iter, this->instructions) {
+      vec4_instruction *inst = (vec4_instruction *)iter.get();
+
+      assign(hw_reg_mapping, &inst->dst);
+      assign(hw_reg_mapping, &inst->src[0]);
+      assign(hw_reg_mapping, &inst->src[1]);
+      assign(hw_reg_mapping, &inst->src[2]);
+   }
+
+   if (prog_data->total_grf > BRW_MAX_GRF) {
+      fail("Ran out of regs on trivial allocator (%d/%d)\n",
+	   prog_data->total_grf, BRW_MAX_GRF);
+   }
+}
+
+static void
+brw_alloc_reg_set_for_classes(struct brw_context *brw,
+			      int *class_sizes,
+			      int class_count,
+			      int base_reg_count)
+{
+   /* Compute the total number of registers across all classes. */
+   int ra_reg_count = 0;
+   for (int i = 0; i < class_count; i++) {
+      ra_reg_count += base_reg_count - (class_sizes[i] - 1);
+   }
+
+   ralloc_free(brw->vs.ra_reg_to_grf);
+   brw->vs.ra_reg_to_grf = ralloc_array(brw, uint8_t, ra_reg_count);
+   ralloc_free(brw->vs.regs);
+   brw->vs.regs = ra_alloc_reg_set(ra_reg_count);
+   ralloc_free(brw->vs.classes);
+   brw->vs.classes = ralloc_array(brw, int, class_count + 1);
+
+   /* Now, add the registers to their classes, and add the conflicts
+    * between them and the base GRF registers (and also each other).
+    */
+   int reg = 0;
+   for (int i = 0; i < class_count; i++) {
+      int class_reg_count = base_reg_count - (class_sizes[i] - 1);
+      brw->vs.classes[i] = ra_alloc_reg_class(brw->vs.regs);
+
+      for (int j = 0; j < class_reg_count; j++) {
+	 ra_class_add_reg(brw->vs.regs, brw->vs.classes[i], reg);
+
+	 brw->vs.ra_reg_to_grf[reg] = j;
+
+	 for (int base_reg = j;
+	      base_reg < j + class_sizes[i];
+	      base_reg++) {
+	    ra_add_transitive_reg_conflict(brw->vs.regs, base_reg, reg);
+	 }
+
+	 reg++;
+      }
+   }
+   assert(reg == ra_reg_count);
+
+   ra_set_finalize(brw->vs.regs);
+}
+
+void
+vec4_visitor::reg_allocate()
+{
+   int hw_reg_mapping[virtual_grf_count];
+   int first_assigned_grf = this->first_non_payload_grf;
+   int base_reg_count = BRW_MAX_GRF - first_assigned_grf;
+   int class_sizes[base_reg_count];
+   int class_count = 0;
+
+   /* Using the trivial allocator can be useful in debugging undefined
+    * register access as a result of broken optimization passes.
+    */
+   if (0) {
+      reg_allocate_trivial();
+      return;
+   }
+
+   calculate_live_intervals();
+
+   /* Set up the register classes.
+    *
+    * The base registers store a vec4.  However, we'll need larger
+    * storage for arrays, structures, and matrices, which will be sets
+    * of contiguous registers.
+    */
+   class_sizes[class_count++] = 1;
+
+   for (int r = 0; r < virtual_grf_count; r++) {
+      int i;
+
+      for (i = 0; i < class_count; i++) {
+	 if (class_sizes[i] == this->virtual_grf_sizes[r])
+	    break;
+      }
+      if (i == class_count) {
+	 if (this->virtual_grf_sizes[r] >= base_reg_count) {
+	    fail("Object too large to register allocate.\n");
+	 }
+
+	 class_sizes[class_count++] = this->virtual_grf_sizes[r];
+      }
+   }
+
+   brw_alloc_reg_set_for_classes(brw, class_sizes, class_count, base_reg_count);
+
+   struct ra_graph *g = ra_alloc_interference_graph(brw->vs.regs,
+						    virtual_grf_count);
+
+   for (int i = 0; i < virtual_grf_count; i++) {
+      for (int c = 0; c < class_count; c++) {
+	 if (class_sizes[c] == this->virtual_grf_sizes[i]) {
+	    ra_set_node_class(g, i, brw->vs.classes[c]);
+	    break;
+	 }
+      }
+
+      for (int j = 0; j < i; j++) {
+	 if (virtual_grf_interferes(i, j)) {
+	    ra_add_node_interference(g, i, j);
+	 }
+      }
+   }
+
+   if (!ra_allocate_no_spills(g)) {
+      ralloc_free(g);
+      fail("No register spilling support yet\n");
+   }
+
+   /* Get the chosen virtual registers for each node, and map virtual
+    * regs in the register classes back down to real hardware reg
+    * numbers.
+    */
+   prog_data->total_grf = first_assigned_grf;
+   for (int i = 0; i < virtual_grf_count; i++) {
+      int reg = ra_get_node_reg(g, i);
+
+      hw_reg_mapping[i] = first_assigned_grf + brw->vs.ra_reg_to_grf[reg];
+      prog_data->total_grf = MAX2(prog_data->total_grf, hw_reg_mapping[i] + 1);
+   }
+
+   foreach_list(node, &this->instructions) {
+      vec4_instruction *inst = (vec4_instruction *)node;
+
+      assign(hw_reg_mapping, &inst->dst);
+      assign(hw_reg_mapping, &inst->src[0]);
+      assign(hw_reg_mapping, &inst->src[1]);
+      assign(hw_reg_mapping, &inst->src[2]);
+   }
+
+   ralloc_free(g);
+}
+
+} /* namespace brw */
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp b/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
new file mode 100644
index 00000000000..b3a07bd0539
--- /dev/null
+++ b/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
@@ -0,0 +1,2156 @@
+/*
+ * Copyright © 2011 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "brw_vec4.h"
+extern "C" {
+#include "main/macros.h"
+#include "program/prog_parameter.h"
+}
+
+namespace brw {
+
+src_reg::src_reg(dst_reg reg)
+{
+   init();
+
+   this->file = reg.file;
+   this->reg = reg.reg;
+   this->reg_offset = reg.reg_offset;
+   this->type = reg.type;
+   this->reladdr = reg.reladdr;
+   this->fixed_hw_reg = reg.fixed_hw_reg;
+
+   int swizzles[4];
+   int next_chan = 0;
+   int last = 0;
+
+   for (int i = 0; i < 4; i++) {
+      if (!(reg.writemask & (1 << i)))
+	 continue;
+
+      swizzles[next_chan++] = last = i;
+   }
+
+   for (; next_chan < 4; next_chan++) {
+      swizzles[next_chan] = last;
+   }
+
+   this->swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
+				swizzles[2], swizzles[3]);
+}
+
+dst_reg::dst_reg(src_reg reg)
+{
+   init();
+
+   this->file = reg.file;
+   this->reg = reg.reg;
+   this->reg_offset = reg.reg_offset;
+   this->type = reg.type;
+   this->writemask = WRITEMASK_XYZW;
+   this->reladdr = reg.reladdr;
+   this->fixed_hw_reg = reg.fixed_hw_reg;
+}
+
+vec4_instruction *
+vec4_visitor::emit(enum opcode opcode, dst_reg dst,
+		   src_reg src0, src_reg src1, src_reg src2)
+{
+   vec4_instruction *inst = new(mem_ctx) vec4_instruction();
+
+   inst->opcode = opcode;
+   inst->dst = dst;
+   inst->src[0] = src0;
+   inst->src[1] = src1;
+   inst->src[2] = src2;
+   inst->ir = this->base_ir;
+   inst->annotation = this->current_annotation;
+
+   this->instructions.push_tail(inst);
+
+   return inst;
+}
+
+
+vec4_instruction *
+vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0, src_reg src1)
+{
+   return emit(opcode, dst, src0, src1, src_reg());
+}
+
+vec4_instruction *
+vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0)
+{
+   assert(dst.writemask != 0);
+   return emit(opcode, dst, src0, src_reg(), src_reg());
+}
+
+vec4_instruction *
+vec4_visitor::emit(enum opcode opcode)
+{
+   return emit(opcode, dst_reg(), src_reg(), src_reg(), src_reg());
+}
+
+void
+vec4_visitor::emit_dp(dst_reg dst, src_reg src0, src_reg src1, unsigned elements)
+{
+   static enum opcode dot_opcodes[] = {
+      BRW_OPCODE_DP2, BRW_OPCODE_DP3, BRW_OPCODE_DP4
+   };
+
+   emit(dot_opcodes[elements - 2], dst, src0, src1);
+}
+
+void
+vec4_visitor::emit_math1_gen6(enum opcode opcode, dst_reg dst, src_reg src)
+{
+   /* The gen6 math instruction ignores the source modifiers --
+    * swizzle, abs, negate, and at least some parts of the register
+    * region description.
+    */
+   src_reg temp_src = src_reg(this, glsl_type::vec4_type);
+   emit(BRW_OPCODE_MOV, dst_reg(temp_src), src);
+
+   if (dst.writemask != WRITEMASK_XYZW) {
+      /* The gen6 math instruction must be align1, so we can't do
+       * writemasks.
+       */
+      dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
+
+      emit(opcode, temp_dst, temp_src);
+
+      emit(BRW_OPCODE_MOV, dst, src_reg(temp_dst));
+   } else {
+      emit(opcode, dst, temp_src);
+   }
+}
+
+void
+vec4_visitor::emit_math1_gen4(enum opcode opcode, dst_reg dst, src_reg src)
+{
+   vec4_instruction *inst = emit(opcode, dst, src);
+   inst->base_mrf = 1;
+   inst->mlen = 1;
+}
+
+void
+vec4_visitor::emit_math(opcode opcode, dst_reg dst, src_reg src)
+{
+   switch (opcode) {
+   case SHADER_OPCODE_RCP:
+   case SHADER_OPCODE_RSQ:
+   case SHADER_OPCODE_SQRT:
+   case SHADER_OPCODE_EXP2:
+   case SHADER_OPCODE_LOG2:
+   case SHADER_OPCODE_SIN:
+   case SHADER_OPCODE_COS:
+      break;
+   default:
+      assert(!"not reached: bad math opcode");
+      return;
+   }
+
+   if (intel->gen >= 6) {
+      return emit_math1_gen6(opcode, dst, src);
+   } else {
+      return emit_math1_gen4(opcode, dst, src);
+   }
+}
+
+void
+vec4_visitor::emit_math2_gen6(enum opcode opcode,
+			      dst_reg dst, src_reg src0, src_reg src1)
+{
+   src_reg expanded;
+
+   /* The gen6 math instruction ignores the source modifiers --
+    * swizzle, abs, negate, and at least some parts of the register
+    * region description.  Move the sources to temporaries to make it
+    * generally work.
+    */
+
+   expanded = src_reg(this, glsl_type::vec4_type);
+   emit(BRW_OPCODE_MOV, dst_reg(expanded), src0);
+   src0 = expanded;
+
+   expanded = src_reg(this, glsl_type::vec4_type);
+   emit(BRW_OPCODE_MOV, dst_reg(expanded), src1);
+   src1 = expanded;
+
+   if (dst.writemask != WRITEMASK_XYZW) {
+      /* The gen6 math instruction must be align1, so we can't do
+       * writemasks.
+       */
+      dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
+
+      emit(opcode, temp_dst, src0, src1);
+
+      emit(BRW_OPCODE_MOV, dst, src_reg(temp_dst));
+   } else {
+      emit(opcode, dst, src0, src1);
+   }
+}
+
+void
+vec4_visitor::emit_math2_gen4(enum opcode opcode,
+			      dst_reg dst, src_reg src0, src_reg src1)
+{
+   vec4_instruction *inst = emit(opcode, dst, src0, src1);
+   inst->base_mrf = 1;
+   inst->mlen = 2;
+}
+
+void
+vec4_visitor::emit_math(enum opcode opcode,
+			dst_reg dst, src_reg src0, src_reg src1)
+{
+   assert(opcode == SHADER_OPCODE_POW);
+
+   if (intel->gen >= 6) {
+      return emit_math2_gen6(opcode, dst, src0, src1);
+   } else {
+      return emit_math2_gen4(opcode, dst, src0, src1);
+   }
+}
+
+void
+vec4_visitor::visit_instructions(const exec_list *list)
+{
+   foreach_list(node, list) {
+      ir_instruction *ir = (ir_instruction *)node;
+
+      base_ir = ir;
+      ir->accept(this);
+   }
+}
+
+
+static int
+type_size(const struct glsl_type *type)
+{
+   unsigned int i;
+   int size;
+
+   switch (type->base_type) {
+   case GLSL_TYPE_UINT:
+   case GLSL_TYPE_INT:
+   case GLSL_TYPE_FLOAT:
+   case GLSL_TYPE_BOOL:
+      if (type->is_matrix()) {
+	 return type->matrix_columns;
+      } else {
+	 /* Regardless of size of vector, it gets a vec4. This is bad
+	  * packing for things like floats, but otherwise arrays become a
+	  * mess.  Hopefully a later pass over the code can pack scalars
+	  * down if appropriate.
+	  */
+	 return 1;
+      }
+   case GLSL_TYPE_ARRAY:
+      assert(type->length > 0);
+      return type_size(type->fields.array) * type->length;
+   case GLSL_TYPE_STRUCT:
+      size = 0;
+      for (i = 0; i < type->length; i++) {
+	 size += type_size(type->fields.structure[i].type);
+      }
+      return size;
+   case GLSL_TYPE_SAMPLER:
+      /* Samplers take up one slot in UNIFORMS[], but they're baked in
+       * at link time.
+       */
+      return 1;
+   default:
+      assert(0);
+      return 0;
+   }
+}
+
+int
+vec4_visitor::virtual_grf_alloc(int size)
+{
+   if (virtual_grf_array_size <= virtual_grf_count) {
+      if (virtual_grf_array_size == 0)
+	 virtual_grf_array_size = 16;
+      else
+	 virtual_grf_array_size *= 2;
+      virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
+				   virtual_grf_array_size);
+   }
+   virtual_grf_sizes[virtual_grf_count] = size;
+   return virtual_grf_count++;
+}
+
+src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
+{
+   init();
+
+   this->file = GRF;
+   this->reg = v->virtual_grf_alloc(type_size(type));
+
+   if (type->is_array() || type->is_record()) {
+      this->swizzle = BRW_SWIZZLE_NOOP;
+   } else {
+      this->swizzle = swizzle_for_size(type->vector_elements);
+   }
+
+   this->type = brw_type_for_base_type(type);
+}
+
+dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
+{
+   init();
+
+   this->file = GRF;
+   this->reg = v->virtual_grf_alloc(type_size(type));
+
+   if (type->is_array() || type->is_record()) {
+      this->writemask = WRITEMASK_XYZW;
+   } else {
+      this->writemask = (1 << type->vector_elements) - 1;
+   }
+
+   this->type = brw_type_for_base_type(type);
+}
+
+/* Our support for uniforms is piggy-backed on the struct
+ * gl_fragment_program, because that's where the values actually
+ * get stored, rather than in some global gl_shader_program uniform
+ * store.
+ */
+int
+vec4_visitor::setup_uniform_values(int loc, const glsl_type *type)
+{
+   unsigned int offset = 0;
+   float *values = &this->vp->Base.Parameters->ParameterValues[loc][0].f;
+
+   if (type->is_matrix()) {
+      const glsl_type *column = glsl_type::get_instance(GLSL_TYPE_FLOAT,
+							type->vector_elements,
+							1);
+
+      for (unsigned int i = 0; i < type->matrix_columns; i++) {
+	 offset += setup_uniform_values(loc + offset, column);
+      }
+
+      return offset;
+   }
+
+   switch (type->base_type) {
+   case GLSL_TYPE_FLOAT:
+   case GLSL_TYPE_UINT:
+   case GLSL_TYPE_INT:
+   case GLSL_TYPE_BOOL:
+      for (unsigned int i = 0; i < type->vector_elements; i++) {
+	 int slot = this->uniforms * 4 + i;
+	 switch (type->base_type) {
+	 case GLSL_TYPE_FLOAT:
+	    c->prog_data.param_convert[slot] = PARAM_NO_CONVERT;
+	    break;
+	 case GLSL_TYPE_UINT:
+	    c->prog_data.param_convert[slot] = PARAM_CONVERT_F2U;
+	    break;
+	 case GLSL_TYPE_INT:
+	    c->prog_data.param_convert[slot] = PARAM_CONVERT_F2I;
+	    break;
+	 case GLSL_TYPE_BOOL:
+	    c->prog_data.param_convert[slot] = PARAM_CONVERT_F2B;
+	    break;
+	 default:
+	    assert(!"not reached");
+	    c->prog_data.param_convert[slot] = PARAM_NO_CONVERT;
+	    break;
+	 }
+	 c->prog_data.param[slot] = &values[i];
+      }
+
+      for (unsigned int i = type->vector_elements; i < 4; i++) {
+	 c->prog_data.param_convert[this->uniforms * 4 + i] =
+	    PARAM_CONVERT_ZERO;
+	 c->prog_data.param[this->uniforms * 4 + i] = NULL;
+      }
+
+      this->uniform_size[this->uniforms] = type->vector_elements;
+      this->uniforms++;
+
+      return 1;
+
+   case GLSL_TYPE_STRUCT:
+      for (unsigned int i = 0; i < type->length; i++) {
+	 offset += setup_uniform_values(loc + offset,
+					type->fields.structure[i].type);
+      }
+      return offset;
+
+   case GLSL_TYPE_ARRAY:
+      for (unsigned int i = 0; i < type->length; i++) {
+	 offset += setup_uniform_values(loc + offset, type->fields.array);
+      }
+      return offset;
+
+   case GLSL_TYPE_SAMPLER:
+      /* The sampler takes up a slot, but we don't use any values from it. */
+      return 1;
+
+   default:
+      assert(!"not reached");
+      return 0;
+   }
+}
+
+/* Our support for builtin uniforms is even scarier than non-builtin.
+ * It sits on top of the PROG_STATE_VAR parameters that are
+ * automatically updated from GL context state.
+ */
+void
+vec4_visitor::setup_builtin_uniform_values(ir_variable *ir)
+{
+   const ir_state_slot *const slots = ir->state_slots;
+   assert(ir->state_slots != NULL);
+
+   for (unsigned int i = 0; i < ir->num_state_slots; i++) {
+      /* This state reference has already been setup by ir_to_mesa,
+       * but we'll get the same index back here.  We can reference
+       * ParameterValues directly, since unlike brw_fs.cpp, we never
+       * add new state references during compile.
+       */
+      int index = _mesa_add_state_reference(this->vp->Base.Parameters,
+					    (gl_state_index *)slots[i].tokens);
+      float *values = &this->vp->Base.Parameters->ParameterValues[index][0].f;
+
+      this->uniform_size[this->uniforms] = 0;
+      /* Add each of the unique swizzled channels of the element.
+       * This will end up matching the size of the glsl_type of this field.
+       */
+      int last_swiz = -1;
+      for (unsigned int j = 0; j < 4; j++) {
+	 int swiz = GET_SWZ(slots[i].swizzle, j);
+	 last_swiz = swiz;
+
+	 c->prog_data.param[this->uniforms * 4 + j] = &values[swiz];
+	 c->prog_data.param_convert[this->uniforms * 4 + j] = PARAM_NO_CONVERT;
+	 if (swiz <= last_swiz)
+	    this->uniform_size[this->uniforms]++;
+      }
+      this->uniforms++;
+   }
+}
+
+dst_reg *
+vec4_visitor::variable_storage(ir_variable *var)
+{
+   return (dst_reg *)hash_table_find(this->variable_ht, var);
+}
+
+void
+vec4_visitor::emit_bool_to_cond_code(ir_rvalue *ir)
+{
+   ir_expression *expr = ir->as_expression();
+
+   if (expr) {
+      src_reg op[2];
+      vec4_instruction *inst;
+
+      assert(expr->get_num_operands() <= 2);
+      for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
+	 assert(expr->operands[i]->type->is_scalar());
+
+	 expr->operands[i]->accept(this);
+	 op[i] = this->result;
+      }
+
+      switch (expr->operation) {
+      case ir_unop_logic_not:
+	 inst = emit(BRW_OPCODE_AND, dst_null_d(), op[0], src_reg(1));
+	 inst->conditional_mod = BRW_CONDITIONAL_Z;
+	 break;
+
+      case ir_binop_logic_xor:
+	 inst = emit(BRW_OPCODE_XOR, dst_null_d(), op[0], op[1]);
+	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
+	 break;
+
+      case ir_binop_logic_or:
+	 inst = emit(BRW_OPCODE_OR, dst_null_d(), op[0], op[1]);
+	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
+	 break;
+
+      case ir_binop_logic_and:
+	 inst = emit(BRW_OPCODE_AND, dst_null_d(), op[0], op[1]);
+	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
+	 break;
+
+      case ir_unop_f2b:
+	 if (intel->gen >= 6) {
+	    inst = emit(BRW_OPCODE_CMP, dst_null_d(), op[0], src_reg(0.0f));
+	 } else {
+	    inst = emit(BRW_OPCODE_MOV, dst_null_f(), op[0]);
+	 }
+	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
+	 break;
+
+      case ir_unop_i2b:
+	 if (intel->gen >= 6) {
+	    inst = emit(BRW_OPCODE_CMP, dst_null_d(), op[0], src_reg(0));
+	 } else {
+	    inst = emit(BRW_OPCODE_MOV, dst_null_d(), op[0]);
+	 }
+	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
+	 break;
+
+      case ir_binop_greater:
+      case ir_binop_gequal:
+      case ir_binop_less:
+      case ir_binop_lequal:
+      case ir_binop_equal:
+      case ir_binop_all_equal:
+      case ir_binop_nequal:
+      case ir_binop_any_nequal:
+	 inst = emit(BRW_OPCODE_CMP, dst_null_cmp(), op[0], op[1]);
+	 inst->conditional_mod =
+	    brw_conditional_for_comparison(expr->operation);
+	 break;
+
+      default:
+	 assert(!"not reached");
+	 break;
+      }
+      return;
+   }
+
+   ir->accept(this);
+
+   if (intel->gen >= 6) {
+      vec4_instruction *inst = emit(BRW_OPCODE_AND, dst_null_d(),
+			       this->result, src_reg(1));
+      inst->conditional_mod = BRW_CONDITIONAL_NZ;
+   } else {
+      vec4_instruction *inst = emit(BRW_OPCODE_MOV, dst_null_d(), this->result);
+      inst->conditional_mod = BRW_CONDITIONAL_NZ;
+   }
+}
+
+/**
+ * Emit a gen6 IF statement with the comparison folded into the IF
+ * instruction.
+ */
+void
+vec4_visitor::emit_if_gen6(ir_if *ir)
+{
+   ir_expression *expr = ir->condition->as_expression();
+
+   if (expr) {
+      src_reg op[2];
+      vec4_instruction *inst;
+      dst_reg temp;
+
+      assert(expr->get_num_operands() <= 2);
+      for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
+	 expr->operands[i]->accept(this);
+	 op[i] = this->result;
+      }
+
+      switch (expr->operation) {
+      case ir_unop_logic_not:
+	 inst = emit(BRW_OPCODE_IF, dst_null_d(), op[0], src_reg(0));
+	 inst->conditional_mod = BRW_CONDITIONAL_Z;
+	 return;
+
+      case ir_binop_logic_xor:
+	 inst = emit(BRW_OPCODE_IF, dst_null_d(), op[0], op[1]);
+	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
+	 return;
+
+      case ir_binop_logic_or:
+	 temp = dst_reg(this, glsl_type::bool_type);
+	 emit(BRW_OPCODE_OR, temp, op[0], op[1]);
+	 inst = emit(BRW_OPCODE_IF, dst_null_d(), src_reg(temp), src_reg(0));
+	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
+	 return;
+
+      case ir_binop_logic_and:
+	 temp = dst_reg(this, glsl_type::bool_type);
+	 emit(BRW_OPCODE_AND, temp, op[0], op[1]);
+	 inst = emit(BRW_OPCODE_IF, dst_null_d(), src_reg(temp), src_reg(0));
+	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
+	 return;
+
+      case ir_unop_f2b:
+	 inst = emit(BRW_OPCODE_IF, dst_null_f(), op[0], src_reg(0));
+	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
+	 return;
+
+      case ir_unop_i2b:
+	 inst = emit(BRW_OPCODE_IF, dst_null_d(), op[0], src_reg(0));
+	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
+	 return;
+
+      case ir_binop_greater:
+      case ir_binop_gequal:
+      case ir_binop_less:
+      case ir_binop_lequal:
+      case ir_binop_equal:
+      case ir_binop_nequal:
+	 inst = emit(BRW_OPCODE_IF, dst_null_d(), op[0], op[1]);
+	 inst->conditional_mod =
+	    brw_conditional_for_comparison(expr->operation);
+	 return;
+
+      case ir_binop_all_equal:
+	 inst = emit(BRW_OPCODE_CMP, dst_null_d(), op[0], op[1]);
+	 inst->conditional_mod = BRW_CONDITIONAL_Z;
+
+	 inst = emit(BRW_OPCODE_IF);
+	 inst->predicate = BRW_PREDICATE_ALIGN16_ALL4H;
+	 return;
+
+      case ir_binop_any_nequal:
+	 inst = emit(BRW_OPCODE_CMP, dst_null_d(), op[0], op[1]);
+	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
+
+	 inst = emit(BRW_OPCODE_IF);
+	 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
+	 return;
+
+      case ir_unop_any:
+	 inst = emit(BRW_OPCODE_CMP, dst_null_d(), op[0], src_reg(0));
+	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
+
+	 inst = emit(BRW_OPCODE_IF);
+	 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
+	 return;
+
+      default:
+	 assert(!"not reached");
+	 inst = emit(BRW_OPCODE_IF, dst_null_d(), op[0], src_reg(0));
+	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
+	 return;
+      }
+      return;
+   }
+
+   ir->condition->accept(this);
+
+   vec4_instruction *inst = emit(BRW_OPCODE_IF, dst_null_d(),
+			    this->result, src_reg(0));
+   inst->conditional_mod = BRW_CONDITIONAL_NZ;
+}
+
+void
+vec4_visitor::visit(ir_variable *ir)
+{
+   dst_reg *reg = NULL;
+
+   if (variable_storage(ir))
+      return;
+
+   switch (ir->mode) {
+   case ir_var_in:
+      reg = new(mem_ctx) dst_reg(ATTR, ir->location);
+      break;
+
+   case ir_var_out:
+      reg = new(mem_ctx) dst_reg(this, ir->type);
+
+      for (int i = 0; i < type_size(ir->type); i++) {
+	 output_reg[ir->location + i] = *reg;
+	 output_reg[ir->location + i].reg_offset = i;
+	 output_reg[ir->location + i].type = BRW_REGISTER_TYPE_F;
+      }
+      break;
+
+   case ir_var_auto:
+   case ir_var_temporary:
+      reg = new(mem_ctx) dst_reg(this, ir->type);
+      break;
+
+   case ir_var_uniform:
+      reg = new(this->mem_ctx) dst_reg(UNIFORM, this->uniforms);
+
+      if (!strncmp(ir->name, "gl_", 3)) {
+	 setup_builtin_uniform_values(ir);
+      } else {
+	 setup_uniform_values(ir->location, ir->type);
+      }
+      break;
+
+   default:
+      assert(!"not reached");
+   }
+
+   reg->type = brw_type_for_base_type(ir->type);
+   hash_table_insert(this->variable_ht, reg, ir);
+}
+
+void
+vec4_visitor::visit(ir_loop *ir)
+{
+   dst_reg counter;
+
+   /* We don't want debugging output to print the whole body of the
+    * loop as the annotation.
+    */
+   this->base_ir = NULL;
+
+   if (ir->counter != NULL) {
+      this->base_ir = ir->counter;
+      ir->counter->accept(this);
+      counter = *(variable_storage(ir->counter));
+
+      if (ir->from != NULL) {
+	 this->base_ir = ir->from;
+	 ir->from->accept(this);
+
+	 emit(BRW_OPCODE_MOV, counter, this->result);
+      }
+   }
+
+   emit(BRW_OPCODE_DO);
+
+   if (ir->to) {
+      this->base_ir = ir->to;
+      ir->to->accept(this);
+
+      vec4_instruction *inst = emit(BRW_OPCODE_CMP, dst_null_d(),
+				    src_reg(counter), this->result);
+      inst->conditional_mod = brw_conditional_for_comparison(ir->cmp);
+
+      inst = emit(BRW_OPCODE_BREAK);
+      inst->predicate = BRW_PREDICATE_NORMAL;
+   }
+
+   visit_instructions(&ir->body_instructions);
+
+
+   if (ir->increment) {
+      this->base_ir = ir->increment;
+      ir->increment->accept(this);
+      emit(BRW_OPCODE_ADD, counter, src_reg(counter), this->result);
+   }
+
+   emit(BRW_OPCODE_WHILE);
+}
+
+void
+vec4_visitor::visit(ir_loop_jump *ir)
+{
+   switch (ir->mode) {
+   case ir_loop_jump::jump_break:
+      emit(BRW_OPCODE_BREAK);
+      break;
+   case ir_loop_jump::jump_continue:
+      emit(BRW_OPCODE_CONTINUE);
+      break;
+   }
+}
+
+
+void
+vec4_visitor::visit(ir_function_signature *ir)
+{
+   assert(0);
+   (void)ir;
+}
+
+void
+vec4_visitor::visit(ir_function *ir)
+{
+   /* Ignore function bodies other than main() -- we shouldn't see calls to
+    * them since they should all be inlined.
+    */
+   if (strcmp(ir->name, "main") == 0) {
+      const ir_function_signature *sig;
+      exec_list empty;
+
+      sig = ir->matching_signature(&empty);
+
+      assert(sig);
+
+      visit_instructions(&sig->body);
+   }
+}
+
+GLboolean
+vec4_visitor::try_emit_sat(ir_expression *ir)
+{
+   ir_rvalue *sat_src = ir->as_rvalue_to_saturate();
+   if (!sat_src)
+      return false;
+
+   sat_src->accept(this);
+   src_reg src = this->result;
+
+   this->result = src_reg(this, ir->type);
+   vec4_instruction *inst;
+   inst = emit(BRW_OPCODE_MOV, dst_reg(this->result), src);
+   inst->saturate = true;
+
+   return true;
+}
+
+void
+vec4_visitor::emit_bool_comparison(unsigned int op,
+				 dst_reg dst, src_reg src0, src_reg src1)
+{
+   /* original gen4 does destination conversion before comparison. */
+   if (intel->gen < 5)
+      dst.type = src0.type;
+
+   vec4_instruction *inst = emit(BRW_OPCODE_CMP, dst, src0, src1);
+   inst->conditional_mod = brw_conditional_for_comparison(op);
+
+   dst.type = BRW_REGISTER_TYPE_D;
+   emit(BRW_OPCODE_AND, dst, src_reg(dst), src_reg(0x1));
+}
+
+void
+vec4_visitor::visit(ir_expression *ir)
+{
+   unsigned int operand;
+   src_reg op[Elements(ir->operands)];
+   src_reg result_src;
+   dst_reg result_dst;
+   vec4_instruction *inst;
+
+   if (try_emit_sat(ir))
+      return;
+
+   for (operand = 0; operand < ir->get_num_operands(); operand++) {
+      this->result.file = BAD_FILE;
+      ir->operands[operand]->accept(this);
+      if (this->result.file == BAD_FILE) {
+	 printf("Failed to get tree for expression operand:\n");
+	 ir->operands[operand]->print();
+	 exit(1);
+      }
+      op[operand] = this->result;
+
+      /* Matrix expression operands should have been broken down to vector
+       * operations already.
+       */
+      assert(!ir->operands[operand]->type->is_matrix());
+   }
+
+   int vector_elements = ir->operands[0]->type->vector_elements;
+   if (ir->operands[1]) {
+      vector_elements = MAX2(vector_elements,
+			     ir->operands[1]->type->vector_elements);
+   }
+
+   this->result.file = BAD_FILE;
+
+   /* Storage for our result.  Ideally for an assignment we'd be using
+    * the actual storage for the result here, instead.
+    */
+   result_src = src_reg(this, ir->type);
+   /* convenience for the emit functions below. */
+   result_dst = dst_reg(result_src);
+   /* If nothing special happens, this is the result. */
+   this->result = result_src;
+   /* Limit writes to the channels that will be used by result_src later.
+    * This does limit this temp's use as a temporary for multi-instruction
+    * sequences.
+    */
+   result_dst.writemask = (1 << ir->type->vector_elements) - 1;
+
+   switch (ir->operation) {
+   case ir_unop_logic_not:
+      /* Note that BRW_OPCODE_NOT is not appropriate here, since it is
+       * ones complement of the whole register, not just bit 0.
+       */
+      emit(BRW_OPCODE_XOR, result_dst, op[0], src_reg(1));
+      break;
+   case ir_unop_neg:
+      op[0].negate = !op[0].negate;
+      this->result = op[0];
+      break;
+   case ir_unop_abs:
+      op[0].abs = true;
+      op[0].negate = false;
+      this->result = op[0];
+      break;
+
+   case ir_unop_sign:
+      emit(BRW_OPCODE_MOV, result_dst, src_reg(0.0f));
+
+      inst = emit(BRW_OPCODE_CMP, dst_null_f(), op[0], src_reg(0.0f));
+      inst->conditional_mod = BRW_CONDITIONAL_G;
+      inst = emit(BRW_OPCODE_MOV, result_dst, src_reg(1.0f));
+      inst->predicate = BRW_PREDICATE_NORMAL;
+
+      inst = emit(BRW_OPCODE_CMP, dst_null_f(), op[0], src_reg(0.0f));
+      inst->conditional_mod = BRW_CONDITIONAL_L;
+      inst = emit(BRW_OPCODE_MOV, result_dst, src_reg(-1.0f));
+      inst->predicate = BRW_PREDICATE_NORMAL;
+
+      break;
+
+   case ir_unop_rcp:
+      emit_math(SHADER_OPCODE_RCP, result_dst, op[0]);
+      break;
+
+   case ir_unop_exp2:
+      emit_math(SHADER_OPCODE_EXP2, result_dst, op[0]);
+      break;
+   case ir_unop_log2:
+      emit_math(SHADER_OPCODE_LOG2, result_dst, op[0]);
+      break;
+   case ir_unop_exp:
+   case ir_unop_log:
+      assert(!"not reached: should be handled by ir_explog_to_explog2");
+      break;
+   case ir_unop_sin:
+   case ir_unop_sin_reduced:
+      emit_math(SHADER_OPCODE_SIN, result_dst, op[0]);
+      break;
+   case ir_unop_cos:
+   case ir_unop_cos_reduced:
+      emit_math(SHADER_OPCODE_COS, result_dst, op[0]);
+      break;
+
+   case ir_unop_dFdx:
+   case ir_unop_dFdy:
+      assert(!"derivatives not valid in vertex shader");
+      break;
+
+   case ir_unop_noise:
+      assert(!"not reached: should be handled by lower_noise");
+      break;
+
+   case ir_binop_add:
+      emit(BRW_OPCODE_ADD, result_dst, op[0], op[1]);
+      break;
+   case ir_binop_sub:
+      assert(!"not reached: should be handled by ir_sub_to_add_neg");
+      break;
+
+   case ir_binop_mul:
+      if (ir->type->is_integer()) {
+	 /* For integer multiplication, the MUL uses the low 16 bits
+	  * of one of the operands (src0 on gen6, src1 on gen7).  The
+	  * MACH accumulates in the contribution of the upper 16 bits
+	  * of that operand.
+	  *
+	  * FINISHME: Emit just the MUL if we know an operand is small
+	  * enough.
+	  */
+	 struct brw_reg acc = retype(brw_acc_reg(), BRW_REGISTER_TYPE_D);
+
+	 emit(BRW_OPCODE_MUL, acc, op[0], op[1]);
+	 emit(BRW_OPCODE_MACH, dst_null_d(), op[0], op[1]);
+	 emit(BRW_OPCODE_MOV, result_dst, src_reg(acc));
+      } else {
+	 emit(BRW_OPCODE_MUL, result_dst, op[0], op[1]);
+      }
+      break;
+   case ir_binop_div:
+      assert(!"not reached: should be handled by ir_div_to_mul_rcp");
+   case ir_binop_mod:
+      assert(!"ir_binop_mod should have been converted to b * fract(a/b)");
+      break;
+
+   case ir_binop_less:
+   case ir_binop_greater:
+   case ir_binop_lequal:
+   case ir_binop_gequal:
+   case ir_binop_equal:
+   case ir_binop_nequal: {
+      dst_reg temp = result_dst;
+      /* original gen4 does implicit conversion before comparison. */
+      if (intel->gen < 5)
+	 temp.type = op[0].type;
+
+      inst = emit(BRW_OPCODE_CMP, temp, op[0], op[1]);
+      inst->conditional_mod = brw_conditional_for_comparison(ir->operation);
+      emit(BRW_OPCODE_AND, result_dst, this->result, src_reg(0x1));
+      break;
+   }
+
+   case ir_binop_all_equal:
+      /* "==" operator producing a scalar boolean. */
+      if (ir->operands[0]->type->is_vector() ||
+	  ir->operands[1]->type->is_vector()) {
+	 inst = emit(BRW_OPCODE_CMP, dst_null_cmp(), op[0], op[1]);
+	 inst->conditional_mod = BRW_CONDITIONAL_Z;
+
+	 emit(BRW_OPCODE_MOV, result_dst, src_reg(0));
+	 inst = emit(BRW_OPCODE_MOV, result_dst, src_reg(1));
+	 inst->predicate = BRW_PREDICATE_ALIGN16_ALL4H;
+      } else {
+	 dst_reg temp = result_dst;
+	 /* original gen4 does implicit conversion before comparison. */
+	 if (intel->gen < 5)
+	    temp.type = op[0].type;
+
+	 inst = emit(BRW_OPCODE_CMP, temp, op[0], op[1]);
+	 inst->conditional_mod = BRW_CONDITIONAL_Z;
+	 emit(BRW_OPCODE_AND, result_dst, result_src, src_reg(0x1));
+      }
+      break;
+   case ir_binop_any_nequal:
+      /* "!=" operator producing a scalar boolean. */
+      if (ir->operands[0]->type->is_vector() ||
+	  ir->operands[1]->type->is_vector()) {
+	 inst = emit(BRW_OPCODE_CMP, dst_null_cmp(), op[0], op[1]);
+	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
+
+	 emit(BRW_OPCODE_MOV, result_dst, src_reg(0));
+	 inst = emit(BRW_OPCODE_MOV, result_dst, src_reg(1));
+	 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
+      } else {
+	 dst_reg temp = result_dst;
+	 /* original gen4 does implicit conversion before comparison. */
+	 if (intel->gen < 5)
+	    temp.type = op[0].type;
+
+	 inst = emit(BRW_OPCODE_CMP, temp, op[0], op[1]);
+	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
+	 emit(BRW_OPCODE_AND, result_dst, result_src, src_reg(0x1));
+      }
+      break;
+
+   case ir_unop_any:
+      inst = emit(BRW_OPCODE_CMP, dst_null_d(), op[0], src_reg(0));
+      inst->conditional_mod = BRW_CONDITIONAL_NZ;
+
+      emit(BRW_OPCODE_MOV, result_dst, src_reg(0));
+
+      inst = emit(BRW_OPCODE_MOV, result_dst, src_reg(1));
+      inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
+      break;
+
+   case ir_binop_logic_xor:
+      emit(BRW_OPCODE_XOR, result_dst, op[0], op[1]);
+      break;
+
+   case ir_binop_logic_or:
+      emit(BRW_OPCODE_OR, result_dst, op[0], op[1]);
+      break;
+
+   case ir_binop_logic_and:
+      emit(BRW_OPCODE_AND, result_dst, op[0], op[1]);
+      break;
+
+   case ir_binop_dot:
+      assert(ir->operands[0]->type->is_vector());
+      assert(ir->operands[0]->type == ir->operands[1]->type);
+      emit_dp(result_dst, op[0], op[1], ir->operands[0]->type->vector_elements);
+      break;
+
+   case ir_unop_sqrt:
+      emit_math(SHADER_OPCODE_SQRT, result_dst, op[0]);
+      break;
+   case ir_unop_rsq:
+      emit_math(SHADER_OPCODE_RSQ, result_dst, op[0]);
+      break;
+   case ir_unop_i2f:
+   case ir_unop_i2u:
+   case ir_unop_u2i:
+   case ir_unop_u2f:
+   case ir_unop_b2f:
+   case ir_unop_b2i:
+   case ir_unop_f2i:
+      emit(BRW_OPCODE_MOV, result_dst, op[0]);
+      break;
+   case ir_unop_f2b:
+   case ir_unop_i2b: {
+      dst_reg temp = result_dst;
+      /* original gen4 does implicit conversion before comparison. */
+      if (intel->gen < 5)
+	 temp.type = op[0].type;
+
+      inst = emit(BRW_OPCODE_CMP, temp, op[0], src_reg(0.0f));
+      inst->conditional_mod = BRW_CONDITIONAL_NZ;
+      inst = emit(BRW_OPCODE_AND, result_dst, result_src, src_reg(1));
+      break;
+   }
+
+   case ir_unop_trunc:
+      emit(BRW_OPCODE_RNDZ, result_dst, op[0]);
+      break;
+   case ir_unop_ceil:
+      op[0].negate = !op[0].negate;
+      inst = emit(BRW_OPCODE_RNDD, result_dst, op[0]);
+      this->result.negate = true;
+      break;
+   case ir_unop_floor:
+      inst = emit(BRW_OPCODE_RNDD, result_dst, op[0]);
+      break;
+   case ir_unop_fract:
+      inst = emit(BRW_OPCODE_FRC, result_dst, op[0]);
+      break;
+   case ir_unop_round_even:
+      emit(BRW_OPCODE_RNDE, result_dst, op[0]);
+      break;
+
+   case ir_binop_min:
+      inst = emit(BRW_OPCODE_CMP, result_dst, op[0], op[1]);
+      inst->conditional_mod = BRW_CONDITIONAL_L;
+
+      inst = emit(BRW_OPCODE_SEL, result_dst, op[0], op[1]);
+      inst->predicate = BRW_PREDICATE_NORMAL;
+      break;
+   case ir_binop_max:
+      inst = emit(BRW_OPCODE_CMP, result_dst, op[0], op[1]);
+      inst->conditional_mod = BRW_CONDITIONAL_G;
+
+      inst = emit(BRW_OPCODE_SEL, result_dst, op[0], op[1]);
+      inst->predicate = BRW_PREDICATE_NORMAL;
+      break;
+
+   case ir_binop_pow:
+      emit_math(SHADER_OPCODE_POW, result_dst, op[0], op[1]);
+      break;
+
+   case ir_unop_bit_not:
+      inst = emit(BRW_OPCODE_NOT, result_dst, op[0]);
+      break;
+   case ir_binop_bit_and:
+      inst = emit(BRW_OPCODE_AND, result_dst, op[0], op[1]);
+      break;
+   case ir_binop_bit_xor:
+      inst = emit(BRW_OPCODE_XOR, result_dst, op[0], op[1]);
+      break;
+   case ir_binop_bit_or:
+      inst = emit(BRW_OPCODE_OR, result_dst, op[0], op[1]);
+      break;
+
+   case ir_binop_lshift:
+   case ir_binop_rshift:
+      assert(!"GLSL 1.30 features unsupported");
+      break;
+
+   case ir_quadop_vector:
+      assert(!"not reached: should be handled by lower_quadop_vector");
+      break;
+   }
+}
+
+
+void
+vec4_visitor::visit(ir_swizzle *ir)
+{
+   src_reg src;
+   int i = 0;
+   int swizzle[4];
+
+   /* Note that this is only swizzles in expressions, not those on the left
+    * hand side of an assignment, which do write masking.  See ir_assignment
+    * for that.
+    */
+
+   ir->val->accept(this);
+   src = this->result;
+   assert(src.file != BAD_FILE);
+
+   for (i = 0; i < ir->type->vector_elements; i++) {
+      switch (i) {
+      case 0:
+	 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.x);
+	 break;
+      case 1:
+	 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.y);
+	 break;
+      case 2:
+	 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.z);
+	 break;
+      case 3:
+	 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.w);
+	    break;
+      }
+   }
+   for (; i < 4; i++) {
+      /* Replicate the last channel out. */
+      swizzle[i] = swizzle[ir->type->vector_elements - 1];
+   }
+
+   src.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
+
+   this->result = src;
+}
+
+void
+vec4_visitor::visit(ir_dereference_variable *ir)
+{
+   const struct glsl_type *type = ir->type;
+   dst_reg *reg = variable_storage(ir->var);
+
+   if (!reg) {
+      fail("Failed to find variable storage for %s\n", ir->var->name);
+      this->result = src_reg(brw_null_reg());
+      return;
+   }
+
+   this->result = src_reg(*reg);
+
+   if (type->is_scalar() || type->is_vector() || type->is_matrix())
+      this->result.swizzle = swizzle_for_size(type->vector_elements);
+}
+
+void
+vec4_visitor::visit(ir_dereference_array *ir)
+{
+   ir_constant *constant_index;
+   src_reg src;
+   int element_size = type_size(ir->type);
+
+   constant_index = ir->array_index->constant_expression_value();
+
+   ir->array->accept(this);
+   src = this->result;
+
+   if (constant_index) {
+      src.reg_offset += constant_index->value.i[0] * element_size;
+   } else {
+      /* Variable index array dereference.  It eats the "vec4" of the
+       * base of the array and an index that offsets the Mesa register
+       * index.
+       */
+      ir->array_index->accept(this);
+
+      src_reg index_reg;
+
+      if (element_size == 1) {
+	 index_reg = this->result;
+      } else {
+	 index_reg = src_reg(this, glsl_type::int_type);
+
+	 emit(BRW_OPCODE_MUL, dst_reg(index_reg),
+	      this->result, src_reg(element_size));
+      }
+
+      if (src.reladdr) {
+	 src_reg temp = src_reg(this, glsl_type::int_type);
+
+	 emit(BRW_OPCODE_ADD, dst_reg(temp), *src.reladdr, index_reg);
+
+	 index_reg = temp;
+      }
+
+      src.reladdr = ralloc(mem_ctx, src_reg);
+      memcpy(src.reladdr, &index_reg, sizeof(index_reg));
+   }
+
+   /* If the type is smaller than a vec4, replicate the last channel out. */
+   if (ir->type->is_scalar() || ir->type->is_vector())
+      src.swizzle = swizzle_for_size(ir->type->vector_elements);
+   else
+      src.swizzle = BRW_SWIZZLE_NOOP;
+   src.type = brw_type_for_base_type(ir->type);
+
+   this->result = src;
+}
+
+void
+vec4_visitor::visit(ir_dereference_record *ir)
+{
+   unsigned int i;
+   const glsl_type *struct_type = ir->record->type;
+   int offset = 0;
+
+   ir->record->accept(this);
+
+   for (i = 0; i < struct_type->length; i++) {
+      if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
+	 break;
+      offset += type_size(struct_type->fields.structure[i].type);
+   }
+
+   /* If the type is smaller than a vec4, replicate the last channel out. */
+   if (ir->type->is_scalar() || ir->type->is_vector())
+      this->result.swizzle = swizzle_for_size(ir->type->vector_elements);
+   else
+      this->result.swizzle = BRW_SWIZZLE_NOOP;
+   this->result.type = brw_type_for_base_type(ir->type);
+
+   this->result.reg_offset += offset;
+}
+
+/**
+ * We want to be careful in assignment setup to hit the actual storage
+ * instead of potentially using a temporary like we might with the
+ * ir_dereference handler.
+ */
+static dst_reg
+get_assignment_lhs(ir_dereference *ir, vec4_visitor *v)
+{
+   /* The LHS must be a dereference.  If the LHS is a variable indexed array
+    * access of a vector, it must be separated into a series conditional moves
+    * before reaching this point (see ir_vec_index_to_cond_assign).
+    */
+   assert(ir->as_dereference());
+   ir_dereference_array *deref_array = ir->as_dereference_array();
+   if (deref_array) {
+      assert(!deref_array->array->type->is_vector());
+   }
+
+   /* Use the rvalue deref handler for the most part.  We'll ignore
+    * swizzles in it and write swizzles using writemask, though.
+    */
+   ir->accept(v);
+   return dst_reg(v->result);
+}
+
+void
+vec4_visitor::emit_block_move(dst_reg *dst, src_reg *src,
+			      const struct glsl_type *type, bool predicated)
+{
+   if (type->base_type == GLSL_TYPE_STRUCT) {
+      for (unsigned int i = 0; i < type->length; i++) {
+	 emit_block_move(dst, src, type->fields.structure[i].type, predicated);
+      }
+      return;
+   }
+
+   if (type->is_array()) {
+      for (unsigned int i = 0; i < type->length; i++) {
+	 emit_block_move(dst, src, type->fields.array, predicated);
+      }
+      return;
+   }
+
+   if (type->is_matrix()) {
+      const struct glsl_type *vec_type;
+
+      vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT,
+					 type->vector_elements, 1);
+
+      for (int i = 0; i < type->matrix_columns; i++) {
+	 emit_block_move(dst, src, vec_type, predicated);
+      }
+      return;
+   }
+
+   assert(type->is_scalar() || type->is_vector());
+
+   dst->type = brw_type_for_base_type(type);
+   src->type = dst->type;
+
+   dst->writemask = (1 << type->vector_elements) - 1;
+
+   /* Do we need to worry about swizzling a swizzle? */
+   assert(src->swizzle = BRW_SWIZZLE_NOOP);
+   src->swizzle = swizzle_for_size(type->vector_elements);
+
+   vec4_instruction *inst = emit(BRW_OPCODE_MOV, *dst, *src);
+   if (predicated)
+      inst->predicate = BRW_PREDICATE_NORMAL;
+
+   dst->reg_offset++;
+   src->reg_offset++;
+}
+
+
+/* If the RHS processing resulted in an instruction generating a
+ * temporary value, and it would be easy to rewrite the instruction to
+ * generate its result right into the LHS instead, do so.  This ends
+ * up reliably removing instructions where it can be tricky to do so
+ * later without real UD chain information.
+ */
+bool
+vec4_visitor::try_rewrite_rhs_to_dst(ir_assignment *ir,
+				     dst_reg dst,
+				     src_reg src,
+				     vec4_instruction *pre_rhs_inst,
+				     vec4_instruction *last_rhs_inst)
+{
+   /* This could be supported, but it would take more smarts. */
+   if (ir->condition)
+      return false;
+
+   if (pre_rhs_inst == last_rhs_inst)
+      return false; /* No instructions generated to work with. */
+
+   /* Make sure the last instruction generated our source reg. */
+   if (src.file != GRF ||
+       src.file != last_rhs_inst->dst.file ||
+       src.reg != last_rhs_inst->dst.reg ||
+       src.reg_offset != last_rhs_inst->dst.reg_offset ||
+       src.reladdr ||
+       src.abs ||
+       src.negate ||
+       last_rhs_inst->predicate != BRW_PREDICATE_NONE)
+      return false;
+
+   /* Check that that last instruction fully initialized the channels
+    * we want to use, in the order we want to use them.  We could
+    * potentially reswizzle the operands of many instructions so that
+    * we could handle out of order channels, but don't yet.
+    */
+   for (int i = 0; i < 4; i++) {
+      if (dst.writemask & (1 << i)) {
+	 if (!(last_rhs_inst->dst.writemask & (1 << i)))
+	    return false;
+
+	 if (BRW_GET_SWZ(src.swizzle, i) != i)
+	    return false;
+      }
+   }
+
+   /* Success!  Rewrite the instruction. */
+   last_rhs_inst->dst.file = dst.file;
+   last_rhs_inst->dst.reg = dst.reg;
+   last_rhs_inst->dst.reg_offset = dst.reg_offset;
+   last_rhs_inst->dst.reladdr = dst.reladdr;
+   last_rhs_inst->dst.writemask &= dst.writemask;
+
+   return true;
+}
+
+void
+vec4_visitor::visit(ir_assignment *ir)
+{
+   dst_reg dst = get_assignment_lhs(ir->lhs, this);
+
+   if (!ir->lhs->type->is_scalar() &&
+       !ir->lhs->type->is_vector()) {
+      ir->rhs->accept(this);
+      src_reg src = this->result;
+
+      if (ir->condition) {
+	 emit_bool_to_cond_code(ir->condition);
+      }
+
+      emit_block_move(&dst, &src, ir->rhs->type, ir->condition != NULL);
+      return;
+   }
+
+   /* Now we're down to just a scalar/vector with writemasks. */
+   int i;
+
+   vec4_instruction *pre_rhs_inst, *last_rhs_inst;
+   pre_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
+
+   ir->rhs->accept(this);
+
+   last_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
+
+   src_reg src = this->result;
+
+   int swizzles[4];
+   int first_enabled_chan = 0;
+   int src_chan = 0;
+
+   assert(ir->lhs->type->is_vector() ||
+	  ir->lhs->type->is_scalar());
+   dst.writemask = ir->write_mask;
+
+   for (int i = 0; i < 4; i++) {
+      if (dst.writemask & (1 << i)) {
+	 first_enabled_chan = BRW_GET_SWZ(src.swizzle, i);
+	 break;
+      }
+   }
+
+   /* Swizzle a small RHS vector into the channels being written.
+    *
+    * glsl ir treats write_mask as dictating how many channels are
+    * present on the RHS while in our instructions we need to make
+    * those channels appear in the slots of the vec4 they're written to.
+    */
+   for (int i = 0; i < 4; i++) {
+      if (dst.writemask & (1 << i))
+	 swizzles[i] = BRW_GET_SWZ(src.swizzle, src_chan++);
+      else
+	 swizzles[i] = first_enabled_chan;
+   }
+   src.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
+			      swizzles[2], swizzles[3]);
+
+   if (try_rewrite_rhs_to_dst(ir, dst, src, pre_rhs_inst, last_rhs_inst)) {
+      return;
+   }
+
+   if (ir->condition) {
+      emit_bool_to_cond_code(ir->condition);
+   }
+
+   for (i = 0; i < type_size(ir->lhs->type); i++) {
+      vec4_instruction *inst = emit(BRW_OPCODE_MOV, dst, src);
+
+      if (ir->condition)
+	 inst->predicate = BRW_PREDICATE_NORMAL;
+
+      dst.reg_offset++;
+      src.reg_offset++;
+   }
+}
+
+void
+vec4_visitor::emit_constant_values(dst_reg *dst, ir_constant *ir)
+{
+   if (ir->type->base_type == GLSL_TYPE_STRUCT) {
+      foreach_list(node, &ir->components) {
+	 ir_constant *field_value = (ir_constant *)node;
+
+	 emit_constant_values(dst, field_value);
+      }
+      return;
+   }
+
+   if (ir->type->is_array()) {
+      for (unsigned int i = 0; i < ir->type->length; i++) {
+	 emit_constant_values(dst, ir->array_elements[i]);
+      }
+      return;
+   }
+
+   if (ir->type->is_matrix()) {
+      for (int i = 0; i < ir->type->matrix_columns; i++) {
+	 for (int j = 0; j < ir->type->vector_elements; j++) {
+	    dst->writemask = 1 << j;
+	    dst->type = BRW_REGISTER_TYPE_F;
+
+	    emit(BRW_OPCODE_MOV, *dst,
+		 src_reg(ir->value.f[i * ir->type->vector_elements + j]));
+	 }
+	 dst->reg_offset++;
+      }
+      return;
+   }
+
+   for (int i = 0; i < ir->type->vector_elements; i++) {
+      dst->writemask = 1 << i;
+      dst->type = brw_type_for_base_type(ir->type);
+
+      switch (ir->type->base_type) {
+      case GLSL_TYPE_FLOAT:
+	 emit(BRW_OPCODE_MOV, *dst, src_reg(ir->value.f[i]));
+	 break;
+      case GLSL_TYPE_INT:
+	 emit(BRW_OPCODE_MOV, *dst, src_reg(ir->value.i[i]));
+	 break;
+      case GLSL_TYPE_UINT:
+	 emit(BRW_OPCODE_MOV, *dst, src_reg(ir->value.u[i]));
+	 break;
+      case GLSL_TYPE_BOOL:
+	 emit(BRW_OPCODE_MOV, *dst, src_reg(ir->value.b[i]));
+	 break;
+      default:
+	 assert(!"Non-float/uint/int/bool constant");
+	 break;
+      }
+   }
+   dst->reg_offset++;
+}
+
+void
+vec4_visitor::visit(ir_constant *ir)
+{
+   dst_reg dst = dst_reg(this, ir->type);
+   this->result = src_reg(dst);
+
+   emit_constant_values(&dst, ir);
+}
+
+void
+vec4_visitor::visit(ir_call *ir)
+{
+   assert(!"not reached");
+}
+
+void
+vec4_visitor::visit(ir_texture *ir)
+{
+   /* FINISHME: Implement vertex texturing.
+    *
+    * With 0 vertex samplers available, the linker will reject
+    * programs that do vertex texturing, but after our visitor has
+    * run.
+    */
+}
+
+void
+vec4_visitor::visit(ir_return *ir)
+{
+   assert(!"not reached");
+}
+
+void
+vec4_visitor::visit(ir_discard *ir)
+{
+   assert(!"not reached");
+}
+
+void
+vec4_visitor::visit(ir_if *ir)
+{
+   /* Don't point the annotation at the if statement, because then it plus
+    * the then and else blocks get printed.
+    */
+   this->base_ir = ir->condition;
+
+   if (intel->gen == 6) {
+      emit_if_gen6(ir);
+   } else {
+      emit_bool_to_cond_code(ir->condition);
+      vec4_instruction *inst = emit(BRW_OPCODE_IF);
+      inst->predicate = BRW_PREDICATE_NORMAL;
+   }
+
+   visit_instructions(&ir->then_instructions);
+
+   if (!ir->else_instructions.is_empty()) {
+      this->base_ir = ir->condition;
+      emit(BRW_OPCODE_ELSE);
+
+      visit_instructions(&ir->else_instructions);
+   }
+
+   this->base_ir = ir->condition;
+   emit(BRW_OPCODE_ENDIF);
+}
+
+int
+vec4_visitor::emit_vue_header_gen4(int header_mrf)
+{
+   /* Get the position */
+   src_reg pos = src_reg(output_reg[VERT_RESULT_HPOS]);
+
+   /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
+   dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
+
+   current_annotation = "NDC";
+   dst_reg ndc_w = ndc;
+   ndc_w.writemask = WRITEMASK_W;
+   src_reg pos_w = pos;
+   pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
+   emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);
+
+   dst_reg ndc_xyz = ndc;
+   ndc_xyz.writemask = WRITEMASK_XYZ;
+
+   emit(BRW_OPCODE_MUL, ndc_xyz, pos, src_reg(ndc_w));
+
+   if ((c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_PSIZ)) ||
+       c->key.nr_userclip || brw->has_negative_rhw_bug) {
+      dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
+      GLuint i;
+
+      emit(BRW_OPCODE_MOV, header1, 0u);
+
+      if (c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_PSIZ)) {
+	 assert(!"finishme: psiz");
+	 src_reg psiz;
+
+	 header1.writemask = WRITEMASK_W;
+	 emit(BRW_OPCODE_MUL, header1, psiz, 1u << 11);
+	 emit(BRW_OPCODE_AND, header1, src_reg(header1), 0x7ff << 8);
+      }
+
+      for (i = 0; i < c->key.nr_userclip; i++) {
+	 vec4_instruction *inst;
+
+	 inst = emit(BRW_OPCODE_DP4, dst_reg(brw_null_reg()),
+		     pos, src_reg(c->userplane[i]));
+	 inst->conditional_mod = BRW_CONDITIONAL_L;
+
+	 emit(BRW_OPCODE_OR, header1, src_reg(header1), 1u << i);
+	 inst->predicate = BRW_PREDICATE_NORMAL;
+      }
+
+      /* i965 clipping workaround:
+       * 1) Test for -ve rhw
+       * 2) If set,
+       *      set ndc = (0,0,0,0)
+       *      set ucp[6] = 1
+       *
+       * Later, clipping will detect ucp[6] and ensure the primitive is
+       * clipped against all fixed planes.
+       */
+      if (brw->has_negative_rhw_bug) {
+#if 0
+	 /* FINISHME */
+	 brw_CMP(p,
+		 vec8(brw_null_reg()),
+		 BRW_CONDITIONAL_L,
+		 brw_swizzle1(ndc, 3),
+		 brw_imm_f(0));
+
+	 brw_OR(p, brw_writemask(header1, WRITEMASK_W), header1, brw_imm_ud(1<<6));
+	 brw_MOV(p, ndc, brw_imm_f(0));
+	 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
+#endif
+      }
+
+      header1.writemask = WRITEMASK_XYZW;
+      emit(BRW_OPCODE_MOV, brw_message_reg(header_mrf++), src_reg(header1));
+   } else {
+      emit(BRW_OPCODE_MOV, retype(brw_message_reg(header_mrf++),
+				  BRW_REGISTER_TYPE_UD), 0u);
+   }
+
+   if (intel->gen == 5) {
+      /* There are 20 DWs (D0-D19) in VUE header on Ironlake:
+       * dword 0-3 (m1) of the header is indices, point width, clip flags.
+       * dword 4-7 (m2) is the ndc position (set above)
+       * dword 8-11 (m3) of the vertex header is the 4D space position
+       * dword 12-19 (m4,m5) of the vertex header is the user clip distance.
+       * m6 is a pad so that the vertex element data is aligned
+       * m7 is the first vertex data we fill.
+       */
+      current_annotation = "NDC";
+      emit(BRW_OPCODE_MOV, brw_message_reg(header_mrf++), src_reg(ndc));
+
+      current_annotation = "gl_Position";
+      emit(BRW_OPCODE_MOV, brw_message_reg(header_mrf++), pos);
+
+      /* user clip distance. */
+      header_mrf += 2;
+
+      /* Pad so that vertex element data is aligned. */
+      header_mrf++;
+   } else {
+      /* There are 8 dwords in VUE header pre-Ironlake:
+       * dword 0-3 (m1) is indices, point width, clip flags.
+       * dword 4-7 (m2) is ndc position (set above)
+       *
+       * dword 8-11 (m3) is the first vertex data.
+       */
+      current_annotation = "NDC";
+      emit(BRW_OPCODE_MOV, brw_message_reg(header_mrf++), src_reg(ndc));
+
+      current_annotation = "gl_Position";
+      emit(BRW_OPCODE_MOV, brw_message_reg(header_mrf++), pos);
+   }
+
+   return header_mrf;
+}
+
+int
+vec4_visitor::emit_vue_header_gen6(int header_mrf)
+{
+   struct brw_reg reg;
+
+   /* There are 8 or 16 DWs (D0-D15) in VUE header on Sandybridge:
+    * dword 0-3 (m2) of the header is indices, point width, clip flags.
+    * dword 4-7 (m3) is the 4D space position
+    * dword 8-15 (m4,m5) of the vertex header is the user clip distance if
+    * enabled.
+    *
+    * m4 or 6 is the first vertex element data we fill.
+    */
+
+   current_annotation = "indices, point width, clip flags";
+   reg = brw_message_reg(header_mrf++);
+   emit(BRW_OPCODE_MOV, retype(reg, BRW_REGISTER_TYPE_D), src_reg(0));
+   if (c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_PSIZ)) {
+      emit(BRW_OPCODE_MOV, brw_writemask(reg, WRITEMASK_W),
+	   src_reg(output_reg[VERT_RESULT_PSIZ]));
+   }
+
+   current_annotation = "gl_Position";
+   emit(BRW_OPCODE_MOV,
+	brw_message_reg(header_mrf++), src_reg(output_reg[VERT_RESULT_HPOS]));
+
+   current_annotation = "user clip distances";
+   if (c->key.nr_userclip) {
+      for (int i = 0; i < c->key.nr_userclip; i++) {
+	 struct brw_reg m;
+	 if (i < 4)
+	    m = brw_message_reg(header_mrf);
+	 else
+	    m = brw_message_reg(header_mrf + 1);
+
+	 emit(BRW_OPCODE_DP4,
+	      dst_reg(brw_writemask(m, 1 << (i & 3))),
+	      src_reg(c->userplane[i]));
+      }
+      header_mrf += 2;
+   }
+
+   current_annotation = NULL;
+
+   return header_mrf;
+}
+
+static int
+align_interleaved_urb_mlen(struct brw_context *brw, int mlen)
+{
+   struct intel_context *intel = &brw->intel;
+
+   if (intel->gen >= 6) {
+      /* URB data written (does not include the message header reg) must
+       * be a multiple of 256 bits, or 2 VS registers.  See vol5c.5,
+       * section 5.4.3.2.2: URB_INTERLEAVED.
+       *
+       * URB entries are allocated on a multiple of 1024 bits, so an
+       * extra 128 bits written here to make the end align to 256 is
+       * no problem.
+       */
+      if ((mlen % 2) != 1)
+	 mlen++;
+   }
+
+   return mlen;
+}
+
+/**
+ * Generates the VUE payload plus the 1 or 2 URB write instructions to
+ * complete the VS thread.
+ *
+ * The VUE layout is documented in Volume 2a.
+ */
+void
+vec4_visitor::emit_urb_writes()
+{
+   /* MRF 0 is reserved for the debugger, so start with message header
+    * in MRF 1.
+    */
+   int base_mrf = 1;
+   int mrf = base_mrf;
+   int urb_entry_size;
+   uint64_t outputs_remaining = c->prog_data.outputs_written;
+   /* In the process of generating our URB write message contents, we
+    * may need to unspill a register or load from an array.  Those
+    * reads would use MRFs 14-15.
+    */
+   int max_usable_mrf = 13;
+
+   /* FINISHME: edgeflag */
+
+   /* First mrf is the g0-based message header containing URB handles and such,
+    * which is implied in VS_OPCODE_URB_WRITE.
+    */
+   mrf++;
+
+   if (intel->gen >= 6) {
+      mrf = emit_vue_header_gen6(mrf);
+   } else {
+      mrf = emit_vue_header_gen4(mrf);
+   }
+
+   /* Set up the VUE data for the first URB write */
+   int attr;
+   for (attr = 0; attr < VERT_RESULT_MAX; attr++) {
+      if (!(c->prog_data.outputs_written & BITFIELD64_BIT(attr)))
+	 continue;
+
+      outputs_remaining &= ~BITFIELD64_BIT(attr);
+
+      /* This is set up in the VUE header. */
+      if (attr == VERT_RESULT_HPOS)
+	 continue;
+
+      /* This is loaded into the VUE header, and thus doesn't occupy
+       * an attribute slot.
+       */
+      if (attr == VERT_RESULT_PSIZ)
+	 continue;
+
+      vec4_instruction *inst = emit(BRW_OPCODE_MOV, brw_message_reg(mrf++),
+				    src_reg(output_reg[attr]));
+
+      if ((attr == VERT_RESULT_COL0 ||
+	   attr == VERT_RESULT_COL1 ||
+	   attr == VERT_RESULT_BFC0 ||
+	   attr == VERT_RESULT_BFC1) &&
+	  c->key.clamp_vertex_color) {
+	 inst->saturate = true;
+      }
+
+      /* If this was MRF 15, we can't fit anything more into this URB
+       * WRITE.  Note that base_mrf of 1 means that MRF 15 is an
+       * even-numbered amount of URB write data, which will meet
+       * gen6's requirements for length alignment.
+       */
+      if (mrf > max_usable_mrf) {
+	 attr++;
+	 break;
+      }
+   }
+
+   vec4_instruction *inst = emit(VS_OPCODE_URB_WRITE);
+   inst->base_mrf = base_mrf;
+   inst->mlen = align_interleaved_urb_mlen(brw, mrf - base_mrf);
+   inst->eot = !outputs_remaining;
+
+   urb_entry_size = mrf - base_mrf;
+
+   /* Optional second URB write */
+   if (outputs_remaining) {
+      mrf = base_mrf + 1;
+
+      for (; attr < VERT_RESULT_MAX; attr++) {
+	 if (!(c->prog_data.outputs_written & BITFIELD64_BIT(attr)))
+	    continue;
+
+	 assert(mrf < max_usable_mrf);
+
+	 emit(BRW_OPCODE_MOV, brw_message_reg(mrf++), src_reg(output_reg[attr]));
+      }
+
+      inst = emit(VS_OPCODE_URB_WRITE);
+      inst->base_mrf = base_mrf;
+      inst->mlen = align_interleaved_urb_mlen(brw, mrf - base_mrf);
+      inst->eot = true;
+      /* URB destination offset.  In the previous write, we got MRFs
+       * 2-13 minus the one header MRF, so 12 regs.  URB offset is in
+       * URB row increments, and each of our MRFs is half of one of
+       * those, since we're doing interleaved writes.
+       */
+      inst->offset = (max_usable_mrf - base_mrf) / 2;
+
+      urb_entry_size += mrf - base_mrf;
+   }
+
+   if (intel->gen == 6)
+      c->prog_data.urb_entry_size = ALIGN(urb_entry_size, 8) / 8;
+   else
+      c->prog_data.urb_entry_size = ALIGN(urb_entry_size, 4) / 4;
+}
+
+src_reg
+vec4_visitor::get_scratch_offset(vec4_instruction *inst,
+				 src_reg *reladdr, int reg_offset)
+{
+   /* Because we store the values to scratch interleaved like our
+    * vertex data, we need to scale the vec4 index by 2.
+    */
+   int message_header_scale = 2;
+
+   /* Pre-gen6, the message header uses byte offsets instead of vec4
+    * (16-byte) offset units.
+    */
+   if (intel->gen < 6)
+      message_header_scale *= 16;
+
+   if (reladdr) {
+      src_reg index = src_reg(this, glsl_type::int_type);
+
+      vec4_instruction *add = emit(BRW_OPCODE_ADD,
+				   dst_reg(index),
+				   *reladdr,
+				   src_reg(reg_offset));
+      /* Move our new instruction from the tail to its correct place. */
+      add->remove();
+      inst->insert_before(add);
+
+      vec4_instruction *mul = emit(BRW_OPCODE_MUL, dst_reg(index),
+				   index, src_reg(message_header_scale));
+      mul->remove();
+      inst->insert_before(mul);
+
+      return index;
+   } else {
+      return src_reg(reg_offset * message_header_scale);
+   }
+}
+
+/**
+ * Emits an instruction before @inst to load the value named by @orig_src
+ * from scratch space at @base_offset to @temp.
+ */
+void
+vec4_visitor::emit_scratch_read(vec4_instruction *inst,
+				dst_reg temp, src_reg orig_src,
+				int base_offset)
+{
+   int reg_offset = base_offset + orig_src.reg_offset;
+   src_reg index = get_scratch_offset(inst, orig_src.reladdr, reg_offset);
+
+   vec4_instruction *scratch_read_inst = emit(VS_OPCODE_SCRATCH_READ,
+					      temp, index);
+
+   scratch_read_inst->base_mrf = 14;
+   scratch_read_inst->mlen = 1;
+   /* Move our instruction from the tail to its correct place. */
+   scratch_read_inst->remove();
+   inst->insert_before(scratch_read_inst);
+}
+
+/**
+ * Emits an instruction after @inst to store the value to be written
+ * to @orig_dst to scratch space at @base_offset, from @temp.
+ */
+void
+vec4_visitor::emit_scratch_write(vec4_instruction *inst,
+				 src_reg temp, dst_reg orig_dst,
+				 int base_offset)
+{
+   int reg_offset = base_offset + orig_dst.reg_offset;
+   src_reg index = get_scratch_offset(inst, orig_dst.reladdr, reg_offset);
+
+   dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
+				       orig_dst.writemask));
+   vec4_instruction *scratch_write_inst = emit(VS_OPCODE_SCRATCH_WRITE,
+					       dst, temp, index);
+   scratch_write_inst->base_mrf = 13;
+   scratch_write_inst->mlen = 2;
+   scratch_write_inst->predicate = inst->predicate;
+   /* Move our instruction from the tail to its correct place. */
+   scratch_write_inst->remove();
+   inst->insert_after(scratch_write_inst);
+}
+
+/**
+ * We can't generally support array access in GRF space, because a
+ * single instruction's destination can only span 2 contiguous
+ * registers.  So, we send all GRF arrays that get variable index
+ * access to scratch space.
+ */
+void
+vec4_visitor::move_grf_array_access_to_scratch()
+{
+   int scratch_loc[this->virtual_grf_count];
+
+   for (int i = 0; i < this->virtual_grf_count; i++) {
+      scratch_loc[i] = -1;
+   }
+
+   /* First, calculate the set of virtual GRFs that need to be punted
+    * to scratch due to having any array access on them, and where in
+    * scratch.
+    */
+   foreach_list(node, &this->instructions) {
+      vec4_instruction *inst = (vec4_instruction *)node;
+
+      if (inst->dst.file == GRF && inst->dst.reladdr &&
+	  scratch_loc[inst->dst.reg] == -1) {
+	 scratch_loc[inst->dst.reg] = c->last_scratch;
+	 c->last_scratch += this->virtual_grf_sizes[inst->dst.reg] * 8 * 4;
+      }
+
+      for (int i = 0 ; i < 3; i++) {
+	 src_reg *src = &inst->src[i];
+
+	 if (src->file == GRF && src->reladdr &&
+	     scratch_loc[src->reg] == -1) {
+	    scratch_loc[src->reg] = c->last_scratch;
+	    c->last_scratch += this->virtual_grf_sizes[src->reg] * 8 * 4;
+	 }
+      }
+   }
+
+   /* Now, for anything that will be accessed through scratch, rewrite
+    * it to load/store.  Note that this is a _safe list walk, because
+    * we may generate a new scratch_write instruction after the one
+    * we're processing.
+    */
+   foreach_list_safe(node, &this->instructions) {
+      vec4_instruction *inst = (vec4_instruction *)node;
+
+      /* Set up the annotation tracking for new generated instructions. */
+      base_ir = inst->ir;
+      current_annotation = inst->annotation;
+
+      if (inst->dst.file == GRF && scratch_loc[inst->dst.reg] != -1) {
+	 src_reg temp = src_reg(this, glsl_type::vec4_type);
+
+	 emit_scratch_write(inst, temp, inst->dst, scratch_loc[inst->dst.reg]);
+
+	 inst->dst.file = temp.file;
+	 inst->dst.reg = temp.reg;
+	 inst->dst.reg_offset = temp.reg_offset;
+	 inst->dst.reladdr = NULL;
+      }
+
+      for (int i = 0 ; i < 3; i++) {
+	 if (inst->src[i].file != GRF || scratch_loc[inst->src[i].reg] == -1)
+	    continue;
+
+	 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
+
+	 emit_scratch_read(inst, temp, inst->src[i],
+			   scratch_loc[inst->src[i].reg]);
+
+	 inst->src[i].file = temp.file;
+	 inst->src[i].reg = temp.reg;
+	 inst->src[i].reg_offset = temp.reg_offset;
+	 inst->src[i].reladdr = NULL;
+      }
+   }
+}
+
+
+vec4_visitor::vec4_visitor(struct brw_vs_compile *c,
+			   struct gl_shader_program *prog,
+			   struct brw_shader *shader)
+{
+   this->c = c;
+   this->p = &c->func;
+   this->brw = p->brw;
+   this->intel = &brw->intel;
+   this->ctx = &intel->ctx;
+   this->prog = prog;
+   this->shader = shader;
+
+   this->mem_ctx = ralloc_context(NULL);
+   this->failed = false;
+
+   this->base_ir = NULL;
+   this->current_annotation = NULL;
+
+   this->c = c;
+   this->vp = prog->VertexProgram;
+   this->prog_data = &c->prog_data;
+
+   this->variable_ht = hash_table_ctor(0,
+				       hash_table_pointer_hash,
+				       hash_table_pointer_compare);
+
+   this->virtual_grf_def = NULL;
+   this->virtual_grf_use = NULL;
+   this->virtual_grf_sizes = NULL;
+   this->virtual_grf_count = 0;
+   this->virtual_grf_array_size = 0;
+   this->live_intervals_valid = false;
+
+   this->uniforms = 0;
+
+   this->variable_ht = hash_table_ctor(0,
+				       hash_table_pointer_hash,
+				       hash_table_pointer_compare);
+}
+
+vec4_visitor::~vec4_visitor()
+{
+   ralloc_free(this->mem_ctx);
+   hash_table_dtor(this->variable_ht);
+}
+
+
+void
+vec4_visitor::fail(const char *format, ...)
+{
+   va_list va;
+   char *msg;
+
+   if (failed)
+      return;
+
+   failed = true;
+
+   va_start(va, format);
+   msg = ralloc_vasprintf(mem_ctx, format, va);
+   va_end(va);
+   msg = ralloc_asprintf(mem_ctx, "VS compile failed: %s\n", msg);
+
+   this->fail_msg = msg;
+
+   if (INTEL_DEBUG & DEBUG_VS) {
+      fprintf(stderr, "%s",  msg);
+   }
+}
+
+} /* namespace brw */
diff --git a/src/mesa/drivers/dri/i965/brw_vs.c b/src/mesa/drivers/dri/i965/brw_vs.c
index a9ad5311fe3..3373e707d98 100644
--- a/src/mesa/drivers/dri/i965/brw_vs.c
+++ b/src/mesa/drivers/dri/i965/brw_vs.c
@@ -30,6 +30,7 @@
   */
            
 
+#include "main/compiler.h"
 #include "brw_context.h"
 #include "brw_vs.h"
 #include "brw_util.h"
@@ -39,17 +40,21 @@
 
 #include "../glsl/ralloc.h"
 
-static void do_vs_prog( struct brw_context *brw, 
-			struct brw_vertex_program *vp,
-			struct brw_vs_prog_key *key )
+static bool
+do_vs_prog(struct brw_context *brw,
+	   struct gl_shader_program *prog,
+	   struct brw_vertex_program *vp,
+	   struct brw_vs_prog_key *key)
 {
    struct gl_context *ctx = &brw->intel.ctx;
+   struct intel_context *intel = &brw->intel;
    GLuint program_size;
    const GLuint *program;
    struct brw_vs_compile c;
    void *mem_ctx;
    int aux_size;
    int i;
+   static int new_vs = -1;
 
    memset(&c, 0, sizeof(c));
    memcpy(&c.key, key, sizeof(*key));
@@ -85,7 +90,25 @@ static void do_vs_prog( struct brw_context *brw,
 
    /* Emit GEN4 code.
     */
-   brw_vs_emit(&c);
+   if (new_vs == -1)
+      new_vs = getenv("INTEL_NEW_VS") != NULL;
+
+   if (new_vs && prog) {
+      if (!brw_vs_emit(prog, &c)) {
+	 ralloc_free(mem_ctx);
+	 return false;
+      }
+   } else {
+      brw_old_vs_emit(&c);
+   }
+
+   /* Scratch space is used for register spilling */
+   if (c.last_scratch) {
+      c.prog_data.total_scratch = brw_get_scratch_size(c.last_scratch);
+
+      brw_get_scratch_bo(intel, &brw->vs.scratch_bo,
+			 c.prog_data.total_scratch * brw->vs_max_threads);
+   }
 
    /* get the program
     */
@@ -111,6 +134,8 @@ static void do_vs_prog( struct brw_context *brw,
 		    &c.prog_data, aux_size,
 		    &brw->vs.prog_offset, &brw->vs.prog_data);
    ralloc_free(mem_ctx);
+
+   return true;
 }
 
 
@@ -155,13 +180,15 @@ static void brw_upload_vs_prog(struct brw_context *brw)
    if (!brw_search_cache(&brw->cache, BRW_VS_PROG,
 			 &key, sizeof(key),
 			 &brw->vs.prog_offset, &brw->vs.prog_data)) {
-      do_vs_prog(brw, vp, &key);
+      bool success = do_vs_prog(brw, ctx->Shader.CurrentVertexProgram,
+				vp, &key);
+
+      assert(success);
    }
    brw->vs.constant_map = ((int8_t *)brw->vs.prog_data +
 			   sizeof(*brw->vs.prog_data));
 }
 
-
 /* See brw_vs.c:
  */
 const struct brw_tracked_state brw_vs_prog = {
@@ -174,3 +201,30 @@ const struct brw_tracked_state brw_vs_prog = {
    },
    .prepare = brw_upload_vs_prog
 };
+
+bool
+brw_vs_precompile(struct gl_context *ctx, struct gl_shader_program *prog)
+{
+   struct brw_context *brw = brw_context(ctx);
+   struct brw_vs_prog_key key;
+   struct gl_vertex_program *vp = prog->VertexProgram;
+   struct brw_vertex_program *bvp = brw_vertex_program(vp);
+   uint32_t old_prog_offset = brw->vs.prog_offset;
+   struct brw_vs_prog_data *old_prog_data = brw->vs.prog_data;
+   bool success;
+
+   if (!vp)
+      return true;
+
+   memset(&key, 0, sizeof(key));
+
+   key.program_string_id = bvp->id;
+   key.clamp_vertex_color = true;
+
+   success = do_vs_prog(brw, prog, bvp, &key);
+
+   brw->vs.prog_offset = old_prog_offset;
+   brw->vs.prog_data = old_prog_data;
+
+   return success;
+}
diff --git a/src/mesa/drivers/dri/i965/brw_vs.h b/src/mesa/drivers/dri/i965/brw_vs.h
index 432994a8534..beccb381ee2 100644
--- a/src/mesa/drivers/dri/i965/brw_vs.h
+++ b/src/mesa/drivers/dri/i965/brw_vs.h
@@ -66,6 +66,7 @@ struct brw_vs_compile {
    GLuint first_output;
    GLuint nr_outputs;
    GLuint first_overflow_output; /**< VERT_ATTRIB_x */
+   GLuint last_scratch;
 
    GLuint first_tmp;
    GLuint last_tmp;
@@ -92,6 +93,8 @@ struct brw_vs_compile {
    GLboolean needs_stack;
 };
 
-void brw_vs_emit( struct brw_vs_compile *c );
+bool brw_vs_emit(struct gl_shader_program *prog, struct brw_vs_compile *c);
+void brw_old_vs_emit(struct brw_vs_compile *c);
+bool brw_vs_precompile(struct gl_context *ctx, struct gl_shader_program *prog);
 
 #endif
diff --git a/src/mesa/drivers/dri/i965/brw_vs_constval.c b/src/mesa/drivers/dri/i965/brw_vs_constval.c
index 9fdfebe9f76..47cc0a7da7a 100644
--- a/src/mesa/drivers/dri/i965/brw_vs_constval.c
+++ b/src/mesa/drivers/dri/i965/brw_vs_constval.c
@@ -194,19 +194,11 @@ static void calc_wm_input_sizes( struct brw_context *brw )
    /* BRW_NEW_VERTEX_PROGRAM */
    const struct brw_vertex_program *vp =
       brw_vertex_program_const(brw->vertex_program);
-   /* BRW_NEW_FRAGMENT_PROGRAM */
-   struct gl_shader_program *prog = ctx->Shader.CurrentFragmentProgram;
    /* BRW_NEW_INPUT_DIMENSIONS */
    struct tracker t;
    GLuint insn;
    GLuint i;
 
-   /* If we're going to go through brw_fs.cpp, we don't end up using
-    * brw->wm.input_size_masks.
-    */
-   if (prog && prog->_LinkedShaders[MESA_SHADER_FRAGMENT])
-      return;
-
    memset(&t, 0, sizeof(t));
 
    /* _NEW_LIGHT */
@@ -246,9 +238,7 @@ static void calc_wm_input_sizes( struct brw_context *brw )
 const struct brw_tracked_state brw_wm_input_sizes = {
    .dirty = {
       .mesa  = _NEW_LIGHT,
-      .brw   = (BRW_NEW_FRAGMENT_PROGRAM |
-		BRW_NEW_VERTEX_PROGRAM |
-		BRW_NEW_INPUT_DIMENSIONS),
+      .brw   = BRW_NEW_VERTEX_PROGRAM | BRW_NEW_INPUT_DIMENSIONS,
       .cache = 0
    },
    .prepare = calc_wm_input_sizes
diff --git a/src/mesa/drivers/dri/i965/brw_vs_emit.c b/src/mesa/drivers/dri/i965/brw_vs_emit.c
index 9d733344a26..bfee811e13d 100644
--- a/src/mesa/drivers/dri/i965/brw_vs_emit.c
+++ b/src/mesa/drivers/dri/i965/brw_vs_emit.c
@@ -1096,31 +1096,6 @@ static void emit_lrp_noalias(struct brw_vs_compile *c,
    brw_MAC(p, dst, arg0, arg1);
 }
 
-/** 3 or 4-component vector normalization */
-static void emit_nrm( struct brw_vs_compile *c, 
-                      struct brw_reg dst,
-                      struct brw_reg arg0,
-                      int num_comps)
-{
-   struct brw_compile *p = &c->func;
-   struct brw_reg tmp = get_tmp(c);
-
-   /* tmp = dot(arg0, arg0) */
-   if (num_comps == 3)
-      brw_DP3(p, tmp, arg0, arg0);
-   else
-      brw_DP4(p, tmp, arg0, arg0);
-
-   /* tmp = 1 / sqrt(tmp) */
-   emit_math1(c, BRW_MATH_FUNCTION_RSQ, tmp, tmp, BRW_MATH_PRECISION_FULL);
-
-   /* dst = arg0 * tmp */
-   brw_MUL(p, dst, arg0, tmp);
-
-   release_tmp(c, tmp);
-}
-
-
 static struct brw_reg
 get_constant(struct brw_vs_compile *c,
              const struct prog_instruction *inst,
@@ -1359,7 +1334,7 @@ get_src_reg( struct brw_vs_compile *c,
 
 	 if (component >= 0) {
 	    params = c->vp->program.Base.Parameters;
-	    f = params->ParameterValues[src->Index][component];
+	    f = params->ParameterValues[src->Index][component].f;
 
 	    if (src->Abs)
 	       f = fabs(f);
@@ -1821,6 +1796,9 @@ accumulator_contains(struct brw_vs_compile *c, struct brw_reg val)
    if (val.address_mode != BRW_ADDRESS_DIRECT)
       return GL_FALSE;
 
+   if (val.negate || val.abs)
+      return GL_FALSE;
+
    switch (prev_insn->header.opcode) {
    case BRW_OPCODE_MOV:
    case BRW_OPCODE_MAC:
@@ -1900,7 +1878,7 @@ brw_vs_rescale_gl_fixed(struct brw_vs_compile *c)
 
 /* Emit the vertex program instructions here.
  */
-void brw_vs_emit(struct brw_vs_compile *c )
+void brw_old_vs_emit(struct brw_vs_compile *c )
 {
 #define MAX_IF_DEPTH 32
 #define MAX_LOOP_DEPTH 32
@@ -1980,9 +1958,22 @@ void brw_vs_emit(struct brw_vs_compile *c )
 	      const struct prog_src_register *src = &inst->SrcReg[i];
 	      index = src->Index;
 	      file = src->File;	
-	      if (file == PROGRAM_OUTPUT && c->output_regs[index].used_in_src)
-		  args[i] = c->output_regs[index].reg;
-	      else
+	      if (file == PROGRAM_OUTPUT && c->output_regs[index].used_in_src) {
+		 /* Can't just make get_arg "do the right thing" here because
+		  * other callers of get_arg and get_src_reg don't expect any
+		  * special behavior for the c->output_regs[index].used_in_src
+		  * case.
+		  */
+		 args[i] = c->output_regs[index].reg;
+		 args[i].dw1.bits.swizzle =
+		    BRW_SWIZZLE4(GET_SWZ(src->Swizzle, 0),
+				 GET_SWZ(src->Swizzle, 1),
+				 GET_SWZ(src->Swizzle, 2),
+				 GET_SWZ(src->Swizzle, 3));
+
+		 /* Note this is ok for non-swizzle ARB_vp instructions */
+		 args[i].negate = src->Negate ? 1 : 0;
+	      } else
                   args[i] = get_arg(c, inst, i);
 	  }
 
@@ -1993,7 +1984,11 @@ void brw_vs_emit(struct brw_vs_compile *c )
       index = inst->DstReg.Index;
       file = inst->DstReg.File;
       if (file == PROGRAM_OUTPUT && c->output_regs[index].used_in_src)
-	  dst = c->output_regs[index].reg;
+	 /* Can't just make get_dst "do the right thing" here because other
+	  * callers of get_dst don't expect any special behavior for the
+	  * c->output_regs[index].used_in_src case.
+	  */
+	 dst = brw_writemask(c->output_regs[index].reg, inst->DstReg.WriteMask);
       else
 	  dst = get_dst(c, inst->DstReg);
 
@@ -2025,12 +2020,6 @@ void brw_vs_emit(struct brw_vs_compile *c )
       case OPCODE_DPH:
 	 brw_DPH(p, dst, args[0], args[1]);
 	 break;
-      case OPCODE_NRM3:
-	 emit_nrm(c, dst, args[0], 3);
-	 break;
-      case OPCODE_NRM4:
-	 emit_nrm(c, dst, args[0], 4);
-	 break;
       case OPCODE_DST:
 	 unalias2(c, dst, args[0], args[1], emit_dst_noalias); 
 	 break;
diff --git a/src/mesa/drivers/dri/i965/brw_vs_state.c b/src/mesa/drivers/dri/i965/brw_vs_state.c
index fc4373ab311..29b3e47ab0c 100644
--- a/src/mesa/drivers/dri/i965/brw_vs_state.c
+++ b/src/mesa/drivers/dri/i965/brw_vs_state.c
@@ -77,6 +77,16 @@ brw_prepare_vs_unit(struct brw_context *brw)
    else
       vs->thread1.binding_table_entry_count = brw->vs.nr_surfaces;
 
+   if (brw->vs.prog_data->total_scratch != 0) {
+      vs->thread2.scratch_space_base_pointer =
+	 brw->vs.scratch_bo->offset >> 10; /* reloc */
+      vs->thread2.per_thread_scratch_space =
+	 ffs(brw->vs.prog_data->total_scratch) - 11;
+   } else {
+      vs->thread2.scratch_space_base_pointer = 0;
+      vs->thread2.per_thread_scratch_space = 0;
+   }
+
    vs->thread3.urb_entry_read_length = brw->vs.prog_data->urb_read_length;
    vs->thread3.const_urb_entry_read_length = brw->vs.prog_data->curb_read_length;
    vs->thread3.dispatch_grf_start_reg = 1;
diff --git a/src/mesa/drivers/dri/i965/brw_vtbl.c b/src/mesa/drivers/dri/i965/brw_vtbl.c
index 55dbd4fa8b0..40360b23fff 100644
--- a/src/mesa/drivers/dri/i965/brw_vtbl.c
+++ b/src/mesa/drivers/dri/i965/brw_vtbl.c
@@ -213,6 +213,7 @@ static void brw_new_batch( struct intel_context *intel )
    brw->state_batch_count = 0;
 
    brw->vb.nr_current_buffers = 0;
+   brw->ib.type = -1;
 
    /* Mark that the current program cache BO has been used by the GPU.
     * It will be reallocated if we need to put new programs in for the
diff --git a/src/mesa/drivers/dri/i965/brw_wm.c b/src/mesa/drivers/dri/i965/brw_wm.c
index b0dfdd536aa..e76832515fe 100644
--- a/src/mesa/drivers/dri/i965/brw_wm.c
+++ b/src/mesa/drivers/dri/i965/brw_wm.c
@@ -206,10 +206,6 @@ bool do_wm_prog(struct brw_context *brw,
           */
          return false;
       }
-      c->instruction = rzalloc_array(c, struct brw_wm_instruction, BRW_WM_MAX_INSN);
-      c->prog_instructions = rzalloc_array(c, struct prog_instruction, BRW_WM_MAX_INSN);
-      c->vreg = rzalloc_array(c, struct brw_wm_value, BRW_WM_MAX_VREG);
-      c->refs = rzalloc_array(c, struct brw_wm_ref, BRW_WM_MAX_REF);
    } else {
       void *instruction = c->instruction;
       void *prog_instructions = c->prog_instructions;
@@ -232,6 +228,13 @@ bool do_wm_prog(struct brw_context *brw,
       if (!brw_wm_fs_emit(brw, c, prog))
 	 return false;
    } else {
+      if (!c->instruction) {
+	 c->instruction = rzalloc_array(c, struct brw_wm_instruction, BRW_WM_MAX_INSN);
+	 c->prog_instructions = rzalloc_array(c, struct prog_instruction, BRW_WM_MAX_INSN);
+	 c->vreg = rzalloc_array(c, struct brw_wm_value, BRW_WM_MAX_VREG);
+	 c->refs = rzalloc_array(c, struct brw_wm_ref, BRW_WM_MAX_REF);
+      }
+
       /* Fallback for fixed function and ARB_fp shaders. */
       c->dispatch_width = 16;
       brw_wm_payload_setup(brw, c);
@@ -241,29 +244,10 @@ bool do_wm_prog(struct brw_context *brw,
 
    /* Scratch space is used for register spilling */
    if (c->last_scratch) {
-      uint32_t total_scratch;
-
-      /* Per-thread scratch space is power-of-two sized. */
-      for (c->prog_data.total_scratch = 1024;
-	   c->prog_data.total_scratch <= c->last_scratch;
-	   c->prog_data.total_scratch *= 2) {
-	 /* empty */
-      }
-      total_scratch = c->prog_data.total_scratch * brw->wm_max_threads;
+      c->prog_data.total_scratch = brw_get_scratch_size(c->last_scratch);
 
-      if (brw->wm.scratch_bo && total_scratch > brw->wm.scratch_bo->size) {
-	 drm_intel_bo_unreference(brw->wm.scratch_bo);
-	 brw->wm.scratch_bo = NULL;
-      }
-      if (brw->wm.scratch_bo == NULL) {
-	 brw->wm.scratch_bo = drm_intel_bo_alloc(intel->bufmgr,
-						 "wm scratch",
-						 total_scratch,
-						 4096);
-      }
-   }
-   else {
-      c->prog_data.total_scratch = 0;
+      brw_get_scratch_bo(intel, &brw->wm.scratch_bo,
+			 c->prog_data.total_scratch * brw->wm_max_threads);
    }
 
    if (unlikely(INTEL_DEBUG & DEBUG_WM))
diff --git a/src/mesa/drivers/dri/i965/brw_wm_emit.c b/src/mesa/drivers/dri/i965/brw_wm_emit.c
index f61757a8cac..6ea4a7d6e50 100644
--- a/src/mesa/drivers/dri/i965/brw_wm_emit.c
+++ b/src/mesa/drivers/dri/i965/brw_wm_emit.c
@@ -1094,9 +1094,16 @@ void emit_tex(struct brw_wm_compile *c,
    if (intel->gen < 5 && c->dispatch_width == 8)
       nr_texcoords = 3;
 
-   /* For shadow comparisons, we have to supply u,v,r. */
-   if (shadow)
-      nr_texcoords = 3;
+   if (shadow) {
+      if (intel->gen < 7) {
+	 /* For shadow comparisons, we have to supply u,v,r. */
+	 nr_texcoords = 3;
+      } else {
+	 /* On Ivybridge, the shadow comparitor comes first. Just load it. */
+	 brw_MOV(p, brw_message_reg(cur_mrf), arg[2]);
+	 cur_mrf += mrf_per_channel;
+      }
+   }
 
    /* Emit the texcoords. */
    for (i = 0; i < nr_texcoords; i++) {
@@ -1113,7 +1120,7 @@ void emit_tex(struct brw_wm_compile *c,
    }
 
    /* Fill in the shadow comparison reference value. */
-   if (shadow) {
+   if (shadow && intel->gen < 7) {
       if (intel->gen >= 5) {
 	 /* Fill in the cube map array index value. */
 	 brw_MOV(p, brw_message_reg(cur_mrf), brw_imm_f(0));
diff --git a/src/mesa/drivers/dri/i965/brw_wm_fp.c b/src/mesa/drivers/dri/i965/brw_wm_fp.c
index 7cd3edad235..bd46bd8de43 100644
--- a/src/mesa/drivers/dri/i965/brw_wm_fp.c
+++ b/src/mesa/drivers/dri/i965/brw_wm_fp.c
@@ -535,15 +535,15 @@ static struct prog_src_register search_or_add_const4f( struct brw_wm_compile *c,
 						     GLfloat s3)
 {
    struct gl_program_parameter_list *paramList = c->fp->program.Base.Parameters;
-   GLfloat values[4];
+   gl_constant_value values[4];
    GLuint idx;
    GLuint swizzle;
    struct prog_src_register reg;
 
-   values[0] = s0;
-   values[1] = s1;
-   values[2] = s2;
-   values[3] = s3;
+   values[0].f = s0;
+   values[1].f = s1;
+   values[2].f = s2;
+   values[3].f = s3;
 
    idx = _mesa_add_unnamed_constant( paramList, values, 4, &swizzle );
    reg = src_reg(PROGRAM_STATE_VAR, idx);
@@ -664,6 +664,8 @@ static void precalc_lit( struct brw_wm_compile *c,
 static void precalc_tex( struct brw_wm_compile *c,
 			 const struct prog_instruction *inst )
 {
+   struct brw_compile *p = &c->func;
+   struct intel_context *intel = &p->brw->intel;
    struct prog_src_register coord;
    struct prog_dst_register tmpcoord = { 0 };
    const GLuint unit = c->fp->program.Base.SamplerUnits[inst->TexSrcUnit];
@@ -727,7 +729,7 @@ static void precalc_tex( struct brw_wm_compile *c,
        release_temp(c, tmp0);
        release_temp(c, tmp1);
    }
-   else if (inst->TexSrcTarget == TEXTURE_RECT_INDEX) {
+   else if (intel->gen < 6 && inst->TexSrcTarget == TEXTURE_RECT_INDEX) {
       struct prog_src_register scale = 
 	 search_or_add_param5( c, 
 			       STATE_INTERNAL, 
diff --git a/src/mesa/drivers/dri/i965/brw_wm_pass0.c b/src/mesa/drivers/dri/i965/brw_wm_pass0.c
index f78bdc31866..ccf9dc2bc18 100644
--- a/src/mesa/drivers/dri/i965/brw_wm_pass0.c
+++ b/src/mesa/drivers/dri/i965/brw_wm_pass0.c
@@ -205,14 +205,14 @@ static const struct brw_wm_ref *pass0_get_reg( struct brw_wm_compile *c,
 	 case PROGRAM_CONSTANT:
 	    /* These are invarient:
 	     */
-	    ref = get_const_ref(c, &plist->ParameterValues[idx][component]);
+	    ref = get_const_ref(c, &plist->ParameterValues[idx][component].f);
 	    break;
 
 	 case PROGRAM_STATE_VAR:
 	 case PROGRAM_UNIFORM:
 	    /* These may change from run to run:
 	     */
-	    ref = get_param_ref(c, &plist->ParameterValues[idx][component] );
+	    ref = get_param_ref(c, &plist->ParameterValues[idx][component].f );
 	    break;
 
 	 default:
diff --git a/src/mesa/drivers/dri/i965/brw_wm_sampler_state.c b/src/mesa/drivers/dri/i965/brw_wm_sampler_state.c
index 98146136703..6834ebad780 100644
--- a/src/mesa/drivers/dri/i965/brw_wm_sampler_state.c
+++ b/src/mesa/drivers/dri/i965/brw_wm_sampler_state.c
@@ -289,6 +289,13 @@ static void brw_update_sampler_state(struct brw_context *brw,
    sampler->ss1.max_lod = U_FIXED(CLAMP(gl_sampler->MaxLod, 0, 13), 6);
    sampler->ss1.min_lod = U_FIXED(CLAMP(gl_sampler->MinLod, 0, 13), 6);
 
+   /* On Gen6+, the sampler can handle non-normalized texture
+    * rectangle coordinates natively
+    */
+   if (intel->gen >= 6 && texObj->Target == GL_TEXTURE_RECTANGLE) {
+      sampler->ss3.non_normalized_coord = 1;
+   }
+
    upload_default_color(brw, gl_sampler, unit);
 
    if (intel->gen >= 6) {
diff --git a/src/mesa/drivers/dri/i965/brw_wm_surface_state.c b/src/mesa/drivers/dri/i965/brw_wm_surface_state.c
index fb4fb146f8d..ad909789d82 100644
--- a/src/mesa/drivers/dri/i965/brw_wm_surface_state.c
+++ b/src/mesa/drivers/dri/i965/brw_wm_surface_state.c
@@ -342,7 +342,7 @@ prepare_wm_pull_constants(struct brw_context *brw)
    constants = brw->wm.const_bo->virtual;
    for (i = 0; i < brw->wm.prog_data->nr_pull_params; i++) {
       constants[i] = convert_param(brw->wm.prog_data->pull_param_convert[i],
-				   *brw->wm.prog_data->pull_param[i]);
+				   brw->wm.prog_data->pull_param[i]);
    }
    drm_intel_gem_bo_unmap_gtt(brw->wm.const_bo);
 
diff --git a/src/mesa/drivers/dri/i965/gen6_vs_state.c b/src/mesa/drivers/dri/i965/gen6_vs_state.c
index fb4cdbaadf9..b94121e8437 100644
--- a/src/mesa/drivers/dri/i965/gen6_vs_state.c
+++ b/src/mesa/drivers/dri/i965/gen6_vs_state.c
@@ -81,12 +81,21 @@ gen6_prepare_vs_push_constants(struct brw_context *brw)
 	 params_uploaded++;
       }
 
-      for (i = 0; i < vp->program.Base.Parameters->NumParameters; i++) {
-	 if (brw->vs.constant_map[i] != -1) {
-	    memcpy(param + brw->vs.constant_map[i] * 4,
-		   vp->program.Base.Parameters->ParameterValues[i],
-		   4 * sizeof(float));
-	    params_uploaded++;
+      if (brw->vs.prog_data->uses_new_param_layout) {
+	 for (i = 0; i < brw->vs.prog_data->nr_params; i++) {
+	    *param = convert_param(brw->vs.prog_data->param_convert[i],
+				   brw->vs.prog_data->param[i]);
+	    param++;
+	 }
+	 params_uploaded += brw->vs.prog_data->nr_params / 4;
+      } else {
+	 for (i = 0; i < vp->program.Base.Parameters->NumParameters; i++) {
+	    if (brw->vs.constant_map[i] != -1) {
+	       memcpy(param + brw->vs.constant_map[i] * 4,
+		      vp->program.Base.Parameters->ParameterValues[i],
+		      4 * sizeof(float));
+	       params_uploaded++;
+	    }
 	 }
       }
 
@@ -151,7 +160,15 @@ upload_vs_state(struct brw_context *brw)
    OUT_BATCH((0 << GEN6_VS_SAMPLER_COUNT_SHIFT) |
 	     GEN6_VS_FLOATING_POINT_MODE_ALT |
 	     (brw->vs.nr_surfaces << GEN6_VS_BINDING_TABLE_ENTRY_COUNT_SHIFT));
-   OUT_BATCH(0); /* scratch space base offset */
+
+   if (brw->vs.prog_data->total_scratch) {
+      OUT_RELOC(brw->vs.scratch_bo,
+		I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER,
+		ffs(brw->vs.prog_data->total_scratch) - 11);
+   } else {
+      OUT_BATCH(0);
+   }
+
    OUT_BATCH((1 << GEN6_VS_DISPATCH_START_GRF_SHIFT) |
 	     (brw->vs.prog_data->urb_read_length << GEN6_VS_URB_READ_LENGTH_SHIFT) |
 	     (0 << GEN6_VS_URB_ENTRY_READ_OFFSET_SHIFT));
@@ -160,6 +177,32 @@ upload_vs_state(struct brw_context *brw)
 	     GEN6_VS_STATISTICS_ENABLE |
 	     GEN6_VS_ENABLE);
    ADVANCE_BATCH();
+
+   /* Based on my reading of the simulator, the VS constants don't get
+    * pulled into the VS FF unit until an appropriate pipeline flush
+    * happens, and instead the 3DSTATE_CONSTANT_VS packet just adds
+    * references to them into a little FIFO.  The flushes are common,
+    * but don't reliably happen between this and a 3DPRIMITIVE, causing
+    * the primitive to use the wrong constants.  Then the FIFO
+    * containing the constant setup gets added to again on the next
+    * constants change, and eventually when a flush does happen the
+    * unit is overwhelmed by constant changes and dies.
+    *
+    * To avoid this, send a PIPE_CONTROL down the line that will
+    * update the unit immediately loading the constants.  The flush
+    * type bits here were those set by the STATE_BASE_ADDRESS whose
+    * move in a82a43e8d99e1715dd11c9c091b5ab734079b6a6 triggered the
+    * bug reports that led to this workaround, and may be more than
+    * what is strictly required to avoid the issue.
+    */
+   BEGIN_BATCH(4);
+   OUT_BATCH(_3DSTATE_PIPE_CONTROL);
+   OUT_BATCH(PIPE_CONTROL_DEPTH_STALL |
+	     PIPE_CONTROL_INSTRUCTION_FLUSH |
+	     PIPE_CONTROL_STATE_CACHE_INVALIDATE);
+   OUT_BATCH(0); /* address */
+   OUT_BATCH(0); /* write data */
+   ADVANCE_BATCH();
 }
 
 const struct brw_tracked_state gen6_vs_state = {
diff --git a/src/mesa/drivers/dri/i965/gen6_wm_state.c b/src/mesa/drivers/dri/i965/gen6_wm_state.c
index 185da9c355f..07e9995f53b 100644
--- a/src/mesa/drivers/dri/i965/gen6_wm_state.c
+++ b/src/mesa/drivers/dri/i965/gen6_wm_state.c
@@ -54,14 +54,14 @@ gen6_prepare_wm_push_constants(struct brw_context *brw)
       float *constants;
       unsigned int i;
 
-      constants = brw_state_batch(brw, AUB_TRACE_NO_TYPE,
+      constants = brw_state_batch(brw, AUB_TRACE_WM_CONSTANTS,
 				  brw->wm.prog_data->nr_params *
 				  sizeof(float),
 				  32, &brw->wm.push_const_offset);
 
       for (i = 0; i < brw->wm.prog_data->nr_params; i++) {
 	 constants[i] = convert_param(brw->wm.prog_data->param_convert[i],
-				      *brw->wm.prog_data->param[i]);
+				      brw->wm.prog_data->param[i]);
       }
 
       if (0) {
diff --git a/src/mesa/drivers/dri/i965/gen7_sampler_state.c b/src/mesa/drivers/dri/i965/gen7_sampler_state.c
index e787c21f4d1..aee67c87472 100644
--- a/src/mesa/drivers/dri/i965/gen7_sampler_state.c
+++ b/src/mesa/drivers/dri/i965/gen7_sampler_state.c
@@ -157,6 +157,13 @@ gen7_update_sampler_state(struct brw_context *brw, int unit,
    sampler->ss1.max_lod = U_FIXED(CLAMP(gl_sampler->MaxLod, 0, 13), 8);
    sampler->ss1.min_lod = U_FIXED(CLAMP(gl_sampler->MinLod, 0, 13), 8);
 
+   /* The sampler can handle non-normalized texture rectangle coordinates
+    * natively
+    */
+   if (texObj->Target == GL_TEXTURE_RECTANGLE) {
+      sampler->ss3.non_normalized_coord = 1;
+   }
+
    upload_default_color(brw, gl_sampler, unit);
 
    sampler->ss2.default_color_pointer = brw->wm.sdc_offset[unit] >> 5;
diff --git a/src/mesa/drivers/dri/i965/gen7_vs_state.c b/src/mesa/drivers/dri/i965/gen7_vs_state.c
index 0fad3d2fb68..f3cd5d15bf0 100644
--- a/src/mesa/drivers/dri/i965/gen7_vs_state.c
+++ b/src/mesa/drivers/dri/i965/gen7_vs_state.c
@@ -71,7 +71,15 @@ upload_vs_state(struct brw_context *brw)
    OUT_BATCH((0 << GEN6_VS_SAMPLER_COUNT_SHIFT) |
 	     GEN6_VS_FLOATING_POINT_MODE_ALT |
 	     (brw->vs.nr_surfaces << GEN6_VS_BINDING_TABLE_ENTRY_COUNT_SHIFT));
-   OUT_BATCH(0); /* scratch space base offset */
+
+   if (brw->vs.prog_data->total_scratch) {
+      OUT_RELOC(brw->vs.scratch_bo,
+		I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER,
+		ffs(brw->vs.prog_data->total_scratch) - 11);
+   } else {
+      OUT_BATCH(0);
+   }
+
    OUT_BATCH((1 << GEN6_VS_DISPATCH_START_GRF_SHIFT) |
 	     (brw->vs.prog_data->urb_read_length << GEN6_VS_URB_READ_LENGTH_SHIFT) |
 	     (0 << GEN6_VS_URB_ENTRY_READ_OFFSET_SHIFT));
diff --git a/src/mesa/drivers/dri/i965/gen7_wm_state.c b/src/mesa/drivers/dri/i965/gen7_wm_state.c
index a102ca772b3..55a603e887a 100644
--- a/src/mesa/drivers/dri/i965/gen7_wm_state.c
+++ b/src/mesa/drivers/dri/i965/gen7_wm_state.c
@@ -58,7 +58,7 @@ gen7_prepare_wm_constants(struct brw_context *brw)
 
       for (i = 0; i < brw->wm.prog_data->nr_params; i++) {
 	 constants[i] = convert_param(brw->wm.prog_data->param_convert[i],
-				      *brw->wm.prog_data->param[i]);
+				      brw->wm.prog_data->param[i]);
       }
 
       if (0) {
@@ -228,7 +228,13 @@ upload_ps_state(struct brw_context *brw)
    OUT_BATCH(_3DSTATE_PS << 16 | (8 - 2));
    OUT_BATCH(brw->wm.prog_offset);
    OUT_BATCH(dw2);
-   OUT_BATCH(0); /* scratch space base offset */
+   if (brw->wm.prog_data->total_scratch) {
+      OUT_RELOC(brw->wm.scratch_bo,
+		I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER,
+		ffs(brw->wm.prog_data->total_scratch) - 11);
+   } else {
+      OUT_BATCH(0);
+   }
    OUT_BATCH(dw4);
    OUT_BATCH(dw5);
    OUT_BATCH(0); /* kernel 1 pointer */
diff --git a/src/mesa/drivers/dri/intel/intel_batchbuffer.c b/src/mesa/drivers/dri/intel/intel_batchbuffer.c
index b61a2ffef19..db4343be10c 100644
--- a/src/mesa/drivers/dri/intel/intel_batchbuffer.c
+++ b/src/mesa/drivers/dri/intel/intel_batchbuffer.c
@@ -308,12 +308,29 @@ emit:
  * [Dev-SNB{W/A}]: Before a PIPE_CONTROL with Write Cache Flush Enable
  * =1, a PIPE_CONTROL with any non-zero post-sync-op is required.
  *
- * XXX: There is also a workaround that would appear to apply to this
- * workaround, but it doesn't appear to be necessary so far:
+ * And the workaround for these two requires this workaround first:
  *
- * Dev-SNB{W/A}]: Pipe-control with CS-stall bit set must be sent
+ * [Dev-SNB{W/A}]: Pipe-control with CS-stall bit set must be sent
  * BEFORE the pipe-control with a post-sync op and no write-cache
  * flushes.
+ *
+ * And this last workaround is tricky because of the requirements on
+ * that bit.  From section 1.4.7.2.3 "Stall" of the Sandy Bridge PRM
+ * volume 2 part 1:
+ *
+ *     "1 of the following must also be set:
+ *      - Render Target Cache Flush Enable ([12] of DW1)
+ *      - Depth Cache Flush Enable ([0] of DW1)
+ *      - Stall at Pixel Scoreboard ([1] of DW1)
+ *      - Depth Stall ([13] of DW1)
+ *      - Post-Sync Operation ([13] of DW1)
+ *      - Notify Enable ([8] of DW1)"
+ *
+ * The cache flushes require the workaround flush that triggered this
+ * one, so we can't use it.  Depth stall would trigger the same.
+ * Post-sync nonzero is what triggered this second workaround, so we
+ * can't use that one either.  Notify enable is IRQs, which aren't
+ * really our business.  That leaves only stall at scoreboard.
  */
 void
 intel_emit_post_sync_nonzero_flush(struct intel_context *intel)
@@ -323,9 +340,17 @@ intel_emit_post_sync_nonzero_flush(struct intel_context *intel)
 
    BEGIN_BATCH(4);
    OUT_BATCH(_3DSTATE_PIPE_CONTROL);
+   OUT_BATCH(PIPE_CONTROL_CS_STALL |
+	     PIPE_CONTROL_STALL_AT_SCOREBOARD);
+   OUT_BATCH(0); /* address */
+   OUT_BATCH(0); /* write data */
+   ADVANCE_BATCH();
+
+   BEGIN_BATCH(4);
+   OUT_BATCH(_3DSTATE_PIPE_CONTROL);
    OUT_BATCH(PIPE_CONTROL_WRITE_IMMEDIATE);
    OUT_RELOC(intel->batch.workaround_bo,
-	     I915_GEM_DOMAIN_GTT, I915_GEM_DOMAIN_GTT, 0);
+	     I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION, 0);
    OUT_BATCH(0); /* write data */
    ADVANCE_BATCH();
 
@@ -365,6 +390,7 @@ intel_batchbuffer_emit_mi_flush(struct intel_context *intel)
 	 OUT_BATCH(PIPE_CONTROL_INSTRUCTION_FLUSH |
 		   PIPE_CONTROL_WRITE_FLUSH |
 		   PIPE_CONTROL_DEPTH_CACHE_FLUSH |
+		   PIPE_CONTROL_TC_FLUSH |
 		   PIPE_CONTROL_NO_WRITE);
 	 OUT_BATCH(0); /* write address */
 	 OUT_BATCH(0); /* write data */
diff --git a/src/mesa/drivers/dri/intel/intel_blit.c b/src/mesa/drivers/dri/intel/intel_blit.c
index 30be1b9382f..b18dd2922d9 100644
--- a/src/mesa/drivers/dri/intel/intel_blit.c
+++ b/src/mesa/drivers/dri/intel/intel_blit.c
@@ -541,8 +541,8 @@ intel_set_teximage_alpha_to_one(struct gl_context *ctx,
 
    /* get dest x/y in destination texture */
    intel_miptree_get_image_offset(intel_image->mt,
-				  intel_image->level,
-				  intel_image->face,
+				  intel_image->base.Level,
+				  intel_image->base.Face,
 				  0,
 				  &image_x, &image_y);
 
diff --git a/src/mesa/drivers/dri/intel/intel_buffer_objects.c b/src/mesa/drivers/dri/intel/intel_buffer_objects.c
index 439d6fc8247..d908975fc87 100644
--- a/src/mesa/drivers/dri/intel/intel_buffer_objects.c
+++ b/src/mesa/drivers/dri/intel/intel_buffer_objects.c
@@ -41,8 +41,7 @@
 #include "intel_regions.h"
 
 static GLboolean
-intel_bufferobj_unmap(struct gl_context * ctx,
-                      GLenum target, struct gl_buffer_object *obj);
+intel_bufferobj_unmap(struct gl_context * ctx, struct gl_buffer_object *obj);
 
 /** Allocates a new drm_intel_bo to store the data for the buffer object. */
 static void
@@ -122,7 +121,7 @@ intel_bufferobj_free(struct gl_context * ctx, struct gl_buffer_object *obj)
     * (though it does if you call glDeleteBuffers)
     */
    if (obj->Pointer)
-      intel_bufferobj_unmap(ctx, 0, obj);
+      intel_bufferobj_unmap(ctx, obj);
 
    free(intel_obj->sys_buffer);
    if (intel_obj->region) {
@@ -203,7 +202,6 @@ intel_bufferobj_data(struct gl_context * ctx,
  */
 static void
 intel_bufferobj_subdata(struct gl_context * ctx,
-                        GLenum target,
                         GLintptrARB offset,
                         GLsizeiptrARB size,
                         const GLvoid * data, struct gl_buffer_object *obj)
@@ -276,82 +274,28 @@ intel_bufferobj_subdata(struct gl_context * ctx,
  */
 static void
 intel_bufferobj_get_subdata(struct gl_context * ctx,
-                            GLenum target,
                             GLintptrARB offset,
                             GLsizeiptrARB size,
                             GLvoid * data, struct gl_buffer_object *obj)
 {
    struct intel_buffer_object *intel_obj = intel_buffer_object(obj);
+   struct intel_context *intel = intel_context(ctx);
 
    assert(intel_obj);
    if (intel_obj->sys_buffer)
       memcpy(data, (char *)intel_obj->sys_buffer + offset, size);
-   else
-      drm_intel_bo_get_subdata(intel_obj->buffer, offset, size, data);
-}
-
-
-
-/**
- * Called via glMapBufferARB().
- */
-static void *
-intel_bufferobj_map(struct gl_context * ctx,
-                    GLenum target,
-                    GLenum access, struct gl_buffer_object *obj)
-{
-   struct intel_context *intel = intel_context(ctx);
-   struct intel_buffer_object *intel_obj = intel_buffer_object(obj);
-   GLboolean read_only = (access == GL_READ_ONLY_ARB);
-   GLboolean write_only = (access == GL_WRITE_ONLY_ARB);
-
-   assert(intel_obj);
-
-   if (intel_obj->sys_buffer) {
-      if (!read_only && intel_obj->source) {
-	 release_buffer(intel_obj);
-      }
-
-      if (!intel_obj->buffer || intel_obj->source) {
-	 obj->Pointer = intel_obj->sys_buffer;
-	 obj->Length = obj->Size;
-	 obj->Offset = 0;
-	 return obj->Pointer;
+   else {
+      if (drm_intel_bo_references(intel->batch.bo, intel_obj->buffer)) {
+	 intel_batchbuffer_flush(intel);
       }
-
-      free(intel_obj->sys_buffer);
-      intel_obj->sys_buffer = NULL;
-   }
-
-   /* Flush any existing batchbuffer that might reference this data. */
-   if (drm_intel_bo_references(intel->batch.bo, intel_obj->buffer))
-      intel_flush(ctx);
-
-   if (intel_obj->region)
-      intel_bufferobj_cow(intel, intel_obj);
-
-   if (intel_obj->buffer == NULL) {
-      obj->Pointer = NULL;
-      return NULL;
-   }
-
-   if (write_only) {
-      drm_intel_gem_bo_map_gtt(intel_obj->buffer);
-      intel_obj->mapped_gtt = GL_TRUE;
-   } else {
-      drm_intel_bo_map(intel_obj->buffer, !read_only);
-      intel_obj->mapped_gtt = GL_FALSE;
+      drm_intel_bo_get_subdata(intel_obj->buffer, offset, size, data);
    }
+}
 
-   obj->Pointer = intel_obj->buffer->virtual;
-   obj->Length = obj->Size;
-   obj->Offset = 0;
 
-   return obj->Pointer;
-}
 
 /**
- * Called via glMapBufferRange().
+ * Called via glMapBufferRange and glMapBuffer
  *
  * The goal of this extension is to allow apps to accumulate their rendering
  * at the same time as they accumulate their buffer object.  Without it,
@@ -368,12 +312,11 @@ intel_bufferobj_map(struct gl_context * ctx,
  */
 static void *
 intel_bufferobj_map_range(struct gl_context * ctx,
-			  GLenum target, GLintptr offset, GLsizeiptr length,
+			  GLintptr offset, GLsizeiptr length,
 			  GLbitfield access, struct gl_buffer_object *obj)
 {
    struct intel_context *intel = intel_context(ctx);
    struct intel_buffer_object *intel_obj = intel_buffer_object(obj);
-   GLboolean read_only = (access == GL_READ_ONLY_ARB);
 
    assert(intel_obj);
 
@@ -385,6 +328,9 @@ intel_bufferobj_map_range(struct gl_context * ctx,
    obj->AccessFlags = access;
 
    if (intel_obj->sys_buffer) {
+      const bool read_only =
+	 (access & (GL_MAP_READ_BIT | GL_MAP_WRITE_BIT)) == GL_MAP_READ_BIT;
+
       if (!read_only && intel_obj->source)
 	 release_buffer(intel_obj);
 
@@ -468,7 +414,7 @@ intel_bufferobj_map_range(struct gl_context * ctx,
  * would defeat the point.
  */
 static void
-intel_bufferobj_flush_mapped_range(struct gl_context *ctx, GLenum target,
+intel_bufferobj_flush_mapped_range(struct gl_context *ctx,
 				   GLintptr offset, GLsizeiptr length,
 				   struct gl_buffer_object *obj)
 {
@@ -502,8 +448,7 @@ intel_bufferobj_flush_mapped_range(struct gl_context *ctx, GLenum target,
  * Called via glUnmapBuffer().
  */
 static GLboolean
-intel_bufferobj_unmap(struct gl_context * ctx,
-                      GLenum target, struct gl_buffer_object *obj)
+intel_bufferobj_unmap(struct gl_context * ctx, struct gl_buffer_object *obj)
 {
    struct intel_context *intel = intel_context(ctx);
    struct intel_buffer_object *intel_obj = intel_buffer_object(obj);
@@ -758,23 +703,23 @@ intel_bufferobj_copy_subdata(struct gl_context *ctx,
        * not overlap.
        */
       if (src == dst) {
-	 char *ptr = intel_bufferobj_map(ctx, GL_COPY_WRITE_BUFFER,
-					 GL_READ_WRITE, dst);
+	 char *ptr = intel_bufferobj_map_range(ctx, 0, dst->Size,
+					       GL_MAP_READ_BIT, dst);
 	 memmove(ptr + write_offset, ptr + read_offset, size);
-	 intel_bufferobj_unmap(ctx, GL_COPY_WRITE_BUFFER, dst);
+	 intel_bufferobj_unmap(ctx, dst);
       } else {
 	 const char *src_ptr;
 	 char *dst_ptr;
 
-	 src_ptr =  intel_bufferobj_map(ctx, GL_COPY_READ_BUFFER,
-					GL_READ_ONLY, src);
-	 dst_ptr =  intel_bufferobj_map(ctx, GL_COPY_WRITE_BUFFER,
-					GL_WRITE_ONLY, dst);
+	 src_ptr =  intel_bufferobj_map_range(ctx, 0, src->Size,
+					      GL_MAP_READ_BIT, src);
+	 dst_ptr =  intel_bufferobj_map_range(ctx, 0, dst->Size,
+					      GL_MAP_WRITE_BIT, dst);
 
 	 memcpy(dst_ptr + write_offset, src_ptr + read_offset, size);
 
-	 intel_bufferobj_unmap(ctx, GL_COPY_READ_BUFFER, src);
-	 intel_bufferobj_unmap(ctx, GL_COPY_WRITE_BUFFER, dst);
+	 intel_bufferobj_unmap(ctx, src);
+	 intel_bufferobj_unmap(ctx, dst);
       }
       return;
    }
@@ -924,7 +869,6 @@ intelInitBufferObjectFuncs(struct dd_function_table *functions)
    functions->BufferData = intel_bufferobj_data;
    functions->BufferSubData = intel_bufferobj_subdata;
    functions->GetBufferSubData = intel_bufferobj_get_subdata;
-   functions->MapBuffer = intel_bufferobj_map;
    functions->MapBufferRange = intel_bufferobj_map_range;
    functions->FlushMappedBufferRange = intel_bufferobj_flush_mapped_range;
    functions->UnmapBuffer = intel_bufferobj_unmap;
diff --git a/src/mesa/drivers/dri/intel/intel_clear.c b/src/mesa/drivers/dri/intel/intel_clear.c
index dfca03c14bf..76d33f9b37e 100644
--- a/src/mesa/drivers/dri/intel/intel_clear.c
+++ b/src/mesa/drivers/dri/intel/intel_clear.c
@@ -116,13 +116,13 @@ intelClear(struct gl_context *ctx, GLbitfield mask)
    }
 
    /* HW color buffers (front, back, aux, generic FBO, etc) */
-   if (colorMask == ~0) {
+   if (intel->gen < 6 && colorMask == ~0) {
       /* clear all R,G,B,A */
       blit_mask |= (mask & BUFFER_BITS_COLOR);
    }
    else {
       /* glColorMask in effect */
-      tri_mask |= (mask & (BUFFER_BIT_FRONT_LEFT | BUFFER_BIT_BACK_LEFT));
+      tri_mask |= (mask & BUFFER_BITS_COLOR);
    }
 
    /* Make sure we have up to date buffers before we start looking at
@@ -143,6 +143,12 @@ intelClear(struct gl_context *ctx, GLbitfield mask)
 	     */
             tri_mask |= BUFFER_BIT_STENCIL;
          }
+	 else if (intel->has_separate_stencil &&
+	       stencilRegion->tiling == I915_TILING_NONE) {
+	    /* The stencil buffer is actually W tiled, which the hardware
+	     * cannot blit to. */
+	    tri_mask |= BUFFER_BIT_STENCIL;
+	 }
          else {
             /* clearing all stencil bits, use blitting */
             blit_mask |= BUFFER_BIT_STENCIL;
@@ -182,7 +188,10 @@ intelClear(struct gl_context *ctx, GLbitfield mask)
 
    if (tri_mask) {
       debug_mask("tri", tri_mask);
-      _mesa_meta_Clear(&intel->ctx, tri_mask);
+      if (ctx->Extensions.ARB_fragment_shader)
+	 _mesa_meta_glsl_Clear(&intel->ctx, tri_mask);
+      else
+	 _mesa_meta_Clear(&intel->ctx, tri_mask);
    }
 }
 
diff --git a/src/mesa/drivers/dri/intel/intel_context.c b/src/mesa/drivers/dri/intel/intel_context.c
index 2ba13632569..14342ef6246 100644
--- a/src/mesa/drivers/dri/intel/intel_context.c
+++ b/src/mesa/drivers/dri/intel/intel_context.c
@@ -1439,7 +1439,12 @@ intel_verify_dri2_has_hiz(struct intel_context *intel,
       assert(stencil_rb->Base.Format == MESA_FORMAT_S8);
       assert(depth_rb && depth_rb->Base.Format == MESA_FORMAT_X8_Z24);
 
-      if (stencil_rb->region->tiling == I915_TILING_Y) {
+      if (stencil_rb->region->tiling == I915_TILING_NONE) {
+	 /*
+	  * The stencil buffer is actually W tiled. The region's tiling is
+	  * I915_TILING_NONE, however, because the GTT is incapable of W
+	  * fencing.
+	  */
 	 intel->intelScreen->dri2_has_hiz = INTEL_DRI2_HAS_HIZ_TRUE;
 	 return;
       } else {
@@ -1449,6 +1454,13 @@ intel_verify_dri2_has_hiz(struct intel_context *intel,
 	  * a combined depth/stencil buffer. Discard the hiz buffer too.
 	  */
 	 intel->intelScreen->dri2_has_hiz = INTEL_DRI2_HAS_HIZ_FALSE;
+	 if (intel->must_use_separate_stencil) {
+	    _mesa_problem(&intel->ctx,
+			  "intel_context requires separate stencil, but the "
+			  "DRIscreen does not support it. You may need to "
+			  "upgrade the Intel X driver to 2.16.0");
+	    abort();
+	 }
 
 	 /* 1. Discard depth and stencil renderbuffers. */
 	 _mesa_remove_renderbuffer(fb, BUFFER_DEPTH);
@@ -1527,7 +1539,7 @@ intel_verify_dri2_has_hiz(struct intel_context *intel,
        * Presently, however, no verification or clean up is necessary, and
        * execution should not reach here. If the framebuffer still has a hiz
        * region, then we have already set dri2_has_hiz to true after
-       * confirming above that the stencil buffer is Y tiled.
+       * confirming above that the stencil buffer is W tiled.
        */
       assert(0);
    }
diff --git a/src/mesa/drivers/dri/intel/intel_fbo.c b/src/mesa/drivers/dri/intel/intel_fbo.c
index 55bcc757873..754f9f202d1 100644
--- a/src/mesa/drivers/dri/intel/intel_fbo.c
+++ b/src/mesa/drivers/dri/intel/intel_fbo.c
@@ -173,6 +173,9 @@ intel_alloc_renderbuffer_storage(struct gl_context * ctx, struct gl_renderbuffer
 
    if (irb->Base.Format == MESA_FORMAT_S8) {
       /*
+       * The stencil buffer is W tiled. However, we request from the kernel a
+       * non-tiled buffer because the GTT is incapable of W fencing.
+       *
        * The stencil buffer has quirky pitch requirements.  From Vol 2a,
        * 11.5.6.2.1 3DSTATE_STENCIL_BUFFER, field "Surface Pitch":
        *    The pitch must be set to 2x the value computed based on width, as
@@ -180,14 +183,13 @@ intel_alloc_renderbuffer_storage(struct gl_context * ctx, struct gl_renderbuffer
        * To accomplish this, we resort to the nasty hack of doubling the drm
        * region's cpp and halving its height.
        *
-       * If we neglect to double the pitch, then drm_intel_gem_bo_map_gtt()
-       * maps the memory incorrectly.
+       * If we neglect to double the pitch, then render corruption occurs.
        */
       irb->region = intel_region_alloc(intel->intelScreen,
-				       I915_TILING_Y,
+				       I915_TILING_NONE,
 				       cpp * 2,
-				       width,
-				       height / 2,
+				       ALIGN(width, 64),
+				       ALIGN((height + 1) / 2, 64),
 				       GL_TRUE);
       if (!irb->region)
 	return false;
@@ -594,17 +596,15 @@ intel_renderbuffer_set_draw_offset(struct intel_renderbuffer *irb,
 				   struct intel_texture_image *intel_image,
 				   int zoffset)
 {
-   struct intel_mipmap_tree *mt = intel_image->mt;
    unsigned int dst_x, dst_y;
 
    /* compute offset of the particular 2D image within the texture region */
    intel_miptree_get_image_offset(intel_image->mt,
-				  intel_image->level,
-				  intel_image->face,
+				  intel_image->base.Level,
+				  intel_image->base.Face,
 				  zoffset,
 				  &dst_x, &dst_y);
 
-   irb->draw_offset = (dst_y * mt->region->pitch + dst_x) * mt->cpp;
    irb->draw_x = dst_x;
    irb->draw_y = dst_y;
 }
@@ -645,6 +645,22 @@ intel_renderbuffer_tile_offsets(struct intel_renderbuffer *irb,
    }
 }
 
+#ifndef I915
+static bool
+need_tile_offset_workaround(struct brw_context *brw,
+			    struct intel_renderbuffer *irb)
+{
+   uint32_t tile_x, tile_y;
+
+   if (brw->has_surface_tile_offset)
+      return false;
+
+   intel_renderbuffer_tile_offsets(irb, &tile_x, &tile_y);
+
+   return tile_x != 0 || tile_y != 0;
+}
+#endif
+
 /**
  * Called by glFramebufferTexture[123]DEXT() (and other places) to
  * prepare for rendering into texture memory.  This might be called
@@ -698,8 +714,7 @@ intel_render_texture(struct gl_context * ctx,
    intel_image->used_as_render_target = GL_TRUE;
 
 #ifndef I915
-   if (!brw_context(ctx)->has_surface_tile_offset &&
-       (irb->draw_offset & 4095) != 0) {
+   if (need_tile_offset_workaround(brw_context(ctx), irb)) {
       /* Original gen4 hardware couldn't draw to a non-tile-aligned
        * destination in a miptree unless you actually setup your
        * renderbuffer as a miptree and used the fragile
@@ -713,8 +728,8 @@ intel_render_texture(struct gl_context * ctx,
 
       new_mt = intel_miptree_create(intel, image->TexObject->Target,
 				    intel_image->base.TexFormat,
-				    intel_image->level,
-				    intel_image->level,
+				    intel_image->base.Level,
+				    intel_image->base.Level,
 				    intel_image->base.Width,
 				    intel_image->base.Height,
 				    intel_image->base.Depth,
@@ -722,8 +737,8 @@ intel_render_texture(struct gl_context * ctx,
 
       intel_miptree_image_copy(intel,
                                new_mt,
-                               intel_image->face,
-			       intel_image->level,
+			       intel_image->base.Face,
+			       intel_image->base.Level,
 			       old_mt);
 
       intel_miptree_release(intel, &intel_image->mt);
diff --git a/src/mesa/drivers/dri/intel/intel_fbo.h b/src/mesa/drivers/dri/intel/intel_fbo.h
index f7f99a4f00c..2487994fde5 100644
--- a/src/mesa/drivers/dri/intel/intel_fbo.h
+++ b/src/mesa/drivers/dri/intel/intel_fbo.h
@@ -58,7 +58,6 @@ struct intel_renderbuffer
 
    /** \} */
 
-   GLuint draw_offset; /**< Offset of drawing address within the region */
    GLuint draw_x, draw_y; /**< Offset of drawing within the region */
 };
 
diff --git a/src/mesa/drivers/dri/intel/intel_mipmap_tree.c b/src/mesa/drivers/dri/intel/intel_mipmap_tree.c
index 4e711de1ce1..f36240d7f1d 100644
--- a/src/mesa/drivers/dri/intel/intel_mipmap_tree.c
+++ b/src/mesa/drivers/dri/intel/intel_mipmap_tree.c
@@ -227,7 +227,7 @@ intel_miptree_match_image(struct intel_mipmap_tree *mt,
                           struct gl_texture_image *image)
 {
    struct intel_texture_image *intelImage = intel_texture_image(image);
-   GLuint level = intelImage->level;
+   GLuint level = intelImage->base.Level;
 
    /* Images with borders are never pulled into mipmap trees. */
    if (image->Border)
diff --git a/src/mesa/drivers/dri/intel/intel_pixel_bitmap.c b/src/mesa/drivers/dri/intel/intel_pixel_bitmap.c
index 86d0ef2d748..d9873a303ee 100644
--- a/src/mesa/drivers/dri/intel/intel_pixel_bitmap.c
+++ b/src/mesa/drivers/dri/intel/intel_pixel_bitmap.c
@@ -74,9 +74,9 @@ static const GLubyte *map_pbo( struct gl_context *ctx,
       return NULL;
    }
 
-   buf = (GLubyte *) ctx->Driver.MapBuffer(ctx, GL_PIXEL_UNPACK_BUFFER_EXT,
-					   GL_READ_ONLY_ARB,
-					   unpack->BufferObj);
+   buf = (GLubyte *) ctx->Driver.MapBufferRange(ctx, 0, unpack->BufferObj->Size,
+						GL_MAP_READ_BIT,
+						unpack->BufferObj);
    if (!buf) {
       _mesa_error(ctx, GL_INVALID_OPERATION, "glBitmap(PBO is mapped)");
       return NULL;
@@ -292,8 +292,7 @@ out:
 
    if (_mesa_is_bufferobj(unpack->BufferObj)) {
       /* done with PBO so unmap it now */
-      ctx->Driver.UnmapBuffer(ctx, GL_PIXEL_UNPACK_BUFFER_EXT,
-                              unpack->BufferObj);
+      ctx->Driver.UnmapBuffer(ctx, unpack->BufferObj);
    }
 
    intel_check_front_buffer_rendering(intel);
diff --git a/src/mesa/drivers/dri/intel/intel_reg.h b/src/mesa/drivers/dri/intel/intel_reg.h
index 5aa629150cf..a98a669af21 100644
--- a/src/mesa/drivers/dri/intel/intel_reg.h
+++ b/src/mesa/drivers/dri/intel/intel_reg.h
@@ -75,6 +75,7 @@
 #define PIPE_CONTROL_VF_CACHE_INVALIDATE	(1 << 4)
 #define PIPE_CONTROL_CONST_CACHE_INVALIDATE	(1 << 3)
 #define PIPE_CONTROL_STATE_CACHE_INVALIDATE	(1 << 2)
+#define PIPE_CONTROL_STALL_AT_SCOREBOARD	(1 << 1)
 #define PIPE_CONTROL_DEPTH_CACHE_FLUSH		(1 << 0)
 #define PIPE_CONTROL_PPGTT_WRITE	(0 << 2)
 #define PIPE_CONTROL_GLOBAL_GTT_WRITE	(1 << 2)
diff --git a/src/mesa/drivers/dri/intel/intel_screen.h b/src/mesa/drivers/dri/intel/intel_screen.h
index b2013af1a29..9dd6a525566 100644
--- a/src/mesa/drivers/dri/intel/intel_screen.h
+++ b/src/mesa/drivers/dri/intel/intel_screen.h
@@ -63,9 +63,12 @@
  * x8_z24 and s8).
  *
  * Eventually, intel_update_renderbuffers() makes a DRI2 request for
- * DRI2BufferStencil and DRI2BufferHiz. If the returned buffers are Y tiled,
- * then we joyfully set intel_screen.dri2_has_hiz to true and continue as if
- * nothing happend.
+ * DRI2BufferStencil and DRI2BufferHiz. If the stencil buffer's tiling is
+ * I915_TILING_NONE [1], then we joyfully set intel_screen.dri2_has_hiz to
+ * true and continue as if nothing happend.
+ *
+ * [1] The stencil buffer is actually W tiled. However, we request from the
+ *     kernel a non-tiled buffer because the GTT is incapable of W fencing.
  *
  * If the buffers are X tiled, however, the handshake has failed and we must
  * clean up.
diff --git a/src/mesa/drivers/dri/intel/intel_span.c b/src/mesa/drivers/dri/intel/intel_span.c
index 153803fba09..2e1c80c4766 100644
--- a/src/mesa/drivers/dri/intel/intel_span.c
+++ b/src/mesa/drivers/dri/intel/intel_span.c
@@ -131,38 +131,84 @@ intel_set_span_functions(struct intel_context *intel,
    int miny = 0;							\
    int maxx = rb->Width;						\
    int maxy = rb->Height;						\
-   int stride = rb->RowStride;						\
-   uint8_t *buf = rb->Data;						\
+									\
+   /*									\
+    * Here we ignore rb->Data and rb->RowStride as set by		\
+    * intelSpanRenderStart. Since intel_offset_S8 decodes the W tile	\
+    * manually, the region's *real* base address and stride is		\
+    * required.								\
+    */									\
+   struct intel_renderbuffer *irb = intel_renderbuffer(rb);		\
+   uint8_t *buf = irb->region->buffer->virtual;				\
+   unsigned stride = irb->region->pitch;				\
+   unsigned height = 2 * irb->region->height;				\
+   bool flip = rb->Name == 0;						\
+   int y_scale = flip ? -1 : 1;						\
+   int y_bias = flip ? (height - 1) : 0;				\
 
-/* Don't flip y. */
 #undef Y_FLIP
-#define Y_FLIP(y) y
+#define Y_FLIP(y) (y_scale * (y) + y_bias)
 
 /**
  * \brief Get pointer offset into stencil buffer.
  *
- * The stencil buffer interleaves two rows into one. Yay for crazy hardware.
- * The table below demonstrates how the pointer arithmetic behaves for a buffer
- * with positive stride (s=stride).
- *
- *     x    | y     | byte offset
- *     --------------------------
- *     0    | 0     | 0
- *     0    | 1     | 1
- *     1    | 0     | 2
- *     1    | 1     | 3
- *     ...  | ...   | ...
- *     0    | 2     | s
- *     0    | 3     | s + 1
- *     1    | 2     | s + 2
- *     1    | 3     | s + 3
+ * The stencil buffer is W tiled. Since the GTT is incapable of W fencing, we
+ * must decode the tile's layout in software.
  *
+ * See
+ *   - PRM, 2011 Sandy Bridge, Volume 1, Part 2, Section 4.5.2.1 W-Major Tile
+ *     Format.
+ *   - PRM, 2011 Sandy Bridge, Volume 1, Part 2, Section 4.5.3 Tiling Algorithm
  *
+ * Even though the returned offset is always positive, the return type is
+ * signed due to
+ *    commit e8b1c6d6f55f5be3bef25084fdd8b6127517e137
+ *    mesa: Fix return type of  _mesa_get_format_bytes() (#37351)
  */
 static inline intptr_t
-intel_offset_S8(int stride, GLint x, GLint y)
+intel_offset_S8(uint32_t stride, uint32_t x, uint32_t y)
 {
-   return 2 * ((y / 2) * stride + x) + y % 2;
+   uint32_t tile_size = 4096;
+   uint32_t tile_width = 64;
+   uint32_t tile_height = 64;
+   uint32_t row_size = 64 * stride;
+
+   uint32_t tile_x = x / tile_width;
+   uint32_t tile_y = y / tile_height;
+
+   /* The byte's address relative to the tile's base addres. */
+   uint32_t byte_x = x % tile_width;
+   uint32_t byte_y = y % tile_height;
+
+   uintptr_t u = tile_y * row_size
+               + tile_x * tile_size
+               + 512 * (byte_x / 8)
+               +  64 * (byte_y / 8)
+               +  32 * ((byte_y / 4) % 2)
+               +  16 * ((byte_x / 4) % 2)
+               +   8 * ((byte_y / 2) % 2)
+               +   4 * ((byte_x / 2) % 2)
+               +   2 * (byte_y % 2)
+               +   1 * (byte_x % 2);
+
+   /*
+    * Errata for Gen5:
+    *
+    * An additional offset is needed which is not documented in the PRM.
+    *
+    * if ((byte_x / 8) % 2 == 1) {
+    *    if ((byte_y / 8) % 2) == 0) {
+    *       u += 64;
+    *    } else {
+    *       u -= 64;
+    *    }
+    * }
+    *
+    * The offset is expressed more tersely as
+    * u += ((int) x & 0x8) * (8 - (((int) y & 0x8) << 1));
+    */
+
+   return u;
 }
 
 #define WRITE_STENCIL(x, y, src)  buf[intel_offset_S8(stride, x, y)] = src;
diff --git a/src/mesa/drivers/dri/intel/intel_tex.c b/src/mesa/drivers/dri/intel/intel_tex.c
index 21c4a1dddba..ee0cd252375 100644
--- a/src/mesa/drivers/dri/intel/intel_tex.c
+++ b/src/mesa/drivers/dri/intel/intel_tex.c
@@ -95,17 +95,12 @@ intelGenerateMipmap(struct gl_context *ctx, GLenum target,
       if (!_mesa_is_format_compressed(first_image->TexFormat)) {
          GLuint nr_faces = (texObj->Target == GL_TEXTURE_CUBE_MAP) ? 6 : 1;
          GLuint face, i;
-         /* Update the level information in our private data in the new images,
-          * since it didn't get set as part of a normal TexImage path.
-          */
          for (face = 0; face < nr_faces; face++) {
             for (i = texObj->BaseLevel + 1; i < texObj->MaxLevel; i++) {
                struct intel_texture_image *intelImage =
                   intel_texture_image(texObj->Image[face][i]);
                if (!intelImage)
                   break;
-               intelImage->level = i;
-               intelImage->face = face;
                /* Unreference the miptree to signal that the new Data is a
                 * bare pointer from mesa.
                 */
diff --git a/src/mesa/drivers/dri/intel/intel_tex_copy.c b/src/mesa/drivers/dri/intel/intel_tex_copy.c
index 1a3643da593..600bd1251e0 100644
--- a/src/mesa/drivers/dri/intel/intel_tex_copy.c
+++ b/src/mesa/drivers/dri/intel/intel_tex_copy.c
@@ -118,8 +118,8 @@ intel_copy_texsubimage(struct intel_context *intel,
 
       /* get dest x/y in destination texture */
       intel_miptree_get_image_offset(intelImage->mt,
-				     intelImage->level,
-				     intelImage->face,
+				     intelImage->base.Level,
+				     intelImage->base.Face,
 				     0,
 				     &image_x, &image_y);
 
@@ -164,101 +164,6 @@ intel_copy_texsubimage(struct intel_context *intel,
 
 
 static void
-intelCopyTexImage1D(struct gl_context * ctx, GLenum target, GLint level,
-                    GLenum internalFormat,
-                    GLint x, GLint y, GLsizei width, GLint border)
-{
-   struct gl_texture_unit *texUnit = _mesa_get_current_tex_unit(ctx);
-   struct gl_texture_object *texObj =
-      _mesa_select_tex_object(ctx, texUnit, target);
-   struct gl_texture_image *texImage =
-      _mesa_select_tex_image(ctx, texObj, target, level);
-   int srcx, srcy, dstx, dsty, height;
-
-   if (border)
-      goto fail;
-
-   /* Setup or redefine the texture object, mipmap tree and texture
-    * image.  Don't populate yet.  
-    */
-   ctx->Driver.TexImage1D(ctx, target, level, internalFormat,
-                          width, border,
-                          GL_RGBA, CHAN_TYPE, NULL,
-                          &ctx->DefaultPacking, texObj, texImage);
-   srcx = x;
-   srcy = y;
-   dstx = 0;
-   dsty = 0;
-   height = 1;
-   if (!_mesa_clip_copytexsubimage(ctx,
-				   &dstx, &dsty,
-				   &srcx, &srcy,
-				   &width, &height))
-      return;
-
-   if (!intel_copy_texsubimage(intel_context(ctx), target,
-                               intel_texture_image(texImage),
-                               internalFormat, 0, 0, x, y, width, height))
-      goto fail;
-
-   return;
-
- fail:
-   fallback_debug("%s - fallback to swrast\n", __FUNCTION__);
-   _mesa_meta_CopyTexImage1D(ctx, target, level, internalFormat, x, y,
-                             width, border);
-}
-
-
-static void
-intelCopyTexImage2D(struct gl_context * ctx, GLenum target, GLint level,
-                    GLenum internalFormat,
-                    GLint x, GLint y, GLsizei width, GLsizei height,
-                    GLint border)
-{
-   struct gl_texture_unit *texUnit = _mesa_get_current_tex_unit(ctx);
-   struct gl_texture_object *texObj =
-      _mesa_select_tex_object(ctx, texUnit, target);
-   struct gl_texture_image *texImage =
-      _mesa_select_tex_image(ctx, texObj, target, level);
-   int srcx, srcy, dstx, dsty;
-
-   if (border)
-      goto fail;
-
-   /* Setup or redefine the texture object, mipmap tree and texture
-    * image.  Don't populate yet.
-    */
-   ctx->Driver.TexImage2D(ctx, target, level, internalFormat,
-                          width, height, border,
-                          GL_RGBA, GL_UNSIGNED_BYTE, NULL,
-                          &ctx->DefaultPacking, texObj, texImage);
-
-   srcx = x;
-   srcy = y;
-   dstx = 0;
-   dsty = 0;
-   if (!_mesa_clip_copytexsubimage(ctx,
-				   &dstx, &dsty,
-				   &srcx, &srcy,
-				   &width, &height))
-      return;
-
-   if (!intel_copy_texsubimage(intel_context(ctx), target,
-                               intel_texture_image(texImage),
-                               internalFormat, 0, 0, x, y, width, height))
-      goto fail;
-
-   return;
-
- fail:
-   fallback_debug("%s - fallback to swrast\n", __FUNCTION__);
-   _mesa_meta_CopyTexImage2D(ctx, target, level, internalFormat, x, y,
-                             width, height, border);
-}
-
-
-static void
 intelCopyTexSubImage1D(struct gl_context * ctx, GLenum target, GLint level,
                        GLint xoffset, GLint x, GLint y, GLsizei width)
 {
@@ -312,8 +217,6 @@ intelCopyTexSubImage2D(struct gl_context * ctx, GLenum target, GLint level,
 void
 intelInitTextureCopyImageFuncs(struct dd_function_table *functions)
 {
-   functions->CopyTexImage1D = intelCopyTexImage1D;
-   functions->CopyTexImage2D = intelCopyTexImage2D;
    functions->CopyTexSubImage1D = intelCopyTexSubImage1D;
    functions->CopyTexSubImage2D = intelCopyTexSubImage2D;
 }
diff --git a/src/mesa/drivers/dri/intel/intel_tex_image.c b/src/mesa/drivers/dri/intel/intel_tex_image.c
index 1f8b885bbec..4ee66847255 100644
--- a/src/mesa/drivers/dri/intel/intel_tex_image.c
+++ b/src/mesa/drivers/dri/intel/intel_tex_image.c
@@ -63,7 +63,7 @@ intel_miptree_create_for_teximage(struct intel_context *intel,
    if (intelImage->base.Border)
       return NULL;
 
-   if (intelImage->level > intelObj->base.BaseLevel &&
+   if (intelImage->base.Level > intelObj->base.BaseLevel &&
        (intelImage->base.Width == 1 ||
         (intelObj->base.Target != GL_TEXTURE_1D &&
          intelImage->base.Height == 1) ||
@@ -74,19 +74,19 @@ intel_miptree_create_for_teximage(struct intel_context *intel,
        * likely base level width/height/depth for a full mipmap stack
        * from this info, so just allocate this one level.
        */
-      firstLevel = intelImage->level;
-      lastLevel = intelImage->level;
+      firstLevel = intelImage->base.Level;
+      lastLevel = intelImage->base.Level;
    } else {
       /* If this image disrespects BaseLevel, allocate from level zero.
        * Usually BaseLevel == 0, so it's unlikely to happen.
        */
-      if (intelImage->level < intelObj->base.BaseLevel)
+      if (intelImage->base.Level < intelObj->base.BaseLevel)
 	 firstLevel = 0;
       else
 	 firstLevel = intelObj->base.BaseLevel;
 
       /* Figure out image dimensions at start level. */
-      for (i = intelImage->level; i > firstLevel; i--) {
+      for (i = intelImage->base.Level; i > firstLevel; i--) {
 	 width <<= 1;
 	 if (height != 1)
 	    height <<= 1;
@@ -101,7 +101,7 @@ intel_miptree_create_for_teximage(struct intel_context *intel,
        */
       if ((intelObj->base.Sampler.MinFilter == GL_NEAREST ||
 	   intelObj->base.Sampler.MinFilter == GL_LINEAR) &&
-	  intelImage->level == firstLevel &&
+	  intelImage->base.Level == firstLevel &&
 	  (intel->gen < 4 || firstLevel == 0)) {
 	 lastLevel = firstLevel;
       } else {
@@ -186,8 +186,8 @@ try_pbo_upload(struct intel_context *intel,
    else
       src_stride = width;
 
-   intel_miptree_get_image_offset(intelImage->mt, intelImage->level,
-				  intelImage->face, 0,
+   intel_miptree_get_image_offset(intelImage->mt, intelImage->base.Level,
+				  intelImage->base.Face, 0,
 				  &dst_x, &dst_y);
 
    dst_stride = intelImage->mt->region->pitch;
@@ -243,8 +243,8 @@ try_pbo_zcopy(struct intel_context *intel,
    else
       src_stride = width;
 
-   intel_miptree_get_image_offset(intelImage->mt, intelImage->level,
-				  intelImage->face, 0,
+   intel_miptree_get_image_offset(intelImage->mt, intelImage->base.Level,
+				  intelImage->base.Face, 0,
 				  &dst_x, &dst_y);
 
    dst_stride = intelImage->mt->region->pitch;
@@ -407,9 +407,6 @@ intelTexImage(struct gl_context * ctx,
    DBG("%s target %s level %d %dx%dx%d border %d\n", __FUNCTION__,
        _mesa_lookup_enum_by_nr(target), level, width, height, depth, border);
 
-   intelImage->face = _mesa_tex_target_to_face(target);
-   intelImage->level = level;
-
    if (_mesa_is_format_compressed(texImage->TexFormat)) {
       texelBytes = 0;
    }
@@ -514,8 +511,8 @@ intelTexImage(struct gl_context * ctx,
 	 }
          texImage->Data = intel_miptree_image_map(intel,
                                                   intelImage->mt,
-                                                  intelImage->face,
-                                                  intelImage->level,
+                                                  intelImage->base.Face,
+                                                  intelImage->base.Level,
                                                   &dstRowStride,
                                                   intelImage->base.ImageOffsets);
       }
@@ -684,8 +681,8 @@ intel_get_tex_image(struct gl_context * ctx, GLenum target, GLint level,
       intelImage->base.Data =
          intel_miptree_image_map(intel,
                                  intelImage->mt,
-                                 intelImage->face,
-                                 intelImage->level,
+                                 intelImage->base.Face,
+                                 intelImage->base.Level,
                                  &intelImage->base.RowStride,
                                  intelImage->base.ImageOffsets);
       intelImage->base.RowStride /= intelImage->mt->cpp;
@@ -816,8 +813,6 @@ intelSetTexBuffer2(__DRIcontext *pDRICtx, GLint target,
 			      rb->region->width, rb->region->height, 1,
 			      0, internalFormat, texFormat);
 
-   intelImage->face = _mesa_tex_target_to_face(target);
-   intelImage->level = level;
    texImage->RowStride = rb->region->pitch;
    intel_miptree_reference(&intelImage->mt, intelObj->mt);
 
@@ -874,8 +869,6 @@ intel_image_target_texture_2d(struct gl_context *ctx, GLenum target,
 			      image->region->width, image->region->height, 1,
 			      0, image->internal_format, image->format);
 
-   intelImage->face = _mesa_tex_target_to_face(target);
-   intelImage->level = 0;
    texImage->RowStride = image->region->pitch;
    intel_miptree_reference(&intelImage->mt, intelObj->mt);
 
diff --git a/src/mesa/drivers/dri/intel/intel_tex_obj.h b/src/mesa/drivers/dri/intel/intel_tex_obj.h
index a9ae2ec5429..e7a4318b8d8 100644
--- a/src/mesa/drivers/dri/intel/intel_tex_obj.h
+++ b/src/mesa/drivers/dri/intel/intel_tex_obj.h
@@ -52,11 +52,6 @@ struct intel_texture_image
 {
    struct gl_texture_image base;
 
-   /* These aren't stored in gl_texture_image 
-    */
-   GLuint level;
-   GLuint face;
-
    /* If intelImage->mt != NULL, image data is stored here.
     * Else if intelImage->base.Data != NULL, image is stored there.
     * Else there is no image data.
diff --git a/src/mesa/drivers/dri/intel/intel_tex_subimage.c b/src/mesa/drivers/dri/intel/intel_tex_subimage.c
index 8b43c406cf9..5fd2cc36234 100644
--- a/src/mesa/drivers/dri/intel/intel_tex_subimage.c
+++ b/src/mesa/drivers/dri/intel/intel_tex_subimage.c
@@ -113,7 +113,7 @@ intelTexSubimage(struct gl_context * ctx,
 	 dstRowStride = pitch;
 
 	 intel_miptree_get_image_offset(intelImage->mt, level,
-					intelImage->face, 0,
+					intelImage->base.Face, 0,
 					&blit_x, &blit_y);
 	 blit_x += xoffset;
 	 blit_y += yoffset;
@@ -122,8 +122,8 @@ intelTexSubimage(struct gl_context * ctx,
       } else {
 	 texImage->Data = intel_miptree_image_map(intel,
 						  intelImage->mt,
-						  intelImage->face,
-						  intelImage->level,
+						  intelImage->base.Face,
+						  intelImage->base.Level,
 						  &dstRowStride,
 						  texImage->ImageOffsets);
       }
diff --git a/src/mesa/drivers/dri/intel/intel_tex_validate.c b/src/mesa/drivers/dri/intel/intel_tex_validate.c
index 7135a6276fe..31ac689ad77 100644
--- a/src/mesa/drivers/dri/intel/intel_tex_validate.c
+++ b/src/mesa/drivers/dri/intel/intel_tex_validate.c
@@ -42,8 +42,8 @@ copy_image_data_to_tree(struct intel_context *intel,
        */
       intel_miptree_image_copy(intel,
                                intelObj->mt,
-                               intelImage->face,
-                               intelImage->level, intelImage->mt);
+                               intelImage->base.Face,
+                               intelImage->base.Level, intelImage->mt);
 
       intel_miptree_release(intel, &intelImage->mt);
    }
@@ -54,8 +54,8 @@ copy_image_data_to_tree(struct intel_context *intel,
        */
       intel_miptree_image_data(intel,
                                intelObj->mt,
-                               intelImage->face,
-                               intelImage->level,
+                               intelImage->base.Face,
+                               intelImage->base.Level,
                                intelImage->base.Data,
                                intelImage->base.RowStride,
                                intelImage->base.RowStride *
@@ -177,8 +177,8 @@ intel_tex_map_level_images(struct intel_context *intel,
 	 intelImage->base.Data =
 	    intel_miptree_image_map(intel,
 				    intelImage->mt,
-				    intelImage->face,
-				    intelImage->level,
+				    intelImage->base.Face,
+				    intelImage->base.Level,
 				    &intelImage->base.RowStride,
 				    intelImage->base.ImageOffsets);
 	 /* convert stride to texels, not bytes */
diff --git a/src/mesa/drivers/dri/nouveau/nouveau_bufferobj.c b/src/mesa/drivers/dri/nouveau/nouveau_bufferobj.c
index e60b91f64be..433590c4181 100644
--- a/src/mesa/drivers/dri/nouveau/nouveau_bufferobj.c
+++ b/src/mesa/drivers/dri/nouveau/nouveau_bufferobj.c
@@ -107,7 +107,7 @@ nouveau_bufferobj_data(struct gl_context *ctx, GLenum target, GLsizeiptrARB size
 }
 
 static void
-nouveau_bufferobj_subdata(struct gl_context *ctx, GLenum target, GLintptrARB offset,
+nouveau_bufferobj_subdata(struct gl_context *ctx, GLintptrARB offset,
 			  GLsizeiptrARB size, const GLvoid *data,
 			  struct gl_buffer_object *obj)
 {
@@ -115,7 +115,7 @@ nouveau_bufferobj_subdata(struct gl_context *ctx, GLenum target, GLintptrARB off
 }
 
 static void
-nouveau_bufferobj_get_subdata(struct gl_context *ctx, GLenum target, GLintptrARB offset,
+nouveau_bufferobj_get_subdata(struct gl_context *ctx, GLintptrARB offset,
 			   GLsizeiptrARB size, GLvoid *data,
 			   struct gl_buffer_object *obj)
 {
@@ -123,23 +123,6 @@ nouveau_bufferobj_get_subdata(struct gl_context *ctx, GLenum target, GLintptrARB
 }
 
 static void *
-nouveau_bufferobj_map(struct gl_context *ctx, GLenum target, GLenum access,
-		   struct gl_buffer_object *obj)
-{
-	unsigned flags = 0;
-
-	if (access == GL_READ_ONLY_ARB ||
-	    access == GL_READ_WRITE_ARB)
-		flags |= GL_MAP_READ_BIT;
-	if (access == GL_WRITE_ONLY_ARB ||
-	    access == GL_READ_WRITE_ARB)
-		flags |= GL_MAP_WRITE_BIT;
-
-	return ctx->Driver.MapBufferRange(ctx, target, 0, obj->Size, flags,
-					  obj);
-}
-
-static void *
 nouveau_bufferobj_map_range(struct gl_context *ctx, GLenum target, GLintptr offset,
 			    GLsizeiptr length, GLbitfield access,
 			    struct gl_buffer_object *obj)
@@ -169,7 +152,7 @@ nouveau_bufferobj_map_range(struct gl_context *ctx, GLenum target, GLintptr offs
 }
 
 static GLboolean
-nouveau_bufferobj_unmap(struct gl_context *ctx, GLenum target, struct gl_buffer_object *obj)
+nouveau_bufferobj_unmap(struct gl_context *ctx, struct gl_buffer_object *obj)
 {
 	assert(obj->Pointer);
 
@@ -189,7 +172,6 @@ nouveau_bufferobj_functions_init(struct dd_function_table *functions)
 	functions->BufferData = nouveau_bufferobj_data;
 	functions->BufferSubData = nouveau_bufferobj_subdata;
 	functions->GetBufferSubData = nouveau_bufferobj_get_subdata;
-	functions->MapBuffer = nouveau_bufferobj_map;
 	functions->MapBufferRange = nouveau_bufferobj_map_range;
 	functions->UnmapBuffer = nouveau_bufferobj_unmap;
 }
diff --git a/src/mesa/drivers/dri/r200/r200_ioctl.c b/src/mesa/drivers/dri/r200/r200_ioctl.c
index 02201cb53d6..44a794da396 100644
--- a/src/mesa/drivers/dri/r200/r200_ioctl.c
+++ b/src/mesa/drivers/dri/r200/r200_ioctl.c
@@ -185,7 +185,6 @@ static void r200Clear( struct gl_context *ctx, GLbitfield mask )
    r200ContextPtr rmesa = R200_CONTEXT(ctx);
    __DRIdrawable *dPriv = radeon_get_drawable(&rmesa->radeon);
    GLuint flags = 0;
-   GLuint color_mask = 0;
    GLuint orig_mask = mask;
 
    if ( R200_DEBUG & RADEON_IOCTL ) {
@@ -206,13 +205,11 @@ static void r200Clear( struct gl_context *ctx, GLbitfield mask )
 
    if ( mask & BUFFER_BIT_FRONT_LEFT ) {
       flags |= RADEON_FRONT;
-      color_mask = rmesa->hw.msk.cmd[MSK_RB3D_PLANEMASK];
       mask &= ~BUFFER_BIT_FRONT_LEFT;
    }
 
    if ( mask & BUFFER_BIT_BACK_LEFT ) {
       flags |= RADEON_BACK;
-      color_mask = rmesa->hw.msk.cmd[MSK_RB3D_PLANEMASK];
       mask &= ~BUFFER_BIT_BACK_LEFT;
    }
 
diff --git a/src/mesa/drivers/dri/r200/r200_tex.c b/src/mesa/drivers/dri/r200/r200_tex.c
index d42e8f12041..91e77f9f7da 100644
--- a/src/mesa/drivers/dri/r200/r200_tex.c
+++ b/src/mesa/drivers/dri/r200/r200_tex.c
@@ -527,7 +527,6 @@ void r200InitTextureFuncs( radeonContextPtr radeon, struct dd_function_table *fu
    functions->CompressedTexSubImage2D	= radeonCompressedTexSubImage2D;
 
    if (radeon->radeonScreen->kernel_mm) {
-      functions->CopyTexImage2D = radeonCopyTexImage2D;
       functions->CopyTexSubImage2D = radeonCopyTexSubImage2D;
    }
 
diff --git a/src/mesa/drivers/dri/r200/r200_texstate.c b/src/mesa/drivers/dri/r200/r200_texstate.c
index 7adf9ad73ed..8c9bd6d00b2 100644
--- a/src/mesa/drivers/dri/r200/r200_texstate.c
+++ b/src/mesa/drivers/dri/r200/r200_texstate.c
@@ -773,18 +773,12 @@ void r200SetTexBuffer2(__DRIcontext *pDRICtx, GLint target, GLint texture_format
 	struct radeon_renderbuffer *rb;
 	radeon_texture_image *rImage;
 	radeonContextPtr radeon;
-	r200ContextPtr rmesa;
 	struct radeon_framebuffer *rfb;
 	radeonTexObjPtr t;
 	uint32_t pitch_val;
-	uint32_t internalFormat, format;
 	gl_format texFormat;
 
-	format = GL_UNSIGNED_BYTE;
-	internalFormat = (texture_format == __DRI_TEXTURE_FORMAT_RGB ? 3 : 4);
-
 	radeon = pDRICtx->driverPrivate;
-	rmesa = pDRICtx->driverPrivate;
 
 	rfb = dPriv->driverPrivate;
         texUnit = &radeon->glCtx->Texture.Unit[radeon->glCtx->Texture.CurrentUnit];
diff --git a/src/mesa/drivers/dri/r200/r200_vertprog.c b/src/mesa/drivers/dri/r200/r200_vertprog.c
index 63e03b0e0c7..cf44d7f459c 100644
--- a/src/mesa/drivers/dri/r200/r200_vertprog.c
+++ b/src/mesa/drivers/dri/r200/r200_vertprog.c
@@ -126,10 +126,10 @@ static GLboolean r200VertexProgUpdateParams(struct gl_context *ctx, struct r200_
       case PROGRAM_NAMED_PARAM:
       //fprintf(stderr, "%s", vp->Parameters->Parameters[pi].Name);
       case PROGRAM_CONSTANT:
-	 *fcmd++ = paramList->ParameterValues[pi][0];
-	 *fcmd++ = paramList->ParameterValues[pi][1];
-	 *fcmd++ = paramList->ParameterValues[pi][2];
-	 *fcmd++ = paramList->ParameterValues[pi][3];
+	 *fcmd++ = paramList->ParameterValues[pi][0].f;
+	 *fcmd++ = paramList->ParameterValues[pi][1].f;
+	 *fcmd++ = paramList->ParameterValues[pi][2].f;
+	 *fcmd++ = paramList->ParameterValues[pi][3].f;
 	 break;
       default:
 	 _mesa_problem(NULL, "Bad param type in %s", __FUNCTION__);
diff --git a/src/mesa/drivers/dri/r300/compiler/radeon_optimize.c b/src/mesa/drivers/dri/r300/compiler/radeon_optimize.c
index b24274259f4..39dcb21d4f4 100644
--- a/src/mesa/drivers/dri/r300/compiler/radeon_optimize.c
+++ b/src/mesa/drivers/dri/r300/compiler/radeon_optimize.c
@@ -561,28 +561,29 @@ static int peephole_add_presub_add(
 	struct rc_instruction * inst_add)
 {
 	unsigned dstmask = inst_add->U.I.DstReg.WriteMask;
-	struct rc_src_register * src1 = NULL;
-	unsigned int i;
-
-	if (!is_presub_candidate(c, inst_add))
-		return 0;
+        unsigned src0_neg = inst_add->U.I.SrcReg[0].Negate & dstmask;
+        unsigned src1_neg = inst_add->U.I.SrcReg[1].Negate & dstmask;
 
 	if (inst_add->U.I.SrcReg[0].Swizzle != inst_add->U.I.SrcReg[1].Swizzle)
 		return 0;
 
-	/* XXX This isn't fully implemented, is it? */
-	/*   src0 and src1 can't have absolute values only one can be negative and they must be all negative or all positive. */
-	for (i = 0; i < 2; i++) {
-		if (inst_add->U.I.SrcReg[i].Abs)
-			return 0;
+	/* src0 and src1 can't have absolute values */
+	if (inst_add->U.I.SrcReg[0].Abs || inst_add->U.I.SrcReg[1].Abs)
+	        return 0;
 
-		/* XXX This looks weird, but it's basically what was here before this commit (see git blame): */
-		if ((inst_add->U.I.SrcReg[i].Negate & dstmask) != dstmask && !src1) {
-			src1 = &inst_add->U.I.SrcReg[i];
-		}
-	}
+	/* presub_replace_add() assumes only one is negative */
+	if (inst_add->U.I.SrcReg[0].Negate && inst_add->U.I.SrcReg[1].Negate)
+	        return 0;
+
+        /* if src0 is negative, at least all bits of dstmask have to be set */
+        if (inst_add->U.I.SrcReg[0].Negate && src0_neg != dstmask)
+	        return 0;
 
-	if (!src1)
+        /* if src1 is negative, at least all bits of dstmask have to be set */
+        if (inst_add->U.I.SrcReg[1].Negate && src1_neg != dstmask)
+	        return 0;
+
+	if (!is_presub_candidate(c, inst_add))
 		return 0;
 
 	if (presub_helper(c, inst_add, RC_PRESUB_ADD, presub_replace_add)) {
@@ -615,7 +616,7 @@ static void presub_replace_inv(
  * of the add instruction must have the constatnt 1 swizzle.  This function
  * does not check const registers to see if their value is 1.0, so it should
  * be called after the constant_folding optimization.
- * @return 
+ * @return
  * 	0 if the ADD instruction is still part of the program.
  * 	1 if the ADD instruction is no longer part of the program.
  */
diff --git a/src/mesa/drivers/dri/r300/r300_draw.c b/src/mesa/drivers/dri/r300/r300_draw.c
index 0c4d8537c61..5587c16dd44 100644
--- a/src/mesa/drivers/dri/r300/r300_draw.c
+++ b/src/mesa/drivers/dri/r300/r300_draw.c
@@ -84,7 +84,8 @@ static void r300FixupIndexBuffer(struct gl_context *ctx, const struct _mesa_inde
 	GLboolean mapped_named_bo = GL_FALSE;
 
 	if (mesa_ind_buf->obj->Name && !mesa_ind_buf->obj->Pointer) {
-		ctx->Driver.MapBuffer(ctx, GL_ELEMENT_ARRAY_BUFFER, GL_READ_ONLY_ARB, mesa_ind_buf->obj);
+		ctx->Driver.MapBufferRange(ctx, 0, mesa_ind_buf->obj->Size,
+					   GL_MAP_READ_BIT, mesa_ind_buf->obj);
 		mapped_named_bo = GL_TRUE;
 		assert(mesa_ind_buf->obj->Pointer != NULL);
 	}
@@ -138,7 +139,7 @@ static void r300FixupIndexBuffer(struct gl_context *ctx, const struct _mesa_inde
 	r300->ind_buf.count = mesa_ind_buf->count;
 
 	if (mapped_named_bo) {
-		ctx->Driver.UnmapBuffer(ctx, GL_ELEMENT_ARRAY_BUFFER, mesa_ind_buf->obj);
+		ctx->Driver.UnmapBuffer(ctx, mesa_ind_buf->obj);
 	}
 }
 
@@ -163,7 +164,10 @@ static void r300SetupIndexBuffer(struct gl_context *ctx, const struct _mesa_inde
 		GLboolean mapped_named_bo = GL_FALSE;
 
 		if (mesa_ind_buf->obj->Name && !mesa_ind_buf->obj->Pointer) {
-			ctx->Driver.MapBuffer(ctx, GL_ELEMENT_ARRAY_BUFFER, GL_READ_ONLY_ARB, mesa_ind_buf->obj);
+			ctx->Driver.MapBufferRange(ctx, 0,
+						   mesa_ind_buf->obj->Size,
+						   GL_MAP_READ_BIT,
+						   mesa_ind_buf->obj);
 			assert(mesa_ind_buf->obj->Pointer != NULL);
 			mapped_named_bo = GL_TRUE;
 		}
@@ -184,7 +188,7 @@ static void r300SetupIndexBuffer(struct gl_context *ctx, const struct _mesa_inde
 		r300->ind_buf.count = mesa_ind_buf->count;
 
 		if (mapped_named_bo) {
-			ctx->Driver.UnmapBuffer(ctx, GL_ELEMENT_ARRAY_BUFFER, mesa_ind_buf->obj);
+			ctx->Driver.UnmapBuffer(ctx, mesa_ind_buf->obj);
 		}
 	} else {
 		r300FixupIndexBuffer(ctx, mesa_ind_buf);
@@ -235,7 +239,8 @@ static void r300ConvertAttrib(struct gl_context *ctx, int count, const struct gl
 
 	if (input->BufferObj->Name) {
 		if (!input->BufferObj->Pointer) {
-			ctx->Driver.MapBuffer(ctx, GL_ARRAY_BUFFER, GL_READ_ONLY_ARB, input->BufferObj);
+			ctx->Driver.MapBufferRange(ctx, 0, input->BufferObj->Size,
+					      GL_MAP_READ_BIT, input->BufferObj);
 			mapped_named_bo = GL_TRUE;
 		}
 
@@ -286,7 +291,7 @@ static void r300ConvertAttrib(struct gl_context *ctx, int count, const struct gl
 
 	radeon_bo_unmap(attr->bo);
 	if (mapped_named_bo) {
-		ctx->Driver.UnmapBuffer(ctx, GL_ARRAY_BUFFER, input->BufferObj);
+		ctx->Driver.UnmapBuffer(ctx, input->BufferObj);
 	}
 }
 
@@ -302,7 +307,8 @@ static void r300AlignDataToDword(struct gl_context *ctx, const struct gl_client_
 	radeon_bo_map(attr->bo, 1);
 
 	if (!input->BufferObj->Pointer) {
-		ctx->Driver.MapBuffer(ctx, GL_ARRAY_BUFFER, GL_READ_ONLY_ARB, input->BufferObj);
+		ctx->Driver.MapBufferRange(ctx, 0, input->BufferObj->Size,
+					   GL_MAP_READ_BIT, input->BufferObj);
 		mapped_named_bo = GL_TRUE;
 	}
 
@@ -321,7 +327,7 @@ static void r300AlignDataToDword(struct gl_context *ctx, const struct gl_client_
 	}
 
 	if (mapped_named_bo) {
-		ctx->Driver.UnmapBuffer(ctx, GL_ARRAY_BUFFER, input->BufferObj);
+		ctx->Driver.UnmapBuffer(ctx, input->BufferObj);
 	}
 
 	radeon_bo_unmap(attr->bo);
diff --git a/src/mesa/drivers/dri/r300/r300_tex.c b/src/mesa/drivers/dri/r300/r300_tex.c
index 590d9afe14a..93d8fe185ef 100644
--- a/src/mesa/drivers/dri/r300/r300_tex.c
+++ b/src/mesa/drivers/dri/r300/r300_tex.c
@@ -379,7 +379,6 @@ void r300InitTextureFuncs(radeonContextPtr radeon, struct dd_function_table *fun
 	functions->CompressedTexSubImage2D = radeonCompressedTexSubImage2D;
 
 	if (radeon->radeonScreen->kernel_mm) {
-		functions->CopyTexImage2D = radeonCopyTexImage2D;
 		functions->CopyTexSubImage2D = radeonCopyTexSubImage2D;
 	}
 
diff --git a/src/mesa/drivers/dri/r300/r300_texstate.c b/src/mesa/drivers/dri/r300/r300_texstate.c
index e24ad6f088d..e4388a021ed 100644
--- a/src/mesa/drivers/dri/r300/r300_texstate.c
+++ b/src/mesa/drivers/dri/r300/r300_texstate.c
@@ -427,13 +427,8 @@ void r300SetTexBuffer2(__DRIcontext *pDRICtx, GLint target, GLint texture_format
 	struct radeon_framebuffer *rfb;
 	radeonTexObjPtr t;
 	uint32_t pitch_val;
-	uint32_t internalFormat, type, format;
 	gl_format texFormat;
 
-	type = GL_BGRA;
-	format = GL_UNSIGNED_BYTE;
-	internalFormat = (texture_format == __DRI_TEXTURE_FORMAT_RGB ? 3 : 4);
-
 	radeon = pDRICtx->driverPrivate;
 	rmesa = pDRICtx->driverPrivate;
 
diff --git a/src/mesa/drivers/dri/r600/evergreen_fragprog.c b/src/mesa/drivers/dri/r600/evergreen_fragprog.c
index e527c379b62..cc584ca2b35 100644
--- a/src/mesa/drivers/dri/r600/evergreen_fragprog.c
+++ b/src/mesa/drivers/dri/r600/evergreen_fragprog.c
@@ -752,10 +752,10 @@ GLboolean evergreenSetupFPconstants(struct gl_context * ctx)
 	    unNumParamData = paramList->NumParameters;
 
 	    for(ui=0; ui<unNumParamData; ui++) {
-		        evergreen->ps.consts[ui][0].f32All = paramList->ParameterValues[ui][0];
-		        evergreen->ps.consts[ui][1].f32All = paramList->ParameterValues[ui][1];
-		        evergreen->ps.consts[ui][2].f32All = paramList->ParameterValues[ui][2];
-		        evergreen->ps.consts[ui][3].f32All = paramList->ParameterValues[ui][3];
+		        evergreen->ps.consts[ui][0].f32All = paramList->ParameterValues[ui][0].f;
+		        evergreen->ps.consts[ui][1].f32All = paramList->ParameterValues[ui][1].f;
+		        evergreen->ps.consts[ui][2].f32All = paramList->ParameterValues[ui][2].f;
+		        evergreen->ps.consts[ui][3].f32All = paramList->ParameterValues[ui][3].f;
 	    }
 
 	    /* alloc multiple of 16 constants */
diff --git a/src/mesa/drivers/dri/r600/evergreen_render.c b/src/mesa/drivers/dri/r600/evergreen_render.c
index 4507be29d86..74563caf47c 100644
--- a/src/mesa/drivers/dri/r600/evergreen_render.c
+++ b/src/mesa/drivers/dri/r600/evergreen_render.c
@@ -403,7 +403,8 @@ static void evergreenConvertAttrib(struct gl_context *ctx, int count,
     {
         if (!input->BufferObj->Pointer) 
         {
-            ctx->Driver.MapBuffer(ctx, GL_ARRAY_BUFFER, GL_READ_ONLY_ARB, input->BufferObj);
+	    ctx->Driver.MapBufferRange(ctx, 0, input->BufferObj->Size,
+				       GL_MAP_READ_BIT, input->BufferObj);
             mapped_named_bo = GL_TRUE;
         }
 
@@ -456,7 +457,7 @@ static void evergreenConvertAttrib(struct gl_context *ctx, int count,
 
     if (mapped_named_bo) 
     {
-        ctx->Driver.UnmapBuffer(ctx, GL_ARRAY_BUFFER, input->BufferObj);
+        ctx->Driver.UnmapBuffer(ctx, input->BufferObj);
     }
 }
 
@@ -470,7 +471,8 @@ static void evergreenFixupIndexBuffer(struct gl_context *ctx, const struct _mesa
 
     if (mesa_ind_buf->obj->Name && !mesa_ind_buf->obj->Pointer)
     {
-        ctx->Driver.MapBuffer(ctx, GL_ELEMENT_ARRAY_BUFFER, GL_READ_ONLY_ARB, mesa_ind_buf->obj);
+        ctx->Driver.MapBufferRange(ctx, 0, mesa_ind_buf->obj->Size,
+				   GL_MAP_READ_BIT, mesa_ind_buf->obj);
         mapped_named_bo = GL_TRUE;
         assert(mesa_ind_buf->obj->Pointer != NULL);
     }
@@ -531,7 +533,7 @@ static void evergreenFixupIndexBuffer(struct gl_context *ctx, const struct _mesa
 
     if (mapped_named_bo)
     {
-        ctx->Driver.UnmapBuffer(ctx, GL_ELEMENT_ARRAY_BUFFER, mesa_ind_buf->obj);
+        ctx->Driver.UnmapBuffer(ctx, mesa_ind_buf->obj);
     }
 }
 
@@ -606,7 +608,8 @@ static void evergreenSetupIndexBuffer(struct gl_context *ctx, const struct _mesa
 
         if (mesa_ind_buf->obj->Name && !mesa_ind_buf->obj->Pointer)
         {
-	        ctx->Driver.MapBuffer(ctx, GL_ELEMENT_ARRAY_BUFFER, GL_READ_ONLY_ARB, mesa_ind_buf->obj);
+	        ctx->Driver.MapBufferRange(ctx, 0, mesa_ind_buf->obj->Size,
+					   GL_MAP_READ_BIT, mesa_ind_buf->obj);
 	        assert(mesa_ind_buf->obj->Pointer != NULL);
 	        mapped_named_bo = GL_TRUE;
         }
@@ -629,7 +632,7 @@ static void evergreenSetupIndexBuffer(struct gl_context *ctx, const struct _mesa
 
         if (mapped_named_bo)
         {
-	        ctx->Driver.UnmapBuffer(ctx, GL_ELEMENT_ARRAY_BUFFER, mesa_ind_buf->obj);
+	        ctx->Driver.UnmapBuffer(ctx, mesa_ind_buf->obj);
         }
     }
     else
@@ -655,7 +658,8 @@ static void evergreenAlignDataToDword(struct gl_context *ctx,
 
     if (!input->BufferObj->Pointer) 
     {
-        ctx->Driver.MapBuffer(ctx, GL_ARRAY_BUFFER, GL_READ_ONLY_ARB, input->BufferObj);
+	ctx->Driver.MapBufferRange(ctx, 0, input->BufferObj->Size,
+				   GL_MAP_READ_BIT, input->BufferObj->obj);
         mapped_named_bo = GL_TRUE;
     }
 
@@ -675,7 +679,7 @@ static void evergreenAlignDataToDword(struct gl_context *ctx,
     radeon_bo_unmap(attr->bo);
     if (mapped_named_bo) 
     {
-        ctx->Driver.UnmapBuffer(ctx, GL_ARRAY_BUFFER, input->BufferObj);
+        ctx->Driver.UnmapBuffer(ctx, input->BufferObj);
     }
 
     attr->stride = dst_stride;
diff --git a/src/mesa/drivers/dri/r600/evergreen_tex.c b/src/mesa/drivers/dri/r600/evergreen_tex.c
index 33a5f277683..d240a216817 100644
--- a/src/mesa/drivers/dri/r600/evergreen_tex.c
+++ b/src/mesa/drivers/dri/r600/evergreen_tex.c
@@ -1288,19 +1288,12 @@ void evergreenSetTexBuffer(__DRIcontext *pDRICtx, GLint target, GLint glx_textur
 	struct radeon_renderbuffer *rb;
 	radeon_texture_image *rImage;
 	radeonContextPtr radeon;
-	context_t *rmesa;
 	struct radeon_framebuffer *rfb;
 	radeonTexObjPtr t;
 	uint32_t pitch_val;
-	uint32_t internalFormat, type, format;
 	gl_format texFormat;
 
-	type = GL_BGRA;
-	format = GL_UNSIGNED_BYTE;
-	internalFormat = (glx_texture_format == __DRI_TEXTURE_FORMAT_RGB ? 3 : 4);
-
 	radeon = pDRICtx->driverPrivate;
-	rmesa = pDRICtx->driverPrivate;
 
 	rfb = dPriv->driverPrivate;
         texUnit = &radeon->glCtx->Texture.Unit[radeon->glCtx->Texture.CurrentUnit];
@@ -1688,7 +1681,6 @@ void evergreenInitTextureFuncs(radeonContextPtr radeon, struct dd_function_table
 	functions->CompressedTexSubImage2D = radeonCompressedTexSubImage2D;
 
 	if (radeon->radeonScreen->kernel_mm) {
-		functions->CopyTexImage2D = radeonCopyTexImage2D;
 		functions->CopyTexSubImage2D = radeonCopyTexSubImage2D;
 	}
 
diff --git a/src/mesa/drivers/dri/r600/evergreen_vertprog.c b/src/mesa/drivers/dri/r600/evergreen_vertprog.c
index 018869b9996..117916ac78f 100644
--- a/src/mesa/drivers/dri/r600/evergreen_vertprog.c
+++ b/src/mesa/drivers/dri/r600/evergreen_vertprog.c
@@ -684,17 +684,17 @@ GLboolean evergreenSetupVPconstants(struct gl_context * ctx)
 	    for(ui=0; ui<unNumParamData; ui++) {
             if(paramList->Parameters[ui].Type == PROGRAM_UNIFORM) 
             {
-                evergreen->vs.consts[ui][0].f32All = paramListOrginal->ParameterValues[ui][0];
-		        evergreen->vs.consts[ui][1].f32All = paramListOrginal->ParameterValues[ui][1];
-		        evergreen->vs.consts[ui][2].f32All = paramListOrginal->ParameterValues[ui][2];
-		        evergreen->vs.consts[ui][3].f32All = paramListOrginal->ParameterValues[ui][3];
+                evergreen->vs.consts[ui][0].f32All = paramListOrginal->ParameterValues[ui][0].f;
+		        evergreen->vs.consts[ui][1].f32All = paramListOrginal->ParameterValues[ui][1].f;
+		        evergreen->vs.consts[ui][2].f32All = paramListOrginal->ParameterValues[ui][2].f;
+		        evergreen->vs.consts[ui][3].f32All = paramListOrginal->ParameterValues[ui][3].f;
             }
             else
             {
-		        evergreen->vs.consts[ui][0].f32All = paramList->ParameterValues[ui][0];
-		        evergreen->vs.consts[ui][1].f32All = paramList->ParameterValues[ui][1];
-		        evergreen->vs.consts[ui][2].f32All = paramList->ParameterValues[ui][2];
-		        evergreen->vs.consts[ui][3].f32All = paramList->ParameterValues[ui][3];
+		        evergreen->vs.consts[ui][0].f32All = paramList->ParameterValues[ui][0].f;
+		        evergreen->vs.consts[ui][1].f32All = paramList->ParameterValues[ui][1].f;
+		        evergreen->vs.consts[ui][2].f32All = paramList->ParameterValues[ui][2].f;
+		        evergreen->vs.consts[ui][3].f32All = paramList->ParameterValues[ui][3].f;
             }
 	    }
 
diff --git a/src/mesa/drivers/dri/r600/r600_cmdbuf.c b/src/mesa/drivers/dri/r600/r600_cmdbuf.c
index ce2f7779563..74f048b1062 100644
--- a/src/mesa/drivers/dri/r600/r600_cmdbuf.c
+++ b/src/mesa/drivers/dri/r600/r600_cmdbuf.c
@@ -259,13 +259,11 @@ static int r600_cs_process_relocs(struct radeon_cs_int *csi,
                                   uint32_t * reloc_chunk,
                                   uint32_t * length_dw_reloc_chunk) 
 {
-    struct r600_cs_manager_legacy *csm = (struct r600_cs_manager_legacy*)csi->csm;
     struct r600_cs_reloc_legacy *relocs;
     int i, j, r;
 
     uint32_t offset_dw = 0;
 
-    csm = (struct r600_cs_manager_legacy*)csi->csm;
     relocs = (struct r600_cs_reloc_legacy *)csi->relocs;
 restart:
     for (i = 0; i < csi->crelocs; i++) {
diff --git a/src/mesa/drivers/dri/r600/r600_tex.c b/src/mesa/drivers/dri/r600/r600_tex.c
index eb7ed30c7a3..3efa1d197fa 100644
--- a/src/mesa/drivers/dri/r600/r600_tex.c
+++ b/src/mesa/drivers/dri/r600/r600_tex.c
@@ -470,7 +470,6 @@ void r600InitTextureFuncs(radeonContextPtr radeon, struct dd_function_table *fun
 	functions->CompressedTexSubImage2D = radeonCompressedTexSubImage2D;
 
 	if (radeon->radeonScreen->kernel_mm) {
-		functions->CopyTexImage2D = radeonCopyTexImage2D;
 		functions->CopyTexSubImage2D = radeonCopyTexSubImage2D;
 	}
 
diff --git a/src/mesa/drivers/dri/r600/r600_texstate.c b/src/mesa/drivers/dri/r600/r600_texstate.c
index 949db29c189..65fae7195fd 100644
--- a/src/mesa/drivers/dri/r600/r600_texstate.c
+++ b/src/mesa/drivers/dri/r600/r600_texstate.c
@@ -1141,13 +1141,8 @@ void r600SetTexBuffer2(__DRIcontext *pDRICtx, GLint target, GLint glx_texture_fo
 	struct radeon_framebuffer *rfb;
 	radeonTexObjPtr t;
 	uint32_t pitch_val;
-	uint32_t internalFormat, type, format;
         gl_format texFormat;
 
-	type = GL_BGRA;
-	format = GL_UNSIGNED_BYTE;
-	internalFormat = (glx_texture_format == __DRI_TEXTURE_FORMAT_RGB ? 3 : 4);
-
 	radeon = pDRICtx->driverPrivate;
 	rmesa = pDRICtx->driverPrivate;
 
diff --git a/src/mesa/drivers/dri/r600/r700_fragprog.c b/src/mesa/drivers/dri/r600/r700_fragprog.c
index 40494cd6af0..6f9834e68fe 100644
--- a/src/mesa/drivers/dri/r600/r700_fragprog.c
+++ b/src/mesa/drivers/dri/r600/r700_fragprog.c
@@ -778,10 +778,10 @@ GLboolean r700SetupFragmentProgram(struct gl_context * ctx)
 	    unNumParamData = paramList->NumParameters;
 
 	    for(ui=0; ui<unNumParamData; ui++) {
-		        r700->ps.consts[ui][0].f32All = paramList->ParameterValues[ui][0];
-		        r700->ps.consts[ui][1].f32All = paramList->ParameterValues[ui][1];
-		        r700->ps.consts[ui][2].f32All = paramList->ParameterValues[ui][2];
-		        r700->ps.consts[ui][3].f32All = paramList->ParameterValues[ui][3];
+		        r700->ps.consts[ui][0].f32All = paramList->ParameterValues[ui][0].f;
+		        r700->ps.consts[ui][1].f32All = paramList->ParameterValues[ui][1].f;
+		        r700->ps.consts[ui][2].f32All = paramList->ParameterValues[ui][2].f;
+		        r700->ps.consts[ui][3].f32All = paramList->ParameterValues[ui][3].f;
 	    }
 
         /* Load fp constants to gpu */
diff --git a/src/mesa/drivers/dri/r600/r700_render.c b/src/mesa/drivers/dri/r600/r700_render.c
index 0f7a7a46b71..a565c9f2087 100644
--- a/src/mesa/drivers/dri/r600/r700_render.c
+++ b/src/mesa/drivers/dri/r600/r700_render.c
@@ -490,7 +490,8 @@ static void r700ConvertAttrib(struct gl_context *ctx, int count,
     {
         if (!input->BufferObj->Pointer) 
         {
-            ctx->Driver.MapBuffer(ctx, GL_ARRAY_BUFFER, GL_READ_ONLY_ARB, input->BufferObj);
+	   ctx->Driver.MapBufferRange(ctx, 0, input->BufferObj->Size,
+				      GL_MAP_READ_BIT, input->BufferObj);
             mapped_named_bo = GL_TRUE;
         }
 
@@ -543,7 +544,7 @@ static void r700ConvertAttrib(struct gl_context *ctx, int count,
 
     if (mapped_named_bo) 
     {
-        ctx->Driver.UnmapBuffer(ctx, GL_ARRAY_BUFFER, input->BufferObj);
+        ctx->Driver.UnmapBuffer(ctx, input->BufferObj);
     }
 }
 
@@ -564,7 +565,8 @@ static void r700AlignDataToDword(struct gl_context *ctx,
 
     if (!input->BufferObj->Pointer) 
     {
-        ctx->Driver.MapBuffer(ctx, GL_ARRAY_BUFFER, GL_READ_ONLY_ARB, input->BufferObj);
+        ctx->Driver.MapBufferRange(ctx, 0, input->BufferObj->Size,
+				   GL_MAP_READ_BIT, input->BufferObj);
         mapped_named_bo = GL_TRUE;
     }
 
@@ -584,7 +586,7 @@ static void r700AlignDataToDword(struct gl_context *ctx,
     radeon_bo_unmap(attr->bo);
     if (mapped_named_bo) 
     {
-        ctx->Driver.UnmapBuffer(ctx, GL_ARRAY_BUFFER, input->BufferObj);
+        ctx->Driver.UnmapBuffer(ctx, input->BufferObj);
     }
 
     attr->stride = dst_stride;
@@ -727,7 +729,8 @@ static void r700FixupIndexBuffer(struct gl_context *ctx, const struct _mesa_inde
 
     if (mesa_ind_buf->obj->Name && !mesa_ind_buf->obj->Pointer)
     {
-        ctx->Driver.MapBuffer(ctx, GL_ELEMENT_ARRAY_BUFFER, GL_READ_ONLY_ARB, mesa_ind_buf->obj);
+	ctx->Driver.MapBufferRange(ctx, 0, mesa_ind_buf->obj->Size,
+				   GL_MAP_READ_BIT, mesa_ind_buf->obj);
         mapped_named_bo = GL_TRUE;
         assert(mesa_ind_buf->obj->Pointer != NULL);
     }
@@ -788,7 +791,7 @@ static void r700FixupIndexBuffer(struct gl_context *ctx, const struct _mesa_inde
 
     if (mapped_named_bo)
     {
-        ctx->Driver.UnmapBuffer(ctx, GL_ELEMENT_ARRAY_BUFFER, mesa_ind_buf->obj);
+        ctx->Driver.UnmapBuffer(ctx, mesa_ind_buf->obj);
     }
 }
 
@@ -813,7 +816,8 @@ static void r700SetupIndexBuffer(struct gl_context *ctx, const struct _mesa_inde
 
         if (mesa_ind_buf->obj->Name && !mesa_ind_buf->obj->Pointer)
         {
-	        ctx->Driver.MapBuffer(ctx, GL_ELEMENT_ARRAY_BUFFER, GL_READ_ONLY_ARB, mesa_ind_buf->obj);
+		ctx->Driver.MapBufferRange(ctx, 0, mesa_ind_buf->obj->Size,
+					   GL_MAP_READ_BIT, mesa_ind_buf->obj);
 	        assert(mesa_ind_buf->obj->Pointer != NULL);
 	        mapped_named_bo = GL_TRUE;
         }
@@ -836,7 +840,7 @@ static void r700SetupIndexBuffer(struct gl_context *ctx, const struct _mesa_inde
 
         if (mapped_named_bo)
         {
-	        ctx->Driver.UnmapBuffer(ctx, GL_ELEMENT_ARRAY_BUFFER, mesa_ind_buf->obj);
+	        ctx->Driver.UnmapBuffer(ctx, mesa_ind_buf->obj);
         }
     }
     else
diff --git a/src/mesa/drivers/dri/r600/r700_vertprog.c b/src/mesa/drivers/dri/r600/r700_vertprog.c
index 7d4be9180a0..b1e2742b27d 100644
--- a/src/mesa/drivers/dri/r600/r700_vertprog.c
+++ b/src/mesa/drivers/dri/r600/r700_vertprog.c
@@ -720,17 +720,17 @@ GLboolean r700SetupVertexProgram(struct gl_context * ctx)
 	    for(ui=0; ui<unNumParamData; ui++) {
             if(paramList->Parameters[ui].Type == PROGRAM_UNIFORM) 
             {
-                r700->vs.consts[ui][0].f32All = paramListOrginal->ParameterValues[ui][0];
-		        r700->vs.consts[ui][1].f32All = paramListOrginal->ParameterValues[ui][1];
-		        r700->vs.consts[ui][2].f32All = paramListOrginal->ParameterValues[ui][2];
-		        r700->vs.consts[ui][3].f32All = paramListOrginal->ParameterValues[ui][3];
+              r700->vs.consts[ui][0].f32All = paramListOrginal->ParameterValues[ui][0].f;
+		        r700->vs.consts[ui][1].f32All = paramListOrginal->ParameterValues[ui][1].f;
+		        r700->vs.consts[ui][2].f32All = paramListOrginal->ParameterValues[ui][2].f;
+		        r700->vs.consts[ui][3].f32All = paramListOrginal->ParameterValues[ui][3].f;
             }
             else
             {
-		        r700->vs.consts[ui][0].f32All = paramList->ParameterValues[ui][0];
-		        r700->vs.consts[ui][1].f32All = paramList->ParameterValues[ui][1];
-		        r700->vs.consts[ui][2].f32All = paramList->ParameterValues[ui][2];
-		        r700->vs.consts[ui][3].f32All = paramList->ParameterValues[ui][3];
+		        r700->vs.consts[ui][0].f32All = paramList->ParameterValues[ui][0].f;
+		        r700->vs.consts[ui][1].f32All = paramList->ParameterValues[ui][1].f;
+		        r700->vs.consts[ui][2].f32All = paramList->ParameterValues[ui][2].f;
+		        r700->vs.consts[ui][3].f32All = paramList->ParameterValues[ui][3].f;
             }
 	    }
 
diff --git a/src/mesa/drivers/dri/radeon/radeon_bocs_wrapper.h b/src/mesa/drivers/dri/radeon/radeon_bocs_wrapper.h
index 607b7470d4b..a74c6c7a575 100644
--- a/src/mesa/drivers/dri/radeon/radeon_bocs_wrapper.h
+++ b/src/mesa/drivers/dri/radeon/radeon_bocs_wrapper.h
@@ -78,6 +78,9 @@ static inline uint32_t radeon_gem_name_bo(struct radeon_bo *dummy)
 
 static inline void *radeon_bo_manager_gem_ctor(int fd)
 {
+  fprintf(stderr, "[%s:%u] Mesa built without Radeon libdrm support.\n",
+	  __func__, __LINE__);
+
   return NULL;
 }
 
@@ -87,6 +90,9 @@ static inline void radeon_bo_manager_gem_dtor(void *dummy)
 
 static inline void *radeon_cs_manager_gem_ctor(int fd)
 {
+  fprintf(stderr, "[%s:%u] Mesa built without Radeon libdrm support.\n",
+	  __func__, __LINE__);
+
   return NULL;
 }
 
diff --git a/src/mesa/drivers/dri/radeon/radeon_buffer_objects.c b/src/mesa/drivers/dri/radeon/radeon_buffer_objects.c
index 0d1af726c07..7b59c0377f8 100644
--- a/src/mesa/drivers/dri/radeon/radeon_buffer_objects.c
+++ b/src/mesa/drivers/dri/radeon/radeon_buffer_objects.c
@@ -130,7 +130,6 @@ radeonBufferData(struct gl_context * ctx,
  */
 static void
 radeonBufferSubData(struct gl_context * ctx,
-                    GLenum target,
                     GLintptrARB offset,
                     GLsizeiptrARB size,
                     const GLvoid * data,
@@ -155,7 +154,6 @@ radeonBufferSubData(struct gl_context * ctx,
  */
 static void
 radeonGetBufferSubData(struct gl_context * ctx,
-                       GLenum target,
                        GLintptrARB offset,
                        GLsizeiptrARB size,
                        GLvoid * data,
@@ -171,17 +169,18 @@ radeonGetBufferSubData(struct gl_context * ctx,
 }
 
 /**
- * Called via glMapBufferARB()
+ * Called via glMapBuffer() and glMapBufferRange()
  */
 static void *
-radeonMapBuffer(struct gl_context * ctx,
-                GLenum target,
-                GLenum access,
-                struct gl_buffer_object *obj)
+radeonMapBufferRange(struct gl_context * ctx,
+		     GLintptr offset, GLsizeiptr length,
+		     GLbitfield access, struct gl_buffer_object *obj)
 {
     struct radeon_buffer_object *radeon_obj = get_radeon_buffer_object(obj);
+    const GLboolean write_only =
+       (access & (GL_MAP_READ_BIT | GL_MAP_WRITE_BIT)) == GL_MAP_WRITE_BIT;
 
-    if (access == GL_WRITE_ONLY_ARB) {
+    if (write_only) {
         ctx->Driver.Flush(ctx);
     }
 
@@ -190,12 +189,13 @@ radeonMapBuffer(struct gl_context * ctx,
         return NULL;
     }
 
-    radeon_bo_map(radeon_obj->bo, access == GL_WRITE_ONLY_ARB);
+    obj->Offset = offset;
+    obj->Length = length;
+    obj->AccessFlags = access;
 
-    obj->Pointer = radeon_obj->bo->ptr;
-    obj->Length = obj->Size;
-    obj->Offset = 0;
+    radeon_bo_map(radeon_obj->bo, write_only);
 
+    obj->Pointer = radeon_obj->bo->ptr + offset;
     return obj->Pointer;
 }
 
@@ -205,7 +205,6 @@ radeonMapBuffer(struct gl_context * ctx,
  */
 static GLboolean
 radeonUnmapBuffer(struct gl_context * ctx,
-                  GLenum target,
                   struct gl_buffer_object *obj)
 {
     struct radeon_buffer_object *radeon_obj = get_radeon_buffer_object(obj);
@@ -229,6 +228,6 @@ radeonInitBufferObjectFuncs(struct dd_function_table *functions)
     functions->BufferData = radeonBufferData;
     functions->BufferSubData = radeonBufferSubData;
     functions->GetBufferSubData = radeonGetBufferSubData;
-    functions->MapBuffer = radeonMapBuffer;
+    functions->MapBufferRange = radeonMapBufferRange;
     functions->UnmapBuffer = radeonUnmapBuffer;
 }
diff --git a/src/mesa/drivers/dri/radeon/radeon_common.c b/src/mesa/drivers/dri/radeon/radeon_common.c
index bfc307ca987..e7a6623cf84 100644
--- a/src/mesa/drivers/dri/radeon/radeon_common.c
+++ b/src/mesa/drivers/dri/radeon/radeon_common.c
@@ -436,7 +436,6 @@ void radeonCopyBuffer( __DRIdrawable *dPriv,
 		       const drm_clip_rect_t	  *rect)
 {
 	radeonContextPtr rmesa;
-	struct radeon_framebuffer *rfb;
 	GLint nbox, i, ret;
 
 	assert(dPriv);
@@ -447,8 +446,6 @@ void radeonCopyBuffer( __DRIdrawable *dPriv,
 
 	LOCK_HARDWARE(rmesa);
 
-	rfb = dPriv->driverPrivate;
-
 	if ( RADEON_DEBUG & RADEON_IOCTL ) {
 		fprintf( stderr, "\n%s( %p )\n\n", __FUNCTION__, (void *) rmesa->glCtx );
 	}
@@ -527,8 +524,6 @@ static GLboolean radeonPageFlip( __DRIdrawable *dPriv )
 {
 	radeonContextPtr radeon;
 	GLint ret;
-	__DRIscreen *psp;
-	struct radeon_renderbuffer *rrb;
 	struct radeon_framebuffer *rfb;
 
 	assert(dPriv);
@@ -537,9 +532,6 @@ static GLboolean radeonPageFlip( __DRIdrawable *dPriv )
 
 	radeon = (radeonContextPtr) dPriv->driContextPriv->driverPrivate;
 	rfb = dPriv->driverPrivate;
-	rrb = (void *)rfb->base.Attachment[BUFFER_FRONT_LEFT].Renderbuffer;
-
-	psp = dPriv->driScreenPriv;
 
 	LOCK_HARDWARE(radeon);
 
diff --git a/src/mesa/drivers/dri/radeon/radeon_common_context.c b/src/mesa/drivers/dri/radeon/radeon_common_context.c
index bf8925f61d0..c08b79484af 100644
--- a/src/mesa/drivers/dri/radeon/radeon_common_context.c
+++ b/src/mesa/drivers/dri/radeon/radeon_common_context.c
@@ -515,7 +515,6 @@ void radeon_prepare_render(radeonContextPtr radeon)
     __DRIcontext *driContext = radeon->dri.context;
     __DRIdrawable *drawable;
     __DRIscreen *screen;
-    struct radeon_framebuffer *draw;
 
     screen = driContext->driScreenPriv;
     if (!screen->dri2.loader)
@@ -527,7 +526,6 @@ void radeon_prepare_render(radeonContextPtr radeon)
 	    radeon_update_renderbuffers(driContext, drawable, GL_FALSE);
 
 	/* Intel driver does the equivalent of this, no clue if it is needed:*/
-	draw = drawable->driverPrivate;
 	radeon_draw_buffer(radeon->glCtx, radeon->glCtx->DrawBuffer);
 
 	driContext->dri2.draw_stamp = drawable->dri2.stamp;
diff --git a/src/mesa/drivers/dri/radeon/radeon_cs_legacy.c b/src/mesa/drivers/dri/radeon/radeon_cs_legacy.c
index c2722a4e195..5595b705b15 100644
--- a/src/mesa/drivers/dri/radeon/radeon_cs_legacy.c
+++ b/src/mesa/drivers/dri/radeon/radeon_cs_legacy.c
@@ -218,11 +218,9 @@ static int cs_end(struct radeon_cs_int *cs,
 
 static int cs_process_relocs(struct radeon_cs_int *cs)
 {
-    struct cs_manager_legacy *csm = (struct cs_manager_legacy*)cs->csm;
     struct cs_reloc_legacy *relocs;
     int i, j, r;
 
-    csm = (struct cs_manager_legacy*)cs->csm;
     relocs = (struct cs_reloc_legacy *)cs->relocs;
 restart:
     for (i = 0; i < cs->crelocs; i++) 
diff --git a/src/mesa/drivers/dri/radeon/radeon_ioctl.c b/src/mesa/drivers/dri/radeon/radeon_ioctl.c
index a91d8727792..c23e9c2d2a2 100644
--- a/src/mesa/drivers/dri/radeon/radeon_ioctl.c
+++ b/src/mesa/drivers/dri/radeon/radeon_ioctl.c
@@ -560,7 +560,6 @@ static void radeonClear( struct gl_context *ctx, GLbitfield mask )
    r100ContextPtr rmesa = R100_CONTEXT(ctx);
    __DRIdrawable *dPriv = radeon_get_drawable(&rmesa->radeon);
    GLuint flags = 0;
-   GLuint color_mask = 0;
    GLuint orig_mask = mask;
 
    if (mask & (BUFFER_BIT_FRONT_LEFT | BUFFER_BIT_FRONT_RIGHT)) {
@@ -582,13 +581,11 @@ static void radeonClear( struct gl_context *ctx, GLbitfield mask )
 
    if ( mask & BUFFER_BIT_FRONT_LEFT ) {
       flags |= RADEON_FRONT;
-      color_mask = rmesa->hw.msk.cmd[MSK_RB3D_PLANEMASK];
       mask &= ~BUFFER_BIT_FRONT_LEFT;
    }
 
    if ( mask & BUFFER_BIT_BACK_LEFT ) {
       flags |= RADEON_BACK;
-      color_mask = rmesa->hw.msk.cmd[MSK_RB3D_PLANEMASK];
       mask &= ~BUFFER_BIT_BACK_LEFT;
    }
 
diff --git a/src/mesa/drivers/dri/radeon/radeon_lock.c b/src/mesa/drivers/dri/radeon/radeon_lock.c
index 7b6bd36dcf7..ae8a212f806 100644
--- a/src/mesa/drivers/dri/radeon/radeon_lock.c
+++ b/src/mesa/drivers/dri/radeon/radeon_lock.c
@@ -114,16 +114,6 @@ void radeon_lock_hardware(radeonContextPtr radeon
 		)
 {
 	char ret = 0;
-	struct radeon_framebuffer *rfb = NULL;
-	struct radeon_renderbuffer *rrb = NULL;
-
-	if (radeon_get_drawable(radeon)) {
-		rfb = radeon_get_drawable(radeon)->driverPrivate;
-
-		if (rfb)
-			rrb = radeon_get_renderbuffer(&rfb->base,
-						      rfb->base._ColorDrawBufferIndexes[0]);
-	}
 
 	if (!radeon->radeonScreen->driScreen->dri2.enabled) {
 		if (ATOMIC_INC_AND_FETCH(radeon->dri.hwLockCount) > 1)
diff --git a/src/mesa/drivers/dri/radeon/radeon_tex.c b/src/mesa/drivers/dri/radeon/radeon_tex.c
index 25a8ddf7b6a..a0b5506ae76 100644
--- a/src/mesa/drivers/dri/radeon/radeon_tex.c
+++ b/src/mesa/drivers/dri/radeon/radeon_tex.c
@@ -455,7 +455,6 @@ void radeonInitTextureFuncs( radeonContextPtr radeon, struct dd_function_table *
    functions->CompressedTexSubImage2D	= radeonCompressedTexSubImage2D;
 
    if (radeon->radeonScreen->kernel_mm) {
-      functions->CopyTexImage2D = radeonCopyTexImage2D;
       functions->CopyTexSubImage2D = radeonCopyTexSubImage2D;
    }
 
diff --git a/src/mesa/drivers/dri/radeon/radeon_tex_copy.c b/src/mesa/drivers/dri/radeon/radeon_tex_copy.c
index f14dfa25d40..94ff3c4a727 100644
--- a/src/mesa/drivers/dri/radeon/radeon_tex_copy.c
+++ b/src/mesa/drivers/dri/radeon/radeon_tex_copy.c
@@ -141,61 +141,6 @@ do_copy_texsubimage(struct gl_context *ctx,
 }
 
 void
-radeonCopyTexImage2D(struct gl_context *ctx, GLenum target, GLint level,
-                     GLenum internalFormat,
-                     GLint x, GLint y, GLsizei width, GLsizei height,
-                     GLint border)
-{
-    struct gl_texture_unit *texUnit = _mesa_get_current_tex_unit(ctx);
-    struct gl_texture_object *texObj =
-        _mesa_select_tex_object(ctx, texUnit, target);
-    struct gl_texture_image *texImage =
-        _mesa_select_tex_image(ctx, texObj, target, level);
-    int srcx, srcy, dstx, dsty;
-
-    radeonContextPtr radeon = RADEON_CONTEXT(ctx);
-    radeon_prepare_render(radeon);
-
-    if (border)
-        goto fail;
-
-    /* Setup or redefine the texture object, mipmap tree and texture
-     * image.  Don't populate yet.
-     */
-    ctx->Driver.TexImage2D(ctx, target, level, internalFormat,
-                           width, height, border,
-                           GL_RGBA, GL_UNSIGNED_BYTE, NULL,
-                           &ctx->DefaultPacking, texObj, texImage);
-
-    srcx = x;
-    srcy = y;
-    dstx = 0;
-    dsty = 0;
-    if (!_mesa_clip_copytexsubimage(ctx,
-                                    &dstx, &dsty,
-                                    &srcx, &srcy,
-                                    &width, &height)) {
-        return;
-    }
-
-    if (!do_copy_texsubimage(ctx, target, level,
-                             radeon_tex_obj(texObj), (radeon_texture_image *)texImage,
-                             0, 0, x, y, width, height)) {
-        goto fail;
-    }
-
-    return;
-
-fail:
-    radeon_print(RADEON_FALLBACKS, RADEON_NORMAL,
-                 "Falling back to sw for glCopyTexImage2D (internalFormat %s, border %d)\n",
-                 _mesa_lookup_enum_by_nr(internalFormat), border);
-
-    _mesa_meta_CopyTexImage2D(ctx, target, level, internalFormat, x, y,
-                              width, height, border);
-}
-
-void
 radeonCopyTexSubImage2D(struct gl_context *ctx, GLenum target, GLint level,
                         GLint xoffset, GLint yoffset,
                         GLint x, GLint y,
diff --git a/src/mesa/drivers/dri/radeon/radeon_texstate.c b/src/mesa/drivers/dri/radeon/radeon_texstate.c
index 9ba98e303a7..430309392a0 100644
--- a/src/mesa/drivers/dri/radeon/radeon_texstate.c
+++ b/src/mesa/drivers/dri/radeon/radeon_texstate.c
@@ -648,18 +648,12 @@ void radeonSetTexBuffer2(__DRIcontext *pDRICtx, GLint target, GLint texture_form
 	struct radeon_renderbuffer *rb;
 	radeon_texture_image *rImage;
 	radeonContextPtr radeon;
-	r100ContextPtr rmesa;
 	struct radeon_framebuffer *rfb;
 	radeonTexObjPtr t;
 	uint32_t pitch_val;
-	uint32_t internalFormat, format;
 	gl_format texFormat;
 
-	format = GL_UNSIGNED_BYTE;
-	internalFormat = (texture_format == __DRI_TEXTURE_FORMAT_RGB ? GL_RGB : GL_RGBA);
-
 	radeon = pDRICtx->driverPrivate;
-	rmesa = pDRICtx->driverPrivate;
 
 	rfb = dPriv->driverPrivate;
         texUnit = _mesa_get_current_tex_unit(radeon->glCtx);
@@ -1018,7 +1012,7 @@ static GLboolean radeon_validate_texgen( struct gl_context *ctx, GLuint unit )
 static GLboolean setup_hardware_state(r100ContextPtr rmesa, radeonTexObj *t, int unit)
 {
    const struct gl_texture_image *firstImage;
-   GLint log2Width, log2Height, log2Depth, texelBytes;
+   GLint log2Width, log2Height, texelBytes;
 
    if ( t->bo ) {
 	return GL_TRUE;
@@ -1033,7 +1027,6 @@ static GLboolean setup_hardware_state(r100ContextPtr rmesa, radeonTexObj *t, int
 
    log2Width  = firstImage->WidthLog2;
    log2Height = firstImage->HeightLog2;
-   log2Depth  = firstImage->DepthLog2;
    texelBytes = _mesa_get_format_bytes(firstImage->TexFormat);
 
    if (!t->image_override) {
diff --git a/src/mesa/drivers/dri/radeon/radeon_texture.c b/src/mesa/drivers/dri/radeon/radeon_texture.c
index ce0df32bfe4..ad7e4c146a4 100644
--- a/src/mesa/drivers/dri/radeon/radeon_texture.c
+++ b/src/mesa/drivers/dri/radeon/radeon_texture.c
@@ -787,18 +787,6 @@ static void radeon_teximage(
 	radeon_print(RADEON_TEXTURE, RADEON_NORMAL,
 			"%s %dd: texObj %p, texImage %p, face %d, level %d\n",
 			__func__, dims, texObj, texImage, face, level);
-	{
-		struct radeon_bo *bo;
-		bo = !image->mt ? image->bo : image->mt->bo;
-		if (bo && radeon_bo_is_referenced_by_cs(bo, rmesa->cmdbuf.cs)) {
-			radeon_print(RADEON_TEXTURE, RADEON_VERBOSE,
-				"%s Calling teximage for texture that is "
-				"queued for GPU processing.\n",
-				__func__);
-			radeon_firevertices(rmesa);
-		}
-	}
-
 
 	t->validated = GL_FALSE;
 
@@ -820,6 +808,18 @@ static void radeon_teximage(
 		}
 	}
 
+	{
+		struct radeon_bo *bo;
+		bo = !image->mt ? image->bo : image->mt->bo;
+		if (bo && radeon_bo_is_referenced_by_cs(bo, rmesa->cmdbuf.cs)) {
+			radeon_print(RADEON_TEXTURE, RADEON_VERBOSE,
+				"%s Calling teximage for texture that is "
+				"queued for GPU processing.\n",
+				__func__);
+			radeon_firevertices(rmesa);
+		}
+	}
+
 	/* Upload texture image; note that the spec allows pixels to be NULL */
 	if (compressed) {
 		pixels = _mesa_validate_pbo_compressed_teximage(
diff --git a/src/mesa/drivers/dri/radeon/radeon_texture.h b/src/mesa/drivers/dri/radeon/radeon_texture.h
index 538a07fbba8..6fc06d967dd 100644
--- a/src/mesa/drivers/dri/radeon/radeon_texture.h
+++ b/src/mesa/drivers/dri/radeon/radeon_texture.h
@@ -126,11 +126,6 @@ void radeonGetCompressedTexImage(struct gl_context *ctx, GLenum target, GLint le
 				 struct gl_texture_object *texObj,
 				 struct gl_texture_image *texImage);
 
-void radeonCopyTexImage2D(struct gl_context *ctx, GLenum target, GLint level,
-			GLenum internalFormat,
-			GLint x, GLint y, GLsizei width, GLsizei height,
-			GLint border);
-
 void radeonCopyTexSubImage2D(struct gl_context *ctx, GLenum target, GLint level,
 			GLint xoffset, GLint yoffset,
 			GLint x, GLint y,
diff --git a/src/mesa/drivers/x11/xm_dd.c b/src/mesa/drivers/x11/xm_dd.c
index 81f48f9d95a..81d000b3952 100644
--- a/src/mesa/drivers/x11/xm_dd.c
+++ b/src/mesa/drivers/x11/xm_dd.c
@@ -454,10 +454,10 @@ xmesa_DrawPixels_8R8G8B( struct gl_context *ctx,
                         "glDrawPixels(invalid PBO access)");
             return;
          }
-         buf = (GLubyte *) ctx->Driver.MapBuffer(ctx,
-                                                 GL_PIXEL_UNPACK_BUFFER_EXT,
-                                                 GL_READ_ONLY_ARB,
-                                                 unpack->BufferObj);
+         buf = (GLubyte *) ctx->Driver.MapBufferRange(ctx, 0,
+						      unpack->BufferObj->Size,
+						      GL_MAP_READ_BIT,
+						      unpack->BufferObj);
          if (!buf) {
             /* buffer is already mapped - that's an error */
             _mesa_error(ctx, GL_INVALID_OPERATION,
@@ -508,8 +508,7 @@ xmesa_DrawPixels_8R8G8B( struct gl_context *ctx,
       }
 
       if (_mesa_is_bufferobj(unpack->BufferObj)) {
-         ctx->Driver.UnmapBuffer(ctx, GL_PIXEL_UNPACK_BUFFER_EXT,
-                                 unpack->BufferObj);
+         ctx->Driver.UnmapBuffer(ctx, unpack->BufferObj);
       }
    }
    else {
@@ -589,10 +588,10 @@ xmesa_DrawPixels_5R6G5B( struct gl_context *ctx,
                         "glDrawPixels(invalid PBO access)");
             return;
          }
-         buf = (GLubyte *) ctx->Driver.MapBuffer(ctx,
-                                                 GL_PIXEL_UNPACK_BUFFER_EXT,
-                                                 GL_READ_ONLY_ARB,
-                                                 unpack->BufferObj);
+         buf = (GLubyte *) ctx->Driver.MapBufferRange(ctx, 0,
+						      unpack->BufferObj->Size,
+						      GL_MAP_READ_BIT,
+						      unpack->BufferObj);
          if (!buf) {
             /* buffer is already mapped - that's an error */
             _mesa_error(ctx, GL_INVALID_OPERATION,
@@ -642,8 +641,7 @@ xmesa_DrawPixels_5R6G5B( struct gl_context *ctx,
       }
 
       if (unpack->BufferObj->Name) {
-         ctx->Driver.UnmapBuffer(ctx, GL_PIXEL_UNPACK_BUFFER_EXT,
-                                 unpack->BufferObj);
+         ctx->Driver.UnmapBuffer(ctx, unpack->BufferObj);
       }
    }
    else {