83 files changed, 3597 insertions, 874 deletions
diff --git a/src/mesa/drivers/common/meta.c b/src/mesa/drivers/common/meta.c
new file mode 100644
index 00000000000..e42beabc9ba
--- /dev/null
+++ b/src/mesa/drivers/common/meta.c
@@ -0,0 +1,1249 @@
+/*
+ * Mesa 3-D graphics library
+ * Version:  7.6
+ *
+ * Copyright (C) 2009  VMware, Inc.  All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+/**
+ * Meta operations.  Some GL operations can be expressed in terms of
+ * other GL operations.  For example, glBlitFramebuffer() can be done
+ * with texture mapping and glClear() can be done with polygon rendering.
+ *
+ * \author Brian Paul
+ */
+
+
+#include "main/glheader.h"
+#include "main/mtypes.h"
+#include "main/imports.h"
+#include "main/arrayobj.h"
+#include "main/blend.h"
+#include "main/bufferobj.h"
+#include "main/depth.h"
+#include "main/enable.h"
+#include "main/image.h"
+#include "main/macros.h"
+#include "main/matrix.h"
+#include "main/polygon.h"
+#include "main/scissor.h"
+#include "main/shaders.h"
+#include "main/stencil.h"
+#include "main/texobj.h"
+#include "main/texenv.h"
+#include "main/teximage.h"
+#include "main/texparam.h"
+#include "main/texstate.h"
+#include "main/varray.h"
+#include "main/viewport.h"
+#include "shader/program.h"
+#include "swrast/swrast.h"
+#include "drivers/common/meta.h"
+
+
+/**
+ * State which we may save/restore across meta ops.
+ * XXX this may be incomplete...
+ */
+struct save_state
+{
+   GLbitfield SavedState;  /**< bitmask of META_* flags */
+
+   /** META_ALPHA_TEST */
+   GLboolean AlphaEnabled;
+
+   /** META_BLEND */
+   GLboolean BlendEnabled;
+   GLboolean ColorLogicOpEnabled;
+
+   /** META_COLOR_MASK */
+   GLubyte ColorMask[4];
+
+   /** META_DEPTH_TEST */
+   struct gl_depthbuffer_attrib Depth;
+
+   /** META_FOG */
+   GLboolean Fog;
+
+   /** META_PIXELSTORE */
+   /* XXX / TO-DO */
+
+   /** META_RASTERIZATION */
+   GLenum FrontPolygonMode, BackPolygonMode;
+   GLboolean PolygonOffset;
+   GLboolean PolygonSmooth;
+   GLboolean PolygonStipple;
+   GLboolean PolygonCull;
+
+   /** META_SCISSOR */
+   struct gl_scissor_attrib Scissor;
+
+   /** META_SHADER */
+   GLboolean VertexProgramEnabled;
+   struct gl_vertex_program *VertexProgram;
+   GLboolean FragmentProgramEnabled;
+   struct gl_fragment_program *FragmentProgram;
+   GLuint Shader;
+
+   /** META_STENCIL_TEST */
+   struct gl_stencil_attrib Stencil;
+
+   /** META_TRANSFORM */
+   GLenum MatrixMode;
+   GLfloat ModelviewMatrix[16];
+   GLfloat ProjectionMatrix[16];
+   GLfloat TextureMatrix[16];
+   GLbitfield ClipPlanesEnabled;
+
+   /** META_TEXTURE */
+   GLuint ActiveUnit;
+   GLuint ClientActiveUnit;
+   /** for unit[0] only */
+   struct gl_texture_object *CurrentTexture[NUM_TEXTURE_TARGETS];
+   /** mask of TEXTURE_2D_BIT, etc */
+   GLbitfield TexEnabled[MAX_TEXTURE_UNITS];
+   GLbitfield TexGenEnabled[MAX_TEXTURE_UNITS];
+   GLuint EnvMode;  /* unit[0] only */
+
+   /** META_VERTEX */
+   struct gl_array_object *ArrayObj;
+   struct gl_buffer_object *ArrayBufferObj;
+
+   /** META_VIEWPORT */
+   GLint ViewportX, ViewportY, ViewportW, ViewportH;
+   GLclampd DepthNear, DepthFar;
+
+   /** Miscellaneous (always disabled) */
+   GLboolean Lighting;
+};
+
+
+/**
+ * State for glBlitFramebufer()
+ */
+struct blit_state
+{
+   GLuint TexObj;
+   GLsizei TexWidth, TexHeight;
+   GLenum TexType;
+   GLuint ArrayObj;
+   GLuint VBO;
+   GLfloat verts[4][4]; /** four verts of X,Y,S,T */
+};
+
+
+/**
+ * State for glClear()
+ */
+struct clear_state
+{
+   GLuint ArrayObj;
+   GLuint VBO;
+   GLfloat verts[4][7]; /** four verts of X,Y,Z,R,G,B,A */
+};
+
+
+/**
+ * State for glCopyPixels()
+ */
+struct copypix_state
+{
+   GLuint TexObj;
+   GLsizei TexWidth, TexHeight;
+   GLenum TexType;
+   GLuint ArrayObj;
+   GLuint VBO;
+   GLfloat verts[4][5]; /** four verts of X,Y,Z,S,T */
+};
+
+
+/**
+ * State for glDrawPixels()
+ */
+struct drawpix_state
+{
+   GLuint TexObj;
+   GLsizei TexWidth, TexHeight;
+   GLenum TexIntFormat;
+   GLuint ArrayObj;
+   GLuint VBO;
+   GLfloat verts[4][5]; /** four verts of X,Y,Z,S,T */
+};
+
+
+
+/**
+ * All per-context meta state.
+ */
+struct gl_meta_state
+{
+   struct save_state Save;    /**< state saved during meta-ops */
+
+   struct blit_state Blit;    /**< For _mesa_meta_blit_framebuffer() */
+   struct clear_state Clear;  /**< For _mesa_meta_clear() */
+   struct copypix_state CopyPix;  /**< For _mesa_meta_copy_pixels() */
+   struct drawpix_state DrawPix;  /**< For _mesa_meta_draw_pixels() */
+
+   /* other possible meta-ops:
+    * glDrawPixels()
+    * glBitmap()
+    * glGenerateMipmap()
+    */
+};
+
+
+/**
+ * Initialize meta-ops for a context.
+ * To be called once during context creation.
+ */
+void
+_mesa_meta_init(GLcontext *ctx)
+{
+   ASSERT(!ctx->Meta);
+
+   ctx->Meta = CALLOC_STRUCT(gl_meta_state);
+}
+
+
+/**
+ * Free context meta-op state.
+ * To be called once during context destruction.
+ */
+void
+_mesa_meta_free(GLcontext *ctx)
+{
+   struct gl_meta_state *meta = ctx->Meta;
+
+   if (meta->Blit.TexObj) {
+      _mesa_DeleteTextures(1, &meta->Blit.TexObj);
+      _mesa_DeleteBuffersARB(1, & meta->Blit.VBO);
+      _mesa_DeleteVertexArraysAPPLE(1, &meta->Blit.ArrayObj);
+   }
+
+   if (meta->Clear.VBO) {
+      _mesa_DeleteBuffersARB(1, & meta->Clear.VBO);
+      _mesa_DeleteVertexArraysAPPLE(1, &meta->Clear.ArrayObj);
+   }
+
+   if (meta->CopyPix.TexObj) {
+      _mesa_DeleteTextures(1, &meta->CopyPix.TexObj);
+      _mesa_DeleteBuffersARB(1, & meta->CopyPix.VBO);
+      _mesa_DeleteVertexArraysAPPLE(1, &meta->CopyPix.ArrayObj);
+   }
+
+   if (meta->DrawPix.TexObj) {
+      _mesa_DeleteTextures(1, &meta->DrawPix.TexObj);
+      _mesa_DeleteBuffersARB(1, & meta->DrawPix.VBO);
+      _mesa_DeleteVertexArraysAPPLE(1, &meta->DrawPix.ArrayObj);
+   }
+
+   _mesa_free(ctx->Meta);
+   ctx->Meta = NULL;
+}
+
+
+/**
+ * Enter meta state.  This is like a light-weight version of glPushAttrib
+ * but it also resets most GL state back to default values.
+ *
+ * \param state  bitmask of META_* flags indicating which attribute groups
+ *               to save and reset to their defaults
+ */
+static void
+_mesa_meta_begin(GLcontext *ctx, GLbitfield state)
+{
+   struct save_state *save = &ctx->Meta->Save;
+
+   save->SavedState = state;
+
+   if (state & META_ALPHA_TEST) {
+      save->AlphaEnabled = ctx->Color.AlphaEnabled;
+      if (ctx->Color.AlphaEnabled)
+         _mesa_Disable(GL_ALPHA_TEST);
+   }
+
+   if (state & META_BLEND) {
+      save->BlendEnabled = ctx->Color.BlendEnabled;
+      if (ctx->Color.BlendEnabled)
+         _mesa_Disable(GL_BLEND);
+      save->ColorLogicOpEnabled = ctx->Color.ColorLogicOpEnabled;
+      if (ctx->Color.ColorLogicOpEnabled)
+         _mesa_Disable(GL_COLOR_LOGIC_OP);
+   }
+
+   if (state & META_COLOR_MASK) {
+      COPY_4V(save->ColorMask, ctx->Color.ColorMask);
+      if (!ctx->Color.ColorMask[0] ||
+          !ctx->Color.ColorMask[1] ||
+          !ctx->Color.ColorMask[2] ||
+          !ctx->Color.ColorMask[3])
+         _mesa_ColorMask(GL_TRUE, GL_TRUE, GL_TRUE, GL_TRUE);
+   }
+
+   if (state & META_DEPTH_TEST) {
+      save->Depth = ctx->Depth; /* struct copy */
+      if (ctx->Depth.Test)
+         _mesa_Disable(GL_DEPTH_TEST);
+   }
+
+   if (state & META_FOG) {
+      save->Fog = ctx->Fog.Enabled;
+      if (ctx->Fog.Enabled)
+         _mesa_set_enable(ctx, GL_FOG, GL_FALSE);
+   }
+
+   if (state & META_RASTERIZATION) {
+      save->FrontPolygonMode = ctx->Polygon.FrontMode;
+      save->BackPolygonMode = ctx->Polygon.BackMode;
+      save->PolygonOffset = ctx->Polygon.OffsetFill;
+      save->PolygonSmooth = ctx->Polygon.SmoothFlag;
+      save->PolygonStipple = ctx->Polygon.StippleFlag;
+      save->PolygonCull = ctx->Polygon.CullFlag;
+      _mesa_PolygonMode(GL_FRONT_AND_BACK, GL_FILL);
+      _mesa_set_enable(ctx, GL_POLYGON_OFFSET_FILL, GL_FALSE);
+      _mesa_set_enable(ctx, GL_POLYGON_SMOOTH, GL_FALSE);
+      _mesa_set_enable(ctx, GL_POLYGON_STIPPLE, GL_FALSE);
+      _mesa_set_enable(ctx, GL_CULL_FACE, GL_FALSE);
+   }
+
+   if (state & META_SCISSOR) {
+      save->Scissor = ctx->Scissor; /* struct copy */
+   }
+
+   if (state & META_SHADER) {
+      if (ctx->Extensions.ARB_vertex_program) {
+         save->VertexProgramEnabled = ctx->VertexProgram.Enabled;
+         save->VertexProgram = ctx->VertexProgram.Current;
+         _mesa_set_enable(ctx, GL_VERTEX_PROGRAM_ARB, GL_FALSE);
+      }
+
+      if (ctx->Extensions.ARB_fragment_program) {
+         save->FragmentProgramEnabled = ctx->FragmentProgram.Enabled;
+         save->FragmentProgram = ctx->FragmentProgram.Current;
+         _mesa_set_enable(ctx, GL_FRAGMENT_PROGRAM_ARB, GL_FALSE);
+      }
+
+      if (ctx->Extensions.ARB_shader_objects) {
+         save->Shader = ctx->Shader.CurrentProgram ?
+            ctx->Shader.CurrentProgram->Name : 0;
+         _mesa_UseProgramObjectARB(0);
+      }
+   }
+
+   if (state & META_STENCIL_TEST) {
+      save->Stencil = ctx->Stencil; /* struct copy */
+      if (ctx->Stencil.Enabled)
+         _mesa_Disable(GL_STENCIL_TEST);
+      /* NOTE: other stencil state not reset */
+   }
+
+   if (state & META_TEXTURE) {
+      GLuint u, tgt;
+
+      save->ActiveUnit = ctx->Texture.CurrentUnit;
+      save->ClientActiveUnit = ctx->Array.ActiveTexture;
+      save->EnvMode = ctx->Texture.Unit[0].EnvMode;
+
+      /* Disable all texture units */
+      for (u = 0; u < ctx->Const.MaxTextureUnits; u++) {
+         save->TexEnabled[u] = ctx->Texture.Unit[u].Enabled;
+         save->TexGenEnabled[u] = ctx->Texture.Unit[u].TexGenEnabled;
+         if (ctx->Texture.Unit[u].Enabled ||
+             ctx->Texture.Unit[u].TexGenEnabled) {
+            _mesa_ActiveTextureARB(GL_TEXTURE0 + u);
+            _mesa_set_enable(ctx, GL_TEXTURE_1D, GL_FALSE);
+            _mesa_set_enable(ctx, GL_TEXTURE_2D, GL_FALSE);
+            _mesa_set_enable(ctx, GL_TEXTURE_3D, GL_FALSE);
+            _mesa_set_enable(ctx, GL_TEXTURE_CUBE_MAP, GL_FALSE);
+            _mesa_set_enable(ctx, GL_TEXTURE_RECTANGLE, GL_FALSE);
+            _mesa_set_enable(ctx, GL_TEXTURE_GEN_S, GL_FALSE);
+            _mesa_set_enable(ctx, GL_TEXTURE_GEN_T, GL_FALSE);
+            _mesa_set_enable(ctx, GL_TEXTURE_GEN_R, GL_FALSE);
+            _mesa_set_enable(ctx, GL_TEXTURE_GEN_Q, GL_FALSE);
+         }
+      }
+
+      /* save current texture objects for unit[0] only */
+      for (tgt = 0; tgt < NUM_TEXTURE_TARGETS; tgt++) {
+         _mesa_reference_texobj(&save->CurrentTexture[tgt],
+                                ctx->Texture.Unit[0].CurrentTex[tgt]);
+      }
+
+      /* set defaults for unit[0] */
+      _mesa_ActiveTextureARB(GL_TEXTURE0);
+      _mesa_ClientActiveTextureARB(GL_TEXTURE0);
+      _mesa_TexEnvi(GL_TEXTURE_ENV, GL_TEXTURE_ENV_MODE, GL_REPLACE);
+   }
+
+   if (state & META_TRANSFORM) {
+      GLuint activeTexture = ctx->Texture.CurrentUnit;
+      _mesa_memcpy(save->ModelviewMatrix, ctx->ModelviewMatrixStack.Top->m,
+                   16 * sizeof(GLfloat));
+      _mesa_memcpy(save->ProjectionMatrix, ctx->ProjectionMatrixStack.Top->m,
+                   16 * sizeof(GLfloat));
+      _mesa_memcpy(save->TextureMatrix, ctx->TextureMatrixStack[0].Top->m,
+                   16 * sizeof(GLfloat));
+      save->MatrixMode = ctx->Transform.MatrixMode;
+      /* set 1:1 vertex:pixel coordinate transform */
+      _mesa_ActiveTextureARB(GL_TEXTURE0);
+      _mesa_MatrixMode(GL_TEXTURE);
+      _mesa_LoadIdentity();
+      _mesa_ActiveTextureARB(GL_TEXTURE0 + activeTexture);
+      _mesa_MatrixMode(GL_MODELVIEW);
+      _mesa_LoadIdentity();
+      _mesa_MatrixMode(GL_PROJECTION);
+      _mesa_LoadIdentity();
+      _mesa_Ortho(0.0F, ctx->DrawBuffer->Width,
+                  0.0F, ctx->DrawBuffer->Height,
+                  -1.0F, 1.0F);
+      save->ClipPlanesEnabled = ctx->Transform.ClipPlanesEnabled;
+      if (ctx->Transform.ClipPlanesEnabled) {
+         GLuint i;
+         for (i = 0; i < ctx->Const.MaxClipPlanes; i++) {
+            _mesa_set_enable(ctx, GL_CLIP_PLANE0 + i, GL_FALSE);
+         }
+      }
+   }
+
+   if (state & META_VERTEX) {
+      /* save vertex array object state */
+      _mesa_reference_array_object(ctx, &save->ArrayObj,
+                                   ctx->Array.ArrayObj);
+      _mesa_reference_buffer_object(ctx, &save->ArrayBufferObj,
+                                    ctx->Array.ArrayBufferObj);
+      /* set some default state? */
+   }
+
+   if (state & META_VIEWPORT) {
+      save->ViewportX = ctx->Viewport.X;
+      save->ViewportY = ctx->Viewport.Y;
+      save->ViewportW = ctx->Viewport.Width;
+      save->ViewportH = ctx->Viewport.Height;
+      _mesa_Viewport(0, 0, ctx->DrawBuffer->Width, ctx->DrawBuffer->Height);
+      save->DepthNear = ctx->Viewport.Near;
+      save->DepthFar = ctx->Viewport.Far;
+      _mesa_DepthRange(0.0, 1.0);
+   }
+
+   /* misc */
+   {
+      save->Lighting = ctx->Light.Enabled;
+      if (ctx->Light.Enabled)
+         _mesa_set_enable(ctx, GL_LIGHTING, GL_FALSE);
+   }
+}
+
+
+/**
+ * Leave meta state.  This is like a light-weight version of glPopAttrib().
+ */
+static void
+_mesa_meta_end(GLcontext *ctx)
+{
+   struct save_state *save = &ctx->Meta->Save;
+   const GLbitfield state = save->SavedState;
+
+   if (state & META_ALPHA_TEST) {
+      if (ctx->Color.AlphaEnabled != save->AlphaEnabled)
+         _mesa_set_enable(ctx, GL_ALPHA_TEST, save->AlphaEnabled);
+   }
+
+   if (state & META_BLEND) {
+      if (ctx->Color.BlendEnabled != save->BlendEnabled)
+         _mesa_set_enable(ctx, GL_BLEND, save->BlendEnabled);
+      if (ctx->Color.ColorLogicOpEnabled != save->ColorLogicOpEnabled)
+         _mesa_set_enable(ctx, GL_COLOR_LOGIC_OP, save->ColorLogicOpEnabled);
+   }
+
+   if (state & META_COLOR_MASK) {
+      if (!TEST_EQ_4V(ctx->Color.ColorMask, save->ColorMask))
+         _mesa_ColorMask(save->ColorMask[0], save->ColorMask[1],
+                         save->ColorMask[2], save->ColorMask[3]);
+   }
+
+   if (state & META_DEPTH_TEST) {
+      if (ctx->Depth.Test != save->Depth.Test)
+         _mesa_set_enable(ctx, GL_DEPTH_TEST, save->Depth.Test);
+      _mesa_DepthFunc(save->Depth.Func);
+      _mesa_DepthMask(save->Depth.Mask);
+   }
+
+   if (state & META_FOG) {
+      _mesa_set_enable(ctx, GL_FOG, save->Fog);
+   }
+
+   if (state & META_RASTERIZATION) {
+      _mesa_PolygonMode(GL_FRONT, save->FrontPolygonMode);
+      _mesa_PolygonMode(GL_BACK, save->BackPolygonMode);
+      _mesa_set_enable(ctx, GL_POLYGON_STIPPLE, save->PolygonStipple);
+      _mesa_set_enable(ctx, GL_POLYGON_OFFSET_FILL, save->PolygonOffset);
+      _mesa_set_enable(ctx, GL_POLYGON_SMOOTH, save->PolygonSmooth);
+      _mesa_set_enable(ctx, GL_CULL_FACE, save->PolygonCull);
+   }
+
+   if (state & META_SCISSOR) {
+      _mesa_set_enable(ctx, GL_SCISSOR_TEST, save->Scissor.Enabled);
+      _mesa_Scissor(save->Scissor.X, save->Scissor.Y,
+                    save->Scissor.Width, save->Scissor.Height);
+   }
+
+   if (state & META_SHADER) {
+      if (ctx->Extensions.ARB_vertex_program) {
+         _mesa_set_enable(ctx, GL_VERTEX_PROGRAM_ARB,
+                          save->VertexProgramEnabled);
+         _mesa_reference_vertprog(ctx, &ctx->VertexProgram.Current, 
+                                  save->VertexProgram);
+      }
+
+      if (ctx->Extensions.ARB_fragment_program) {
+         _mesa_set_enable(ctx, GL_FRAGMENT_PROGRAM_ARB,
+                          save->FragmentProgramEnabled);
+         _mesa_reference_fragprog(ctx, &ctx->FragmentProgram.Current,
+                                  save->FragmentProgram);
+      }
+
+      if (ctx->Extensions.ARB_shader_objects) {
+         _mesa_UseProgramObjectARB(save->Shader);
+      }
+   }
+
+   if (state & META_STENCIL_TEST) {
+      const struct gl_stencil_attrib *stencil = &save->Stencil;
+
+      _mesa_set_enable(ctx, GL_STENCIL_TEST, stencil->Enabled);
+      _mesa_ClearStencil(stencil->Clear);
+      if (ctx->Extensions.EXT_stencil_two_side) {
+         _mesa_set_enable(ctx, GL_STENCIL_TEST_TWO_SIDE_EXT,
+                          stencil->TestTwoSide);
+         _mesa_ActiveStencilFaceEXT(stencil->ActiveFace
+                                    ? GL_BACK : GL_FRONT);
+      }
+      /* front state */
+      _mesa_StencilFuncSeparate(GL_FRONT,
+                                stencil->Function[0],
+                                stencil->Ref[0],
+                                stencil->ValueMask[0]);
+      _mesa_StencilMaskSeparate(GL_FRONT, stencil->WriteMask[0]);
+      _mesa_StencilOpSeparate(GL_FRONT, stencil->FailFunc[0],
+                              stencil->ZFailFunc[0],
+                              stencil->ZPassFunc[0]);
+      /* back state */
+      _mesa_StencilFuncSeparate(GL_BACK,
+                                stencil->Function[1],
+                                stencil->Ref[1],
+                                stencil->ValueMask[1]);
+      _mesa_StencilMaskSeparate(GL_BACK, stencil->WriteMask[1]);
+      _mesa_StencilOpSeparate(GL_BACK, stencil->FailFunc[1],
+                              stencil->ZFailFunc[1],
+                              stencil->ZPassFunc[1]);
+   }
+
+   if (state & META_TEXTURE) {
+      GLuint u, tgt;
+
+      ASSERT(ctx->Texture.CurrentUnit == 0);
+
+      /* restore texenv for unit[0] */
+      _mesa_TexEnvi(GL_TEXTURE_ENV, GL_TEXTURE_ENV_MODE, save->EnvMode);
+
+      /* restore texture objects for unit[0] only */
+      for (tgt = 0; tgt < NUM_TEXTURE_TARGETS; tgt++) {
+         _mesa_reference_texobj(&ctx->Texture.Unit[0].CurrentTex[tgt],
+                                save->CurrentTexture[tgt]);
+      }
+
+      /* Re-enable textures, texgen */
+      for (u = 0; u < ctx->Const.MaxTextureUnits; u++) {
+         if (save->TexEnabled[u]) {
+            _mesa_ActiveTextureARB(GL_TEXTURE0 + u);
+
+            if (save->TexEnabled[u] & TEXTURE_1D_BIT)
+               _mesa_set_enable(ctx, GL_TEXTURE_1D, GL_TRUE);
+            if (save->TexEnabled[u] & TEXTURE_2D_BIT)
+               _mesa_set_enable(ctx, GL_TEXTURE_2D, GL_TRUE);
+            if (save->TexEnabled[u] & TEXTURE_3D_BIT)
+               _mesa_set_enable(ctx, GL_TEXTURE_3D, GL_TRUE);
+            if (save->TexEnabled[u] & TEXTURE_CUBE_BIT)
+               _mesa_set_enable(ctx, GL_TEXTURE_CUBE_MAP, GL_TRUE);
+            if (save->TexEnabled[u] & TEXTURE_RECT_BIT)
+               _mesa_set_enable(ctx, GL_TEXTURE_RECTANGLE, GL_TRUE);
+         }
+
+         if (save->TexGenEnabled[u]) {
+            _mesa_ActiveTextureARB(GL_TEXTURE0 + u);
+
+            if (save->TexGenEnabled[u] & S_BIT)
+               _mesa_set_enable(ctx, GL_TEXTURE_GEN_S, GL_TRUE);
+            if (save->TexGenEnabled[u] & T_BIT)
+               _mesa_set_enable(ctx, GL_TEXTURE_GEN_T, GL_TRUE);
+            if (save->TexGenEnabled[u] & R_BIT)
+               _mesa_set_enable(ctx, GL_TEXTURE_GEN_R, GL_TRUE);
+            if (save->TexGenEnabled[u] & Q_BIT)
+               _mesa_set_enable(ctx, GL_TEXTURE_GEN_Q, GL_TRUE);
+         }
+      }
+
+      /* restore current unit state */
+      _mesa_ActiveTextureARB(GL_TEXTURE0 + save->ActiveUnit);
+      _mesa_ClientActiveTextureARB(GL_TEXTURE0 + save->ClientActiveUnit);
+   }
+
+   if (state & META_TRANSFORM) {
+      GLuint activeTexture = ctx->Texture.CurrentUnit;
+      _mesa_ActiveTextureARB(GL_TEXTURE0);
+      _mesa_MatrixMode(GL_TEXTURE);
+      _mesa_LoadMatrixf(save->TextureMatrix);
+      _mesa_ActiveTextureARB(GL_TEXTURE0 + activeTexture);
+
+      _mesa_MatrixMode(GL_MODELVIEW);
+      _mesa_LoadMatrixf(save->ModelviewMatrix);
+
+      _mesa_MatrixMode(GL_PROJECTION);
+      _mesa_LoadMatrixf(save->ProjectionMatrix);
+
+      _mesa_MatrixMode(save->MatrixMode);
+
+      save->ClipPlanesEnabled = ctx->Transform.ClipPlanesEnabled;
+      if (save->ClipPlanesEnabled) {
+         GLuint i;
+         for (i = 0; i < ctx->Const.MaxClipPlanes; i++) {
+            if (save->ClipPlanesEnabled & (1 << i)) {
+               _mesa_set_enable(ctx, GL_CLIP_PLANE0 + i, GL_TRUE);
+            }
+         }
+      }
+   }
+
+   if (state & META_VERTEX) {
+      /* restore vertex buffer object */
+      _mesa_BindBufferARB(GL_ARRAY_BUFFER_ARB, save->ArrayBufferObj->Name);
+      _mesa_reference_buffer_object(ctx, &save->ArrayBufferObj, NULL);
+
+      /* restore vertex array object */
+      _mesa_BindVertexArray(save->ArrayObj->Name);
+      _mesa_reference_array_object(ctx, &save->ArrayObj, NULL);
+   }
+
+   if (state & META_VIEWPORT) {
+      _mesa_Viewport(save->ViewportX, save->ViewportY,
+                     save->ViewportW, save->ViewportH);
+      _mesa_DepthRange(save->DepthNear, save->DepthFar);
+   }
+
+   /* misc */
+   if (save->Lighting) {
+      _mesa_set_enable(ctx, GL_LIGHTING, GL_TRUE);
+   }
+   if (save->Fog) {
+      _mesa_set_enable(ctx, GL_FOG, GL_TRUE);
+   }
+}
+
+
+/**
+ * Meta implementation of ctx->Driver.BlitFramebuffer() in terms
+ * of texture mapping and polygon rendering.
+ * Note: this function requires GL_ARB_texture_rectangle support.
+ */
+void
+_mesa_meta_blit_framebuffer(GLcontext *ctx,
+                            GLint srcX0, GLint srcY0, GLint srcX1, GLint srcY1,
+                            GLint dstX0, GLint dstY0, GLint dstX1, GLint dstY1,
+                            GLbitfield mask, GLenum filter)
+{
+   struct blit_state *blit = &ctx->Meta->Blit;
+   const GLint srcX = MIN2(srcX0, srcX1);
+   const GLint srcY = MIN2(srcY0, srcY1);
+   const GLint srcW = abs(srcX1 - srcX0);
+   const GLint srcH = abs(srcY1 - srcY0);
+   GLboolean srcFlipX = srcX1 < srcX0;
+   GLboolean srcFlipY = srcY1 < srcY0;
+
+   ASSERT(ctx->Extensions.NV_texture_rectangle);
+
+   if (srcW > ctx->Const.MaxTextureRectSize ||
+       srcH > ctx->Const.MaxTextureRectSize) {
+      /* XXX avoid this fallback */
+      _swrast_BlitFramebuffer(ctx, srcX0, srcY0, srcX1, srcY1,
+                              dstX0, dstY0, dstX1, dstY1, mask, filter);
+      return;
+   }
+
+
+   if (srcFlipX) {
+      GLint tmp = dstX0;
+      dstX0 = dstX1;
+      dstX1 = tmp;
+   }
+
+   if (srcFlipY) {
+      GLint tmp = dstY0;
+      dstY0 = dstY1;
+      dstY1 = tmp;
+   }
+
+   /* only scissor effects blit so save/clear all other relevant state */
+   _mesa_meta_begin(ctx, ~META_SCISSOR);
+
+   if (blit->TexObj == 0) {
+      /* one-time setup */
+
+      /* create texture object */
+      _mesa_GenTextures(1, &blit->TexObj);
+      _mesa_BindTexture(GL_TEXTURE_RECTANGLE, blit->TexObj);
+      _mesa_TexEnvi(GL_TEXTURE_ENV, GL_TEXTURE_ENV_MODE, GL_REPLACE);
+   }
+   else {
+      _mesa_BindTexture(GL_TEXTURE_RECTANGLE, blit->TexObj);
+   }
+
+   _mesa_TexParameteri(GL_TEXTURE_RECTANGLE, GL_TEXTURE_MIN_FILTER, filter);
+   _mesa_TexParameteri(GL_TEXTURE_RECTANGLE, GL_TEXTURE_MAG_FILTER, filter);
+
+   if (blit->ArrayObj == 0) {
+      /* one-time setup */
+
+      /* create vertex array object */
+      _mesa_GenVertexArrays(1, &blit->ArrayObj);
+      _mesa_BindVertexArray(blit->ArrayObj);
+
+      /* create vertex array buffer */
+      _mesa_GenBuffersARB(1, &blit->VBO);
+      _mesa_BindBufferARB(GL_ARRAY_BUFFER_ARB, blit->VBO);
+      _mesa_BufferDataARB(GL_ARRAY_BUFFER_ARB, sizeof(blit->verts),
+                          blit->verts, GL_STREAM_DRAW_ARB);
+
+      /* setup vertex arrays */
+      _mesa_VertexPointer(2, GL_FLOAT, 4 * sizeof(GLfloat),
+                          (void*) (0 * sizeof(GLfloat)));
+      _mesa_TexCoordPointer(2, GL_FLOAT, 4 * sizeof(GLfloat),
+                            (void *) (2 * sizeof(GLfloat)));
+      _mesa_EnableClientState(GL_VERTEX_ARRAY);
+      _mesa_EnableClientState(GL_TEXTURE_COORD_ARRAY);
+   }
+   else {
+      _mesa_BindVertexArray(blit->ArrayObj);
+      _mesa_BindBufferARB(GL_ARRAY_BUFFER_ARB, blit->VBO);
+   }
+
+   /* vertex positions */
+   blit->verts[0][0] = (GLfloat) dstX0;
+   blit->verts[0][1] = (GLfloat) dstY0;
+   blit->verts[1][0] = (GLfloat) dstX1;
+   blit->verts[1][1] = (GLfloat) dstY0;
+   blit->verts[2][0] = (GLfloat) dstX1;
+   blit->verts[2][1] = (GLfloat) dstY1;
+   blit->verts[3][0] = (GLfloat) dstX0;
+   blit->verts[3][1] = (GLfloat) dstY1;
+
+   /* texcoords */
+   blit->verts[0][2] = 0.0F;
+   blit->verts[0][3] = 0.0F;
+   blit->verts[1][2] = (GLfloat) srcW;
+   blit->verts[1][3] = 0.0F;
+   blit->verts[2][2] = (GLfloat) srcW;
+   blit->verts[2][3] = (GLfloat) srcH;
+   blit->verts[3][2] = 0.0F;
+   blit->verts[3][3] = (GLfloat) srcH;
+
+   /* upload new vertex data */
+   _mesa_BufferSubDataARB(GL_ARRAY_BUFFER_ARB, 0,
+                          sizeof(blit->verts), blit->verts);
+
+   /* copy framebuffer image to texture */
+   if (mask & GL_COLOR_BUFFER_BIT) {
+      if (blit->TexWidth == srcW &&
+          blit->TexHeight == srcH &&
+          blit->TexType == GL_RGBA) {
+         /* replace existing tex image */
+         _mesa_CopyTexSubImage2D(GL_TEXTURE_RECTANGLE, 0,
+                                 0, 0, srcX, srcY, srcW, srcH);
+      }
+      else {
+         /* create new tex image */
+         _mesa_CopyTexImage2D(GL_TEXTURE_RECTANGLE, 0, GL_RGBA,
+                              srcX, srcY, srcW, srcH, 0);
+         blit->TexWidth = srcW;
+         blit->TexHeight = srcH;
+         blit->TexType = GL_RGBA;
+      }
+
+      mask &= ~GL_COLOR_BUFFER_BIT;
+   }
+
+   _mesa_Enable(GL_TEXTURE_RECTANGLE);
+
+   /* draw textured quad */
+   _mesa_DrawArrays(GL_TRIANGLE_FAN, 0, 4);
+
+   _mesa_Disable(GL_TEXTURE_RECTANGLE);
+
+   _mesa_meta_end(ctx);
+
+   /* XXX, TO-DO: try to handle these cases above! */
+   if (mask & (GL_DEPTH_BUFFER_BIT | GL_STENCIL_BUFFER_BIT)) {
+      _swrast_BlitFramebuffer(ctx, srcX0, srcY0, srcX1, srcY1,
+                              dstX0, dstY0, dstX1, dstY1, mask, filter);
+   }
+}
+
+
+/**
+ * Meta implementation of ctx->Driver.Clear() in terms of polygon rendering.
+ */
+void
+_mesa_meta_clear(GLcontext *ctx, GLbitfield buffers)
+{
+   struct clear_state *clear = &ctx->Meta->Clear;
+   GLfloat z = 1.0 - 2.0 * ctx->Depth.Clear;
+   GLuint i;
+
+   /* only scissor and color mask effects clearing */
+   _mesa_meta_begin(ctx, ~(META_SCISSOR | META_COLOR_MASK));
+
+   if (clear->ArrayObj == 0) {
+      /* one-time setup */
+
+      /* create vertex array object */
+      _mesa_GenVertexArrays(1, &clear->ArrayObj);
+      _mesa_BindVertexArray(clear->ArrayObj);
+
+      /* create vertex array buffer */
+      _mesa_GenBuffersARB(1, &clear->VBO);
+      _mesa_BindBufferARB(GL_ARRAY_BUFFER_ARB, clear->VBO);
+      _mesa_BufferDataARB(GL_ARRAY_BUFFER_ARB, sizeof(clear->verts),
+                          clear->verts, GL_STREAM_DRAW_ARB);
+
+      /* setup vertex arrays */
+      _mesa_VertexPointer(3, GL_FLOAT, 7 * sizeof(GLfloat), (void *) 0);
+      _mesa_ColorPointer(4, GL_FLOAT, 7 * sizeof(GLfloat),
+                         (void *) (3 * sizeof(GLfloat)));
+      _mesa_EnableClientState(GL_VERTEX_ARRAY);
+      _mesa_EnableClientState(GL_COLOR_ARRAY);
+   }
+   else {
+      _mesa_BindVertexArray(clear->ArrayObj);
+      _mesa_BindBufferARB(GL_ARRAY_BUFFER_ARB, clear->VBO);
+   }
+
+   /* GL_COLOR_BUFFER_BIT */
+   if (buffers & BUFFER_BITS_COLOR) {
+      /* leave colormask, glDrawBuffer state as-is */
+   }
+   else {
+      _mesa_ColorMask(GL_FALSE, GL_FALSE, GL_FALSE, GL_FALSE);
+   }
+
+   /* GL_DEPTH_BUFFER_BIT */
+   if (buffers & BUFFER_BIT_DEPTH) {
+      _mesa_set_enable(ctx, GL_DEPTH_TEST, GL_TRUE);
+      _mesa_DepthFunc(GL_ALWAYS);
+      _mesa_DepthMask(GL_TRUE);
+   }
+   else {
+      assert(!ctx->Depth.Test);
+   }
+
+   /* GL_STENCIL_BUFFER_BIT */
+   if (buffers & BUFFER_BIT_STENCIL) {
+      _mesa_set_enable(ctx, GL_STENCIL_TEST, GL_TRUE);
+      _mesa_StencilOpSeparate(GL_FRONT_AND_BACK,
+                              GL_REPLACE, GL_REPLACE, GL_REPLACE);
+      _mesa_StencilFuncSeparate(GL_FRONT_AND_BACK, GL_ALWAYS,
+                                ctx->Stencil.Clear & 0x7fffffff,
+                                ctx->Stencil.WriteMask[0]);
+   }
+   else {
+      assert(!ctx->Stencil.Enabled);
+   }
+
+   /* vertex positions */
+   clear->verts[0][0] = (GLfloat) ctx->DrawBuffer->_Xmin;
+   clear->verts[0][1] = (GLfloat) ctx->DrawBuffer->_Ymin;
+   clear->verts[0][2] = z;
+   clear->verts[1][0] = (GLfloat) ctx->DrawBuffer->_Xmax;
+   clear->verts[1][1] = (GLfloat) ctx->DrawBuffer->_Ymin;
+   clear->verts[1][2] = z;
+   clear->verts[2][0] = (GLfloat) ctx->DrawBuffer->_Xmax;
+   clear->verts[2][1] = (GLfloat) ctx->DrawBuffer->_Ymax;
+   clear->verts[2][2] = z;
+   clear->verts[3][0] = (GLfloat) ctx->DrawBuffer->_Xmin;
+   clear->verts[3][1] = (GLfloat) ctx->DrawBuffer->_Ymax;
+   clear->verts[3][2] = z;
+
+   /* vertex colors */
+   for (i = 0; i < 4; i++) {
+      COPY_4FV(&clear->verts[i][3], ctx->Color.ClearColor);
+   }
+
+   /* upload new vertex data */
+   _mesa_BufferSubDataARB(GL_ARRAY_BUFFER_ARB, 0,
+                          sizeof(clear->verts), clear->verts);
+
+   /* draw quad */
+   _mesa_DrawArrays(GL_TRIANGLE_FAN, 0, 4);
+
+   _mesa_meta_end(ctx);
+}
+
+
+/**
+ * Meta implementation of ctx->Driver.CopyPixels() in terms
+ * of texture mapping and polygon rendering.
+ * Note: this function requires GL_ARB_texture_rectangle support.
+ */
+void
+_mesa_meta_copy_pixels(GLcontext *ctx, GLint srcX, GLint srcY,
+                       GLsizei width, GLsizei height,
+                       GLint dstX, GLint dstY, GLenum type)
+{
+   const GLenum filter = GL_NEAREST;
+   struct copypix_state *copypix = &ctx->Meta->CopyPix;
+   const GLfloat z = ctx->Current.RasterPos[2];
+   const GLfloat dstX1 = dstX + width * ctx->Pixel.ZoomX;
+   const GLfloat dstY1 = dstY + height * ctx->Pixel.ZoomY;
+
+   ASSERT(ctx->Extensions.NV_texture_rectangle);
+
+   if (type != GL_COLOR ||
+       ctx->_ImageTransferState ||
+       ctx->Fog.Enabled ||
+       width > ctx->Const.MaxTextureRectSize ||
+       height > ctx->Const.MaxTextureRectSize) {
+      /* XXX avoid this fallback */
+      _swrast_CopyPixels(ctx, srcX, srcY, width, height, dstX, dstY, type);
+      return;
+   }
+
+   /* Most GL state applies to glCopyPixels, but a there's a few things
+    * we need to override:
+    */
+   _mesa_meta_begin(ctx, (META_RASTERIZATION |
+                          META_SHADER |
+                          META_TEXTURE |
+                          META_TRANSFORM |
+                          META_VERTEX |
+                          META_VIEWPORT));
+
+   if (copypix->TexObj == 0) {
+      /* one-time setup */
+
+      /* create texture object */
+      _mesa_GenTextures(1, &copypix->TexObj);
+      _mesa_BindTexture(GL_TEXTURE_RECTANGLE, copypix->TexObj);
+      _mesa_TexEnvi(GL_TEXTURE_ENV, GL_TEXTURE_ENV_MODE, GL_REPLACE);
+      _mesa_TexParameteri(GL_TEXTURE_RECTANGLE, GL_TEXTURE_MIN_FILTER, filter);
+      _mesa_TexParameteri(GL_TEXTURE_RECTANGLE, GL_TEXTURE_MAG_FILTER, filter);
+   }
+   else {
+      _mesa_BindTexture(GL_TEXTURE_RECTANGLE, copypix->TexObj);
+   }
+
+   if (copypix->ArrayObj == 0) {
+      /* one-time setup */
+
+      /* create vertex array object */
+      _mesa_GenVertexArrays(1, &copypix->ArrayObj);
+      _mesa_BindVertexArray(copypix->ArrayObj);
+
+      /* create vertex array buffer */
+      _mesa_GenBuffersARB(1, &copypix->VBO);
+      _mesa_BindBufferARB(GL_ARRAY_BUFFER_ARB, copypix->VBO);
+      _mesa_BufferDataARB(GL_ARRAY_BUFFER_ARB, sizeof(copypix->verts),
+                          copypix->verts, GL_STREAM_DRAW_ARB);
+
+      /* setup vertex arrays */
+      _mesa_VertexPointer(3, GL_FLOAT, sizeof(copypix->verts[0]),
+                          (void*) (0 * sizeof(GLfloat)));
+      _mesa_TexCoordPointer(2, GL_FLOAT, sizeof(copypix->verts[0]),
+                            (void *) (3 * sizeof(GLfloat)));
+      _mesa_EnableClientState(GL_VERTEX_ARRAY);
+      _mesa_EnableClientState(GL_TEXTURE_COORD_ARRAY);
+   }
+   else {
+      _mesa_BindVertexArray(copypix->ArrayObj);
+      _mesa_BindBufferARB(GL_ARRAY_BUFFER_ARB, copypix->VBO);
+   }
+
+   /* vertex positions, texcoords */
+   copypix->verts[0][0] = (GLfloat) dstX;
+   copypix->verts[0][1] = (GLfloat) dstY;
+   copypix->verts[0][2] = z;
+   copypix->verts[0][3] = 0.0F;
+   copypix->verts[0][4] = 0.0F;
+   copypix->verts[1][0] = (GLfloat) dstX1;
+   copypix->verts[1][1] = (GLfloat) dstY;
+   copypix->verts[1][2] = z;
+   copypix->verts[1][3] = (GLfloat) width;
+   copypix->verts[1][4] = 0.0F;
+   copypix->verts[2][0] = (GLfloat) dstX1;
+   copypix->verts[2][1] = (GLfloat) dstY1;
+   copypix->verts[2][2] = z;
+   copypix->verts[2][3] = (GLfloat) width;
+   copypix->verts[2][4] = (GLfloat) height;
+   copypix->verts[3][0] = (GLfloat) dstX;
+   copypix->verts[3][1] = (GLfloat) dstY1;
+   copypix->verts[3][2] = z;
+   copypix->verts[3][3] = 0.0F;
+   copypix->verts[3][4] = (GLfloat) height;
+
+   /* upload new vertex data */
+   _mesa_BufferSubDataARB(GL_ARRAY_BUFFER_ARB, 0,
+                          sizeof(copypix->verts), copypix->verts);
+
+   /* copy framebuffer image to texture */
+   if (type == GL_COLOR) {
+      if (copypix->TexWidth == width &&
+          copypix->TexHeight == height &&
+          copypix->TexType == type) {
+         /* replace existing tex image */
+         _mesa_CopyTexSubImage2D(GL_TEXTURE_RECTANGLE, 0,
+                                 0, 0, srcX, srcY, width, height);
+      }
+      else {
+         /* create new tex image */
+         _mesa_CopyTexImage2D(GL_TEXTURE_RECTANGLE, 0, GL_RGBA,
+                              srcX, srcY, width, height, 0);
+         copypix->TexWidth = width;
+         copypix->TexHeight = height;
+         copypix->TexType = type;
+      }
+   }
+   else if (type == GL_DEPTH) {
+      /* TO-DO: Use a GL_DEPTH_COMPONENT texture and a fragment program/shader
+       * that replaces the fragment.z value.
+       */
+   }
+   else {
+      ASSERT(type == GL_STENCIL);
+      /* have to use sw fallback */
+   }
+
+   _mesa_Enable(GL_TEXTURE_RECTANGLE);
+
+   /* draw textured quad */
+   _mesa_DrawArrays(GL_TRIANGLE_FAN, 0, 4);
+
+   _mesa_Disable(GL_TEXTURE_RECTANGLE);
+
+   _mesa_meta_end(ctx);
+}
+
+
+
+/**
+ * When the glDrawPixels() image size is greater than the max rectangle
+ * texture size we use this function to break the glDrawPixels() image
+ * into tiles which fit into the max texture size.
+ */
+static void
+tiled_draw_pixels(GLcontext *ctx,
+                  GLint x, GLint y, GLsizei width, GLsizei height,
+                  GLenum format, GLenum type,
+                  const struct gl_pixelstore_attrib *unpack,
+                  const GLvoid *pixels)
+{
+   const GLint maxSize = ctx->Const.MaxTextureRectSize;
+   struct gl_pixelstore_attrib tileUnpack = *unpack;
+   GLint i, j;
+
+   for (i = 0; i < width; i += maxSize) {
+      const GLint tileWidth = MIN2(maxSize, width - i);
+      const GLint tileX = (GLint) (x + i * ctx->Pixel.ZoomX);
+
+      tileUnpack.SkipPixels = unpack->SkipPixels + i;
+
+      for (j = 0; j < height; j += maxSize) {
+         const GLint tileHeight = MIN2(maxSize, height - j);
+         const GLint tileY = (GLint) (y + j * ctx->Pixel.ZoomY);
+
+         tileUnpack.SkipRows = unpack->SkipRows + j;
+
+         _mesa_meta_draw_pixels(ctx, tileX, tileY,
+                                tileWidth, tileHeight,
+                                format, type, &tileUnpack, pixels);
+      }
+   }
+}
+
+
+/**
+ * Meta implementation of ctx->Driver.DrawPixels() in terms
+ * of texture mapping and polygon rendering.
+ * Note: this function requires GL_ARB_texture_rectangle support.
+ */
+void
+_mesa_meta_draw_pixels(GLcontext *ctx,
+		       GLint x, GLint y, GLsizei width, GLsizei height,
+		       GLenum format, GLenum type,
+		       const struct gl_pixelstore_attrib *unpack,
+		       const GLvoid *pixels)
+{
+   const GLenum filter = GL_NEAREST;
+   struct drawpix_state *drawpix = &ctx->Meta->DrawPix;
+   const GLfloat z = ctx->Current.RasterPos[2];
+   const GLfloat x1 = x + width * ctx->Pixel.ZoomX;
+   const GLfloat y1 = y + height * ctx->Pixel.ZoomY;
+   const struct gl_pixelstore_attrib unpackSave = ctx->Unpack;
+   GLenum texIntFormat;
+   GLboolean fallback;
+
+   ASSERT(ctx->Extensions.NV_texture_rectangle);
+
+   /*
+    * Determine if we can do the glDrawPixels with texture mapping.
+    */
+   fallback = GL_FALSE;
+   if (ctx->_ImageTransferState ||
+       ctx->Fog.Enabled) {
+      fallback = GL_TRUE;
+   }
+
+   if (_mesa_is_color_format(format)) {
+      texIntFormat = GL_RGBA;
+   }
+   else {
+      fallback = GL_TRUE;
+   }
+
+   if (fallback) {
+      _swrast_DrawPixels(ctx, x, y, width, height,
+                         format, type, unpack, pixels);
+      return;
+   }
+
+   /*
+    * Check image size against max texture size, draw as tiles if needed.
+    */
+   if (width > ctx->Const.MaxTextureRectSize ||
+       height > ctx->Const.MaxTextureRectSize) {
+      tiled_draw_pixels(ctx, x, y, width, height,
+                        format, type, unpack, pixels);
+      return;
+   }
+
+   /* Most GL state applies to glDrawPixels, but a there's a few things
+    * we need to override:
+    */
+   _mesa_meta_begin(ctx, (META_RASTERIZATION |
+                          META_SHADER |
+                          META_TEXTURE |
+                          META_TRANSFORM |
+                          META_VERTEX |
+                          META_VIEWPORT));
+
+   if (drawpix->TexObj == 0) {
+      /* one-time setup */
+
+      /* create texture object */
+      _mesa_GenTextures(1, &drawpix->TexObj);
+      _mesa_BindTexture(GL_TEXTURE_RECTANGLE, drawpix->TexObj);
+      _mesa_TexEnvi(GL_TEXTURE_ENV, GL_TEXTURE_ENV_MODE, GL_REPLACE);
+      _mesa_TexParameteri(GL_TEXTURE_RECTANGLE, GL_TEXTURE_MIN_FILTER, filter);
+      _mesa_TexParameteri(GL_TEXTURE_RECTANGLE, GL_TEXTURE_MAG_FILTER, filter);
+   }
+   else {
+      _mesa_BindTexture(GL_TEXTURE_RECTANGLE, drawpix->TexObj);
+   }
+
+   if (drawpix->ArrayObj == 0) {
+      /* one-time setup */
+
+      /* create vertex array object */
+      _mesa_GenVertexArrays(1, &drawpix->ArrayObj);
+      _mesa_BindVertexArray(drawpix->ArrayObj);
+
+      /* create vertex array buffer */
+      _mesa_GenBuffersARB(1, &drawpix->VBO);
+      _mesa_BindBufferARB(GL_ARRAY_BUFFER_ARB, drawpix->VBO);
+      _mesa_BufferDataARB(GL_ARRAY_BUFFER_ARB, sizeof(drawpix->verts),
+                          drawpix->verts, GL_STREAM_DRAW_ARB);
+
+      /* setup vertex arrays */
+      _mesa_VertexPointer(3, GL_FLOAT, sizeof(drawpix->verts[0]),
+                          (void*) (0 * sizeof(GLfloat)));
+      _mesa_TexCoordPointer(2, GL_FLOAT, sizeof(drawpix->verts[0]),
+                            (void *) (3 * sizeof(GLfloat)));
+      _mesa_EnableClientState(GL_VERTEX_ARRAY);
+      _mesa_EnableClientState(GL_TEXTURE_COORD_ARRAY);
+   }
+   else {
+      _mesa_BindVertexArray(drawpix->ArrayObj);
+      _mesa_BindBufferARB(GL_ARRAY_BUFFER_ARB, drawpix->VBO);
+   }
+
+   /* vertex positions, texcoords */
+   drawpix->verts[0][0] = (GLfloat) x;
+   drawpix->verts[0][1] = (GLfloat) y;
+   drawpix->verts[0][2] = z;
+   drawpix->verts[0][3] = 0.0F;
+   drawpix->verts[0][4] = 0.0F;
+   drawpix->verts[1][0] = (GLfloat) x1;
+   drawpix->verts[1][1] = (GLfloat) y;
+   drawpix->verts[1][2] = z;
+   drawpix->verts[1][3] = (GLfloat) width;
+   drawpix->verts[1][4] = 0.0F;
+   drawpix->verts[2][0] = (GLfloat) x1;
+   drawpix->verts[2][1] = (GLfloat) y1;
+   drawpix->verts[2][2] = z;
+   drawpix->verts[2][3] = (GLfloat) width;
+   drawpix->verts[2][4] = (GLfloat) height;
+   drawpix->verts[3][0] = (GLfloat) x;
+   drawpix->verts[3][1] = (GLfloat) y1;
+   drawpix->verts[3][2] = z;
+   drawpix->verts[3][3] = 0.0F;
+   drawpix->verts[3][4] = (GLfloat) height;
+
+   /* upload new vertex data */
+   _mesa_BufferSubDataARB(GL_ARRAY_BUFFER_ARB, 0,
+                          sizeof(drawpix->verts), drawpix->verts);
+
+   /* set given unpack params */
+   ctx->Unpack = *unpack; /* XXX bufobj */
+
+   /* copy pixel data to texture */
+   if (drawpix->TexWidth == width &&
+       drawpix->TexHeight == height &&
+       drawpix->TexIntFormat == texIntFormat) {
+      /* replace existing tex image */
+      _mesa_TexSubImage2D(GL_TEXTURE_RECTANGLE, 0,
+                          0, 0, width, height, format, type, pixels);
+   }
+   else {
+      /* create new tex image */
+      _mesa_TexImage2D(GL_TEXTURE_RECTANGLE, 0, texIntFormat,
+                       width, height, 0, format, type, pixels);
+      drawpix->TexWidth = width;
+      drawpix->TexHeight = height;
+      drawpix->TexIntFormat = texIntFormat;
+   }
+
+   /* restore unpack params */
+   ctx->Unpack = unpackSave; /* XXX bufobj */
+
+   _mesa_Enable(GL_TEXTURE_RECTANGLE);
+
+   /* draw textured quad */
+   _mesa_DrawArrays(GL_TRIANGLE_FAN, 0, 4);
+
+   _mesa_Disable(GL_TEXTURE_RECTANGLE);
+
+   _mesa_meta_end(ctx);
+}
diff --git a/src/mesa/drivers/common/meta.h b/src/mesa/drivers/common/meta.h
new file mode 100644
index 00000000000..a9c5f980432
--- /dev/null
+++ b/src/mesa/drivers/common/meta.h
@@ -0,0 +1,80 @@
+/*
+ * Mesa 3-D graphics library
+ * Version:  7.6
+ *
+ * Copyright (C) 2009  VMware, Inc.  All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+
+#ifndef META_H
+#define META_H
+
+
+/**
+ * Flags passed to _mesa_meta_begin().
+ * XXX these flags may evolve...
+ */
+/*@{*/
+#define META_ALPHA_TEST      0x1
+#define META_BLEND           0x2  /**< includes logicop */
+#define META_COLOR_MASK      0x4
+#define META_DEPTH_TEST      0x8
+#define META_FOG            0x10
+#define META_RASTERIZATION  0x20
+#define META_SCISSOR        0x40
+#define META_SHADER         0x80
+#define META_STENCIL_TEST  0x100
+#define META_TRANSFORM     0x200 /**< modelview, projection */
+#define META_TEXTURE       0x400
+#define META_VERTEX        0x800
+#define META_VIEWPORT     0x1000
+#define META_ALL            ~0x0
+/*@}*/
+
+
+extern void
+_mesa_meta_init(GLcontext *ctx);
+
+extern void
+_mesa_meta_free(GLcontext *ctx);
+
+extern void
+_mesa_meta_blit_framebuffer(GLcontext *ctx,
+                            GLint srcX0, GLint srcY0, GLint srcX1, GLint srcY1,
+                            GLint dstX0, GLint dstY0, GLint dstX1, GLint dstY1,
+                            GLbitfield mask, GLenum filter);
+
+extern void
+_mesa_meta_clear(GLcontext *ctx, GLbitfield buffers);
+
+extern void
+_mesa_meta_copy_pixels(GLcontext *ctx, GLint srcx, GLint srcy,
+                       GLsizei width, GLsizei height,
+                       GLint dstx, GLint dsty, GLenum type);
+
+extern void
+_mesa_meta_draw_pixels(GLcontext *ctx,
+		       GLint x, GLint y, GLsizei width, GLsizei height,
+		       GLenum format, GLenum type,
+		       const struct gl_pixelstore_attrib *unpack,
+		       const GLvoid *pixels);
+
+
+#endif /* META_H */
diff --git a/src/mesa/drivers/dri/common/.gitignore b/src/mesa/drivers/dri/common/.gitignore
new file mode 100644
index 00000000000..1edeb79fd12
--- /dev/null
+++ b/src/mesa/drivers/dri/common/.gitignore
@@ -0,0 +1 @@
+*.os
diff --git a/src/mesa/drivers/dri/common/dri_metaops.c b/src/mesa/drivers/dri/common/dri_metaops.c
index fe183c2e870..cdbea344951 100644
--- a/src/mesa/drivers/dri/common/dri_metaops.c
+++ b/src/mesa/drivers/dri/common/dri_metaops.c
@@ -357,6 +357,7 @@ meta_clear_tris(struct dri_metaops *meta, GLbitfield mask)
    GLuint saved_shader_program = 0;
    unsigned int saved_active_texture;
    struct gl_array_object *arraySave = NULL;
+   GLfloat saved_near, saved_far;
 
    if (!meta->clear.arrayObj)
       meta_init_clear(meta);
@@ -370,8 +371,7 @@ meta_clear_tris(struct dri_metaops *meta, GLbitfield mask)
 		    GL_POLYGON_BIT |
 		    GL_STENCIL_BUFFER_BIT |
 		    GL_TRANSFORM_BIT |
-		    GL_CURRENT_BIT |
-		    GL_VIEWPORT_BIT);
+		    GL_CURRENT_BIT);
    saved_active_texture = ctx->Texture.CurrentUnit;
 
    /* Disable existing GL state we don't want to apply to a clear. */
@@ -440,6 +440,8 @@ meta_clear_tris(struct dri_metaops *meta, GLbitfield mask)
    /* The ClearDepth value is unaffected by DepthRange, so do a default
     * mapping.
     */
+   saved_near = ctx->Viewport.Near;
+   saved_far = ctx->Viewport.Far;
    _mesa_DepthRange(0.0, 1.0);
 
    /* Prepare the vertices, which are the same regardless of which buffer we're
@@ -519,6 +521,7 @@ meta_clear_tris(struct dri_metaops *meta, GLbitfield mask)
    if (saved_shader_program)
       _mesa_UseProgramObjectARB(saved_shader_program);
 
+   _mesa_DepthRange(saved_near, saved_far);
    _mesa_PopAttrib();
 
    /* restore current array object */
diff --git a/src/mesa/drivers/dri/common/dri_util.c b/src/mesa/drivers/dri/common/dri_util.c
index 1d940603fa8..e48e10d7c06 100644
--- a/src/mesa/drivers/dri/common/dri_util.c
+++ b/src/mesa/drivers/dri/common/dri_util.c
@@ -778,7 +778,7 @@ dri2CreateNewScreen(int scrn, int fd,
     if (driDriverAPI.InitScreen2 == NULL)
         return NULL;
 
-    psp = _mesa_malloc(sizeof(*psp));
+    psp = _mesa_calloc(sizeof(*psp));
     if (!psp)
 	return NULL;
 
diff --git a/src/mesa/drivers/dri/common/extension_helper.h b/src/mesa/drivers/dri/common/extension_helper.h
index 62801494ce3..08a97bb1110 100644
--- a/src/mesa/drivers/dri/common/extension_helper.h
+++ b/src/mesa/drivers/dri/common/extension_helper.h
@@ -752,6 +752,13 @@ static const char VertexAttrib4ubNV_names[] =
     "";
 #endif
 
+#if defined(need_GL_APPLE_texture_range)
+static const char TextureRangeAPPLE_names[] = 
+    "iip\0" /* Parameter signature */
+    "glTextureRangeAPPLE\0"
+    "";
+#endif
+
 #if defined(need_GL_SUN_vertex)
 static const char TexCoord2fColor4fNormal3fVertex3fSUN_names[] = 
     "ffffffffffff\0" /* Parameter signature */
@@ -1567,6 +1574,13 @@ static const char FragmentLightivSGIX_names[] =
     "";
 #endif
 
+#if defined(need_GL_APPLE_texture_range)
+static const char GetTexParameterPointervAPPLE_names[] = 
+    "iip\0" /* Parameter signature */
+    "glGetTexParameterPointervAPPLE\0"
+    "";
+#endif
+
 #if defined(need_GL_EXT_pixel_transform)
 static const char PixelTransformParameterfvEXT_names[] = 
     "iip\0" /* Parameter signature */
@@ -1771,6 +1785,13 @@ static const char DeleteFencesNV_names[] =
     "";
 #endif
 
+#if defined(need_GL_SGIX_polynomial_ffd)
+static const char DeformationMap3dSGIX_names[] = 
+    "iddiiddiiddiip\0" /* Parameter signature */
+    "glDeformationMap3dSGIX\0"
+    "";
+#endif
+
 #if defined(need_GL_VERSION_2_0)
 static const char IsShader_names[] = 
     "i\0" /* Parameter signature */
@@ -1997,13 +2018,6 @@ static const char GetCombinerOutputParameterivNV_names[] =
     "";
 #endif
 
-#if defined(need_GL_IBM_multimode_draw_arrays)
-static const char MultiModeDrawArraysIBM_names[] = 
-    "pppii\0" /* Parameter signature */
-    "glMultiModeDrawArraysIBM\0"
-    "";
-#endif
-
 #if defined(need_GL_SGIS_pixel_texture)
 static const char PixelTexGenParameterivSGIS_names[] = 
     "ip\0" /* Parameter signature */
@@ -2079,6 +2093,13 @@ static const char Uniform2fvARB_names[] =
     "";
 #endif
 
+#if defined(need_GL_APPLE_flush_buffer_range)
+static const char BufferParameteriAPPLE_names[] = 
+    "iii\0" /* Parameter signature */
+    "glBufferParameteriAPPLE\0"
+    "";
+#endif
+
 #if defined(need_GL_VERSION_1_3)
 static const char MultiTexCoord3dvARB_names[] = 
     "ip\0" /* Parameter signature */
@@ -2803,6 +2824,13 @@ static const char IsProgramNV_names[] =
     "";
 #endif
 
+#if defined(need_GL_APPLE_flush_buffer_range)
+static const char FlushMappedBufferRangeAPPLE_names[] = 
+    "iii\0" /* Parameter signature */
+    "glFlushMappedBufferRangeAPPLE\0"
+    "";
+#endif
+
 #if defined(need_GL_SUN_triangle_list)
 static const char ReplacementCodePointerSUN_names[] = 
     "iip\0" /* Parameter signature */
@@ -3920,6 +3948,13 @@ static const char VertexAttribs4dvNV_names[] =
     "";
 #endif
 
+#if defined(need_GL_IBM_multimode_draw_arrays)
+static const char MultiModeDrawArraysIBM_names[] = 
+    "pppii\0" /* Parameter signature */
+    "glMultiModeDrawArraysIBM\0"
+    "";
+#endif
+
 #if defined(need_GL_VERSION_2_0) || defined(need_GL_ARB_vertex_program)
 static const char VertexAttrib4dARB_names[] = 
     "idddd\0" /* Parameter signature */
@@ -4604,13 +4639,6 @@ static const char Minmax_names[] =
     "";
 #endif
 
-#if defined(need_GL_SGIX_polynomial_ffd)
-static const char DeformationMap3dSGIX_names[] = 
-    "iddiiddiiddiip\0" /* Parameter signature */
-    "glDeformationMap3dSGIX\0"
-    "";
-#endif
-
 #if defined(need_GL_VERSION_1_4) || defined(need_GL_EXT_fog_coord)
 static const char FogCoorddvEXT_names[] = 
     "p\0" /* Parameter signature */
@@ -4973,6 +5001,22 @@ static const struct dri_extension_function GL_3DFX_tbuffer_functions[] = {
 };
 #endif
 
+#if defined(need_GL_APPLE_flush_buffer_range)
+static const struct dri_extension_function GL_APPLE_flush_buffer_range_functions[] = {
+    { BufferParameteriAPPLE_names, BufferParameteriAPPLE_remap_index, -1 },
+    { FlushMappedBufferRangeAPPLE_names, FlushMappedBufferRangeAPPLE_remap_index, -1 },
+    { NULL, 0, 0 }
+};
+#endif
+
+#if defined(need_GL_APPLE_texture_range)
+static const struct dri_extension_function GL_APPLE_texture_range_functions[] = {
+    { TextureRangeAPPLE_names, TextureRangeAPPLE_remap_index, -1 },
+    { GetTexParameterPointervAPPLE_names, GetTexParameterPointervAPPLE_remap_index, -1 },
+    { NULL, 0, 0 }
+};
+#endif
+
 #if defined(need_GL_APPLE_vertex_array_object)
 static const struct dri_extension_function GL_APPLE_vertex_array_object_functions[] = {
     { DeleteVertexArraysAPPLE_names, DeleteVertexArraysAPPLE_remap_index, -1 },
@@ -6131,9 +6175,9 @@ static const struct dri_extension_function GL_SGIX_pixel_texture_functions[] = {
 #if defined(need_GL_SGIX_polynomial_ffd)
 static const struct dri_extension_function GL_SGIX_polynomial_ffd_functions[] = {
     { LoadIdentityDeformationMapSGIX_names, LoadIdentityDeformationMapSGIX_remap_index, -1 },
+    { DeformationMap3dSGIX_names, DeformationMap3dSGIX_remap_index, -1 },
     { DeformSGIX_names, DeformSGIX_remap_index, -1 },
     { DeformationMap3fSGIX_names, DeformationMap3fSGIX_remap_index, -1 },
-    { DeformationMap3dSGIX_names, DeformationMap3dSGIX_remap_index, -1 },
     { NULL, 0, 0 }
 };
 #endif
diff --git a/src/mesa/drivers/dri/fb/fb_egl.c b/src/mesa/drivers/dri/fb/fb_egl.c
index dee67feb5ac..4e41860d8c8 100644
--- a/src/mesa/drivers/dri/fb/fb_egl.c
+++ b/src/mesa/drivers/dri/fb/fb_egl.c
@@ -605,7 +605,7 @@ fbDestroySurface(_EGLDriver *drv, EGLDisplay dpy, EGLSurface surface)
 {
    fbSurface *fs = Lookup_fbSurface(surface);
    _eglUnlinkSurface(&fs->Base);
-   if (!fs->Base.IsBound)
+   if (!_eglIsSurfaceBound(&fs->Base))
       free(fs);
    return EGL_TRUE;
 }
@@ -616,7 +616,7 @@ fbDestroyContext(_EGLDriver *drv, EGLDisplay dpy, EGLContext context)
 {
    fbContext *fc = Lookup_fbContext(context);
    _eglUnlinkContext(&fc->Base);
-   if (!fc->Base.IsBound)
+   if (!_eglIsContextBound(&fc->Base))
       free(fc);
    return EGL_TRUE;
 }
diff --git a/src/mesa/drivers/dri/i915/i830_context.h b/src/mesa/drivers/dri/i915/i830_context.h
index 1bdb32049d7..f73cbbf88bb 100644
--- a/src/mesa/drivers/dri/i915/i830_context.h
+++ b/src/mesa/drivers/dri/i915/i830_context.h
@@ -40,6 +40,7 @@
 #define I830_UPLOAD_BUFFERS          0x2
 #define I830_UPLOAD_STIPPLE          0x4
 #define I830_UPLOAD_INVARIENT        0x8
+#define I830_UPLOAD_RASTER_RULES     0x10
 #define I830_UPLOAD_TEX(i)           (0x10<<(i))
 #define I830_UPLOAD_TEXBLEND(i)      (0x100<<(i))
 #define I830_UPLOAD_TEX_ALL          (0x0f0)
@@ -99,6 +100,11 @@
 
 #define I830_TEXBLEND_SIZE	12      /* (4 args + op) * 2 + COLOR_FACTOR */
 
+enum {
+   I830_RASTER_RULES,
+   I830_RASTER_RULES_SIZE
+};
+
 struct i830_texture_object
 {
    struct intel_texture_object intel;
@@ -112,6 +118,7 @@ struct i830_hw_state
    GLuint Ctx[I830_CTX_SETUP_SIZE];
    GLuint Buffer[I830_DEST_SETUP_SIZE];
    GLuint Stipple[I830_STP_SETUP_SIZE];
+   GLuint RasterRules[I830_RASTER_RULES_SIZE];
    GLuint Tex[I830_TEX_UNITS][I830_TEX_SETUP_SIZE];
    GLuint TexBlend[I830_TEX_UNITS][I830_TEXBLEND_SIZE];
    GLuint TexBlendWordsUsed[I830_TEX_UNITS];
@@ -197,6 +204,7 @@ extern void i830InitStateFuncs(struct dd_function_table *functions);
 extern void i830EmitState(struct i830_context *i830);
 
 extern void i830InitState(struct i830_context *i830);
+extern void i830_update_provoking_vertex(GLcontext *ctx);
 
 /* i830_metaops.c
  */
diff --git a/src/mesa/drivers/dri/i915/i830_reg.h b/src/mesa/drivers/dri/i915/i830_reg.h
index db16871001d..ae1317029a2 100644
--- a/src/mesa/drivers/dri/i915/i830_reg.h
+++ b/src/mesa/drivers/dri/i915/i830_reg.h
@@ -420,8 +420,11 @@
 #define ENABLE_LINE_STRIP_PROVOKE_VRTX	(1<<8)
 #define ENABLE_TRI_FAN_PROVOKE_VRTX	(1<<5)
 #define ENABLE_TRI_STRIP_PROVOKE_VRTX	(1<<2)
+#define LINE_STRIP_PROVOKE_VRTX_MASK	(3<<6)
 #define LINE_STRIP_PROVOKE_VRTX(x)	((x)<<6)
+#define TRI_FAN_PROVOKE_VRTX_MASK	(3<<3)
 #define TRI_FAN_PROVOKE_VRTX(x) 	((x)<<3)
+#define TRI_STRIP_PROVOKE_VRTX_MASK	(3<<0)
 #define TRI_STRIP_PROVOKE_VRTX(x)	(x)
 
 /* _3DSTATE_SCISSOR_ENABLE, p200 */
diff --git a/src/mesa/drivers/dri/i915/i830_state.c b/src/mesa/drivers/dri/i915/i830_state.c
index 8ef6c9144f1..645ebe30577 100644
--- a/src/mesa/drivers/dri/i915/i830_state.c
+++ b/src/mesa/drivers/dri/i915/i830_state.c
@@ -1047,6 +1047,16 @@ i830_init_packets(struct i830_context *i830)
                                          TEXBIND_SET1(TEXCOORDSRC_VTXSET_1) |
                                          TEXBIND_SET0(TEXCOORDSRC_VTXSET_0));
 
+   i830->state.RasterRules[I830_RASTER_RULES] = (_3DSTATE_RASTER_RULES_CMD |
+						 ENABLE_POINT_RASTER_RULE |
+						 OGL_POINT_RASTER_RULE |
+						 ENABLE_LINE_STRIP_PROVOKE_VRTX |
+						 ENABLE_TRI_FAN_PROVOKE_VRTX |
+						 ENABLE_TRI_STRIP_PROVOKE_VRTX |
+						 LINE_STRIP_PROVOKE_VRTX(1) |
+						 TRI_FAN_PROVOKE_VRTX(2) |
+						 TRI_STRIP_PROVOKE_VRTX(2));
+
 
    i830->state.Stipple[I830_STPREG_ST0] = _3DSTATE_STIPPLE;
 
@@ -1058,6 +1068,27 @@ i830_init_packets(struct i830_context *i830)
    i830->state.Buffer[I830_DESTREG_SR2] = 0;
 }
 
+void
+i830_update_provoking_vertex(GLcontext * ctx)
+{
+   struct i830_context *i830 = i830_context(ctx);
+
+   I830_STATECHANGE(i830, I830_UPLOAD_RASTER_RULES);
+   i830->state.RasterRules[I830_RASTER_RULES] &= ~(LINE_STRIP_PROVOKE_VRTX_MASK |
+						   TRI_FAN_PROVOKE_VRTX_MASK |
+						   TRI_STRIP_PROVOKE_VRTX_MASK);
+
+   /* _NEW_LIGHT */
+   if (ctx->Light.ProvokingVertex == GL_LAST_VERTEX_CONVENTION) {
+      i830->state.RasterRules[I830_RASTER_RULES] |= (LINE_STRIP_PROVOKE_VRTX(1) |
+						     TRI_FAN_PROVOKE_VRTX(2) |
+						     TRI_STRIP_PROVOKE_VRTX(2));
+   } else {
+      i830->state.RasterRules[I830_RASTER_RULES] |= (LINE_STRIP_PROVOKE_VRTX(0) |
+						     TRI_FAN_PROVOKE_VRTX(1) |
+						     TRI_STRIP_PROVOKE_VRTX(0));
+    }
+}
 
 void
 i830InitStateFuncs(struct dd_function_table *functions)
@@ -1101,6 +1132,7 @@ i830InitState(struct i830_context *i830)
    i830->current = &i830->state;
    i830->state.emitted = 0;
    i830->state.active = (I830_UPLOAD_INVARIENT |
+                         I830_UPLOAD_RASTER_RULES |
                          I830_UPLOAD_TEXBLEND(0) |
                          I830_UPLOAD_STIPPLE |
                          I830_UPLOAD_CTX | I830_UPLOAD_BUFFERS);
diff --git a/src/mesa/drivers/dri/i915/i830_vtbl.c b/src/mesa/drivers/dri/i915/i830_vtbl.c
index 9c6f891dd32..983f6724c98 100644
--- a/src/mesa/drivers/dri/i915/i830_vtbl.c
+++ b/src/mesa/drivers/dri/i915/i830_vtbl.c
@@ -299,7 +299,7 @@ i830_emit_invarient_state(struct intel_context *intel)
 {
    BATCH_LOCALS;
 
-   BEGIN_BATCH(30, IGNORE_CLIPRECTS);
+   BEGIN_BATCH(29, IGNORE_CLIPRECTS);
 
    OUT_BATCH(_3DSTATE_DFLT_DIFFUSE_CMD);
    OUT_BATCH(0);
@@ -351,15 +351,6 @@ i830_emit_invarient_state(struct intel_context *intel)
    OUT_BATCH(_3DSTATE_MAP_COORD_TRANSFORM);
    OUT_BATCH(DISABLE_TEX_TRANSFORM | TEXTURE_SET(3));
 
-   OUT_BATCH(_3DSTATE_RASTER_RULES_CMD |
-             ENABLE_POINT_RASTER_RULE |
-             OGL_POINT_RASTER_RULE |
-             ENABLE_LINE_STRIP_PROVOKE_VRTX |
-             ENABLE_TRI_FAN_PROVOKE_VRTX |
-             ENABLE_TRI_STRIP_PROVOKE_VRTX |
-             LINE_STRIP_PROVOKE_VRTX(1) |
-             TRI_FAN_PROVOKE_VRTX(2) | TRI_STRIP_PROVOKE_VRTX(2));
-
    OUT_BATCH(_3DSTATE_VERTEX_TRANSFORM);
    OUT_BATCH(DISABLE_VIEWPORT_TRANSFORM | DISABLE_PERSPECTIVE_DIVIDE);
 
@@ -394,6 +385,9 @@ get_state_size(struct i830_hw_state *state)
    if (dirty & I830_UPLOAD_INVARIENT)
       sz += 40 * sizeof(int);
 
+   if (dirty & I830_UPLOAD_RASTER_RULES)
+      sz += sizeof(state->RasterRules);
+
    if (dirty & I830_UPLOAD_CTX)
       sz += sizeof(state->Ctx);
 
@@ -486,6 +480,11 @@ i830_emit_state(struct intel_context *intel)
       i830_emit_invarient_state(intel);
    }
 
+   if (dirty & I830_UPLOAD_RASTER_RULES) {
+      DBG("I830_UPLOAD_RASTER_RULES:\n");
+      emit(intel, state->RasterRules, sizeof(state->RasterRules));
+   }
+
    if (dirty & I830_UPLOAD_CTX) {
       DBG("I830_UPLOAD_CTX:\n");
       emit(intel, state->Ctx, sizeof(state->Ctx));
@@ -737,6 +736,13 @@ i830_assert_not_dirty( struct intel_context *intel )
    assert(!get_dirty(state));
 }
 
+static void
+i830_invalidate_state(struct intel_context *intel, GLuint new_state)
+{
+   if (new_state & _NEW_LIGHT)
+      i830_update_provoking_vertex(&intel->ctx);
+}
+
 void
 i830InitVtbl(struct i830_context *i830)
 {
@@ -752,4 +758,5 @@ i830InitVtbl(struct i830_context *i830)
    i830->intel.vtbl.render_prevalidate = i830_render_prevalidate;
    i830->intel.vtbl.assert_not_dirty = i830_assert_not_dirty;
    i830->intel.vtbl.finish_batch = intel_finish_vb;
+   i830->intel.vtbl.invalidate_state = i830_invalidate_state;
 }
diff --git a/src/mesa/drivers/dri/i915/i915_context.c b/src/mesa/drivers/dri/i915/i915_context.c
index 5aa41334b0b..bb08cf8d18f 100644
--- a/src/mesa/drivers/dri/i915/i915_context.c
+++ b/src/mesa/drivers/dri/i915/i915_context.c
@@ -77,6 +77,8 @@ i915InvalidateState(GLcontext * ctx, GLuint new_state)
       i915_update_fog(ctx);
    if (new_state & (_NEW_STENCIL | _NEW_BUFFERS | _NEW_POLYGON))
       i915_update_stencil(ctx);
+   if (new_state & (_NEW_LIGHT))
+       i915_update_provoking_vertex(ctx);
 }
 
 
diff --git a/src/mesa/drivers/dri/i915/i915_context.h b/src/mesa/drivers/dri/i915/i915_context.h
index c6b7377da80..8de4a9d0d36 100644
--- a/src/mesa/drivers/dri/i915/i915_context.h
+++ b/src/mesa/drivers/dri/i915/i915_context.h
@@ -48,6 +48,7 @@
 #define I915_UPLOAD_FOG              0x20
 #define I915_UPLOAD_INVARIENT        0x40
 #define I915_UPLOAD_DEFAULTS         0x80
+#define I915_UPLOAD_RASTER_RULES     0x100
 #define I915_UPLOAD_TEX(i)           (0x00010000<<(i))
 #define I915_UPLOAD_TEX_ALL          (0x00ff0000)
 #define I915_UPLOAD_TEX_0_SHIFT      16
@@ -112,6 +113,10 @@
 #define I915_DEFREG_Z1    5
 #define I915_DEF_SETUP_SIZE    6
 
+enum {
+   I915_RASTER_RULES,
+   I915_RASTER_RULES_SETUP_SIZE,
+};
 
 #define I915_MAX_CONSTANT      32
 #define I915_CONSTANT_SIZE     (2+(4*I915_MAX_CONSTANT))
@@ -208,6 +213,7 @@ struct i915_hw_state
    GLuint Stipple[I915_STP_SETUP_SIZE];
    GLuint Fog[I915_FOG_SETUP_SIZE];
    GLuint Defaults[I915_DEF_SETUP_SIZE];
+   GLuint RasterRules[I915_RASTER_RULES_SETUP_SIZE];
    GLuint Tex[I915_TEX_UNITS][I915_TEX_SETUP_SIZE];
    GLuint Constant[I915_CONSTANT_SIZE];
    GLuint ConstantSize;
@@ -324,6 +330,7 @@ extern void i915InitStateFunctions(struct dd_function_table *functions);
 extern void i915InitState(struct i915_context *i915);
 extern void i915_update_fog(GLcontext * ctx);
 extern void i915_update_stencil(GLcontext * ctx);
+extern void i915_update_provoking_vertex(GLcontext *ctx);
 
 
 /*======================================================================
diff --git a/src/mesa/drivers/dri/i915/i915_reg.h b/src/mesa/drivers/dri/i915/i915_reg.h
index 80ec46190d7..b5fa7fddb96 100644
--- a/src/mesa/drivers/dri/i915/i915_reg.h
+++ b/src/mesa/drivers/dri/i915/i915_reg.h
@@ -297,7 +297,9 @@
 #define TEXKILL_4D                      (1<<9)
 #define ENABLE_LINE_STRIP_PROVOKE_VRTX	(1<<8)
 #define ENABLE_TRI_FAN_PROVOKE_VRTX	(1<<5)
+#define LINE_STRIP_PROVOKE_VRTX_MASK	(3 << 6)
 #define LINE_STRIP_PROVOKE_VRTX(x)	((x)<<6)
+#define TRI_FAN_PROVOKE_VRTX_MASK	(3 << 3)
 #define TRI_FAN_PROVOKE_VRTX(x) 	((x)<<3)
 
 /* _3DSTATE_SCISSOR_ENABLE, p256 */
diff --git a/src/mesa/drivers/dri/i915/i915_state.c b/src/mesa/drivers/dri/i915/i915_state.c
index 670451bc838..b60efea75bd 100644
--- a/src/mesa/drivers/dri/i915/i915_state.c
+++ b/src/mesa/drivers/dri/i915/i915_state.c
@@ -1033,6 +1033,13 @@ i915_init_packets(struct i915_context *i915)
       i915->state.Buffer[I915_DESTREG_SR2] = 0;
    }
 
+   i915->state.RasterRules[I915_RASTER_RULES] = _3DSTATE_RASTER_RULES_CMD |
+      ENABLE_POINT_RASTER_RULE |
+      OGL_POINT_RASTER_RULE |
+      ENABLE_LINE_STRIP_PROVOKE_VRTX |
+      ENABLE_TRI_FAN_PROVOKE_VRTX |
+      LINE_STRIP_PROVOKE_VRTX(1) |
+      TRI_FAN_PROVOKE_VRTX(2) | ENABLE_TEXKILL_3D_4D | TEXKILL_4D;
 
 #if 0
    {
@@ -1053,7 +1060,33 @@ i915_init_packets(struct i915_context *i915)
    i915->state.active = (I915_UPLOAD_PROGRAM |
                          I915_UPLOAD_STIPPLE |
                          I915_UPLOAD_CTX |
-                         I915_UPLOAD_BUFFERS | I915_UPLOAD_INVARIENT);
+                         I915_UPLOAD_BUFFERS |
+			 I915_UPLOAD_INVARIENT |
+			 I915_UPLOAD_RASTER_RULES);
+}
+
+void
+i915_update_provoking_vertex(GLcontext * ctx)
+{
+   struct i915_context *i915 = I915_CONTEXT(ctx);
+
+   I915_STATECHANGE(i915, I915_UPLOAD_CTX);
+   i915->state.Ctx[I915_CTXREG_LIS6] &= ~(S6_TRISTRIP_PV_MASK);
+
+   I915_STATECHANGE(i915, I915_UPLOAD_RASTER_RULES);
+   i915->state.RasterRules[I915_RASTER_RULES] &= ~(LINE_STRIP_PROVOKE_VRTX_MASK |
+						   TRI_FAN_PROVOKE_VRTX_MASK);
+
+   /* _NEW_LIGHT */
+   if (ctx->Light.ProvokingVertex == GL_LAST_VERTEX_CONVENTION) {
+      i915->state.RasterRules[I915_RASTER_RULES] |= (LINE_STRIP_PROVOKE_VRTX(1) |
+						     TRI_FAN_PROVOKE_VRTX(2));
+      i915->state.Ctx[I915_CTXREG_LIS6] |= (2 << S6_TRISTRIP_PV_SHIFT);
+   } else {
+      i915->state.RasterRules[I915_RASTER_RULES] |= (LINE_STRIP_PROVOKE_VRTX(0) |
+						     TRI_FAN_PROVOKE_VRTX(1));
+      i915->state.Ctx[I915_CTXREG_LIS6] |= (0 << S6_TRISTRIP_PV_SHIFT);
+    }
 }
 
 void
diff --git a/src/mesa/drivers/dri/i915/i915_vtbl.c b/src/mesa/drivers/dri/i915/i915_vtbl.c
index 707864ebfdb..9a723d3cd73 100644
--- a/src/mesa/drivers/dri/i915/i915_vtbl.c
+++ b/src/mesa/drivers/dri/i915/i915_vtbl.c
@@ -176,7 +176,7 @@ i915_emit_invarient_state(struct intel_context *intel)
 {
    BATCH_LOCALS;
 
-   BEGIN_BATCH(18, IGNORE_CLIPRECTS);
+   BEGIN_BATCH(17, IGNORE_CLIPRECTS);
 
    OUT_BATCH(_3DSTATE_AA_CMD |
              AA_LINE_ECAAR_WIDTH_ENABLE |
@@ -200,14 +200,6 @@ i915_emit_invarient_state(struct intel_context *intel)
              CSB_TCB(3, 3) |
              CSB_TCB(4, 4) | CSB_TCB(5, 5) | CSB_TCB(6, 6) | CSB_TCB(7, 7));
 
-   OUT_BATCH(_3DSTATE_RASTER_RULES_CMD |
-             ENABLE_POINT_RASTER_RULE |
-             OGL_POINT_RASTER_RULE |
-             ENABLE_LINE_STRIP_PROVOKE_VRTX |
-             ENABLE_TRI_FAN_PROVOKE_VRTX |
-             LINE_STRIP_PROVOKE_VRTX(1) |
-             TRI_FAN_PROVOKE_VRTX(2) | ENABLE_TEXKILL_3D_4D | TEXKILL_4D);
-
    /* Need to initialize this to zero.
     */
    OUT_BATCH(_3DSTATE_LOAD_STATE_IMMEDIATE_1 | I1_LOAD_S(3) | (0));
@@ -258,6 +250,9 @@ get_state_size(struct i915_hw_state *state)
    if (dirty & I915_UPLOAD_INVARIENT)
       sz += 30 * 4;
 
+   if (dirty & I915_UPLOAD_RASTER_RULES)
+      sz += sizeof(state->RasterRules);
+
    if (dirty & I915_UPLOAD_CTX)
       sz += sizeof(state->Ctx);
 
@@ -366,6 +361,12 @@ i915_emit_state(struct intel_context *intel)
       i915_emit_invarient_state(intel);
    }
 
+   if (dirty & I915_UPLOAD_RASTER_RULES) {
+      if (INTEL_DEBUG & DEBUG_STATE)
+         fprintf(stderr, "I915_UPLOAD_RASTER_RULES:\n");
+      emit(intel, state->RasterRules, sizeof(state->RasterRules));
+   }
+
    if (dirty & I915_UPLOAD_CTX) {
       if (INTEL_DEBUG & DEBUG_STATE)
          fprintf(stderr, "I915_UPLOAD_CTX:\n");
diff --git a/src/mesa/drivers/dri/i965/Makefile b/src/mesa/drivers/dri/i965/Makefile
index 00a42111da0..128afb56866 100644
--- a/src/mesa/drivers/dri/i965/Makefile
+++ b/src/mesa/drivers/dri/i965/Makefile
@@ -43,6 +43,7 @@ DRIVER_SOURCES = \
 	brw_clip_util.c \
 	brw_context.c \
 	brw_curbe.c \
+	brw_disasm.c \
 	brw_draw.c \
 	brw_draw_upload.c \
 	brw_eu.c \
diff --git a/src/mesa/drivers/dri/i965/brw_context.h b/src/mesa/drivers/dri/i965/brw_context.h
index 57ddf75413a..847c44ed83a 100644
--- a/src/mesa/drivers/dri/i965/brw_context.h
+++ b/src/mesa/drivers/dri/i965/brw_context.h
@@ -143,6 +143,7 @@ struct brw_context;
 #define BRW_NEW_DEPTH_BUFFER		0x20000
 #define BRW_NEW_NR_WM_SURFACES		0x40000
 #define BRW_NEW_NR_VS_SURFACES		0x80000
+#define BRW_NEW_INDEX_BUFFER		0x100000
 
 struct brw_state_flags {
    /** State update flags signalled by mesa internals */
@@ -438,9 +439,13 @@ struct brw_query_object {
    unsigned int count;
 };
 
+
+/**
+ * brw_context is derived from intel_context.
+ */
 struct brw_context 
 {
-   struct intel_context intel;
+   struct intel_context intel;  /**< base class, must be first field */
    GLuint primitive;
 
    GLboolean emit_state_always;
@@ -475,6 +480,9 @@ struct brw_context
    struct {
       struct brw_vertex_element inputs[VERT_ATTRIB_MAX];
 
+      struct brw_vertex_element *enabled[VERT_ATTRIB_MAX];
+      GLuint nr_enabled;
+
 #define BRW_NR_UPLOAD_BUFS 17
 #define BRW_UPLOAD_INIT_SIZE (128*1024)
 
@@ -498,8 +506,15 @@ struct brw_context
        */
       const struct _mesa_index_buffer *ib;
 
+      /* Updates to these fields are signaled by BRW_NEW_INDEX_BUFFER. */
       dri_bo *bo;
       unsigned int offset;
+      unsigned int size;
+      /* Offset to index buffer index to use in CMD_3D_PRIM so that we can
+       * avoid re-uploading the IB packet over and over if we're actually
+       * referencing the same index buffer.
+       */
+      unsigned int start_vertex_offset;
    } ib;
 
    /* Active vertex program: 
@@ -706,6 +721,8 @@ void brw_upload_urb_fence(struct brw_context *brw);
  */
 void brw_upload_cs_urb_state(struct brw_context *brw);
 
+/* brw_disasm.c */
+int brw_disasm (FILE *file, struct brw_instruction *inst);
 
 /*======================================================================
  * Inline conversion functions.  These are better-typed than the
diff --git a/src/mesa/drivers/dri/i965/brw_defines.h b/src/mesa/drivers/dri/i965/brw_defines.h
index d166250b4fe..78d457ad2be 100644
--- a/src/mesa/drivers/dri/i965/brw_defines.h
+++ b/src/mesa/drivers/dri/i965/brw_defines.h
@@ -471,8 +471,9 @@
 #define BRW_CONDITIONAL_GE    4
 #define BRW_CONDITIONAL_L     5
 #define BRW_CONDITIONAL_LE    6
-#define BRW_CONDITIONAL_C     7
+#define BRW_CONDITIONAL_R     7
 #define BRW_CONDITIONAL_O     8
+#define BRW_CONDITIONAL_U     9
 
 #define BRW_DEBUG_NONE        0
 #define BRW_DEBUG_BREAKPOINT  1
@@ -512,6 +513,7 @@
 #define BRW_OPCODE_RSL        11
 #define BRW_OPCODE_ASR        12
 #define BRW_OPCODE_CMP        16
+#define BRW_OPCODE_CMPN       17
 #define BRW_OPCODE_JMPI       32
 #define BRW_OPCODE_IF         34
 #define BRW_OPCODE_IFF        35
diff --git a/src/mesa/drivers/dri/i965/brw_disasm.c b/src/mesa/drivers/dri/i965/brw_disasm.c
new file mode 100644
index 00000000000..9fef230507f
--- /dev/null
+++ b/src/mesa/drivers/dri/i965/brw_disasm.c
@@ -0,0 +1,903 @@
+/*
+ * Copyright © 2008 Keith Packard
+ *
+ * Permission to use, copy, modify, distribute, and sell this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * the above copyright notice appear in all copies and that both that copyright
+ * notice and this permission notice appear in supporting documentation, and
+ * that the name of the copyright holders not be used in advertising or
+ * publicity pertaining to distribution of the software without specific,
+ * written prior permission.  The copyright holders make no representations
+ * about the suitability of this software for any purpose.  It is provided "as
+ * is" without express or implied warranty.
+ *
+ * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
+ * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO
+ * EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY SPECIAL, INDIRECT OR
+ * CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
+ * DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
+ * TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THIS SOFTWARE.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <getopt.h>
+#include <unistd.h>
+#include <stdarg.h>
+
+#include "main/mtypes.h"
+
+#include "brw_context.h"
+#include "brw_defines.h"
+
+struct {
+    char    *name;
+    int	    nsrc;
+    int	    ndst;
+} opcode[128] = {
+    [BRW_OPCODE_MOV] = { .name = "mov", .nsrc = 1, .ndst = 1 },
+    [BRW_OPCODE_FRC] = { .name = "frc", .nsrc = 1, .ndst = 1 },
+    [BRW_OPCODE_RNDU] = { .name = "rndu", .nsrc = 1, .ndst = 1 },
+    [BRW_OPCODE_RNDD] = { .name = "rndd", .nsrc = 1, .ndst = 1 },
+    [BRW_OPCODE_RNDE] = { .name = "rnde", .nsrc = 1, .ndst = 1 },
+    [BRW_OPCODE_RNDZ] = { .name = "rndz", .nsrc = 1, .ndst = 1 },
+    [BRW_OPCODE_NOT] = { .name = "not", .nsrc = 1, .ndst = 1 },
+    [BRW_OPCODE_LZD] = { .name = "lzd", .nsrc = 1, .ndst = 1 },
+
+    [BRW_OPCODE_MUL] = { .name = "mul", .nsrc = 2, .ndst = 1 },
+    [BRW_OPCODE_MAC] = { .name = "mac", .nsrc = 2, .ndst = 1 },
+    [BRW_OPCODE_MACH] = { .name = "mach", .nsrc = 2, .ndst = 1 },
+    [BRW_OPCODE_LINE] = { .name = "line", .nsrc = 2, .ndst = 1 },
+    [BRW_OPCODE_SAD2] = { .name = "sad2", .nsrc = 2, .ndst = 1 },
+    [BRW_OPCODE_SADA2] = { .name = "sada2", .nsrc = 2, .ndst = 1 },
+    [BRW_OPCODE_DP4] = { .name = "dp4", .nsrc = 2, .ndst = 1 },
+    [BRW_OPCODE_DPH] = { .name = "dph", .nsrc = 2, .ndst = 1 },
+    [BRW_OPCODE_DP3] = { .name = "dp3", .nsrc = 2, .ndst = 1 },
+    [BRW_OPCODE_DP2] = { .name = "dp2", .nsrc = 2, .ndst = 1 },
+
+    [BRW_OPCODE_AVG] = { .name = "avg", .nsrc = 2, .ndst = 1 },
+    [BRW_OPCODE_ADD] = { .name = "add", .nsrc = 2, .ndst = 1 },
+    [BRW_OPCODE_SEL] = { .name = "sel", .nsrc = 2, .ndst = 1 },
+    [BRW_OPCODE_AND] = { .name = "and", .nsrc = 2, .ndst = 1 },
+    [BRW_OPCODE_OR] = { .name = "or", .nsrc = 2, .ndst = 1 },
+    [BRW_OPCODE_XOR] = { .name = "xor", .nsrc = 2, .ndst = 1 },
+    [BRW_OPCODE_SHR] = { .name = "shr", .nsrc = 2, .ndst = 1 },
+    [BRW_OPCODE_SHL] = { .name = "shl", .nsrc = 2, .ndst = 1 },
+    [BRW_OPCODE_ASR] = { .name = "asr", .nsrc = 2, .ndst = 1 },
+    [BRW_OPCODE_CMP] = { .name = "cmp", .nsrc = 2, .ndst = 1 },
+    [BRW_OPCODE_CMPN] = { .name = "cmpn", .nsrc = 2, .ndst = 1 },
+
+    [BRW_OPCODE_SEND] = { .name = "send", .nsrc = 1, .ndst = 1 },
+    [BRW_OPCODE_NOP] = { .name = "nop", .nsrc = 0, .ndst = 0 },
+    [BRW_OPCODE_JMPI] = { .name = "jmpi", .nsrc = 1, .ndst = 0 },
+    [BRW_OPCODE_IF] = { .name = "if", .nsrc = 2, .ndst = 0 },
+    [BRW_OPCODE_IFF] = { .name = "iff", .nsrc = 1, .ndst = 01 },
+    [BRW_OPCODE_WHILE] = { .name = "while", .nsrc = 1, .ndst = 0 },
+    [BRW_OPCODE_ELSE] = { .name = "else", .nsrc = 2, .ndst = 0 },
+    [BRW_OPCODE_BREAK] = { .name = "break", .nsrc = 1, .ndst = 0 },
+    [BRW_OPCODE_CONTINUE] = { .name = "cont", .nsrc = 1, .ndst = 0 },
+    [BRW_OPCODE_HALT] = { .name = "halt", .nsrc = 1, .ndst = 0 },
+    [BRW_OPCODE_MSAVE] = { .name = "msave", .nsrc = 1, .ndst = 1 },
+    [BRW_OPCODE_PUSH] = { .name = "push", .nsrc = 1, .ndst = 1 },
+    [BRW_OPCODE_MRESTORE] = { .name = "mrest", .nsrc = 1, .ndst = 1 },
+    [BRW_OPCODE_POP] = { .name = "pop", .nsrc = 2, .ndst = 0 },
+    [BRW_OPCODE_WAIT] = { .name = "wait", .nsrc = 1, .ndst = 0 },
+    [BRW_OPCODE_DO] = { .name = "do", .nsrc = 0, .ndst = 0 },
+    [BRW_OPCODE_ENDIF] = { .name = "endif", .nsrc = 2, .ndst = 0 },
+};
+
+char *conditional_modifier[16] = {
+    [BRW_CONDITIONAL_NONE] = "",
+    [BRW_CONDITIONAL_Z] = ".e",
+    [BRW_CONDITIONAL_NZ] = ".ne",
+    [BRW_CONDITIONAL_G] = ".g",
+    [BRW_CONDITIONAL_GE] = ".ge",
+    [BRW_CONDITIONAL_L] = ".l",
+    [BRW_CONDITIONAL_LE] = ".le",
+    [BRW_CONDITIONAL_R] = ".r",
+    [BRW_CONDITIONAL_O] = ".o",
+    [BRW_CONDITIONAL_U] = ".u",
+};
+
+char *negate[2] = {
+    [0] = "",
+    [1] = "-",
+};
+
+char *_abs[2] = {
+    [0] = "",
+    [1] = "(abs)",
+};
+
+char *vert_stride[16] = {
+    [0] = "0",
+    [1] = "1",
+    [2] = "2",
+    [3] = "4",
+    [4] = "8",
+    [5] = "16",
+    [6] = "32",
+    [15] = "VxH",
+};
+
+char *width[8] = {
+    [0] = "1",
+    [1] = "2",
+    [2] = "4",
+    [3] = "8",
+    [4] = "16",
+};
+
+char *horiz_stride[4] = {
+    [0] = "0",
+    [1] = "1",
+    [2] = "2",
+    [3] = "4"
+};
+
+char *chan_sel[4] = {
+    [0] = "x",
+    [1] = "y",
+    [2] = "z",
+    [3] = "w",
+};
+
+char *dest_condmod[16] = {
+};
+
+char *debug_ctrl[2] = {
+    [0] = "",
+    [1] = ".breakpoint"
+};
+
+char *saturate[2] = {
+    [0] = "",
+    [1] = ".sat"
+};
+
+char *exec_size[8] = {
+    [0] = "1",
+    [1] = "2",
+    [2] = "4",
+    [3] = "8",
+    [4] = "16",
+    [5] = "32"
+};
+
+char *pred_inv[2] = {
+    [0] = "+",
+    [1] = "-"
+};
+
+char *pred_ctrl_align16[16] = {
+    [1] = "",
+    [2] = ".x",
+    [3] = ".y",
+    [4] = ".z",
+    [5] = ".w",
+    [6] = ".any4h",
+    [7] = ".all4h",
+};
+
+char *pred_ctrl_align1[16] = {
+    [1] = "",
+    [2] = ".anyv",
+    [3] = ".allv",
+    [4] = ".any2h",
+    [5] = ".all2h",
+    [6] = ".any4h",
+    [7] = ".all4h",
+    [8] = ".any8h",
+    [9] = ".all8h",
+    [10] = ".any16h",
+    [11] = ".all16h",
+};
+
+char *thread_ctrl[4] = {
+    [0] = "",
+    [2] = "switch"
+};
+
+char *compr_ctrl[4] = {
+    [0] = "",
+    [1] = "sechalf",
+    [2] = "compr",
+};
+
+char *dep_ctrl[4] = {
+    [0] = "",
+    [1] = "NoDDClr",
+    [2] = "NoDDChk",
+    [3] = "NoDDClr,NoDDChk",
+};
+
+char *mask_ctrl[4] = {
+    [0] = "",
+    [1] = "nomask",
+};
+
+char *access_mode[2] = {
+    [0] = "align1",
+    [1] = "align16",
+};
+
+char *reg_encoding[8] = {
+    [0] = "UD",
+    [1] = "D",
+    [2] = "UW",
+    [3] = "W",
+    [4] = "UB",
+    [5] = "B",
+    [7] = "F"
+};
+
+char *imm_encoding[8] = {
+    [0] = "UD",
+    [1] = "D",
+    [2] = "UW",
+    [3] = "W",
+    [5] = "VF",
+    [5] = "V",
+    [7] = "F"
+};
+
+char *reg_file[4] = {
+    [0] = "A",
+    [1] = "g",
+    [2] = "m",
+    [3] = "imm",
+};
+
+char *writemask[16] = {
+    [0x0] = ".",
+    [0x1] = ".x",
+    [0x2] = ".y",
+    [0x3] = ".xy",
+    [0x4] = ".z",
+    [0x5] = ".xz",
+    [0x6] = ".yz",
+    [0x7] = ".xyz",
+    [0x8] = ".w",
+    [0x9] = ".xw",
+    [0xa] = ".yw",
+    [0xb] = ".xyw",
+    [0xc] = ".zw",
+    [0xd] = ".xzw",
+    [0xe] = ".yzw",
+    [0xf] = "",
+};
+
+char *end_of_thread[2] = {
+    [0] = "",
+    [1] = "EOT"
+};
+
+char *target_function[16] = {
+    [BRW_MESSAGE_TARGET_NULL] = "null",
+    [BRW_MESSAGE_TARGET_MATH] = "math",
+    [BRW_MESSAGE_TARGET_SAMPLER] = "sampler",
+    [BRW_MESSAGE_TARGET_GATEWAY] = "gateway",
+    [BRW_MESSAGE_TARGET_DATAPORT_READ] = "read",
+    [BRW_MESSAGE_TARGET_DATAPORT_WRITE] = "write",
+    [BRW_MESSAGE_TARGET_URB] = "urb",
+    [BRW_MESSAGE_TARGET_THREAD_SPAWNER] = "thread_spawner"
+};
+
+char *math_function[16] = {
+    [BRW_MATH_FUNCTION_INV] = "inv",
+    [BRW_MATH_FUNCTION_LOG] = "log",
+    [BRW_MATH_FUNCTION_EXP] = "exp",
+    [BRW_MATH_FUNCTION_SQRT] = "sqrt",
+    [BRW_MATH_FUNCTION_RSQ] = "rsq",
+    [BRW_MATH_FUNCTION_SIN] = "sin",
+    [BRW_MATH_FUNCTION_COS] = "cos",
+    [BRW_MATH_FUNCTION_SINCOS] = "sincos",
+    [BRW_MATH_FUNCTION_TAN] = "tan",
+    [BRW_MATH_FUNCTION_POW] = "pow",
+    [BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER] = "intdivmod",
+    [BRW_MATH_FUNCTION_INT_DIV_QUOTIENT] = "intmod",
+    [BRW_MATH_FUNCTION_INT_DIV_REMAINDER] = "intdiv",
+};
+
+char *math_saturate[2] = {
+    [0] = "",
+    [1] = "sat"
+};
+
+char *math_signed[2] = {
+    [0] = "",
+    [1] = "signed"
+};
+
+char *math_scalar[2] = {
+    [0] = "",
+    [1] = "scalar"
+};
+
+char *math_precision[2] = {
+    [0] = "",
+    [1] = "partial_precision"
+};
+
+char *urb_swizzle[4] = {
+    [BRW_URB_SWIZZLE_NONE] = "",
+    [BRW_URB_SWIZZLE_INTERLEAVE] = "interleave",
+    [BRW_URB_SWIZZLE_TRANSPOSE] = "transpose",
+};
+
+char *urb_allocate[2] = {
+    [0] = "",
+    [1] = "allocate"
+};
+
+char *urb_used[2] = {
+    [0] = "",
+    [1] = "used"
+};
+
+char *urb_complete[2] = {
+    [0] = "",
+    [1] = "complete"
+};
+
+char *sampler_target_format[4] = {
+    [0] = "F",
+    [2] = "UD",
+    [3] = "D"
+};
+
+
+static int column;
+
+static int string (FILE *file, char *string)
+{
+    fputs (string, file);
+    column += strlen (string);
+    return 0;
+}
+
+static int format (FILE *f, char *format, ...)
+{
+    char    buf[1024];
+    va_list	args;
+    va_start (args, format);
+
+    vsnprintf (buf, sizeof (buf) - 1, format, args);
+    string (f, buf);
+    return 0;
+}
+
+static int newline (FILE *f)
+{
+    putc ('\n', f);
+    column = 0;
+    return 0;
+}
+
+static int pad (FILE *f, int c)
+{
+    do
+	string (f, " ");
+    while (column < c);
+    return 0;
+}
+
+static int control (FILE *file, char *name, char *ctrl[], GLuint id, int *space)
+{
+    if (!ctrl[id]) {
+	fprintf (file, "*** invalid %s value %d ",
+		 name, id);
+	return 1;
+    }
+    if (ctrl[id][0])
+    {
+	if (space && *space)
+	    string (file, " ");
+	string (file, ctrl[id]);
+	if (space)
+	    *space = 1;
+    }
+    return 0;
+}
+
+static int print_opcode (FILE *file, int id)
+{
+    if (!opcode[id].name) {
+	format (file, "*** invalid opcode value %d ", id);
+	return 1;
+    }
+    string (file, opcode[id].name);
+    return 0;
+}
+
+static int reg (FILE *file, GLuint _reg_file, GLuint _reg_nr)
+{
+    int	err = 0;
+    if (_reg_file == BRW_ARCHITECTURE_REGISTER_FILE) {
+	switch (_reg_nr & 0xf0) {
+	case BRW_ARF_NULL:
+	    string (file, "null");
+	    return -1;
+	case BRW_ARF_ADDRESS:
+	    format (file, "a%d", _reg_nr & 0x0f);
+	    break;
+	case BRW_ARF_ACCUMULATOR:
+	    format (file, "acc%d", _reg_nr & 0x0f);
+	    break;
+	case BRW_ARF_MASK:
+	    format (file, "mask%d", _reg_nr & 0x0f);
+	    break;
+	case BRW_ARF_MASK_STACK:
+	    format (file, "msd%d", _reg_nr & 0x0f);
+	    break;
+	case BRW_ARF_STATE:
+	    format (file, "sr%d", _reg_nr & 0x0f);
+	    break;
+	case BRW_ARF_CONTROL:
+	    format (file, "cr%d", _reg_nr & 0x0f);
+	    break;
+	case BRW_ARF_NOTIFICATION_COUNT:
+	    format (file, "n%d", _reg_nr & 0x0f);
+	    break;
+	case BRW_ARF_IP:
+	    string (file, "ip");
+	    return -1;
+	    break;
+	default:
+	    format (file, "ARF%d", _reg_nr);
+	    break;
+	}
+    } else {
+	err  |= control (file, "src reg file", reg_file, _reg_file, NULL);
+	format (file, "%d", _reg_nr);
+    }
+    return err;
+}
+
+static int dest (FILE *file, struct brw_instruction *inst)
+{
+    int	err = 0;
+
+    if (inst->header.access_mode == BRW_ALIGN_1)
+    {
+	if (inst->bits1.da1.dest_address_mode == BRW_ADDRESS_DIRECT)
+	{
+	    err |= reg (file, inst->bits1.da1.dest_reg_file, inst->bits1.da1.dest_reg_nr);
+	    if (err == -1)
+		return 0;
+	    if (inst->bits1.da1.dest_subreg_nr)
+		format (file, ".%d", inst->bits1.da1.dest_subreg_nr);
+	    format (file, "<%d>", inst->bits1.da1.dest_horiz_stride);
+	    err |= control (file, "dest reg encoding", reg_encoding, inst->bits1.da1.dest_reg_type, NULL);
+	}
+	else
+	{
+	    string (file, "g[a0");
+	    if (inst->bits1.ia1.dest_subreg_nr)
+		format (file, ".%d", inst->bits1.ia1.dest_subreg_nr);
+	    if (inst->bits1.ia1.dest_indirect_offset)
+		format (file, " %d", inst->bits1.ia1.dest_indirect_offset);
+	    string (file, "]");
+	    format (file, "<%d>", inst->bits1.ia1.dest_horiz_stride);
+	    err |= control (file, "dest reg encoding", reg_encoding, inst->bits1.ia1.dest_reg_type, NULL);
+	}
+    }
+    else
+    {
+	if (inst->bits1.da16.dest_address_mode == BRW_ADDRESS_DIRECT)
+	{
+	    err |= reg (file, inst->bits1.da16.dest_reg_file, inst->bits1.da16.dest_reg_nr);
+	    if (err == -1)
+		return 0;
+	    if (inst->bits1.da16.dest_subreg_nr)
+		format (file, ".%d", inst->bits1.da16.dest_subreg_nr);
+	    string (file, "<1>");
+	    err |= control (file, "writemask", writemask, inst->bits1.da16.dest_writemask, NULL);
+	    err |= control (file, "dest reg encoding", reg_encoding, inst->bits1.da16.dest_reg_type, NULL);
+	}
+	else
+	{
+	    err = 1;
+	    string (file, "Indirect align16 address mode not supported");
+	}
+    }
+
+    return 0;
+}
+
+static int src_align1_region (FILE *file,
+			      GLuint _vert_stride, GLuint _width, GLuint _horiz_stride)
+{
+    int err = 0;
+    string (file, "<");
+    err |= control (file, "vert stride", vert_stride, _vert_stride, NULL);
+    string (file, ",");
+    err |= control (file, "width", width, _width, NULL);
+    string (file, ",");
+    err |= control (file, "horiz_stride", horiz_stride, _horiz_stride, NULL);
+    string (file, ">");
+    return err;
+}
+
+static int src_da1 (FILE *file, GLuint type, GLuint _reg_file,
+		    GLuint _vert_stride, GLuint _width, GLuint _horiz_stride,
+		    GLuint reg_num, GLuint sub_reg_num, GLuint __abs, GLuint _negate)
+{
+    int err = 0;
+    err |= control (file, "negate", negate, _negate, NULL);
+    err |= control (file, "abs", _abs, __abs, NULL);
+
+    err |= reg (file, _reg_file, reg_num);
+    if (err == -1)
+	return 0;
+    if (sub_reg_num)
+	format (file, ".%d", sub_reg_num);
+    src_align1_region (file, _vert_stride, _width, _horiz_stride);
+    err |= control (file, "src reg encoding", reg_encoding, type, NULL);
+    return err;
+}
+
+static int src_ia1 (FILE *file,
+		    GLuint type,
+		    GLuint _reg_file,
+		    GLint _addr_imm,
+		    GLuint _addr_subreg_nr,
+		    GLuint _negate,
+		    GLuint __abs,
+		    GLuint _addr_mode,
+		    GLuint _horiz_stride,
+		    GLuint _width,
+		    GLuint _vert_stride)
+{
+    int err = 0;
+    err |= control (file, "negate", negate, _negate, NULL);
+    err |= control (file, "abs", _abs, __abs, NULL);
+
+    string (file, "g[a0");
+    if (_addr_subreg_nr)
+	format (file, ".%d", _addr_subreg_nr);
+    if (_addr_imm)
+	format (file, " %d", _addr_imm);
+    string (file, "]");
+    src_align1_region (file, _vert_stride, _width, _horiz_stride);
+    err |= control (file, "src reg encoding", reg_encoding, type, NULL);
+    return err;
+}
+
+static int src_da16 (FILE *file,
+		     GLuint _reg_type,
+		     GLuint _reg_file,
+		     GLuint _vert_stride,
+		     GLuint _reg_nr,
+		     GLuint _subreg_nr,
+		     GLuint __abs,
+		     GLuint _negate,
+		     GLuint swz_x,
+		     GLuint swz_y,
+		     GLuint swz_z,
+		     GLuint swz_w)
+{
+    int err = 0;
+    err |= control (file, "negate", negate, _negate, NULL);
+    err |= control (file, "abs", _abs, __abs, NULL);
+
+    err |= reg (file, _reg_file, _reg_nr);
+    if (err == -1)
+	return 0;
+    if (_subreg_nr)
+	format (file, ".%d", _subreg_nr);
+    string (file, "<");
+    err |= control (file, "vert stride", vert_stride, _vert_stride, NULL);
+    string (file, ",1,1>");
+    err |= control (file, "src da16 reg type", reg_encoding, _reg_type, NULL);
+    /*
+     * Three kinds of swizzle display:
+     *  identity - nothing printed
+     *  1->all	 - print the single channel
+     *  1->1     - print the mapping
+     */
+    if (swz_x == BRW_CHANNEL_X &&
+	swz_y == BRW_CHANNEL_Y &&
+	swz_z == BRW_CHANNEL_Z &&
+	swz_w == BRW_CHANNEL_W)
+    {
+	;
+    }
+    else if (swz_x == swz_y && swz_x == swz_z && swz_x == swz_w)
+    {
+	string (file, ".");
+	err |= control (file, "channel select", chan_sel, swz_x, NULL);
+    }
+    else
+    {
+	string (file, ".");
+	err |= control (file, "channel select", chan_sel, swz_x, NULL);
+	err |= control (file, "channel select", chan_sel, swz_y, NULL);
+	err |= control (file, "channel select", chan_sel, swz_z, NULL);
+	err |= control (file, "channel select", chan_sel, swz_w, NULL);
+    }
+    return err;
+}
+
+
+static int imm (FILE *file, GLuint type, struct brw_instruction *inst) {
+    switch (type) {
+    case BRW_REGISTER_TYPE_UD:
+	format (file, "0x%08xUD", inst->bits3.ud);
+	break;
+    case BRW_REGISTER_TYPE_D:
+	format (file, "%dD", inst->bits3.d);
+	break;
+    case BRW_REGISTER_TYPE_UW:
+	format (file, "0x%04xUW", (uint16_t) inst->bits3.ud);
+	break;
+    case BRW_REGISTER_TYPE_W:
+	format (file, "%dW", (int16_t) inst->bits3.d);
+	break;
+    case BRW_REGISTER_TYPE_UB:
+	format (file, "0x%02xUB", (int8_t) inst->bits3.ud);
+	break;
+    case BRW_REGISTER_TYPE_VF:
+	format (file, "Vector Float");
+	break;
+    case BRW_REGISTER_TYPE_V:
+	format (file, "0x%08xV", inst->bits3.ud);
+	break;
+    case BRW_REGISTER_TYPE_F:
+	format (file, "%-gF", inst->bits3.f);
+    }
+    return 0;
+}
+
+static int src0 (FILE *file, struct brw_instruction *inst)
+{
+    if (inst->bits1.da1.src0_reg_file == BRW_IMMEDIATE_VALUE)
+	return imm (file, inst->bits1.da1.src0_reg_type,
+		    inst);
+    else if (inst->header.access_mode == BRW_ALIGN_1)
+    {
+	if (inst->bits2.da1.src0_address_mode == BRW_ADDRESS_DIRECT)
+	{
+	    return src_da1 (file,
+			    inst->bits1.da1.src0_reg_type,
+			    inst->bits1.da1.src0_reg_file,
+			    inst->bits2.da1.src0_vert_stride,
+			    inst->bits2.da1.src0_width,
+			    inst->bits2.da1.src0_horiz_stride,
+			    inst->bits2.da1.src0_reg_nr,
+			    inst->bits2.da1.src0_subreg_nr,
+			    inst->bits2.da1.src0_abs,
+			    inst->bits2.da1.src0_negate);
+	}
+	else
+	{
+	    return src_ia1 (file,
+			    inst->bits1.ia1.src0_reg_type,
+			    inst->bits1.ia1.src0_reg_file,
+			    inst->bits2.ia1.src0_indirect_offset,
+			    inst->bits2.ia1.src0_subreg_nr,
+			    inst->bits2.ia1.src0_negate,
+			    inst->bits2.ia1.src0_abs,
+			    inst->bits2.ia1.src0_address_mode,
+			    inst->bits2.ia1.src0_horiz_stride,
+			    inst->bits2.ia1.src0_width,
+			    inst->bits2.ia1.src0_vert_stride);
+	}
+    }
+    else
+    {
+	if (inst->bits2.da16.src0_address_mode == BRW_ADDRESS_DIRECT)
+	{
+	    return src_da16 (file,
+			     inst->bits1.da16.src0_reg_type,
+			     inst->bits1.da16.src0_reg_file,
+			     inst->bits2.da16.src0_vert_stride,
+			     inst->bits2.da16.src0_reg_nr,
+			     inst->bits2.da16.src0_subreg_nr,
+			     inst->bits2.da16.src0_abs,
+			     inst->bits2.da16.src0_negate,
+			     inst->bits2.da16.src0_swz_x,
+			     inst->bits2.da16.src0_swz_y,
+			     inst->bits2.da16.src0_swz_z,
+			     inst->bits2.da16.src0_swz_w);
+	}
+	else
+	{
+	    string (file, "Indirect align16 address mode not supported");
+	    return 1;
+	}
+    }
+}
+
+static int src1 (FILE *file, struct brw_instruction *inst)
+{
+    if (inst->bits1.da1.src1_reg_file == BRW_IMMEDIATE_VALUE)
+	return imm (file, inst->bits1.da1.src1_reg_type,
+		    inst);
+    else if (inst->header.access_mode == BRW_ALIGN_1)
+    {
+	if (inst->bits3.da1.src1_address_mode == BRW_ADDRESS_DIRECT)
+	{
+	    return src_da1 (file,
+			    inst->bits1.da1.src1_reg_type,
+			    inst->bits1.da1.src1_reg_file,
+			    inst->bits3.da1.src1_vert_stride,
+			    inst->bits3.da1.src1_width,
+			    inst->bits3.da1.src1_horiz_stride,
+			    inst->bits3.da1.src1_reg_nr,
+			    inst->bits3.da1.src1_subreg_nr,
+			    inst->bits3.da1.src1_abs,
+			    inst->bits3.da1.src1_negate);
+	}
+	else
+	{
+	    return src_ia1 (file,
+			    inst->bits1.ia1.src1_reg_type,
+			    inst->bits1.ia1.src1_reg_file,
+			    inst->bits3.ia1.src1_indirect_offset,
+			    inst->bits3.ia1.src1_subreg_nr,
+			    inst->bits3.ia1.src1_negate,
+			    inst->bits3.ia1.src1_abs,
+			    inst->bits3.ia1.src1_address_mode,
+			    inst->bits3.ia1.src1_horiz_stride,
+			    inst->bits3.ia1.src1_width,
+			    inst->bits3.ia1.src1_vert_stride);
+	}
+    }
+    else
+    {
+	if (inst->bits3.da16.src1_address_mode == BRW_ADDRESS_DIRECT)
+	{
+	    return src_da16 (file,
+			     inst->bits1.da16.src1_reg_type,
+			     inst->bits1.da16.src1_reg_file,
+			     inst->bits3.da16.src1_vert_stride,
+			     inst->bits3.da16.src1_reg_nr,
+			     inst->bits3.da16.src1_subreg_nr,
+			     inst->bits3.da16.src1_abs,
+			     inst->bits3.da16.src1_negate,
+			     inst->bits3.da16.src1_swz_x,
+			     inst->bits3.da16.src1_swz_y,
+			     inst->bits3.da16.src1_swz_z,
+			     inst->bits3.da16.src1_swz_w);
+	}
+	else
+	{
+	    string (file, "Indirect align16 address mode not supported");
+	    return 1;
+	}
+    }
+}
+
+int brw_disasm (FILE *file, struct brw_instruction *inst)
+{
+    int	err = 0;
+    int space = 0;
+
+    if (inst->header.predicate_control) {
+	string (file, "(");
+	err |= control (file, "predicate inverse", pred_inv, inst->header.predicate_inverse, NULL);
+	string (file, "f0");
+	if (inst->bits2.da1.flag_reg_nr)
+	    format (file, ".%d", inst->bits2.da1.flag_reg_nr);
+	if (inst->header.access_mode == BRW_ALIGN_1)
+	    err |= control (file, "predicate control align1", pred_ctrl_align1,
+			    inst->header.predicate_control, NULL);
+	else
+	    err |= control (file, "predicate control align16", pred_ctrl_align16,
+			    inst->header.predicate_control, NULL);
+	string (file, ") ");
+    }
+
+    err |= print_opcode (file, inst->header.opcode);
+    err |= control (file, "saturate", saturate, inst->header.saturate, NULL);
+    err |= control (file, "debug control", debug_ctrl, inst->header.debug_control, NULL);
+
+    if (inst->header.opcode != BRW_OPCODE_SEND)
+	err |= control (file, "conditional modifier", conditional_modifier,
+			inst->header.destreg__conditionalmod, NULL);
+
+    if (inst->header.opcode != BRW_OPCODE_NOP) {
+	string (file, "(");
+	err |= control (file, "execution size", exec_size, inst->header.execution_size, NULL);
+	string (file, ")");
+    }
+
+    if (inst->header.opcode == BRW_OPCODE_SEND)
+	format (file, " %d", inst->header.destreg__conditionalmod);
+
+    if (opcode[inst->header.opcode].ndst > 0) {
+	pad (file, 16);
+	err |= dest (file, inst);
+    }
+    if (opcode[inst->header.opcode].nsrc > 0) {
+	pad (file, 32);
+	err |= src0 (file, inst);
+    }
+    if (opcode[inst->header.opcode].nsrc > 1) {
+	pad (file, 48);
+	err |= src1 (file, inst);
+    }
+
+    if (inst->header.opcode == BRW_OPCODE_SEND) {
+	newline (file);
+	pad (file, 16);
+	space = 0;
+	err |= control (file, "target function", target_function,
+			inst->bits3.generic.msg_target, &space);
+	switch (inst->bits3.generic.msg_target) {
+	case BRW_MESSAGE_TARGET_MATH:
+	    err |= control (file, "math function", math_function,
+			    inst->bits3.math.function, &space);
+	    err |= control (file, "math saturate", math_saturate,
+			    inst->bits3.math.saturate, &space);
+	    err |= control (file, "math signed", math_signed,
+			    inst->bits3.math.int_type, &space);
+	    err |= control (file, "math scalar", math_scalar,
+			    inst->bits3.math.data_type, &space);
+	    err |= control (file, "math precision", math_precision,
+			    inst->bits3.math.precision, &space);
+	    break;
+	case BRW_MESSAGE_TARGET_SAMPLER:
+	    format (file, " (%d, %d, ",
+		    inst->bits3.sampler.binding_table_index,
+		    inst->bits3.sampler.sampler);
+	    err |= control (file, "sampler target format", sampler_target_format,
+			    inst->bits3.sampler.return_format, NULL);
+	    string (file, ")");
+	    break;
+	case BRW_MESSAGE_TARGET_DATAPORT_WRITE:
+	    format (file, " (%d, %d, %d, %d)",
+		    inst->bits3.dp_write.binding_table_index,
+		    (inst->bits3.dp_write.pixel_scoreboard_clear << 3) |
+		    inst->bits3.dp_write.msg_control,
+		    inst->bits3.dp_write.msg_type,
+		    inst->bits3.dp_write.send_commit_msg);
+	    break;
+	case BRW_MESSAGE_TARGET_URB:
+	    format (file, " %d", inst->bits3.urb.offset);
+	    space = 1;
+	    err |= control (file, "urb swizzle", urb_swizzle,
+			    inst->bits3.urb.swizzle_control, &space);
+	    err |= control (file, "urb allocate", urb_allocate,
+			    inst->bits3.urb.allocate, &space);
+	    err |= control (file, "urb used", urb_used,
+			    inst->bits3.urb.used, &space);
+	    err |= control (file, "urb complete", urb_complete,
+			    inst->bits3.urb.complete, &space);
+	    break;
+	case BRW_MESSAGE_TARGET_THREAD_SPAWNER:
+	    break;
+	default:
+	    format (file, "unsupported target %d", inst->bits3.generic.msg_target);
+	    break;
+	}
+	if (space)
+	    string (file, " ");
+	format (file, "mlen %d",
+		inst->bits3.generic.msg_length);
+	format (file, " rlen %d",
+		inst->bits3.generic.response_length);
+    }
+    pad (file, 64);
+    if (inst->header.opcode != BRW_OPCODE_NOP) {
+	string (file, "{");
+	space = 1;
+	err |= control(file, "access mode", access_mode, inst->header.access_mode, &space);
+	err |= control (file, "mask control", mask_ctrl, inst->header.mask_control, &space);
+	err |= control (file, "dependency control", dep_ctrl, inst->header.dependency_control, &space);
+	err |= control (file, "compression control", compr_ctrl, inst->header.compression_control, &space);
+	err |= control (file, "thread control", thread_ctrl, inst->header.thread_control, &space);
+	if (inst->header.opcode == BRW_OPCODE_SEND)
+	    err |= control (file, "end of thread", end_of_thread,
+			    inst->bits3.generic.end_of_thread, &space);
+	if (space)
+	    string (file, " ");
+	string (file, "}");
+    }
+    string (file, ";");
+    newline (file);
+    return err;
+}
diff --git a/src/mesa/drivers/dri/i965/brw_draw.c b/src/mesa/drivers/dri/i965/brw_draw.c
index 5152c3f3a5f..682094ff139 100644
--- a/src/mesa/drivers/dri/i965/brw_draw.c
+++ b/src/mesa/drivers/dri/i965/brw_draw.c
@@ -141,6 +141,8 @@ static void brw_emit_prim(struct brw_context *brw,
 
    prim_packet.verts_per_instance = trim(prim->mode, prim->count);
    prim_packet.start_vert_location = prim->start;
+   if (prim->indexed)
+      prim_packet.start_vert_location += brw->ib.start_vertex_offset;
    prim_packet.instance_count = 1;
    prim_packet.start_instance_location = 0;
    prim_packet.base_vert_location = 0;
@@ -422,54 +424,31 @@ static GLboolean brw_try_draw_prims( GLcontext *ctx,
    return retval;
 }
 
-static GLboolean brw_need_rebase( GLcontext *ctx,
-				  const struct gl_client_array *arrays[],
-				  const struct _mesa_index_buffer *ib,
-				  GLuint min_index )
-{
-   if (min_index == 0) 
-      return GL_FALSE;
-
-   if (ib) {
-      if (!vbo_all_varyings_in_vbos(arrays))
-	 return GL_TRUE;
-      else
-	 return GL_FALSE;
-   }
-   else {
-      /* Hmm.  This isn't quite what I wanted.  BRW can actually
-       * handle the mixed case well enough that we shouldn't need to
-       * rebase.  However, it's probably not very common, nor hugely
-       * expensive to do it this way:
-       */
-      if (!vbo_all_varyings_in_vbos(arrays))
-	 return GL_TRUE;
-      else
-	 return GL_FALSE;
-   }
-}
-				  
-
 void brw_draw_prims( GLcontext *ctx,
 		     const struct gl_client_array *arrays[],
 		     const struct _mesa_prim *prim,
 		     GLuint nr_prims,
 		     const struct _mesa_index_buffer *ib,
+		     GLboolean index_bounds_valid,
 		     GLuint min_index,
 		     GLuint max_index )
 {
    GLboolean retval;
 
-   /* Decide if we want to rebase.  If so we end up recursing once
-    * only into this function.
-    */
-   if (brw_need_rebase( ctx, arrays, ib, min_index )) {
-      vbo_rebase_prims( ctx, arrays, 
-			prim, nr_prims, 
-			ib, min_index, max_index, 
-			brw_draw_prims );
-      
-      return;
+   if (!vbo_all_varyings_in_vbos(arrays)) {
+      if (!index_bounds_valid)
+	 vbo_get_minmax_index(ctx, prim, ib, &min_index, &max_index);
+
+      /* Decide if we want to rebase.  If so we end up recursing once
+       * only into this function.
+       */
+      if (min_index != 0) {
+	 vbo_rebase_prims(ctx, arrays,
+			  prim, nr_prims,
+			  ib, min_index, max_index,
+			  brw_draw_prims );
+	 return;
+      }
    }
 
    /* Make a first attempt at drawing:
diff --git a/src/mesa/drivers/dri/i965/brw_draw.h b/src/mesa/drivers/dri/i965/brw_draw.h
index 9aebbdb1b86..2a14db217fc 100644
--- a/src/mesa/drivers/dri/i965/brw_draw.h
+++ b/src/mesa/drivers/dri/i965/brw_draw.h
@@ -39,6 +39,7 @@ void brw_draw_prims( GLcontext *ctx,
 		     const struct _mesa_prim *prims,
 		     GLuint nr_prims,
 		     const struct _mesa_index_buffer *ib,
+		     GLboolean index_bounds_valid,
 		     GLuint min_index,
 		     GLuint max_index );
 
diff --git a/src/mesa/drivers/dri/i965/brw_draw_upload.c b/src/mesa/drivers/dri/i965/brw_draw_upload.c
index c29f1dd5c03..ab6b62812f1 100644
--- a/src/mesa/drivers/dri/i965/brw_draw_upload.c
+++ b/src/mesa/drivers/dri/i965/brw_draw_upload.c
@@ -350,9 +350,6 @@ static void brw_prepare_vertices(struct brw_context *brw)
    unsigned int min_index = brw->vb.min_index;
    unsigned int max_index = brw->vb.max_index;
 
-   struct brw_vertex_element *enabled[VERT_ATTRIB_MAX];
-   GLuint nr_enabled = 0;
-
    struct brw_vertex_element *upload[VERT_ATTRIB_MAX];
    GLuint nr_uploads = 0;
 
@@ -362,12 +359,13 @@ static void brw_prepare_vertices(struct brw_context *brw)
       _mesa_printf("%s %d..%d\n", __FUNCTION__, min_index, max_index);
 
    /* Accumulate the list of enabled arrays. */
+   brw->vb.nr_enabled = 0;
    while (vs_inputs) {
       GLuint i = _mesa_ffsll(vs_inputs) - 1;
       struct brw_vertex_element *input = &brw->vb.inputs[i];
 
       vs_inputs &= ~(1 << i);
-      enabled[nr_enabled++] = input;
+      brw->vb.enabled[brw->vb.nr_enabled++] = input;
    }
 
    /* XXX: In the rare cases where this happens we fallback all
@@ -376,16 +374,15 @@ static void brw_prepare_vertices(struct brw_context *brw)
     * cases with > 17 vertex attributes enabled, so it probably
     * isn't an issue at this point.
     */
-   if (nr_enabled >= BRW_VEP_MAX) {
+   if (brw->vb.nr_enabled >= BRW_VEP_MAX) {
       intel->Fallback = 1;
       return;
    }
 
-   for (i = 0; i < nr_enabled; i++) {
-      struct brw_vertex_element *input = enabled[i];
+   for (i = 0; i < brw->vb.nr_enabled; i++) {
+      struct brw_vertex_element *input = brw->vb.enabled[i];
 
       input->element_size = get_size(input->glarray->Type) * input->glarray->Size;
-      input->count = input->glarray->StrideB ? max_index + 1 - min_index : 1;
 
       if (input->glarray->BufferObj->Name != 0) {
 	 struct intel_buffer_object *intel_buffer =
@@ -398,7 +395,23 @@ static void brw_prepare_vertices(struct brw_context *brw)
 	 dri_bo_reference(input->bo);
 	 input->offset = (unsigned long)input->glarray->Ptr;
 	 input->stride = input->glarray->StrideB;
+	 input->count = input->glarray->_MaxElement;
+
+	 /* This is a common place to reach if the user mistakenly supplies
+	  * a pointer in place of a VBO offset.  If we just let it go through,
+	  * we may end up dereferencing a pointer beyond the bounds of the
+	  * GTT.  We would hope that the VBO's max_index would save us, but
+	  * Mesa appears to hand us min/max values not clipped to the
+	  * array object's _MaxElement, and _MaxElement frequently appears
+	  * to be wrong anyway.
+	  *
+	  * The VBO spec allows application termination in this case, and it's
+	  * probably a service to the poor programmer to do so rather than
+	  * trying to just not render.
+	  */
+	 assert(input->offset < input->bo->size);
       } else {
+	 input->count = input->glarray->StrideB ? max_index + 1 - min_index : 1;
 	 if (input->bo != NULL) {
 	    /* Already-uploaded vertex data is present from a previous
 	     * prepare_vertices, but we had to re-validate state due to
@@ -466,8 +479,8 @@ static void brw_prepare_vertices(struct brw_context *brw)
 
    brw_prepare_query_begin(brw);
 
-   for (i = 0; i < nr_enabled; i++) {
-      struct brw_vertex_element *input = enabled[i];
+   for (i = 0; i < brw->vb.nr_enabled; i++) {
+      struct brw_vertex_element *input = brw->vb.enabled[i];
 
       brw_add_validated_bo(brw, input->bo);
    }
@@ -477,34 +490,44 @@ static void brw_emit_vertices(struct brw_context *brw)
 {
    GLcontext *ctx = &brw->intel.ctx;
    struct intel_context *intel = intel_context(ctx);
-   GLbitfield vs_inputs = brw->vs.prog_data->inputs_read;
-   struct brw_vertex_element *enabled[VERT_ATTRIB_MAX];
    GLuint i;
-   GLuint nr_enabled = 0;
 
-  /* Accumulate the list of enabled arrays. */
-   while (vs_inputs) {
-      i = _mesa_ffsll(vs_inputs) - 1;
-      struct brw_vertex_element *input = &brw->vb.inputs[i];
+   brw_emit_query_begin(brw);
 
-      vs_inputs &= ~(1 << i);
-      enabled[nr_enabled++] = input;
+   /* If the VS doesn't read any inputs (calculating vertex position from
+    * a state variable for some reason, for example), emit a single pad
+    * VERTEX_ELEMENT struct and bail.
+    *
+    * The stale VB state stays in place, but they don't do anything unless
+    * a VE loads from them.
+    */
+   if (brw->vb.nr_enabled == 0) {
+      BEGIN_BATCH(3, IGNORE_CLIPRECTS);
+      OUT_BATCH((CMD_VERTEX_ELEMENT << 16) | 1);
+      OUT_BATCH((0 << BRW_VE0_INDEX_SHIFT) |
+		BRW_VE0_VALID |
+		(BRW_SURFACEFORMAT_R32G32B32A32_FLOAT << BRW_VE0_FORMAT_SHIFT) |
+		(0 << BRW_VE0_SRC_OFFSET_SHIFT));
+      OUT_BATCH((BRW_VE1_COMPONENT_STORE_0 << BRW_VE1_COMPONENT_0_SHIFT) |
+		(BRW_VE1_COMPONENT_STORE_0 << BRW_VE1_COMPONENT_1_SHIFT) |
+		(BRW_VE1_COMPONENT_STORE_0 << BRW_VE1_COMPONENT_2_SHIFT) |
+		(BRW_VE1_COMPONENT_STORE_1_FLT << BRW_VE1_COMPONENT_3_SHIFT));
+      ADVANCE_BATCH();
+      return;
    }
 
-   brw_emit_query_begin(brw);
-
    /* Now emit VB and VEP state packets.
     *
     * This still defines a hardware VB for each input, even if they
     * are interleaved or from the same VBO.  TBD if this makes a
     * performance difference.
     */
-   BEGIN_BATCH(1 + nr_enabled * 4, IGNORE_CLIPRECTS);
+   BEGIN_BATCH(1 + brw->vb.nr_enabled * 4, IGNORE_CLIPRECTS);
    OUT_BATCH((CMD_VERTEX_BUFFER << 16) |
-	     ((1 + nr_enabled * 4) - 2));
+	     ((1 + brw->vb.nr_enabled * 4) - 2));
 
-   for (i = 0; i < nr_enabled; i++) {
-      struct brw_vertex_element *input = enabled[i];
+   for (i = 0; i < brw->vb.nr_enabled; i++) {
+      struct brw_vertex_element *input = brw->vb.enabled[i];
 
       OUT_BATCH((i << BRW_VB0_INDEX_SHIFT) |
 		BRW_VB0_ACCESS_VERTEXDATA |
@@ -524,15 +547,15 @@ static void brw_emit_vertices(struct brw_context *brw)
                         input->offset + input->element_size);
           }
       } else
-          OUT_BATCH(brw->vb.max_index);
+          OUT_BATCH(input->count);
       OUT_BATCH(0); /* Instance data step rate */
    }
    ADVANCE_BATCH();
 
-   BEGIN_BATCH(1 + nr_enabled * 2, IGNORE_CLIPRECTS);
-   OUT_BATCH((CMD_VERTEX_ELEMENT << 16) | ((1 + nr_enabled * 2) - 2));
-   for (i = 0; i < nr_enabled; i++) {
-      struct brw_vertex_element *input = enabled[i];
+   BEGIN_BATCH(1 + brw->vb.nr_enabled * 2, IGNORE_CLIPRECTS);
+   OUT_BATCH((CMD_VERTEX_ELEMENT << 16) | ((1 + brw->vb.nr_enabled * 2) - 2));
+   for (i = 0; i < brw->vb.nr_enabled; i++) {
+      struct brw_vertex_element *input = brw->vb.enabled[i];
       uint32_t format = get_surface_type(input->glarray->Type,
 					 input->glarray->Size,
 					 input->glarray->Format,
@@ -589,17 +612,20 @@ static void brw_prepare_indices(struct brw_context *brw)
    dri_bo *bo = NULL;
    struct gl_buffer_object *bufferobj;
    GLuint offset;
+   GLuint ib_type_size;
 
    if (index_buffer == NULL)
       return;
 
-   ib_size = get_size(index_buffer->type) * index_buffer->count;
+   ib_type_size = get_size(index_buffer->type);
+   ib_size = ib_type_size * index_buffer->count;
    bufferobj = index_buffer->obj;;
 
    /* Turn into a proper VBO:
     */
    if (!bufferobj->Name) {
-     
+      brw->ib.start_vertex_offset = 0;
+
       /* Get new bufferobj, offset:
        */
       get_space(brw, ib_size, &bo, &offset);
@@ -615,6 +641,7 @@ static void brw_prepare_indices(struct brw_context *brw)
       }
    } else {
       offset = (GLuint) (unsigned long) index_buffer->ptr;
+      brw->ib.start_vertex_offset = 0;
 
       /* If the index buffer isn't aligned to its element size, we have to
        * rebase it into a temporary.
@@ -635,39 +662,62 @@ static void brw_prepare_indices(struct brw_context *brw)
 	  bo = intel_bufferobj_buffer(intel, intel_buffer_object(bufferobj),
 				      INTEL_READ);
 	  dri_bo_reference(bo);
+
+	  /* Use CMD_3D_PRIM's start_vertex_offset to avoid re-uploading
+	   * the index buffer state when we're just moving the start index
+	   * of our drawing.
+	   */
+	  brw->ib.start_vertex_offset = offset / ib_type_size;
+	  offset = 0;
+	  ib_size = bo->size;
        }
    }
 
-   dri_bo_unreference(brw->ib.bo);
-   brw->ib.bo = bo;
-   brw->ib.offset = offset;
+   if (brw->ib.bo != bo ||
+       brw->ib.offset != offset ||
+       brw->ib.size != ib_size)
+   {
+      drm_intel_bo_unreference(brw->ib.bo);
+      brw->ib.bo = bo;
+      brw->ib.offset = offset;
+      brw->ib.size = ib_size;
+
+      brw->state.dirty.brw |= BRW_NEW_INDEX_BUFFER;
+   } else {
+      drm_intel_bo_unreference(bo);
+   }
 
    brw_add_validated_bo(brw, brw->ib.bo);
 }
 
-static void brw_emit_indices(struct brw_context *brw)
+const struct brw_tracked_state brw_indices = {
+   .dirty = {
+      .mesa = 0,
+      .brw = BRW_NEW_INDICES,
+      .cache = 0,
+   },
+   .prepare = brw_prepare_indices,
+};
+
+static void brw_emit_index_buffer(struct brw_context *brw)
 {
    struct intel_context *intel = &brw->intel;
    const struct _mesa_index_buffer *index_buffer = brw->ib.ib;
-   GLuint ib_size;
 
    if (index_buffer == NULL)
       return;
 
-   ib_size = get_size(index_buffer->type) * index_buffer->count - 1;
-
    /* Emit the indexbuffer packet:
     */
    {
       struct brw_indexbuffer ib;
 
       memset(&ib, 0, sizeof(ib));
-   
+
       ib.header.bits.opcode = CMD_INDEX_BUFFER;
       ib.header.bits.length = sizeof(ib)/4 - 2;
       ib.header.bits.index_format = get_index_type(index_buffer->type);
       ib.header.bits.cut_index_enable = 0;
-   
 
       BEGIN_BATCH(4, IGNORE_CLIPRECTS);
       OUT_BATCH( ib.header.dword );
@@ -676,18 +726,17 @@ static void brw_emit_indices(struct brw_context *brw)
 		brw->ib.offset);
       OUT_RELOC(brw->ib.bo,
 		I915_GEM_DOMAIN_VERTEX, 0,
-		brw->ib.offset + ib_size);
+		brw->ib.offset + brw->ib.size);
       OUT_BATCH( 0 );
       ADVANCE_BATCH();
    }
 }
 
-const struct brw_tracked_state brw_indices = {
+const struct brw_tracked_state brw_index_buffer = {
    .dirty = {
       .mesa = 0,
-      .brw = BRW_NEW_BATCH | BRW_NEW_INDICES,
+      .brw = BRW_NEW_BATCH | BRW_NEW_INDEX_BUFFER,
       .cache = 0,
    },
-   .prepare = brw_prepare_indices,
-   .emit = brw_emit_indices,
+   .emit = brw_emit_index_buffer,
 };
diff --git a/src/mesa/drivers/dri/i965/brw_eu.c b/src/mesa/drivers/dri/i965/brw_eu.c
index c53efba5991..1df561386e6 100644
--- a/src/mesa/drivers/dri/i965/brw_eu.c
+++ b/src/mesa/drivers/dri/i965/brw_eu.c
@@ -62,7 +62,7 @@ void brw_set_predicate_control( struct brw_compile *p, GLuint pc )
 
 void brw_set_conditionalmod( struct brw_compile *p, GLuint conditional )
 {
-   p->current->header.destreg__conditonalmod = conditional;
+   p->current->header.destreg__conditionalmod = conditional;
 }
 
 void brw_set_access_mode( struct brw_compile *p, GLuint access_mode )
diff --git a/src/mesa/drivers/dri/i965/brw_eu_emit.c b/src/mesa/drivers/dri/i965/brw_eu_emit.c
index 2412014248c..241cdc33f86 100644
--- a/src/mesa/drivers/dri/i965/brw_eu_emit.c
+++ b/src/mesa/drivers/dri/i965/brw_eu_emit.c
@@ -481,8 +481,8 @@ static struct brw_instruction *next_insn( struct brw_compile *p,
    /* Reset this one-shot flag: 
     */
 
-   if (p->current->header.destreg__conditonalmod) {
-      p->current->header.destreg__conditonalmod = 0;   
+   if (p->current->header.destreg__conditionalmod) {
+      p->current->header.destreg__conditionalmod = 0;
       p->current->header.predicate_control = BRW_PREDICATE_NORMAL;
    }
 
@@ -679,7 +679,7 @@ struct brw_instruction *brw_ELSE(struct brw_compile *p,
       assert(if_insn->header.opcode == BRW_OPCODE_IF);
 
       if_insn->bits3.if_else.jump_count = br * (insn - if_insn);
-      if_insn->bits3.if_else.pop_count = 1;
+      if_insn->bits3.if_else.pop_count = 0;
       if_insn->bits3.if_else.pad0 = 0;
    }
 
@@ -871,7 +871,7 @@ void brw_CMP(struct brw_compile *p,
 {
    struct brw_instruction *insn = next_insn(p, BRW_OPCODE_CMP);
 
-   insn->header.destreg__conditonalmod = conditional;
+   insn->header.destreg__conditionalmod = conditional;
    brw_set_dest(insn, dest);
    brw_set_src0(insn, src0);
    brw_set_src1(insn, src1);
@@ -915,7 +915,7 @@ void brw_math( struct brw_compile *p,
     * instructions.
     */
    insn->header.predicate_control = 0; 
-   insn->header.destreg__conditonalmod = msg_reg_nr;
+   insn->header.destreg__conditionalmod = msg_reg_nr;
 
    brw_set_dest(insn, dest);
    brw_set_src0(insn, src);
@@ -952,7 +952,7 @@ void brw_math_16( struct brw_compile *p,
    brw_set_compression_control(p, BRW_COMPRESSION_NONE);
 
    insn = next_insn(p, BRW_OPCODE_SEND);
-   insn->header.destreg__conditonalmod = msg_reg_nr;
+   insn->header.destreg__conditionalmod = msg_reg_nr;
 
    brw_set_dest(insn, dest);
    brw_set_src0(insn, src);
@@ -969,7 +969,7 @@ void brw_math_16( struct brw_compile *p,
     */
    insn = next_insn(p, BRW_OPCODE_SEND);
    insn->header.compression_control = BRW_COMPRESSION_2NDHALF;
-   insn->header.destreg__conditonalmod = msg_reg_nr+1;
+   insn->header.destreg__conditionalmod = msg_reg_nr+1;
 
    brw_set_dest(insn, offset(dest,1));
    brw_set_src0(insn, src);
@@ -1016,7 +1016,7 @@ void brw_dp_WRITE_16( struct brw_compile *p,
    
       insn->header.predicate_control = 0; /* XXX */
       insn->header.compression_control = BRW_COMPRESSION_NONE; 
-      insn->header.destreg__conditonalmod = msg_reg_nr;
+      insn->header.destreg__conditionalmod = msg_reg_nr;
   
       brw_set_dest(insn, dest);
       brw_set_src0(insn, src);
@@ -1062,7 +1062,7 @@ void brw_dp_READ_16( struct brw_compile *p,
    
       insn->header.predicate_control = 0; /* XXX */
       insn->header.compression_control = BRW_COMPRESSION_NONE; 
-      insn->header.destreg__conditonalmod = msg_reg_nr;
+      insn->header.destreg__conditionalmod = msg_reg_nr;
   
       brw_set_dest(insn, dest);	/* UW? */
       brw_set_src0(insn, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW));
@@ -1116,7 +1116,7 @@ void brw_dp_READ_4( struct brw_compile *p,
    
       insn->header.predicate_control = BRW_PREDICATE_NONE;
       insn->header.compression_control = BRW_COMPRESSION_NONE; 
-      insn->header.destreg__conditonalmod = msg_reg_nr;
+      insn->header.destreg__conditionalmod = msg_reg_nr;
       insn->header.mask_control = BRW_MASK_DISABLE;
   
       /* cast dest to a uword[8] vector */
@@ -1190,7 +1190,7 @@ void brw_dp_READ_4_vs(struct brw_compile *p,
    
       insn->header.predicate_control = BRW_PREDICATE_NONE;
       insn->header.compression_control = BRW_COMPRESSION_NONE; 
-      insn->header.destreg__conditonalmod = msg_reg_nr;
+      insn->header.destreg__conditionalmod = msg_reg_nr;
       insn->header.mask_control = BRW_MASK_DISABLE;
       /*insn->header.access_mode = BRW_ALIGN_16;*/
   
@@ -1224,7 +1224,7 @@ void brw_fb_WRITE(struct brw_compile *p,
    
    insn->header.predicate_control = 0; /* XXX */
    insn->header.compression_control = BRW_COMPRESSION_NONE; 
-   insn->header.destreg__conditonalmod = msg_reg_nr;
+   insn->header.destreg__conditionalmod = msg_reg_nr;
   
    brw_set_dest(insn, dest);
    brw_set_src0(insn, src0);
@@ -1322,7 +1322,7 @@ void brw_SAMPLE(struct brw_compile *p,
    
       insn->header.predicate_control = 0; /* XXX */
       insn->header.compression_control = BRW_COMPRESSION_NONE;
-      insn->header.destreg__conditonalmod = msg_reg_nr;
+      insn->header.destreg__conditionalmod = msg_reg_nr;
 
       brw_set_dest(insn, dest);
       brw_set_src0(insn, src0);
@@ -1375,7 +1375,7 @@ void brw_urb_WRITE(struct brw_compile *p,
    brw_set_src0(insn, src0);
    brw_set_src1(insn, brw_imm_d(0));
 
-   insn->header.destreg__conditonalmod = msg_reg_nr;
+   insn->header.destreg__conditionalmod = msg_reg_nr;
 
    brw_set_urb_message(p->brw,
 		       insn,
@@ -1410,7 +1410,7 @@ void brw_ff_sync(struct brw_compile *p,
    brw_set_src0(insn, src0);
    brw_set_src1(insn, brw_imm_d(0));
 
-   insn->header.destreg__conditonalmod = msg_reg_nr;
+   insn->header.destreg__conditionalmod = msg_reg_nr;
 
    brw_set_ff_sync_message(p->brw,
 		       insn,
diff --git a/src/mesa/drivers/dri/i965/brw_gs_emit.c b/src/mesa/drivers/dri/i965/brw_gs_emit.c
index 980eac7646d..a9b2aa2eace 100644
--- a/src/mesa/drivers/dri/i965/brw_gs_emit.c
+++ b/src/mesa/drivers/dri/i965/brw_gs_emit.c
@@ -101,7 +101,7 @@ static void brw_gs_emit_vue(struct brw_gs_compile *c,
 		 BRW_URB_SWIZZLE_NONE);
 }
 
-void brw_gs_ff_sync(struct brw_gs_compile *c, int num_prim)
+static void brw_gs_ff_sync(struct brw_gs_compile *c, int num_prim)
 {
 	struct brw_compile *p = &c->func;
 	brw_MOV(p, get_element_ud(c->reg.R0, 1), brw_imm_ud(num_prim));
diff --git a/src/mesa/drivers/dri/i965/brw_sf_state.c b/src/mesa/drivers/dri/i965/brw_sf_state.c
index b5f6371c82c..bc0f0760738 100644
--- a/src/mesa/drivers/dri/i965/brw_sf_state.c
+++ b/src/mesa/drivers/dri/i965/brw_sf_state.c
@@ -113,7 +113,7 @@ struct brw_sf_unit_key {
 
    unsigned int nr_urb_entries, urb_size, sfsize;
 
-   GLenum front_face, cull_face;
+   GLenum front_face, cull_face, provoking_vertex;
    unsigned scissor:1;
    unsigned line_smooth:1;
    unsigned point_sprite:1;
@@ -153,6 +153,9 @@ sf_unit_populate_key(struct brw_context *brw, struct brw_sf_unit_key *key)
    key->point_size = CLAMP(ctx->Point.Size, ctx->Point.MinSize, ctx->Point.MaxSize);
    key->point_attenuated = ctx->Point._Attenuated;
 
+   /* _NEW_LIGHT */
+   key->provoking_vertex = ctx->Light.ProvokingVertex;
+
    key->render_to_fbo = brw->intel.ctx.DrawBuffer->Name != 0;
 }
 
@@ -284,9 +287,15 @@ sf_unit_create_from_key(struct brw_context *brw, struct brw_sf_unit_key *key,
 
    /* might be BRW_NEW_PRIMITIVE if we have to adjust pv for polygons:
     */
-   sf.sf7.trifan_pv = 2;
-   sf.sf7.linestrip_pv = 1;
-   sf.sf7.tristrip_pv = 2;
+   if (key->provoking_vertex == GL_LAST_VERTEX_CONVENTION) {
+      sf.sf7.trifan_pv = 2;
+      sf.sf7.linestrip_pv = 1;
+      sf.sf7.tristrip_pv = 2;
+   } else {
+      sf.sf7.trifan_pv = 1;
+      sf.sf7.linestrip_pv = 0;
+      sf.sf7.tristrip_pv = 0;
+   }
    sf.sf7.line_last_pixel_enable = 0;
 
    /* Set bias for OpenGL rasterization rules:
@@ -300,6 +309,9 @@ sf_unit_create_from_key(struct brw_context *brw, struct brw_sf_unit_key *key,
 			 &sf, sizeof(sf),
 			 NULL, NULL);
 
+   /* STATE_PREFETCH command description describes this state as being
+    * something loaded through the GPE (L2 ISC), so it's INSTRUCTION domain.
+    */
    /* Emit SF program relocation */
    dri_bo_emit_reloc(bo,
 		     I915_GEM_DOMAIN_INSTRUCTION, 0,
@@ -340,6 +352,7 @@ static void upload_sf_unit( struct brw_context *brw )
 const struct brw_tracked_state brw_sf_unit = {
    .dirty = {
       .mesa  = (_NEW_POLYGON | 
+		_NEW_LIGHT |
 		_NEW_LINE | 
 		_NEW_POINT | 
 		_NEW_SCISSOR |
diff --git a/src/mesa/drivers/dri/i965/brw_state.h b/src/mesa/drivers/dri/i965/brw_state.h
index bf9f6cae55e..78572356a3d 100644
--- a/src/mesa/drivers/dri/i965/brw_state.h
+++ b/src/mesa/drivers/dri/i965/brw_state.h
@@ -92,6 +92,7 @@ const struct brw_tracked_state brw_clear_batch_cache;
 const struct brw_tracked_state brw_drawing_rect;
 const struct brw_tracked_state brw_indices;
 const struct brw_tracked_state brw_vertices;
+const struct brw_tracked_state brw_index_buffer;
 
 /**
  * Use same key for WM and VS surfaces.
diff --git a/src/mesa/drivers/dri/i965/brw_state_upload.c b/src/mesa/drivers/dri/i965/brw_state_upload.c
index 38d9dd8991e..95d42d2dcc5 100644
--- a/src/mesa/drivers/dri/i965/brw_state_upload.c
+++ b/src/mesa/drivers/dri/i965/brw_state_upload.c
@@ -94,6 +94,7 @@ const struct brw_tracked_state *atoms[] =
 
    &brw_drawing_rect,
    &brw_indices,
+   &brw_index_buffer,
    &brw_vertices,
 
    &brw_constant_buffer
@@ -208,6 +209,7 @@ static struct dirty_bit_map brw_bits[] = {
    DEFINE_BIT(BRW_NEW_PSP),
    DEFINE_BIT(BRW_NEW_FENCE),
    DEFINE_BIT(BRW_NEW_INDICES),
+   DEFINE_BIT(BRW_NEW_INDEX_BUFFER),
    DEFINE_BIT(BRW_NEW_VERTICES),
    DEFINE_BIT(BRW_NEW_BATCH),
    DEFINE_BIT(BRW_NEW_DEPTH_BUFFER),
diff --git a/src/mesa/drivers/dri/i965/brw_structs.h b/src/mesa/drivers/dri/i965/brw_structs.h
index 8ba7eb27b36..a6de09207b7 100644
--- a/src/mesa/drivers/dri/i965/brw_structs.h
+++ b/src/mesa/drivers/dri/i965/brw_structs.h
@@ -1200,7 +1200,7 @@ struct brw_instruction
       GLuint predicate_control:4;
       GLuint predicate_inverse:1;
       GLuint execution_size:3;
-      GLuint destreg__conditonalmod:4; /* destreg - send, conditionalmod - others */
+      GLuint destreg__conditionalmod:4; /* destreg - send, conditionalmod - others */
       GLuint pad0:2;
       GLuint debug_control:1;
       GLuint saturate:1;
@@ -1228,7 +1228,9 @@ struct brw_instruction
 	 GLuint dest_reg_type:3;
 	 GLuint src0_reg_file:2;
 	 GLuint src0_reg_type:3;
-	 GLuint pad:6;
+	 GLuint src1_reg_file:2;        /* 0x00000c00 */
+	 GLuint src1_reg_type:3;        /* 0x00007000 */
+	 GLuint pad:1;
 	 GLint dest_indirect_offset:10;	/* offset against the deref'd address reg */
 	 GLuint dest_subreg_nr:3; /* subnr for the address reg a0.x */
 	 GLuint dest_horiz_stride:2;
@@ -1243,7 +1245,7 @@ struct brw_instruction
 	 GLuint src0_reg_type:3;
 	 GLuint src1_reg_file:2;
 	 GLuint src1_reg_type:3;
-	 GLuint pad0:1;
+	 GLuint pad:1;
 	 GLuint dest_writemask:4;
 	 GLuint dest_subreg_nr:1;
 	 GLuint dest_reg_nr:8;
@@ -1348,7 +1350,7 @@ struct brw_instruction
 	 GLuint src1_reg_nr:8;
 	 GLuint src1_abs:1;
 	 GLuint src1_negate:1;
-	 GLuint pad:1;
+	 GLuint src1_address_mode:1;
 	 GLuint src1_horiz_stride:2;
 	 GLuint src1_width:3;
 	 GLuint src1_vert_stride:4;
@@ -1363,7 +1365,7 @@ struct brw_instruction
 	 GLuint src1_reg_nr:8;
 	 GLuint src1_abs:1;
 	 GLuint src1_negate:1;
-	 GLuint pad0:1;
+	 GLuint src1_address_mode:1;
 	 GLuint src1_swz_z:2;
 	 GLuint src1_swz_w:2;
 	 GLuint pad1:1;
@@ -1377,7 +1379,7 @@ struct brw_instruction
 	 GLuint src1_subreg_nr:3;
 	 GLuint src1_abs:1;
 	 GLuint src1_negate:1;
-	 GLuint pad0:1;
+	 GLuint src1_address_mode:1;
 	 GLuint src1_horiz_stride:2;
 	 GLuint src1_width:3;
 	 GLuint src1_vert_stride:4;
@@ -1565,6 +1567,7 @@ struct brw_instruction
 
       GLint d;
       GLuint ud;
+      float f;
    } bits3;
 };
 
diff --git a/src/mesa/drivers/dri/i965/brw_tex_layout.c b/src/mesa/drivers/dri/i965/brw_tex_layout.c
index 7f9b2535340..1d2e953eb1e 100644
--- a/src/mesa/drivers/dri/i965/brw_tex_layout.c
+++ b/src/mesa/drivers/dri/i965/brw_tex_layout.c
@@ -65,11 +65,6 @@ GLboolean brw_miptree_layout(struct intel_context *intel,
 
           if (mt->compressed) {
               mt->pitch = ALIGN(mt->width0, align_w);
-              qpitch = (y_pitch + ALIGN(minify(y_pitch), align_h) + 11 * align_h) / 4 * mt->pitch * mt->cpp;
-              mt->total_height = (y_pitch + ALIGN(minify(y_pitch), align_h) + 11 * align_h) / 4 * 6;
-          } else {
-              qpitch = (y_pitch + ALIGN(minify(y_pitch), align_h) + 11 * align_h) * mt->pitch * mt->cpp;
-              mt->total_height = (y_pitch + ALIGN(minify(y_pitch), align_h) + 11 * align_h) * 6;
           }
 
           if (mt->first_level != mt->last_level) {
@@ -90,6 +85,14 @@ GLboolean brw_miptree_layout(struct intel_context *intel,
 
           mt->pitch = intel_miptree_pitch_align(intel, mt, tiling, mt->pitch);
 
+          if (mt->compressed) {
+              qpitch = (y_pitch + ALIGN(minify(y_pitch), align_h) + 11 * align_h) / 4 * mt->pitch * mt->cpp;
+              mt->total_height = (y_pitch + ALIGN(minify(y_pitch), align_h) + 11 * align_h) / 4 * 6;
+          } else {
+              qpitch = (y_pitch + ALIGN(minify(y_pitch), align_h) + 11 * align_h) * mt->pitch * mt->cpp;
+              mt->total_height = (y_pitch + ALIGN(minify(y_pitch), align_h) + 11 * align_h) * 6;
+          }
+
           for (level = mt->first_level; level <= mt->last_level; level++) {
               GLuint img_height;
               GLuint nr_images = 6;
diff --git a/src/mesa/drivers/dri/i965/brw_vs_emit.c b/src/mesa/drivers/dri/i965/brw_vs_emit.c
index 514f15d5e3a..83167b9258e 100644
--- a/src/mesa/drivers/dri/i965/brw_vs_emit.c
+++ b/src/mesa/drivers/dri/i965/brw_vs_emit.c
@@ -68,6 +68,7 @@ static void release_tmps( struct brw_vs_compile *c )
 static void brw_vs_alloc_regs( struct brw_vs_compile *c )
 {
    GLuint i, reg = 0, mrf;
+   int attributes_in_vue;
 
    /* Determine whether to use a real constant buffer or use a block
     * of GRF registers for constants.  The later is faster but only
@@ -128,6 +129,11 @@ static void brw_vs_alloc_regs( struct brw_vs_compile *c )
 	 reg++;
       }
    }
+   /* If there are no inputs, we'll still be reading one attribute's worth
+    * because it's required -- see urb_read_length setting.
+    */
+   if (c->nr_inputs == 0)
+      reg++;
 
    /* Allocate outputs.  The non-position outputs go straight into message regs.
     */
@@ -220,11 +226,22 @@ static void brw_vs_alloc_regs( struct brw_vs_compile *c )
     * vertex urb, so is half the amount:
     */
    c->prog_data.urb_read_length = (c->nr_inputs + 1) / 2;
+   /* Setting this field to 0 leads to undefined behavior according to the
+    * the VS_STATE docs.  Our VUEs will always have at least one attribute
+    * sitting in them, even if it's padding.
+    */
+   if (c->prog_data.urb_read_length == 0)
+      c->prog_data.urb_read_length = 1;
+
+   /* The VS VUEs are shared by VF (outputting our inputs) and VS, so size
+    * them to fit the biggest thing they need to.
+    */
+   attributes_in_vue = MAX2(c->nr_outputs, c->nr_inputs);
 
    if (BRW_IS_IGDNG(c->func.brw))
-       c->prog_data.urb_entry_size = (c->nr_outputs + 6 + 3) / 4;
+       c->prog_data.urb_entry_size = (attributes_in_vue + 6 + 3) / 4;
    else
-       c->prog_data.urb_entry_size = (c->nr_outputs + 2 + 3) / 4;
+       c->prog_data.urb_entry_size = (attributes_in_vue + 2 + 3) / 4;
 
    c->prog_data.total_grf = reg;
 
@@ -1245,9 +1262,30 @@ post_vs_emit( struct brw_vs_compile *c,
 
    /* patch up the END code to jump past subroutines, etc */
    offset = last_inst - end_inst;
-   brw_set_src1(end_inst, brw_imm_d(offset * 16));
+   if (offset > 1) {
+      brw_set_src1(end_inst, brw_imm_d(offset * 16));
+   } else {
+      end_inst->header.opcode = BRW_OPCODE_NOP;
+   }
 }
 
+static uint32_t
+get_predicate(uint32_t swizzle)
+{
+   switch (swizzle) {
+   case SWIZZLE_XXXX:
+      return BRW_PREDICATE_ALIGN16_REPLICATE_X;
+   case SWIZZLE_YYYY:
+      return BRW_PREDICATE_ALIGN16_REPLICATE_Y;
+   case SWIZZLE_ZZZZ:
+      return BRW_PREDICATE_ALIGN16_REPLICATE_Z;
+   case SWIZZLE_WWWW:
+      return BRW_PREDICATE_ALIGN16_REPLICATE_W;
+   default:
+      _mesa_problem(NULL, "Unexpected predicate: 0x%08x\n", swizzle);
+      return BRW_PREDICATE_NORMAL;
+   }
+}
 
 /* Emit the vertex program instructions here.
  */
@@ -1266,7 +1304,7 @@ void brw_vs_emit(struct brw_vs_compile *c )
    GLuint file;
 
    if (INTEL_DEBUG & DEBUG_VS) {
-      _mesa_printf("vs-emit:\n");
+      _mesa_printf("vs-mesa:\n");
       _mesa_print_program(&c->vp->program.Base); 
       _mesa_printf("\n");
    }
@@ -1453,7 +1491,10 @@ void brw_vs_emit(struct brw_vs_compile *c )
 	 break;
       case OPCODE_IF:
 	 assert(if_depth < MAX_IF_DEPTH);
-         if_inst[if_depth++] = brw_IF(p, BRW_EXECUTE_8);
+	 if_inst[if_depth] = brw_IF(p, BRW_EXECUTE_8);
+	 if_inst[if_depth]->header.predicate_control =
+	    get_predicate(inst->DstReg.CondSwizzle);
+	 if_depth++;
 	 break;
       case OPCODE_ELSE:
 	 if_inst[if_depth-1] = brw_ELSE(p, if_inst[if_depth-1]);
@@ -1541,6 +1582,19 @@ void brw_vs_emit(struct brw_vs_compile *c )
 				    "unknown");
       }
 
+      /* Set the predication update on the last instruction of the native
+       * instruction sequence.
+       *
+       * This would be problematic if it was set on a math instruction,
+       * but that shouldn't be the case with the current GLSL compiler.
+       */
+      if (inst->CondUpdate) {
+	 struct brw_instruction *hw_insn = &p->store[p->nr_insn - 1];
+
+	 assert(hw_insn->header.destreg__conditionalmod == 0);
+	 hw_insn->header.destreg__conditionalmod = BRW_CONDITIONAL_NZ;
+      }
+
       if ((inst->DstReg.File == PROGRAM_OUTPUT)
           && (inst->DstReg.Index != VERT_RESULT_HPOS)
           && c->output_regs[inst->DstReg.Index].used_in_src) {
@@ -1578,4 +1632,13 @@ void brw_vs_emit(struct brw_vs_compile *c )
    emit_vertex_write(c);
 
    post_vs_emit(c, end_inst, last_inst);
+
+   if (INTEL_DEBUG & DEBUG_VS) {
+      int i;
+
+      _mesa_printf("vs-native:\n");
+      for (i = 0; i < p->nr_insn; i++)
+	 brw_disasm(stderr, &p->store[i]);
+      _mesa_printf("\n");
+   }
 }
diff --git a/src/mesa/drivers/dri/i965/brw_wm.c b/src/mesa/drivers/dri/i965/brw_wm.c
index 14e05be4f6c..2292de94c44 100644
--- a/src/mesa/drivers/dri/i965/brw_wm.c
+++ b/src/mesa/drivers/dri/i965/brw_wm.c
@@ -41,13 +41,13 @@ GLuint brw_wm_nr_args( GLuint opcode )
 {
    switch (opcode) {
    case WM_FRONTFACING:
-      return 0;
    case WM_PIXELXY:
+      return 0;
    case WM_CINTERP:
    case WM_WPOSXY:
+   case WM_DELTAXY:
       return 1;
    case WM_LINTERP:
-   case WM_DELTAXY:
    case WM_PIXELW:
       return 2;
    case WM_FB_WRITE:
@@ -171,9 +171,11 @@ static void do_wm_prog( struct brw_context *brw,
     * differently from "simple" shaders.
     */
    if (fp->isGLSL) {
+      c->dispatch_width = 8;
       brw_wm_glsl_emit(brw, c);
    }
    else {
+      c->dispatch_width = 16;
       brw_wm_non_glsl_emit(brw, c);
    }
 
@@ -202,6 +204,7 @@ static void brw_wm_populate_key( struct brw_context *brw,
    /* BRW_NEW_FRAGMENT_PROGRAM */
    const struct brw_fragment_program *fp = 
       (struct brw_fragment_program *)brw->fragment_program;
+   GLboolean uses_depth = (fp->program.Base.InputsRead & (1 << FRAG_ATTRIB_WPOS)) != 0;
    GLuint lookup = 0;
    GLuint line_aa;
    GLuint i;
@@ -263,6 +266,7 @@ static void brw_wm_populate_key( struct brw_context *brw,
 	 
    brw_wm_lookup_iz(line_aa,
 		    lookup,
+		    uses_depth,
 		    key);
 
 
diff --git a/src/mesa/drivers/dri/i965/brw_wm.h b/src/mesa/drivers/dri/i965/brw_wm.h
index ba497432c60..ae98b5492db 100644
--- a/src/mesa/drivers/dri/i965/brw_wm.h
+++ b/src/mesa/drivers/dri/i965/brw_wm.h
@@ -260,6 +260,7 @@ struct brw_wm_compile {
    GLuint tmp_index;
    GLuint tmp_max;
    GLuint subroutines[BRW_WM_MAX_SUBROUTINE];
+   GLuint dispatch_width;
 
    /** we may need up to 3 constants per instruction (if use_const_buffer) */
    struct {
@@ -292,6 +293,7 @@ void brw_wm_print_program( struct brw_wm_compile *c,
 
 void brw_wm_lookup_iz( GLuint line_aa,
 		       GLuint lookup,
+		       GLboolean ps_uses_depth,
 		       struct brw_wm_prog_key *key );
 
 GLboolean brw_wm_is_glsl(const struct gl_fragment_program *fp);
diff --git a/src/mesa/drivers/dri/i965/brw_wm_emit.c b/src/mesa/drivers/dri/i965/brw_wm_emit.c
index 9f82916c025..b3cf524c63e 100644
--- a/src/mesa/drivers/dri/i965/brw_wm_emit.c
+++ b/src/mesa/drivers/dri/i965/brw_wm_emit.c
@@ -65,8 +65,7 @@ static INLINE struct brw_reg sechalf( struct brw_reg reg )
 
 static void emit_pixel_xy(struct brw_compile *p,
 			  const struct brw_reg *dst,
-			  GLuint mask,
-			  const struct brw_reg *arg0)
+			  GLuint mask)
 {
    struct brw_reg r1 = brw_vec1_grf(1, 0);
    struct brw_reg r1_uw = retype(r1, BRW_REGISTER_TYPE_UW);
@@ -98,8 +97,7 @@ static void emit_pixel_xy(struct brw_compile *p,
 static void emit_delta_xy(struct brw_compile *p,
 			  const struct brw_reg *dst,
 			  GLuint mask,
-			  const struct brw_reg *arg0,
-			  const struct brw_reg *arg1)
+			  const struct brw_reg *arg0)
 {
    struct brw_reg r1 = brw_vec1_grf(1, 0);
 
@@ -545,16 +543,18 @@ static void emit_dp3( struct brw_compile *p,
 		      const struct brw_reg *arg0,
 		      const struct brw_reg *arg1 )
 {
+   int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
+
    if (!(mask & WRITEMASK_XYZW))
       return; /* Do not emit dead code */
 
-   assert((mask & WRITEMASK_XYZW) == WRITEMASK_X);
+   assert(is_power_of_two(mask & WRITEMASK_XYZW));
 
    brw_MUL(p, brw_null_reg(), arg0[0], arg1[0]);
    brw_MAC(p, brw_null_reg(), arg0[1], arg1[1]);
 
    brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
-   brw_MAC(p, dst[0], arg0[2], arg1[2]);
+   brw_MAC(p, dst[dst_chan], arg0[2], arg1[2]);
    brw_set_saturate(p, 0);
 }
 
@@ -565,17 +565,19 @@ static void emit_dp4( struct brw_compile *p,
 		      const struct brw_reg *arg0,
 		      const struct brw_reg *arg1 )
 {
+   int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
+
    if (!(mask & WRITEMASK_XYZW))
       return; /* Do not emit dead code */
 
-   assert((mask & WRITEMASK_XYZW) == WRITEMASK_X);
+   assert(is_power_of_two(mask & WRITEMASK_XYZW));
 
    brw_MUL(p, brw_null_reg(), arg0[0], arg1[0]);
    brw_MAC(p, brw_null_reg(), arg0[1], arg1[1]);
    brw_MAC(p, brw_null_reg(), arg0[2], arg1[2]);
 
    brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
-   brw_MAC(p, dst[0], arg0[3], arg1[3]);
+   brw_MAC(p, dst[dst_chan], arg0[3], arg1[3]);
    brw_set_saturate(p, 0);
 }
 
@@ -632,18 +634,19 @@ static void emit_math1( struct brw_compile *p,
 			GLuint mask,
 			const struct brw_reg *arg0 )
 {
+   int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
+
    if (!(mask & WRITEMASK_XYZW))
       return; /* Do not emit dead code */
 
-   //assert((mask & WRITEMASK_XYZW) == WRITEMASK_X ||
-   //	  function == BRW_MATH_FUNCTION_SINCOS);
-   
+   assert(is_power_of_two(mask & WRITEMASK_XYZW));
+
    brw_MOV(p, brw_message_reg(2), arg0[0]);
 
    /* Send two messages to perform all 16 operations:
     */
    brw_math_16(p, 
-	       dst[0],
+	       dst[dst_chan],
 	       function,
 	       (mask & SATURATE) ? BRW_MATH_SATURATE_SATURATE : BRW_MATH_SATURATE_NONE,
 	       2,
@@ -659,10 +662,12 @@ static void emit_math2( struct brw_compile *p,
 			const struct brw_reg *arg0,
 			const struct brw_reg *arg1)
 {
+   int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
+
    if (!(mask & WRITEMASK_XYZW))
       return; /* Do not emit dead code */
 
-   assert((mask & WRITEMASK_XYZW) == WRITEMASK_X);
+   assert(is_power_of_two(mask & WRITEMASK_XYZW));
 
    brw_push_insn_state(p);
 
@@ -681,7 +686,7 @@ static void emit_math2( struct brw_compile *p,
     */
    brw_set_compression_control(p, BRW_COMPRESSION_NONE);
    brw_math(p, 
-	    dst[0],
+	    dst[dst_chan],
 	    function,
 	    (mask & SATURATE) ? BRW_MATH_SATURATE_SATURATE : BRW_MATH_SATURATE_NONE,
 	    2,
@@ -691,7 +696,7 @@ static void emit_math2( struct brw_compile *p,
 
    brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
    brw_math(p, 
-	    offset(dst[0],1),
+	    offset(dst[dst_chan],1),
 	    function,
 	    (mask & SATURATE) ? BRW_MATH_SATURATE_SATURATE : BRW_MATH_SATURATE_NONE,
 	    4,
@@ -1194,11 +1199,11 @@ void brw_wm_emit( struct brw_wm_compile *c )
 	 /* Generated instructions for calculating triangle interpolants:
 	  */
       case WM_PIXELXY:
-	 emit_pixel_xy(p, dst, dst_flags, args[0]);
+	 emit_pixel_xy(p, dst, dst_flags);
 	 break;
 
       case WM_DELTAXY:
-	 emit_delta_xy(p, dst, dst_flags, args[0], args[1]);
+	 emit_delta_xy(p, dst, dst_flags, args[0]);
 	 break;
 
       case WM_WPOSXY:
@@ -1385,4 +1390,13 @@ void brw_wm_emit( struct brw_wm_compile *c )
 		      inst->dst[i]->hw_reg, 
 		      inst->dst[i]->spill_slot);
    }
+
+   if (INTEL_DEBUG & DEBUG_WM) {
+      int i;
+
+      _mesa_printf("wm-native:\n");
+      for (i = 0; i < p->nr_insn; i++)
+	 brw_disasm(stderr, &p->store[i]);
+      _mesa_printf("\n");
+   }
 }
diff --git a/src/mesa/drivers/dri/i965/brw_wm_fp.c b/src/mesa/drivers/dri/i965/brw_wm_fp.c
index b9e8dd2e96e..5dc076a8257 100644
--- a/src/mesa/drivers/dri/i965/brw_wm_fp.c
+++ b/src/mesa/drivers/dri/i965/brw_wm_fp.c
@@ -226,9 +226,42 @@ static struct prog_instruction * emit_op(struct brw_wm_compile *c,
                       0, 0, 0,  /* tex unit, target, shadow */
                       src0, src1, src2);
 }
-   
 
 
+/* Many Mesa opcodes produce the same value across all the result channels.
+ * We'd rather not have to support that splatting in the opcode implementations,
+ * and brw_wm_pass*.c wants to optimize them out by shuffling references around
+ * anyway.  We can easily get both by emitting the opcode to one channel, and
+ * then MOVing it to the others, which brw_wm_pass*.c already understands.
+ */
+static struct prog_instruction *emit_scalar_insn(struct brw_wm_compile *c,
+						 const struct prog_instruction *inst0)
+{
+   struct prog_instruction *inst;
+   unsigned int dst_chan;
+   unsigned int other_channel_mask;
+
+   if (inst0->DstReg.WriteMask == 0)
+      return NULL;
+
+   dst_chan = _mesa_ffs(inst0->DstReg.WriteMask) - 1;
+   inst = get_fp_inst(c);
+   *inst = *inst0;
+   inst->DstReg.WriteMask = 1 << dst_chan;
+
+   other_channel_mask = inst0->DstReg.WriteMask & ~(1 << dst_chan);
+   if (other_channel_mask != 0) {
+      inst = emit_op(c,
+		     OPCODE_MOV,
+		     dst_mask(inst0->DstReg, other_channel_mask),
+		     0,
+		     src_swizzle1(src_reg_from_dst(inst0->DstReg), dst_chan),
+		     src_undef(),
+		     src_undef());
+   }
+   return inst;
+}
+
 
 /***********************************************************************
  * Special instructions for interpolation and other tasks
@@ -376,14 +409,6 @@ static void emit_interp( struct brw_wm_compile *c,
       }
       break;
    case FRAG_ATTRIB_FOGC:
-      /* The FOGC input is really special.  When a program uses glFogFragCoord,
-       * the results returned are supposed to be (f,0,0,1).  But for Mesa GLSL,
-       * the glFrontFacing and glPointCoord values are also stashed in FOGC.
-       * So, write the interpolated fog value to X, then either 0, 1, or the
-       * stashed values to Y, Z, W.  Note that this means that
-       * glFogFragCoord.yzw can be wrong in those cases!
-       */
-
       /* Interpolate the fog coordinate */
       emit_op(c,
 	      WM_PINTERP,
@@ -393,26 +418,40 @@ static void emit_interp( struct brw_wm_compile *c,
 	      deltas,
 	      get_pixel_w(c));
 
-      /* Move the front facing value into FOGC.y if it's needed. */
-      if (c->fp->program.UsesFrontFacing) {
-	 emit_op(c,
-		 WM_FRONTFACING,
-		 dst_mask(dst, WRITEMASK_Y),
-		 0,
-		 src_undef(),
-		 src_undef(),
-		 src_undef());
-      } else {
-	 emit_op(c,
-		 OPCODE_MOV,
-		 dst_mask(dst, WRITEMASK_Y),
-		 0,
-		 src_swizzle1(interp, SWIZZLE_ZERO),
-		 src_undef(),
-		 src_undef());
-      }
+      emit_op(c,
+	      OPCODE_MOV,
+	      dst_mask(dst, WRITEMASK_YZW),
+	      0,
+	      src_swizzle(interp,
+			  SWIZZLE_ZERO,
+			  SWIZZLE_ZERO,
+			  SWIZZLE_ZERO,
+			  SWIZZLE_ONE),
+	      src_undef(),
+	      src_undef());
+      break;
+
+   case FRAG_ATTRIB_FACE:
+      /* XXX review/test this case */
+      emit_op(c,
+              WM_FRONTFACING,
+              dst_mask(dst, WRITEMASK_X),
+              0,
+              src_undef(),
+              src_undef(),
+              src_undef());
+      break;
+
+   case FRAG_ATTRIB_PNTC:
+      /* XXX review/test this case */
+      emit_op(c,
+	      WM_PINTERP,
+	      dst_mask(dst, WRITEMASK_XY),
+	      0,
+	      interp,
+	      deltas,
+	      get_pixel_w(c));
 
-      /* Should do the PointCoord thing here. */
       emit_op(c,
 	      OPCODE_MOV,
 	      dst_mask(dst, WRITEMASK_ZW),
@@ -425,6 +464,7 @@ static void emit_interp( struct brw_wm_compile *c,
 	      src_undef(),
 	      src_undef());
       break;
+
    default:
       emit_op(c,
 	      WM_PINTERP,
@@ -683,7 +723,7 @@ static void precalc_tex( struct brw_wm_compile *c,
 
        /* tmp0 = 1 / tmp1 */
        emit_op(c, OPCODE_RCP,
-               tmp0,
+               dst_mask(tmp0, WRITEMASK_X),
                0,
                tmp1src,
                src_undef(),
@@ -694,7 +734,7 @@ static void precalc_tex( struct brw_wm_compile *c,
                tmpcoord,
                0,
                src0,
-               tmp0src,
+               src_swizzle1(tmp0src, SWIZZLE_X),
                src_undef());
 
        release_temp(c, tmp0);
@@ -717,7 +757,11 @@ static void precalc_tex( struct brw_wm_compile *c,
 	      tmpcoord,
 	      0,
 	      inst->SrcReg[0],
-	      scale,
+	      src_swizzle(scale,
+			  SWIZZLE_X,
+			  SWIZZLE_Y,
+			  SWIZZLE_ONE,
+			  SWIZZLE_ONE),
 	      src_undef());
 
       coord = src_reg_from_dst(tmpcoord);
@@ -1134,9 +1178,11 @@ void brw_wm_pass_fp( struct brw_wm_compile *c )
 	 break;
       case OPCODE_PRINT:
 	 break;
-	 
       default:
-	 emit_insn(c, inst);
+	 if (brw_wm_is_scalar_result(inst->Opcode))
+	    emit_scalar_insn(c, inst);
+	 else
+	    emit_insn(c, inst);
 	 break;
       }
    }
diff --git a/src/mesa/drivers/dri/i965/brw_wm_glsl.c b/src/mesa/drivers/dri/i965/brw_wm_glsl.c
index 85a4237d5a7..a5b18ec7d76 100644
--- a/src/mesa/drivers/dri/i965/brw_wm_glsl.c
+++ b/src/mesa/drivers/dri/i965/brw_wm_glsl.c
@@ -10,6 +10,9 @@ enum _subroutine {
     SUB_NOISE1, SUB_NOISE2, SUB_NOISE3, SUB_NOISE4
 };
 
+static struct brw_reg get_dst_reg(struct brw_wm_compile *c,
+                                  const struct prog_instruction *inst,
+                                  GLuint component);
 
 /**
  * Determine if the given fragment program uses GLSL features such
@@ -22,6 +25,7 @@ GLboolean brw_wm_is_glsl(const struct gl_fragment_program *fp)
     for (i = 0; i < fp->Base.NumInstructions; i++) {
 	const struct prog_instruction *inst = &fp->Base.Instructions[i];
 	switch (inst->Opcode) {
+	    case OPCODE_ARL:
 	    case OPCODE_IF:
 	    case OPCODE_ENDIF:
 	    case OPCODE_CAL:
@@ -130,19 +134,6 @@ static void set_reg(struct brw_wm_compile *c, int file, int index,
     c->wm_regs[file][index][component].inited = GL_TRUE;
 }
 
-/**
- * Examine instruction's write mask to find index of first component
- * enabled for writing.
- */
-static int get_scalar_dst_index(const struct prog_instruction *inst)
-{
-    int i;
-    for (i = 0; i < 4; i++)
-	if (inst->DstReg.WriteMask & (1<<i))
-	    break;
-    return i;
-}
-
 static struct brw_reg alloc_tmp(struct brw_wm_compile *c)
 {
     struct brw_reg reg;
@@ -402,6 +393,27 @@ static void prealloc_reg(struct brw_wm_compile *c)
     prealloc_grf(c, 126);
     prealloc_grf(c, 127);
 
+    for (i = 0; i < c->nr_fp_insns; i++) {
+	const struct prog_instruction *inst = &c->prog_instructions[i];
+	struct brw_reg dst[4];
+
+	switch (inst->Opcode) {
+	case OPCODE_TEX:
+	case OPCODE_TXB:
+	    /* Allocate the channels of texture results contiguously,
+	     * since they are written out that way by the sampler unit.
+	     */
+	    for (j = 0; j < 4; j++) {
+		dst[j] = get_dst_reg(c, inst, j);
+		if (j != 0)
+		    assert(dst[j].nr == dst[j - 1].nr + 1);
+	    }
+	    break;
+	default:
+	    break;
+	}
+    }
+
     /* An instruction may reference up to three constants.
      * They'll be found in these registers.
      * XXX alloc these on demand!
@@ -639,23 +651,6 @@ static void invoke_subroutine( struct brw_wm_compile *c,
     }
 }
 
-static void emit_abs( struct brw_wm_compile *c,
-                      const struct prog_instruction *inst)
-{
-    int i;
-    struct brw_compile *p = &c->func;
-    brw_set_saturate(p, inst->SaturateMode != SATURATE_OFF);
-    for (i = 0; i < 4; i++) {
-	if (inst->DstReg.WriteMask & (1<<i)) {
-	    struct brw_reg src, dst;
-	    dst = get_dst_reg(c, inst, i);
-	    src = get_src_reg(c, inst, 0, i);
-	    brw_MOV(p, dst, brw_abs(src));
-	}
-    }
-    brw_set_saturate(p, 0);
-}
-
 static void emit_trunc( struct brw_wm_compile *c,
                         const struct prog_instruction *inst)
 {
@@ -1031,12 +1026,20 @@ static void emit_dp3(struct brw_wm_compile *c,
     struct brw_reg src0[3], src1[3], dst;
     int i;
     struct brw_compile *p = &c->func;
+    GLuint mask = inst->DstReg.WriteMask;
+    int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
+
+    if (!(mask & WRITEMASK_XYZW))
+	return;
+
+    assert(is_power_of_two(mask & WRITEMASK_XYZW));
+
     for (i = 0; i < 3; i++) {
 	src0[i] = get_src_reg(c, inst, 0, i);
 	src1[i] = get_src_reg_imm(c, inst, 1, i);
     }
 
-    dst = get_dst_reg(c, inst, get_scalar_dst_index(inst));
+    dst = get_dst_reg(c, inst, dst_chan);
     brw_MUL(p, brw_null_reg(), src0[0], src1[0]);
     brw_MAC(p, brw_null_reg(), src0[1], src1[1]);
     brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
@@ -1050,11 +1053,19 @@ static void emit_dp4(struct brw_wm_compile *c,
     struct brw_reg src0[4], src1[4], dst;
     int i;
     struct brw_compile *p = &c->func;
+    GLuint mask = inst->DstReg.WriteMask;
+    int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
+
+    if (!(mask & WRITEMASK_XYZW))
+	return;
+
+    assert(is_power_of_two(mask & WRITEMASK_XYZW));
+
     for (i = 0; i < 4; i++) {
 	src0[i] = get_src_reg(c, inst, 0, i);
 	src1[i] = get_src_reg_imm(c, inst, 1, i);
     }
-    dst = get_dst_reg(c, inst, get_scalar_dst_index(inst));
+    dst = get_dst_reg(c, inst, dst_chan);
     brw_MUL(p, brw_null_reg(), src0[0], src1[0]);
     brw_MAC(p, brw_null_reg(), src0[1], src1[1]);
     brw_MAC(p, brw_null_reg(), src0[2], src1[2]);
@@ -1069,11 +1080,19 @@ static void emit_dph(struct brw_wm_compile *c,
     struct brw_reg src0[4], src1[4], dst;
     int i;
     struct brw_compile *p = &c->func;
+    GLuint mask = inst->DstReg.WriteMask;
+    int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
+
+    if (!(mask & WRITEMASK_XYZW))
+	return;
+
+    assert(is_power_of_two(mask & WRITEMASK_XYZW));
+
     for (i = 0; i < 4; i++) {
 	src0[i] = get_src_reg(c, inst, 0, i);
 	src1[i] = get_src_reg_imm(c, inst, 1, i);
     }
-    dst = get_dst_reg(c, inst, get_scalar_dst_index(inst));
+    dst = get_dst_reg(c, inst, dst_chan);
     brw_MUL(p, brw_null_reg(), src0[0], src1[0]);
     brw_MAC(p, brw_null_reg(), src0[1], src1[1]);
     brw_MAC(p, dst, src0[2], src1[2]);
@@ -1091,37 +1110,28 @@ static void emit_math1(struct brw_wm_compile *c,
                        const struct prog_instruction *inst, GLuint func)
 {
     struct brw_compile *p = &c->func;
-    struct brw_reg src0, dst, tmp;
-    const int mark = mark_tmps( c );
-    int i;
+    struct brw_reg src0, dst;
+    GLuint mask = inst->DstReg.WriteMask;
+    int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
+
+    if (!(mask & WRITEMASK_XYZW))
+	return;
 
-    tmp = alloc_tmp(c);
+    assert(is_power_of_two(mask & WRITEMASK_XYZW));
 
     /* Get first component of source register */
+    dst = get_dst_reg(c, inst, dst_chan);
     src0 = get_src_reg(c, inst, 0, 0);
 
-    /* tmp = func(src0) */
     brw_MOV(p, brw_message_reg(2), src0);
     brw_math(p,
-             tmp,
+             dst,
              func,
              (inst->SaturateMode != SATURATE_OFF) ? BRW_MATH_SATURATE_SATURATE : BRW_MATH_SATURATE_NONE,
              2,
              brw_null_reg(),
              BRW_MATH_DATA_VECTOR,
              BRW_MATH_PRECISION_FULL);
-
-    /*tmp.dw1.bits.swizzle = SWIZZLE_XXXX;*/
-
-    /* replicate tmp value across enabled dest channels */
-    for (i = 0; i < 4; i++) {
-       if (inst->DstReg.WriteMask & (1 << i)) {
-          dst = get_dst_reg(c, inst, i);
-          brw_MOV(p, dst, tmp);
-       }
-    }
-
-    release_tmps(c, mark);
 }
 
 static void emit_rcp(struct brw_wm_compile *c,
@@ -1192,24 +1202,6 @@ static void emit_arl(struct brw_wm_compile *c,
     brw_set_saturate(p, 0);
 }
 
-static void emit_sub(struct brw_wm_compile *c,
-                     const struct prog_instruction *inst)
-{
-    struct brw_compile *p = &c->func;
-    struct brw_reg src0, src1, dst;
-    GLuint mask = inst->DstReg.WriteMask;
-    int i;
-    brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
-    for (i = 0 ; i < 4; i++) {
-	if (mask & (1<<i)) {
-	    dst = get_dst_reg(c, inst, i);
-	    src0 = get_src_reg(c, inst, 0, i);
-	    src1 = get_src_reg_imm(c, inst, 1, i);
-	    brw_ADD(p, dst, src0, negate(src1));
-	}
-    }
-    brw_set_saturate(p, 0);
-}
 
 static void emit_mul(struct brw_wm_compile *c,
                      const struct prog_instruction *inst)
@@ -1321,7 +1313,15 @@ static void emit_pow(struct brw_wm_compile *c,
 {
     struct brw_compile *p = &c->func;
     struct brw_reg dst, src0, src1;
-    dst = get_dst_reg(c, inst, get_scalar_dst_index(inst));
+    GLuint mask = inst->DstReg.WriteMask;
+    int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
+
+    if (!(mask & WRITEMASK_XYZW))
+	return;
+
+    assert(is_power_of_two(mask & WRITEMASK_XYZW));
+
+    dst = get_dst_reg(c, inst, dst_chan);
     src0 = get_src_reg_imm(c, inst, 0, 0);
     src1 = get_src_reg_imm(c, inst, 1, 0);
 
@@ -2828,18 +2828,12 @@ static void brw_wm_emit_glsl(struct brw_context *brw, struct brw_wm_compile *c)
 	    case WM_FRONTFACING:
 		emit_frontfacing(c, inst);
 		break;
-	    case OPCODE_ABS:
-		emit_abs(c, inst);
-		break;
 	    case OPCODE_ADD:
 		emit_add(c, inst);
 		break;
 	    case OPCODE_ARL:
 		emit_arl(c, inst);
 		break;
-	    case OPCODE_SUB:
-		emit_sub(c, inst);
-		break;
 	    case OPCODE_FRC:
 		emit_frc(c, inst);
 		break;
@@ -3032,8 +3026,14 @@ static void brw_wm_emit_glsl(struct brw_context *brw, struct brw_wm_compile *c)
 	    brw_set_predicate_control(p, BRW_PREDICATE_NONE);
     }
     post_wm_emit(c);
-}
 
+    if (INTEL_DEBUG & DEBUG_WM) {
+      _mesa_printf("wm-native:\n");
+      for (i = 0; i < p->nr_insn; i++)
+	 brw_disasm(stderr, &p->store[i]);
+      _mesa_printf("\n");
+    }
+}
 
 /**
  * Do GPU code generation for shaders that use GLSL features such as
diff --git a/src/mesa/drivers/dri/i965/brw_wm_iz.c b/src/mesa/drivers/dri/i965/brw_wm_iz.c
index 8fd067abe7d..5e399ac62a8 100644
--- a/src/mesa/drivers/dri/i965/brw_wm_iz.c
+++ b/src/mesa/drivers/dri/i965/brw_wm_iz.c
@@ -122,6 +122,7 @@ const struct {
  */
 void brw_wm_lookup_iz( GLuint line_aa,
 		       GLuint lookup,
+		       GLboolean ps_uses_depth,
 		       struct brw_wm_prog_key *key )
 {
    GLuint reg = 2;
@@ -131,7 +132,7 @@ void brw_wm_lookup_iz( GLuint line_aa,
    if (lookup & IZ_PS_COMPUTES_DEPTH_BIT)
       key->computes_depth = 1;
 
-   if (wm_iz_table[lookup].sd_present) {
+   if (wm_iz_table[lookup].sd_present || ps_uses_depth) {
       key->source_depth_reg = reg;
       reg += 2;
    }
diff --git a/src/mesa/drivers/dri/i965/brw_wm_pass0.c b/src/mesa/drivers/dri/i965/brw_wm_pass0.c
index 92142764f5d..62792583396 100644
--- a/src/mesa/drivers/dri/i965/brw_wm_pass0.c
+++ b/src/mesa/drivers/dri/i965/brw_wm_pass0.c
@@ -257,34 +257,6 @@ static void pass0_set_dst( struct brw_wm_compile *c,
 }
 
 
-static void pass0_set_dst_scalar( struct brw_wm_compile *c,
-				  struct brw_wm_instruction *out,
-                                  const struct prog_instruction *inst,
-				  GLuint writemask )
-{
-   if (writemask) {
-      const struct prog_dst_register *dst = &inst->DstReg;
-      GLuint i;
-
-      /* Compute only the first (X) value:
-       */
-      out->writemask = WRITEMASK_X;
-      out->dst[0] = get_value(c);
-
-      /* Update our tracking register file for all the components in
-       * writemask:
-       */
-      for (i = 0; i < 4; i++) {
-	 if (writemask & (1<<i)) {
-	    pass0_set_fpreg_value(c, dst->File, dst->Index, i, out->dst[0]);
-	 }
-      }
-   }
-   else
-      out->writemask = 0;
-}
-
-
 static const struct brw_wm_ref *get_fp_src_reg_ref( struct brw_wm_compile *c,
 						    struct prog_src_register src,
 						    GLuint i )
@@ -363,10 +335,7 @@ translate_insn(struct brw_wm_compile *c,
 
    /* Dst:
     */
-   if (brw_wm_is_scalar_result(out->opcode)) 
-      pass0_set_dst_scalar(c, out, inst, writemask);
-   else 
-      pass0_set_dst(c, out, inst, writemask);
+   pass0_set_dst(c, out, inst, writemask);
 }
 
 
diff --git a/src/mesa/drivers/dri/intel/intel_batchbuffer.c b/src/mesa/drivers/dri/intel/intel_batchbuffer.c
index 0f87fc46a44..6aa36d10b12 100644
--- a/src/mesa/drivers/dri/intel/intel_batchbuffer.c
+++ b/src/mesa/drivers/dri/intel/intel_batchbuffer.c
@@ -196,6 +196,16 @@ _intel_batchbuffer_flush(struct intel_batchbuffer *batch, const char *file,
    struct intel_context *intel = batch->intel;
    GLuint used = batch->ptr - batch->map;
 
+   if (intel->first_post_swapbuffers_batch == NULL) {
+      intel->first_post_swapbuffers_batch = intel->batch->buf;
+      drm_intel_bo_reference(intel->first_post_swapbuffers_batch);
+   }
+
+   if (intel->first_post_swapbuffers_batch == NULL) {
+      intel->first_post_swapbuffers_batch = intel->batch->buf;
+      drm_intel_bo_reference(intel->first_post_swapbuffers_batch);
+   }
+
    if (used == 0) {
       batch->cliprect_mode = IGNORE_CLIPRECTS;
       return;
diff --git a/src/mesa/drivers/dri/intel/intel_blit.c b/src/mesa/drivers/dri/intel/intel_blit.c
index 2e95bd1013f..979f2025842 100644
--- a/src/mesa/drivers/dri/intel/intel_blit.c
+++ b/src/mesa/drivers/dri/intel/intel_blit.c
@@ -477,6 +477,8 @@ intelClearWithBlit(GLcontext *ctx, GLbitfield mask)
                   BR13 |= BR13_565;
                }
 
+	       assert(irb->region->tiling != I915_TILING_Y);
+
 #ifndef I915
 	       if (irb->region->tiling != I915_TILING_NONE) {
 		  CMD |= XY_DST_TILED;
@@ -571,6 +573,7 @@ intelEmitImmediateColorExpandBlit(struct intel_context *intel,
 
    assert( logic_op - GL_CLEAR >= 0 );
    assert( logic_op - GL_CLEAR < 0x10 );
+   assert(dst_pitch > 0);
 
    if (w < 0 || h < 0)
       return GL_TRUE;
diff --git a/src/mesa/drivers/dri/intel/intel_context.c b/src/mesa/drivers/dri/intel/intel_context.c
index 4abb525f78e..46f1a7f7205 100644
--- a/src/mesa/drivers/dri/intel/intel_context.c
+++ b/src/mesa/drivers/dri/intel/intel_context.c
@@ -38,6 +38,7 @@
 #include "swrast_setup/swrast_setup.h"
 #include "tnl/tnl.h"
 #include "drivers/common/driverfuncs.h"
+#include "drivers/common/meta.h"
 
 #include "i830_dri.h"
 
@@ -513,7 +514,7 @@ intel_flush(GLcontext *ctx, GLboolean needs_mi_flush)
 	  * each of N places that do rendering.  This has worse performances,
 	  * but it is much easier to get correct.
 	  */
-	 if (intel->is_front_buffer_rendering) {
+	 if (!intel->is_front_buffer_rendering) {
 	    intel->front_buffer_dirty = GL_FALSE;
 	 }
       }
@@ -529,7 +530,27 @@ intelFlush(GLcontext * ctx)
 static void
 intel_glFlush(GLcontext *ctx)
 {
+   struct intel_context *intel = intel_context(ctx);
+
    intel_flush(ctx, GL_TRUE);
+
+   /* We're using glFlush as an indicator that a frame is done, which is
+    * what DRI2 does before calling SwapBuffers (and means we should catch
+    * people doing front-buffer rendering, as well)..
+    *
+    * Wait for the swapbuffers before the one we just emitted, so we don't
+    * get too many swaps outstanding for apps that are GPU-heavy but not
+    * CPU-heavy.
+    *
+    * Unfortunately, we don't have a handle to the batch containing the swap,
+    * and getting our hands on that doesn't seem worth it, so we just us the
+    * first batch we emitted after the last swap.
+    */
+   if (intel->first_post_swapbuffers_batch != NULL) {
+      drm_intel_bo_wait_rendering(intel->first_post_swapbuffers_batch);
+      drm_intel_bo_unreference(intel->first_post_swapbuffers_batch);
+      intel->first_post_swapbuffers_batch = NULL;
+   }
 }
 
 void
@@ -692,6 +713,8 @@ intelInitContext(struct intel_context *intel,
    _swrast_allow_pixel_fog(ctx, GL_FALSE);
    _swrast_allow_vertex_fog(ctx, GL_TRUE);
 
+   _mesa_meta_init(ctx);
+
    intel->hw_stencil = mesaVis->stencilBits && mesaVis->depthBits == 24;
    intel->hw_stipple = 1;
 
@@ -752,7 +775,7 @@ intelInitContext(struct intel_context *intel,
    if (intel->use_texture_tiling &&
        !intel->intelScreen->kernel_exec_fencing) {
       fprintf(stderr, "No kernel support for execution fencing, "
-	      "disabling texture tiling");
+	      "disabling texture tiling\n");
       intel->use_texture_tiling = GL_FALSE;
    }
    intel->use_early_z = driQueryOptionb(&intel->optionCache, "early_z");
@@ -795,6 +818,8 @@ intelDestroyContext(__DRIcontextPrivate * driContextPriv)
 
       INTEL_FIREVERTICES(intel);
 
+      _mesa_meta_free(&intel->ctx);
+
       meta_destroy_metaops(&intel->meta);
 
       intel->vtbl.destroy(intel);
@@ -814,6 +839,8 @@ intelDestroyContext(__DRIcontextPrivate * driContextPriv)
       intel->prim.vb = NULL;
       dri_bo_unreference(intel->prim.vb_bo);
       intel->prim.vb_bo = NULL;
+      dri_bo_unreference(intel->first_post_swapbuffers_batch);
+      intel->first_post_swapbuffers_batch = NULL;
 
       if (release_texture_heaps) {
          /* Nothing is currently done here to free texture heaps;
diff --git a/src/mesa/drivers/dri/intel/intel_context.h b/src/mesa/drivers/dri/intel/intel_context.h
index 08bea88c958..0d9db5eb1da 100644
--- a/src/mesa/drivers/dri/intel/intel_context.h
+++ b/src/mesa/drivers/dri/intel/intel_context.h
@@ -80,9 +80,13 @@ extern void intelFallback(struct intel_context *intel, GLuint bit,
 
 #define INTEL_MAX_FIXUP 64
 
+
+/**
+ * intel_context is derived from Mesa's context class: GLcontext.
+ */
 struct intel_context
 {
-   GLcontext ctx;               /* the parent class */
+   GLcontext ctx;  /**< base class, must be first field */
 
    struct
    {
@@ -178,6 +182,7 @@ struct intel_context
    GLboolean ttm;
 
    struct intel_batchbuffer *batch;
+   drm_intel_bo *first_post_swapbuffers_batch;
    GLboolean no_batch_wrap;
    unsigned batch_id;
 
@@ -307,7 +312,7 @@ struct intel_context
    __DRIdrawablePrivate *driReadDrawable;
    __DRIscreenPrivate *driScreen;
    intelScreenPrivate *intelScreen;
-   volatile struct drm_i915_sarea *sarea;
+   volatile drm_i915_sarea_t *sarea;
 
    GLuint lastStamp;
 
diff --git a/src/mesa/drivers/dri/intel/intel_extensions.c b/src/mesa/drivers/dri/intel/intel_extensions.c
index 6a68021c691..aa3d704f299 100644
--- a/src/mesa/drivers/dri/intel/intel_extensions.c
+++ b/src/mesa/drivers/dri/intel/intel_extensions.c
@@ -48,6 +48,7 @@
 #define need_GL_EXT_framebuffer_blit
 #define need_GL_EXT_gpu_program_parameters
 #define need_GL_EXT_point_parameters
+#define need_GL_EXT_provoking_vertex
 #define need_GL_EXT_secondary_color
 #define need_GL_EXT_stencil_two_side
 #define need_GL_APPLE_vertex_array_object
@@ -93,6 +94,7 @@ static const struct dri_extension card_extensions[] = {
    { "GL_EXT_fog_coord",                  GL_EXT_fog_coord_functions },
    { "GL_EXT_gpu_program_parameters",     GL_EXT_gpu_program_parameters_functions },
    { "GL_EXT_packed_depth_stencil",       NULL },
+   { "GL_EXT_provoking_vertex",           GL_EXT_provoking_vertex_functions },
    { "GL_EXT_secondary_color",            GL_EXT_secondary_color_functions },
    { "GL_EXT_stencil_wrap",               NULL },
    { "GL_EXT_texture_edge_clamp",         NULL },
diff --git a/src/mesa/drivers/dri/intel/intel_fbo.c b/src/mesa/drivers/dri/intel/intel_fbo.c
index 666893596e1..804c0348401 100644
--- a/src/mesa/drivers/dri/intel/intel_fbo.c
+++ b/src/mesa/drivers/dri/intel/intel_fbo.c
@@ -35,6 +35,7 @@
 #include "main/context.h"
 #include "main/texformat.h"
 #include "main/texrender.h"
+#include "drivers/common/meta.h"
 
 #include "intel_context.h"
 #include "intel_buffers.h"
@@ -700,74 +701,6 @@ intel_validate_framebuffer(GLcontext *ctx, struct gl_framebuffer *fb)
 
 
 /**
- * Called from glBlitFramebuffer().
- * For now, we're doing an approximation with glCopyPixels().
- * XXX we need to bypass all the per-fragment operations, except scissor.
- */
-static void
-intel_blit_framebuffer(GLcontext *ctx,
-                       GLint srcX0, GLint srcY0, GLint srcX1, GLint srcY1,
-                       GLint dstX0, GLint dstY0, GLint dstX1, GLint dstY1,
-                       GLbitfield mask, GLenum filter)
-{
-   const GLfloat xZoomSave = ctx->Pixel.ZoomX;
-   const GLfloat yZoomSave = ctx->Pixel.ZoomY;
-   GLsizei width, height;
-   GLfloat xFlip = 1.0F, yFlip = 1.0F;
-
-   if (srcX1 < srcX0) {
-      GLint tmp = srcX1;
-      srcX1 = srcX0;
-      srcX0 = tmp;
-      xFlip = -1.0F;
-   }
-
-   if (srcY1 < srcY0) {
-      GLint tmp = srcY1;
-      srcY1 = srcY0;
-      srcY0 = tmp;
-      yFlip = -1.0F;
-   }
-
-   width = srcX1 - srcX0;
-   height = srcY1 - srcY0;
-
-   ctx->Pixel.ZoomX = xFlip * (dstX1 - dstX0) / (srcX1 - srcY0);
-   ctx->Pixel.ZoomY = yFlip * (dstY1 - dstY0) / (srcY1 - srcY0);
-
-   if (ctx->Pixel.ZoomX < 0.0F) {
-      dstX0 = MAX2(dstX0, dstX1);
-   }
-   else {
-      dstX0 = MIN2(dstX0, dstX1);
-   }
-
-   if (ctx->Pixel.ZoomY < 0.0F) {
-      dstY0 = MAX2(dstY0, dstY1);
-   }
-   else {
-      dstY0 = MIN2(dstY0, dstY1);
-   }
-
-   if (mask & GL_COLOR_BUFFER_BIT) {
-      ctx->Driver.CopyPixels(ctx, srcX0, srcY0, width, height,
-                             dstX0, dstY0, GL_COLOR);
-   }
-   if (mask & GL_DEPTH_BUFFER_BIT) {
-      ctx->Driver.CopyPixels(ctx, srcX0, srcY0, width, height,
-                             dstX0, dstY0, GL_DEPTH);
-   }
-   if (mask & GL_STENCIL_BUFFER_BIT) {
-      ctx->Driver.CopyPixels(ctx, srcX0, srcY0, width, height,
-                             dstX0, dstY0, GL_STENCIL);
-   }
-      
-   ctx->Pixel.ZoomX = xZoomSave;
-   ctx->Pixel.ZoomY = yZoomSave;
-}
-
-
-/**
  * Do one-time context initializations related to GL_EXT_framebuffer_object.
  * Hook in device driver functions.
  */
@@ -782,5 +715,5 @@ intel_fbo_init(struct intel_context *intel)
    intel->ctx.Driver.FinishRenderTexture = intel_finish_render_texture;
    intel->ctx.Driver.ResizeBuffers = intel_resize_buffers;
    intel->ctx.Driver.ValidateFramebuffer = intel_validate_framebuffer;
-   intel->ctx.Driver.BlitFramebuffer = intel_blit_framebuffer;
+   intel->ctx.Driver.BlitFramebuffer = _mesa_meta_blit_framebuffer;
 }
diff --git a/src/mesa/drivers/dri/intel/intel_pixel_copy.c b/src/mesa/drivers/dri/intel/intel_pixel_copy.c
index 5d52335dee3..ca796b36559 100644
--- a/src/mesa/drivers/dri/intel/intel_pixel_copy.c
+++ b/src/mesa/drivers/dri/intel/intel_pixel_copy.c
@@ -31,7 +31,7 @@
 #include "main/state.h"
 #include "main/mtypes.h"
 #include "main/macros.h"
-#include "swrast/swrast.h"
+#include "drivers/common/meta.h"
 
 #include "intel_screen.h"
 #include "intel_context.h"
@@ -97,162 +97,6 @@ intel_check_copypixel_blit_fragment_ops(GLcontext * ctx)
 	    ctx->Color.BlendEnabled);
 }
 
-#ifdef I915
-/* Doesn't work for overlapping regions.  Could do a double copy or
- * just fallback.
- */
-static GLboolean
-do_texture_copypixels(GLcontext * ctx,
-                      GLint srcx, GLint srcy,
-                      GLsizei width, GLsizei height,
-                      GLint dstx, GLint dsty, GLenum type)
-{
-   struct intel_context *intel = intel_context(ctx);
-   struct intel_region *dst = intel_drawbuf_region(intel);
-   struct intel_region *src = copypix_src_region(intel, type);
-   GLenum src_format;
-   GLenum src_type;
-
-   DBG("%s %d,%d %dx%d --> %d,%d\n", __FUNCTION__, 
-       srcx, srcy, width, height, dstx, dsty);
-
-   if (!src || !dst || type != GL_COLOR)
-      return GL_FALSE;
-
-   if (ctx->_ImageTransferState) {
-      if (INTEL_DEBUG & DEBUG_PIXEL)
-         fprintf(stderr, "%s: check_color failed\n", __FUNCTION__);
-      return GL_FALSE;
-   }
-
-   /* Can't handle overlapping regions.  Don't have sufficient control
-    * over rasterization to pull it off in-place.  Punt on these for
-    * now.
-    * 
-    * XXX: do a copy to a temporary. 
-    */
-   if (src->buffer == dst->buffer) {
-      drm_clip_rect_t srcbox;
-      drm_clip_rect_t dstbox;
-      drm_clip_rect_t tmp;
-
-      srcbox.x1 = srcx;
-      srcbox.y1 = srcy;
-      srcbox.x2 = srcx + width;
-      srcbox.y2 = srcy + height;
-
-      if (ctx->Pixel.ZoomX > 0) {
-	 dstbox.x1 = dstx;
-	 dstbox.x2 = dstx + width * ctx->Pixel.ZoomX;
-      } else {
-	 dstbox.x1 = dstx + width * ctx->Pixel.ZoomX;
-	 dstbox.x2 = dstx;
-      }
-      if (ctx->Pixel.ZoomY > 0) {
-	 dstbox.y1 = dsty;
-	 dstbox.y2 = dsty + height * ctx->Pixel.ZoomY;
-      } else {
-	 dstbox.y1 = dsty + height * ctx->Pixel.ZoomY;
-	 dstbox.y2 = dsty;
-      }
-
-      DBG("src %d,%d %d,%d\n", srcbox.x1, srcbox.y1, srcbox.x2, srcbox.y2);
-      DBG("dst %d,%d %d,%d (%dx%d) (%f,%f)\n", dstbox.x1, dstbox.y1, dstbox.x2, dstbox.y2,
-	  width, height, ctx->Pixel.ZoomX, ctx->Pixel.ZoomY);
-
-      if (intel_intersect_cliprects(&tmp, &srcbox, &dstbox)) {
-         DBG("%s: regions overlap\n", __FUNCTION__);
-         return GL_FALSE;
-      }
-   }
-
-   intelFlush(&intel->ctx);
-
-   intel->vtbl.install_meta_state(intel);
-
-   /* Is this true?  Also will need to turn depth testing on according
-    * to state:
-    */
-   intel->vtbl.meta_no_stencil_write(intel);
-   intel->vtbl.meta_no_depth_write(intel);
-
-   /* Set the 3d engine to draw into the destination region:
-    */
-   intel->vtbl.meta_draw_region(intel, dst, intel->depth_region);
-
-   intel->vtbl.meta_import_pixel_state(intel);
-
-   if (src->cpp == 2) {
-      src_format = GL_RGB;
-      src_type = GL_UNSIGNED_SHORT_5_6_5;
-   }
-   else {
-      src_format = GL_BGRA;
-      src_type = GL_UNSIGNED_BYTE;
-   }
-
-   /* Set the frontbuffer up as a large rectangular texture.
-    */
-   if (!intel->vtbl.meta_tex_rect_source(intel, src->buffer, 0,
-                                         src->pitch,
-                                         src->height, src_format, src_type)) {
-      intel->vtbl.leave_meta_state(intel);
-      return GL_FALSE;
-   }
-
-
-   intel->vtbl.meta_texture_blend_replace(intel);
-
-   LOCK_HARDWARE(intel);
-
-   if (intel->driDrawable->numClipRects) {
-      __DRIdrawablePrivate *dPriv = intel->driDrawable;
-
-
-      srcy = dPriv->h - srcy - height;  /* convert from gl to hardware coords */
-
-      srcx += dPriv->x;
-      srcy += dPriv->y;
-
-      /* Clip against the source region.  This is the only source
-       * clipping we do.  XXX: Just set the texcord wrap mode to clamp
-       * or similar.
-       *
-       */
-      if (0) {
-         GLint orig_x = srcx;
-         GLint orig_y = srcy;
-
-         if (!_mesa_clip_to_region(0, 0, src->pitch, src->height,
-                                   &srcx, &srcy, &width, &height))
-            goto out;
-
-         dstx += srcx - orig_x;
-         dsty += (srcy - orig_y) * ctx->Pixel.ZoomY;
-      }
-
-      /* Just use the regular cliprect mechanism...  Does this need to
-       * even hold the lock???
-       */
-      intel->vtbl.meta_draw_quad(intel,
-				 dstx,
-				 dstx + width * ctx->Pixel.ZoomX,
-				 dPriv->h - (dsty + height * ctx->Pixel.ZoomY),
-				 dPriv->h - (dsty), 0, /* XXX: what z value? */
-				 0x00ff00ff,
-				 srcx, srcx + width, srcy, srcy + height);
-
-    out:
-      intel->vtbl.leave_meta_state(intel);
-      intel_batchbuffer_emit_mi_flush(intel->batch);
-   }
-   UNLOCK_HARDWARE(intel);
-
-   DBG("%s: success\n", __FUNCTION__);
-   return GL_TRUE;
-}
-#endif /* I915 */
-
 
 /**
  * CopyPixels with the blitter.  Don't support zooming, pixel transfer, etc.
@@ -400,12 +244,5 @@ intelCopyPixels(GLcontext * ctx,
    if (do_blit_copypixels(ctx, srcx, srcy, width, height, destx, desty, type))
       return;
 
-#ifdef I915
-   if (do_texture_copypixels(ctx, srcx, srcy, width, height, destx, desty, type))
-      return;
-#endif
-
-   DBG("fallback to _swrast_CopyPixels\n");
-
-   _swrast_CopyPixels(ctx, srcx, srcy, width, height, destx, desty, type);
+   _mesa_meta_copy_pixels(ctx, srcx, srcy, width, height, destx, desty, type);
 }
diff --git a/src/mesa/drivers/dri/intel/intel_regions.c b/src/mesa/drivers/dri/intel/intel_regions.c
index 7525cd9c4d7..497f7967649 100644
--- a/src/mesa/drivers/dri/intel/intel_regions.c
+++ b/src/mesa/drivers/dri/intel/intel_regions.c
@@ -181,6 +181,11 @@ intel_region_alloc(struct intel_context *intel,
    dri_bo *buffer;
    struct intel_region *region;
 
+   if (tiling == I915_TILING_X)
+      height = ALIGN(height, 8);
+   else if (tiling == I915_TILING_Y)
+      height = ALIGN(height, 32);
+
    if (expect_accelerated_upload) {
       buffer = drm_intel_bo_alloc_for_render(intel->bufmgr, "region",
 					     pitch * cpp * height, 64);
@@ -452,6 +457,7 @@ void
 intel_region_cow(struct intel_context *intel, struct intel_region *region)
 {
    struct intel_buffer_object *pbo = region->pbo;
+   GLboolean ok;
 
    intel_region_release_pbo(intel, region);
 
@@ -463,13 +469,14 @@ intel_region_cow(struct intel_context *intel, struct intel_region *region)
     */
 
    LOCK_HARDWARE(intel);
-   assert(intelEmitCopyBlit(intel,
-			    region->cpp,
-			    region->pitch, pbo->buffer, 0, region->tiling,
-			    region->pitch, region->buffer, 0, region->tiling,
-			    0, 0, 0, 0,
-			    region->pitch, region->height,
-			    GL_COPY));
+   ok = intelEmitCopyBlit(intel,
+                          region->cpp,
+                          region->pitch, pbo->buffer, 0, region->tiling,
+                          region->pitch, region->buffer, 0, region->tiling,
+                          0, 0, 0, 0,
+                          region->pitch, region->height,
+                          GL_COPY);
+   assert(ok);
    UNLOCK_HARDWARE(intel);
 }
 
diff --git a/src/mesa/drivers/dri/intel/intel_screen.c b/src/mesa/drivers/dri/intel/intel_screen.c
index 14cc815139c..1b8c56e68d6 100644
--- a/src/mesa/drivers/dri/intel/intel_screen.c
+++ b/src/mesa/drivers/dri/intel/intel_screen.c
@@ -69,7 +69,11 @@ PUBLIC const char __driConfigOptions[] =
 	 DRI_CONF_DESC_END
       DRI_CONF_OPT_END
 
-      DRI_CONF_TEXTURE_TILING(false)
+#ifdef I915
+     DRI_CONF_TEXTURE_TILING(false)
+#else
+     DRI_CONF_TEXTURE_TILING(true)
+#endif
 
       DRI_CONF_OPT_BEGIN(early_z, bool, false)
 	 DRI_CONF_DESC(en, "Enable early Z in classic mode (unstable, 945-only).")
diff --git a/src/mesa/drivers/dri/intel/intel_span.c b/src/mesa/drivers/dri/intel/intel_span.c
index 34b78ebc1ab..8df49908806 100644
--- a/src/mesa/drivers/dri/intel/intel_span.c
+++ b/src/mesa/drivers/dri/intel/intel_span.c
@@ -501,7 +501,7 @@ intel_map_unmap_buffers(struct intel_context *intel, GLboolean map)
 
 
 /**
- * Prepare for softare rendering.  Map current read/draw framebuffers'
+ * Prepare for software rendering.  Map current read/draw framebuffers'
  * renderbuffes and all currently bound texture objects.
  *
  * Old note: Moved locking out to get reasonable span performance.
@@ -526,7 +526,7 @@ intelSpanRenderStart(GLcontext * ctx)
 }
 
 /**
- * Called when done softare rendering.  Unmap the buffers we mapped in
+ * Called when done software rendering.  Unmap the buffers we mapped in
  * the above function.
  */
 void
diff --git a/src/mesa/drivers/dri/intel/intel_tex_subimage.c b/src/mesa/drivers/dri/intel/intel_tex_subimage.c
index 1f27131dac0..89037073f84 100644
--- a/src/mesa/drivers/dri/intel/intel_tex_subimage.c
+++ b/src/mesa/drivers/dri/intel/intel_tex_subimage.c
@@ -44,10 +44,12 @@ intelTexSubimage(GLcontext * ctx,
                  GLenum target, GLint level,
                  GLint xoffset, GLint yoffset, GLint zoffset,
                  GLint width, GLint height, GLint depth,
+                 GLsizei imageSize,
                  GLenum format, GLenum type, const void *pixels,
                  const struct gl_pixelstore_attrib *packing,
                  struct gl_texture_object *texObj,
-                 struct gl_texture_image *texImage)
+                 struct gl_texture_image *texImage,
+                 GLboolean compressed)
 {
    struct intel_context *intel = intel_context(ctx);
    struct intel_texture_image *intelImage = intel_texture_image(texImage);
@@ -59,9 +61,14 @@ intelTexSubimage(GLcontext * ctx,
 
    intelFlush(ctx);
 
-   pixels =
-      _mesa_validate_pbo_teximage(ctx, dims, width, height, depth, format,
-                                  type, pixels, packing, "glTexSubImage2D");
+   if (compressed)
+      pixels = _mesa_validate_pbo_compressed_teximage(ctx, imageSize,
+                                                      pixels, packing,
+                                                      "glCompressedTexImage");
+   else
+      pixels = _mesa_validate_pbo_teximage(ctx, dims, width, height, depth,
+                                           format, type, pixels, packing,
+                                           "glTexSubImage");
    if (!pixels)
       return;
 
@@ -90,15 +97,28 @@ intelTexSubimage(GLcontext * ctx,
 
    assert(dstRowStride);
 
-   if (!texImage->TexFormat->StoreImage(ctx, dims, texImage->_BaseFormat,
-                                        texImage->TexFormat,
-                                        texImage->Data,
-                                        xoffset, yoffset, zoffset,
-                                        dstRowStride,
-                                        texImage->ImageOffsets,
-                                        width, height, depth,
-                                        format, type, pixels, packing)) {
-      _mesa_error(ctx, GL_OUT_OF_MEMORY, "intelTexSubImage");
+   if (compressed) {
+      if (intelImage->mt) {
+         struct intel_region *dst = intelImage->mt->region;
+         
+         _mesa_copy_rect(texImage->Data, dst->cpp, dst->pitch,
+                         xoffset, yoffset / 4,
+                         (width + 3)  & ~3, (height + 3) / 4,
+                         pixels, (width + 3) & ~3, 0, 0);
+      } else
+        memcpy(texImage->Data, pixels, imageSize);
+   }
+   else {
+      if (!texImage->TexFormat->StoreImage(ctx, dims, texImage->_BaseFormat,
+                                           texImage->TexFormat,
+                                           texImage->Data,
+                                           xoffset, yoffset, zoffset,
+                                           dstRowStride,
+                                           texImage->ImageOffsets,
+                                           width, height, depth,
+                                           format, type, pixels, packing)) {
+         _mesa_error(ctx, GL_OUT_OF_MEMORY, "intelTexSubImage");
+      }
    }
 
    _mesa_unmap_teximage_pbo(ctx, packing);
@@ -132,8 +152,8 @@ intelTexSubImage3D(GLcontext * ctx,
    intelTexSubimage(ctx, 3,
                     target, level,
                     xoffset, yoffset, zoffset,
-                    width, height, depth,
-                    format, type, pixels, packing, texObj, texImage);
+                    width, height, depth, 0,
+                    format, type, pixels, packing, texObj, texImage, GL_FALSE);
 }
 
 
@@ -152,8 +172,8 @@ intelTexSubImage2D(GLcontext * ctx,
    intelTexSubimage(ctx, 2,
                     target, level,
                     xoffset, yoffset, 0,
-                    width, height, 1,
-                    format, type, pixels, packing, texObj, texImage);
+                    width, height, 1, 0,
+                    format, type, pixels, packing, texObj, texImage, GL_FALSE);
 }
 
 
@@ -172,8 +192,8 @@ intelTexSubImage1D(GLcontext * ctx,
    intelTexSubimage(ctx, 1,
                     target, level,
                     xoffset, 0, 0,
-                    width, 1, 1,
-                    format, type, pixels, packing, texObj, texImage);
+                    width, 1, 1, 0,
+                    format, type, pixels, packing, texObj, texImage, GL_FALSE);
 }
 
 static void
@@ -187,8 +207,11 @@ intelCompressedTexSubImage2D(GLcontext * ctx,
 			     struct gl_texture_object *texObj,
 			     struct gl_texture_image *texImage)
 {
-   fprintf(stderr, "stubbed CompressedTexSubImage2D: %dx%d@%dx%d\n",
-	   width, height, xoffset, yoffset);
+   intelTexSubimage(ctx, 2,
+                    target, level,
+                    xoffset, yoffset, 0,
+                    width, height, 1, imageSize,
+                    format, 0, pixels, &ctx->Unpack, texObj, texImage, GL_TRUE);
 }
 
 
diff --git a/src/mesa/drivers/dri/r200/r200_cmdbuf.c b/src/mesa/drivers/dri/r200/r200_cmdbuf.c
index d49f4fabe72..5d0f367b387 100644
--- a/src/mesa/drivers/dri/r200/r200_cmdbuf.c
+++ b/src/mesa/drivers/dri/r200/r200_cmdbuf.c
@@ -107,31 +107,37 @@ void r200SetUpAtomList( r200ContextPtr rmesa )
 
 void r200EmitScissor(r200ContextPtr rmesa)
 {
+    unsigned x1, y1, x2, y2;
+    struct radeon_renderbuffer *rrb;
     BATCH_LOCALS(&rmesa->radeon);
     if (!rmesa->radeon.radeonScreen->kernel_mm) {
        return;
     }
+    rrb = radeon_get_colorbuffer(&rmesa->radeon);
+    if (!rrb || !rrb->bo)
+	return;
+
     if (rmesa->radeon.state.scissor.enabled) {
-        BEGIN_BATCH(8);
-        OUT_BATCH(CP_PACKET0(R200_RE_CNTL, 0));
-        OUT_BATCH(R200_SCISSOR_ENABLE | rmesa->hw.set.cmd[SET_RE_CNTL]);
-        OUT_BATCH(CP_PACKET0(R200_RE_AUX_SCISSOR_CNTL, 0));
-        OUT_BATCH(R200_SCISSOR_ENABLE_0);
-        OUT_BATCH(CP_PACKET0(R200_RE_SCISSOR_TL_0, 0));
-        OUT_BATCH((rmesa->radeon.state.scissor.rect.y1 << 16) |
-                  rmesa->radeon.state.scissor.rect.x1);
-        OUT_BATCH(CP_PACKET0(R200_RE_SCISSOR_BR_0, 0));
-        OUT_BATCH(((rmesa->radeon.state.scissor.rect.y2 - 1) << 16) |
-                  (rmesa->radeon.state.scissor.rect.x2 - 1));
-        END_BATCH();
+	x1 = rmesa->radeon.state.scissor.rect.x1;
+	y1 = rmesa->radeon.state.scissor.rect.y1;
+	x2 = rmesa->radeon.state.scissor.rect.x2 - 1;
+	y2 = rmesa->radeon.state.scissor.rect.y2 - 1;
     } else {
-        BEGIN_BATCH(4);
-        OUT_BATCH(CP_PACKET0(R200_RE_CNTL, 0));
-        OUT_BATCH(rmesa->hw.set.cmd[SET_RE_CNTL] & ~R200_SCISSOR_ENABLE);
-        OUT_BATCH(CP_PACKET0(R200_RE_AUX_SCISSOR_CNTL, 0));
-        OUT_BATCH(0);
-        END_BATCH();
+        x1 = 0;
+        y1 = 0;
+        x2 = rrb->base.Width - 1;
+        y2 = rrb->base.Height - 1;
     }
+    BEGIN_BATCH(8);
+    OUT_BATCH(CP_PACKET0(R200_RE_CNTL, 0));
+    OUT_BATCH(R200_SCISSOR_ENABLE | rmesa->hw.set.cmd[SET_RE_CNTL]);
+    OUT_BATCH(CP_PACKET0(R200_RE_AUX_SCISSOR_CNTL, 0));
+    OUT_BATCH(0);
+    OUT_BATCH(CP_PACKET0(R200_RE_TOP_LEFT, 0));
+    OUT_BATCH((y1 << 16) | x1);
+    OUT_BATCH(CP_PACKET0(R200_RE_WIDTH_HEIGHT, 0));
+    OUT_BATCH((y2 << 16) | x2);
+    END_BATCH();
 }
 
 /* Fire a section of the retained (indexed_verts) buffer as a regular
diff --git a/src/mesa/drivers/dri/r200/r200_context.c b/src/mesa/drivers/dri/r200/r200_context.c
index 9a92a320797..8cb287de268 100644
--- a/src/mesa/drivers/dri/r200/r200_context.c
+++ b/src/mesa/drivers/dri/r200/r200_context.c
@@ -500,3 +500,15 @@ GLboolean r200CreateContext( const __GLcontextModes *glVisual,
 }
 
 
+void r200DestroyContext( __DRIcontextPrivate *driContextPriv )
+{
+	int i;
+	r200ContextPtr rmesa = (r200ContextPtr)driContextPriv->driverPrivate;
+	if (rmesa)
+	{
+		for ( i = 0 ; i < R200_MAX_TEXTURE_UNITS ; i++ ) {
+			_math_matrix_dtr( &rmesa->TexGenMatrix[i] );
+		}
+	}
+	radeonDestroyContext(driContextPriv);
+}
diff --git a/src/mesa/drivers/dri/r200/r200_state_init.c b/src/mesa/drivers/dri/r200/r200_state_init.c
index bc871d99048..78ad5baebb3 100644
--- a/src/mesa/drivers/dri/r200/r200_state_init.c
+++ b/src/mesa/drivers/dri/r200/r200_state_init.c
@@ -515,7 +515,7 @@ static void ctx_emit_cs(GLcontext *ctx, struct radeon_state_atom *atom)
    if (drb)
      dwords += 6;
    if (rrb)
-     dwords += 6;
+     dwords += 8;
    if (atom->cmd_size == CTX_STATE_SIZE_NEWDRM)
      dwords += 4;
 
@@ -546,7 +546,7 @@ static void ctx_emit_cs(GLcontext *ctx, struct radeon_state_atom *atom)
      OUT_BATCH_RELOC(0, rrb->bo, 0, 0, RADEON_GEM_DOMAIN_VRAM, 0);
 
      OUT_BATCH(CP_PACKET0(RADEON_RB3D_COLORPITCH, 0));
-     OUT_BATCH(cbpitch);
+     OUT_BATCH_RELOC(cbpitch, rrb->bo, cbpitch, 0, RADEON_GEM_DOMAIN_VRAM, 0);
    }
 
    if (atom->cmd_size == CTX_STATE_SIZE_NEWDRM) {
@@ -563,7 +563,6 @@ static void tex_emit(GLcontext *ctx, struct radeon_state_atom *atom)
    uint32_t dwords = atom->cmd_size;
    int i = atom->idx;
    radeonTexObj *t = r200->state.texture.unit[i].texobj;
-   radeon_mipmap_level *lvl;
 
    if (t && t->mt && !t->image_override)
      dwords += 2;
@@ -591,7 +590,6 @@ static void tex_emit_cs(GLcontext *ctx, struct radeon_state_atom *atom)
    uint32_t dwords = atom->cmd_size;
    int i = atom->idx;
    radeonTexObj *t = r200->state.texture.unit[i].texobj;
-   radeon_mipmap_level *lvl;
    int hastexture = 1;
 
    if (!r200->state.texture.unit[i].unitneeded)
@@ -774,7 +772,7 @@ void r200InitState( r200ContextPtr rmesa )
       ALLOC_STATE( afs[1], never, AFS_STATE_SIZE, "AFS/afsinst-1", 1 );
    }
 
-   for (i = 0; i < 5; i++)
+   for (i = 0; i < 6; i++)
       if (rmesa->radeon.radeonScreen->kernel_mm)
           rmesa->hw.tex[i].emit = tex_emit_cs;
       else
@@ -786,7 +784,7 @@ void r200InitState( r200ContextPtr rmesa )
       ALLOC_STATE( cube[3], tex_cube, CUBE_STATE_SIZE, "CUBE/tex-3", 3 );
       ALLOC_STATE( cube[4], tex_cube, CUBE_STATE_SIZE, "CUBE/tex-4", 4 );
       ALLOC_STATE( cube[5], tex_cube, CUBE_STATE_SIZE, "CUBE/tex-5", 5 );
-      for (i = 0; i < 5; i++)
+      for (i = 0; i < 6; i++)
           if (rmesa->radeon.radeonScreen->kernel_mm)
               rmesa->hw.cube[i].emit = cube_emit_cs;
           else
diff --git a/src/mesa/drivers/dri/r300/r300_cmdbuf.c b/src/mesa/drivers/dri/r300/r300_cmdbuf.c
index 7eb11aaf271..bd46f9acf2e 100644
--- a/src/mesa/drivers/dri/r300/r300_cmdbuf.c
+++ b/src/mesa/drivers/dri/r300/r300_cmdbuf.c
@@ -177,14 +177,17 @@ static void emit_tex_offsets(GLcontext *ctx, struct radeon_state_atom * atom)
 		} else if (!t) {
 			/* Texture unit hasn't a texture bound.
 			 * We assign the current color buffer as a fakery to make
-			 * KIL work. */
-			struct radeon_renderbuffer *rrb = radeon_get_colorbuffer(&r300->radeon);
-			if (rrb && rrb->bo) {
-				BEGIN_BATCH_NO_AUTOSTATE(4);
-				OUT_BATCH_REGSEQ(R300_TX_OFFSET_0 + (i * 4), 1);
-				OUT_BATCH_RELOC(0, rrb->bo, 0,
-						RADEON_GEM_DOMAIN_GTT|RADEON_GEM_DOMAIN_VRAM, 0, 0);
-				END_BATCH();
+			 * KIL work on KMS (without it, the CS checker will complain).
+			 */
+			if (r300->radeon.radeonScreen->kernel_mm) {
+				struct radeon_renderbuffer *rrb = radeon_get_colorbuffer(&r300->radeon);
+				if (rrb && rrb->bo) {
+					BEGIN_BATCH_NO_AUTOSTATE(4);
+					OUT_BATCH_REGSEQ(R300_TX_OFFSET_0 + (i * 4), 1);
+					OUT_BATCH_RELOC(0, rrb->bo, 0,
+							RADEON_GEM_DOMAIN_GTT|RADEON_GEM_DOMAIN_VRAM, 0, 0);
+					END_BATCH();
+				}
 			}
 		} else { /* override cases */
 			if (t->bo) {
diff --git a/src/mesa/drivers/dri/r300/r300_draw.c b/src/mesa/drivers/dri/r300/r300_draw.c
index cebb9a10d8b..ab2287a5e25 100644
--- a/src/mesa/drivers/dri/r300/r300_draw.c
+++ b/src/mesa/drivers/dri/r300/r300_draw.c
@@ -604,11 +604,19 @@ static void r300DrawPrims(GLcontext *ctx,
 			 const struct _mesa_prim *prim,
 			 GLuint nr_prims,
 			 const struct _mesa_index_buffer *ib,
+			 GLboolean index_bounds_valid,
 			 GLuint min_index,
 			 GLuint max_index)
 {
 	GLboolean retval;
 
+	/* This check should get folded into just the places that
+	 * min/max index are really needed.
+	 */
+	if (!index_bounds_valid) {
+		vbo_get_minmax_index(ctx, prim, ib, &min_index, &max_index);
+	}
+
 	if (min_index) {
 		vbo_rebase_prims( ctx, arrays, prim, nr_prims, ib, min_index, max_index, r300DrawPrims );
 		return;
diff --git a/src/mesa/drivers/dri/r600/r600_cmdbuf.c b/src/mesa/drivers/dri/r600/r600_cmdbuf.c
index 8debecbab99..dc2fb0144a2 100644
--- a/src/mesa/drivers/dri/r600/r600_cmdbuf.c
+++ b/src/mesa/drivers/dri/r600/r600_cmdbuf.c
@@ -457,6 +457,9 @@ void r600InitCmdBuf(context_t *r600) /* from rcommonInitCmdBuf */
     GLuint size;
     rmesa->hw.max_state_size = 4000; /* rough estimate */
 
+    rmesa->hw.all_dirty = GL_TRUE;
+    rmesa->hw.is_dirty = GL_TRUE;
+
 	/* Initialize command buffer */
 	size = 256 * driQueryOptioni(&rmesa->optionCache,
 				     "command_buffer_size");
diff --git a/src/mesa/drivers/dri/r600/r600_context.c b/src/mesa/drivers/dri/r600/r600_context.c
index eacf8112825..7009374b0ca 100644
--- a/src/mesa/drivers/dri/r600/r600_context.c
+++ b/src/mesa/drivers/dri/r600/r600_context.c
@@ -395,6 +395,8 @@ r600DestroyContext (__DRIcontextPrivate * driContextPriv)
 
     if (context)
 	    FREE(context->hw.pStateList);
+
+    radeonDestroyContext(driContextPriv);
 }
 
 
diff --git a/src/mesa/drivers/dri/r600/r600_context.h b/src/mesa/drivers/dri/r600/r600_context.h
index fbb8164af59..30ddce682c1 100644
--- a/src/mesa/drivers/dri/r600/r600_context.h
+++ b/src/mesa/drivers/dri/r600/r600_context.h
@@ -158,6 +158,17 @@ extern GLboolean r700InitChipObject(context_t *context);
 extern GLboolean r700SendContextStates(context_t *context);
 extern GLboolean r700SendViewportState(context_t *context, int id);
 extern GLboolean r700SendRenderTargetState(context_t *context, int id);
+extern GLboolean r700SendTextureState(context_t *context);
+extern GLboolean r700SendDepthTargetState(context_t *context);
+extern GLboolean r700SendUCPState(context_t *context);
+extern GLboolean r700SendFSState(context_t *context);
+extern void r700EmitState(GLcontext * ctx);
+
+extern GLboolean r700SyncSurf(context_t *context,
+			      struct radeon_bo *pbo,
+			      uint32_t read_domain,
+			      uint32_t write_domain,
+			      uint32_t sync_type);
 
 extern int       r700SetupStreams(GLcontext * ctx);
 extern void      r700SetupVTXConstants(GLcontext  * ctx, 
diff --git a/src/mesa/drivers/dri/r600/r600_texstate.c b/src/mesa/drivers/dri/r600/r600_texstate.c
index 4840586858d..ee9b64ee43a 100644
--- a/src/mesa/drivers/dri/r600/r600_texstate.c
+++ b/src/mesa/drivers/dri/r600/r600_texstate.c
@@ -568,9 +568,6 @@ static void setup_hardware_state(context_t *rmesa, struct gl_texture_object *tex
 		}
 	}
 
-	if (t->image_override && t->bo)
-		return;
-
 	switch (texObj->Target) {
         case GL_TEXTURE_1D:
 		SETfield(t->SQ_TEX_RESOURCE0, SQ_TEX_DIM_1D, DIM_shift, DIM_mask);
@@ -701,7 +698,7 @@ void r600SetTexOffset(__DRIcontext * pDRICtx, GLint texname,
 	struct gl_texture_object *tObj =
 	    _mesa_lookup_texture(rmesa->radeon.glCtx, texname);
 	radeonTexObjPtr t = radeon_tex_obj(tObj);
-	uint32_t pitch_val;
+	uint32_t pitch_val, size;
 
 	if (!tObj)
 		return;
@@ -711,7 +708,12 @@ void r600SetTexOffset(__DRIcontext * pDRICtx, GLint texname,
 	if (!offset)
 		return;
 
-	t->bo = NULL;
+	size = pitch;//h * w * (depth / 8);
+	if (t->bo) {
+		radeon_bo_unref(t->bo);
+		t->bo = NULL;
+	}
+	t->bo = radeon_legacy_bo_alloc_fake(rmesa->radeon.radeonScreen->bom, size, offset);
 	t->override_offset = offset;
 	pitch_val = pitch;
 	switch (depth) {
diff --git a/src/mesa/drivers/dri/r600/r700_assembler.c b/src/mesa/drivers/dri/r600/r700_assembler.c
index 0abf112b55f..eaacd061137 100644
--- a/src/mesa/drivers/dri/r600/r700_assembler.c
+++ b/src/mesa/drivers/dri/r600/r700_assembler.c
@@ -3839,6 +3839,9 @@ GLboolean Process_Export(r700_AssemblerBase* pAsm,
     if (export_count == 1) 
     {
         ucWriteMask = pAsm->pucOutMask[starting_register_number - pAsm->starting_export_register_number];
+	/* exports Z as a float into Red channel */
+	if (GL_TRUE == is_depth_export)
+	    ucWriteMask = 0x1;
 
         if( (ucWriteMask & 0x1) != 0)
         {
diff --git a/src/mesa/drivers/dri/r600/r700_chip.c b/src/mesa/drivers/dri/r600/r700_chip.c
index 78779e841d7..0fb355a0b6a 100644
--- a/src/mesa/drivers/dri/r600/r700_chip.c
+++ b/src/mesa/drivers/dri/r600/r700_chip.c
@@ -138,6 +138,19 @@ GLboolean r700InitChipObject(context_t *context)
     LINK_STATES(CB_CLRCMP_MSK);
     LINK_STATES(CB_BLEND_CONTROL);
 
+    //DB
+    LINK_STATES(DB_HTILE_DATA_BASE);
+    LINK_STATES(DB_STENCIL_CLEAR);
+    LINK_STATES(DB_DEPTH_CLEAR);
+    LINK_STATES(DB_STENCILREFMASK);
+    LINK_STATES(DB_STENCILREFMASK_BF);
+    LINK_STATES(DB_DEPTH_CONTROL);
+    LINK_STATES(DB_SHADER_CONTROL);
+    LINK_STATES(DB_RENDER_CONTROL);
+    LINK_STATES(DB_RENDER_OVERRIDE);
+    LINK_STATES(DB_HTILE_SURFACE);
+    LINK_STATES(DB_ALPHA_TO_MASK);
+
     // SX
     LINK_STATES(SX_MISC);
     LINK_STATES(SX_ALPHA_TEST_CONTROL);
@@ -322,7 +335,7 @@ void r700SetupVTXConstants(GLcontext  * ctx,
     unsigned int uSQ_VTX_CONSTANT_WORD6_0 = 0;
 
     if (!paos->bo)
-	    return GL_FALSE;
+	    return;
 
     if ((context->radeon.radeonScreen->chip_family == CHIP_FAMILY_RV610) ||
 	(context->radeon.radeonScreen->chip_family == CHIP_FAMILY_RV620) ||
@@ -474,6 +487,18 @@ GLboolean r700SendContextStates(context_t *context)
     for(ui = 0; ui < R700_MAX_SHADER_EXPORTS; ui++)
 	    R600_OUT_BATCH(r700->SPI_PS_INPUT_CNTL[ui].u32All);
     END_BATCH();
+
+    if (context->radeon.radeonScreen->chip_family > CHIP_FAMILY_R600) {
+	    for (ui = 0; ui < R700_MAX_RENDER_TARGETS; ui++) {
+		    if (r700->render_target[ui].enabled) {
+			    BEGIN_BATCH_NO_AUTOSTATE(3);
+			    R600_OUT_BATCH_REGVAL(CB_BLEND0_CONTROL + (4 * ui),
+						  r700->render_target[ui].CB_BLEND0_CONTROL.u32All);
+			    END_BATCH();
+		    }
+	    }
+    }
+
     COMMIT_BATCH();
 
     return GL_TRUE;
@@ -491,38 +516,25 @@ GLboolean r700SendDepthTargetState(context_t *context)
 		return GL_FALSE;
 	}
 
-        BEGIN_BATCH_NO_AUTOSTATE(9);
+        BEGIN_BATCH_NO_AUTOSTATE(8);
 	R600_OUT_BATCH_REGSEQ(DB_DEPTH_SIZE, 2);
 	R600_OUT_BATCH(r700->DB_DEPTH_SIZE.u32All);
 	R600_OUT_BATCH(r700->DB_DEPTH_VIEW.u32All);
-	R600_OUT_BATCH_REGSEQ(DB_DEPTH_BASE, 3);
+	R600_OUT_BATCH_REGSEQ(DB_DEPTH_BASE, 2);
 	R600_OUT_BATCH_RELOC(r700->DB_DEPTH_BASE.u32All,
 			     rrb->bo,
 			     r700->DB_DEPTH_BASE.u32All,
 			     0, RADEON_GEM_DOMAIN_VRAM, 0);
 	R600_OUT_BATCH(r700->DB_DEPTH_INFO.u32All);
-	R600_OUT_BATCH(r700->DB_HTILE_DATA_BASE.u32All);
         END_BATCH();
 
-        BEGIN_BATCH_NO_AUTOSTATE(24);
-	R600_OUT_BATCH_REGSEQ(DB_STENCIL_CLEAR, 2);
-	R600_OUT_BATCH(r700->DB_STENCIL_CLEAR.u32All);
-	R600_OUT_BATCH(r700->DB_DEPTH_CLEAR.u32All);
-
-	R600_OUT_BATCH_REGSEQ(DB_STENCILREFMASK, 2);
-	R600_OUT_BATCH(r700->DB_STENCILREFMASK.u32All);
-	R600_OUT_BATCH(r700->DB_STENCILREFMASK_BF.u32All);
-
-	R600_OUT_BATCH_REGVAL(DB_DEPTH_CONTROL, r700->DB_DEPTH_CONTROL.u32All);
-	R600_OUT_BATCH_REGVAL(DB_SHADER_CONTROL, r700->DB_SHADER_CONTROL.u32All);
-
-	R600_OUT_BATCH_REGSEQ(DB_RENDER_CONTROL, 2);
-	R600_OUT_BATCH(r700->DB_RENDER_CONTROL.u32All);
-	R600_OUT_BATCH(r700->DB_RENDER_OVERRIDE.u32All);
-
-	R600_OUT_BATCH_REGVAL(DB_HTILE_SURFACE, r700->DB_HTILE_SURFACE.u32All);
-	R600_OUT_BATCH_REGVAL(DB_ALPHA_TO_MASK, r700->DB_ALPHA_TO_MASK.u32All);
-        END_BATCH();
+	if ((context->radeon.radeonScreen->chip_family > CHIP_FAMILY_R600) &&
+	    (context->radeon.radeonScreen->chip_family < CHIP_FAMILY_RV770)) {
+		BEGIN_BATCH_NO_AUTOSTATE(2);
+		R600_OUT_BATCH(CP_PACKET3(R600_IT_SURFACE_BASE_UPDATE, 0));
+		R600_OUT_BATCH(1 << 0);
+		END_BATCH();
+	}
 
 	COMMIT_BATCH();
 
@@ -575,12 +587,6 @@ GLboolean r700SendRenderTargetState(context_t *context, int id)
 	R600_OUT_BATCH_REGVAL(CB_COLOR0_MASK + (4 * id), r700->render_target[id].CB_COLOR0_MASK.u32All);
         END_BATCH();
 
-	if (context->radeon.radeonScreen->chip_family > CHIP_FAMILY_R600) {
-		BEGIN_BATCH_NO_AUTOSTATE(3);
-		R600_OUT_BATCH_REGVAL(CB_BLEND0_CONTROL + (4 * id), r700->render_target[id].CB_BLEND0_CONTROL.u32All);
-		END_BATCH();
-	}
-
 	COMMIT_BATCH();
 
 	r700SyncSurf(context, rrb->bo, 0, RADEON_GEM_DOMAIN_VRAM,
diff --git a/src/mesa/drivers/dri/r600/r700_clear.c b/src/mesa/drivers/dri/r600/r700_clear.c
index e84be386221..05d4af331e0 100644
--- a/src/mesa/drivers/dri/r600/r700_clear.c
+++ b/src/mesa/drivers/dri/r600/r700_clear.c
@@ -31,6 +31,7 @@
 #include "main/imports.h"
 #include "main/mtypes.h"
 #include "main/enums.h"
+#include "swrast/swrast.h"
 
 #include "radeon_lock.h"
 #include "r600_context.h"
diff --git a/src/mesa/drivers/dri/r600/r700_fragprog.c b/src/mesa/drivers/dri/r600/r700_fragprog.c
index 4ac37f1dfe8..6249bde6f18 100644
--- a/src/mesa/drivers/dri/r600/r700_fragprog.c
+++ b/src/mesa/drivers/dri/r600/r700_fragprog.c
@@ -55,6 +55,12 @@ void Map_Fragment_Program(r700_AssemblerBase         *pAsm,
 //Input mapping : mesa_fp->Base.InputsRead set the flag, set in 
 	//The flags parsed in parse_attrib_binding. FRAG_ATTRIB_COLx, FRAG_ATTRIB_TEXx, ...
 	//MUST match order in Map_Vertex_Output
+	unBit = 1 << FRAG_ATTRIB_WPOS;
+	if(mesa_fp->Base.InputsRead & unBit)
+	{
+		pAsm->uiFP_AttributeMap[FRAG_ATTRIB_WPOS] = pAsm->number_used_registers++;
+	}
+
 	unBit = 1 << FRAG_ATTRIB_COL0;
 	if(mesa_fp->Base.InputsRead & unBit)
 	{
@@ -112,6 +118,7 @@ void Map_Fragment_Program(r700_AssemblerBase         *pAsm,
 		pAsm->uiFP_OutputMap[FRAG_RESULT_DEPTH] = pAsm->number_used_registers++;
 		pAsm->number_of_exports++;
 		pAsm->number_of_colorandz_exports++;
+		pAsm->pR700Shader->depthIsExported = 1;
 	}
 
     pAsm->pucOutMask = (unsigned char*) MALLOC(pAsm->number_of_exports);    
@@ -263,11 +270,13 @@ GLboolean r700SetupFragmentProgram(GLcontext * ctx)
 {
     context_t *context = R700_CONTEXT(ctx);
     R700_CHIP_CONTEXT *r700 = (R700_CHIP_CONTEXT*)(&context->hw);
-
+    BATCH_LOCALS(&context->radeon);
     struct r700_fragment_program *fp = (struct r700_fragment_program *)
 	                                   (ctx->FragmentProgram._Current);
     r700_AssemblerBase         *pAsm = &(fp->r700AsmCode);
     struct gl_fragment_program *mesa_fp = &(fp->mesa_program);
+    struct gl_program_parameter_list *paramList;
+    unsigned int unNumParamData;
     unsigned int ui, i;
     unsigned int unNumOfReg;
     unsigned int unBit;
@@ -302,6 +311,16 @@ GLboolean r700SetupFragmentProgram(GLcontext * ctx)
 
     ui = (r700->SPI_PS_IN_CONTROL_0.u32All & NUM_INTERP_mask) / (1 << NUM_INTERP_shift);
 
+    /* PS uses fragment.position */
+    if (mesa_fp->Base.InputsRead & (1 << FRAG_ATTRIB_WPOS))
+    {
+        ui += 1;
+        SETfield(r700->SPI_PS_IN_CONTROL_0.u32All, ui, NUM_INTERP_shift, NUM_INTERP_mask);
+        SETfield(r700->SPI_PS_IN_CONTROL_0.u32All, CENTERS_ONLY, BARYC_SAMPLE_CNTL_shift, BARYC_SAMPLE_CNTL_mask);
+        SETbit(r700->SPI_PS_IN_CONTROL_0.u32All, POSITION_ENA_bit);
+        SETbit(r700->SPI_INPUT_Z.u32All, PROVIDE_Z_TO_SPI_bit);
+    }
+
     ui = (unNumOfReg < ui) ? ui : unNumOfReg;
 
     SETfield(r700->ps.SQ_PGM_RESOURCES_PS.u32All, ui, NUM_GPRS_shift, NUM_GPRS_mask); 
@@ -335,7 +354,49 @@ GLboolean r700SetupFragmentProgram(GLcontext * ctx)
         CLEARbit(r700->DB_SHADER_CONTROL.u32All, Z_EXPORT_ENABLE_bit);
     }
 
+    /* sent out shader constants. */
+    paramList = fp->mesa_program.Base.Parameters;
+
+    if(NULL != paramList)
+    {
+        _mesa_load_state_parameters(ctx, paramList);
+
+        unNumParamData = paramList->NumParameters * 4;
+
+        BEGIN_BATCH_NO_AUTOSTATE(2 + unNumParamData);
+
+        R600_OUT_BATCH(CP_PACKET3(R600_IT_SET_ALU_CONST, unNumParamData));
+
+        /* assembler map const from very beginning. */
+        R600_OUT_BATCH(SQ_ALU_CONSTANT_PS_OFFSET * 4);
+
+        unNumParamData = paramList->NumParameters;
+
+        for(ui=0; ui<unNumParamData; ui++)
+        {
+            R600_OUT_BATCH(*((unsigned int*)&(paramList->ParameterValues[ui][0])));
+            R600_OUT_BATCH(*((unsigned int*)&(paramList->ParameterValues[ui][1])));
+            R600_OUT_BATCH(*((unsigned int*)&(paramList->ParameterValues[ui][2])));
+            R600_OUT_BATCH(*((unsigned int*)&(paramList->ParameterValues[ui][3])));
+        }
+        END_BATCH();
+        COMMIT_BATCH();
+    }
+
     // emit ps input map
+    unBit = 1 << FRAG_ATTRIB_WPOS;
+    if(mesa_fp->Base.InputsRead & unBit)
+    {
+            ui = pAsm->uiFP_AttributeMap[FRAG_ATTRIB_WPOS];
+            SETbit(r700->SPI_PS_INPUT_CNTL[ui].u32All, SEL_CENTROID_bit);
+            SETfield(r700->SPI_PS_INPUT_CNTL[ui].u32All, ui,
+                     SEMANTIC_shift, SEMANTIC_mask);
+            if (r700->SPI_INTERP_CONTROL_0.u32All & FLAT_SHADE_ENA_bit)
+                    SETbit(r700->SPI_PS_INPUT_CNTL[ui].u32All, FLAT_SHADE_bit);
+            else
+                    CLEARbit(r700->SPI_PS_INPUT_CNTL[ui].u32All, FLAT_SHADE_bit);
+    }
+
     unBit = 1 << FRAG_ATTRIB_COL0;
     if(mesa_fp->Base.InputsRead & unBit)
     {
@@ -391,45 +452,3 @@ GLboolean r700SetupFragmentProgram(GLcontext * ctx)
     return GL_TRUE;
 }
 
-GLboolean r700SendPSConstants(GLcontext * ctx)
-{
-    context_t *context = R700_CONTEXT(ctx);
-    BATCH_LOCALS(&context->radeon);
-    struct r700_fragment_program *fp = (struct r700_fragment_program *)
-	                                   (ctx->FragmentProgram._Current);
-    struct gl_program_parameter_list *paramList;
-    unsigned int unNumParamData;
-    unsigned int ui;
-
-    /* sent out shader constants. */
-    paramList = fp->mesa_program.Base.Parameters;
-
-    if(NULL != paramList)
-    {
-        _mesa_load_state_parameters(ctx, paramList);
-
-        unNumParamData = paramList->NumParameters * 4;
-
-        BEGIN_BATCH_NO_AUTOSTATE(2 + unNumParamData);
-
-        R600_OUT_BATCH(CP_PACKET3(R600_IT_SET_ALU_CONST, unNumParamData));
-
-        /* assembler map const from very beginning. */
-        R600_OUT_BATCH(SQ_ALU_CONSTANT_PS_OFFSET * 4);
-
-        unNumParamData = paramList->NumParameters;
-
-        for(ui=0; ui<unNumParamData; ui++)
-        {
-            R600_OUT_BATCH(*((unsigned int*)&(paramList->ParameterValues[ui][0])));
-            R600_OUT_BATCH(*((unsigned int*)&(paramList->ParameterValues[ui][1])));
-            R600_OUT_BATCH(*((unsigned int*)&(paramList->ParameterValues[ui][2])));
-            R600_OUT_BATCH(*((unsigned int*)&(paramList->ParameterValues[ui][3])));
-        }
-        END_BATCH();
-        COMMIT_BATCH();
-    }
-
-    return GL_TRUE;
-}
-
diff --git a/src/mesa/drivers/dri/r600/r700_oglprog.c b/src/mesa/drivers/dri/r600/r700_oglprog.c
index 36de143b1a7..c49b90c1cc9 100644
--- a/src/mesa/drivers/dri/r600/r700_oglprog.c
+++ b/src/mesa/drivers/dri/r600/r700_oglprog.c
@@ -33,6 +33,7 @@
 #include "tnl/tnl.h"
 
 #include "r600_context.h"
+#include "r600_emit.h"
 
 #include "r700_oglprog.h"
 #include "r700_fragprog.h"
@@ -87,7 +88,6 @@ static void r700DeleteProgram(GLcontext * ctx, struct gl_program *prog)
 {
     struct r700_vertex_program   * vp;
     struct r700_fragment_program * fp;
-    context_t *context = R700_CONTEXT(ctx);
 
     switch (prog->Target) 
     {
diff --git a/src/mesa/drivers/dri/r600/r700_render.c b/src/mesa/drivers/dri/r600/r700_render.c
index 2592d7df148..6705dbcf4b9 100644
--- a/src/mesa/drivers/dri/r600/r700_render.c
+++ b/src/mesa/drivers/dri/r600/r700_render.c
@@ -139,6 +139,13 @@ static GLboolean r700SetupShaders(GLcontext * ctx)
 
     r600UpdateTextureState(ctx);
 
+    r700SendFSState(context); // FIXME just a place holder for now
+    r700SendPSState(context);
+    r700SendVSState(context);
+
+    r700SendTextureState(context);
+    r700SetupStreams(ctx);
+
     return GL_TRUE;
 }
 
@@ -274,20 +281,15 @@ static void r700RunRenderPrimitive(GLcontext * ctx, int start, int end, int prim
 void r700EmitState(GLcontext * ctx)
 {
 	context_t *context = R700_CONTEXT(ctx);
+	radeonContextPtr radeon = &context->radeon;
+
+	if (radeon->cmdbuf.cs->cdw && !radeon->hw.is_dirty && !radeon->hw.all_dirty)
+		return;
 
 	rcommonEnsureCmdBufSpace(&context->radeon,
 				 context->radeon.hw.max_state_size, __FUNCTION__);
 
-	r700Start3D(context);
 	r700SendSQConfig(context);
-	r700SendFSState(context); // FIXME just a place holder for now
-	r700SendPSState(context);
-	r700SendVSState(context);
-	r700SendVSConstants(ctx);
-	r700SendPSConstants(ctx);
-
-	r700SendTextureState(context);
-	r700SetupStreams(ctx);
 
 	r700SendUCPState(context);
 	r700SendContextStates(context);
@@ -305,13 +307,12 @@ static GLboolean r700RunRender(GLcontext * ctx,
     TNLcontext *tnl = TNL_CONTEXT(ctx);
     struct vertex_buffer *vb = &tnl->vb;
 
+    r700Start3D(context);
+
     r700UpdateShaders(ctx);
     r700SetScissor(context);
     r700SetupShaders(ctx);
 
-    r700SetRenderTarget(context, 0);
-    r700SetDepthTarget(context);
-
     r700EmitState(ctx);
 
     /* richard test code */
@@ -327,8 +328,6 @@ static GLboolean r700RunRender(GLcontext * ctx,
 
     radeonReleaseArrays(ctx, ~0);
 
-    rcommonFlushCmdBuf( &context->radeon, __FUNCTION__ );
-
     return GL_FALSE;
 }
 
diff --git a/src/mesa/drivers/dri/r600/r700_state.c b/src/mesa/drivers/dri/r600/r700_state.c
index e0a57425917..835b5e18c2c 100644
--- a/src/mesa/drivers/dri/r600/r700_state.c
+++ b/src/mesa/drivers/dri/r600/r700_state.c
@@ -60,6 +60,8 @@ static void r700SetClipPlaneState(GLcontext * ctx, GLenum cap, GLboolean state);
 static void r700UpdatePolygonMode(GLcontext * ctx);
 static void r700SetPolygonOffsetState(GLcontext * ctx, GLboolean state);
 static void r700SetStencilState(GLcontext * ctx, GLboolean state);
+static void r700SetRenderTarget(context_t *context, int id);
+static void r700SetDepthTarget(context_t *context);
 
 void r700SetDefaultStates(context_t *context) //--------------------
 {
@@ -158,29 +160,16 @@ void r700UpdateViewportOffset(GLcontext * ctx) //------------------
  */
 void r700UpdateDrawBuffer(GLcontext * ctx) /* TODO */ //---------------------
 {
-#if 0 /* to be enabled */
-    context_t *context = R700_CONTEXT(ctx);
+	context_t *context = R700_CONTEXT(ctx);
 
-    switch (ctx->DrawBuffer->_ColorDrawBufferIndexes[0]) 
-    {
-	case BUFFER_FRONT_LEFT:
-	    context->target.rt = context->screen->frontBuffer;
-	    break;
-	case BUFFER_BACK_LEFT:
-	    context->target.rt = context->screen->backBuffer;
-	    break;
-	default:
-	    memset (&context->target.rt, sizeof(context->target.rt), 0);
-	}
-#endif /* to be enabled */
+	r700SetRenderTarget(context, 0);
+	r700SetDepthTarget(context);
 }
 
 static void r700FetchStateParameter(GLcontext * ctx,
 			                        const gl_state_index state[STATE_LENGTH],
 			                        GLfloat * value)
 {
-	context_t *context = R700_CONTEXT(ctx);
-
     /* TODO */
 }
 
@@ -600,14 +589,46 @@ static void r700BlendFuncSeparate(GLcontext * ctx,
 
 /**
  * Translate LogicOp enums into hardware representation.
- * Both use a very logical bit-wise layout, but unfortunately the order
- * of bits is reversed.
  */
 static GLuint translate_logicop(GLenum logicop)
 {
-	GLuint bits = logicop - GL_CLEAR;
-	bits = ((bits & 1) << 3) | ((bits & 2) << 1) | ((bits & 4) >> 1) | ((bits & 8) >> 3);
-	return bits;
+	switch (logicop) {
+	case GL_CLEAR:
+		return 0x00;
+	case GL_SET:
+		return 0xff;
+	case GL_COPY:
+		return 0xcc;
+	case GL_COPY_INVERTED:
+		return 0x33;
+	case GL_NOOP:
+		return 0xaa;
+	case GL_INVERT:
+		return 0x55;
+	case GL_AND:
+		return 0x88;
+	case GL_NAND:
+		return 0x77;
+	case GL_OR:
+		return 0xee;
+	case GL_NOR:
+		return 0x11;
+	case GL_XOR:
+		return 0x66;
+	case GL_EQUIV:
+		return 0xaa;
+	case GL_AND_REVERSE:
+		return 0x44;
+	case GL_AND_INVERTED:
+		return 0x22;
+	case GL_OR_REVERSE:
+		return 0xdd;
+	case GL_OR_INVERTED:
+		return 0xbb;
+	default:
+		fprintf(stderr, "unknown blend logic operation %x\n", logicop);
+		return 0xcc;
+	}
 }
 
 /**
@@ -1327,22 +1348,22 @@ void r700SetScissor(context_t *context) //---------------
 	r700->viewport[id].enabled = GL_TRUE;
 }
 
-void r700SetRenderTarget(context_t *context, int id)
+static void r700SetRenderTarget(context_t *context, int id)
 {
     R700_CHIP_CONTEXT *r700 = (R700_CHIP_CONTEXT*)(&context->hw);
 
     struct radeon_renderbuffer *rrb;
     unsigned int nPitchInPixel;
 
+    rrb = radeon_get_colorbuffer(&context->radeon);
+    if (!rrb || !rrb->bo) {
+	    fprintf(stderr, "no rrb\n");
+	    return;
+    }
+
     /* screen/window/view */
     SETfield(r700->CB_TARGET_MASK.u32All, 0xF, (4 * id), TARGET0_ENABLE_mask);
 
-    rrb = radeon_get_colorbuffer(&context->radeon);
-	if (!rrb || !rrb->bo) {
-		fprintf(stderr, "no rrb\n");
-		return;
-	}
-
     /* color buffer */
     r700->render_target[id].CB_COLOR0_BASE.u32All = context->radeon.state.color.draw_offset;
 
@@ -1375,39 +1396,22 @@ void r700SetRenderTarget(context_t *context, int id)
     r700->render_target[id].enabled = GL_TRUE;
 }
 
-void r700SetDepthTarget(context_t *context)
+static void r700SetDepthTarget(context_t *context)
 {
     R700_CHIP_CONTEXT *r700 = (R700_CHIP_CONTEXT*)(&context->hw);
 
     struct radeon_renderbuffer *rrb;
     unsigned int nPitchInPixel;
 
+    rrb = radeon_get_depthbuffer(&context->radeon);
+    if (!rrb)
+	    return;
+
     /* depth buf */
     r700->DB_DEPTH_SIZE.u32All = 0;
     r700->DB_DEPTH_BASE.u32All = 0;
     r700->DB_DEPTH_INFO.u32All = 0;
-
-    r700->DB_DEPTH_CLEAR.u32All     = 0x3F800000;
-    r700->DB_DEPTH_VIEW.u32All      = 0;
-    r700->DB_RENDER_CONTROL.u32All  = 0;
-    SETbit(r700->DB_RENDER_CONTROL.u32All, STENCIL_COMPRESS_DISABLE_bit);
-    SETbit(r700->DB_RENDER_CONTROL.u32All, DEPTH_COMPRESS_DISABLE_bit);
-    r700->DB_RENDER_OVERRIDE.u32All = 0;
-    if (context->radeon.radeonScreen->chip_family < CHIP_FAMILY_RV770)
-	    SETbit(r700->DB_RENDER_OVERRIDE.u32All, FORCE_SHADER_Z_ORDER_bit);
-    SETfield(r700->DB_RENDER_OVERRIDE.u32All, FORCE_DISABLE, FORCE_HIZ_ENABLE_shift, FORCE_HIZ_ENABLE_mask);
-    SETfield(r700->DB_RENDER_OVERRIDE.u32All, FORCE_DISABLE, FORCE_HIS_ENABLE0_shift, FORCE_HIS_ENABLE0_mask);
-    SETfield(r700->DB_RENDER_OVERRIDE.u32All, FORCE_DISABLE, FORCE_HIS_ENABLE1_shift, FORCE_HIS_ENABLE1_mask);
-
-    r700->DB_ALPHA_TO_MASK.u32All = 0;
-    SETfield(r700->DB_ALPHA_TO_MASK.u32All, 2, ALPHA_TO_MASK_OFFSET0_shift, ALPHA_TO_MASK_OFFSET0_mask);
-    SETfield(r700->DB_ALPHA_TO_MASK.u32All, 2, ALPHA_TO_MASK_OFFSET1_shift, ALPHA_TO_MASK_OFFSET1_mask);
-    SETfield(r700->DB_ALPHA_TO_MASK.u32All, 2, ALPHA_TO_MASK_OFFSET2_shift, ALPHA_TO_MASK_OFFSET2_mask);
-    SETfield(r700->DB_ALPHA_TO_MASK.u32All, 2, ALPHA_TO_MASK_OFFSET3_shift, ALPHA_TO_MASK_OFFSET3_mask);
-
-    rrb = radeon_get_depthbuffer(&context->radeon);
-	if (!rrb)
-		return;
+    r700->DB_DEPTH_VIEW.u32All = 0;
 
     nPitchInPixel = rrb->pitch/rrb->cpp;
 
@@ -1757,6 +1761,24 @@ void r700InitState(GLcontext * ctx) //-------------------
     r700DepthFunc(ctx, ctx->Depth.Func);
     SETbit(r700->DB_SHADER_CONTROL.u32All, DUAL_EXPORT_ENABLE_bit);
 
+    r700->DB_DEPTH_CLEAR.u32All     = 0x3F800000;
+
+    r700->DB_RENDER_CONTROL.u32All  = 0;
+    SETbit(r700->DB_RENDER_CONTROL.u32All, STENCIL_COMPRESS_DISABLE_bit);
+    SETbit(r700->DB_RENDER_CONTROL.u32All, DEPTH_COMPRESS_DISABLE_bit);
+    r700->DB_RENDER_OVERRIDE.u32All = 0;
+    if (context->radeon.radeonScreen->chip_family < CHIP_FAMILY_RV770)
+	    SETbit(r700->DB_RENDER_OVERRIDE.u32All, FORCE_SHADER_Z_ORDER_bit);
+    SETfield(r700->DB_RENDER_OVERRIDE.u32All, FORCE_DISABLE, FORCE_HIZ_ENABLE_shift, FORCE_HIZ_ENABLE_mask);
+    SETfield(r700->DB_RENDER_OVERRIDE.u32All, FORCE_DISABLE, FORCE_HIS_ENABLE0_shift, FORCE_HIS_ENABLE0_mask);
+    SETfield(r700->DB_RENDER_OVERRIDE.u32All, FORCE_DISABLE, FORCE_HIS_ENABLE1_shift, FORCE_HIS_ENABLE1_mask);
+
+    r700->DB_ALPHA_TO_MASK.u32All = 0;
+    SETfield(r700->DB_ALPHA_TO_MASK.u32All, 2, ALPHA_TO_MASK_OFFSET0_shift, ALPHA_TO_MASK_OFFSET0_mask);
+    SETfield(r700->DB_ALPHA_TO_MASK.u32All, 2, ALPHA_TO_MASK_OFFSET1_shift, ALPHA_TO_MASK_OFFSET1_mask);
+    SETfield(r700->DB_ALPHA_TO_MASK.u32All, 2, ALPHA_TO_MASK_OFFSET2_shift, ALPHA_TO_MASK_OFFSET2_mask);
+    SETfield(r700->DB_ALPHA_TO_MASK.u32All, 2, ALPHA_TO_MASK_OFFSET3_shift, ALPHA_TO_MASK_OFFSET3_mask);
+
     /* stencil */
     r700Enable(ctx, GL_STENCIL_TEST, ctx->Stencil._Enabled);
     r700StencilMaskSeparate(ctx, 0, ctx->Stencil.WriteMask[0]);
@@ -1822,6 +1844,8 @@ void r700InitState(GLcontext * ctx) //-------------------
     /* Set up color compare mask */
     r700->CB_CLRCMP_MSK.u32All = 0xFFFFFFFF;
 
+    context->radeon.hw.all_dirty = GL_TRUE;
+
 }
 
 void r700InitStateFuncs(struct dd_function_table *functions) //-----------------
diff --git a/src/mesa/drivers/dri/r600/r700_state.h b/src/mesa/drivers/dri/r600/r700_state.h
index 23246367db8..30eb54e8b0a 100644
--- a/src/mesa/drivers/dri/r600/r700_state.h
+++ b/src/mesa/drivers/dri/r600/r700_state.h
@@ -42,10 +42,8 @@ extern void r700UpdateDrawBuffer (GLcontext * ctx);
 extern void r700InitState (GLcontext * ctx);
 extern void r700InitStateFuncs (struct dd_function_table *functions);
 
-extern void r700SetRenderTarget(context_t *context, int id);
 extern void r700SetDefaultStates(context_t * context);
 
-void r700SetScissor(context_t *context);
-void r700SetDepthTarget(context_t *context);
+extern void r700SetScissor(context_t *context);
 
 #endif	/* _R600_SCREEN_H */
diff --git a/src/mesa/drivers/dri/r600/r700_vertprog.c b/src/mesa/drivers/dri/r600/r700_vertprog.c
index 1c5c20f66e9..31e71cdfa30 100644
--- a/src/mesa/drivers/dri/r600/r700_vertprog.c
+++ b/src/mesa/drivers/dri/r600/r700_vertprog.c
@@ -336,10 +336,14 @@ GLboolean r700SetupVertexProgram(GLcontext * ctx)
 {
     context_t *context = R700_CONTEXT(ctx);
     R700_CHIP_CONTEXT *r700 = (R700_CHIP_CONTEXT*)(&context->hw);
-
+    BATCH_LOCALS(&context->radeon);
     struct r700_vertex_program *vp
              = (struct r700_vertex_program *)ctx->VertexProgram._Current;
 
+    struct gl_program_parameter_list *paramList;
+    unsigned int unNumParamData;
+    unsigned int ui;
+
     if(GL_FALSE == vp->loaded)
     {
         if(vp->r700Shader.bNeedsAssembly == GL_TRUE)
@@ -385,21 +389,7 @@ GLboolean r700SetupVertexProgram(GLcontext * ctx)
     CLEARbit(r700->SPI_PS_IN_CONTROL_0.u32All, LINEAR_GRADIENT_ENA_bit);
     */
 
-    return GL_TRUE;
-}
-
-GLboolean r700SendVSConstants(GLcontext * ctx)
-{
-    context_t *context = R700_CONTEXT(ctx);
-    BATCH_LOCALS(&context->radeon);
-    struct r700_vertex_program *vp
-             = (struct r700_vertex_program *)ctx->VertexProgram._Current;
-    struct gl_program_parameter_list *paramList;
-    unsigned int unNumParamData;
-    unsigned int ui;
-
     /* sent out shader constants. */
-
     paramList = vp->mesa_program.Base.Parameters;
 
     if(NULL != paramList)
diff --git a/src/mesa/drivers/dri/radeon/radeon_bo_drm.h b/src/mesa/drivers/dri/radeon/radeon_bo_drm.h
index 8eeaea1cb20..d52fb017d8a 100644
--- a/src/mesa/drivers/dri/radeon/radeon_bo_drm.h
+++ b/src/mesa/drivers/dri/radeon/radeon_bo_drm.h
@@ -83,6 +83,10 @@ struct radeon_bo_funcs {
     int (*bo_unmap)(struct radeon_bo *bo);
     int (*bo_wait)(struct radeon_bo *bo);
     int (*bo_is_static)(struct radeon_bo *bo);
+    int (*bo_set_tiling)(struct radeon_bo *bo, uint32_t tiling_flags,
+			  uint32_t pitch);
+    int (*bo_get_tiling)(struct radeon_bo *bo, uint32_t *tiling_flags,
+			  uint32_t *pitch);
 };
 
 struct radeon_bo_manager {
@@ -187,6 +191,18 @@ static inline int _radeon_bo_wait(struct radeon_bo *bo,
     return bo->bom->funcs->bo_wait(bo);
 }
 
+static inline int radeon_bo_set_tiling(struct radeon_bo *bo,
+				       uint32_t tiling_flags, uint32_t pitch)
+{
+    return bo->bom->funcs->bo_set_tiling(bo, tiling_flags, pitch);
+}
+
+static inline int radeon_bo_get_tiling(struct radeon_bo *bo,
+				       uint32_t *tiling_flags, uint32_t *pitch)
+{
+    return bo->bom->funcs->bo_get_tiling(bo, tiling_flags, pitch);
+}
+
 static inline int radeon_bo_is_static(struct radeon_bo *bo)
 {
 	if (bo->bom->funcs->bo_is_static)
diff --git a/src/mesa/drivers/dri/radeon/radeon_bo_legacy.c b/src/mesa/drivers/dri/radeon/radeon_bo_legacy.c
index 992eb4611b1..d6d22cb4c37 100644
--- a/src/mesa/drivers/dri/radeon/radeon_bo_legacy.c
+++ b/src/mesa/drivers/dri/radeon/radeon_bo_legacy.c
@@ -577,6 +577,8 @@ static struct radeon_bo_funcs bo_legacy_funcs = {
     bo_unmap,
     NULL,
     bo_is_static,
+    NULL,
+    NULL,
 };
 
 static int bo_vram_validate(struct radeon_bo *bo,
@@ -622,12 +624,34 @@ static int bo_vram_validate(struct radeon_bo *bo,
 
     if (bo_legacy->dirty || bo_legacy->tobj->base.dirty_images[0]) {
 	    if (IS_R600_CLASS(boml->screen)) {
-		    char *src = bo_legacy->ptr;
-		    char *dst = (char *) boml->screen->driScreen->pFB +
-			    (bo_legacy->offset - boml->fb_location);
+		    drm_radeon_texture_t tex;
+		    drm_radeon_tex_image_t tmp;
+		    int ret;
 
-		    /* FIXME: alignment, pitch, etc. */
-		    memcpy(dst, src, bo->size);
+		    tex.offset = bo_legacy->offset;
+		    tex.image = &tmp;
+		    assert(!(tex.offset & 1023));
+
+		    tmp.x = 0;
+		    tmp.y = 0;
+		    tmp.width = bo->size;
+		    tmp.height = 1;
+		    tmp.data = bo_legacy->ptr;
+		    tex.format = RADEON_TXFORMAT_ARGB8888;
+		    tex.width = tmp.width;
+		    tex.height = tmp.height;
+		    tex.pitch = bo->size;
+		    do {
+			    ret = drmCommandWriteRead(bo->bom->fd,
+						      DRM_RADEON_TEXTURE,
+						      &tex,
+						      sizeof(drm_radeon_texture_t));
+			    if (ret) {
+				    if (RADEON_DEBUG & DEBUG_IOCTL)
+					    fprintf(stderr, "DRM_RADEON_TEXTURE:  again!\n");
+				    usleep(1);
+			    }
+		    } while (ret == -EAGAIN);
 	    } else {
 		    /* Copy to VRAM using a blit.
 		     * All memory is 4K aligned. We're using 1024 pixels wide blits.
@@ -904,3 +928,32 @@ unsigned radeon_bo_legacy_relocs_size(struct radeon_bo *bo)
     return bo->size;
 }
 
+/*
+ * Fake up a bo for things like texture image_override.
+ * bo->offset already includes fb_location
+ */
+struct radeon_bo *radeon_legacy_bo_alloc_fake(struct radeon_bo_manager *bom,
+					      int size,
+	                                      uint32_t offset)
+{
+    struct bo_manager_legacy *boml = (struct bo_manager_legacy *)bom;
+    struct bo_legacy *bo;
+
+#ifdef RADEON_DEBUG_BO
+    bo = bo_allocate(boml, size, 0, RADEON_GEM_DOMAIN_VRAM, 0, "fake bo");
+#else
+    bo = bo_allocate(boml, size, 0, RADEON_GEM_DOMAIN_VRAM, 0);
+#endif /* RADEON_DEBUG_BO */
+    if (bo == NULL)
+	return NULL;
+    bo->static_bo = 1;
+    bo->offset = offset;
+    bo->base.handle = bo->offset;
+    bo->ptr = boml->screen->driScreen->pFB + (offset - boml->fb_location);
+    if (bo->base.handle > boml->nhandle) {
+        boml->nhandle = bo->base.handle + 1;
+    }
+    radeon_bo_ref(&(bo->base));
+    return &(bo->base);
+}
+
diff --git a/src/mesa/drivers/dri/radeon/radeon_bo_legacy.h b/src/mesa/drivers/dri/radeon/radeon_bo_legacy.h
index 0db817cab07..455adebc099 100644
--- a/src/mesa/drivers/dri/radeon/radeon_bo_legacy.h
+++ b/src/mesa/drivers/dri/radeon/radeon_bo_legacy.h
@@ -42,5 +42,8 @@ struct radeon_bo_manager *radeon_bo_manager_legacy_ctor(struct radeon_screen *sc
 void radeon_bo_manager_legacy_dtor(struct radeon_bo_manager *bom);
 void radeon_bo_legacy_texture_age(struct radeon_bo_manager *bom);
 unsigned radeon_bo_legacy_relocs_size(struct radeon_bo *bo);
+struct radeon_bo *radeon_legacy_bo_alloc_fake(struct radeon_bo_manager *bom,
+					      int size,
+	                                      uint32_t offset);
 
 #endif
diff --git a/src/mesa/drivers/dri/radeon/radeon_bocs_wrapper.h b/src/mesa/drivers/dri/radeon/radeon_bocs_wrapper.h
index e0c70dd9a11..a42870f4a93 100644
--- a/src/mesa/drivers/dri/radeon/radeon_bocs_wrapper.h
+++ b/src/mesa/drivers/dri/radeon/radeon_bocs_wrapper.h
@@ -15,6 +15,12 @@
 #define RADEON_GEM_DOMAIN_GTT 0x2   // GTT or cache flushed
 #define RADEON_GEM_DOMAIN_VRAM 0x4  // VRAM domain
 
+#define RADEON_TILING_MACRO 0x1
+#define RADEON_TILING_MICRO 0x2
+#define RADEON_TILING_SWAP 0x4
+#define RADEON_TILING_SURFACE 0x8 /* this object requires a surface
+				   * when mapped - i.e. front buffer */
+
 /* to be used to build locally in mesa with no libdrm bits */
 #include "../radeon/radeon_bo_drm.h"
 #include "../radeon/radeon_cs_drm.h"
diff --git a/src/mesa/drivers/dri/radeon/radeon_chipset.h b/src/mesa/drivers/dri/radeon/radeon_chipset.h
index 0a6a2df35b8..a275c8fb143 100644
--- a/src/mesa/drivers/dri/radeon/radeon_chipset.h
+++ b/src/mesa/drivers/dri/radeon/radeon_chipset.h
@@ -356,11 +356,14 @@
 #define PCI_CHIP_RV770_947A             0x947A
 #define PCI_CHIP_RV770_947B             0x947B
 
+#define PCI_CHIP_RV730_9480             0x9480
 #define PCI_CHIP_RV730_9487             0x9487
+#define PCI_CHIP_RV730_9488             0x9488
 #define PCI_CHIP_RV730_9489             0x9489
 #define PCI_CHIP_RV730_948F             0x948F
 #define PCI_CHIP_RV730_9490             0x9490
 #define PCI_CHIP_RV730_9491             0x9491
+#define PCI_CHIP_RV730_9495             0x9495
 #define PCI_CHIP_RV730_9498             0x9498
 #define PCI_CHIP_RV730_949C             0x949C
 #define PCI_CHIP_RV730_949E             0x949E
@@ -374,12 +377,16 @@
 #define PCI_CHIP_RV710_9552             0x9552
 #define PCI_CHIP_RV710_9553             0x9553
 #define PCI_CHIP_RV710_9555             0x9555
+#define PCI_CHIP_RV710_9557             0x9557
 
 #define PCI_CHIP_RV740_94A0             0x94A0
 #define PCI_CHIP_RV740_94A1             0x94A1
+#define PCI_CHIP_RV740_94A3             0x94A3
 #define PCI_CHIP_RV740_94B1             0x94B1
 #define PCI_CHIP_RV740_94B3             0x94B3
+#define PCI_CHIP_RV740_94B4             0x94B4
 #define PCI_CHIP_RV740_94B5             0x94B5
+#define PCI_CHIP_RV740_94B9             0x94B9
 
 enum {
    CHIP_FAMILY_R100,
diff --git a/src/mesa/drivers/dri/radeon/radeon_common.c b/src/mesa/drivers/dri/radeon/radeon_common.c
index 7f503a9ff71..330c2c8a86e 100644
--- a/src/mesa/drivers/dri/radeon/radeon_common.c
+++ b/src/mesa/drivers/dri/radeon/radeon_common.c
@@ -481,32 +481,6 @@ void radeonCopyBuffer( __DRIdrawablePrivate *dPriv,
 		if (!n)
 			continue;
 
-		if (IS_R600_CLASS(rmesa->radeonScreen)) {
-			int cpp = rmesa->radeonScreen->cpp;
-			int src_pitch = rmesa->radeonScreen->backPitch * cpp;
-			int dst_pitch = rmesa->radeonScreen->frontPitch * cpp;
-			char *src = (char *)rmesa->radeonScreen->driScreen->pFB + rmesa->radeonScreen->backOffset;
-			char *dst = (char *)rmesa->radeonScreen->driScreen->pFB + rmesa->radeonScreen->frontOffset;
-			int j;
-			drm_clip_rect_t *pb = rmesa->sarea->boxes;
-
-			for (j = 0; j < n; j++) {
-				int x = pb[j].x1;
-				int y = pb[j].y1;
-				int w = pb[j].x2 - x;
-				int h = pb[j].y2 - y;
-
-				src += (y * src_pitch) + (x * cpp);
-				dst += (y * dst_pitch) + (x * cpp);
-
-				while (h--) {
-					memcpy(dst, src, w * cpp);
-					src += src_pitch;
-					dst += dst_pitch;
-				}
-			}
-		}
-
 		ret = drmCommandNone( rmesa->dri.fd, DRM_RADEON_SWAP );
 
 		if ( ret ) {
@@ -1093,7 +1067,7 @@ void radeonFlush(GLcontext *ctx)
 			 * each of N places that do rendering.  This has worse performances,
 			 * but it is much easier to get correct.
 			 */
-			if (radeon->is_front_buffer_rendering) {
+			if (!radeon->is_front_buffer_rendering) {
 				radeon->front_buffer_dirty = GL_FALSE;
 			}
 		}
diff --git a/src/mesa/drivers/dri/radeon/radeon_common_context.c b/src/mesa/drivers/dri/radeon/radeon_common_context.c
index 2a017b59cfc..f71dc1cb233 100644
--- a/src/mesa/drivers/dri/radeon/radeon_common_context.c
+++ b/src/mesa/drivers/dri/radeon/radeon_common_context.c
@@ -211,6 +211,7 @@ GLboolean radeonInitContext(radeonContextPtr radeon,
 	radeon->dri.screen = sPriv;
 	radeon->dri.hwContext = driContextPriv->hHWContext;
 	radeon->dri.hwLock = &sPriv->pSAREA->lock;
+	radeon->dri.hwLockCount = 0;
 	radeon->dri.fd = sPriv->fd;
 	radeon->dri.drmMinor = sPriv->drm_version.minor;
 
@@ -294,11 +295,10 @@ void radeonDestroyContext(__DRIcontextPrivate *driContextPriv )
 	GET_CURRENT_CONTEXT(ctx);
 	radeonContextPtr radeon = (radeonContextPtr) driContextPriv->driverPrivate;
 	radeonContextPtr current = ctx ? RADEON_CONTEXT(ctx) : NULL;
-
-    /* +r6/r7 */
-    __DRIscreenPrivate *sPriv = driContextPriv->driScreenPriv;
+#if RADEON_COMMON && defined(RADEON_COMMON_FOR_R600) /* +r6/r7 */
+	__DRIscreenPrivate *sPriv = driContextPriv->driScreenPriv;
 	radeonScreenPtr screen = (radeonScreenPtr) (sPriv->private);
-    /* --------- */
+#endif
 
 	if (radeon == current) {
 		radeon_firevertices(radeon);
@@ -306,16 +306,7 @@ void radeonDestroyContext(__DRIcontextPrivate *driContextPriv )
 	}
 
 	assert(radeon);
-	if (radeon) 
-    {
-
-#if RADEON_COMMON && defined(RADEON_COMMON_FOR_R600) /* +r6/r7 */
-	    if (IS_R600_CLASS(screen))
-        {
-		r600DestroyContext(driContextPriv);
-        }
-#endif
-
+	if (radeon) {
 		if (radeon->dma.current) {
 			rcommonFlushCmdBuf( radeon, __FUNCTION__ );
 		}
@@ -762,8 +753,10 @@ radeon_update_renderbuffers(__DRIcontext *context, __DRIdrawable *drawable)
 			bo = depth_bo;
 			radeon_bo_ref(bo);
 		} else {
+			uint32_t tiling_flags = 0, pitch = 0;
+			int ret;
 #ifdef RADEON_DEBUG_BO
-            bo = radeon_bo_open(radeon->radeonScreen->bom,
+			bo = radeon_bo_open(radeon->radeonScreen->bom,
 						buffers[i].name,
 						0,
 						0,
@@ -784,6 +777,13 @@ radeon_update_renderbuffers(__DRIcontext *context, __DRIdrawable *drawable)
 					regname, buffers[i].name);
 
 			}
+
+			ret = radeon_bo_get_tiling(bo, &tiling_flags, &pitch);
+			if (tiling_flags & RADEON_TILING_MACRO)
+				bo->flags |= RADEON_BO_FLAGS_MACRO_TILE;
+			if (tiling_flags & RADEON_TILING_MICRO)
+				bo->flags |= RADEON_BO_FLAGS_MICRO_TILE;
+			
 		}
 
 		if (buffers[i].attachment == __DRI_BUFFER_DEPTH) {
diff --git a/src/mesa/drivers/dri/radeon/radeon_common_context.h b/src/mesa/drivers/dri/radeon/radeon_common_context.h
index d7e94a68949..f8e1a25c9fa 100644
--- a/src/mesa/drivers/dri/radeon/radeon_common_context.h
+++ b/src/mesa/drivers/dri/radeon/radeon_common_context.h
@@ -365,6 +365,7 @@ struct radeon_dri_mirror {
 
 	drm_context_t hwContext;
 	drm_hw_lock_t *hwLock;
+	int hwLockCount;
 	int fd;
 	int drmMinor;
 };
diff --git a/src/mesa/drivers/dri/radeon/radeon_fbo.c b/src/mesa/drivers/dri/radeon/radeon_fbo.c
index f28efa33e9a..f05b106aaf8 100644
--- a/src/mesa/drivers/dri/radeon/radeon_fbo.c
+++ b/src/mesa/drivers/dri/radeon/radeon_fbo.c
@@ -291,7 +291,7 @@ radeon_create_renderbuffer(GLenum format, __DRIdrawablePrivate *driDrawPriv)
 	    rrb->base.RedBits = 8;
 	    rrb->base.GreenBits = 8;
 	    rrb->base.BlueBits = 8;
-	    rrb->base.AlphaBits = 8;
+	    rrb->base.AlphaBits = 0;
 	    rrb->base.DataType = GL_UNSIGNED_BYTE;
 	    break;
 	case GL_RGBA8:
@@ -407,7 +407,7 @@ restart:
 		rrb->cpp = 2;
 		rrb->base._ActualFormat = GL_RGB5;
 		rrb->base._BaseFormat = GL_RGB;
-		rrb->base.DataType = GL_UNSIGNED_SHORT;
+		rrb->base.DataType = GL_UNSIGNED_BYTE;
 		DBG("Render to RGB5 texture OK\n");
 	}
 	else if (texImage->TexFormat == &_mesa_texformat_argb1555) {
diff --git a/src/mesa/drivers/dri/radeon/radeon_lock.c b/src/mesa/drivers/dri/radeon/radeon_lock.c
index 2f0ed1cfced..6294b7e42be 100644
--- a/src/mesa/drivers/dri/radeon/radeon_lock.c
+++ b/src/mesa/drivers/dri/radeon/radeon_lock.c
@@ -86,8 +86,34 @@ void radeonGetLock(radeonContextPtr rmesa, GLuint flags)
 
 	rmesa->vtbl.get_lock(rmesa);
 }
-
-void radeon_lock_hardware(radeonContextPtr radeon)
+#ifndef NDEBUG
+struct lock_debug {
+	const char* function;
+	const char* file;
+	int line;
+};
+
+static struct lock_debug ldebug = {0};
+#endif
+
+#if 0
+/** TODO: use atomic operations for reference counting **/
+/** gcc 4.2 has builtin functios for this **/
+#define ATOMIC_INC_AND_FETCH(atomic) __sync_add_and_fetch(&atomic, 1)
+#define ATOMIC_DEC_AND_FETCH(atomic) __sync_sub_and_fetch(&atomic, 1)
+#else
+#define ATOMIC_INC_AND_FETCH(atomic) (++atomic)
+#define ATOMIC_DEC_AND_FETCH(atomic) (--atomic)
+#endif
+
+
+void radeon_lock_hardware(radeonContextPtr radeon
+#ifndef NDEBUG
+		,const char* function
+		,const char* file
+		,const int line
+#endif
+		)
 {
 	char ret = 0;
 	struct radeon_framebuffer *rfb = NULL;
@@ -102,16 +128,39 @@ void radeon_lock_hardware(radeonContextPtr radeon)
 	}
 
 	if (!radeon->radeonScreen->driScreen->dri2.enabled) {
+		if (ATOMIC_INC_AND_FETCH(radeon->dri.hwLockCount) > 1)
+		{
+#ifndef NDEBUG
+			if ( RADEON_DEBUG & DEBUG_SANITY )
+				fprintf(stderr, "*** %d times of recursive call to %s ***\n"
+						"Original call was from %s (file: %s line: %d)\n"
+						"Now call is coming from %s (file: %s line: %d)\n"
+						, radeon->dri.hwLockCount, __FUNCTION__
+						, ldebug.function, ldebug.file, ldebug.line
+						, function, file, line
+					   );
+#endif
+			return;
+		}
 		DRM_CAS(radeon->dri.hwLock, radeon->dri.hwContext,
 			 (DRM_LOCK_HELD | radeon->dri.hwContext), ret );
 		if (ret)
 			radeonGetLock(radeon, 0);
+#ifndef NDEBUG
+		ldebug.function = function;
+		ldebug.file = file;
+		ldebug.line = line;
+#endif
 	}
 }
 
 void radeon_unlock_hardware(radeonContextPtr radeon)
 {
 	if (!radeon->radeonScreen->driScreen->dri2.enabled) {
+		if (ATOMIC_DEC_AND_FETCH(radeon->dri.hwLockCount) > 0)
+		{
+			return;
+		}
 		DRM_UNLOCK( radeon->dri.fd,
 			    radeon->dri.hwLock,
 			    radeon->dri.hwContext );
diff --git a/src/mesa/drivers/dri/radeon/radeon_lock.h b/src/mesa/drivers/dri/radeon/radeon_lock.h
index 2817709eed6..da5a5b43715 100644
--- a/src/mesa/drivers/dri/radeon/radeon_lock.h
+++ b/src/mesa/drivers/dri/radeon/radeon_lock.h
@@ -48,12 +48,22 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 
 extern void radeonGetLock(radeonContextPtr rmesa, GLuint flags);
 
-void radeon_lock_hardware(radeonContextPtr rmesa);
+void radeon_lock_hardware(radeonContextPtr rmesa
+#ifndef NDEBUG
+		,const char* function
+		,const char* file
+		,const int line
+#endif
+		);
 void radeon_unlock_hardware(radeonContextPtr rmesa);
 
 /* Lock the hardware and validate our state.
  */
+#ifdef NDEBUG
 #define LOCK_HARDWARE( rmesa )	radeon_lock_hardware(rmesa)
+#else
+#define LOCK_HARDWARE( rmesa )	radeon_lock_hardware(rmesa, __FUNCTION__, __FILE__, __LINE__)
+#endif
 #define UNLOCK_HARDWARE( rmesa )  radeon_unlock_hardware(rmesa)
 
 #endif
diff --git a/src/mesa/drivers/dri/radeon/radeon_mipmap_tree.c b/src/mesa/drivers/dri/radeon/radeon_mipmap_tree.c
index 071a18e7d86..d4082bf68f4 100644
--- a/src/mesa/drivers/dri/radeon/radeon_mipmap_tree.c
+++ b/src/mesa/drivers/dri/radeon/radeon_mipmap_tree.c
@@ -326,7 +326,8 @@ GLboolean radeon_miptree_matches_image(radeon_mipmap_tree *mt,
 	if (face >= mt->faces || level < mt->firstLevel || level > mt->lastLevel)
 		return GL_FALSE;
 
-	if (texImage->IsCompressed != mt->compressed)
+	if ((!texImage->IsCompressed && mt->compressed) ||
+	    (texImage->IsCompressed && !mt->compressed))
 		return GL_FALSE;
 
 	if (!texImage->IsCompressed &&
@@ -366,8 +367,8 @@ GLboolean radeon_miptree_matches_texture(radeon_mipmap_tree *mt, struct gl_textu
 	        mt->width0 == firstImage->Width &&
 	        mt->height0 == firstImage->Height &&
 	        mt->depth0 == firstImage->Depth &&
-	        mt->bpp == firstImage->TexFormat->TexelBytes &&
-	        mt->compressed == compressed);
+	        mt->compressed == compressed &&
+	        (!mt->compressed ? (mt->bpp == firstImage->TexFormat->TexelBytes) : 1));
 }
 
 
diff --git a/src/mesa/drivers/dri/radeon/radeon_screen.c b/src/mesa/drivers/dri/radeon/radeon_screen.c
index 5f1af5b0da5..7b759661ca7 100644
--- a/src/mesa/drivers/dri/radeon/radeon_screen.c
+++ b/src/mesa/drivers/dri/radeon/radeon_screen.c
@@ -878,11 +878,14 @@ static int radeon_set_screen_flags(radeonScreenPtr screen, int device_id)
       screen->chip_flags = RADEON_CHIPSET_TCL;
       break;
 
+   case PCI_CHIP_RV730_9480:
    case PCI_CHIP_RV730_9487:
+   case PCI_CHIP_RV730_9488:
    case PCI_CHIP_RV730_9489:
    case PCI_CHIP_RV730_948F:
    case PCI_CHIP_RV730_9490:
    case PCI_CHIP_RV730_9491:
+   case PCI_CHIP_RV730_9495:
    case PCI_CHIP_RV730_9498:
    case PCI_CHIP_RV730_949C:
    case PCI_CHIP_RV730_949E:
@@ -899,15 +902,19 @@ static int radeon_set_screen_flags(radeonScreenPtr screen, int device_id)
    case PCI_CHIP_RV710_9552:
    case PCI_CHIP_RV710_9553:
    case PCI_CHIP_RV710_9555:
+   case PCI_CHIP_RV710_9557:
       screen->chip_family = CHIP_FAMILY_RV710;
       screen->chip_flags = RADEON_CHIPSET_TCL;
       break;
 
    case PCI_CHIP_RV740_94A0:
    case PCI_CHIP_RV740_94A1:
+   case PCI_CHIP_RV740_94A3:
    case PCI_CHIP_RV740_94B1:
    case PCI_CHIP_RV740_94B3:
+   case PCI_CHIP_RV740_94B4:
    case PCI_CHIP_RV740_94B5:
+   case PCI_CHIP_RV740_94B9:
       screen->chip_family = CHIP_FAMILY_RV740;
       screen->chip_flags = RADEON_CHIPSET_TCL;
       break;
@@ -1579,21 +1586,11 @@ static GLboolean radeonCreateContext(const __GLcontextModes * glVisual,
 {
 	__DRIscreenPrivate *sPriv = driContextPriv->driScreenPriv;
 	radeonScreenPtr screen = (radeonScreenPtr) (sPriv->private);
-#if RADEON_COMMON && defined(RADEON_COMMON_FOR_R600)
-	if (IS_R600_CLASS(screen))
-		return r600CreateContext(glVisual, driContextPriv, sharedContextPriv);
-#endif
-
 #if RADEON_COMMON && defined(RADEON_COMMON_FOR_R300)
 	if (IS_R300_CLASS(screen))
 		return r300CreateContext(glVisual, driContextPriv, sharedContextPriv);
 #endif
 
-#if RADEON_COMMON && defined(RADEON_COMMON_FOR_R200)
-	if (IS_R200_CLASS(screen))
-		return r200CreateContext(glVisual, driContextPriv, sharedContextPriv);
-#endif
-
 #if !RADEON_COMMON
 	(void)screen;
 	return r100CreateContext(glVisual, driContextPriv, sharedContextPriv);
@@ -1793,8 +1790,16 @@ getSwapInfo( __DRIdrawablePrivate *dPriv, __DRIswapInfo * sInfo )
 const struct __DriverAPIRec driDriverAPI = {
    .InitScreen      = radeonInitScreen,
    .DestroyScreen   = radeonDestroyScreen,
+#if RADEON_COMMON && defined(RADEON_COMMON_FOR_R200)
+   .CreateContext   = r200CreateContext,
+   .DestroyContext  = r200DestroyContext,
+#elif RADEON_COMMON && defined(RADEON_COMMON_FOR_R600)
+   .CreateContext   = r600CreateContext,
+   .DestroyContext  = r600DestroyContext,
+#else
    .CreateContext   = radeonCreateContext,
    .DestroyContext  = radeonDestroyContext,
+#endif
    .CreateBuffer    = radeonCreateBuffer,
    .DestroyBuffer   = radeonDestroyBuffer,
    .SwapBuffers     = radeonSwapBuffers,
diff --git a/src/mesa/drivers/dri/radeon/radeon_span.c b/src/mesa/drivers/dri/radeon/radeon_span.c
index b2a468b4fd6..5e4bf00d7ab 100644
--- a/src/mesa/drivers/dri/radeon/radeon_span.c
+++ b/src/mesa/drivers/dri/radeon/radeon_span.c
@@ -51,6 +51,59 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 
 static void radeonSetSpanFunctions(struct radeon_renderbuffer *rrb);
 
+
+/* r200 depth buffer is always tiled - this is the formula
+   according to the docs unless I typo'ed in it
+*/
+static GLubyte *r200_depth_2byte(const struct radeon_renderbuffer * rrb,
+				 GLint x, GLint y)
+{
+    GLubyte *ptr = rrb->bo->ptr;
+    GLint offset;
+    if (rrb->has_surface) {
+	offset = x * rrb->cpp + y * rrb->pitch;
+    } else {
+	GLuint b;
+	offset = 0;
+	b = (((y  >> 4) * (rrb->pitch >> 8) + (x >> 6)));
+	offset += (b >> 1) << 12;
+	offset += (((rrb->pitch >> 8) & 0x1) ? (b & 0x1) : ((b & 0x1) ^ ((y >> 4) & 0x1))) << 11;
+	offset += ((y >> 2) & 0x3) << 9;
+	offset += ((x >> 3) & 0x1) << 8;
+	offset += ((x >> 4) & 0x3) << 6;
+	offset += ((x >> 2) & 0x1) << 5;
+	offset += ((y >> 1) & 0x1) << 4;
+	offset += ((x >> 1) & 0x1) << 3;
+	offset += (y & 0x1) << 2;
+	offset += (x & 0x1) << 1;
+    }
+    return &ptr[offset];
+}
+
+static GLubyte *r200_depth_4byte(const struct radeon_renderbuffer * rrb,
+				 GLint x, GLint y)
+{
+    GLubyte *ptr = rrb->bo->ptr;
+    GLint offset;
+    if (rrb->has_surface) {
+	offset = x * rrb->cpp + y * rrb->pitch;
+    } else {
+	GLuint b;
+	offset = 0;
+	b = (((y & 0x7ff) >> 4) * (rrb->pitch >> 7) + (x >> 5));
+	offset += (b >> 1) << 12;
+	offset += (((rrb->pitch >> 7) & 0x1) ? (b & 0x1) : ((b & 0x1) ^ ((y >> 4) & 0x1))) << 11;
+	offset += ((y >> 2) & 0x3) << 9;
+	offset += ((x >> 2) & 0x1) << 8;
+	offset += ((x >> 3) & 0x3) << 6;
+	offset += ((y >> 1) & 0x1) << 5;
+	offset += ((x >> 1) & 0x1) << 4;
+	offset += (y & 0x1) << 3;
+	offset += (x & 0x1) << 2;
+    }
+    return &ptr[offset];
+}
+
 /* radeon tiling on r300-r500 has 4 states,
    macro-linear/micro-linear
    macro-linear/micro-tiled
@@ -61,7 +114,6 @@ static void radeonSetSpanFunctions(struct radeon_renderbuffer *rrb);
    4 byte surface
    8/16 byte (unused)
 */
-   
 static GLubyte *radeon_ptr_4byte(const struct radeon_renderbuffer * rrb,
 			     GLint x, GLint y)
 {
@@ -285,11 +337,21 @@ s8z24_to_z24s8(uint32_t val)
  */
 #define VALUE_TYPE GLushort
 
+#if defined(RADEON_COMMON_FOR_R200)
+#define WRITE_DEPTH( _x, _y, d )					\
+   *(GLushort *)r200_depth_2byte(rrb, _x + x_off, _y + y_off) = d
+#else
 #define WRITE_DEPTH( _x, _y, d )					\
    *(GLushort *)radeon_ptr_2byte_8x2(rrb, _x + x_off, _y + y_off) = d
+#endif
 
+#if defined(RADEON_COMMON_FOR_R200)
+#define READ_DEPTH( d, _x, _y )						\
+   d = *(GLushort *)r200_depth_2byte(rrb, _x + x_off, _y + y_off)
+#else
 #define READ_DEPTH( d, _x, _y )						\
    d = *(GLushort *)radeon_ptr_2byte_8x2(rrb, _x + x_off, _y + y_off)
+#endif
 
 #define TAG(x) radeon##x##_z16
 #include "depthtmp.h"
@@ -301,7 +363,7 @@ s8z24_to_z24s8(uint32_t val)
  */
 #define VALUE_TYPE GLuint
 
-#ifdef COMPILE_R300
+#if defined(COMPILE_R300)
 #define WRITE_DEPTH( _x, _y, d )					\
 do {									\
    GLuint *_ptr = (GLuint*)radeon_ptr_4byte( rrb, _x + x_off, _y + y_off );		\
@@ -310,6 +372,15 @@ do {									\
    tmp |= ((d << 8) & 0xffffff00);					\
    *_ptr = tmp;					\
 } while (0)
+#elif defined(RADEON_COMMON_FOR_R200)
+#define WRITE_DEPTH( _x, _y, d )					\
+do {									\
+   GLuint *_ptr = (GLuint*)r200_depth_4byte( rrb, _x + x_off, _y + y_off );		\
+   GLuint tmp = *_ptr;				\
+   tmp &= 0xff000000;							\
+   tmp |= ((d) & 0x00ffffff);						\
+   *_ptr = tmp;					\
+} while (0)
 #else
 #define WRITE_DEPTH( _x, _y, d )					\
 do {									\
@@ -321,19 +392,21 @@ do {									\
 } while (0)
 #endif
 
-#ifdef COMPILE_R300
+#if defined(COMPILE_R300)
 #define READ_DEPTH( d, _x, _y )						\
   do {									\
     d = (*(GLuint*)(radeon_ptr_4byte(rrb, _x + x_off, _y + y_off)) & 0xffffff00) >> 8; \
   }while(0)
+#elif defined(RADEON_COMMON_FOR_R200)
+#define READ_DEPTH( d, _x, _y )						\
+  do {									\
+    d = *(GLuint*)(r200_depth_4byte(rrb, _x + x_off, _y + y_off)) & 0x00ffffff; \
+  }while(0)
 #else
 #define READ_DEPTH( d, _x, _y )	\
   d = *(GLuint*)(radeon_ptr_4byte(rrb, _x + x_off,	_y + y_off)) & 0x00ffffff;
 #endif
-/*
-    fprintf(stderr, "dval(%d, %d, %d, %d)=0x%08X\n", _x, xo, _y, yo, d);\
-   d = *(GLuint*)(radeon_ptr(rrb, _x,	_y )) & 0x00ffffff;
-*/
+
 #define TAG(x) radeon##x##_z24
 #include "depthtmp.h"
 
@@ -345,12 +418,19 @@ do {									\
  */
 #define VALUE_TYPE GLuint
 
-#ifdef COMPILE_R300
+#if defined(COMPILE_R300)
 #define WRITE_DEPTH( _x, _y, d )					\
 do {									\
    GLuint *_ptr = (GLuint*)radeon_ptr_4byte( rrb, _x + x_off, _y + y_off );		\
    *_ptr = d;								\
 } while (0)
+#elif defined(RADEON_COMMON_FOR_R200)
+#define WRITE_DEPTH( _x, _y, d )					\
+do {									\
+   GLuint *_ptr = (GLuint*)r200_depth_4byte( rrb, _x + x_off, _y + y_off );		\
+   GLuint tmp = z24s8_to_s8z24(d);					\
+   *_ptr = tmp;								\
+} while (0)
 #else
 #define WRITE_DEPTH( _x, _y, d )					\
 do {									\
@@ -360,20 +440,22 @@ do {									\
 } while (0)
 #endif
 
-#ifdef COMPILE_R300
+#if defined(COMPILE_R300)
 #define READ_DEPTH( d, _x, _y )						\
   do { \
     d = (*(GLuint*)(radeon_ptr_4byte(rrb, _x + x_off, _y + y_off)));	\
   }while(0)
+#elif defined(RADEON_COMMON_FOR_R200)
+#define READ_DEPTH( d, _x, _y )						\
+  do { \
+    d = s8z24_to_z24s8(*(GLuint*)(r200_depth_4byte(rrb, _x + x_off, _y + y_off)));	\
+  }while(0)
 #else
 #define READ_DEPTH( d, _x, _y )	do {					\
     d = s8z24_to_z24s8(*(GLuint*)(radeon_ptr_4byte(rrb, _x + x_off,	_y + y_off ))); \
   } while (0)
 #endif
-/*
-    fprintf(stderr, "dval(%d, %d, %d, %d)=0x%08X\n", _x, xo, _y, yo, d);\
-   d = *(GLuint*)(radeon_ptr(rrb, _x,	_y )) & 0x00ffffff;
-*/
+
 #define TAG(x) radeon##x##_z24_s8
 #include "depthtmp.h"
 
@@ -392,6 +474,15 @@ do {									\
    tmp |= (d) & 0xff;							\
    *_ptr = tmp;					\
 } while (0)
+#elif defined(RADEON_COMMON_FOR_R200)
+#define WRITE_STENCIL( _x, _y, d )					\
+do {									\
+   GLuint *_ptr = (GLuint*)r200_depth_4byte(rrb, _x + x_off, _y + y_off);		\
+   GLuint tmp = *_ptr;				\
+   tmp &= 0x00ffffff;							\
+   tmp |= (((d) & 0xff) << 24);						\
+   *_ptr = tmp;					\
+} while (0)
 #else
 #define WRITE_STENCIL( _x, _y, d )					\
 do {									\
@@ -410,6 +501,13 @@ do {									\
    GLuint tmp = *_ptr;				\
    d = tmp & 0x000000ff;						\
 } while (0)
+#elif defined(RADEON_COMMON_FOR_R200)
+#define READ_STENCIL( d, _x, _y )					\
+do {									\
+   GLuint *_ptr = (GLuint*)r200_depth_4byte( rrb, _x + x_off, _y + y_off );		\
+   GLuint tmp = *_ptr;				\
+   d = (tmp & 0xff000000) >> 24;					\
+} while (0)
 #else
 #define READ_STENCIL( d, _x, _y )					\
 do {									\
diff --git a/src/mesa/drivers/dri/radeon/radeon_texture.c b/src/mesa/drivers/dri/radeon/radeon_texture.c
index 6a065f04688..fa16f44c18e 100644
--- a/src/mesa/drivers/dri/radeon/radeon_texture.c
+++ b/src/mesa/drivers/dri/radeon/radeon_texture.c
@@ -610,9 +610,17 @@ static void radeon_teximage(
 
 	if (pixels) {
 		radeon_teximage_map(image, GL_TRUE);
-
 		if (compressed) {
-			memcpy(texImage->Data, pixels, imageSize);
+			if (image->mt) {
+				uint32_t srcRowStride, bytesPerRow, rows;
+				srcRowStride = _mesa_compressed_row_stride(texImage->TexFormat->MesaFormat, width);
+				bytesPerRow = srcRowStride;
+				rows = (height + 3) / 4;
+				copy_rows(texImage->Data, image->mt->levels[level].rowstride,
+					  pixels, srcRowStride, rows, bytesPerRow);
+			} else {
+				memcpy(texImage->Data, pixels, imageSize);
+			}
 		} else {
 			GLuint dstRowStride;
 			GLuint *dstImageOffsets;
@@ -756,14 +764,23 @@ static void radeon_texsubimage(GLcontext* ctx, int dims, GLenum target, int leve
 		}
 
 		if (compressed) {
-			uint32_t srcRowStride, bytesPerRow, rows; 
-			dstRowStride = _mesa_compressed_row_stride(texImage->TexFormat->MesaFormat, texImage->Width);
+			uint32_t srcRowStride, bytesPerRow, rows;
+			GLubyte *img_start;
+			if (!image->mt) {
+				dstRowStride = _mesa_compressed_row_stride(texImage->TexFormat->MesaFormat, texImage->Width);
+				img_start = _mesa_compressed_image_address(xoffset, yoffset, 0,
+									   texImage->TexFormat->MesaFormat,
+									   texImage->Width, texImage->Data);
+			}
+			else {
+				uint32_t blocks_x = dstRowStride / (image->mt->bpp * 4);
+				img_start = texImage->Data + image->mt->bpp * 4 * (blocks_x * (yoffset / 4) + xoffset / 4);
+			}
 			srcRowStride = _mesa_compressed_row_stride(texImage->TexFormat->MesaFormat, width);
 			bytesPerRow = srcRowStride;
-			rows = height / 4;
+			rows = (height + 3) / 4;
 
-			copy_rows(texImage->Data, dstRowStride,  image->base.Data, srcRowStride, rows,
-				  bytesPerRow);
+			copy_rows(img_start, dstRowStride,  pixels, srcRowStride, rows,  bytesPerRow);
 			
 		} else {
 			if (!texImage->TexFormat->StoreImage(ctx, dims, texImage->_BaseFormat,
@@ -884,8 +901,8 @@ static void migrate_image_to_miptree(radeon_mipmap_tree *mt, radeon_texture_imag
 		uint32_t height;
 		/* need to confirm this value is correct */
 		if (mt->compressed) {
-			height = image->base.Height / 4;
-			srcrowstride = image->base.RowStride * mt->bpp;
+			height = (image->base.Height + 3) / 4;
+			srcrowstride = _mesa_compressed_row_stride(image->base.TexFormat->MesaFormat, image->base.Width);
 		} else {
 			height = image->base.Height * image->base.Depth;
 			srcrowstride = image->base.Width * image->base.TexFormat->TexelBytes;
@@ -1000,6 +1017,8 @@ radeon_get_tex_image(GLcontext * ctx, GLenum target, GLint level,
 	}
 
 	if (compressed) {
+		/* FIXME: this can't work for small textures (mips) which
+		         use different hw stride */
 		_mesa_get_compressed_teximage(ctx, target, level, pixels,
 					      texObj, texImage);
 	} else {