92 files changed, 3838 insertions, 4579 deletions
diff --git a/src/mesa/drivers/common/driverfuncs.c b/src/mesa/drivers/common/driverfuncs.c
index 854dea94504..0dbc7c3e853 100644
--- a/src/mesa/drivers/common/driverfuncs.c
+++ b/src/mesa/drivers/common/driverfuncs.c
@@ -41,6 +41,7 @@
 #include "main/bufferobj.h"
 #include "main/fbobject.h"
 #include "main/texrender.h"
+#include "main/samplerobj.h"
 #include "main/syncobj.h"
 #include "main/texturebarrier.h"
 #include "main/transformfeedback.h"
@@ -200,6 +201,8 @@ _mesa_init_driver_functions(struct dd_function_table *driver)
 
    _mesa_init_transform_feedback_functions(driver);
 
+   _mesa_init_sampler_object_functions(driver);
+
    /* T&L stuff */
    driver->NeedValidate = GL_FALSE;
    driver->ValidateTnlModule = NULL;
diff --git a/src/mesa/drivers/common/meta.c b/src/mesa/drivers/common/meta.c
index 6c35fa10d8a..08b6024639f 100644
--- a/src/mesa/drivers/common/meta.c
+++ b/src/mesa/drivers/common/meta.c
@@ -40,6 +40,7 @@
 #include "main/bufferobj.h"
 #include "main/buffers.h"
 #include "main/colortab.h"
+#include "main/condrender.h"
 #include "main/depth.h"
 #include "main/enable.h"
 #include "main/fbobject.h"
@@ -94,6 +95,7 @@
 #define META_VIEWPORT       0x4000
 #define META_CLAMP_FRAGMENT_COLOR 0x8000
 #define META_CLAMP_VERTEX_COLOR 0x10000
+#define META_CONDITIONAL_RENDER 0x20000
 /*@}*/
 
 
@@ -188,6 +190,10 @@ struct save_state
    /** META_CLAMP_VERTEX_COLOR */
    GLenum ClampVertexColor;
 
+   /** META_CONDITIONAL_RENDER */
+   struct gl_query_object *CondRenderQuery;
+   GLenum CondRenderMode;
+
    /** Miscellaneous (always disabled) */
    GLboolean Lighting;
 };
@@ -597,6 +603,14 @@ _mesa_meta_begin(struct gl_context *ctx, GLbitfield state)
       _mesa_ClampColorARB(GL_CLAMP_VERTEX_COLOR, GL_FALSE);
    }
 
+   if (state & META_CONDITIONAL_RENDER) {
+      save->CondRenderQuery = ctx->Query.CondRenderQuery;
+      save->CondRenderMode = ctx->Query.CondRenderMode;
+
+      if (ctx->Query.CondRenderQuery)
+	 _mesa_EndConditionalRender();
+   }
+
    /* misc */
    {
       save->Lighting = ctx->Light.Enabled;
@@ -869,6 +883,12 @@ _mesa_meta_end(struct gl_context *ctx)
       _mesa_ClampColorARB(GL_CLAMP_VERTEX_COLOR, save->ClampVertexColor);
    }
 
+   if (state & META_CONDITIONAL_RENDER) {
+      if (save->CondRenderQuery)
+	 _mesa_BeginConditionalRender(save->CondRenderQuery->Id,
+				      save->CondRenderMode);
+   }
+
    /* misc */
    if (save->Lighting) {
       _mesa_set_enable(ctx, GL_LIGHTING, GL_TRUE);
@@ -1442,7 +1462,10 @@ _mesa_meta_Clear(struct gl_context *ctx, GLbitfield buffers)
    };
    struct vertex verts[4];
    /* save all state but scissor, pixel pack/unpack */
-   GLbitfield metaSave = META_ALL - META_SCISSOR - META_PIXEL_STORE;
+   GLbitfield metaSave = (META_ALL -
+			  META_SCISSOR -
+			  META_PIXEL_STORE -
+			  META_CONDITIONAL_RENDER);
    const GLuint stencilMax = (1 << ctx->DrawBuffer->Visual.stencilBits) - 1;
 
    if (buffers & BUFFER_BITS_COLOR) {
@@ -1848,7 +1871,8 @@ _mesa_meta_DrawPixels(struct gl_context *ctx,
        * just going for the matching set of channels, in floating
        * point.
        */
-      if (ctx->Color.ClampFragmentColor != GL_TRUE)
+      if (ctx->Color.ClampFragmentColor != GL_TRUE &&
+	  ctx->Extensions.ARB_texture_float)
 	 texIntFormat = GL_RGBA32F;
    }
    else if (_mesa_is_stencil_format(format)) {
diff --git a/src/mesa/drivers/dri/i915/i830_texstate.c b/src/mesa/drivers/dri/i915/i830_texstate.c
index 7554bd5e7b9..3298dbb69f5 100644
--- a/src/mesa/drivers/dri/i915/i830_texstate.c
+++ b/src/mesa/drivers/dri/i915/i830_texstate.c
@@ -29,6 +29,7 @@
 #include "main/enums.h"
 #include "main/colormac.h"
 #include "main/macros.h"
+#include "main/samplerobj.h"
 
 #include "intel_mipmap_tree.h"
 #include "intel_tex.h"
@@ -120,6 +121,7 @@ i830_update_tex_unit(struct intel_context *intel, GLuint unit, GLuint ss3)
    struct gl_texture_object *tObj = tUnit->_Current;
    struct intel_texture_object *intelObj = intel_texture_object(tObj);
    struct gl_texture_image *firstImage;
+   struct gl_sampler_object *sampler = _mesa_get_samplerobj(ctx, unit);
    GLuint *state = i830->state.Tex[unit], format, pitch;
    GLint lodbias;
    GLubyte border[4];
@@ -193,7 +195,7 @@ i830_update_tex_unit(struct intel_context *intel, GLuint unit, GLuint ss3)
       float maxlod;
       uint32_t minlod_fixed, maxlod_fixed;
 
-      switch (tObj->Sampler.MinFilter) {
+      switch (sampler->MinFilter) {
       case GL_NEAREST:
          minFilt = FILTER_NEAREST;
          mipFilt = MIPFILTER_NONE;
@@ -222,12 +224,12 @@ i830_update_tex_unit(struct intel_context *intel, GLuint unit, GLuint ss3)
          return GL_FALSE;
       }
 
-      if (tObj->Sampler.MaxAnisotropy > 1.0) {
+      if (sampler->MaxAnisotropy > 1.0) {
          minFilt = FILTER_ANISOTROPIC;
          magFilt = FILTER_ANISOTROPIC;
       }
       else {
-         switch (tObj->Sampler.MagFilter) {
+         switch (sampler->MagFilter) {
          case GL_NEAREST:
             magFilt = FILTER_NEAREST;
             break;
@@ -239,7 +241,7 @@ i830_update_tex_unit(struct intel_context *intel, GLuint unit, GLuint ss3)
          }
       }
 
-      lodbias = (int) ((tUnit->LodBias + tObj->Sampler.LodBias) * 16.0);
+      lodbias = (int) ((tUnit->LodBias + sampler->LodBias) * 16.0);
       if (lodbias < -64)
           lodbias = -64;
       if (lodbias > 63)
@@ -259,8 +261,8 @@ i830_update_tex_unit(struct intel_context *intel, GLuint unit, GLuint ss3)
        * addressable (smallest resolution) LOD.  Use it to cover both
        * MAX_LEVEL and MAX_LOD.
        */
-      minlod_fixed = U_FIXED(CLAMP(tObj->Sampler.MinLod, 0.0, 11), 4);
-      maxlod = MIN2(tObj->Sampler.MaxLod, tObj->_MaxLevel - tObj->BaseLevel);
+      minlod_fixed = U_FIXED(CLAMP(sampler->MinLod, 0.0, 11), 4);
+      maxlod = MIN2(sampler->MaxLod, tObj->_MaxLevel - tObj->BaseLevel);
       if (intel->intelScreen->deviceID == PCI_CHIP_I855_GM ||
 	  intel->intelScreen->deviceID == PCI_CHIP_I865_G) {
 	 maxlod_fixed = U_FIXED(CLAMP(maxlod, 0.0, 11.75), 2);
@@ -279,8 +281,8 @@ i830_update_tex_unit(struct intel_context *intel, GLuint unit, GLuint ss3)
    }
 
    {
-      GLenum ws = tObj->Sampler.WrapS;
-      GLenum wt = tObj->Sampler.WrapT;
+      GLenum ws = sampler->WrapS;
+      GLenum wt = sampler->WrapT;
 
 
       /* 3D textures not available on i830
@@ -300,10 +302,10 @@ i830_update_tex_unit(struct intel_context *intel, GLuint unit, GLuint ss3)
    }
 
    /* convert border color from float to ubyte */
-   CLAMPED_FLOAT_TO_UBYTE(border[0], tObj->Sampler.BorderColor.f[0]);
-   CLAMPED_FLOAT_TO_UBYTE(border[1], tObj->Sampler.BorderColor.f[1]);
-   CLAMPED_FLOAT_TO_UBYTE(border[2], tObj->Sampler.BorderColor.f[2]);
-   CLAMPED_FLOAT_TO_UBYTE(border[3], tObj->Sampler.BorderColor.f[3]);
+   CLAMPED_FLOAT_TO_UBYTE(border[0], sampler->BorderColor.f[0]);
+   CLAMPED_FLOAT_TO_UBYTE(border[1], sampler->BorderColor.f[1]);
+   CLAMPED_FLOAT_TO_UBYTE(border[2], sampler->BorderColor.f[2]);
+   CLAMPED_FLOAT_TO_UBYTE(border[3], sampler->BorderColor.f[3]);
 
    state[I830_TEXREG_TM0S4] = PACK_COLOR_8888(border[3],
 					      border[0],
diff --git a/src/mesa/drivers/dri/i915/i915_texstate.c b/src/mesa/drivers/dri/i915/i915_texstate.c
index 742bb994adb..5aa2ea18048 100644
--- a/src/mesa/drivers/dri/i915/i915_texstate.c
+++ b/src/mesa/drivers/dri/i915/i915_texstate.c
@@ -29,6 +29,7 @@
 #include "main/enums.h"
 #include "main/macros.h"
 #include "main/colormac.h"
+#include "main/samplerobj.h"
 
 #include "intel_mipmap_tree.h"
 #include "intel_tex.h"
@@ -136,6 +137,7 @@ i915_update_tex_unit(struct intel_context *intel, GLuint unit, GLuint ss3)
    struct gl_texture_object *tObj = tUnit->_Current;
    struct intel_texture_object *intelObj = intel_texture_object(tObj);
    struct gl_texture_image *firstImage;
+   struct gl_sampler_object *sampler = _mesa_get_samplerobj(ctx, unit);
    GLuint *state = i915->state.Tex[unit], format, pitch;
    GLint lodbias, aniso = 0;
    GLubyte border[4];
@@ -164,7 +166,7 @@ i915_update_tex_unit(struct intel_context *intel, GLuint unit, GLuint ss3)
 
    format = translate_texture_format(firstImage->TexFormat,
 				     firstImage->InternalFormat,
-				     tObj->Sampler.DepthMode);
+				     sampler->DepthMode);
    pitch = intelObj->mt->region->pitch * intelObj->mt->cpp;
 
    state[I915_TEXREG_MS3] =
@@ -181,7 +183,7 @@ i915_update_tex_unit(struct intel_context *intel, GLuint unit, GLuint ss3)
     * (lowest resolution) LOD.  Use it to cover both MAX_LEVEL and
     * MAX_LOD.
     */
-   maxlod = MIN2(tObj->Sampler.MaxLod, tObj->_MaxLevel - tObj->BaseLevel);
+   maxlod = MIN2(sampler->MaxLod, tObj->_MaxLevel - tObj->BaseLevel);
    state[I915_TEXREG_MS4] =
       ((((pitch / 4) - 1) << MS4_PITCH_SHIFT) |
        MS4_CUBE_FACE_ENA_MASK |
@@ -192,7 +194,7 @@ i915_update_tex_unit(struct intel_context *intel, GLuint unit, GLuint ss3)
    {
       GLuint minFilt, mipFilt, magFilt;
 
-      switch (tObj->Sampler.MinFilter) {
+      switch (sampler->MinFilter) {
       case GL_NEAREST:
          minFilt = FILTER_NEAREST;
          mipFilt = MIPFILTER_NONE;
@@ -221,16 +223,16 @@ i915_update_tex_unit(struct intel_context *intel, GLuint unit, GLuint ss3)
          return GL_FALSE;
       }
 
-      if (tObj->Sampler.MaxAnisotropy > 1.0) {
+      if (sampler->MaxAnisotropy > 1.0) {
          minFilt = FILTER_ANISOTROPIC;
          magFilt = FILTER_ANISOTROPIC;
-         if (tObj->Sampler.MaxAnisotropy > 2.0)
+         if (sampler->MaxAnisotropy > 2.0)
             aniso = SS2_MAX_ANISO_4;
          else
             aniso = SS2_MAX_ANISO_2;
       }
       else {
-         switch (tObj->Sampler.MagFilter) {
+         switch (sampler->MagFilter) {
          case GL_NEAREST:
             magFilt = FILTER_NEAREST;
             break;
@@ -242,7 +244,7 @@ i915_update_tex_unit(struct intel_context *intel, GLuint unit, GLuint ss3)
          }
       }
 
-      lodbias = (int) ((tUnit->LodBias + tObj->Sampler.LodBias) * 16.0);
+      lodbias = (int) ((tUnit->LodBias + sampler->LodBias) * 16.0);
       if (lodbias < -256)
           lodbias = -256;
       if (lodbias > 255)
@@ -258,14 +260,14 @@ i915_update_tex_unit(struct intel_context *intel, GLuint unit, GLuint ss3)
 
       /* Shadow:
        */
-      if (tObj->Sampler.CompareMode == GL_COMPARE_R_TO_TEXTURE_ARB &&
+      if (sampler->CompareMode == GL_COMPARE_R_TO_TEXTURE_ARB &&
           tObj->Target != GL_TEXTURE_3D) {
          if (tObj->Target == GL_TEXTURE_1D) 
             return GL_FALSE;
 
          state[I915_TEXREG_SS2] |=
             (SS2_SHADOW_ENABLE |
-             intel_translate_shadow_compare_func(tObj->Sampler.CompareFunc));
+             intel_translate_shadow_compare_func(sampler->CompareFunc));
 
          minFilt = FILTER_4X4_FLAT;
          magFilt = FILTER_4X4_FLAT;
@@ -278,9 +280,9 @@ i915_update_tex_unit(struct intel_context *intel, GLuint unit, GLuint ss3)
    }
 
    {
-      GLenum ws = tObj->Sampler.WrapS;
-      GLenum wt = tObj->Sampler.WrapT;
-      GLenum wr = tObj->Sampler.WrapR;
+      GLenum ws = sampler->WrapS;
+      GLenum wt = sampler->WrapT;
+      GLenum wr = sampler->WrapR;
       float minlod;
 
       /* We program 1D textures as 2D textures, so the 2D texcoord could
@@ -298,8 +300,8 @@ i915_update_tex_unit(struct intel_context *intel, GLuint unit, GLuint ss3)
        * clamp_to_border.
        */
       if (tObj->Target == GL_TEXTURE_3D &&
-          (tObj->Sampler.MinFilter != GL_NEAREST ||
-           tObj->Sampler.MagFilter != GL_NEAREST) &&
+          (sampler->MinFilter != GL_NEAREST ||
+           sampler->MagFilter != GL_NEAREST) &&
           (ws == GL_CLAMP ||
            wt == GL_CLAMP ||
            wr == GL_CLAMP ||
@@ -322,7 +324,7 @@ i915_update_tex_unit(struct intel_context *intel, GLuint unit, GLuint ss3)
           (translate_wrap_mode(wt) << SS3_TCY_ADDR_MODE_SHIFT) |
           (translate_wrap_mode(wr) << SS3_TCZ_ADDR_MODE_SHIFT));
 
-      minlod = MIN2(tObj->Sampler.MinLod, tObj->_MaxLevel - tObj->BaseLevel);
+      minlod = MIN2(sampler->MinLod, tObj->_MaxLevel - tObj->BaseLevel);
       state[I915_TEXREG_SS3] |= (unit << SS3_TEXTUREMAP_INDEX_SHIFT);
       state[I915_TEXREG_SS3] |= (U_FIXED(CLAMP(minlod, 0.0, 11.0), 4) <<
 				 SS3_MIN_LOD_SHIFT);
@@ -330,10 +332,10 @@ i915_update_tex_unit(struct intel_context *intel, GLuint unit, GLuint ss3)
    }
 
    /* convert border color from float to ubyte */
-   CLAMPED_FLOAT_TO_UBYTE(border[0], tObj->Sampler.BorderColor.f[0]);
-   CLAMPED_FLOAT_TO_UBYTE(border[1], tObj->Sampler.BorderColor.f[1]);
-   CLAMPED_FLOAT_TO_UBYTE(border[2], tObj->Sampler.BorderColor.f[2]);
-   CLAMPED_FLOAT_TO_UBYTE(border[3], tObj->Sampler.BorderColor.f[3]);
+   CLAMPED_FLOAT_TO_UBYTE(border[0], sampler->BorderColor.f[0]);
+   CLAMPED_FLOAT_TO_UBYTE(border[1], sampler->BorderColor.f[1]);
+   CLAMPED_FLOAT_TO_UBYTE(border[2], sampler->BorderColor.f[2]);
+   CLAMPED_FLOAT_TO_UBYTE(border[3], sampler->BorderColor.f[3]);
 
    if (firstImage->_BaseFormat == GL_DEPTH_COMPONENT) {
       /* GL specs that border color for depth textures is taken from the
diff --git a/src/mesa/drivers/dri/i965/Makefile b/src/mesa/drivers/dri/i965/Makefile
index b05ba35d65f..849018b74ae 100644
--- a/src/mesa/drivers/dri/i965/Makefile
+++ b/src/mesa/drivers/dri/i965/Makefile
@@ -61,7 +61,6 @@ DRIVER_SOURCES = \
 	brw_sf.c \
 	brw_sf_emit.c \
 	brw_sf_state.c \
-	brw_state.c \
 	brw_state_batch.c \
 	brw_state_cache.c \
 	brw_state_dump.c \
diff --git a/src/mesa/drivers/dri/i965/brw_cc.c b/src/mesa/drivers/dri/i965/brw_cc.c
index 74a66af31a5..94b8c20b019 100644
--- a/src/mesa/drivers/dri/i965/brw_cc.c
+++ b/src/mesa/drivers/dri/i965/brw_cc.c
@@ -37,28 +37,36 @@
 #include "main/macros.h"
 #include "intel_batchbuffer.h"
 
-void
-brw_update_cc_vp(struct brw_context *brw)
+static void
+prepare_cc_vp(struct brw_context *brw)
 {
    struct gl_context *ctx = &brw->intel.ctx;
-   struct brw_cc_viewport ccv;
+   struct brw_cc_viewport *ccv;
 
-   memset(&ccv, 0, sizeof(ccv));
+   ccv = brw_state_batch(brw, sizeof(*ccv), 32, &brw->cc.vp_offset);
 
    /* _NEW_TRANSOFORM */
    if (ctx->Transform.DepthClamp) {
       /* _NEW_VIEWPORT */
-      ccv.min_depth = MIN2(ctx->Viewport.Near, ctx->Viewport.Far);
-      ccv.max_depth = MAX2(ctx->Viewport.Near, ctx->Viewport.Far);
+      ccv->min_depth = MIN2(ctx->Viewport.Near, ctx->Viewport.Far);
+      ccv->max_depth = MAX2(ctx->Viewport.Near, ctx->Viewport.Far);
    } else {
-      ccv.min_depth = 0.0;
-      ccv.max_depth = 1.0;
+      ccv->min_depth = 0.0;
+      ccv->max_depth = 1.0;
    }
 
-   drm_intel_bo_unreference(brw->cc.vp_bo);
-   brw->cc.vp_bo = brw_cache_data(&brw->cache, BRW_CC_VP, &ccv, sizeof(ccv));
+   brw->state.dirty.cache |= CACHE_NEW_CC_VP;
 }
 
+const struct brw_tracked_state brw_cc_vp = {
+   .dirty = {
+      .mesa = _NEW_VIEWPORT | _NEW_TRANSFORM,
+      .brw = BRW_NEW_BATCH,
+      .cache = 0
+   },
+   .prepare = prepare_cc_vp
+};
+
 /**
  * Modify blend function to force destination alpha to 1.0
  *
@@ -81,11 +89,6 @@ fix_xRGB_alpha(GLenum function)
    return function;
 }
 
-static void prepare_cc_unit(struct brw_context *brw)
-{
-   brw_add_validated_bo(brw, brw->cc.vp_bo);
-}
-
 /**
  * Creates the state cache entry for the given CC unit key.
  */
@@ -209,7 +212,8 @@ static void upload_cc_unit(struct brw_context *brw)
       cc->cc5.statistics_enable = 1;
 
    /* CACHE_NEW_CC_VP */
-   cc->cc4.cc_viewport_state_offset = brw->cc.vp_bo->offset >> 5; /* reloc */
+   cc->cc4.cc_viewport_state_offset = (intel->batch.bo->offset +
+				       brw->cc.vp_offset) >> 5; /* reloc */
 
    brw->state.dirty.cache |= CACHE_NEW_CC_UNIT;
 
@@ -217,7 +221,7 @@ static void upload_cc_unit(struct brw_context *brw)
    drm_intel_bo_emit_reloc(brw->intel.batch.bo,
 			   (brw->cc.state_offset +
 			    offsetof(struct brw_cc_unit_state, cc4)),
-			   brw->cc.vp_bo, 0,
+			   intel->batch.bo, brw->cc.vp_offset,
 			   I915_GEM_DOMAIN_INSTRUCTION, 0);
 }
 
@@ -227,7 +231,6 @@ const struct brw_tracked_state brw_cc_unit = {
       .brw = BRW_NEW_BATCH,
       .cache = CACHE_NEW_CC_VP
    },
-   .prepare = prepare_cc_unit,
    .emit = upload_cc_unit,
 };
 
diff --git a/src/mesa/drivers/dri/i965/brw_clip.c b/src/mesa/drivers/dri/i965/brw_clip.c
index 1be165cc9a1..3c175515408 100644
--- a/src/mesa/drivers/dri/i965/brw_clip.c
+++ b/src/mesa/drivers/dri/i965/brw_clip.c
@@ -144,14 +144,12 @@ static void compile_clip_prog( struct brw_context *brw,
    /* Upload
     */
    drm_intel_bo_unreference(brw->clip.prog_bo);
-   brw->clip.prog_bo = brw_upload_cache_with_auxdata(&brw->cache,
-						     BRW_CLIP_PROG,
-						     &c.key, sizeof(c.key),
-						     NULL, 0,
-						     program, program_size,
-						     &c.prog_data,
-						     sizeof(c.prog_data),
-						     &brw->clip.prog_data);
+   brw->clip.prog_bo = brw_upload_cache(&brw->cache,
+					BRW_CLIP_PROG,
+					&c.key, sizeof(c.key),
+					program, program_size,
+					&c.prog_data, sizeof(c.prog_data),
+					&brw->clip.prog_data);
 }
 
 /* Calculate interpolants for triangle and line rasterization.
@@ -270,7 +268,6 @@ static void upload_clip_prog(struct brw_context *brw)
    drm_intel_bo_unreference(brw->clip.prog_bo);
    brw->clip.prog_bo = brw_search_cache(&brw->cache, BRW_CLIP_PROG,
 					&key, sizeof(key),
-					NULL, 0,
 					&brw->clip.prog_data);
    if (brw->clip.prog_bo == NULL)
       compile_clip_prog( brw, &key );
diff --git a/src/mesa/drivers/dri/i965/brw_clip_state.c b/src/mesa/drivers/dri/i965/brw_clip_state.c
index 60fd5fa7d9e..6015c8cbe9f 100644
--- a/src/mesa/drivers/dri/i965/brw_clip_state.c
+++ b/src/mesa/drivers/dri/i965/brw_clip_state.c
@@ -33,148 +33,101 @@
 #include "brw_state.h"
 #include "brw_defines.h"
 
-struct brw_clip_unit_key {
-   unsigned int total_grf;
-   unsigned int urb_entry_read_length;
-   unsigned int curb_entry_read_length;
-   unsigned int clip_mode;
-
-   unsigned int curbe_offset;
-
-   unsigned int nr_urb_entries, urb_size;
-
-   GLboolean depth_clamp;
-};
-
 static void
-clip_unit_populate_key(struct brw_context *brw, struct brw_clip_unit_key *key)
-{
-   struct gl_context *ctx = &brw->intel.ctx;
-   memset(key, 0, sizeof(*key));
-
-   /* CACHE_NEW_CLIP_PROG */
-   key->total_grf = brw->clip.prog_data->total_grf;
-   key->urb_entry_read_length = brw->clip.prog_data->urb_read_length;
-   key->curb_entry_read_length = brw->clip.prog_data->curb_read_length;
-   key->clip_mode = brw->clip.prog_data->clip_mode;
-
-   /* BRW_NEW_CURBE_OFFSETS */
-   key->curbe_offset = brw->curbe.clip_start;
-
-   /* BRW_NEW_URB_FENCE */
-   key->nr_urb_entries = brw->urb.nr_clip_entries;
-   key->urb_size = brw->urb.vsize;
-
-   /* _NEW_TRANSOFORM */
-   key->depth_clamp = ctx->Transform.DepthClamp;
-}
-
-static drm_intel_bo *
-clip_unit_create_from_key(struct brw_context *brw,
-			  struct brw_clip_unit_key *key)
+brw_prepare_clip_unit(struct brw_context *brw)
 {
    struct intel_context *intel = &brw->intel;
-   struct brw_clip_unit_state clip;
-   drm_intel_bo *bo;
+   struct gl_context *ctx = &intel->ctx;
+   struct brw_clip_unit_state *clip;
 
-   memset(&clip, 0, sizeof(clip));
+   clip = brw_state_batch(brw, sizeof(*clip), 32, &brw->clip.state_offset);
+   memset(clip, 0, sizeof(*clip));
 
-   clip.thread0.grf_reg_count = ALIGN(key->total_grf, 16) / 16 - 1;
+   /* CACHE_NEW_CLIP_PROG */
+   clip->thread0.grf_reg_count = (ALIGN(brw->clip.prog_data->total_grf, 16) /
+				 16 - 1);
    /* reloc */
-   clip.thread0.kernel_start_pointer = brw->clip.prog_bo->offset >> 6;
+   clip->thread0.kernel_start_pointer = brw->clip.prog_bo->offset >> 6;
 
-   clip.thread1.floating_point_mode = BRW_FLOATING_POINT_NON_IEEE_754;
-   clip.thread1.single_program_flow = 1;
+   clip->thread1.floating_point_mode = BRW_FLOATING_POINT_NON_IEEE_754;
+   clip->thread1.single_program_flow = 1;
 
-   clip.thread3.urb_entry_read_length = key->urb_entry_read_length;
-   clip.thread3.const_urb_entry_read_length = key->curb_entry_read_length;
-   clip.thread3.const_urb_entry_read_offset = key->curbe_offset * 2;
-   clip.thread3.dispatch_grf_start_reg = 1;
-   clip.thread3.urb_entry_read_offset = 0;
+   clip->thread3.urb_entry_read_length = brw->clip.prog_data->urb_read_length;
+   clip->thread3.const_urb_entry_read_length =
+      brw->clip.prog_data->curb_read_length;
+
+   /* BRW_NEW_CURBE_OFFSETS */
+   clip->thread3.const_urb_entry_read_offset = brw->curbe.clip_start * 2;
+   clip->thread3.dispatch_grf_start_reg = 1;
+   clip->thread3.urb_entry_read_offset = 0;
 
-   clip.thread4.nr_urb_entries = key->nr_urb_entries;
-   clip.thread4.urb_entry_allocation_size = key->urb_size - 1;
+   /* BRW_NEW_URB_FENCE */
+   clip->thread4.nr_urb_entries = brw->urb.nr_clip_entries;
+   clip->thread4.urb_entry_allocation_size = brw->urb.vsize - 1;
    /* If we have enough clip URB entries to run two threads, do so.
     */
-   if (key->nr_urb_entries >= 10) {
+   if (brw->urb.nr_clip_entries >= 10) {
       /* Half of the URB entries go to each thread, and it has to be an
        * even number.
        */
-      assert(key->nr_urb_entries % 2 == 0);
+      assert(brw->urb.nr_clip_entries % 2 == 0);
       
       /* Although up to 16 concurrent Clip threads are allowed on Ironlake,
        * only 2 threads can output VUEs at a time.
        */
       if (intel->gen == 5)
-         clip.thread4.max_threads = 16 - 1;        
+         clip->thread4.max_threads = 16 - 1;
       else
-         clip.thread4.max_threads = 2 - 1;
+         clip->thread4.max_threads = 2 - 1;
    } else {
-      assert(key->nr_urb_entries >= 5);
-      clip.thread4.max_threads = 1 - 1;
+      assert(brw->urb.nr_clip_entries >= 5);
+      clip->thread4.max_threads = 1 - 1;
    }
 
    if (unlikely(INTEL_DEBUG & DEBUG_SINGLE_THREAD))
-      clip.thread4.max_threads = 0;
+      clip->thread4.max_threads = 0;
 
    if (unlikely(INTEL_DEBUG & DEBUG_STATS))
-      clip.thread4.stats_enable = 1;
-
-   clip.clip5.userclip_enable_flags = 0x7f;
-   clip.clip5.userclip_must_clip = 1;
-   clip.clip5.guard_band_enable = 0;
-   if (!key->depth_clamp)
-      clip.clip5.viewport_z_clip_enable = 1;
-   clip.clip5.viewport_xy_clip_enable = 1;
-   clip.clip5.vertex_position_space = BRW_CLIP_NDCSPACE;
-   clip.clip5.api_mode = BRW_CLIP_API_OGL;
-   clip.clip5.clip_mode = key->clip_mode;
+      clip->thread4.stats_enable = 1;
 
-   if (intel->is_g4x)
-      clip.clip5.negative_w_clip_test = 1;
+   clip->clip5.userclip_enable_flags = 0x7f;
+   clip->clip5.userclip_must_clip = 1;
+   clip->clip5.guard_band_enable = 0;
+   /* _NEW_TRANSFORM */
+   if (!ctx->Transform.DepthClamp)
+      clip->clip5.viewport_z_clip_enable = 1;
+   clip->clip5.viewport_xy_clip_enable = 1;
+   clip->clip5.vertex_position_space = BRW_CLIP_NDCSPACE;
+   clip->clip5.api_mode = BRW_CLIP_API_OGL;
+   clip->clip5.clip_mode = brw->clip.prog_data->clip_mode;
 
-   clip.clip6.clipper_viewport_state_ptr = 0;
-   clip.viewport_xmin = -1;
-   clip.viewport_xmax = 1;
-   clip.viewport_ymin = -1;
-   clip.viewport_ymax = 1;
+   if (intel->is_g4x)
+      clip->clip5.negative_w_clip_test = 1;
 
-   bo = brw_upload_cache(&brw->cache, BRW_CLIP_UNIT,
-			 key, sizeof(*key),
-			 &brw->clip.prog_bo, 1,
-			 &clip, sizeof(clip));
+   clip->clip6.clipper_viewport_state_ptr = 0;
+   clip->viewport_xmin = -1;
+   clip->viewport_xmax = 1;
+   clip->viewport_ymin = -1;
+   clip->viewport_ymax = 1;
 
    /* Emit clip program relocation */
    assert(brw->clip.prog_bo);
-   drm_intel_bo_emit_reloc(bo, offsetof(struct brw_clip_unit_state, thread0),
-			   brw->clip.prog_bo, clip.thread0.grf_reg_count << 1,
+   drm_intel_bo_emit_reloc(intel->batch.bo,
+			   (brw->clip.state_offset +
+			    offsetof(struct brw_clip_unit_state, thread0)),
+			   brw->clip.prog_bo, clip->thread0.grf_reg_count << 1,
 			   I915_GEM_DOMAIN_INSTRUCTION, 0);
 
-   return bo;
-}
-
-static void upload_clip_unit( struct brw_context *brw )
-{
-   struct brw_clip_unit_key key;
-
-   clip_unit_populate_key(brw, &key);
-
-   drm_intel_bo_unreference(brw->clip.state_bo);
-   brw->clip.state_bo = brw_search_cache(&brw->cache, BRW_CLIP_UNIT,
-					 &key, sizeof(key),
-					 &brw->clip.prog_bo, 1,
-					 NULL);
-   if (brw->clip.state_bo == NULL) {
-      brw->clip.state_bo = clip_unit_create_from_key(brw, &key);
-   }
+   brw->state.dirty.cache |= CACHE_NEW_CLIP_UNIT;
 }
 
 const struct brw_tracked_state brw_clip_unit = {
    .dirty = {
       .mesa  = _NEW_TRANSFORM,
-      .brw   = (BRW_NEW_CURBE_OFFSETS |
+      .brw   = (BRW_NEW_BATCH |
+		BRW_NEW_CURBE_OFFSETS |
 		BRW_NEW_URB_FENCE),
       .cache = CACHE_NEW_CLIP_PROG
    },
-   .prepare = upload_clip_unit,
+   .prepare = brw_prepare_clip_unit,
 };
diff --git a/src/mesa/drivers/dri/i965/brw_context.c b/src/mesa/drivers/dri/i965/brw_context.c
index 230d326fa12..db6466ff1ae 100644
--- a/src/mesa/drivers/dri/i965/brw_context.c
+++ b/src/mesa/drivers/dri/i965/brw_context.c
@@ -51,9 +51,6 @@ static void brwInitDriverFunctions( struct dd_function_table *functions )
 
    brwInitFragProgFuncs( functions );
    brw_init_queryobj_functions(functions);
-
-   functions->Enable = brw_enable;
-   functions->DepthRange = brw_depth_range;
 }
 
 GLboolean brwCreateContext( int api,
@@ -232,11 +229,6 @@ GLboolean brwCreateContext( int api,
 
    brw_draw_init( brw );
 
-   /* Now that most driver functions are hooked up, initialize some of the
-    * immediate state.
-    */
-   brw_update_cc_vp(brw);
-
    return GL_TRUE;
 }
 
diff --git a/src/mesa/drivers/dri/i965/brw_context.h b/src/mesa/drivers/dri/i965/brw_context.h
index 1daa49abfb3..26cd8209c65 100644
--- a/src/mesa/drivers/dri/i965/brw_context.h
+++ b/src/mesa/drivers/dri/i965/brw_context.h
@@ -204,13 +204,16 @@ struct brw_wm_prog_data {
    GLuint urb_read_length;
 
    GLuint first_curbe_grf;
+   GLuint first_curbe_grf_16;
    GLuint total_grf;
+   GLuint total_grf_16;
    GLuint total_scratch;
 
    GLuint nr_params;       /**< number of float params/constants */
    GLuint nr_pull_params;
    GLboolean error;
    int dispatch_width;
+   uint32_t prog_offset_16;
 
    /* Pointer to tracked values (only valid once
     * _mesa_load_state_parameters has been called at runtime).
@@ -308,7 +311,6 @@ enum brw_cache_id {
    BRW_CC_VP,
    BRW_CC_UNIT,
    BRW_WM_PROG,
-   BRW_SAMPLER_DEFAULT_COLOR,
    BRW_SAMPLER,
    BRW_WM_UNIT,
    BRW_SF_PROG,
@@ -336,8 +338,6 @@ struct brw_cache_item {
    GLuint hash;
    GLuint key_size;		/* for variable-sized keys */
    const void *key;
-   drm_intel_bo **reloc_bufs;
-   GLuint nr_reloc_bufs;
 
    drm_intel_bo *bo;
 
@@ -381,7 +381,6 @@ struct brw_tracked_state {
 #define CACHE_NEW_CC_VP                  (1<<BRW_CC_VP)
 #define CACHE_NEW_CC_UNIT                (1<<BRW_CC_UNIT)
 #define CACHE_NEW_WM_PROG                (1<<BRW_WM_PROG)
-#define CACHE_NEW_SAMPLER_DEFAULT_COLOR  (1<<BRW_SAMPLER_DEFAULT_COLOR)
 #define CACHE_NEW_SAMPLER                (1<<BRW_SAMPLER)
 #define CACHE_NEW_WM_UNIT                (1<<BRW_WM_UNIT)
 #define CACHE_NEW_SF_PROG                (1<<BRW_SF_PROG)
@@ -630,29 +629,38 @@ struct brw_context
       int8_t *constant_map; /* variable array following prog_data */
 
       drm_intel_bo *prog_bo;
-      drm_intel_bo *state_bo;
       drm_intel_bo *const_bo;
+      uint32_t state_offset;
 
       /** Binding table of pointers to surf_bo entries */
       uint32_t bind_bo_offset;
       uint32_t surf_offset[BRW_VS_MAX_SURF];
       GLuint nr_surfaces;      
+
+      uint32_t push_const_offset; /* Offset in the batchbuffer */
+      int push_const_size; /* in 256-bit register increments */
    } vs;
 
    struct {
       struct brw_gs_prog_data *prog_data;
 
       GLboolean prog_active;
+      uint32_t state_offset;
       drm_intel_bo *prog_bo;
-      drm_intel_bo *state_bo;
    } gs;
 
    struct {
       struct brw_clip_prog_data *prog_data;
 
       drm_intel_bo *prog_bo;
-      drm_intel_bo *state_bo;
-      drm_intel_bo *vp_bo;
+
+      /* Offset in the batch to the CLIP state on pre-gen6. */
+      uint32_t state_offset;
+
+      /* As of gen6, this is the offset in the batch to the CLIP VP,
+       * instead of vp_bo.
+       */
+      uint32_t vp_offset;
    } clip;
 
 
@@ -660,9 +668,7 @@ struct brw_context
       struct brw_sf_prog_data *prog_data;
 
       drm_intel_bo *prog_bo;
-      drm_intel_bo *state_bo;
       uint32_t state_offset;
-      drm_intel_bo *vp_bo;
       uint32_t vp_offset;
    } sf;
 
@@ -675,8 +681,9 @@ struct brw_context
        */
       GLbitfield input_size_masks[4];
 
-      /** Array of surface default colors (texture border color) */
-      drm_intel_bo *sdc_bo[BRW_MAX_TEX_UNIT];
+      /** offsets in the batch to sampler default colors (texture border color)
+       */
+      uint32_t sdc_offset[BRW_MAX_TEX_UNIT];
 
       GLuint render_surf;
       GLuint nr_surfaces;      
@@ -685,35 +692,32 @@ struct brw_context
       drm_intel_bo *scratch_bo;
 
       GLuint sampler_count;
-      drm_intel_bo *sampler_bo;
+      uint32_t sampler_offset;
 
       /** Binding table of pointers to surf_bo entries */
       uint32_t bind_bo_offset;
       uint32_t surf_offset[BRW_WM_MAX_SURF];
+      uint32_t state_offset; /* offset in batchbuffer to pre-gen6 WM state */
 
       drm_intel_bo *prog_bo;
-      drm_intel_bo *state_bo;
       drm_intel_bo *const_bo; /* pull constant buffer. */
       /**
-       *  This is the push constant BO on gen6.
+       * This is the offset in the batch to the push constants on gen6.
        *
        * Pre-gen6, push constants live in the CURBE.
        */
-      drm_intel_bo *push_const_bo;
+      uint32_t push_const_offset;
    } wm;
 
 
    struct {
       /* gen4 */
       drm_intel_bo *prog_bo;
-      drm_intel_bo *vp_bo;
-
-      /* gen6 */
-      drm_intel_bo *blend_state_bo;
-      drm_intel_bo *depth_stencil_state_bo;
-      drm_intel_bo *color_calc_state_bo;
 
       uint32_t state_offset;
+      uint32_t blend_state_offset;
+      uint32_t depth_stencil_state_offset;
+      uint32_t vp_offset;
    } cc;
 
    struct {
@@ -783,9 +787,6 @@ void brwInitFragProgFuncs( struct dd_function_table *functions );
  */
 void brw_upload_urb_fence(struct brw_context *brw);
 
-/* brw_cc.c */
-void brw_update_cc_vp(struct brw_context *brw);
-
 /* brw_curbe.c
  */
 void brw_upload_cs_urb_state(struct brw_context *brw);
@@ -793,10 +794,6 @@ void brw_upload_cs_urb_state(struct brw_context *brw);
 /* brw_disasm.c */
 int brw_disasm (FILE *file, struct brw_instruction *inst, int gen);
 
-/* brw_state.c */
-void brw_enable(struct gl_context * ctx, GLenum cap, GLboolean state);
-void brw_depth_range(struct gl_context *ctx, GLclampd nearval, GLclampd farval);
-
 /*======================================================================
  * Inline conversion functions.  These are better-typed than the
  * macros used previously:
diff --git a/src/mesa/drivers/dri/i965/brw_draw.c b/src/mesa/drivers/dri/i965/brw_draw.c
index 2db70c543ea..9ab533179b8 100644
--- a/src/mesa/drivers/dri/i965/brw_draw.c
+++ b/src/mesa/drivers/dri/i965/brw_draw.c
@@ -28,6 +28,8 @@
 
 #include "main/glheader.h"
 #include "main/context.h"
+#include "main/condrender.h"
+#include "main/samplerobj.h"
 #include "main/state.h"
 #include "main/enums.h"
 #include "tnl/tnl.h"
@@ -278,22 +280,25 @@ static GLboolean check_fallbacks( struct brw_context *brw,
       int u;
       for (u = 0; u < ctx->Const.MaxTextureCoordUnits; u++) {
          struct gl_texture_unit *texUnit = &ctx->Texture.Unit[u];
+
          if (texUnit->Enabled) {
+	    struct gl_sampler_object *sampler = _mesa_get_samplerobj(ctx, u);
+
             if (texUnit->Enabled & TEXTURE_1D_BIT) {
-               if (texUnit->CurrentTex[TEXTURE_1D_INDEX]->Sampler.WrapS == GL_CLAMP) {
+               if (sampler->WrapS == GL_CLAMP) {
                    return GL_TRUE;
                }
             }
             if (texUnit->Enabled & TEXTURE_2D_BIT) {
-               if (texUnit->CurrentTex[TEXTURE_2D_INDEX]->Sampler.WrapS == GL_CLAMP ||
-                   texUnit->CurrentTex[TEXTURE_2D_INDEX]->Sampler.WrapT == GL_CLAMP) {
+               if (sampler->WrapS == GL_CLAMP ||
+                   sampler->WrapT == GL_CLAMP) {
                    return GL_TRUE;
                }
             }
             if (texUnit->Enabled & TEXTURE_3D_BIT) {
-               if (texUnit->CurrentTex[TEXTURE_3D_INDEX]->Sampler.WrapS == GL_CLAMP ||
-                   texUnit->CurrentTex[TEXTURE_3D_INDEX]->Sampler.WrapT == GL_CLAMP ||
-                   texUnit->CurrentTex[TEXTURE_3D_INDEX]->Sampler.WrapR == GL_CLAMP) {
+               if (sampler->WrapS == GL_CLAMP ||
+                   sampler->WrapT == GL_CLAMP ||
+                   sampler->WrapR == GL_CLAMP) {
                    return GL_TRUE;
                }
             }
@@ -359,15 +364,21 @@ static GLboolean brw_try_draw_prims( struct gl_context *ctx,
 
    for (i = 0; i < nr_prims; i++) {
       uint32_t hw_prim;
+      int estimated_max_prim_size;
+
+      estimated_max_prim_size = 512; /* batchbuffer commands */
+      estimated_max_prim_size += (BRW_MAX_TEX_UNIT *
+				  (sizeof(struct brw_sampler_state) +
+				   sizeof(struct gen5_sampler_default_color)));
+      estimated_max_prim_size += 1024; /* gen6 VS push constants */
+      estimated_max_prim_size += 1024; /* gen6 WM push constants */
+      estimated_max_prim_size += 512; /* misc. pad */
 
       /* Flush the batch if it's approaching full, so that we don't wrap while
        * we've got validated state that needs to be in the same batch as the
-       * primitives.  This fraction is just a guess (minimal full state plus
-       * a primitive is around 512 bytes), and would be better if we had
-       * an upper bound of how much we might emit in a single
-       * brw_try_draw_prims().
+       * primitives.
        */
-      intel_batchbuffer_require_space(intel, 1024, false);
+      intel_batchbuffer_require_space(intel, estimated_max_prim_size, false);
 
       hw_prim = brw_set_prim(brw, &prim[i]);
       if (brw->state.dirty.brw) {
@@ -438,6 +449,9 @@ void brw_draw_prims( struct gl_context *ctx,
 {
    GLboolean retval;
 
+   if (!_mesa_check_conditional_render(ctx))
+      return;
+
    if (!vbo_all_varyings_in_vbos(arrays)) {
       if (!index_bounds_valid)
 	 vbo_get_minmax_index(ctx, prim, ib, &min_index, &max_index);
diff --git a/src/mesa/drivers/dri/i965/brw_eu.h b/src/mesa/drivers/dri/i965/brw_eu.h
index 718b3800423..4eb67d57a5a 100644
--- a/src/mesa/drivers/dri/i965/brw_eu.h
+++ b/src/mesa/drivers/dri/i965/brw_eu.h
@@ -293,6 +293,14 @@ static INLINE struct brw_reg retype( struct brw_reg reg,
    return reg;
 }
 
+static inline struct brw_reg
+sechalf(struct brw_reg reg)
+{
+   if (reg.vstride)
+      reg.nr++;
+   return reg;
+}
+
 static INLINE struct brw_reg suboffset( struct brw_reg reg,
 					  GLuint delta )
 {   
@@ -856,7 +864,6 @@ void brw_ff_sync(struct brw_compile *p,
 
 void brw_fb_WRITE(struct brw_compile *p,
 		  int dispatch_width,
-		   struct brw_reg dest,
 		   GLuint msg_reg_nr,
 		   struct brw_reg src0,
 		   GLuint binding_table_index,
diff --git a/src/mesa/drivers/dri/i965/brw_eu_emit.c b/src/mesa/drivers/dri/i965/brw_eu_emit.c
index 71485cd1f71..859068ec4eb 100644
--- a/src/mesa/drivers/dri/i965/brw_eu_emit.c
+++ b/src/mesa/drivers/dri/i965/brw_eu_emit.c
@@ -1871,7 +1871,6 @@ void brw_dp_READ_4_vs_relative(struct brw_compile *p,
 
 void brw_fb_WRITE(struct brw_compile *p,
 		  int dispatch_width,
-                  struct brw_reg dest,
                   GLuint msg_reg_nr,
                   struct brw_reg src0,
                   GLuint binding_table_index,
@@ -1883,6 +1882,12 @@ void brw_fb_WRITE(struct brw_compile *p,
    struct intel_context *intel = &p->brw->intel;
    struct brw_instruction *insn;
    GLuint msg_control, msg_type;
+   struct brw_reg dest;
+
+   if (dispatch_width == 16)
+      dest = retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW);
+   else
+      dest = retype(vec8(brw_null_reg()), BRW_REGISTER_TYPE_UW);
 
    if (intel->gen >= 6 && binding_table_index == 0) {
       insn = next_insn(p, BRW_OPCODE_SENDC);
diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp
index 5426925e372..21eb9e4e5e1 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
@@ -194,6 +194,32 @@ fs_visitor::fail(const char *format, ...)
    }
 }
 
+void
+fs_visitor::push_force_uncompressed()
+{
+   force_uncompressed_stack++;
+}
+
+void
+fs_visitor::pop_force_uncompressed()
+{
+   force_uncompressed_stack--;
+   assert(force_uncompressed_stack >= 0);
+}
+
+void
+fs_visitor::push_force_sechalf()
+{
+   force_sechalf_stack++;
+}
+
+void
+fs_visitor::pop_force_sechalf()
+{
+   force_sechalf_stack--;
+   assert(force_sechalf_stack >= 0);
+}
+
 /**
  * Returns how many MRFs an FS opcode will write over.
  *
@@ -214,9 +240,9 @@ fs_visitor::implied_mrf_writes(fs_inst *inst)
    case FS_OPCODE_LOG2:
    case FS_OPCODE_SIN:
    case FS_OPCODE_COS:
-      return 1;
+      return 1 * c->dispatch_width / 8;
    case FS_OPCODE_POW:
-      return 2;
+      return 2 * c->dispatch_width / 8;
    case FS_OPCODE_TEX:
    case FS_OPCODE_TXB:
    case FS_OPCODE_TXD:
@@ -313,6 +339,31 @@ fs_visitor::variable_storage(ir_variable *var)
    return (fs_reg *)hash_table_find(this->variable_ht, var);
 }
 
+void
+import_uniforms_callback(const void *key,
+			 void *data,
+			 void *closure)
+{
+   struct hash_table *dst_ht = (struct hash_table *)closure;
+   const fs_reg *reg = (const fs_reg *)data;
+
+   if (reg->file != UNIFORM)
+      return;
+
+   hash_table_insert(dst_ht, data, key);
+}
+
+/* For 16-wide, we need to reuse the uniform setup done for 8-wide dispatch.
+ * This imports those uniform definitions into our variable hash table.
+ */
+void
+fs_visitor::import_uniforms(struct hash_table *src_variable_ht)
+{
+   hash_table_call_foreach(src_variable_ht,
+			   import_uniforms_callback,
+			   variable_ht);
+}
+
 /* Our support for uniforms is piggy-backed on the struct
  * gl_fragment_program, because that's where the values actually
  * get stored, rather than in some global gl_shader_program uniform
@@ -614,7 +665,7 @@ fs_visitor::emit_math(fs_opcodes opcode, fs_reg dst, fs_reg src)
 
    if (intel->gen < 6) {
       inst->base_mrf = 2;
-      inst->mlen = 1;
+      inst->mlen = c->dispatch_width / 8;
    }
 
    return inst;
@@ -652,7 +703,7 @@ fs_visitor::emit_math(fs_opcodes opcode, fs_reg dst, fs_reg src0, fs_reg src1)
       inst = emit(opcode, dst, src0, reg_null_f);
 
       inst->base_mrf = base_mrf;
-      inst->mlen = 2;
+      inst->mlen = 2 * c->dispatch_width / 8;
    }
    return inst;
 }
@@ -689,6 +740,13 @@ fs_visitor::visit(ir_variable *ir)
    if (ir->mode == ir_var_uniform) {
       int param_index = c->prog_data.nr_params;
 
+      if (c->dispatch_width == 16) {
+	 if (!variable_storage(ir)) {
+	    fail("Failed to find uniform '%s' in 16-wide\n", ir->name);
+	 }
+	 return;
+      }
+
       if (!strncmp(ir->name, "gl_", 3)) {
 	 setup_builtin_uniform_values(ir);
       } else {
@@ -1233,32 +1291,34 @@ fs_visitor::emit_texture_gen4(ir_texture *ir, fs_reg dst, fs_reg coordinate)
    return inst;
 }
 
+/* gen5's sampler has slots for u, v, r, array index, then optional
+ * parameters like shadow comparator or LOD bias.  If optional
+ * parameters aren't present, those base slots are optional and don't
+ * need to be included in the message.
+ *
+ * We don't fill in the unnecessary slots regardless, which may look
+ * surprising in the disassembly.
+ */
 fs_inst *
 fs_visitor::emit_texture_gen5(ir_texture *ir, fs_reg dst, fs_reg coordinate)
 {
-   /* gen5's SIMD8 sampler has slots for u, v, r, array index, then
-    * optional parameters like shadow comparitor or LOD bias.  If
-    * optional parameters aren't present, those base slots are
-    * optional and don't need to be included in the message.
-    *
-    * We don't fill in the unnecessary slots regardless, which may
-    * look surprising in the disassembly.
-    */
    int mlen = 1; /* g0 header always present. */
    int base_mrf = 1;
+   int reg_width = c->dispatch_width / 8;
 
    for (int i = 0; i < ir->coordinate->type->vector_elements; i++) {
-      emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen + i), coordinate);
+      emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen + i * reg_width),
+	   coordinate);
       coordinate.reg_offset++;
    }
-   mlen += ir->coordinate->type->vector_elements;
+   mlen += ir->coordinate->type->vector_elements * reg_width;
 
    if (ir->shadow_comparitor) {
-      mlen = MAX2(mlen, 5);
+      mlen = MAX2(mlen, 1 + 4 * reg_width);
 
       ir->shadow_comparitor->accept(this);
       emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), this->result);
-      mlen++;
+      mlen += reg_width;
    }
 
    fs_inst *inst = NULL;
@@ -1268,17 +1328,18 @@ fs_visitor::emit_texture_gen5(ir_texture *ir, fs_reg dst, fs_reg coordinate)
       break;
    case ir_txb:
       ir->lod_info.bias->accept(this);
-      mlen = MAX2(mlen, 5);
+      mlen = MAX2(mlen, 1 + 4 * reg_width);
       emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), this->result);
-      mlen++;
+      mlen += reg_width;
 
       inst = emit(FS_OPCODE_TXB, dst);
+
       break;
    case ir_txl:
       ir->lod_info.lod->accept(this);
-      mlen = MAX2(mlen, 5);
+      mlen = MAX2(mlen, 1 + 4 * reg_width);
       emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), this->result);
-      mlen++;
+      mlen += reg_width;
 
       inst = emit(FS_OPCODE_TXL, dst);
       break;
@@ -1290,6 +1351,10 @@ fs_visitor::emit_texture_gen5(ir_texture *ir, fs_reg dst, fs_reg coordinate)
    inst->base_mrf = base_mrf;
    inst->mlen = mlen;
 
+   if (mlen > 11) {
+      fail("Message length >11 disallowed by hardware\n");
+   }
+
    return inst;
 }
 
@@ -1355,6 +1420,12 @@ fs_visitor::visit(ir_texture *ir)
 	 0
       };
 
+      if (c->dispatch_width == 16) {
+	 fail("rectangle scale uniform setup not supported on 16-wide\n");
+	 this->result = fs_reg(this, ir->type);
+	 return;
+      }
+
       c->prog_data.param_convert[c->prog_data.nr_params] =
 	 PARAM_NO_CONVERT;
       c->prog_data.param_convert[c->prog_data.nr_params + 1] =
@@ -1731,6 +1802,10 @@ fs_visitor::visit(ir_if *ir)
 {
    fs_inst *inst;
 
+   if (c->dispatch_width == 16) {
+      fail("Can't support (non-uniform) control flow on 16-wide\n");
+   }
+
    /* Don't point the annotation at the if statement, because then it plus
     * the then and else blocks get printed.
     */
@@ -1771,6 +1846,10 @@ fs_visitor::visit(ir_loop *ir)
 {
    fs_reg counter = reg_undef;
 
+   if (c->dispatch_width == 16) {
+      fail("Can't support (non-uniform) control flow on 16-wide\n");
+   }
+
    if (ir->counter) {
       this->base_ir = ir->counter;
       ir->counter->accept(this);
@@ -1874,6 +1953,11 @@ fs_visitor::emit(fs_inst inst)
    fs_inst *list_inst = new(mem_ctx) fs_inst;
    *list_inst = inst;
 
+   if (force_uncompressed_stack > 0)
+      list_inst->force_uncompressed = true;
+   else if (force_sechalf_stack > 0)
+      list_inst->force_sechalf = true;
+
    list_inst->annotation = this->current_annotation;
    list_inst->ir = this->base_ir;
 
@@ -1916,21 +2000,14 @@ fs_visitor::interp_reg(int location, int channel)
 void
 fs_visitor::emit_interpolation_setup_gen4()
 {
-   struct brw_reg g1_uw = retype(brw_vec1_grf(1, 0), BRW_REGISTER_TYPE_UW);
-
    this->current_annotation = "compute pixel centers";
    this->pixel_x = fs_reg(this, glsl_type::uint_type);
    this->pixel_y = fs_reg(this, glsl_type::uint_type);
    this->pixel_x.type = BRW_REGISTER_TYPE_UW;
    this->pixel_y.type = BRW_REGISTER_TYPE_UW;
-   emit(BRW_OPCODE_ADD,
-	this->pixel_x,
-	fs_reg(stride(suboffset(g1_uw, 4), 2, 4, 0)),
-	fs_reg(brw_imm_v(0x10101010)));
-   emit(BRW_OPCODE_ADD,
-	this->pixel_y,
-	fs_reg(stride(suboffset(g1_uw, 5), 2, 4, 0)),
-	fs_reg(brw_imm_v(0x11001100)));
+
+   emit(FS_OPCODE_PIXEL_X, this->pixel_x);
+   emit(FS_OPCODE_PIXEL_Y, this->pixel_y);
 
    this->current_annotation = "compute pixel deltas from v0";
    if (brw->has_pln) {
@@ -2001,11 +2078,69 @@ fs_visitor::emit_interpolation_setup_gen6()
 }
 
 void
+fs_visitor::emit_color_write(int index, int first_color_mrf, fs_reg color)
+{
+   int reg_width = c->dispatch_width / 8;
+
+   if (c->dispatch_width == 8 || intel->gen == 6) {
+      /* SIMD8 write looks like:
+       * m + 0: r0
+       * m + 1: g0
+       * m + 2: b0
+       * m + 3: a0
+       *
+       * gen6 SIMD16 DP write looks like:
+       * m + 0: r0
+       * m + 1: r1
+       * m + 2: g0
+       * m + 3: g1
+       * m + 4: b0
+       * m + 5: b1
+       * m + 6: a0
+       * m + 7: a1
+       */
+      emit(BRW_OPCODE_MOV, fs_reg(MRF, first_color_mrf + index * reg_width),
+	   color);
+   } else {
+      /* pre-gen6 SIMD16 single source DP write looks like:
+       * m + 0: r0
+       * m + 1: g0
+       * m + 2: b0
+       * m + 3: a0
+       * m + 4: r1
+       * m + 5: g1
+       * m + 6: b1
+       * m + 7: a1
+       */
+      if (brw->has_compr4) {
+	 /* By setting the high bit of the MRF register number, we
+	  * indicate that we want COMPR4 mode - instead of doing the
+	  * usual destination + 1 for the second half we get
+	  * destination + 4.
+	  */
+	 emit(BRW_OPCODE_MOV,
+	      fs_reg(MRF, BRW_MRF_COMPR4 + first_color_mrf + index), color);
+      } else {
+	 push_force_uncompressed();
+	 emit(BRW_OPCODE_MOV, fs_reg(MRF, first_color_mrf + index), color);
+	 pop_force_uncompressed();
+
+	 push_force_sechalf();
+	 color.sechalf = true;
+	 emit(BRW_OPCODE_MOV, fs_reg(MRF, first_color_mrf + index + 4), color);
+	 pop_force_sechalf();
+	 color.sechalf = false;
+      }
+   }
+}
+
+void
 fs_visitor::emit_fb_writes()
 {
    this->current_annotation = "FB write header";
    GLboolean header_present = GL_TRUE;
    int nr = 0;
+   int reg_width = c->dispatch_width / 8;
 
    if (intel->gen >= 6 &&
        !this->kill_emitted &&
@@ -2019,31 +2154,44 @@ fs_visitor::emit_fb_writes()
    }
 
    if (c->aa_dest_stencil_reg) {
+      push_force_uncompressed();
       emit(BRW_OPCODE_MOV, fs_reg(MRF, nr++),
 	   fs_reg(brw_vec8_grf(c->aa_dest_stencil_reg, 0)));
+      pop_force_uncompressed();
    }
 
    /* Reserve space for color. It'll be filled in per MRT below. */
    int color_mrf = nr;
-   nr += 4;
+   nr += 4 * reg_width;
 
    if (c->source_depth_to_render_target) {
+      if (intel->gen == 6 && c->dispatch_width == 16) {
+	 /* For outputting oDepth on gen6, SIMD8 writes have to be
+	  * used.  This would require 8-wide moves of each half to
+	  * message regs, kind of like pre-gen5 SIMD16 FB writes.
+	  * Just bail on doing so for now.
+	  */
+	 fail("Missing support for simd16 depth writes on gen6\n");
+      }
+
       if (c->computes_depth) {
 	 /* Hand over gl_FragDepth. */
 	 assert(this->frag_depth);
 	 fs_reg depth = *(variable_storage(this->frag_depth));
 
-	 emit(BRW_OPCODE_MOV, fs_reg(MRF, nr++), depth);
+	 emit(BRW_OPCODE_MOV, fs_reg(MRF, nr), depth);
       } else {
 	 /* Pass through the payload depth. */
-	 emit(BRW_OPCODE_MOV, fs_reg(MRF, nr++),
+	 emit(BRW_OPCODE_MOV, fs_reg(MRF, nr),
 	      fs_reg(brw_vec8_grf(c->source_depth_reg, 0)));
       }
+      nr += reg_width;
    }
 
    if (c->dest_depth_reg) {
-      emit(BRW_OPCODE_MOV, fs_reg(MRF, nr++),
+      emit(BRW_OPCODE_MOV, fs_reg(MRF, nr),
 	   fs_reg(brw_vec8_grf(c->dest_depth_reg, 0)));
+      nr += reg_width;
    }
 
    fs_reg color = reg_undef;
@@ -2060,7 +2208,7 @@ fs_visitor::emit_fb_writes()
 						 target);
       if (this->frag_color || this->frag_data) {
 	 for (int i = 0; i < 4; i++) {
-	    emit(BRW_OPCODE_MOV, fs_reg(MRF, color_mrf + i), color);
+	    emit_color_write(i, color_mrf, color);
 	    color.reg_offset++;
 	 }
       }
@@ -2084,7 +2232,7 @@ fs_visitor::emit_fb_writes()
 	  * renderbuffer.
 	  */
 	 color.reg_offset += 3;
-	 emit(BRW_OPCODE_MOV, fs_reg(MRF, color_mrf + 3), color);
+	 emit_color_write(3, color_mrf, color);
       }
 
       fs_inst *inst = emit(FS_OPCODE_FB_WRITE);
@@ -2144,8 +2292,7 @@ fs_visitor::generate_fb_write(fs_inst *inst)
    brw_pop_insn_state(p);
 
    brw_fb_WRITE(p,
-		8, /* dispatch_width */
-		retype(vec8(brw_null_reg()), BRW_REGISTER_TYPE_UW),
+		c->dispatch_width,
 		inst->base_mrf,
 		implied_header,
 		inst->target,
@@ -2155,6 +2302,40 @@ fs_visitor::generate_fb_write(fs_inst *inst)
 		inst->header_present);
 }
 
+/* Computes the integer pixel x,y values from the origin.
+ *
+ * This is the basis of gl_FragCoord computation, but is also used
+ * pre-gen6 for computing the deltas from v0 for computing
+ * interpolation.
+ */
+void
+fs_visitor::generate_pixel_xy(struct brw_reg dst, bool is_x)
+{
+   struct brw_reg g1_uw = retype(brw_vec1_grf(1, 0), BRW_REGISTER_TYPE_UW);
+   struct brw_reg src;
+   struct brw_reg deltas;
+
+   if (is_x) {
+      src = stride(suboffset(g1_uw, 4), 2, 4, 0);
+      deltas = brw_imm_v(0x10101010);
+   } else {
+      src = stride(suboffset(g1_uw, 5), 2, 4, 0);
+      deltas = brw_imm_v(0x11001100);
+   }
+
+   if (c->dispatch_width == 16) {
+      dst = vec16(dst);
+   }
+
+   /* We do this 8 or 16-wide, but since the destination is UW we
+    * don't do compression in the 16-wide case.
+    */
+   brw_push_insn_state(p);
+   brw_set_compression_control(p, BRW_COMPRESSION_NONE);
+   brw_ADD(p, dst, src, deltas);
+   brw_pop_insn_state(p);
+}
+
 void
 fs_visitor::generate_linterp(fs_inst *inst,
 			     struct brw_reg dst, struct brw_reg *src)
@@ -2214,8 +2395,16 @@ fs_visitor::generate_math(fs_inst *inst,
       assert(inst->mlen == 0);
 
       if (inst->opcode == FS_OPCODE_POW) {
+	 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
 	 brw_math2(p, dst, op, src[0], src[1]);
+
+	 if (c->dispatch_width == 16) {
+	    brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
+	    brw_math2(p, sechalf(dst), op, sechalf(src[0]), sechalf(src[1]));
+	    brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED);
+	 }
       } else {
+	 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
 	 brw_math(p, dst,
 		  op,
 		  inst->saturate ? BRW_MATH_SATURATE_SATURATE :
@@ -2223,10 +2412,23 @@ fs_visitor::generate_math(fs_inst *inst,
 		  0, src[0],
 		  BRW_MATH_DATA_VECTOR,
 		  BRW_MATH_PRECISION_FULL);
+
+	 if (c->dispatch_width == 16) {
+	    brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
+	    brw_math(p, sechalf(dst),
+		     op,
+		     inst->saturate ? BRW_MATH_SATURATE_SATURATE :
+		     BRW_MATH_SATURATE_NONE,
+		     0, sechalf(src[0]),
+		     BRW_MATH_DATA_VECTOR,
+		     BRW_MATH_PRECISION_FULL);
+	    brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED);
+	 }
       }
-   } else {
+   } else /* gen <= 5 */{
       assert(inst->mlen >= 1);
 
+      brw_set_compression_control(p, BRW_COMPRESSION_NONE);
       brw_math(p, dst,
 	       op,
 	       inst->saturate ? BRW_MATH_SATURATE_SATURATE :
@@ -2234,6 +2436,19 @@ fs_visitor::generate_math(fs_inst *inst,
 	       inst->base_mrf, src[0],
 	       BRW_MATH_DATA_VECTOR,
 	       BRW_MATH_PRECISION_FULL);
+
+      if (c->dispatch_width == 16) {
+	 brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
+	 brw_math(p, sechalf(dst),
+		  op,
+		  inst->saturate ? BRW_MATH_SATURATE_SATURATE :
+		  BRW_MATH_SATURATE_NONE,
+		  inst->base_mrf + 1, sechalf(src[0]),
+		  BRW_MATH_DATA_VECTOR,
+		  BRW_MATH_PRECISION_FULL);
+
+	 brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED);
+      }
    }
 }
 
@@ -2244,6 +2459,12 @@ fs_visitor::generate_tex(fs_inst *inst, struct brw_reg dst, struct brw_reg src)
    int rlen = 4;
    uint32_t simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD8;
 
+   if (c->dispatch_width == 16) {
+      rlen = 8;
+      dst = vec16(dst);
+      simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
+   }
+
    if (intel->gen >= 5) {
       switch (inst->opcode) {
       case FS_OPCODE_TEX:
@@ -2311,11 +2532,6 @@ fs_visitor::generate_tex(fs_inst *inst, struct brw_reg dst, struct brw_reg src)
    }
    assert(msg_type != -1);
 
-   if (simd_mode == BRW_SAMPLER_SIMD_MODE_SIMD16) {
-      rlen = 8;
-      dst = vec16(dst);
-   }
-
    brw_SAMPLE(p,
 	      retype(dst, BRW_REGISTER_TYPE_UW),
 	      inst->base_mrf,
@@ -2408,6 +2624,7 @@ fs_visitor::generate_discard_not(fs_inst *inst, struct brw_reg mask)
    } else {
       brw_push_insn_state(p);
       brw_set_mask_control(p, BRW_MASK_DISABLE);
+      brw_set_compression_control(p, BRW_COMPRESSION_NONE);
       brw_NOT(p, mask, brw_mask_reg(1)); /* IMASK */
       brw_pop_insn_state(p);
    }
@@ -2432,6 +2649,7 @@ fs_visitor::generate_discard_and(fs_inst *inst, struct brw_reg mask)
 
       brw_push_insn_state(p);
       brw_set_mask_control(p, BRW_MASK_DISABLE);
+      brw_set_compression_control(p, BRW_COMPRESSION_NONE);
       brw_AND(p, g1, f0, g1);
       brw_pop_insn_state(p);
    } else {
@@ -2441,6 +2659,7 @@ fs_visitor::generate_discard_and(fs_inst *inst, struct brw_reg mask)
 
       brw_push_insn_state(p);
       brw_set_mask_control(p, BRW_MASK_DISABLE);
+      brw_set_compression_control(p, BRW_COMPRESSION_NONE);
       brw_AND(p, g0, mask, g0);
       brw_pop_insn_state(p);
    }
@@ -2527,6 +2746,9 @@ fs_visitor::generate_pull_constant_load(fs_inst *inst, struct brw_reg dst)
 void
 fs_visitor::setup_paramvalues_refs()
 {
+   if (c->dispatch_width != 8)
+      return;
+
    /* Set up the pointers to ParamValues now that that array is finalized. */
    for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
       c->prog_data.param[i] =
@@ -2538,8 +2760,12 @@ fs_visitor::setup_paramvalues_refs()
 void
 fs_visitor::assign_curb_setup()
 {
-   c->prog_data.first_curbe_grf = c->nr_payload_regs;
    c->prog_data.curb_read_length = ALIGN(c->prog_data.nr_params, 8) / 8;
+   if (c->dispatch_width == 8) {
+      c->prog_data.first_curbe_grf = c->nr_payload_regs;
+   } else {
+      c->prog_data.first_curbe_grf_16 = c->nr_payload_regs;
+   }
 
    /* Map the offsets in the UNIFORM file to fixed HW regs. */
    foreach_iter(exec_list_iterator, iter, this->instructions) {
@@ -2548,7 +2774,7 @@ fs_visitor::assign_curb_setup()
       for (unsigned int i = 0; i < 3; i++) {
 	 if (inst->src[i].file == UNIFORM) {
 	    int constant_nr = inst->src[i].hw_reg + inst->src[i].reg_offset;
-	    struct brw_reg brw_reg = brw_vec1_grf(c->prog_data.first_curbe_grf +
+	    struct brw_reg brw_reg = brw_vec1_grf(c->nr_payload_regs +
 						  constant_nr / 8,
 						  constant_nr % 8);
 
@@ -2600,7 +2826,7 @@ fs_visitor::calculate_urb_setup()
 void
 fs_visitor::assign_urb_setup()
 {
-   int urb_start = c->prog_data.first_curbe_grf + c->prog_data.curb_read_length;
+   int urb_start = c->nr_payload_regs + c->prog_data.curb_read_length;
 
    /* Offset all the urb_setup[] index by the actual position of the
     * setup regs, now that the location of the constants has been chosen.
@@ -2725,6 +2951,11 @@ fs_visitor::setup_pull_constants()
    if (c->prog_data.nr_params <= max_uniform_components)
       return;
 
+   if (c->dispatch_width == 16) {
+      fail("Pull constants not supported in 16-wide\n");
+      return;
+   }
+
    /* Just demote the end of the list.  We could probably do better
     * here, demoting things that are rarely used in the program first.
     */
@@ -2884,7 +3115,9 @@ fs_visitor::propagate_constants()
       if (inst->opcode != BRW_OPCODE_MOV ||
 	  inst->predicated ||
 	  inst->dst.file != GRF || inst->src[0].file != IMM ||
-	  inst->dst.type != inst->src[0].type)
+	  inst->dst.type != inst->src[0].type ||
+	  (c->dispatch_width == 16 &&
+	   (inst->force_uncompressed || inst->force_sechalf)))
 	 continue;
 
       /* Don't bother with cases where we should have had the
@@ -3152,6 +3385,20 @@ fs_visitor::compute_to_mrf()
 	  inst->src[0].abs || inst->src[0].negate || inst->src[0].smear != -1)
 	 continue;
 
+      /* Work out which hardware MRF registers are written by this
+       * instruction.
+       */
+      int mrf_low = inst->dst.hw_reg & ~BRW_MRF_COMPR4;
+      int mrf_high;
+      if (inst->dst.hw_reg & BRW_MRF_COMPR4) {
+	 mrf_high = mrf_low + 4;
+      } else if (c->dispatch_width == 16 &&
+		 (!inst->force_uncompressed && !inst->force_sechalf)) {
+	 mrf_high = mrf_low + 1;
+      } else {
+	 mrf_high = mrf_low;
+      }
+
       /* Can't compute-to-MRF this GRF if someone else was going to
        * read it later.
        */
@@ -3179,11 +3426,21 @@ fs_visitor::compute_to_mrf()
 	    }
 
 	    /* If it's predicated, it (probably) didn't populate all
-	     * the channels.
+	     * the channels.  We might be able to rewrite everything
+	     * that writes that reg, but it would require smarter
+	     * tracking to delay the rewriting until complete success.
 	     */
 	    if (scan_inst->predicated)
 	       break;
 
+	    /* If it's half of a register setup and not the same half as
+	     * the MOV we're trying to remove, bail for now.
+	     */
+	    if (scan_inst->force_uncompressed != inst->force_uncompressed ||
+		scan_inst->force_sechalf != inst->force_sechalf) {
+	       break;
+	    }
+
 	    /* SEND instructions can't have MRF as a destination. */
 	    if (scan_inst->mlen)
 	       break;
@@ -3233,12 +3490,29 @@ fs_visitor::compute_to_mrf()
 	 if (interfered)
 	    break;
 
-	 if (scan_inst->dst.file == MRF &&
-	     scan_inst->dst.hw_reg == inst->dst.hw_reg) {
-	    /* Somebody else wrote our MRF here, so we can't can't
+	 if (scan_inst->dst.file == MRF) {
+	    /* If somebody else writes our MRF here, we can't
 	     * compute-to-MRF before that.
 	     */
-	    break;
+	    int scan_mrf_low = scan_inst->dst.hw_reg & ~BRW_MRF_COMPR4;
+	    int scan_mrf_high;
+
+	    if (scan_inst->dst.hw_reg & BRW_MRF_COMPR4) {
+	       scan_mrf_high = scan_mrf_low + 4;
+	    } else if (c->dispatch_width == 16 &&
+		       (!scan_inst->force_uncompressed &&
+			!scan_inst->force_sechalf)) {
+	       scan_mrf_high = scan_mrf_low + 1;
+	    } else {
+	       scan_mrf_high = scan_mrf_low;
+	    }
+
+	    if (mrf_low == scan_mrf_low ||
+		mrf_low == scan_mrf_high ||
+		mrf_high == scan_mrf_low ||
+		mrf_high == scan_mrf_high) {
+	       break;
+	    }
 	 }
 
 	 if (scan_inst->mlen > 0) {
@@ -3247,8 +3521,12 @@ fs_visitor::compute_to_mrf()
 	     * scan_inst->mlen - 1.  Don't go pushing our MRF write up
 	     * above it.
 	     */
-	    if (inst->dst.hw_reg >= scan_inst->base_mrf &&
-		inst->dst.hw_reg < scan_inst->base_mrf + scan_inst->mlen) {
+	    if (mrf_low >= scan_inst->base_mrf &&
+		mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
+	       break;
+	    }
+	    if (mrf_high >= scan_inst->base_mrf &&
+		mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
 	       break;
 	    }
 	 }
@@ -3268,6 +3546,10 @@ fs_visitor::remove_duplicate_mrf_writes()
    fs_inst *last_mrf_move[16];
    bool progress = false;
 
+   /* TODO: the MRF tracking below doesn't handle compressed instructions. */
+   if (c->dispatch_width == 16)
+      return false;
+
    memset(last_mrf_move, 0, sizeof(last_mrf_move));
 
    foreach_iter(exec_list_iterator, iter, this->instructions) {
@@ -3347,6 +3629,29 @@ fs_visitor::virtual_grf_interferes(int a, int b)
 	  (this->virtual_grf_use[b] != -1 ||
 	   this->virtual_grf_def[b] == MAX_INSTRUCTION));
 
+   /* If the register is used to store 16 values of less than float
+    * size (only the case for pixel_[xy]), then we can't allocate
+    * another dword-sized thing to that register that would be used in
+    * the same instruction.  This is because when the GPU decodes (for
+    * example):
+    *
+    * (declare (in ) vec4 gl_FragCoord@0x97766a0)
+    * add(16)         g6<1>F          g6<8,8,1>UW     0.5F { align1 compr };
+    *
+    * it's actually processed as:
+    * add(8)         g6<1>F          g6<8,8,1>UW     0.5F { align1 };
+    * add(8)         g7<1>F          g6.8<8,8,1>UW   0.5F { align1 sechalf };
+    *
+    * so our second half values in g6 got overwritten in the first
+    * half.
+    */
+   if (c->dispatch_width == 16 && (this->pixel_x.reg == a ||
+				   this->pixel_x.reg == b ||
+				   this->pixel_y.reg == a ||
+				   this->pixel_y.reg == b)) {
+      return start <= end;
+   }
+
    return start < end;
 }
 
@@ -3366,6 +3671,8 @@ static struct brw_reg brw_reg_from_fs_reg(fs_reg *reg)
 				reg->hw_reg, reg->smear);
       }
       brw_reg = retype(brw_reg, reg->type);
+      if (reg->sechalf)
+	 brw_reg = sechalf(brw_reg);
       break;
    case IMM:
       switch (reg->type) {
@@ -3411,7 +3718,7 @@ static struct brw_reg brw_reg_from_fs_reg(fs_reg *reg)
 void
 fs_visitor::generate_code()
 {
-   int last_native_inst = 0;
+   int last_native_inst = p->nr_insn;
    const char *last_annotation_string = NULL;
    ir_instruction *last_annotation_ir = NULL;
 
@@ -3427,8 +3734,8 @@ fs_visitor::generate_code()
 
 
    if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
-      printf("Native code for fragment shader %d:\n",
-	     ctx->Shader.CurrentFragmentProgram->Name);
+      printf("Native code for fragment shader %d (%d-wide dispatch):\n",
+	     ctx->Shader.CurrentFragmentProgram->Name, c->dispatch_width);
    }
 
    foreach_iter(exec_list_iterator, iter, this->instructions) {
@@ -3461,6 +3768,14 @@ fs_visitor::generate_code()
       brw_set_predicate_inverse(p, inst->predicate_inverse);
       brw_set_saturate(p, inst->saturate);
 
+      if (inst->force_uncompressed || c->dispatch_width == 8) {
+	 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
+      } else if (inst->force_sechalf) {
+	 brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
+      } else {
+	 brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED);
+      }
+
       switch (inst->opcode) {
       case BRW_OPCODE_MOV:
 	 brw_MOV(p, dst, src[0]);
@@ -3602,6 +3917,12 @@ fs_visitor::generate_code()
       case FS_OPCODE_COS:
 	 generate_math(inst, dst, src);
 	 break;
+      case FS_OPCODE_PIXEL_X:
+	 generate_pixel_xy(dst, true);
+	 break;
+      case FS_OPCODE_PIXEL_Y:
+	 generate_pixel_xy(dst, false);
+	 break;
       case FS_OPCODE_CINTERP:
 	 brw_MOV(p, dst, src[0]);
 	 break;
@@ -3668,6 +3989,10 @@ fs_visitor::generate_code()
       last_native_inst = p->nr_insn;
    }
 
+   if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
+      printf("\n");
+   }
+
    ralloc_free(if_stack);
    ralloc_free(loop_stack);
    ralloc_free(if_depth_in_loop);
@@ -3693,108 +4018,146 @@ fs_visitor::generate_code()
    }
 }
 
-GLboolean
-brw_wm_fs_emit(struct brw_context *brw, struct brw_wm_compile *c)
+bool
+fs_visitor::run()
 {
-   struct intel_context *intel = &brw->intel;
-   struct gl_context *ctx = &intel->ctx;
-   struct gl_shader_program *prog = ctx->Shader.CurrentFragmentProgram;
+   uint32_t prog_offset_16 = 0;
+   uint32_t orig_nr_params = c->prog_data.nr_params;
 
-   if (!prog)
-      return GL_FALSE;
+   brw_wm_payload_setup(brw, c);
 
-   struct brw_shader *shader =
-     (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
-   if (!shader)
-      return GL_FALSE;
+   if (c->dispatch_width == 16) {
+      /* Pad with NOPs so the 16-wide program starts 64-byte aligned. */
+      while ((c->func.nr_insn * sizeof(struct brw_instruction)) % 64) {
+	 brw_NOP(p);
+      }
 
-   /* We always use 8-wide mode, at least for now.  For one, flow
-    * control only works in 8-wide.  Also, when we're fragment shader
-    * bound, we're almost always under register pressure as well, so
-    * 8-wide would save us from the performance cliff of spilling
-    * regs.
-    */
-   c->dispatch_width = 8;
+      /* Save off the start of this 16-wide program in case we succeed. */
+      prog_offset_16 = c->func.nr_insn * sizeof(struct brw_instruction);
 
-   if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
-      printf("GLSL IR for native fragment shader %d:\n", prog->Name);
-      _mesa_print_ir(shader->ir, NULL);
-      printf("\n");
+      brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED);
    }
 
-   /* Now the main event: Visit the shader IR and generate our FS IR for it.
-    */
-   fs_visitor v(c, shader);
-
    if (0) {
-      v.emit_dummy_fs();
+      emit_dummy_fs();
    } else {
-      v.calculate_urb_setup();
+      calculate_urb_setup();
       if (intel->gen < 6)
-	 v.emit_interpolation_setup_gen4();
+	 emit_interpolation_setup_gen4();
       else
-	 v.emit_interpolation_setup_gen6();
+	 emit_interpolation_setup_gen6();
 
       /* Generate FS IR for main().  (the visitor only descends into
        * functions called "main").
        */
       foreach_iter(exec_list_iterator, iter, *shader->ir) {
 	 ir_instruction *ir = (ir_instruction *)iter.get();
-	 v.base_ir = ir;
-	 ir->accept(&v);
+	 base_ir = ir;
+	 ir->accept(this);
       }
 
-      v.emit_fb_writes();
+      emit_fb_writes();
 
-      v.split_virtual_grfs();
+      split_virtual_grfs();
 
-      v.setup_paramvalues_refs();
-      v.setup_pull_constants();
+      setup_paramvalues_refs();
+      setup_pull_constants();
 
       bool progress;
       do {
 	 progress = false;
 
-	 progress = v.remove_duplicate_mrf_writes() || progress;
+	 progress = remove_duplicate_mrf_writes() || progress;
 
-	 progress = v.propagate_constants() || progress;
-	 progress = v.register_coalesce() || progress;
-	 progress = v.compute_to_mrf() || progress;
-	 progress = v.dead_code_eliminate() || progress;
+	 progress = propagate_constants() || progress;
+	 progress = register_coalesce() || progress;
+	 progress = compute_to_mrf() || progress;
+	 progress = dead_code_eliminate() || progress;
       } while (progress);
 
-      v.schedule_instructions();
+      schedule_instructions();
 
-      v.assign_curb_setup();
-      v.assign_urb_setup();
+      assign_curb_setup();
+      assign_urb_setup();
 
       if (0) {
 	 /* Debug of register spilling: Go spill everything. */
-	 int virtual_grf_count = v.virtual_grf_next;
+	 int virtual_grf_count = virtual_grf_next;
 	 for (int i = 1; i < virtual_grf_count; i++) {
-	    v.spill_reg(i);
+	    spill_reg(i);
 	 }
       }
 
       if (0)
-	 v.assign_regs_trivial();
+	 assign_regs_trivial();
       else {
-	 while (!v.assign_regs()) {
-	    if (v.failed)
+	 while (!assign_regs()) {
+	    if (failed)
 	       break;
 	 }
       }
    }
+   assert(force_uncompressed_stack == 0);
+   assert(force_sechalf_stack == 0);
 
-   if (!v.failed)
-      v.generate_code();
+   if (failed)
+      return false;
 
-   assert(!v.failed); /* FINISHME: Cleanly fail, tested at link time, etc. */
+   generate_code();
 
-   if (v.failed)
-      return GL_FALSE;
+   if (c->dispatch_width == 8) {
+      c->prog_data.total_grf = grf_used;
+   } else {
+      c->prog_data.total_grf_16 = grf_used;
+      c->prog_data.prog_offset_16 = prog_offset_16;
 
-   c->prog_data.total_grf = v.grf_used;
+      /* Make sure we didn't try to sneak in an extra uniform */
+      assert(orig_nr_params == c->prog_data.nr_params);
+   }
 
-   return GL_TRUE;
+   return !failed;
+}
+
+bool
+brw_wm_fs_emit(struct brw_context *brw, struct brw_wm_compile *c)
+{
+   struct intel_context *intel = &brw->intel;
+   struct gl_context *ctx = &intel->ctx;
+   struct gl_shader_program *prog = ctx->Shader.CurrentFragmentProgram;
+
+   if (!prog)
+      return false;
+
+   struct brw_shader *shader =
+     (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
+   if (!shader)
+      return false;
+
+   if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
+      printf("GLSL IR for native fragment shader %d:\n", prog->Name);
+      _mesa_print_ir(shader->ir, NULL);
+      printf("\n\n");
+   }
+
+   /* Now the main event: Visit the shader IR and generate our FS IR for it.
+    */
+   c->dispatch_width = 8;
+
+   fs_visitor v(c, shader);
+   if (!v.run()) {
+      /* FINISHME: Cleanly fail, test at link time, etc. */
+      assert(!"not reached");
+      return false;
+   }
+
+   if (intel->gen >= 5 && c->prog_data.nr_pull_params == 0) {
+      c->dispatch_width = 16;
+      fs_visitor v2(c, shader);
+      v2.import_uniforms(v.variable_ht);
+      v2.run();
+   }
+
+   c->prog_data.dispatch_width = 8;
+
+   return true;
 }
diff --git a/src/mesa/drivers/dri/i965/brw_fs.h b/src/mesa/drivers/dri/i965/brw_fs.h
index f792906cfe7..518d09180c4 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.h
+++ b/src/mesa/drivers/dri/i965/brw_fs.h
@@ -67,6 +67,8 @@ enum fs_opcodes {
    FS_OPCODE_COS,
    FS_OPCODE_DDX,
    FS_OPCODE_DDY,
+   FS_OPCODE_PIXEL_X,
+   FS_OPCODE_PIXEL_Y,
    FS_OPCODE_CINTERP,
    FS_OPCODE_LINTERP,
    FS_OPCODE_TEX,
@@ -176,6 +178,7 @@ public:
    int type;
    bool negate;
    bool abs;
+   bool sechalf;
    struct brw_reg fixed_hw_reg;
    int smear; /* -1, or a channel of the reg to smear to all channels. */
 
@@ -341,6 +344,8 @@ public:
    bool eot;
    bool header_present;
    bool shadow_compare;
+   bool force_uncompressed;
+   bool force_sechalf;
    uint32_t offset; /* spill/unspill offset */
 
    /** @{
@@ -403,6 +408,8 @@ public:
       this->live_intervals_valid = false;
 
       this->kill_emitted = false;
+      this->force_uncompressed_stack = 0;
+      this->force_sechalf_stack = 0;
    }
 
    ~fs_visitor()
@@ -413,6 +420,7 @@ public:
 
    fs_reg *variable_storage(ir_variable *var);
    int virtual_grf_alloc(int size);
+   void import_uniforms(struct hash_table *src_variable_ht);
 
    void visit(ir_variable *ir);
    void visit(ir_assignment *ir);
@@ -459,6 +467,7 @@ public:
       return emit(fs_inst(opcode, dst, src0, src1, src2));
    }
 
+   bool run();
    void setup_paramvalues_refs();
    void assign_curb_setup();
    void calculate_urb_setup();
@@ -479,8 +488,14 @@ public:
    void schedule_instructions();
    void fail(const char *msg, ...);
 
+   void push_force_uncompressed();
+   void pop_force_uncompressed();
+   void push_force_sechalf();
+   void pop_force_sechalf();
+
    void generate_code();
    void generate_fb_write(fs_inst *inst);
+   void generate_pixel_xy(struct brw_reg dst, bool is_x);
    void generate_linterp(fs_inst *inst, struct brw_reg dst,
 			 struct brw_reg *src);
    void generate_tex(fs_inst *inst, struct brw_reg dst, struct brw_reg src);
@@ -508,6 +523,7 @@ public:
    void emit_if_gen6(ir_if *ir);
    void emit_unspill(fs_inst *inst, fs_reg reg, uint32_t spill_offset);
 
+   void emit_color_write(int index, int first_color_mrf, fs_reg color);
    void emit_fb_writes();
    void emit_assignment_writes(fs_reg &l, fs_reg &r,
 			       const glsl_type *type, bool predicated);
@@ -565,6 +581,9 @@ public:
    fs_reg reg_null_cmp;
 
    int grf_used;
+
+   int force_uncompressed_stack;
+   int force_sechalf_stack;
 };
 
 GLboolean brw_do_channel_expressions(struct exec_list *instructions);
diff --git a/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp b/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp
index 67f29ce1816..1e2cf917116 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp
@@ -48,11 +48,11 @@ extern "C" {
 #include "../glsl/ir_print_visitor.h"
 
 static void
-assign_reg(int *reg_hw_locations, fs_reg *reg)
+assign_reg(int *reg_hw_locations, fs_reg *reg, int reg_width)
 {
    if (reg->file == GRF && reg->reg != 0) {
       assert(reg->reg_offset >= 0);
-      reg->hw_reg = reg_hw_locations[reg->reg] + reg->reg_offset;
+      reg->hw_reg = reg_hw_locations[reg->reg] + reg->reg_offset * reg_width;
       reg->reg = 0;
    }
 }
@@ -63,32 +63,48 @@ fs_visitor::assign_regs_trivial()
    int last_grf = 0;
    int hw_reg_mapping[this->virtual_grf_next];
    int i;
+   int reg_width = c->dispatch_width / 8;
 
    hw_reg_mapping[0] = 0;
-   hw_reg_mapping[1] = this->first_non_payload_grf;
+   /* Note that compressed instructions require alignment to 2 registers. */
+   hw_reg_mapping[1] = ALIGN(this->first_non_payload_grf, reg_width);
    for (i = 2; i < this->virtual_grf_next; i++) {
       hw_reg_mapping[i] = (hw_reg_mapping[i - 1] +
-			   this->virtual_grf_sizes[i - 1]);
+			   this->virtual_grf_sizes[i - 1] * reg_width);
    }
-   last_grf = hw_reg_mapping[i - 1] + this->virtual_grf_sizes[i - 1];
+   last_grf = hw_reg_mapping[i - 1] + (this->virtual_grf_sizes[i - 1] *
+				       reg_width);
 
    foreach_iter(exec_list_iterator, iter, this->instructions) {
       fs_inst *inst = (fs_inst *)iter.get();
 
-      assign_reg(hw_reg_mapping, &inst->dst);
-      assign_reg(hw_reg_mapping, &inst->src[0]);
-      assign_reg(hw_reg_mapping, &inst->src[1]);
+      assign_reg(hw_reg_mapping, &inst->dst, reg_width);
+      assign_reg(hw_reg_mapping, &inst->src[0], reg_width);
+      assign_reg(hw_reg_mapping, &inst->src[1], reg_width);
    }
 
-   this->grf_used = last_grf + 1;
+   if (last_grf >= BRW_MAX_GRF) {
+      fail("Ran out of regs on trivial allocator (%d/%d)\n",
+	   last_grf, BRW_MAX_GRF);
+   }
+
+   this->grf_used = last_grf + reg_width;
 }
 
 bool
 fs_visitor::assign_regs()
 {
+   /* Most of this allocation was written for a reg_width of 1
+    * (dispatch_width == 8).  In extending to 16-wide, the code was
+    * left in place and it was converted to have the hardware
+    * registers it's allocating be contiguous physical pairs of regs
+    * for reg_width == 2.
+    */
+   int reg_width = c->dispatch_width / 8;
    int last_grf = 0;
    int hw_reg_mapping[this->virtual_grf_next + 1];
-   int base_reg_count = BRW_MAX_GRF - this->first_non_payload_grf;
+   int first_assigned_grf = ALIGN(this->first_non_payload_grf, reg_width);
+   int base_reg_count = (BRW_MAX_GRF - first_assigned_grf) / reg_width;
    int class_sizes[base_reg_count];
    int class_count = 0;
    int aligned_pair_class = -1;
@@ -157,8 +173,8 @@ fs_visitor::assign_regs()
 
 	       if (0) {
 		  printf("%d/%d conflicts %d/%d\n",
-			 class_sizes[i], this->first_non_payload_grf + i_r,
-			 class_sizes[c], this->first_non_payload_grf + c_r);
+			 class_sizes[i], first_assigned_grf + i_r,
+			 class_sizes[c], first_assigned_grf + c_r);
 	       }
 
 	       ra_add_reg_conflict(regs,
@@ -172,7 +188,7 @@ fs_visitor::assign_regs()
    /* Add a special class for aligned pairs, which we'll put delta_x/y
     * in on gen5 so that we can do PLN.
     */
-   if (brw->has_pln && intel->gen < 6) {
+   if (brw->has_pln && reg_width == 1 && intel->gen < 6) {
       int reg_count = (base_reg_count - 1) / 2;
       int unaligned_pair_class = 1;
       assert(class_sizes[unaligned_pair_class] == 2);
@@ -182,7 +198,7 @@ fs_visitor::assign_regs()
       class_sizes[aligned_pair_class] = 2;
       class_base_reg[aligned_pair_class] = 0;
       class_reg_count[aligned_pair_class] = 0;
-      int start = (this->first_non_payload_grf & 1) ? 1 : 0;
+      int start = (first_assigned_grf & 1) ? 1 : 0;
 
       for (int i = 0; i < reg_count; i++) {
 	 ra_class_add_reg(regs, classes[aligned_pair_class],
@@ -228,6 +244,8 @@ fs_visitor::assign_regs()
 
       if (reg == -1) {
 	 fail("no register to spill\n");
+      } else if (c->dispatch_width == 16) {
+	 fail("no spilling support on 16-wide yet\n");
       } else {
 	 spill_reg(reg);
       }
@@ -257,7 +275,7 @@ fs_visitor::assign_regs()
       }
 
       assert(hw_reg >= 0);
-      hw_reg_mapping[i] = this->first_non_payload_grf + hw_reg;
-      last_grf = MAX2(last_grf,
-		      hw_reg_mapping[i] + this->virtual_grf_sizes[i] - 1);
+      hw_reg_mapping[i] = first_assigned_grf + hw_reg * reg_width;
+      last_grf = MAX2(last_grf, hw_reg_mapping[i] +
+		      (this->virtual_grf_sizes[i] - 1) * reg_width);
    }
@@ -265,12 +283,12 @@ fs_visitor::assign_regs()
    foreach_iter(exec_list_iterator, iter, this->instructions) {
       fs_inst *inst = (fs_inst *)iter.get();
 
-      assign_reg(hw_reg_mapping, &inst->dst);
-      assign_reg(hw_reg_mapping, &inst->src[0]);
-      assign_reg(hw_reg_mapping, &inst->src[1]);
+      assign_reg(hw_reg_mapping, &inst->dst, reg_width);
+      assign_reg(hw_reg_mapping, &inst->src[0], reg_width);
+      assign_reg(hw_reg_mapping, &inst->src[1], reg_width);
    }
 
-   this->grf_used = last_grf + 1;
+   this->grf_used = last_grf + reg_width;
 
    ralloc_free(g);
    ralloc_free(regs);
diff --git a/src/mesa/drivers/dri/i965/brw_fs_schedule_instructions.cpp b/src/mesa/drivers/dri/i965/brw_fs_schedule_instructions.cpp
index bff8f82f3f7..fb1192c810a 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_schedule_instructions.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_schedule_instructions.cpp
@@ -145,6 +145,8 @@ public:
    void calculate_deps();
    void schedule_instructions(fs_inst *next_block_header);
 
+   bool is_compressed(fs_inst *inst);
+
    void *mem_ctx;
 
    int instructions_to_schedule;
@@ -234,6 +236,17 @@ instruction_scheduler::add_barrier_deps(schedule_node *n)
    }
 }
 
+/* Returns true if inst runs compressed (16-wide dispatch, with neither
+ * force_uncompressed nor force_sechalf set), so an MRF write hits 2 MRFs.
+ */
+bool
+instruction_scheduler::is_compressed(fs_inst *inst)
+{
+   return (v->c->dispatch_width == 16 &&
+	   !inst->force_uncompressed &&
+	   !inst->force_sechalf);
+}
+
 void
 instruction_scheduler::calculate_deps()
 {
@@ -297,11 +310,24 @@ instruction_scheduler::calculate_deps()
 	 }
 	 last_grf_write[inst->dst.reg] = n;
       } else if (inst->dst.file == MRF) {
-	 if (last_mrf_write[inst->dst.hw_reg]) {
-	    add_dep(last_mrf_write[inst->dst.hw_reg], n,
-		    last_mrf_write[inst->dst.hw_reg]->latency);
+	 int reg = inst->dst.hw_reg & ~BRW_MRF_COMPR4;
+
+	 if (last_mrf_write[reg]) {
+	    add_dep(last_mrf_write[reg], n,
+		    last_mrf_write[reg]->latency);
+	 }
+	 last_mrf_write[reg] = n;
+	 if (is_compressed(inst)) {
+	    if (inst->dst.hw_reg & BRW_MRF_COMPR4)
+	       reg += 4;
+	    else
+	       reg++;
+	    if (last_mrf_write[reg]) {
+	       add_dep(last_mrf_write[reg], n,
+		       last_mrf_write[reg]->latency);
+	    }
+	    last_mrf_write[reg] = n;
 	 }
-	 last_mrf_write[inst->dst.hw_reg] = n;
       } else if (inst->dst.file != BAD_FILE) {
 	 add_barrier_deps(n);
       }
@@ -369,7 +395,18 @@ instruction_scheduler::calculate_deps()
       if (inst->dst.file == GRF) {
 	 last_grf_write[inst->dst.reg] = n;
       } else if (inst->dst.file == MRF) {
-	 last_mrf_write[inst->dst.hw_reg] = n;
+	 int reg = inst->dst.hw_reg & ~BRW_MRF_COMPR4;
+
+	 last_mrf_write[reg] = n;
+
+	 if (is_compressed(inst)) {
+	    if (inst->dst.hw_reg & BRW_MRF_COMPR4)
+	       reg += 4;
+	    else
+	       reg++;
+
+	    last_mrf_write[reg] = n;
+	 }
       } else if (inst->dst.file != BAD_FILE) {
 	 add_barrier_deps(n);
       }
diff --git a/src/mesa/drivers/dri/i965/brw_gs.c b/src/mesa/drivers/dri/i965/brw_gs.c
index 14ee6767cd5..f213ae20acd 100644
--- a/src/mesa/drivers/dri/i965/brw_gs.c
+++ b/src/mesa/drivers/dri/i965/brw_gs.c
@@ -121,13 +121,11 @@ static void compile_gs_prog( struct brw_context *brw,
    /* Upload
     */
    drm_intel_bo_unreference(brw->gs.prog_bo);
-   brw->gs.prog_bo = brw_upload_cache_with_auxdata(&brw->cache, BRW_GS_PROG,
-						   &c.key, sizeof(c.key),
-						   NULL, 0,
-						   program, program_size,
-						   &c.prog_data,
-						   sizeof(c.prog_data),
-						   &brw->gs.prog_data);
+   brw->gs.prog_bo = brw_upload_cache(&brw->cache, BRW_GS_PROG,
+				      &c.key, sizeof(c.key),
+				      program, program_size,
+				      &c.prog_data, sizeof(c.prog_data),
+				      &brw->gs.prog_data);
 }
 
 static const GLenum gs_prim[GL_POLYGON+1] = {  
@@ -193,7 +191,6 @@ static void prepare_gs_prog(struct brw_context *brw)
    if (brw->gs.prog_active) {
       brw->gs.prog_bo = brw_search_cache(&brw->cache, BRW_GS_PROG,
 					 &key, sizeof(key),
-					 NULL, 0,
 					 &brw->gs.prog_data);
       if (brw->gs.prog_bo == NULL)
 	 compile_gs_prog( brw, &key );
diff --git a/src/mesa/drivers/dri/i965/brw_gs_state.c b/src/mesa/drivers/dri/i965/brw_gs_state.c
index 69a5f7a6667..542874b7706 100644
--- a/src/mesa/drivers/dri/i965/brw_gs_state.c
+++ b/src/mesa/drivers/dri/i965/brw_gs_state.c
@@ -35,112 +35,65 @@
 #include "brw_state.h"
 #include "brw_defines.h"
 
-struct brw_gs_unit_key {
-   unsigned int total_grf;
-   unsigned int urb_entry_read_length;
-
-   unsigned int curbe_offset;
-
-   unsigned int nr_urb_entries, urb_size;
-   GLboolean prog_active;
-};
-
 static void
-gs_unit_populate_key(struct brw_context *brw, struct brw_gs_unit_key *key)
-{
-   memset(key, 0, sizeof(*key));
-
-   /* CACHE_NEW_GS_PROG */
-   key->prog_active = brw->gs.prog_active;
-   if (key->prog_active) {
-      key->total_grf = brw->gs.prog_data->total_grf;
-      key->urb_entry_read_length = brw->gs.prog_data->urb_read_length;
-   } else {
-      key->total_grf = 1;
-      key->urb_entry_read_length = 1;
-   }
-
-   /* BRW_NEW_CURBE_OFFSETS */
-   key->curbe_offset = brw->curbe.clip_start;
-
-   /* BRW_NEW_URB_FENCE */
-   key->nr_urb_entries = brw->urb.nr_gs_entries;
-   key->urb_size = brw->urb.vsize;
-}
-
-static drm_intel_bo *
-gs_unit_create_from_key(struct brw_context *brw, struct brw_gs_unit_key *key)
+brw_prepare_gs_unit(struct brw_context *brw)
 {
    struct intel_context *intel = &brw->intel;
-   struct brw_gs_unit_state gs;
-   drm_intel_bo *bo;
-
-   memset(&gs, 0, sizeof(gs));
-
-   gs.thread0.grf_reg_count = ALIGN(key->total_grf, 16) / 16 - 1;
-   if (key->prog_active) /* reloc */
-      gs.thread0.kernel_start_pointer = brw->gs.prog_bo->offset >> 6;
+   struct brw_gs_unit_state *gs;
 
-   gs.thread1.floating_point_mode = BRW_FLOATING_POINT_NON_IEEE_754;
-   gs.thread1.single_program_flow = 1;
+   gs = brw_state_batch(brw, sizeof(*gs), 32, &brw->gs.state_offset);
 
-   gs.thread3.dispatch_grf_start_reg = 1;
-   gs.thread3.const_urb_entry_read_offset = 0;
-   gs.thread3.const_urb_entry_read_length = 0;
-   gs.thread3.urb_entry_read_offset = 0;
-   gs.thread3.urb_entry_read_length = key->urb_entry_read_length;
+   memset(gs, 0, sizeof(*gs));
 
-   gs.thread4.nr_urb_entries = key->nr_urb_entries;
-   gs.thread4.urb_entry_allocation_size = key->urb_size - 1;
-
-   if (key->nr_urb_entries >= 8)
-      gs.thread4.max_threads = 1;
-   else
-      gs.thread4.max_threads = 0;
-
-   if (intel->gen == 5)
-      gs.thread4.rendering_enable = 1;
-
-   if (unlikely(INTEL_DEBUG & DEBUG_STATS))
-      gs.thread4.stats_enable = 1;
-
-   bo = brw_upload_cache(&brw->cache, BRW_GS_UNIT,
-			 key, sizeof(*key),
-			 &brw->gs.prog_bo, 1,
-			 &gs, sizeof(gs));
+   /* CACHE_NEW_GS_PROG */
+   if (brw->gs.prog_active) {
+      gs->thread0.grf_reg_count = (ALIGN(brw->gs.prog_data->total_grf, 16) /
+				   16 - 1);
+      /* reloc */
+      gs->thread0.kernel_start_pointer = brw->gs.prog_bo->offset >> 6;
+
+      gs->thread1.floating_point_mode = BRW_FLOATING_POINT_NON_IEEE_754;
+      gs->thread1.single_program_flow = 1;
+
+      gs->thread3.dispatch_grf_start_reg = 1;
+      gs->thread3.const_urb_entry_read_offset = 0;
+      gs->thread3.const_urb_entry_read_length = 0;
+      gs->thread3.urb_entry_read_offset = 0;
+      gs->thread3.urb_entry_read_length = brw->gs.prog_data->urb_read_length;
+
+      /* BRW_NEW_URB_FENCE */
+      gs->thread4.nr_urb_entries = brw->urb.nr_gs_entries;
+      gs->thread4.urb_entry_allocation_size = brw->urb.vsize - 1;
+
+      if (brw->urb.nr_gs_entries >= 8)
+	 gs->thread4.max_threads = 1;
+      else
+	 gs->thread4.max_threads = 0;
 
-   if (key->prog_active) {
       /* Emit GS program relocation */
-      drm_intel_bo_emit_reloc(bo, offsetof(struct brw_gs_unit_state, thread0),
-			      brw->gs.prog_bo, gs.thread0.grf_reg_count << 1,
+      drm_intel_bo_emit_reloc(intel->batch.bo,
+			      (brw->gs.state_offset +
+			       offsetof(struct brw_gs_unit_state, thread0)),
+			      brw->gs.prog_bo, gs->thread0.grf_reg_count << 1,
 			      I915_GEM_DOMAIN_INSTRUCTION, 0);
    }
 
-   return bo;
-}
-
-static void prepare_gs_unit(struct brw_context *brw)
-{
-   struct brw_gs_unit_key key;
+   if (intel->gen == 5)
+      gs->thread4.rendering_enable = 1;
 
-   gs_unit_populate_key(brw, &key);
+   if (unlikely(INTEL_DEBUG & DEBUG_STATS))
+      gs->thread4.stats_enable = 1;
 
-   drm_intel_bo_unreference(brw->gs.state_bo);
-   brw->gs.state_bo = brw_search_cache(&brw->cache, BRW_GS_UNIT,
-				       &key, sizeof(key),
-				       &brw->gs.prog_bo, 1,
-				       NULL);
-   if (brw->gs.state_bo == NULL) {
-      brw->gs.state_bo = gs_unit_create_from_key(brw, &key);
-   }
+   brw->state.dirty.cache |= CACHE_NEW_GS_UNIT;
 }
 
 const struct brw_tracked_state brw_gs_unit = {
    .dirty = {
       .mesa  = 0,
-      .brw   = (BRW_NEW_CURBE_OFFSETS |
+      .brw   = (BRW_NEW_BATCH |
+		BRW_NEW_CURBE_OFFSETS |
 		BRW_NEW_URB_FENCE),
       .cache = CACHE_NEW_GS_PROG
    },
-   .prepare = prepare_gs_unit,
+   .prepare = brw_prepare_gs_unit,
 };
diff --git a/src/mesa/drivers/dri/i965/brw_misc_state.c b/src/mesa/drivers/dri/i965/brw_misc_state.c
index 19eea07ebc6..7119786de42 100644
--- a/src/mesa/drivers/dri/i965/brw_misc_state.c
+++ b/src/mesa/drivers/dri/i965/brw_misc_state.c
@@ -143,15 +143,19 @@ static void upload_pipelined_state_pointers(struct brw_context *brw )
 
    BEGIN_BATCH(7);
    OUT_BATCH(_3DSTATE_PIPELINED_POINTERS << 16 | (7 - 2));
-   OUT_RELOC(brw->vs.state_bo, I915_GEM_DOMAIN_INSTRUCTION, 0, 0);
+   OUT_RELOC(intel->batch.bo, I915_GEM_DOMAIN_INSTRUCTION, 0,
+	     brw->vs.state_offset);
    if (brw->gs.prog_active)
-      OUT_RELOC(brw->gs.state_bo, I915_GEM_DOMAIN_INSTRUCTION, 0, 1);
+      OUT_RELOC(brw->intel.batch.bo, I915_GEM_DOMAIN_INSTRUCTION, 0,
+		brw->gs.state_offset | 1);
    else
       OUT_BATCH(0);
-   OUT_RELOC(brw->clip.state_bo, I915_GEM_DOMAIN_INSTRUCTION, 0, 1);
+   OUT_RELOC(brw->intel.batch.bo, I915_GEM_DOMAIN_INSTRUCTION, 0,
+	     brw->clip.state_offset | 1);
    OUT_RELOC(brw->intel.batch.bo, I915_GEM_DOMAIN_INSTRUCTION, 0,
 	     brw->sf.state_offset);
-   OUT_RELOC(brw->wm.state_bo, I915_GEM_DOMAIN_INSTRUCTION, 0, 0);
+   OUT_RELOC(brw->intel.batch.bo, I915_GEM_DOMAIN_INSTRUCTION, 0,
+	     brw->wm.state_offset);
    OUT_RELOC(brw->intel.batch.bo, I915_GEM_DOMAIN_INSTRUCTION, 0,
 	     brw->cc.state_offset);
    ADVANCE_BATCH();
@@ -159,16 +163,6 @@ static void upload_pipelined_state_pointers(struct brw_context *brw )
    brw->state.dirty.brw |= BRW_NEW_PSP;
 }
 
-
-static void prepare_psp_urb_cbs(struct brw_context *brw)
-{
-   brw_add_validated_bo(brw, brw->vs.state_bo);
-   brw_add_validated_bo(brw, brw->gs.state_bo);
-   brw_add_validated_bo(brw, brw->clip.state_bo);
-   brw_add_validated_bo(brw, brw->sf.state_bo);
-   brw_add_validated_bo(brw, brw->wm.state_bo);
-}
-
 static void upload_psp_urb_cbs(struct brw_context *brw )
 {
    upload_pipelined_state_pointers(brw);
@@ -188,7 +182,6 @@ const struct brw_tracked_state brw_psp_urb_cbs = {
 		CACHE_NEW_WM_UNIT | 
 		CACHE_NEW_CC_UNIT)
    },
-   .prepare = prepare_psp_urb_cbs,
    .emit = upload_psp_urb_cbs,
 };
 
@@ -551,12 +544,28 @@ static void upload_state_base_address( struct brw_context *brw )
    if (intel->gen >= 6) {
        BEGIN_BATCH(10);
        OUT_BATCH(CMD_STATE_BASE_ADDRESS << 16 | (10 - 2));
-       OUT_BATCH(1); /* General state base address */
-       OUT_RELOC(intel->batch.bo, I915_GEM_DOMAIN_SAMPLER, 0,
-		 1); /* Surface state base address */
-       OUT_BATCH(1); /* Dynamic state base address */
-       OUT_BATCH(1); /* Indirect object base address */
-       OUT_BATCH(1); /* Instruction base address */
+       /* General state base address: stateless DP read/write requests */
+       OUT_BATCH(1);
+       /* Surface state base address:
+	* BINDING_TABLE_STATE
+	* SURFACE_STATE
+	*/
+       OUT_RELOC(intel->batch.bo, I915_GEM_DOMAIN_SAMPLER, 0, 1);
+        /* Dynamic state base address:
+	 * SAMPLER_STATE
+	 * SAMPLER_BORDER_COLOR_STATE
+	 * CLIP, SF, WM/CC viewport state
+	 * COLOR_CALC_STATE
+	 * DEPTH_STENCIL_STATE
+	 * BLEND_STATE
+	 * Push constants (when INSTPM: CONSTANT_BUFFER Address Offset
+	 * Disable is clear, which we rely on)
+	 */
+       OUT_RELOC(intel->batch.bo, (I915_GEM_DOMAIN_RENDER |
+				   I915_GEM_DOMAIN_INSTRUCTION), 0, 1);
+
+       OUT_BATCH(1); /* Indirect object base address: MEDIA_OBJECT data */
+       OUT_BATCH(1); /* Instruction base address: shader kernels (incl. SIP) */
        OUT_BATCH(1); /* General state upper bound */
        OUT_BATCH(1); /* Dynamic state upper bound */
        OUT_BATCH(1); /* Indirect object upper bound */
diff --git a/src/mesa/drivers/dri/i965/brw_sf.c b/src/mesa/drivers/dri/i965/brw_sf.c
index 6da155b1a9b..5a03851b8e6 100644
--- a/src/mesa/drivers/dri/i965/brw_sf.c
+++ b/src/mesa/drivers/dri/i965/brw_sf.c
@@ -119,13 +119,11 @@ static void compile_sf_prog( struct brw_context *brw,
    /* Upload
     */
    drm_intel_bo_unreference(brw->sf.prog_bo);
-   brw->sf.prog_bo = brw_upload_cache_with_auxdata(&brw->cache, BRW_SF_PROG,
-						   &c.key, sizeof(c.key),
-						   NULL, 0,
-						   program, program_size,
-						   &c.prog_data,
-						   sizeof(c.prog_data),
-						   &brw->sf.prog_data);
+   brw->sf.prog_bo = brw_upload_cache(&brw->cache, BRW_SF_PROG,
+				      &c.key, sizeof(c.key),
+				      program, program_size,
+				      &c.prog_data, sizeof(c.prog_data),
+				      &brw->sf.prog_data);
 }
 
 /* Calculate interpolants for triangle and line rasterization.
@@ -194,7 +192,6 @@ static void upload_sf_prog(struct brw_context *brw)
    drm_intel_bo_unreference(brw->sf.prog_bo);
    brw->sf.prog_bo = brw_search_cache(&brw->cache, BRW_SF_PROG,
 				      &key, sizeof(key),
-				      NULL, 0,
 				      &brw->sf.prog_data);
    if (brw->sf.prog_bo == NULL)
       compile_sf_prog( brw, &key );
diff --git a/src/mesa/drivers/dri/i965/brw_sf_state.c b/src/mesa/drivers/dri/i965/brw_sf_state.c
index 66d91a0bde7..78b22c4df3d 100644
--- a/src/mesa/drivers/dri/i965/brw_sf_state.c
+++ b/src/mesa/drivers/dri/i965/brw_sf_state.c
@@ -39,7 +39,7 @@
 static void upload_sf_vp(struct brw_context *brw)
 {
    struct intel_context *intel = &brw->intel;
-   struct gl_context *ctx = &brw->intel.ctx;
+   struct gl_context *ctx = &intel->ctx;
    const GLfloat depth_scale = 1.0F / ctx->DrawBuffer->_DepthMaxF;
    struct brw_sf_viewport *sfv;
    GLfloat y_scale, y_bias;
@@ -106,11 +106,6 @@ static void upload_sf_vp(struct brw_context *brw)
       sfv->scissor.ymax = ctx->DrawBuffer->Height - ctx->DrawBuffer->_Ymin - 1;
    }
 
-   /* Keep a pointer to it for brw_state_dump.c */
-   drm_intel_bo_unreference(brw->sf.vp_bo);
-   drm_intel_bo_reference(intel->batch.bo);
-   brw->sf.vp_bo = intel->batch.bo;
-
    brw->state.dirty.cache |= CACHE_NEW_SF_VP;
 }
 
@@ -177,7 +172,7 @@ static void upload_sf_unit( struct brw_context *brw )
       sf->thread4.stats_enable = 1;
 
    /* CACHE_NEW_SF_VP */
-   sf->sf5.sf_viewport_state_offset = (brw->sf.vp_bo->offset +
+   sf->sf5.sf_viewport_state_offset = (intel->batch.bo->offset +
 				       brw->sf.vp_offset) >> 5; /* reloc */
 
    sf->sf5.viewport_transform = 1;
diff --git a/src/mesa/drivers/dri/i965/brw_state.c b/src/mesa/drivers/dri/i965/brw_state.c
deleted file mode 100644
index 13b231d5cf5..00000000000
--- a/src/mesa/drivers/dri/i965/brw_state.c
+++ /dev/null
@@ -1,49 +0,0 @@
-/*
- * Copyright © 2010 Intel Corporation
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- *
- * Authors:
- *    Eric Anholt <[email protected]>
- *
- */
-
-#include "brw_context.h"
-
-void
-brw_enable(struct gl_context *ctx, GLenum cap, GLboolean state)
-{
-   struct brw_context *brw = brw_context(ctx);
-
-   switch (cap) {
-   case GL_DEPTH_CLAMP:
-      brw_update_cc_vp(brw);
-      break;
-   }
-}
-
-void
-brw_depth_range(struct gl_context *ctx, GLclampd nearval, GLclampd farval)
-{
-   struct brw_context *brw = brw_context(ctx);
-
-   if (ctx->Transform.DepthClamp)
-      brw_update_cc_vp(brw);
-}
diff --git a/src/mesa/drivers/dri/i965/brw_state.h b/src/mesa/drivers/dri/i965/brw_state.h
index 86b0caa4a4e..8b9e3a4ec5d 100644
--- a/src/mesa/drivers/dri/i965/brw_state.h
+++ b/src/mesa/drivers/dri/i965/brw_state.h
@@ -47,6 +47,7 @@ brw_add_validated_bo(struct brw_context *brw, drm_intel_bo *bo)
 };
 
 extern const struct brw_tracked_state brw_blend_constant_color;
+extern const struct brw_tracked_state brw_cc_vp;
 extern const struct brw_tracked_state brw_cc_unit;
 extern const struct brw_tracked_state brw_check_fallback;
 extern const struct brw_tracked_state brw_clip_prog;
@@ -102,11 +103,11 @@ extern const struct brw_tracked_state gen6_depth_stencil_state;
 extern const struct brw_tracked_state gen6_gs_state;
 extern const struct brw_tracked_state gen6_sampler_state;
 extern const struct brw_tracked_state gen6_scissor_state;
-extern const struct brw_tracked_state gen6_scissor_state_pointers;
 extern const struct brw_tracked_state gen6_sf_state;
 extern const struct brw_tracked_state gen6_sf_vp;
 extern const struct brw_tracked_state gen6_urb;
 extern const struct brw_tracked_state gen6_viewport_state;
+extern const struct brw_tracked_state gen6_vs_constants;
 extern const struct brw_tracked_state gen6_vs_state;
 extern const struct brw_tracked_state gen6_wm_constants;
 extern const struct brw_tracked_state gen6_wm_state;
@@ -123,38 +124,21 @@ void brw_clear_validated_bos(struct brw_context *brw);
 /***********************************************************************
  * brw_state_cache.c
  */
-drm_intel_bo *brw_cache_data(struct brw_cache *cache,
-		       enum brw_cache_id cache_id,
-		       const void *data,
-		       GLuint size);
 
 drm_intel_bo *brw_upload_cache(struct brw_cache *cache,
 			       enum brw_cache_id cache_id,
 			       const void *key,
 			       GLuint key_sz,
-			       drm_intel_bo **reloc_bufs,
-			       GLuint nr_reloc_bufs,
 			       const void *data,
-			       GLuint data_sz);
-
-drm_intel_bo *brw_upload_cache_with_auxdata(struct brw_cache *cache,
-					    enum brw_cache_id cache_id,
-					    const void *key,
-					    GLuint key_sz,
-					    drm_intel_bo **reloc_bufs,
-					    GLuint nr_reloc_bufs,
-					    const void *data,
-					    GLuint data_sz,
-					    const void *aux,
-					    GLuint aux_sz,
-					    void *aux_return);
+			       GLuint data_sz,
+			       const void *aux,
+			       GLuint aux_sz,
+			       void *aux_return);
 
 drm_intel_bo *brw_search_cache( struct brw_cache *cache,
 			  enum brw_cache_id cache_id,
 			  const void *key,
 			  GLuint key_size,
-			  drm_intel_bo **reloc_bufs,
-			  GLuint nr_reloc_bufs,
 			  void *aux_return);
 void brw_state_cache_check_size( struct brw_context *brw );
 
diff --git a/src/mesa/drivers/dri/i965/brw_state_cache.c b/src/mesa/drivers/dri/i965/brw_state_cache.c
index 01eeb19a684..f13a41fa7cc 100644
--- a/src/mesa/drivers/dri/i965/brw_state_cache.c
+++ b/src/mesa/drivers/dri/i965/brw_state_cache.c
@@ -31,29 +31,17 @@
 
 /** @file brw_state_cache.c
  *
- * This file implements a simple static state cache for 965.  The consumers
- * can query the hash table of state using a cache_id, opaque key data,
- * and list of buffers that will be used in relocations, and receive the
- * corresponding state buffer object of state (plus associated auxiliary
- * data) in return.
+ * This file implements a simple static state cache for 965.  The
+ * consumers can query the hash table of state using a cache_id,
+ * opaque key data, and receive the corresponding state buffer object
+ * of state (plus associated auxiliary data) in return.  Objects in
+ * the cache may not have relocations (pointers to other BOs) in them.
  *
- * The inner workings are a simple hash table based on a CRC of the key data.
- * The cache_id and relocation target buffers associated with the state
- * buffer are included as auxiliary key data, but are not part of the hash
- * value (this should be fixed, but will likely be fixed instead by making
- * consumers use structured keys).
+ * The inner workings are a simple hash table based on a CRC of the
+ * key data.
  *
- * Replacement is not implemented.  Instead, when the cache gets too big, at
- * a safe point (unlock) we throw out all of the cache data and let it
- * regenerate for the next rendering operation.
- *
- * The reloc_buf pointers need to be included as key data, otherwise the
- * non-unique values stuffed in the offset in key data through
- * brw_cache_data() may result in successful probe for state buffers
- * even when the buffer being referenced doesn't match.  The result would be
- * that the same state cache entry is used twice for different buffers,
- * only one of the two buffers referenced gets put into the offset, and the
- * incorrect program is run for the other instance.
+ * Replacement is not implemented.  Instead, when the cache gets too
+ * big we throw out all of the cache data and let it get regenerated.
  */
 
 #include "main/imports.h"
@@ -76,13 +64,6 @@ hash_key(struct brw_cache_item *item)
       hash = (hash << 5) | (hash >> 27);
    }
 
-   /* Include the BO pointers as key data as well */
-   ikey = (GLuint *)item->reloc_bufs;
-   for (i = 0; i < item->nr_reloc_bufs * sizeof(drm_intel_bo *) / 4; i++) {
-      hash ^= ikey[i];
-      hash = (hash << 5) | (hash >> 27);
-   }
-
    return hash;
 }
 
@@ -110,10 +91,7 @@ brw_cache_item_equals(const struct brw_cache_item *a,
    return a->cache_id == b->cache_id &&
       a->hash == b->hash &&
       a->key_size == b->key_size &&
-      (memcmp(a->key, b->key, a->key_size) == 0) &&
-      a->nr_reloc_bufs == b->nr_reloc_bufs &&
-      (memcmp(a->reloc_bufs, b->reloc_bufs,
-	      a->nr_reloc_bufs * sizeof(drm_intel_bo *)) == 0);
+      (memcmp(a->key, b->key, a->key_size) == 0);
 }
 
 static struct brw_cache_item *
@@ -170,9 +148,7 @@ rehash(struct brw_cache *cache)
 drm_intel_bo *
 brw_search_cache(struct brw_cache *cache,
                  enum brw_cache_id cache_id,
-                 const void *key,
-                 GLuint key_size,
-                 drm_intel_bo **reloc_bufs, GLuint nr_reloc_bufs,
+                 const void *key, GLuint key_size,
                  void *aux_return)
 {
    struct brw_cache_item *item;
@@ -182,8 +158,6 @@ brw_search_cache(struct brw_cache *cache,
    lookup.cache_id = cache_id;
    lookup.key = key;
    lookup.key_size = key_size;
-   lookup.reloc_bufs = reloc_bufs;
-   lookup.nr_reloc_bufs = nr_reloc_bufs;
    hash = hash_key(&lookup);
    lookup.hash = hash;
 
@@ -203,30 +177,24 @@ brw_search_cache(struct brw_cache *cache,
 
 
 drm_intel_bo *
-brw_upload_cache_with_auxdata(struct brw_cache *cache,
-			      enum brw_cache_id cache_id,
-			      const void *key,
-			      GLuint key_size,
-			      drm_intel_bo **reloc_bufs,
-			      GLuint nr_reloc_bufs,
-			      const void *data,
-			      GLuint data_size,
-			      const void *aux,
-			      GLuint aux_size,
-			      void *aux_return)
+brw_upload_cache(struct brw_cache *cache,
+		 enum brw_cache_id cache_id,
+		 const void *key,
+		 GLuint key_size,
+		 const void *data,
+		 GLuint data_size,
+		 const void *aux,
+		 GLuint aux_size,
+		 void *aux_return)
 {
    struct brw_cache_item *item = CALLOC_STRUCT(brw_cache_item);
    GLuint hash;
-   GLuint relocs_size = nr_reloc_bufs * sizeof(drm_intel_bo *);
    void *tmp;
    drm_intel_bo *bo;
-   int i;
 
    item->cache_id = cache_id;
    item->key = key;
    item->key_size = key_size;
-   item->reloc_bufs = reloc_bufs;
-   item->nr_reloc_bufs = nr_reloc_bufs;
    hash = hash_key(item);
    item->hash = hash;
 
@@ -235,19 +203,13 @@ brw_upload_cache_with_auxdata(struct brw_cache *cache,
 			   cache->name[cache_id], data_size, 1 << 6);
 
 
-   /* Set up the memory containing the key, aux_data, and reloc_bufs */
-   tmp = malloc(key_size + aux_size + relocs_size);
+   /* Set up the memory containing the key and aux_data */
+   tmp = malloc(key_size + aux_size);
 
    memcpy(tmp, key, key_size);
    memcpy(tmp + key_size, aux, aux_size);
-   memcpy(tmp + key_size + aux_size, reloc_bufs, relocs_size);
-   for (i = 0; i < nr_reloc_bufs; i++) {
-      if (reloc_bufs[i] != NULL)
-	 drm_intel_bo_reference(reloc_bufs[i]);
-   }
 
    item->key = tmp;
-   item->reloc_bufs = tmp + key_size + aux_size;
 
    item->bo = bo;
    drm_intel_bo_reference(bo);
@@ -276,73 +238,6 @@ brw_upload_cache_with_auxdata(struct brw_cache *cache,
    return bo;
 }
 
-drm_intel_bo *
-brw_upload_cache(struct brw_cache *cache,
-		 enum brw_cache_id cache_id,
-		 const void *key,
-		 GLuint key_size,
-		 drm_intel_bo **reloc_bufs,
-		 GLuint nr_reloc_bufs,
-		 const void *data,
-		 GLuint data_size)
-{
-   return brw_upload_cache_with_auxdata(cache, cache_id,
-					key, key_size,
-					reloc_bufs, nr_reloc_bufs,
-					data, data_size,
-					NULL, 0,
-					NULL);
-}
-
-/**
- * Wrapper around brw_cache_data_sz using the cache_id's canonical key size.
- *
- * If nr_reloc_bufs is nonzero, brw_search_cache()/brw_upload_cache() would be
- * better to use, as the potentially changing offsets in the data-used-as-key
- * will result in excessive cache misses.
- *
- * If aux data is involved, use search/upload instead.
-
- */
-drm_intel_bo *
-brw_cache_data(struct brw_cache *cache,
-	       enum brw_cache_id cache_id,
-	       const void *data,
-	       GLuint data_size)
-{
-   drm_intel_bo *bo;
-   struct brw_cache_item *item, lookup;
-   GLuint hash;
-
-   lookup.cache_id = cache_id;
-   lookup.key = data;
-   lookup.key_size = data_size;
-   lookup.reloc_bufs = NULL;
-   lookup.nr_reloc_bufs = 0;
-   hash = hash_key(&lookup);
-   lookup.hash = hash;
-
-   item = search_cache(cache, hash, &lookup);
-   if (item) {
-      update_cache_last(cache, cache_id, item->bo);
-      drm_intel_bo_reference(item->bo);
-      return item->bo;
-   }
-
-   bo = brw_upload_cache(cache, cache_id,
-			 data, data_size,
-			 NULL, 0,
-			 data, data_size);
-
-   return bo;
-}
-
-enum pool_type {
-   DW_SURFACE_STATE,
-   DW_GENERAL_STATE
-};
-
-
 static void
 brw_init_cache_id(struct brw_cache *cache,
                   const char *name,
@@ -352,8 +247,8 @@ brw_init_cache_id(struct brw_cache *cache,
 }
 
 
-static void
-brw_init_non_surface_cache(struct brw_context *brw)
+void
+brw_init_caches(struct brw_context *brw)
 {
    struct brw_cache *cache = &brw->cache;
 
@@ -367,7 +262,6 @@ brw_init_non_surface_cache(struct brw_context *brw)
    brw_init_cache_id(cache, "CC_VP", BRW_CC_VP);
    brw_init_cache_id(cache, "CC_UNIT", BRW_CC_UNIT);
    brw_init_cache_id(cache, "WM_PROG", BRW_WM_PROG);
-   brw_init_cache_id(cache, "SAMPLER_DEFAULT_COLOR", BRW_SAMPLER_DEFAULT_COLOR);
    brw_init_cache_id(cache, "SAMPLER", BRW_SAMPLER);
    brw_init_cache_id(cache, "WM_UNIT", BRW_WM_UNIT);
    brw_init_cache_id(cache, "SF_PROG", BRW_SF_PROG);
@@ -392,13 +286,6 @@ brw_init_non_surface_cache(struct brw_context *brw)
    brw_init_cache_id(cache, "DEPTH_STENCIL_STATE", BRW_DEPTH_STENCIL_STATE);
 }
 
-void
-brw_init_caches(struct brw_context *brw)
-{
-   brw_init_non_surface_cache(brw);
-}
-
-
 static void
 brw_clear_cache(struct brw_context *brw, struct brw_cache *cache)
 {
@@ -409,11 +296,7 @@ brw_clear_cache(struct brw_context *brw, struct brw_cache *cache)
 
    for (i = 0; i < cache->size; i++) {
       for (c = cache->items[i]; c; c = next) {
-	 int j;
-
 	 next = c->next;
-	 for (j = 0; j < c->nr_reloc_bufs; j++)
-	    drm_intel_bo_unreference(c->reloc_bufs[j]);
 	 drm_intel_bo_unreference(c->bo);
 	 free((void *)c->key);
 	 free(c);
diff --git a/src/mesa/drivers/dri/i965/brw_state_dump.c b/src/mesa/drivers/dri/i965/brw_state_dump.c
index b393259c915..3a3aa8c0346 100644
--- a/src/mesa/drivers/dri/i965/brw_state_dump.c
+++ b/src/mesa/drivers/dri/i965/brw_state_dump.c
@@ -140,19 +140,15 @@ static void dump_wm_surface_state(struct brw_context *brw)
 
 static void dump_wm_sampler_state(struct brw_context *brw)
 {
+   struct intel_context *intel = &brw->intel;
    struct gl_context *ctx = &brw->intel.ctx;
    int i;
 
-   if (!brw->wm.sampler_bo) {
-      fprintf(stderr, "WM_SAMPLER: NULL\n");
-      return;
-   }
-
-   drm_intel_bo_map(brw->wm.sampler_bo, GL_FALSE);
+   drm_intel_bo_map(intel->batch.bo, GL_FALSE);
    for (i = 0; i < BRW_MAX_TEX_UNIT; i++) {
       unsigned int offset;
+      uint32_t sdc_offset;
       struct brw_sampler_state *samp;
-      struct brw_sampler_default_color *sdc;
       char name[20];
 
       if (!ctx->Texture.Unit[i]._ReallyEnabled) {
@@ -160,9 +156,11 @@ static void dump_wm_sampler_state(struct brw_context *brw)
 	 continue;
       }
 
-      offset = brw->wm.sampler_bo->offset +
-	 i * sizeof(struct brw_sampler_state);
-      samp = (struct brw_sampler_state *)(brw->wm.sampler_bo->virtual +
+      offset = (intel->batch.bo->offset +
+		brw->wm.sampler_offset +
+		i * sizeof(struct brw_sampler_state));
+      samp = (struct brw_sampler_state *)(intel->batch.bo->virtual +
+					  brw->wm.sampler_offset +
 					  i * sizeof(struct brw_sampler_state));
 
       sprintf(name, "WM SAMP%d", i);
@@ -173,30 +171,45 @@ static void dump_wm_sampler_state(struct brw_context *brw)
 
       sprintf(name, " WM SDC%d", i);
 
-      drm_intel_bo_map(brw->wm.sdc_bo[i], GL_FALSE);
-      sdc = (struct brw_sampler_default_color *)(brw->wm.sdc_bo[i]->virtual);
-      state_out(name, sdc, brw->wm.sdc_bo[i]->offset, 0, "r\n");
-      state_out(name, sdc, brw->wm.sdc_bo[i]->offset, 1, "g\n");
-      state_out(name, sdc, brw->wm.sdc_bo[i]->offset, 2, "b\n");
-      state_out(name, sdc, brw->wm.sdc_bo[i]->offset, 3, "a\n");
-      drm_intel_bo_unmap(brw->wm.sdc_bo[i]);
+      sdc_offset = intel->batch.bo->offset + brw->wm.sdc_offset[i];
+      if (intel->gen >= 5) { /* gen5+ default-color layout */
+	 struct gen5_sampler_default_color *sdc = (intel->batch.bo->virtual +
+						   brw->wm.sdc_offset[i]);
+	 state_out(name, sdc, sdc_offset, 0, "unorm rgba\n");
+	 state_out(name, sdc, sdc_offset, 1, "r %f\n", sdc->f[0]);
+	 state_out(name, sdc, sdc_offset, 2, "g %f\n", sdc->f[1]);
+	 state_out(name, sdc, sdc_offset, 3, "b %f\n", sdc->f[2]);
+	 state_out(name, sdc, sdc_offset, 4, "a %f\n", sdc->f[3]);
+	 state_out(name, sdc, sdc_offset, 5, "half float rg\n");
+	 state_out(name, sdc, sdc_offset, 6, "half float ba\n");
+	 state_out(name, sdc, sdc_offset, 7, "u16 rg\n");
+	 state_out(name, sdc, sdc_offset, 8, "u16 ba\n");
+	 state_out(name, sdc, sdc_offset, 9, "s16 rg\n");
+	 state_out(name, sdc, sdc_offset, 10, "s16 ba\n");
+	 state_out(name, sdc, sdc_offset, 11, "s8 rgba\n");
+      } else { /* pre-gen5 layout: four floats only */
+	 struct brw_sampler_default_color *sdc = (intel->batch.bo->virtual +
+						  brw->wm.sdc_offset[i]);
+	 state_out(name, sdc, sdc_offset, 0, "r %f\n", sdc->color[0]);
+	 state_out(name, sdc, sdc_offset, 1, "g %f\n", sdc->color[1]);
+	 state_out(name, sdc, sdc_offset, 2, "b %f\n", sdc->color[2]);
+	 state_out(name, sdc, sdc_offset, 3, "a %f\n", sdc->color[3]);
+      }
    }
-   drm_intel_bo_unmap(brw->wm.sampler_bo);
+   drm_intel_bo_unmap(intel->batch.bo);
 }
 
 static void dump_sf_viewport_state(struct brw_context *brw)
 {
+   struct intel_context *intel = &brw->intel;
    const char *name = "SF VP";
    struct brw_sf_viewport *vp;
    uint32_t vp_off;
 
-   if (brw->sf.vp_bo == NULL)
-      return;
-
-   drm_intel_bo_map(brw->sf.vp_bo, GL_FALSE);
+   drm_intel_bo_map(intel->batch.bo, GL_FALSE);
 
-   vp = brw->sf.vp_bo->virtual + brw->sf.vp_offset;
-   vp_off = brw->sf.vp_bo->offset + brw->sf.vp_offset;
+   vp = intel->batch.bo->virtual + brw->sf.vp_offset;
+   vp_off = intel->batch.bo->offset + brw->sf.vp_offset;
 
    state_out(name, vp, vp_off, 0, "m00 = %f\n", vp->viewport.m00);
    state_out(name, vp, vp_off, 1, "m11 = %f\n", vp->viewport.m11);
@@ -210,62 +223,56 @@ static void dump_sf_viewport_state(struct brw_context *brw)
    state_out(name, vp, vp_off, 7, "bottom right = %d,%d\n",
 	     vp->scissor.xmax, vp->scissor.ymax);
 
-   drm_intel_bo_unmap(brw->sf.vp_bo);
+   drm_intel_bo_unmap(intel->batch.bo);
 }
 
 static void dump_clip_viewport_state(struct brw_context *brw)
 {
+   struct intel_context *intel = &brw->intel; /* dump reads the CLIP VP from the batch BO */
    const char *name = "CLIP VP";
    struct brw_clipper_viewport *vp;
    uint32_t vp_off;
 
-   if (brw->clip.vp_bo == NULL)
-      return;
-
-   drm_intel_bo_map(brw->clip.vp_bo, GL_FALSE);
+   drm_intel_bo_map(intel->batch.bo, GL_FALSE); /* unmapped again at the end */
 
-   vp = brw->clip.vp_bo->virtual;
-   vp_off = brw->clip.vp_bo->offset;
+   vp = intel->batch.bo->virtual + brw->clip.vp_offset; /* mapped address of the struct */
+   vp_off = intel->batch.bo->offset + brw->clip.vp_offset; /* BO offset + state offset, passed to state_out() */
 
    state_out(name, vp, vp_off, 0, "xmin = %f\n", vp->xmin);
    state_out(name, vp, vp_off, 1, "xmax = %f\n", vp->xmax);
    state_out(name, vp, vp_off, 2, "ymin = %f\n", vp->ymin);
    state_out(name, vp, vp_off, 3, "ymax = %f\n", vp->ymax);
-   drm_intel_bo_unmap(brw->clip.vp_bo);
+   drm_intel_bo_unmap(intel->batch.bo); /* pairs with the map above */
 }
 
 static void dump_cc_viewport_state(struct brw_context *brw)
 {
+   struct intel_context *intel = &brw->intel; /* dump reads the CC VP from the batch BO */
    const char *name = "CC VP";
    struct brw_cc_viewport *vp;
    uint32_t vp_off;
 
-   if (brw->cc.vp_bo == NULL)
-      return;
-
-   drm_intel_bo_map(brw->cc.vp_bo, GL_FALSE);
+   drm_intel_bo_map(intel->batch.bo, GL_FALSE); /* unmapped again at the end */
 
-   vp = brw->cc.vp_bo->virtual;
-   vp_off = brw->cc.vp_bo->offset;
+   vp = intel->batch.bo->virtual + brw->cc.vp_offset; /* mapped address of the struct */
+   vp_off = intel->batch.bo->offset + brw->cc.vp_offset; /* BO offset + state offset, passed to state_out() */
 
    state_out(name, vp, vp_off, 0, "min_depth = %f\n", vp->min_depth);
    state_out(name, vp, vp_off, 1, "max_depth = %f\n", vp->max_depth);
-   drm_intel_bo_unmap(brw->cc.vp_bo);
+   drm_intel_bo_unmap(intel->batch.bo); /* pairs with the map above */
 }
 
 static void dump_depth_stencil_state(struct brw_context *brw)
 {
+   struct intel_context *intel = &brw->intel;
    const char *name = "DEPTH STENCIL";
    struct gen6_depth_stencil_state *ds;
    uint32_t ds_off;
 
-   if (brw->cc.depth_stencil_state_bo == NULL)
-	return;
+   drm_intel_bo_map(intel->batch.bo, GL_FALSE);
 
-   drm_intel_bo_map(brw->cc.depth_stencil_state_bo, GL_FALSE);
-
-   ds = brw->cc.depth_stencil_state_bo->virtual;
-   ds_off = brw->cc.depth_stencil_state_bo->offset;
+   ds = intel->batch.bo->virtual + brw->cc.depth_stencil_state_offset;
+   ds_off = intel->batch.bo->offset + brw->cc.depth_stencil_state_offset;
 
    state_out(name, ds, ds_off, 0, "stencil %sable, func %d, write %sable\n",
 		ds->ds0.stencil_enable ? "en" : "dis",
@@ -277,7 +284,7 @@ static void dump_depth_stencil_state(struct brw_context *brw)
 		ds->ds2.depth_test_enable ? "en" : "dis",
 		ds->ds2.depth_test_func,
 		ds->ds2.depth_write_enable ? "en" : "dis");
-   drm_intel_bo_unmap(brw->cc.depth_stencil_state_bo); 
+   drm_intel_bo_unmap(intel->batch.bo);
 }
 
 static void dump_cc_state(struct brw_context *brw)
@@ -291,8 +298,8 @@ static void dump_cc_state(struct brw_context *brw)
 	return;
 
    drm_intel_bo_map(bo, GL_FALSE);
-   cc = bo->virtual;
-   cc_off = bo->offset;
+   cc = bo->virtual + brw->cc.state_offset;
+   cc_off = bo->offset + brw->cc.state_offset;
 
    state_out(name, cc, cc_off, 0, "alpha test format %s, round disable %d, stencil ref %d,"
 		"bf stencil ref %d\n",
@@ -312,22 +319,20 @@ static void dump_cc_state(struct brw_context *brw)
 
 static void dump_blend_state(struct brw_context *brw)
 {
+   struct intel_context *intel = &brw->intel; /* dump reads the blend state from the batch BO */
    const char *name = "BLEND";
    struct gen6_blend_state *blend;
    uint32_t blend_off;
 
-   if (brw->cc.blend_state_bo == NULL)
-	return;
-
-   drm_intel_bo_map(brw->cc.blend_state_bo, GL_FALSE);
+   drm_intel_bo_map(intel->batch.bo, GL_FALSE); /* unmapped again at the end */
 
-   blend = brw->cc.blend_state_bo->virtual;
-   blend_off = brw->cc.blend_state_bo->offset;
+   blend = intel->batch.bo->virtual + brw->cc.blend_state_offset; /* mapped address of the struct */
+   blend_off = intel->batch.bo->offset + brw->cc.blend_state_offset; /* BO offset + state offset, passed to state_out() */
 
    state_out(name, blend, blend_off, 0, "\n");
    state_out(name, blend, blend_off, 1, "\n");
 
-   drm_intel_bo_unmap(brw->cc.blend_state_bo);
+   drm_intel_bo_unmap(intel->batch.bo); /* pairs with the map above */
 
 }
 
@@ -383,21 +388,25 @@ void brw_debug_batch(struct intel_context *intel)
    dump_wm_sampler_state(brw);
 
    if (intel->gen < 6)
-       state_struct_out("VS", brw->vs.state_bo, 0, sizeof(struct brw_vs_unit_state));
+       state_struct_out("VS", intel->batch.bo, brw->vs.state_offset,
+			sizeof(struct brw_vs_unit_state));
    brw_debug_prog("VS prog", brw->vs.prog_bo);
 
    if (intel->gen < 6)
-       state_struct_out("GS", brw->gs.state_bo, 0, sizeof(struct brw_gs_unit_state));
+       state_struct_out("GS", intel->batch.bo, brw->gs.state_offset,
+			sizeof(struct brw_gs_unit_state));
    brw_debug_prog("GS prog", brw->gs.prog_bo);
 
    if (intel->gen < 6) {
-       state_struct_out("SF", brw->sf.state_bo, 0, sizeof(struct brw_sf_unit_state));
-       brw_debug_prog("SF prog", brw->sf.prog_bo);
+      state_struct_out("SF", intel->batch.bo, brw->sf.state_offset,
+		       sizeof(struct brw_sf_unit_state));
+      brw_debug_prog("SF prog", brw->sf.prog_bo);
    }
    dump_sf_viewport_state(brw);
 
    if (intel->gen < 6)
-       state_struct_out("WM", brw->wm.state_bo, 0, sizeof(struct brw_wm_unit_state));
+       state_struct_out("WM", intel->batch.bo, brw->wm.state_offset,
+			sizeof(struct brw_wm_unit_state));
    brw_debug_prog("WM prog", brw->wm.prog_bo);
 
    if (intel->gen >= 6) {
diff --git a/src/mesa/drivers/dri/i965/brw_state_upload.c b/src/mesa/drivers/dri/i965/brw_state_upload.c
index 6f521be6599..008aceb222b 100644
--- a/src/mesa/drivers/dri/i965/brw_state_upload.c
+++ b/src/mesa/drivers/dri/i965/brw_state_upload.c
@@ -60,6 +60,7 @@ static const struct brw_tracked_state *gen4_atoms[] =
    &brw_curbe_offsets,
    &brw_recalculate_urb_fence,
 
+   &brw_cc_vp,
    &brw_cc_unit,
 
    &brw_vs_constants, /* Before vs_surfaces and constant_buffer */
@@ -119,6 +120,10 @@ static const struct brw_tracked_state *gen6_atoms[] =
    /* Command packets: */
    &brw_invarient_state,
 
+   /* must do before binding table pointers, cc state ptrs */
+   &brw_state_base_address,
+
+   &brw_cc_vp,
    &gen6_viewport_state,	/* must do after *_vp stages */
 
    &gen6_urb,
@@ -129,6 +134,7 @@ static const struct brw_tracked_state *gen6_atoms[] =
 
    &brw_vs_constants, /* Before vs_surfaces and constant_buffer */
    &brw_wm_constants, /* Before wm_surfaces and constant_buffer */
+   &gen6_vs_constants, /* Before vs_state */
    &gen6_wm_constants, /* Before wm_state */
 
    &brw_vs_surfaces,		/* must do before unit */
@@ -146,9 +152,6 @@ static const struct brw_tracked_state *gen6_atoms[] =
    &gen6_wm_state,
 
    &gen6_scissor_state,
-   &gen6_scissor_state_pointers,
-
-   &brw_state_base_address,
 
    &gen6_binding_table_pointers,
 
@@ -314,7 +317,6 @@ static struct dirty_bit_map cache_bits[] = {
    DEFINE_BIT(CACHE_NEW_CC_VP),
    DEFINE_BIT(CACHE_NEW_CC_UNIT),
    DEFINE_BIT(CACHE_NEW_WM_PROG),
-   DEFINE_BIT(CACHE_NEW_SAMPLER_DEFAULT_COLOR),
    DEFINE_BIT(CACHE_NEW_SAMPLER),
    DEFINE_BIT(CACHE_NEW_WM_UNIT),
    DEFINE_BIT(CACHE_NEW_SF_PROG),
diff --git a/src/mesa/drivers/dri/i965/brw_vs.c b/src/mesa/drivers/dri/i965/brw_vs.c
index 63ae13191f9..31a2b518c40 100644
--- a/src/mesa/drivers/dri/i965/brw_vs.c
+++ b/src/mesa/drivers/dri/i965/brw_vs.c
@@ -103,13 +103,11 @@ static void do_vs_prog( struct brw_context *brw,
    aux_size += c.vp->program.Base.Parameters->NumParameters;
 
    drm_intel_bo_unreference(brw->vs.prog_bo);
-   brw->vs.prog_bo = brw_upload_cache_with_auxdata(&brw->cache, BRW_VS_PROG,
-						   &c.key, sizeof(c.key),
-						   NULL, 0,
-						   program, program_size,
-						   &c.prog_data,
-						   aux_size,
-						   &brw->vs.prog_data);
+   brw->vs.prog_bo = brw_upload_cache(&brw->cache, BRW_VS_PROG,
+				      &c.key, sizeof(c.key),
+				      program, program_size,
+				      &c.prog_data, aux_size,
+				      &brw->vs.prog_data);
 }
 
 
@@ -148,7 +146,6 @@ static void brw_upload_vs_prog(struct brw_context *brw)
    drm_intel_bo_unreference(brw->vs.prog_bo);
    brw->vs.prog_bo = brw_search_cache(&brw->cache, BRW_VS_PROG,
 				      &key, sizeof(key),
-				      NULL, 0,
 				      &brw->vs.prog_data);
    if (brw->vs.prog_bo == NULL)
       do_vs_prog(brw, vp, &key);
diff --git a/src/mesa/drivers/dri/i965/brw_vs_emit.c b/src/mesa/drivers/dri/i965/brw_vs_emit.c
index dd4e1e6c6ad..a28cdc0bfe9 100644
--- a/src/mesa/drivers/dri/i965/brw_vs_emit.c
+++ b/src/mesa/drivers/dri/i965/brw_vs_emit.c
@@ -1553,6 +1553,26 @@ static void emit_swz( struct brw_vs_compile *c,
    }
 }
 
+/* Pad a URB write message length as gen6+ interleaved writes require;
+ * earlier generations get mlen back unchanged. */
+static int
+align_interleaved_urb_mlen(struct brw_context *brw, int mlen)
+{
+   if (brw->intel.gen < 6)
+      return mlen;
+
+   /* The URB payload (everything after the message header register)
+    * has to be a multiple of 256 bits, i.e. 2 VS registers.  See
+    * vol5c.5, section 5.4.3.2.2: URB_INTERLEAVED.
+    *
+    * Padding by an extra 128 bits is harmless because URB entries
+    * are allocated in multiples of 1024 bits.
+    */
+   if ((mlen % 2) != 1)
+      mlen++;
+
+   return mlen;
+}
 
 /**
  * Post-vertex-program processing.  Send the results to the URB.
@@ -1734,12 +1754,11 @@ static void emit_vertex_write( struct brw_vs_compile *c)
 
    eot = (c->first_overflow_output == 0);
 
-   msg_len = c->nr_outputs + 2 + len_vertex_header; 
-   if (intel->gen >= 6) {
-	   /* interleaved urb write message length for gen6 should be multiple of 2 */
-	   if ((msg_len % 2) != 0)
-		msg_len++;
-   }
+   /* Message header, plus VUE header, plus the (first set of) outputs. */
+   msg_len = 1 + len_vertex_header + c->nr_outputs;
+   msg_len = align_interleaved_urb_mlen(brw, msg_len);
+   /* Any outputs beyond BRW_MAX_MRF should be past first_overflow_output */
+   msg_len = MIN2(msg_len, (BRW_MAX_MRF - 1));
 
    brw_urb_WRITE(p, 
 		 brw_null_reg(), /* dest */
@@ -1747,7 +1766,7 @@ static void emit_vertex_write( struct brw_vs_compile *c)
 		 c->r0,		/* src */
 		 0,		/* allocate */
 		 1,		/* used */
-		 MIN2(msg_len - 1, (BRW_MAX_MRF - 1)), /* msg len */
+		 msg_len,
 		 0,		/* response len */
 		 eot, 		/* eot */
 		 eot, 		/* writes complete */
@@ -1774,7 +1793,7 @@ static void emit_vertex_write( struct brw_vs_compile *c)
                     c->r0,          /* src */
                     0,              /* allocate */
                     1,              /* used */
-                    mrf,            /* msg len */
+                    align_interleaved_urb_mlen(brw, mrf),
                     0,              /* response len */
                     1,              /* eot */
                     1,              /* writes complete */
diff --git a/src/mesa/drivers/dri/i965/brw_vs_state.c b/src/mesa/drivers/dri/i965/brw_vs_state.c
index c3a7cc247c5..1eee5b7e5de 100644
--- a/src/mesa/drivers/dri/i965/brw_vs_state.c
+++ b/src/mesa/drivers/dri/i965/brw_vs_state.c
@@ -49,48 +49,19 @@ struct brw_vs_unit_key {
 };
 
 static void
-vs_unit_populate_key(struct brw_context *brw, struct brw_vs_unit_key *key)
-{
-   struct gl_context *ctx = &brw->intel.ctx;
-
-   memset(key, 0, sizeof(*key));
-
-   /* CACHE_NEW_VS_PROG */
-   key->total_grf = brw->vs.prog_data->total_grf;
-   key->urb_entry_read_length = brw->vs.prog_data->urb_read_length;
-   key->curb_entry_read_length = brw->vs.prog_data->curb_read_length;
-
-   /* BRW_NEW_URB_FENCE */
-   key->nr_urb_entries = brw->urb.nr_vs_entries;
-   key->urb_size = brw->urb.vsize;
-
-   /* BRW_NEW_NR_VS_SURFACES */
-   key->nr_surfaces = brw->vs.nr_surfaces;
-
-   /* BRW_NEW_CURBE_OFFSETS, _NEW_TRANSFORM */
-   if (ctx->Transform.ClipPlanesEnabled) {
-      /* Note that we read in the userclip planes as well, hence
-       * clip_start:
-       */
-      key->curbe_offset = brw->curbe.clip_start;
-   }
-   else {
-      key->curbe_offset = brw->curbe.vs_start;
-   }
-}
-
-static drm_intel_bo *
-vs_unit_create_from_key(struct brw_context *brw, struct brw_vs_unit_key *key)
+brw_prepare_vs_unit(struct brw_context *brw)
 {
    struct intel_context *intel = &brw->intel;
-   struct brw_vs_unit_state vs;
-   drm_intel_bo *bo;
+   struct gl_context *ctx = &intel->ctx;
+   struct brw_vs_unit_state *vs;
 
-   memset(&vs, 0, sizeof(vs));
+   vs = brw_state_batch(brw, sizeof(*vs), 32, &brw->vs.state_offset);
+   memset(vs, 0, sizeof(*vs));
 
-   vs.thread0.kernel_start_pointer = brw->vs.prog_bo->offset >> 6; /* reloc */
-   vs.thread0.grf_reg_count = ALIGN(key->total_grf, 16) / 16 - 1;
-   vs.thread1.floating_point_mode = BRW_FLOATING_POINT_NON_IEEE_754;
+   /* CACHE_NEW_VS_PROG */
+   vs->thread0.kernel_start_pointer = brw->vs.prog_bo->offset >> 6; /* reloc */
+   vs->thread0.grf_reg_count = ALIGN(brw->vs.prog_data->total_grf, 16) / 16 - 1;
+   vs->thread1.floating_point_mode = BRW_FLOATING_POINT_NON_IEEE_754;
    /* Choosing multiple program flow means that we may get 2-vertex threads,
     * which will have the channel mask for dwords 4-7 enabled in the thread,
     * and those dwords will be written to the second URB handle when we
@@ -103,21 +74,34 @@ vs_unit_create_from_key(struct brw_context *brw, struct brw_vs_unit_key *key)
     * The most notable and reliably failing application is the Humus
     * demo "CelShading"
    */
-   vs.thread1.single_program_flow = (intel->gen == 5);
+   vs->thread1.single_program_flow = (intel->gen == 5);
 
+   /* BRW_NEW_NR_VS_SURFACES */
    if (intel->gen == 5)
-      vs.thread1.binding_table_entry_count = 0; /* hardware requirement */
+      vs->thread1.binding_table_entry_count = 0; /* hardware requirement */
    else
-      vs.thread1.binding_table_entry_count = key->nr_surfaces;
+      vs->thread1.binding_table_entry_count = brw->vs.nr_surfaces;
+
+   vs->thread3.urb_entry_read_length = brw->vs.prog_data->urb_read_length;
+   vs->thread3.const_urb_entry_read_length = brw->vs.prog_data->curb_read_length;
+   vs->thread3.dispatch_grf_start_reg = 1;
+   vs->thread3.urb_entry_read_offset = 0;
 
-   vs.thread3.urb_entry_read_length = key->urb_entry_read_length;
-   vs.thread3.const_urb_entry_read_length = key->curb_entry_read_length;
-   vs.thread3.dispatch_grf_start_reg = 1;
-   vs.thread3.urb_entry_read_offset = 0;
-   vs.thread3.const_urb_entry_read_offset = key->curbe_offset * 2;
+   /* BRW_NEW_CURBE_OFFSETS, _NEW_TRANSFORM */
+   if (ctx->Transform.ClipPlanesEnabled) {
+      /* Note that we read in the userclip planes as well, hence
+       * clip_start:
+       */
+      vs->thread3.const_urb_entry_read_offset = brw->curbe.clip_start * 2;
+   }
+   else {
+      vs->thread3.const_urb_entry_read_offset = brw->curbe.vs_start * 2;
+   }
 
+
+   /* BRW_NEW_URB_FENCE */
    if (intel->gen == 5) {
-      switch (key->nr_urb_entries) {
+      switch (brw->urb.nr_vs_entries) {
       case 8:
       case 12:
       case 16:
@@ -129,13 +113,13 @@ vs_unit_create_from_key(struct brw_context *brw, struct brw_vs_unit_key *key)
       case 192:
       case 224:
       case 256:
-	 vs.thread4.nr_urb_entries = key->nr_urb_entries >> 2;
+	 vs->thread4.nr_urb_entries = brw->urb.nr_vs_entries >> 2;
 	 break;
       default:
 	 assert(0);
       }
    } else {
-      switch (key->nr_urb_entries) {
+      switch (brw->urb.nr_vs_entries) {
       case 8:
       case 12:
       case 16:
@@ -147,63 +131,45 @@ vs_unit_create_from_key(struct brw_context *brw, struct brw_vs_unit_key *key)
       default:
 	 assert(0);
       }
-      vs.thread4.nr_urb_entries = key->nr_urb_entries;
+      vs->thread4.nr_urb_entries = brw->urb.nr_vs_entries;
    }
 
-   vs.thread4.urb_entry_allocation_size = key->urb_size - 1;
+   vs->thread4.urb_entry_allocation_size = brw->urb.vsize - 1;
 
-   vs.thread4.max_threads = CLAMP(key->nr_urb_entries / 2,
-				  1, brw->vs_max_threads) - 1;
+   vs->thread4.max_threads = CLAMP(brw->urb.nr_vs_entries / 2,
+				   1, brw->vs_max_threads) - 1;
 
    /* No samplers for ARB_vp programs:
     */
    /* It has to be set to 0 for Ironlake
     */
-   vs.vs5.sampler_count = 0;
+   vs->vs5.sampler_count = 0;
 
    if (unlikely(INTEL_DEBUG & DEBUG_STATS))
-      vs.thread4.stats_enable = 1;
+      vs->thread4.stats_enable = 1;
 
    /* Vertex program always enabled:
     */
-   vs.vs6.vs_enable = 1;
-
-   bo = brw_upload_cache(&brw->cache, BRW_VS_UNIT,
-			 key, sizeof(*key),
-			 &brw->vs.prog_bo, 1,
-			 &vs, sizeof(vs));
+   vs->vs6.vs_enable = 1;
 
    /* Emit VS program relocation */
-   drm_intel_bo_emit_reloc(bo, offsetof(struct brw_vs_unit_state, thread0),
-			   brw->vs.prog_bo, vs.thread0.grf_reg_count << 1,
+   drm_intel_bo_emit_reloc(intel->batch.bo, (brw->vs.state_offset +
+					     offsetof(struct brw_vs_unit_state,
+						      thread0)),
+			   brw->vs.prog_bo, vs->thread0.grf_reg_count << 1,
 			   I915_GEM_DOMAIN_INSTRUCTION, 0);
 
-   return bo;
-}
-
-static void prepare_vs_unit(struct brw_context *brw)
-{
-   struct brw_vs_unit_key key;
-
-   vs_unit_populate_key(brw, &key);
-
-   drm_intel_bo_unreference(brw->vs.state_bo);
-   brw->vs.state_bo = brw_search_cache(&brw->cache, BRW_VS_UNIT,
-				       &key, sizeof(key),
-				       &brw->vs.prog_bo, 1,
-				       NULL);
-   if (brw->vs.state_bo == NULL) {
-      brw->vs.state_bo = vs_unit_create_from_key(brw, &key);
-   }
+   brw->state.dirty.cache |= CACHE_NEW_VS_UNIT;
 }
 
 const struct brw_tracked_state brw_vs_unit = {
    .dirty = {
       .mesa  = _NEW_TRANSFORM,
-      .brw   = (BRW_NEW_CURBE_OFFSETS |
+      .brw   = (BRW_NEW_BATCH |
+		BRW_NEW_CURBE_OFFSETS |
                 BRW_NEW_NR_VS_SURFACES |
 		BRW_NEW_URB_FENCE),
       .cache = CACHE_NEW_VS_PROG
    },
-   .prepare = prepare_vs_unit,
+   .prepare = brw_prepare_vs_unit,
 };
diff --git a/src/mesa/drivers/dri/i965/brw_vtbl.c b/src/mesa/drivers/dri/i965/brw_vtbl.c
index ce8712a260f..f2c417d8a81 100644
--- a/src/mesa/drivers/dri/i965/brw_vtbl.c
+++ b/src/mesa/drivers/dri/i965/brw_vtbl.c
@@ -60,7 +60,6 @@ dri_bo_release(drm_intel_bo **bo)
 static void brw_destroy_context( struct intel_context *intel )
 {
    struct brw_context *brw = brw_context(&intel->ctx);
-   int i;
 
    brw_destroy_state(brw);
    brw_draw_destroy( brw );
@@ -77,28 +76,13 @@ static void brw_destroy_context( struct intel_context *intel )
 
    dri_bo_release(&brw->curbe.curbe_bo);
    dri_bo_release(&brw->vs.prog_bo);
-   dri_bo_release(&brw->vs.state_bo);
    dri_bo_release(&brw->vs.const_bo);
    dri_bo_release(&brw->gs.prog_bo);
-   dri_bo_release(&brw->gs.state_bo);
    dri_bo_release(&brw->clip.prog_bo);
-   dri_bo_release(&brw->clip.state_bo);
-   dri_bo_release(&brw->clip.vp_bo);
    dri_bo_release(&brw->sf.prog_bo);
-   dri_bo_release(&brw->sf.state_bo);
-   dri_bo_release(&brw->sf.vp_bo);
-   for (i = 0; i < BRW_MAX_TEX_UNIT; i++)
-      dri_bo_release(&brw->wm.sdc_bo[i]);
-   dri_bo_release(&brw->wm.sampler_bo);
    dri_bo_release(&brw->wm.prog_bo);
-   dri_bo_release(&brw->wm.state_bo);
    dri_bo_release(&brw->wm.const_bo);
-   dri_bo_release(&brw->wm.push_const_bo);
    dri_bo_release(&brw->cc.prog_bo);
-   dri_bo_release(&brw->cc.vp_bo);
-   dri_bo_release(&brw->cc.blend_state_bo);
-   dri_bo_release(&brw->cc.depth_stencil_state_bo);
-   dri_bo_release(&brw->cc.color_calc_state_bo);
 
    free(brw->curbe.last_buf);
    free(brw->curbe.next_buf);
diff --git a/src/mesa/drivers/dri/i965/brw_wm.c b/src/mesa/drivers/dri/i965/brw_wm.c
index 65af227d831..06512de940f 100644
--- a/src/mesa/drivers/dri/i965/brw_wm.c
+++ b/src/mesa/drivers/dri/i965/brw_wm.c
@@ -33,6 +33,7 @@
 #include "brw_wm.h"
 #include "brw_state.h"
 #include "main/formats.h"
+#include "main/samplerobj.h"
 
 /** Return number of src args for given instruction */
 GLuint brw_wm_nr_args( GLuint opcode )
@@ -119,7 +120,7 @@ brw_wm_non_glsl_emit(struct brw_context *brw, struct brw_wm_compile *c)
    brw_wm_emit(c);
 }
 
-static void
+void
 brw_wm_payload_setup(struct brw_context *brw,
 		     struct brw_wm_compile *c)
 {
@@ -224,18 +225,13 @@ static void do_wm_prog( struct brw_context *brw,
 
    brw_init_compile(brw, &c->func);
 
-   brw_wm_payload_setup(brw, c);
-
    if (!brw_wm_fs_emit(brw, c)) {
-      /*
-       * Shader which use GLSL features such as flow control are handled
-       * differently from "simple" shaders.
-       */
+      /* Fallback for fixed function and ARB_fp shaders. */
       c->dispatch_width = 16;
       brw_wm_payload_setup(brw, c);
       brw_wm_non_glsl_emit(brw, c);
+      c->prog_data.dispatch_width = 16;
    }
-   c->prog_data.dispatch_width = c->dispatch_width;
 
    /* Scratch space is used for register spilling */
    if (c->last_scratch) {
@@ -272,13 +268,11 @@ static void do_wm_prog( struct brw_context *brw,
    program = brw_get_program(&c->func, &program_size);
 
    drm_intel_bo_unreference(brw->wm.prog_bo);
-   brw->wm.prog_bo = brw_upload_cache_with_auxdata(&brw->cache, BRW_WM_PROG,
-						   &c->key, sizeof(c->key),
-						   NULL, 0,
-						   program, program_size,
-						   &c->prog_data,
-						   sizeof(c->prog_data),
-						   &brw->wm.prog_data);
+   brw->wm.prog_bo = brw_upload_cache(&brw->cache, BRW_WM_PROG,
+				      &c->key, sizeof(c->key),
+				      program, program_size,
+				      &c->prog_data, sizeof(c->prog_data),
+				      &brw->wm.prog_data);
 }
 
 
@@ -373,6 +367,7 @@ static void brw_wm_populate_key( struct brw_context *brw,
       if (unit->_ReallyEnabled) {
          const struct gl_texture_object *t = unit->_Current;
          const struct gl_texture_image *img = t->Image[0][t->BaseLevel];
+	 struct gl_sampler_object *sampler = _mesa_get_samplerobj(ctx, i);
 	 int swizzles[SWIZZLE_NIL + 1] = {
 	    SWIZZLE_X,
 	    SWIZZLE_Y,
@@ -388,14 +383,14 @@ static void brw_wm_populate_key( struct brw_context *brw,
 	  * well and our shadow compares always return the result in
 	  * all 4 channels.
 	  */
-	 if (t->Sampler.CompareMode == GL_COMPARE_R_TO_TEXTURE_ARB) {
-	    if (t->Sampler.DepthMode == GL_ALPHA) {
+	 if (sampler->CompareMode == GL_COMPARE_R_TO_TEXTURE_ARB) {
+	    if (sampler->DepthMode == GL_ALPHA) {
 	       swizzles[0] = SWIZZLE_ZERO;
 	       swizzles[1] = SWIZZLE_ZERO;
 	       swizzles[2] = SWIZZLE_ZERO;
-	    } else if (t->Sampler.DepthMode == GL_LUMINANCE) {
+	    } else if (sampler->DepthMode == GL_LUMINANCE) {
 	       swizzles[3] = SWIZZLE_ONE;
-	    } else if (t->Sampler.DepthMode == GL_RED) {
+	    } else if (sampler->DepthMode == GL_RED) {
 	       /* See table 3.23 of the GL 3.0 spec. */
 	       swizzles[1] = SWIZZLE_ZERO;
 	       swizzles[2] = SWIZZLE_ZERO;
@@ -465,7 +460,7 @@ static void brw_prepare_wm_prog(struct brw_context *brw)
    struct brw_wm_prog_key key;
    struct brw_fragment_program *fp = (struct brw_fragment_program *)
       brw->fragment_program;
-     
+
    brw_wm_populate_key(brw, &key);
 
    /* Make an early check for the key.
@@ -473,7 +468,6 @@ static void brw_prepare_wm_prog(struct brw_context *brw)
    drm_intel_bo_unreference(brw->wm.prog_bo);
    brw->wm.prog_bo = brw_search_cache(&brw->cache, BRW_WM_PROG,
 				      &key, sizeof(key),
-				      NULL, 0,
 				      &brw->wm.prog_data);
    if (brw->wm.prog_bo == NULL)
       do_wm_prog(brw, fp, &key);
diff --git a/src/mesa/drivers/dri/i965/brw_wm.h b/src/mesa/drivers/dri/i965/brw_wm.h
index 40659f26025..a5f99a0a657 100644
--- a/src/mesa/drivers/dri/i965/brw_wm.h
+++ b/src/mesa/drivers/dri/i965/brw_wm.h
@@ -201,11 +201,11 @@ struct brw_wm_compile {
       PASS2_DONE
    } state;
 
-   GLuint source_depth_reg:3;
-   GLuint source_w_reg:3;
-   GLuint aa_dest_stencil_reg:3;
-   GLuint dest_depth_reg:3;
-   GLuint nr_payload_regs:4;
+   uint8_t source_depth_reg;
+   uint8_t source_w_reg;
+   uint8_t aa_dest_stencil_reg;
+   uint8_t dest_depth_reg;
+   uint8_t nr_payload_regs;
    GLuint computes_depth:1;	/* could be derived from program string */
    GLuint source_depth_to_render_target:1;
    GLuint runtime_check_aads_emit:1;
@@ -218,7 +218,6 @@ struct brw_wm_compile {
    GLuint nr_fp_insns;
    GLuint fp_temp;
    GLuint fp_interp_emitted;
-   GLuint fp_fragcolor_emitted;
 
    struct prog_src_register pixel_xy;
    struct prog_src_register delta_xy;
@@ -315,7 +314,7 @@ void brw_wm_print_program( struct brw_wm_compile *c,
 void brw_wm_lookup_iz(struct intel_context *intel,
 		      struct brw_wm_compile *c);
 
-GLboolean brw_wm_fs_emit(struct brw_context *brw, struct brw_wm_compile *c);
+bool brw_wm_fs_emit(struct brw_context *brw, struct brw_wm_compile *c);
 
 /* brw_wm_emit.c */
 void emit_alu1(struct brw_compile *p,
@@ -475,5 +474,7 @@ struct gl_shader_program *brw_new_shader_program(struct gl_context *ctx, GLuint
 
 bool brw_color_buffer_write_enabled(struct brw_context *brw);
 bool brw_render_target_supported(gl_format format);
+void brw_wm_payload_setup(struct brw_context *brw,
+			  struct brw_wm_compile *c);
 
 #endif
diff --git a/src/mesa/drivers/dri/i965/brw_wm_emit.c b/src/mesa/drivers/dri/i965/brw_wm_emit.c
index cdc1f367e5c..fd4cd892f41 100644
--- a/src/mesa/drivers/dri/i965/brw_wm_emit.c
+++ b/src/mesa/drivers/dri/i965/brw_wm_emit.c
@@ -51,16 +51,6 @@ static GLboolean can_do_pln(struct intel_context *intel,
    return GL_TRUE;
 }
 
-/* Not quite sure how correct this is - need to understand horiz
- * vs. vertical strides a little better.
- */
-static INLINE struct brw_reg sechalf( struct brw_reg reg )
-{
-   if (reg.vstride)
-      reg.nr++;
-   return reg;
-}
-
 /* Return the SrcReg index of the channels that can be immediate float operands
  * instead of usage of PROGRAM_CONSTANT values through push/pull.
  */
@@ -1325,12 +1315,6 @@ static void fire_fb_write( struct brw_wm_compile *c,
 {
    struct brw_compile *p = &c->func;
    struct intel_context *intel = &p->brw->intel;
-   struct brw_reg dst;
-
-   if (c->dispatch_width == 16)
-      dst = retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW);
-   else
-      dst = retype(vec8(brw_null_reg()), BRW_REGISTER_TYPE_UW);
 
    /* Pass through control information:
     * 
@@ -1352,7 +1336,6 @@ static void fire_fb_write( struct brw_wm_compile *c,
 /*  send (16) null.0<1>:uw m0               r0.0<8;8,1>:uw   0x85a04000:ud    { Align1 EOT } */
    brw_fb_WRITE(p,
 		c->dispatch_width,
-		dst,
 		base_reg,
 		retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW),
 		target,		
diff --git a/src/mesa/drivers/dri/i965/brw_wm_fp.c b/src/mesa/drivers/dri/i965/brw_wm_fp.c
index 4759b289a0c..9ddbee2edf4 100644
--- a/src/mesa/drivers/dri/i965/brw_wm_fp.c
+++ b/src/mesa/drivers/dri/i965/brw_wm_fp.c
@@ -961,35 +961,31 @@ static void emit_render_target_writes( struct brw_wm_compile *c )
    struct prog_src_register outcolor;
    GLuint i;
 
-   struct prog_instruction *inst, *last_inst = NULL;
+   struct prog_instruction *inst = NULL;
 
    /* The inst->Aux field is used for FB write target and the EOT marker */
 
-   if (c->key.nr_color_regions > 1) {
-      for (i = 0 ; i < c->key.nr_color_regions; i++) {
-         outcolor = src_reg(PROGRAM_OUTPUT, FRAG_RESULT_DATA0 + i);
-         last_inst = inst = emit_op(c, WM_FB_WRITE, dst_mask(dst_undef(), 0),
-                                    0, outcolor, payload_r0_depth, outdepth);
-         inst->Aux = INST_AUX_TARGET(i);
-         if (c->fp_fragcolor_emitted) {
-            outcolor = src_reg(PROGRAM_OUTPUT, FRAG_RESULT_COLOR);
-            last_inst = inst = emit_op(c, WM_FB_WRITE, dst_mask(dst_undef(), 0),
-                                       0, outcolor, payload_r0_depth, outdepth);
-            inst->Aux = INST_AUX_TARGET(i);
-         }
+   for (i = 0; i < c->key.nr_color_regions; i++) {
+      if (c->fp->program.Base.OutputsWritten & (1 << FRAG_RESULT_COLOR)) {
+	 outcolor = src_reg(PROGRAM_OUTPUT, FRAG_RESULT_COLOR);
+      } else {
+	 outcolor = src_reg(PROGRAM_OUTPUT, FRAG_RESULT_DATA0 + i);
       }
-      last_inst->Aux |= INST_AUX_EOT;
+      inst = emit_op(c, WM_FB_WRITE, dst_mask(dst_undef(), 0),
+		     0, outcolor, payload_r0_depth, outdepth);
+      inst->Aux = INST_AUX_TARGET(i);
    }
-   else {
-      /* if gl_FragData[0] is written, use it, else use gl_FragColor */
-      if (c->fp->program.Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DATA0))
-         outcolor = src_reg(PROGRAM_OUTPUT, FRAG_RESULT_DATA0);
-      else 
-         outcolor = src_reg(PROGRAM_OUTPUT, FRAG_RESULT_COLOR);
 
-      inst = emit_op(c, WM_FB_WRITE, dst_mask(dst_undef(),0),
-                     0, outcolor, payload_r0_depth, outdepth);
-      inst->Aux = INST_AUX_EOT | INST_AUX_TARGET(0);
+   /* Mark the last FB write as final, or emit a dummy write if we had
+    * no render targets bound.
+    */
+   if (c->key.nr_color_regions != 0) {
+      inst->Aux |= INST_AUX_EOT;
+   } else {
+      inst = emit_op(c, WM_FB_WRITE, dst_mask(dst_undef(), 0),
+		     0, src_reg(PROGRAM_OUTPUT, FRAG_RESULT_COLOR),
+		     payload_r0_depth, outdepth);
+      inst->Aux = INST_AUX_TARGET(0) | INST_AUX_EOT;
    }
 }
 
@@ -1015,16 +1011,6 @@ static void validate_src_regs( struct brw_wm_compile *c,
       }
    }
 }
-	 
-static void validate_dst_regs( struct brw_wm_compile *c,
-			       const struct prog_instruction *inst )
-{
-   if (inst->DstReg.File == PROGRAM_OUTPUT) {
-      GLuint idx = inst->DstReg.Index;
-      if (idx == FRAG_RESULT_COLOR)
-         c->fp_fragcolor_emitted = 1;
-   }
-}
 
 static void print_insns( const struct prog_instruction *insn,
 			 GLuint nr )
@@ -1083,7 +1069,6 @@ void brw_wm_pass_fp( struct brw_wm_compile *c )
    for (insn = 0; insn < fp->program.Base.NumInstructions; insn++) {
       const struct prog_instruction *inst = &fp->program.Base.Instructions[insn];
       validate_src_regs(c, inst);
-      validate_dst_regs(c, inst);
    }
 
    /* Loop over all instructions doing assorted simplifications and
diff --git a/src/mesa/drivers/dri/i965/brw_wm_sampler_state.c b/src/mesa/drivers/dri/i965/brw_wm_sampler_state.c
index cfc30d8613f..7b93bf90241 100644
--- a/src/mesa/drivers/dri/i965/brw_wm_sampler_state.c
+++ b/src/mesa/drivers/dri/i965/brw_wm_sampler_state.c
@@ -35,7 +35,7 @@
 #include "brw_defines.h"
 
 #include "main/macros.h"
-
+#include "main/samplerobj.h"
 
 
 /* Samplers aren't strictly wm state from the hardware's perspective,
@@ -66,81 +66,93 @@ static GLuint translate_wrap_mode( GLenum wrap )
    }
 }
 
-static drm_intel_bo *upload_default_color( struct brw_context *brw,
-				     const GLfloat *color )
+static void
+upload_default_color(struct brw_context *brw, struct gl_sampler_object *sampler,
+		     int unit)
 {
    struct intel_context *intel = &brw->intel;
+   struct gl_context *ctx = &intel->ctx;
+   struct gl_texture_unit *texUnit = &ctx->Texture.Unit[unit];
+   struct gl_texture_object *texObj = texUnit->_Current;
+   struct gl_texture_image *firstImage = texObj->Image[0][texObj->BaseLevel];
+   float color[4];
+
+   if (firstImage->_BaseFormat == GL_DEPTH_COMPONENT) {
+      /* GL specifies that border color for depth textures is taken from the
+       * R channel, while the hardware uses A.  Spam R into all the
+       * channels for safety.
+       */
+      color[0] = sampler->BorderColor.f[0];
+      color[1] = sampler->BorderColor.f[0];
+      color[2] = sampler->BorderColor.f[0];
+      color[3] = sampler->BorderColor.f[0];
+   } else {
+      color[0] = sampler->BorderColor.f[0];
+      color[1] = sampler->BorderColor.f[1];
+      color[2] = sampler->BorderColor.f[2];
+      color[3] = sampler->BorderColor.f[3];
+   }
 
    if (intel->gen >= 5) {
-      struct gen5_sampler_default_color sdc;
+      struct gen5_sampler_default_color *sdc;
+
+      sdc = brw_state_batch(brw, sizeof(*sdc), 32, &brw->wm.sdc_offset[unit]);
 
-      memset(&sdc, 0, sizeof(sdc));
+      memset(sdc, 0, sizeof(*sdc));
 
-      UNCLAMPED_FLOAT_TO_UBYTE(sdc.ub[0], color[0]);
-      UNCLAMPED_FLOAT_TO_UBYTE(sdc.ub[1], color[1]);
-      UNCLAMPED_FLOAT_TO_UBYTE(sdc.ub[2], color[2]);
-      UNCLAMPED_FLOAT_TO_UBYTE(sdc.ub[3], color[3]);
+      UNCLAMPED_FLOAT_TO_UBYTE(sdc->ub[0], color[0]);
+      UNCLAMPED_FLOAT_TO_UBYTE(sdc->ub[1], color[1]);
+      UNCLAMPED_FLOAT_TO_UBYTE(sdc->ub[2], color[2]);
+      UNCLAMPED_FLOAT_TO_UBYTE(sdc->ub[3], color[3]);
 
-      UNCLAMPED_FLOAT_TO_USHORT(sdc.us[0], color[0]);
-      UNCLAMPED_FLOAT_TO_USHORT(sdc.us[1], color[1]);
-      UNCLAMPED_FLOAT_TO_USHORT(sdc.us[2], color[2]);
-      UNCLAMPED_FLOAT_TO_USHORT(sdc.us[3], color[3]);
+      UNCLAMPED_FLOAT_TO_USHORT(sdc->us[0], color[0]);
+      UNCLAMPED_FLOAT_TO_USHORT(sdc->us[1], color[1]);
+      UNCLAMPED_FLOAT_TO_USHORT(sdc->us[2], color[2]);
+      UNCLAMPED_FLOAT_TO_USHORT(sdc->us[3], color[3]);
 
-      UNCLAMPED_FLOAT_TO_SHORT(sdc.s[0], color[0]);
-      UNCLAMPED_FLOAT_TO_SHORT(sdc.s[1], color[1]);
-      UNCLAMPED_FLOAT_TO_SHORT(sdc.s[2], color[2]);
-      UNCLAMPED_FLOAT_TO_SHORT(sdc.s[3], color[3]);
+      UNCLAMPED_FLOAT_TO_SHORT(sdc->s[0], color[0]);
+      UNCLAMPED_FLOAT_TO_SHORT(sdc->s[1], color[1]);
+      UNCLAMPED_FLOAT_TO_SHORT(sdc->s[2], color[2]);
+      UNCLAMPED_FLOAT_TO_SHORT(sdc->s[3], color[3]);
 
-      /* XXX: Fill in half floats */
-      /* XXX: Fill in signed bytes */
+      sdc->hf[0] = _mesa_float_to_half(color[0]);
+      sdc->hf[1] = _mesa_float_to_half(color[1]);
+      sdc->hf[2] = _mesa_float_to_half(color[2]);
+      sdc->hf[3] = _mesa_float_to_half(color[3]);
 
-      COPY_4V(sdc.f, color);
+      sdc->b[0] = sdc->s[0] >> 8;
+      sdc->b[1] = sdc->s[1] >> 8;
+      sdc->b[2] = sdc->s[2] >> 8;
+      sdc->b[3] = sdc->s[3] >> 8;
 
-      return brw_cache_data(&brw->cache, BRW_SAMPLER_DEFAULT_COLOR,
-			    &sdc, sizeof(sdc));
+      sdc->f[0] = color[0];
+      sdc->f[1] = color[1];
+      sdc->f[2] = color[2];
+      sdc->f[3] = color[3];
    } else {
-      struct brw_sampler_default_color sdc;
+      struct brw_sampler_default_color *sdc;
 
-      COPY_4V(sdc.color, color);
+      sdc = brw_state_batch(brw, sizeof(*sdc), 32, &brw->wm.sdc_offset[unit]);
 
-      return brw_cache_data(&brw->cache, BRW_SAMPLER_DEFAULT_COLOR,
-			    &sdc, sizeof(sdc));
+      COPY_4V(sdc->color, color);
    }
 }
 
-
-struct wm_sampler_key {
-   int sampler_count;
-
-   struct wm_sampler_entry {
-      GLenum tex_target;
-      GLenum wrap_r, wrap_s, wrap_t;
-      float maxlod, minlod;
-      float lod_bias;
-      float max_aniso;
-      GLenum minfilter, magfilter;
-      GLenum comparemode, comparefunc;
-
-      /** If target is cubemap, take context setting.
-       */
-      GLboolean seamless_cube_map;
-   } sampler[BRW_MAX_TEX_UNIT];
-};
-
 /**
  * Sets the sampler state for a single unit based off of the sampler key
  * entry.
  */
 static void brw_update_sampler_state(struct brw_context *brw,
-				     struct wm_sampler_entry *key,
-				     drm_intel_bo *sdc_bo,
+				     int unit,
 				     struct brw_sampler_state *sampler)
 {
    struct intel_context *intel = &brw->intel;
+   struct gl_context *ctx = &intel->ctx;
+   struct gl_texture_unit *texUnit = &ctx->Texture.Unit[unit];
+   struct gl_texture_object *texObj = texUnit->_Current;
+   struct gl_sampler_object *gl_sampler = _mesa_get_samplerobj(ctx, unit);
 
-   memset(sampler, 0, sizeof(*sampler));
-
-   switch (key->minfilter) {
+   switch (gl_sampler->MinFilter) {
    case GL_NEAREST:
       sampler->ss0.min_filter = BRW_MAPFILTER_NEAREST;
       sampler->ss0.mip_filter = BRW_MIPFILTER_NONE;
@@ -171,17 +183,17 @@ static void brw_update_sampler_state(struct brw_context *brw,
 
    /* Set Anisotropy: 
     */
-   if (key->max_aniso > 1.0) {
+   if (gl_sampler->MaxAnisotropy > 1.0) {
       sampler->ss0.min_filter = BRW_MAPFILTER_ANISOTROPIC; 
       sampler->ss0.mag_filter = BRW_MAPFILTER_ANISOTROPIC;
 
-      if (key->max_aniso > 2.0) {
-	 sampler->ss3.max_aniso = MIN2((key->max_aniso - 2) / 2,
+      if (gl_sampler->MaxAnisotropy > 2.0) {
+	 sampler->ss3.max_aniso = MIN2((gl_sampler->MaxAnisotropy - 2) / 2,
 				       BRW_ANISORATIO_16);
       }
    }
    else {
-      switch (key->magfilter) {
+      switch (gl_sampler->MagFilter) {
       case GL_NEAREST:
 	 sampler->ss0.mag_filter = BRW_MAPFILTER_NEAREST;
 	 break;
@@ -193,9 +205,9 @@ static void brw_update_sampler_state(struct brw_context *brw,
       }  
    }
 
-   sampler->ss1.r_wrap_mode = translate_wrap_mode(key->wrap_r);
-   sampler->ss1.s_wrap_mode = translate_wrap_mode(key->wrap_s);
-   sampler->ss1.t_wrap_mode = translate_wrap_mode(key->wrap_t);
+   sampler->ss1.r_wrap_mode = translate_wrap_mode(gl_sampler->WrapR);
+   sampler->ss1.s_wrap_mode = translate_wrap_mode(gl_sampler->WrapS);
+   sampler->ss1.t_wrap_mode = translate_wrap_mode(gl_sampler->WrapT);
 
    if (intel->gen >= 6 &&
        sampler->ss0.min_filter != sampler->ss0.mag_filter)
@@ -204,9 +216,10 @@ static void brw_update_sampler_state(struct brw_context *brw,
    /* Cube-maps on 965 and later must use the same wrap mode for all 3
     * coordinate dimensions.  Futher, only CUBE and CLAMP are valid.
     */
-   if (key->tex_target == GL_TEXTURE_CUBE_MAP) {
-      if (key->seamless_cube_map &&
-	  (key->minfilter != GL_NEAREST || key->magfilter != GL_NEAREST)) {
+   if (texObj->Target == GL_TEXTURE_CUBE_MAP) {
+      if (ctx->Texture.CubeMapSeamless &&
+	  (gl_sampler->MinFilter != GL_NEAREST ||
+	   gl_sampler->MagFilter != GL_NEAREST)) {
 	 sampler->ss1.r_wrap_mode = BRW_TEXCOORDMODE_CUBE;
 	 sampler->ss1.s_wrap_mode = BRW_TEXCOORDMODE_CUBE;
 	 sampler->ss1.t_wrap_mode = BRW_TEXCOORDMODE_CUBE;
@@ -215,7 +228,7 @@ static void brw_update_sampler_state(struct brw_context *brw,
 	 sampler->ss1.s_wrap_mode = BRW_TEXCOORDMODE_CLAMP;
 	 sampler->ss1.t_wrap_mode = BRW_TEXCOORDMODE_CLAMP;
       }
-   } else if (key->tex_target == GL_TEXTURE_1D) {
+   } else if (texObj->Target == GL_TEXTURE_1D) {
       /* There's a bug in 1D texture sampling - it actually pays
        * attention to the wrap_t value, though it should not.
        * Override the wrap_t value here to GL_REPEAT to keep
@@ -227,18 +240,19 @@ static void brw_update_sampler_state(struct brw_context *brw,
 
    /* Set shadow function: 
     */
-   if (key->comparemode == GL_COMPARE_R_TO_TEXTURE_ARB) {
+   if (gl_sampler->CompareMode == GL_COMPARE_R_TO_TEXTURE_ARB) {
       /* Shadowing is "enabled" by emitting a particular sampler
        * message (sample_c).  So need to recompile WM program when
        * shadow comparison is enabled on each/any texture unit.
        */
       sampler->ss0.shadow_function =
-	 intel_translate_shadow_compare_func(key->comparefunc);
+	 intel_translate_shadow_compare_func(gl_sampler->CompareFunc);
    }
 
    /* Set LOD bias: 
     */
-   sampler->ss0.lod_bias = S_FIXED(CLAMP(key->lod_bias, -16, 15), 6);
+   sampler->ss0.lod_bias = S_FIXED(CLAMP(texUnit->LodBias +
+					 gl_sampler->LodBias, -16, 15), 6);
 
    sampler->ss0.lod_preclamp = 1; /* OpenGL mode */
    sampler->ss0.default_color_mode = 0; /* OpenGL/DX10 mode */
@@ -252,150 +266,67 @@ static void brw_update_sampler_state(struct brw_context *brw,
     */
    sampler->ss0.base_level = U_FIXED(0, 1);
 
-   sampler->ss1.max_lod = U_FIXED(CLAMP(key->maxlod, 0, 13), 6);
-   sampler->ss1.min_lod = U_FIXED(CLAMP(key->minlod, 0, 13), 6);
-   
-   sampler->ss2.default_color_pointer = sdc_bo->offset >> 5; /* reloc */
-}
+   sampler->ss1.max_lod = U_FIXED(CLAMP(gl_sampler->MaxLod, 0, 13), 6);
+   sampler->ss1.min_lod = U_FIXED(CLAMP(gl_sampler->MinLod, 0, 13), 6);
 
+   upload_default_color(brw, gl_sampler, unit);
 
-/** Sets up the cache key for sampler state for all texture units */
-static void
-brw_wm_sampler_populate_key(struct brw_context *brw,
-			    struct wm_sampler_key *key)
-{
-   struct gl_context *ctx = &brw->intel.ctx;
-   int unit;
-   char *last_entry_end = ((char*)&key->sampler_count) + 
-      sizeof(key->sampler_count);
-
-   key->sampler_count = 0;
-
-   for (unit = 0; unit < BRW_MAX_TEX_UNIT; unit++) {
-      if (ctx->Texture.Unit[unit]._ReallyEnabled) {
-	 struct wm_sampler_entry *entry = &key->sampler[unit];
-	 struct gl_texture_unit *texUnit = &ctx->Texture.Unit[unit];
-	 struct gl_texture_object *texObj = texUnit->_Current;
-	 struct gl_texture_image *firstImage =
-	    texObj->Image[0][texObj->BaseLevel];
-
-	 memset(last_entry_end, 0, 
-		(char*)entry - last_entry_end + sizeof(*entry));
-	 last_entry_end = ((char*)entry) + sizeof(*entry);
-
-         entry->tex_target = texObj->Target;
-
-	 entry->seamless_cube_map = (texObj->Target == GL_TEXTURE_CUBE_MAP)
-	    ? ctx->Texture.CubeMapSeamless : GL_FALSE;
-
-	 entry->wrap_r = texObj->Sampler.WrapR;
-	 entry->wrap_s = texObj->Sampler.WrapS;
-	 entry->wrap_t = texObj->Sampler.WrapT;
-
-	 entry->maxlod = texObj->Sampler.MaxLod;
-	 entry->minlod = texObj->Sampler.MinLod;
-	 entry->lod_bias = texUnit->LodBias + texObj->Sampler.LodBias;
-	 entry->max_aniso = texObj->Sampler.MaxAnisotropy;
-	 entry->minfilter = texObj->Sampler.MinFilter;
-	 entry->magfilter = texObj->Sampler.MagFilter;
-	 entry->comparemode = texObj->Sampler.CompareMode;
-         entry->comparefunc = texObj->Sampler.CompareFunc;
-
-	 drm_intel_bo_unreference(brw->wm.sdc_bo[unit]);
-	 if (firstImage->_BaseFormat == GL_DEPTH_COMPONENT) {
-	    float bordercolor[4] = {
-	       texObj->Sampler.BorderColor.f[0],
-	       texObj->Sampler.BorderColor.f[0],
-	       texObj->Sampler.BorderColor.f[0],
-	       texObj->Sampler.BorderColor.f[0]
-	    };
-	    /* GL specs that border color for depth textures is taken from the
-	     * R channel, while the hardware uses A.  Spam R into all the
-	     * channels for safety.
-	     */
-	    brw->wm.sdc_bo[unit] = upload_default_color(brw, bordercolor);
-	 } else {
-	    brw->wm.sdc_bo[unit] = upload_default_color(brw,
-							texObj->Sampler.BorderColor.f);
-	 }
-	 key->sampler_count = unit + 1;
-      }
+   if (intel->gen >= 6) {
+      sampler->ss2.default_color_pointer = brw->wm.sdc_offset[unit] >> 5;
+   } else {
+      /* reloc */
+      sampler->ss2.default_color_pointer = (intel->batch.bo->offset +
+					    brw->wm.sdc_offset[unit]) >> 5;
+
+      drm_intel_bo_emit_reloc(intel->batch.bo,
+			      brw->wm.sampler_offset +
+			      unit * sizeof(struct brw_sampler_state) +
+			      offsetof(struct brw_sampler_state, ss2),
+			      intel->batch.bo, brw->wm.sdc_offset[unit],
+			      I915_GEM_DOMAIN_SAMPLER, 0);
    }
-   struct wm_sampler_entry *entry = &key->sampler[key->sampler_count];
-   memset(last_entry_end, 0, (char*)entry - last_entry_end);
 }
 
+
 /* All samplers must be uploaded in a single contiguous array, which
  * complicates various things.  However, this is still too confusing -
  * FIXME: simplify all the different new texture state flags.
  */
-static void upload_wm_samplers( struct brw_context *brw )
+static void
+prepare_wm_samplers(struct brw_context *brw)
 {
    struct gl_context *ctx = &brw->intel.ctx;
-   struct wm_sampler_key key;
-   int i, sampler_key_size;
-
-   brw_wm_sampler_populate_key(brw, &key);
+   struct brw_sampler_state *samplers;
+   int i;
 
-   if (brw->wm.sampler_count != key.sampler_count) {
-      brw->wm.sampler_count = key.sampler_count;
-      brw->state.dirty.cache |= CACHE_NEW_SAMPLER;
+   brw->wm.sampler_count = 0;
+   for (i = 0; i < BRW_MAX_TEX_UNIT; i++) {
+      if (ctx->Texture.Unit[i]._ReallyEnabled)
+	 brw->wm.sampler_count = i + 1;
    }
 
-   drm_intel_bo_unreference(brw->wm.sampler_bo);
-   brw->wm.sampler_bo = NULL;
    if (brw->wm.sampler_count == 0)
       return;
 
-   /* Only include the populated portion of the key in the search. */
-   sampler_key_size = offsetof(struct wm_sampler_key,
-			       sampler[key.sampler_count]);
-   brw->wm.sampler_bo = brw_search_cache(&brw->cache, BRW_SAMPLER,
-					 &key, sampler_key_size,
-					 brw->wm.sdc_bo, key.sampler_count,
-					 NULL);
+   samplers = brw_state_batch(brw, brw->wm.sampler_count * sizeof(*samplers),
+			      32, &brw->wm.sampler_offset);
+   memset(samplers, 0, brw->wm.sampler_count * sizeof(*samplers));
 
-   /* If we didnt find it in the cache, compute the state and put it in the
-    * cache.
-    */
-   if (brw->wm.sampler_bo == NULL) {
-      struct brw_sampler_state sampler[BRW_MAX_TEX_UNIT];
-
-      memset(sampler, 0, sizeof(sampler));
-      for (i = 0; i < key.sampler_count; i++) {
-	 if (brw->wm.sdc_bo[i] == NULL)
-	    continue;
-
-	 brw_update_sampler_state(brw, &key.sampler[i], brw->wm.sdc_bo[i],
-				  &sampler[i]);
-      }
-
-      brw->wm.sampler_bo = brw_upload_cache(&brw->cache, BRW_SAMPLER,
-					    &key, sampler_key_size,
-					    brw->wm.sdc_bo, key.sampler_count,
-					    &sampler, sizeof(sampler));
-
-      /* Emit SDC relocations */
-      for (i = 0; i < BRW_MAX_TEX_UNIT; i++) {
-	 if (!ctx->Texture.Unit[i]._ReallyEnabled)
-	    continue;
-
-	 drm_intel_bo_emit_reloc(brw->wm.sampler_bo,
-				 i * sizeof(struct brw_sampler_state) +
-				 offsetof(struct brw_sampler_state, ss2),
-				 brw->wm.sdc_bo[i], 0,
-				 I915_GEM_DOMAIN_SAMPLER, 0);
-      }
+   for (i = 0; i < brw->wm.sampler_count; i++) {
+      if (ctx->Texture.Unit[i]._ReallyEnabled)
+	 brw_update_sampler_state(brw, i, &samplers[i]);
    }
+
+   brw->state.dirty.cache |= CACHE_NEW_SAMPLER;
 }
 
 const struct brw_tracked_state brw_wm_samplers = {
    .dirty = {
       .mesa = _NEW_TEXTURE,
-      .brw = 0,
+      .brw = BRW_NEW_BATCH,
       .cache = 0
    },
-   .prepare = upload_wm_samplers,
+   .prepare = prepare_wm_samplers,
 };
 
 
diff --git a/src/mesa/drivers/dri/i965/brw_wm_state.c b/src/mesa/drivers/dri/i965/brw_wm_state.c
index be4b260a5ff..a91ae511b7f 100644
--- a/src/mesa/drivers/dri/i965/brw_wm_state.c
+++ b/src/mesa/drivers/dri/i965/brw_wm_state.c
@@ -40,21 +40,6 @@
  * WM unit - fragment programs and rasterization
  */
 
-struct brw_wm_unit_key {
-   unsigned int total_grf, total_scratch;
-   unsigned int urb_entry_read_length;
-   unsigned int curb_entry_read_length;
-   unsigned int dispatch_grf_start_reg;
-
-   unsigned int curbe_offset;
-
-   unsigned int nr_surfaces, sampler_count;
-   GLboolean uses_depth, computes_depth, uses_kill, is_glsl;
-   GLboolean polygon_stipple, stats_wm, line_stipple, offset_enable;
-   GLboolean color_write_enable;
-   GLfloat offset_units, offset_factor;
-};
-
 bool
 brw_color_buffer_write_enabled(struct brw_context *brw)
 {
@@ -81,219 +66,192 @@ brw_color_buffer_write_enabled(struct brw_context *brw)
    return false;
 }
 
+/**
+ * Setup wm hardware state.  See page 225 of Volume 2
+ */
 static void
-wm_unit_populate_key(struct brw_context *brw, struct brw_wm_unit_key *key)
+brw_prepare_wm_unit(struct brw_context *brw)
 {
-   struct gl_context *ctx = &brw->intel.ctx;
-   const struct gl_fragment_program *fp = brw->fragment_program;
    struct intel_context *intel = &brw->intel;
+   struct gl_context *ctx = &intel->ctx;
+   const struct gl_fragment_program *fp = brw->fragment_program;
+   struct brw_wm_unit_state *wm;
 
-   memset(key, 0, sizeof(*key));
+   wm = brw_state_batch(brw, sizeof(*wm), 32, &brw->wm.state_offset);
+   memset(wm, 0, sizeof(*wm));
+
+   if (brw->wm.prog_data->prog_offset_16) {
+      /* These two fields should be the same pre-gen6, which is why we
+       * only have one hardware field to program for both dispatch
+       * widths.
+       */
+      assert(brw->wm.prog_data->first_curbe_grf ==
+	     brw->wm.prog_data->first_curbe_grf_16);
+   }
 
    /* CACHE_NEW_WM_PROG */
-   key->total_grf = brw->wm.prog_data->total_grf;
-   key->urb_entry_read_length = brw->wm.prog_data->urb_read_length;
-   key->curb_entry_read_length = brw->wm.prog_data->curb_read_length;
-   key->dispatch_grf_start_reg = brw->wm.prog_data->first_curbe_grf;
-   key->total_scratch = brw->wm.prog_data->total_scratch;
+   wm->thread0.grf_reg_count = ALIGN(brw->wm.prog_data->total_grf, 16) / 16 - 1;
+   wm->wm9.grf_reg_count_2 = ALIGN(brw->wm.prog_data->total_grf_16, 16) / 16 - 1;
+   wm->thread0.kernel_start_pointer = brw->wm.prog_bo->offset >> 6; /* reloc */
+   /* reloc */
+   wm->wm9.kernel_start_pointer_2 = (brw->wm.prog_bo->offset +
+				     brw->wm.prog_data->prog_offset_16) >> 6;
+   wm->thread1.depth_coef_urb_read_offset = 1;
+   wm->thread1.floating_point_mode = BRW_FLOATING_POINT_NON_IEEE_754;
 
-   /* BRW_NEW_CURBE_OFFSETS */
-   key->curbe_offset = brw->curbe.wm_start;
+   if (intel->gen == 5)
+      wm->thread1.binding_table_entry_count = 0; /* hardware requirement */
+   else {
+      /* BRW_NEW_NR_WM_SURFACES */
+      wm->thread1.binding_table_entry_count = brw->wm.nr_surfaces;
+   }
 
-   /* BRW_NEW_NR_SURFACEs */
-   key->nr_surfaces = brw->wm.nr_surfaces;
+   if (brw->wm.prog_data->total_scratch != 0) {
+      wm->thread2.scratch_space_base_pointer =
+	 brw->wm.scratch_bo->offset >> 10; /* reloc */
+      wm->thread2.per_thread_scratch_space =
+	 ffs(brw->wm.prog_data->total_scratch) - 11;
+   } else {
+      wm->thread2.scratch_space_base_pointer = 0;
+      wm->thread2.per_thread_scratch_space = 0;
+   }
 
-   /* CACHE_NEW_SAMPLER */
-   key->sampler_count = brw->wm.sampler_count;
+   wm->thread3.dispatch_grf_start_reg = brw->wm.prog_data->first_curbe_grf;
+   wm->thread3.urb_entry_read_length = brw->wm.prog_data->urb_read_length;
+   wm->thread3.urb_entry_read_offset = 0;
+   wm->thread3.const_urb_entry_read_length =
+      brw->wm.prog_data->curb_read_length;
+   /* BRW_NEW_CURBE_OFFSETS */
+   wm->thread3.const_urb_entry_read_offset = brw->curbe.wm_start * 2;
 
-   /* _NEW_POLYGONSTIPPLE */
-   key->polygon_stipple = ctx->Polygon.StippleFlag;
+   if (intel->gen == 5)
+      wm->wm4.sampler_count = 0; /* hardware requirement */
+   else {
+      /* CACHE_NEW_SAMPLER */
+      wm->wm4.sampler_count = (brw->wm.sampler_count + 1) / 4;
+   }
 
-   /* BRW_NEW_FRAGMENT_PROGRAM */
-   key->uses_depth = (fp->Base.InputsRead & (1 << FRAG_ATTRIB_WPOS)) != 0;
+   if (brw->wm.sampler_count) {
+      /* reloc */
+      wm->wm4.sampler_state_pointer = (intel->batch.bo->offset +
+				       brw->wm.sampler_offset) >> 5;
+   } else {
+      wm->wm4.sampler_state_pointer = 0;
+   }
 
-   /* as far as we can tell */
-   key->computes_depth =
-      (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) != 0;
+   /* BRW_NEW_FRAGMENT_PROGRAM */
+   wm->wm5.program_uses_depth = (fp->Base.InputsRead &
+				 (1 << FRAG_ATTRIB_WPOS)) != 0;
+   wm->wm5.program_computes_depth = (fp->Base.OutputsWritten &
+				     BITFIELD64_BIT(FRAG_RESULT_DEPTH)) != 0;
    /* BRW_NEW_DEPTH_BUFFER
     * Override for NULL depthbuffer case, required by the Pixel Shader Computed
     * Depth field.
     */
    if (brw->state.depth_region == NULL)
-      key->computes_depth = 0;
-
-   /* _NEW_BUFFERS | _NEW_COLOR */
-   key->color_write_enable = brw_color_buffer_write_enabled(brw);
+      wm->wm5.program_computes_depth = 0;
 
    /* _NEW_COLOR */
-   key->uses_kill = fp->UsesKill || ctx->Color.AlphaEnabled;
+   wm->wm5.program_uses_killpixel = fp->UsesKill || ctx->Color.AlphaEnabled;
 
-   /* If using the fragment shader backend, the program is always
-    * 8-wide.
+
+   /* BRW_NEW_FRAGMENT_PROGRAM
+    *
+    * If using the fragment shader backend, the program is 8-wide, plus
+    * 16-wide when prog_offset_16 is set.  If not, it's always 16.
     */
    if (ctx->Shader.CurrentFragmentProgram) {
       struct brw_shader *shader = (struct brw_shader *)
 	 ctx->Shader.CurrentFragmentProgram->_LinkedShaders[MESA_SHADER_FRAGMENT];
 
       if (shader != NULL && shader->ir != NULL) {
-	 key->is_glsl = GL_TRUE;
+	 wm->wm5.enable_8_pix = 1;
+	 if (brw->wm.prog_data->prog_offset_16)
+	    wm->wm5.enable_16_pix = 1;
       }
    }
+   if (!wm->wm5.enable_8_pix)
+      wm->wm5.enable_16_pix = 1;
 
-   /* _NEW_DEPTH */
-   key->stats_wm = intel->stats_wm;
+   wm->wm5.max_threads = brw->wm_max_threads - 1;
 
-   /* _NEW_LINE */
-   key->line_stipple = ctx->Line.StippleFlag;
-
-   /* _NEW_POLYGON */
-   key->offset_enable = ctx->Polygon.OffsetFill;
-   key->offset_units = ctx->Polygon.OffsetUnits;
-   key->offset_factor = ctx->Polygon.OffsetFactor;
-}
-
-/**
- * Setup wm hardware state.  See page 225 of Volume 2
- */
-static drm_intel_bo *
-wm_unit_create_from_key(struct brw_context *brw, struct brw_wm_unit_key *key,
-			drm_intel_bo **reloc_bufs)
-{
-   struct intel_context *intel = &brw->intel;
-   struct brw_wm_unit_state wm;
-   drm_intel_bo *bo;
-
-   memset(&wm, 0, sizeof(wm));
-
-   wm.thread0.grf_reg_count = ALIGN(key->total_grf, 16) / 16 - 1;
-   wm.thread0.kernel_start_pointer = brw->wm.prog_bo->offset >> 6; /* reloc */
-   wm.thread1.depth_coef_urb_read_offset = 1;
-   wm.thread1.floating_point_mode = BRW_FLOATING_POINT_NON_IEEE_754;
-
-   if (intel->gen == 5)
-      wm.thread1.binding_table_entry_count = 0; /* hardware requirement */
-   else
-      wm.thread1.binding_table_entry_count = key->nr_surfaces;
-
-   if (key->total_scratch != 0) {
-      wm.thread2.scratch_space_base_pointer =
-	 brw->wm.scratch_bo->offset >> 10; /* reloc */
-      wm.thread2.per_thread_scratch_space = ffs(key->total_scratch) - 11;
-   } else {
-      wm.thread2.scratch_space_base_pointer = 0;
-      wm.thread2.per_thread_scratch_space = 0;
-   }
-
-   wm.thread3.dispatch_grf_start_reg = key->dispatch_grf_start_reg;
-   wm.thread3.urb_entry_read_length = key->urb_entry_read_length;
-   wm.thread3.urb_entry_read_offset = 0;
-   wm.thread3.const_urb_entry_read_length = key->curb_entry_read_length;
-   wm.thread3.const_urb_entry_read_offset = key->curbe_offset * 2;
-
-   if (intel->gen == 5)
-      wm.wm4.sampler_count = 0; /* hardware requirement */
-   else
-      wm.wm4.sampler_count = (key->sampler_count + 1) / 4;
-
-   if (brw->wm.sampler_bo != NULL) {
-      /* reloc */
-      wm.wm4.sampler_state_pointer = brw->wm.sampler_bo->offset >> 5;
-   } else {
-      wm.wm4.sampler_state_pointer = 0;
-   }
-
-   wm.wm5.program_uses_depth = key->uses_depth;
-   wm.wm5.program_computes_depth = key->computes_depth;
-   wm.wm5.program_uses_killpixel = key->uses_kill;
-
-   if (key->is_glsl)
-      wm.wm5.enable_8_pix = 1;
-   else
-      wm.wm5.enable_16_pix = 1;
-
-   wm.wm5.max_threads = brw->wm_max_threads - 1;
-
-   if (key->color_write_enable ||
-       key->uses_kill ||
-       key->computes_depth) {
-      wm.wm5.thread_dispatch_enable = 1;
+   /* _NEW_BUFFERS | _NEW_COLOR */
+   if (brw_color_buffer_write_enabled(brw) ||
+       wm->wm5.program_uses_killpixel ||
+       wm->wm5.program_computes_depth) {
+      wm->wm5.thread_dispatch_enable = 1;
    }
 
-   wm.wm5.legacy_line_rast = 0;
-   wm.wm5.legacy_global_depth_bias = 0;
-   wm.wm5.early_depth_test = 1;	        /* never need to disable */
-   wm.wm5.line_aa_region_width = 0;
-   wm.wm5.line_endcap_aa_region_width = 1;
+   wm->wm5.legacy_line_rast = 0;
+   wm->wm5.legacy_global_depth_bias = 0;
+   wm->wm5.early_depth_test = 1;	        /* never need to disable */
+   wm->wm5.line_aa_region_width = 0;
+   wm->wm5.line_endcap_aa_region_width = 1;
 
-   wm.wm5.polygon_stipple = key->polygon_stipple;
+   /* _NEW_POLYGONSTIPPLE */
+   wm->wm5.polygon_stipple = ctx->Polygon.StippleFlag;
 
-   if (key->offset_enable) {
-      wm.wm5.depth_offset = 1;
+   /* _NEW_POLYGON */
+   if (ctx->Polygon.OffsetFill) {
+      wm->wm5.depth_offset = 1;
       /* Something wierd going on with legacy_global_depth_bias,
        * offset_constant, scaling and MRD.  This value passes glean
        * but gives some odd results elsewere (eg. the
        * quad-offset-units test).
        */
-      wm.global_depth_offset_constant = key->offset_units * 2;
+      wm->global_depth_offset_constant = ctx->Polygon.OffsetUnits * 2;
 
       /* This is the only value that passes glean:
        */
-      wm.global_depth_offset_scale = key->offset_factor;
+      wm->global_depth_offset_scale = ctx->Polygon.OffsetFactor;
    }
 
-   wm.wm5.line_stipple = key->line_stipple;
-
-   if (unlikely(INTEL_DEBUG & DEBUG_STATS) || key->stats_wm)
-      wm.wm4.stats_enable = 1;
+   /* _NEW_LINE */
+   wm->wm5.line_stipple = ctx->Line.StippleFlag;
 
-   bo = brw_upload_cache(&brw->cache, BRW_WM_UNIT,
-			 key, sizeof(*key),
-			 reloc_bufs, 3,
-			 &wm, sizeof(wm));
+   /* _NEW_DEPTH */
+   if (unlikely(INTEL_DEBUG & DEBUG_STATS) || intel->stats_wm)
+      wm->wm4.stats_enable = 1;
 
    /* Emit WM program relocation */
-   drm_intel_bo_emit_reloc(bo, offsetof(struct brw_wm_unit_state, thread0),
-			   brw->wm.prog_bo, wm.thread0.grf_reg_count << 1,
+   drm_intel_bo_emit_reloc(intel->batch.bo,
+			   brw->wm.state_offset +
+			   offsetof(struct brw_wm_unit_state, thread0),
+			   brw->wm.prog_bo, wm->thread0.grf_reg_count << 1,
 			   I915_GEM_DOMAIN_INSTRUCTION, 0);
 
+   if (brw->wm.prog_data->prog_offset_16) {
+      drm_intel_bo_emit_reloc(intel->batch.bo,
+			      brw->wm.state_offset +
+			      offsetof(struct brw_wm_unit_state, wm9),
+			      brw->wm.prog_bo,
+			      ((wm->wm9.grf_reg_count_2 << 1) +
+			       brw->wm.prog_data->prog_offset_16),
+			      I915_GEM_DOMAIN_INSTRUCTION, 0);
+   }
+
    /* Emit scratch space relocation */
-   if (key->total_scratch != 0) {
-      drm_intel_bo_emit_reloc(bo, offsetof(struct brw_wm_unit_state, thread2),
+   if (brw->wm.prog_data->total_scratch != 0) {
+      drm_intel_bo_emit_reloc(intel->batch.bo,
+			      brw->wm.state_offset +
+			      offsetof(struct brw_wm_unit_state, thread2),
 			      brw->wm.scratch_bo,
-			      wm.thread2.per_thread_scratch_space,
+			      wm->thread2.per_thread_scratch_space,
 			      I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER);
    }
 
    /* Emit sampler state relocation */
-   if (key->sampler_count != 0) {
-      drm_intel_bo_emit_reloc(bo, offsetof(struct brw_wm_unit_state, wm4),
-			      brw->wm.sampler_bo, (wm.wm4.stats_enable |
-						   (wm.wm4.sampler_count << 2)),
+   if (brw->wm.sampler_count != 0) {
+      drm_intel_bo_emit_reloc(intel->batch.bo,
+			      brw->wm.state_offset +
+			      offsetof(struct brw_wm_unit_state, wm4),
+			      intel->batch.bo, (brw->wm.sampler_offset |
+						wm->wm4.stats_enable |
+						(wm->wm4.sampler_count << 2)),
 			      I915_GEM_DOMAIN_INSTRUCTION, 0);
    }
 
-   return bo;
-}
-
-
-static void upload_wm_unit( struct brw_context *brw )
-{
-   struct brw_wm_unit_key key;
-   drm_intel_bo *reloc_bufs[3];
-   wm_unit_populate_key(brw, &key);
-
-   reloc_bufs[0] = brw->wm.prog_bo;
-   reloc_bufs[1] = brw->wm.scratch_bo;
-   reloc_bufs[2] = brw->wm.sampler_bo;
-
-   drm_intel_bo_unreference(brw->wm.state_bo);
-   brw->wm.state_bo = brw_search_cache(&brw->cache, BRW_WM_UNIT,
-				       &key, sizeof(key),
-				       reloc_bufs, 3,
-				       NULL);
-   if (brw->wm.state_bo == NULL) {
-      brw->wm.state_bo = wm_unit_create_from_key(brw, &key, reloc_bufs);
-   }
+   brw->state.dirty.cache |= CACHE_NEW_WM_UNIT;
 }
 
 const struct brw_tracked_state brw_wm_unit = {
@@ -305,7 +263,8 @@ const struct brw_tracked_state brw_wm_unit = {
 	       _NEW_DEPTH |
 	       _NEW_BUFFERS),
 
-      .brw = (BRW_NEW_FRAGMENT_PROGRAM | 
+      .brw = (BRW_NEW_BATCH |
+	      BRW_NEW_FRAGMENT_PROGRAM |
 	      BRW_NEW_CURBE_OFFSETS |
 	      BRW_NEW_DEPTH_BUFFER |
 	      BRW_NEW_NR_WM_SURFACES),
@@ -313,6 +272,6 @@ const struct brw_tracked_state brw_wm_unit = {
       .cache = (CACHE_NEW_WM_PROG |
 		CACHE_NEW_SAMPLER)
    },
-   .prepare = upload_wm_unit,
+   .prepare = brw_prepare_wm_unit,
 };
 
diff --git a/src/mesa/drivers/dri/i965/brw_wm_surface_state.c b/src/mesa/drivers/dri/i965/brw_wm_surface_state.c
index e3396a3cbd4..47b8b511f05 100644
--- a/src/mesa/drivers/dri/i965/brw_wm_surface_state.c
+++ b/src/mesa/drivers/dri/i965/brw_wm_surface_state.c
@@ -31,6 +31,7 @@
                    
 
 #include "main/mtypes.h"
+#include "main/samplerobj.h"
 #include "main/texstore.h"
 #include "program/prog_parameter.h"
 
@@ -112,6 +113,10 @@ static uint32_t brw_format_for_mesa_format[MESA_FORMAT_COUNT] =
    [MESA_FORMAT_LUMINANCE_FLOAT32] = BRW_SURFACEFORMAT_L32_FLOAT,
    [MESA_FORMAT_ALPHA_FLOAT32] = BRW_SURFACEFORMAT_A32_FLOAT,
    [MESA_FORMAT_LUMINANCE_ALPHA_FLOAT32] = BRW_SURFACEFORMAT_L32A32_FLOAT,
+   [MESA_FORMAT_RED_RGTC1] = BRW_SURFACEFORMAT_BC4_UNORM,
+   [MESA_FORMAT_SIGNED_RED_RGTC1] = BRW_SURFACEFORMAT_BC4_SNORM,
+   [MESA_FORMAT_RG_RGTC2] = BRW_SURFACEFORMAT_BC5_UNORM,
+   [MESA_FORMAT_SIGNED_RG_RGTC2] = BRW_SURFACEFORMAT_BC5_SNORM,
 };
 
 bool
@@ -213,6 +218,7 @@ brw_update_texture_surface( struct gl_context *ctx, GLuint unit )
    struct gl_texture_object *tObj = ctx->Texture.Unit[unit]._Current;
    struct intel_texture_object *intelObj = intel_texture_object(tObj);
    struct gl_texture_image *firstImage = tObj->Image[0][tObj->BaseLevel];
+   struct gl_sampler_object *sampler = _mesa_get_samplerobj(ctx, unit);
    const GLuint surf_index = SURF_INDEX_TEXTURE(unit);
    struct brw_surface_state *surf;
 
@@ -224,8 +230,8 @@ brw_update_texture_surface( struct gl_context *ctx, GLuint unit )
    surf->ss0.surface_type = translate_tex_target(tObj->Target);
    surf->ss0.surface_format = translate_tex_format(firstImage->TexFormat,
                                                    firstImage->InternalFormat,
-                                                   tObj->Sampler.DepthMode,
-                                                   tObj->Sampler.sRGBDecode);
+                                                   sampler->DepthMode,
+                                                   sampler->sRGBDecode);
 
    /* This is ok for all textures with channel width 8bit or less:
     */
@@ -309,7 +315,7 @@ brw_create_constant_surface(struct brw_context *brw,
  * state atom.
  */
 static void
-prepare_wm_constants(struct brw_context *brw)
+prepare_wm_pull_constants(struct brw_context *brw)
 {
    struct gl_context *ctx = &brw->intel.ctx;
    struct intel_context *intel = &brw->intel;
@@ -353,7 +359,7 @@ const struct brw_tracked_state brw_wm_constants = {
       .brw = (BRW_NEW_FRAGMENT_PROGRAM),
       .cache = 0
    },
-   .prepare = prepare_wm_constants,
+   .prepare = prepare_wm_pull_constants,
 };
 
 /**
diff --git a/src/mesa/drivers/dri/i965/gen6_cc.c b/src/mesa/drivers/dri/i965/gen6_cc.c
index 1b935fb5e70..66357f00fa6 100644
--- a/src/mesa/drivers/dri/i965/gen6_cc.c
+++ b/src/mesa/drivers/dri/i965/gen6_cc.c
@@ -32,82 +32,39 @@
 #include "intel_batchbuffer.h"
 #include "main/macros.h"
 
-struct gen6_blend_state_key {
-   GLboolean color_blend, alpha_enabled;
-   GLboolean dither;
-   GLboolean color_mask[BRW_MAX_DRAW_BUFFERS][4];
-
-   GLenum logic_op;
-
-   GLenum blend_eq_rgb, blend_eq_a;
-   GLenum blend_src_rgb, blend_src_a;
-   GLenum blend_dst_rgb, blend_dst_a;
-
-   GLenum alpha_func;
-};
-
 static void
-blend_state_populate_key(struct brw_context *brw,
-			 struct gen6_blend_state_key *key)
+prepare_blend_state(struct brw_context *brw)
 {
    struct gl_context *ctx = &brw->intel.ctx;
-
-   memset(key, 0, sizeof(*key));
-
-   /* _NEW_COLOR */
-   memcpy(key->color_mask, ctx->Color.ColorMask, sizeof(key->color_mask));
-
-   /* _NEW_COLOR */
-   if (ctx->Color._LogicOpEnabled)
-      key->logic_op = ctx->Color.LogicOp;
-   else
-      key->logic_op = GL_COPY;
-
-   /* _NEW_COLOR */
-   key->color_blend = ctx->Color.BlendEnabled;
-   if (key->color_blend) {
-      key->blend_eq_rgb = ctx->Color.Blend[0].EquationRGB;
-      key->blend_eq_a = ctx->Color.Blend[0].EquationA;
-      key->blend_src_rgb = ctx->Color.Blend[0].SrcRGB;
-      key->blend_dst_rgb = ctx->Color.Blend[0].DstRGB;
-      key->blend_src_a = ctx->Color.Blend[0].SrcA;
-      key->blend_dst_a = ctx->Color.Blend[0].DstA;
-   }
-
-   /* _NEW_COLOR */
-   key->alpha_enabled = ctx->Color.AlphaEnabled;
-   if (key->alpha_enabled) {
-      key->alpha_func = ctx->Color.AlphaFunc;
-   }
-
-   /* _NEW_COLOR */
-   key->dither = ctx->Color.DitherFlag;
-}
-
-/**
- * Creates the state cache entry for the given CC unit key.
- */
-static drm_intel_bo *
-blend_state_create_from_key(struct brw_context *brw,
-			    struct gen6_blend_state_key *key)
-{
-   struct gen6_blend_state blend[BRW_MAX_DRAW_BUFFERS];
-   drm_intel_bo *bo;
+   struct gen6_blend_state *blend;
    int b;
-
-   memset(&blend, 0, sizeof(blend));
-
-   for (b = 0; b < BRW_MAX_DRAW_BUFFERS; b++) {
-      if (key->logic_op != GL_COPY) {
-	 blend[b].blend1.logic_op_enable = 1;
-	 blend[b].blend1.logic_op_func = intel_translate_logic_op(key->logic_op);
-      } else if (key->color_blend & (1 << b)) {
-	 GLenum eqRGB = key->blend_eq_rgb;
-	 GLenum eqA = key->blend_eq_a;
-	 GLenum srcRGB = key->blend_src_rgb;
-	 GLenum dstRGB = key->blend_dst_rgb;
-	 GLenum srcA = key->blend_src_a;
-	 GLenum dstA = key->blend_dst_a;
+   int nr_draw_buffers = ctx->DrawBuffer->_NumColorDrawBuffers;
+   int size = sizeof(*blend) * nr_draw_buffers;
+
+   blend = brw_state_batch(brw, size, 64, &brw->cc.blend_state_offset);
+
+   memset(blend, 0, size);
+
+   for (b = 0; b < nr_draw_buffers; b++) {
+      /* _NEW_COLOR */
+      if (ctx->Color._LogicOpEnabled) {
+	 struct gl_renderbuffer *rb = ctx->DrawBuffer->_ColorDrawBuffers[b];
+	 /* _NEW_BUFFERS */
+	 /* Floating point RTs should have no effect from LogicOp,
+	  * except for disabling of blending
+	  */
+	 if (_mesa_get_format_datatype(rb->Format) != GL_FLOAT) {
+	    blend[b].blend1.logic_op_enable = 1;
+	    blend[b].blend1.logic_op_func =
+	       intel_translate_logic_op(ctx->Color.LogicOp);
+	 }
+      } else if (ctx->Color.BlendEnabled & (1 << b)) {
+	 GLenum eqRGB = ctx->Color.Blend[0].EquationRGB;
+	 GLenum eqA = ctx->Color.Blend[0].EquationA;
+	 GLenum srcRGB = ctx->Color.Blend[0].SrcRGB;
+	 GLenum dstRGB = ctx->Color.Blend[0].DstRGB;
+	 GLenum srcA = ctx->Color.Blend[0].SrcA;
+	 GLenum dstA = ctx->Color.Blend[0].DstA;
 
 	 if (eqRGB == GL_MIN || eqRGB == GL_MAX) {
 	    srcRGB = dstRGB = GL_ONE;
@@ -131,146 +88,74 @@ blend_state_create_from_key(struct brw_context *brw,
 					 eqA != eqRGB);
       }
 
-      if (key->alpha_enabled) {
+
+      /* _NEW_COLOR */
+      if (ctx->Color.AlphaEnabled) {
 	 blend[b].blend1.alpha_test_enable = 1;
-	 blend[b].blend1.alpha_test_func = intel_translate_compare_func(key->alpha_func);
+	 blend[b].blend1.alpha_test_func =
+	    intel_translate_compare_func(ctx->Color.AlphaFunc);
 
       }
 
-      if (key->dither) {
+      /* _NEW_COLOR */
+      if (ctx->Color.DitherFlag) {
 	 blend[b].blend1.dither_enable = 1;
 	 blend[b].blend1.y_dither_offset = 0;
 	 blend[b].blend1.x_dither_offset = 0;
       }
 
-      blend[b].blend1.write_disable_r = !key->color_mask[b][0];
-      blend[b].blend1.write_disable_g = !key->color_mask[b][1];
-      blend[b].blend1.write_disable_b = !key->color_mask[b][2];
-      blend[b].blend1.write_disable_a = !key->color_mask[b][3];
+      blend[b].blend1.write_disable_r = !ctx->Color.ColorMask[b][0];
+      blend[b].blend1.write_disable_g = !ctx->Color.ColorMask[b][1];
+      blend[b].blend1.write_disable_b = !ctx->Color.ColorMask[b][2];
+      blend[b].blend1.write_disable_a = !ctx->Color.ColorMask[b][3];
    }
 
-   bo = brw_upload_cache(&brw->cache, BRW_BLEND_STATE,
-			 key, sizeof(*key),
-			 NULL, 0,
-			 &blend, sizeof(blend));
-
-   return bo;
-}
-
-static void
-prepare_blend_state(struct brw_context *brw)
-{
-   struct gen6_blend_state_key key;
-
-   blend_state_populate_key(brw, &key);
-
-   drm_intel_bo_unreference(brw->cc.blend_state_bo);
-   brw->cc.blend_state_bo = brw_search_cache(&brw->cache, BRW_BLEND_STATE,
-					     &key, sizeof(key),
-					     NULL, 0,
-					     NULL);
-
-   if (brw->cc.blend_state_bo == NULL)
-      brw->cc.blend_state_bo = blend_state_create_from_key(brw, &key);
+   brw->state.dirty.cache |= CACHE_NEW_BLEND_STATE;
 }
 
 const struct brw_tracked_state gen6_blend_state = {
    .dirty = {
-      .mesa = _NEW_COLOR,
-      .brw = 0,
+      .mesa = (_NEW_COLOR |
+	       _NEW_BUFFERS),
+      .brw = BRW_NEW_BATCH,
       .cache = 0,
    },
    .prepare = prepare_blend_state,
 };
 
-struct gen6_color_calc_state_key {
-   float blend_constant_color[4];
-   GLclampf alpha_ref;
-   GLubyte stencil_ref[2];
-};
-
 static void
-color_calc_state_populate_key(struct brw_context *brw,
-			      struct gen6_color_calc_state_key *key)
+gen6_prepare_color_calc_state(struct brw_context *brw)
 {
    struct gl_context *ctx = &brw->intel.ctx;
+   struct gen6_color_calc_state *cc;
 
-   memset(key, 0, sizeof(*key));
-
-   /* _NEW_STENCIL */
-   if (ctx->Stencil._Enabled) {
-      const unsigned back = ctx->Stencil._BackFace;
-
-      key->stencil_ref[0] = ctx->Stencil.Ref[0];
-      if (ctx->Stencil._TestTwoSide)
-	 key->stencil_ref[1] = ctx->Stencil.Ref[back];
-   }
+   cc = brw_state_batch(brw, sizeof(*cc), 64, &brw->cc.state_offset);
+   memset(cc, 0, sizeof(*cc));
 
    /* _NEW_COLOR */
-   if (ctx->Color.AlphaEnabled)
-      key->alpha_ref = ctx->Color.AlphaRef;
-
-   key->blend_constant_color[0] = ctx->Color.BlendColorUnclamped[0];
-   key->blend_constant_color[1] = ctx->Color.BlendColorUnclamped[1];
-   key->blend_constant_color[2] = ctx->Color.BlendColorUnclamped[2];
-   key->blend_constant_color[3] = ctx->Color.BlendColorUnclamped[3];
-}
-
-/**
- * Creates the state cache entry for the given CC state key.
- */
-static drm_intel_bo *
-color_calc_state_create_from_key(struct brw_context *brw,
-				 struct gen6_color_calc_state_key *key)
-{
-   struct gen6_color_calc_state cc;
-   drm_intel_bo *bo;
-
-   memset(&cc, 0, sizeof(cc));
+   cc->cc0.alpha_test_format = BRW_ALPHATEST_FORMAT_UNORM8;
+   UNCLAMPED_FLOAT_TO_UBYTE(cc->cc1.alpha_ref_fi.ui, ctx->Color.AlphaRef);
 
-   cc.cc0.alpha_test_format = BRW_ALPHATEST_FORMAT_UNORM8;
-   UNCLAMPED_FLOAT_TO_UBYTE(cc.cc1.alpha_ref_fi.ui, key->alpha_ref);
-
-   cc.cc0.stencil_ref = key->stencil_ref[0];
-   cc.cc0.bf_stencil_ref = key->stencil_ref[1];
-
-   cc.constant_r = key->blend_constant_color[0];
-   cc.constant_g = key->blend_constant_color[1];
-   cc.constant_b = key->blend_constant_color[2];
-   cc.constant_a = key->blend_constant_color[3];
-
-   bo = brw_upload_cache(&brw->cache, BRW_COLOR_CALC_STATE,
-			 key, sizeof(*key),
-			 NULL, 0,
-			 &cc, sizeof(cc));
-
-   return bo;
-}
-
-static void
-prepare_color_calc_state(struct brw_context *brw)
-{
-   struct gen6_color_calc_state_key key;
-
-   color_calc_state_populate_key(brw, &key);
+   /* _NEW_STENCIL */
+   cc->cc0.stencil_ref = ctx->Stencil.Ref[0];
+   cc->cc0.bf_stencil_ref = ctx->Stencil.Ref[ctx->Stencil._BackFace];
 
-   drm_intel_bo_unreference(brw->cc.color_calc_state_bo);
-   brw->cc.color_calc_state_bo = brw_search_cache(&brw->cache, BRW_COLOR_CALC_STATE,
-				       &key, sizeof(key),
-				       NULL, 0,
-				       NULL);
+   /* _NEW_COLOR */
+   cc->constant_r = ctx->Color.BlendColorUnclamped[0];
+   cc->constant_g = ctx->Color.BlendColorUnclamped[1];
+   cc->constant_b = ctx->Color.BlendColorUnclamped[2];
+   cc->constant_a = ctx->Color.BlendColorUnclamped[3];
 
-   if (brw->cc.color_calc_state_bo == NULL)
-      brw->cc.color_calc_state_bo = color_calc_state_create_from_key(brw, &key);
+   brw->state.dirty.cache |= CACHE_NEW_COLOR_CALC_STATE;
 }
 
 const struct brw_tracked_state gen6_color_calc_state = {
    .dirty = {
       .mesa = _NEW_COLOR | _NEW_STENCIL,
-      .brw = 0,
+      .brw = BRW_NEW_BATCH,
       .cache = 0,
    },
-   .prepare = prepare_color_calc_state,
+   .prepare = gen6_prepare_color_calc_state,
 };
 
 static void upload_cc_state_pointers(struct brw_context *brw)
@@ -279,20 +164,12 @@ static void upload_cc_state_pointers(struct brw_context *brw)
 
    BEGIN_BATCH(4);
    OUT_BATCH(_3DSTATE_CC_STATE_POINTERS << 16 | (4 - 2));
-   OUT_RELOC(brw->cc.blend_state_bo, I915_GEM_DOMAIN_INSTRUCTION, 0, 1);
-   OUT_RELOC(brw->cc.depth_stencil_state_bo, I915_GEM_DOMAIN_INSTRUCTION, 0, 1);
-   OUT_RELOC(brw->cc.color_calc_state_bo, I915_GEM_DOMAIN_INSTRUCTION, 0, 1);
+   OUT_BATCH(brw->cc.blend_state_offset | 1);
+   OUT_BATCH(brw->cc.depth_stencil_state_offset | 1);
+   OUT_BATCH(brw->cc.state_offset | 1);
    ADVANCE_BATCH();
 }
 
-
-static void prepare_cc_state_pointers(struct brw_context *brw)
-{
-   brw_add_validated_bo(brw, brw->cc.color_calc_state_bo);
-   brw_add_validated_bo(brw, brw->cc.blend_state_bo);
-   brw_add_validated_bo(brw, brw->cc.depth_stencil_state_bo);
-}
-
 const struct brw_tracked_state gen6_cc_state_pointers = {
    .dirty = {
       .mesa = 0,
@@ -301,6 +178,5 @@ const struct brw_tracked_state gen6_cc_state_pointers = {
 		CACHE_NEW_COLOR_CALC_STATE |
 		CACHE_NEW_DEPTH_STENCIL_STATE)
    },
-   .prepare = prepare_cc_state_pointers,
    .emit = upload_cc_state_pointers,
 };
diff --git a/src/mesa/drivers/dri/i965/gen6_depthstencil.c b/src/mesa/drivers/dri/i965/gen6_depthstencil.c
index 96e6eade6b7..775e1ce2c9c 100644
--- a/src/mesa/drivers/dri/i965/gen6_depthstencil.c
+++ b/src/mesa/drivers/dri/i965/gen6_depthstencil.c
@@ -28,138 +28,68 @@
 #include "brw_context.h"
 #include "brw_state.h"
 
-struct brw_depth_stencil_state_key {
-   GLenum depth_func;
-   GLboolean depth_test, depth_write;
-   GLboolean stencil, stencil_two_side;
-   GLenum stencil_func[2], stencil_fail_op[2];
-   GLenum stencil_pass_depth_fail_op[2], stencil_pass_depth_pass_op[2];
-   GLubyte stencil_write_mask[2], stencil_test_mask[2];
-};
-
 static void
-depth_stencil_state_populate_key(struct brw_context *brw,
-				 struct brw_depth_stencil_state_key *key)
+gen6_prepare_depth_stencil_state(struct brw_context *brw)
 {
    struct gl_context *ctx = &brw->intel.ctx;
-   const unsigned back = ctx->Stencil._BackFace;
+   struct gen6_depth_stencil_state *ds;
 
-   memset(key, 0, sizeof(*key));
+   ds = brw_state_batch(brw, sizeof(*ds), 64,
+			&brw->cc.depth_stencil_state_offset);
+   memset(ds, 0, sizeof(*ds));
 
    /* _NEW_STENCIL */
-   key->stencil = ctx->Stencil._Enabled;
-   key->stencil_two_side = ctx->Stencil._TestTwoSide;
-
-   if (key->stencil) {
-      key->stencil_func[0] = ctx->Stencil.Function[0];
-      key->stencil_fail_op[0] = ctx->Stencil.FailFunc[0];
-      key->stencil_pass_depth_fail_op[0] = ctx->Stencil.ZFailFunc[0];
-      key->stencil_pass_depth_pass_op[0] = ctx->Stencil.ZPassFunc[0];
-      key->stencil_write_mask[0] = ctx->Stencil.WriteMask[0];
-      key->stencil_test_mask[0] = ctx->Stencil.ValueMask[0];
-   }
-   if (key->stencil_two_side) {
-      key->stencil_func[1] = ctx->Stencil.Function[back];
-      key->stencil_fail_op[1] = ctx->Stencil.FailFunc[back];
-      key->stencil_pass_depth_fail_op[1] = ctx->Stencil.ZFailFunc[back];
-      key->stencil_pass_depth_pass_op[1] = ctx->Stencil.ZPassFunc[back];
-      key->stencil_write_mask[1] = ctx->Stencil.WriteMask[back];
-      key->stencil_test_mask[1] = ctx->Stencil.ValueMask[back];
-   }
-
-   key->depth_test = ctx->Depth.Test;
-   if (key->depth_test) {
-      key->depth_func = ctx->Depth.Func;
-      key->depth_write = ctx->Depth.Mask;
-   }
-}
-
-/**
- * Creates the state cache entry for the given DEPTH_STENCIL_STATE state key.
- */
-static drm_intel_bo *
-depth_stencil_state_create_from_key(struct brw_context *brw,
-				    struct brw_depth_stencil_state_key *key)
-{
-   struct gen6_depth_stencil_state ds;
-   drm_intel_bo *bo;
-
-   memset(&ds, 0, sizeof(ds));
-
-   /* _NEW_STENCIL */
-   if (key->stencil) {
-      ds.ds0.stencil_enable = 1;
-      ds.ds0.stencil_func =
-	 intel_translate_compare_func(key->stencil_func[0]);
-      ds.ds0.stencil_fail_op =
-	 intel_translate_stencil_op(key->stencil_fail_op[0]);
-      ds.ds0.stencil_pass_depth_fail_op =
-	 intel_translate_stencil_op(key->stencil_pass_depth_fail_op[0]);
-      ds.ds0.stencil_pass_depth_pass_op =
-	 intel_translate_stencil_op(key->stencil_pass_depth_pass_op[0]);
-      ds.ds1.stencil_write_mask = key->stencil_write_mask[0];
-      ds.ds1.stencil_test_mask = key->stencil_test_mask[0];
-
-      if (key->stencil_two_side) {
-	 ds.ds0.bf_stencil_enable = 1;
-	 ds.ds0.bf_stencil_func =
-	    intel_translate_compare_func(key->stencil_func[1]);
-	 ds.ds0.bf_stencil_fail_op =
-	    intel_translate_stencil_op(key->stencil_fail_op[1]);
-	 ds.ds0.bf_stencil_pass_depth_fail_op =
-	    intel_translate_stencil_op(key->stencil_pass_depth_fail_op[1]);
-	 ds.ds0.bf_stencil_pass_depth_pass_op =
-	    intel_translate_stencil_op(key->stencil_pass_depth_pass_op[1]);
-	 ds.ds1.bf_stencil_write_mask = key->stencil_write_mask[1];
-	 ds.ds1.bf_stencil_test_mask = key->stencil_test_mask[1];
+   if (ctx->Stencil._Enabled) {
+      int back = ctx->Stencil._BackFace;
+
+      ds->ds0.stencil_enable = 1;
+      ds->ds0.stencil_func =
+	 intel_translate_compare_func(ctx->Stencil.Function[0]);
+      ds->ds0.stencil_fail_op =
+	 intel_translate_stencil_op(ctx->Stencil.FailFunc[0]);
+      ds->ds0.stencil_pass_depth_fail_op =
+	 intel_translate_stencil_op(ctx->Stencil.ZFailFunc[0]);
+      ds->ds0.stencil_pass_depth_pass_op =
+	 intel_translate_stencil_op(ctx->Stencil.ZPassFunc[0]);
+      ds->ds1.stencil_write_mask = ctx->Stencil.WriteMask[0];
+      ds->ds1.stencil_test_mask = ctx->Stencil.ValueMask[0];
+
+      if (ctx->Stencil._TestTwoSide) {
+	 ds->ds0.bf_stencil_enable = 1;
+	 ds->ds0.bf_stencil_func =
+	    intel_translate_compare_func(ctx->Stencil.Function[back]);
+	 ds->ds0.bf_stencil_fail_op =
+	    intel_translate_stencil_op(ctx->Stencil.FailFunc[back]);
+	 ds->ds0.bf_stencil_pass_depth_fail_op =
+	    intel_translate_stencil_op(ctx->Stencil.ZFailFunc[back]);
+	 ds->ds0.bf_stencil_pass_depth_pass_op =
+	    intel_translate_stencil_op(ctx->Stencil.ZPassFunc[back]);
+	 ds->ds1.bf_stencil_write_mask = ctx->Stencil.WriteMask[back];
+	 ds->ds1.bf_stencil_test_mask = ctx->Stencil.ValueMask[back];
       }
 
       /* Not really sure about this:
        */
-      if (key->stencil_write_mask[0] ||
-	  (key->stencil_two_side && key->stencil_write_mask[1]))
-	 ds.ds0.stencil_write_enable = 1;
+      if (ctx->Stencil.WriteMask[0] ||
+	  (ctx->Stencil._TestTwoSide && ctx->Stencil.WriteMask[back]))
+	 ds->ds0.stencil_write_enable = 1;
    }
 
    /* _NEW_DEPTH */
-   if (key->depth_test) {
-      ds.ds2.depth_test_enable = 1;
-      ds.ds2.depth_test_func = intel_translate_compare_func(key->depth_func);
-      ds.ds2.depth_write_enable = key->depth_write;
+   if (ctx->Depth.Test) {
+      ds->ds2.depth_test_enable = 1;
+      ds->ds2.depth_test_func = intel_translate_compare_func(ctx->Depth.Func);
+      ds->ds2.depth_write_enable = ctx->Depth.Mask;
    }
 
-   bo = brw_upload_cache(&brw->cache, BRW_DEPTH_STENCIL_STATE,
-			 key, sizeof(*key),
-			 NULL, 0,
-			 &ds, sizeof(ds));
-
-   return bo;
-}
-
-static void
-prepare_depth_stencil_state(struct brw_context *brw)
-{
-   struct brw_depth_stencil_state_key key;
-
-   depth_stencil_state_populate_key(brw, &key);
-
-   drm_intel_bo_unreference(brw->cc.depth_stencil_state_bo);
-   brw->cc.depth_stencil_state_bo = brw_search_cache(&brw->cache,
-						     BRW_DEPTH_STENCIL_STATE,
-						     &key, sizeof(key),
-						     NULL, 0,
-						     NULL);
-
-   if (brw->cc.depth_stencil_state_bo == NULL)
-      brw->cc.depth_stencil_state_bo =
-	 depth_stencil_state_create_from_key(brw, &key);
+   brw->state.dirty.cache |= CACHE_NEW_DEPTH_STENCIL_STATE;
 }
 
 const struct brw_tracked_state gen6_depth_stencil_state = {
    .dirty = {
       .mesa = _NEW_DEPTH | _NEW_STENCIL,
-      .brw = 0,
+      .brw = BRW_NEW_BATCH,
       .cache = 0,
    },
-   .prepare = prepare_depth_stencil_state,
+   .prepare = gen6_prepare_depth_stencil_state,
 };
diff --git a/src/mesa/drivers/dri/i965/gen6_sampler_state.c b/src/mesa/drivers/dri/i965/gen6_sampler_state.c
index f65c651bdff..4cdec699df6 100644
--- a/src/mesa/drivers/dri/i965/gen6_sampler_state.c
+++ b/src/mesa/drivers/dri/i965/gen6_sampler_state.c
@@ -43,27 +43,15 @@ upload_sampler_state_pointers(struct brw_context *brw)
 	     (4 - 2));
    OUT_BATCH(0); /* VS */
    OUT_BATCH(0); /* GS */
-   if (brw->wm.sampler_bo)
-      OUT_RELOC(brw->wm.sampler_bo, I915_GEM_DOMAIN_INSTRUCTION, 0, 0);
-   else
-      OUT_BATCH(0);
-
+   OUT_BATCH(brw->wm.sampler_offset);
    ADVANCE_BATCH();
 }
 
-
-static void
-prepare_sampler_state_pointers(struct brw_context *brw)
-{
-   brw_add_validated_bo(brw, brw->wm.sampler_bo);
-}
-
 const struct brw_tracked_state gen6_sampler_state = {
    .dirty = {
       .mesa = 0,
       .brw = BRW_NEW_BATCH,
       .cache = CACHE_NEW_SAMPLER
    },
-   .prepare = prepare_sampler_state_pointers,
    .emit = upload_sampler_state_pointers,
 };
diff --git a/src/mesa/drivers/dri/i965/gen6_scissor_state.c b/src/mesa/drivers/dri/i965/gen6_scissor_state.c
index 12b65826ae9..fad3ca0dd04 100644
--- a/src/mesa/drivers/dri/i965/gen6_scissor_state.c
+++ b/src/mesa/drivers/dri/i965/gen6_scissor_state.c
@@ -31,11 +31,15 @@
 #include "intel_batchbuffer.h"
 
 static void
-prepare_scissor_state(struct brw_context *brw)
+gen6_prepare_scissor_state(struct brw_context *brw)
 {
-   struct gl_context *ctx = &brw->intel.ctx;
+   struct intel_context *intel = &brw->intel;
+   struct gl_context *ctx = &intel->ctx;
    const GLboolean render_to_fbo = (ctx->DrawBuffer->Name != 0);
-   struct gen6_scissor_rect scissor;
+   struct gen6_scissor_rect *scissor;
+   uint32_t scissor_state_offset;
+
+   scissor = brw_state_batch(brw, sizeof(*scissor), 32, &scissor_state_offset);
 
    /* _NEW_SCISSOR | _NEW_BUFFERS | _NEW_VIEWPORT */
 
@@ -54,62 +58,36 @@ prepare_scissor_state(struct brw_context *brw)
        * anything.  Instead, just provide a min > max scissor inside
        * the bounds, which produces the expected no rendering.
        */
-      scissor.xmin = 1;
-      scissor.xmax = 0;
-      scissor.ymin = 1;
-      scissor.ymax = 0;
+      scissor->xmin = 1;
+      scissor->xmax = 0;
+      scissor->ymin = 1;
+      scissor->ymax = 0;
    } else if (render_to_fbo) {
       /* texmemory: Y=0=bottom */
-      scissor.xmin = ctx->DrawBuffer->_Xmin;
-      scissor.xmax = ctx->DrawBuffer->_Xmax - 1;
-      scissor.ymin = ctx->DrawBuffer->_Ymin;
-      scissor.ymax = ctx->DrawBuffer->_Ymax - 1;
+      scissor->xmin = ctx->DrawBuffer->_Xmin;
+      scissor->xmax = ctx->DrawBuffer->_Xmax - 1;
+      scissor->ymin = ctx->DrawBuffer->_Ymin;
+      scissor->ymax = ctx->DrawBuffer->_Ymax - 1;
    }
    else {
       /* memory: Y=0=top */
-      scissor.xmin = ctx->DrawBuffer->_Xmin;
-      scissor.xmax = ctx->DrawBuffer->_Xmax - 1;
-      scissor.ymin = ctx->DrawBuffer->Height - ctx->DrawBuffer->_Ymax;
-      scissor.ymax = ctx->DrawBuffer->Height - ctx->DrawBuffer->_Ymin - 1;
+      scissor->xmin = ctx->DrawBuffer->_Xmin;
+      scissor->xmax = ctx->DrawBuffer->_Xmax - 1;
+      scissor->ymin = ctx->DrawBuffer->Height - ctx->DrawBuffer->_Ymax;
+      scissor->ymax = ctx->DrawBuffer->Height - ctx->DrawBuffer->_Ymin - 1;
    }
 
-   drm_intel_bo_unreference(brw->sf.state_bo);
-   brw->sf.state_bo = brw_cache_data(&brw->cache, BRW_SF_UNIT,
-				     &scissor, sizeof(scissor));
-}
-
-const struct brw_tracked_state gen6_scissor_state = {
-   .dirty = {
-      .mesa = _NEW_SCISSOR | _NEW_BUFFERS | _NEW_VIEWPORT,
-      .brw = 0,
-      .cache = 0,
-   },
-   .prepare = prepare_scissor_state,
-};
-
-static void upload_scissor_state_pointers(struct brw_context *brw)
-{
-   struct intel_context *intel = &brw->intel;
-
    BEGIN_BATCH(2);
    OUT_BATCH(_3DSTATE_SCISSOR_STATE_POINTERS << 16 | (2 - 2));
-   OUT_RELOC(brw->sf.state_bo, I915_GEM_DOMAIN_INSTRUCTION, 0, 0);
+   OUT_BATCH(scissor_state_offset);
    ADVANCE_BATCH();
-
 }
 
-
-static void prepare_scissor_state_pointers(struct brw_context *brw)
-{
-   brw_add_validated_bo(brw, brw->sf.state_bo);
-}
-
-const struct brw_tracked_state gen6_scissor_state_pointers = {
+const struct brw_tracked_state gen6_scissor_state = {
    .dirty = {
-      .mesa = 0,
+      .mesa = _NEW_SCISSOR | _NEW_BUFFERS | _NEW_VIEWPORT,
       .brw = BRW_NEW_BATCH,
-      .cache = CACHE_NEW_SF_UNIT
+      .cache = 0,
    },
-   .prepare = prepare_scissor_state_pointers,
-   .emit = upload_scissor_state_pointers,
+   .prepare = gen6_prepare_scissor_state,
 };
diff --git a/src/mesa/drivers/dri/i965/gen6_viewport_state.c b/src/mesa/drivers/dri/i965/gen6_viewport_state.c
index cd7d209e3ea..4116bdb96de 100644
--- a/src/mesa/drivers/dri/i965/gen6_viewport_state.c
+++ b/src/mesa/drivers/dri/i965/gen6_viewport_state.c
@@ -41,22 +41,22 @@
 static void
 prepare_clip_vp(struct brw_context *brw)
 {
-   struct brw_clipper_viewport vp;
+   struct brw_clipper_viewport *vp;
 
-   vp.xmin = -1.0;
-   vp.xmax = 1.0;
-   vp.ymin = -1.0;
-   vp.ymax = 1.0;
+   vp = brw_state_batch(brw, sizeof(*vp), 32, &brw->clip.vp_offset);
 
-   drm_intel_bo_unreference(brw->clip.vp_bo);
-   brw->clip.vp_bo = brw_cache_data(&brw->cache, BRW_CLIP_VP,
-				    &vp, sizeof(vp));
+   vp->xmin = -1.0;
+   vp->xmax = 1.0;
+   vp->ymin = -1.0;
+   vp->ymax = 1.0;
+
+   brw->state.dirty.cache |= CACHE_NEW_CLIP_VP;
 }
 
 const struct brw_tracked_state gen6_clip_vp = {
    .dirty = {
-      .mesa = _NEW_VIEWPORT, /* XXX: not really, but we need nonzero */
-      .brw = 0,
+      .mesa = 0,
+      .brw = BRW_NEW_BATCH,
       .cache = 0,
    },
    .prepare = prepare_clip_vp,
@@ -67,12 +67,13 @@ prepare_sf_vp(struct brw_context *brw)
 {
    struct gl_context *ctx = &brw->intel.ctx;
    const GLfloat depth_scale = 1.0F / ctx->DrawBuffer->_DepthMaxF;
-   struct brw_sf_viewport sfv;
+   struct brw_sf_viewport *sfv;
    GLfloat y_scale, y_bias;
    const GLboolean render_to_fbo = (ctx->DrawBuffer->Name != 0);
    const GLfloat *v = ctx->Viewport._WindowMap.m;
 
-   memset(&sfv, 0, sizeof(sfv));
+   sfv = brw_state_batch(brw, sizeof(*sfv), 32, &brw->sf.vp_offset);
+   memset(sfv, 0, sizeof(*sfv));
 
    /* _NEW_BUFFERS */
    if (render_to_fbo) {
@@ -84,34 +85,25 @@ prepare_sf_vp(struct brw_context *brw)
    }
 
    /* _NEW_VIEWPORT */
-   sfv.viewport.m00 = v[MAT_SX];
-   sfv.viewport.m11 = v[MAT_SY] * y_scale;
-   sfv.viewport.m22 = v[MAT_SZ] * depth_scale;
-   sfv.viewport.m30 = v[MAT_TX];
-   sfv.viewport.m31 = v[MAT_TY] * y_scale + y_bias;
-   sfv.viewport.m32 = v[MAT_TZ] * depth_scale;
+   sfv->viewport.m00 = v[MAT_SX];
+   sfv->viewport.m11 = v[MAT_SY] * y_scale;
+   sfv->viewport.m22 = v[MAT_SZ] * depth_scale;
+   sfv->viewport.m30 = v[MAT_TX];
+   sfv->viewport.m31 = v[MAT_TY] * y_scale + y_bias;
+   sfv->viewport.m32 = v[MAT_TZ] * depth_scale;
 
-   drm_intel_bo_unreference(brw->sf.vp_bo);
-   brw->sf.vp_bo = brw_cache_data(&brw->cache, BRW_SF_VP,
-				  &sfv, sizeof(sfv));
+   brw->state.dirty.cache |= CACHE_NEW_SF_VP;
 }
 
 const struct brw_tracked_state gen6_sf_vp = {
    .dirty = {
       .mesa = _NEW_VIEWPORT | _NEW_BUFFERS,
-      .brw = 0,
+      .brw = BRW_NEW_BATCH,
       .cache = 0,
    },
    .prepare = prepare_sf_vp,
 };
 
-static void prepare_viewport_state_pointers(struct brw_context *brw)
-{
-   brw_add_validated_bo(brw, brw->clip.vp_bo);
-   brw_add_validated_bo(brw, brw->sf.vp_bo);
-   brw_add_validated_bo(brw, brw->cc.vp_bo);
-}
-
 static void upload_viewport_state_pointers(struct brw_context *brw)
 {
    struct intel_context *intel = &brw->intel;
@@ -121,9 +113,9 @@ static void upload_viewport_state_pointers(struct brw_context *brw)
 	     GEN6_CC_VIEWPORT_MODIFY |
 	     GEN6_SF_VIEWPORT_MODIFY |
 	     GEN6_CLIP_VIEWPORT_MODIFY);
-   OUT_RELOC(brw->clip.vp_bo, I915_GEM_DOMAIN_INSTRUCTION, 0, 0);
-   OUT_RELOC(brw->sf.vp_bo, I915_GEM_DOMAIN_INSTRUCTION, 0, 0);
-   OUT_RELOC(brw->cc.vp_bo, I915_GEM_DOMAIN_INSTRUCTION, 0, 0);
+   OUT_BATCH(brw->clip.vp_offset);
+   OUT_BATCH(brw->sf.vp_offset);
+   OUT_BATCH(brw->cc.vp_offset);
    ADVANCE_BATCH();
 }
 
@@ -135,6 +127,5 @@ const struct brw_tracked_state gen6_viewport_state = {
 		CACHE_NEW_SF_VP |
 		CACHE_NEW_CC_VP)
    },
-   .prepare = prepare_viewport_state_pointers,
    .emit = upload_viewport_state_pointers,
 };
diff --git a/src/mesa/drivers/dri/i965/gen6_vs_state.c b/src/mesa/drivers/dri/i965/gen6_vs_state.c
index a10cec318d6..b46368e36e2 100644
--- a/src/mesa/drivers/dri/i965/gen6_vs_state.c
+++ b/src/mesa/drivers/dri/i965/gen6_vs_state.c
@@ -34,43 +34,36 @@
 #include "intel_batchbuffer.h"
 
 static void
-upload_vs_state(struct brw_context *brw)
+gen6_prepare_vs_push_constants(struct brw_context *brw)
 {
    struct intel_context *intel = &brw->intel;
    struct gl_context *ctx = &intel->ctx;
+   /* BRW_NEW_VERTEX_PROGRAM */
    const struct brw_vertex_program *vp =
       brw_vertex_program_const(brw->vertex_program);
    unsigned int nr_params = brw->vs.prog_data->nr_params / 4;
-   drm_intel_bo *constant_bo;
-   int i;
 
+   if (brw->vertex_program->IsNVProgram)
+      _mesa_load_tracked_matrices(ctx);
+
+   /* Updates the ParameterValues[i] pointers for all parameters of the
+    * basic type of PROGRAM_STATE_VAR.
+    */
+   /* XXX: Should this happen somewhere before to get our state flag set? */
+   _mesa_load_state_parameters(ctx, vp->program.Base.Parameters);
+
+   /* CACHE_NEW_VS_PROG | _NEW_TRANSFORM */
    if (brw->vs.prog_data->nr_params == 0 && !ctx->Transform.ClipPlanesEnabled) {
-      /* Disable the push constant buffers. */
-      BEGIN_BATCH(5);
-      OUT_BATCH(_3DSTATE_CONSTANT_VS << 16 | (5 - 2));
-      OUT_BATCH(0);
-      OUT_BATCH(0);
-      OUT_BATCH(0);
-      OUT_BATCH(0);
-      ADVANCE_BATCH();
+      brw->vs.push_const_size = 0;
    } else {
-      int params_uploaded = 0, param_regs;
+      int params_uploaded = 0;
       float *param;
+      int i;
 
-      if (brw->vertex_program->IsNVProgram)
-	 _mesa_load_tracked_matrices(ctx);
-
-      /* Updates the ParamaterValues[i] pointers for all parameters of the
-       * basic type of PROGRAM_STATE_VAR.
-       */
-      _mesa_load_state_parameters(ctx, vp->program.Base.Parameters);
-
-      constant_bo = drm_intel_bo_alloc(intel->bufmgr, "VS constant_bo",
-				       (MAX_CLIP_PLANES + nr_params) *
-				       4 * sizeof(float),
-				       4096);
-      drm_intel_gem_bo_map_gtt(constant_bo);
-      param = constant_bo->virtual;
+      param = brw_state_batch(brw,
+			      (MAX_CLIP_PLANES + nr_params) *
+			      4 * sizeof(float),
+			      32, &brw->vs.push_const_offset);
 
       /* This should be loaded like any other param, but it's ad-hoc
        * until we redo the VS backend.
@@ -100,30 +93,56 @@ upload_vs_state(struct brw_context *brw)
       if (0) {
 	 printf("VS constant buffer:\n");
 	 for (i = 0; i < params_uploaded; i++) {
-	    float *buf = (float *)constant_bo->virtual + i * 4;
+	    float *buf = param + i * 4;
 	    printf("%d: %f %f %f %f\n",
 		   i, buf[0], buf[1], buf[2], buf[3]);
 	 }
       }
 
-      drm_intel_gem_bo_unmap_gtt(constant_bo);
+      brw->vs.push_const_size = (params_uploaded + 1) / 2;
+      /* We can only push 32 registers of constants at a time. */
+      assert(brw->vs.push_const_size <= 32);
+   }
+}
+
+const struct brw_tracked_state gen6_vs_constants = {
+   .dirty = {
+      .mesa  = _NEW_TRANSFORM | _NEW_PROGRAM_CONSTANTS,
+      .brw   = (BRW_NEW_BATCH |
+		BRW_NEW_VERTEX_PROGRAM),
+      .cache = 0,
+   },
+   .prepare = gen6_prepare_vs_push_constants,
+};
 
-      param_regs = (params_uploaded + 1) / 2;
-      assert(param_regs <= 32);
+static void
+upload_vs_state(struct brw_context *brw)
+{
+   struct intel_context *intel = &brw->intel;
 
+   if (brw->vs.push_const_size == 0) {
+      /* Disable the push constant buffers. */
+      BEGIN_BATCH(5);
+      OUT_BATCH(_3DSTATE_CONSTANT_VS << 16 | (5 - 2));
+      OUT_BATCH(0);
+      OUT_BATCH(0);
+      OUT_BATCH(0);
+      OUT_BATCH(0);
+      ADVANCE_BATCH();
+   } else {
       BEGIN_BATCH(5);
       OUT_BATCH(_3DSTATE_CONSTANT_VS << 16 |
 		GEN6_CONSTANT_BUFFER_0_ENABLE |
 		(5 - 2));
-      OUT_RELOC(constant_bo,
-		I915_GEM_DOMAIN_RENDER, 0, /* XXX: bad domain */
-		param_regs - 1);
+      /* Pointer to the VS constant buffer.  Covered by the set of
+       * state flags from gen6_prepare_vs_push_constants
+       */
+      OUT_BATCH(brw->vs.push_const_offset +
+		brw->vs.push_const_size - 1);
       OUT_BATCH(0);
       OUT_BATCH(0);
       OUT_BATCH(0);
       ADVANCE_BATCH();
-
-      drm_intel_bo_unreference(constant_bo);
    }
 
    BEGIN_BATCH(6);
@@ -149,7 +168,9 @@ const struct brw_tracked_state gen6_vs_state = {
       .brw   = (BRW_NEW_CURBE_OFFSETS |
                 BRW_NEW_NR_VS_SURFACES |
 		BRW_NEW_URB_FENCE |
-		BRW_NEW_CONTEXT),
+		BRW_NEW_CONTEXT |
+		BRW_NEW_VERTEX_PROGRAM |
+		BRW_NEW_BATCH),
       .cache = CACHE_NEW_VS_PROG
    },
    .emit = upload_vs_state,
diff --git a/src/mesa/drivers/dri/i965/gen6_wm_state.c b/src/mesa/drivers/dri/i965/gen6_wm_state.c
index 8215cb15a9c..33b233414c6 100644
--- a/src/mesa/drivers/dri/i965/gen6_wm_state.c
+++ b/src/mesa/drivers/dri/i965/gen6_wm_state.c
@@ -35,16 +35,13 @@
 #include "intel_batchbuffer.h"
 
 static void
-prepare_wm_constants(struct brw_context *brw)
+gen6_prepare_wm_push_constants(struct brw_context *brw)
 {
    struct intel_context *intel = &brw->intel;
    struct gl_context *ctx = &intel->ctx;
    const struct brw_fragment_program *fp =
       brw_fragment_program_const(brw->fragment_program);
 
-   drm_intel_bo_unreference(brw->wm.push_const_bo);
-   brw->wm.push_const_bo = NULL;
-
    /* Updates the ParamaterValues[i] pointers for all parameters of the
     * basic type of PROGRAM_STATE_VAR.
     */
@@ -55,13 +52,11 @@ prepare_wm_constants(struct brw_context *brw)
       float *constants;
       unsigned int i;
 
-      brw->wm.push_const_bo = drm_intel_bo_alloc(intel->bufmgr,
-						 "WM constant_bo",
-						 brw->wm.prog_data->nr_params *
-						 sizeof(float),
-						 4096);
-      drm_intel_gem_bo_map_gtt(brw->wm.push_const_bo);
-      constants = brw->wm.push_const_bo->virtual;
+      constants = brw_state_batch(brw,
+				  brw->wm.prog_data->nr_params *
+				  sizeof(float),
+				  32, &brw->wm.push_const_offset);
+
       for (i = 0; i < brw->wm.prog_data->nr_params; i++) {
 	 constants[i] = convert_param(brw->wm.prog_data->param_convert[i],
 				      *brw->wm.prog_data->param[i]);
@@ -80,18 +75,17 @@ prepare_wm_constants(struct brw_context *brw)
 	    printf("\n");
 	 printf("\n");
       }
-
-      drm_intel_gem_bo_unmap_gtt(brw->wm.push_const_bo);
    }
 }
 
 const struct brw_tracked_state gen6_wm_constants = {
    .dirty = {
       .mesa  = _NEW_PROGRAM_CONSTANTS,
-      .brw   = BRW_NEW_FRAGMENT_PROGRAM,
+      .brw   = (BRW_NEW_BATCH |
+		BRW_NEW_FRAGMENT_PROGRAM),
       .cache = 0,
    },
-   .prepare = prepare_wm_constants,
+   .prepare = gen6_prepare_wm_push_constants,
 };
 
 static void
@@ -118,8 +112,10 @@ upload_wm_state(struct brw_context *brw)
       OUT_BATCH(_3DSTATE_CONSTANT_PS << 16 |
 		GEN6_CONSTANT_BUFFER_0_ENABLE |
 		(5 - 2));
-      OUT_RELOC(brw->wm.push_const_bo,
-		I915_GEM_DOMAIN_RENDER, 0, /* XXX: bad domain */
+      /* Pointer to the WM constant buffer.  Covered by the set of
+       * state flags from gen6_prepare_wm_push_constants
+       */
+      OUT_BATCH(brw->wm.push_const_offset +
 		ALIGN(brw->wm.prog_data->nr_params,
 		      brw->wm.prog_data->dispatch_width) / 8 - 1);
       OUT_BATCH(0);
@@ -143,14 +139,19 @@ upload_wm_state(struct brw_context *brw)
    dw2 |= (ALIGN(brw->wm.sampler_count, 4) / 4) << GEN6_WM_SAMPLER_COUNT_SHIFT;
    dw4 |= (brw->wm.prog_data->first_curbe_grf <<
 	   GEN6_WM_DISPATCH_START_GRF_SHIFT_0);
+   dw4 |= (brw->wm.prog_data->first_curbe_grf_16 <<
+	   GEN6_WM_DISPATCH_START_GRF_SHIFT_2);
 
    dw5 |= (brw->wm_max_threads - 1) << GEN6_WM_MAX_THREADS_SHIFT;
 
    /* CACHE_NEW_WM_PROG */
-   if (brw->wm.prog_data->dispatch_width == 8)
+   if (brw->wm.prog_data->dispatch_width == 8) {
       dw5 |= GEN6_WM_8_DISPATCH_ENABLE;
-   else
+      if (brw->wm.prog_data->prog_offset_16)
+	 dw5 |= GEN6_WM_16_DISPATCH_ENABLE;
+   } else {
       dw5 |= GEN6_WM_16_DISPATCH_ENABLE;
+   }
 
    /* _NEW_LINE */
    if (ctx->Line.StippleFlag)
@@ -194,7 +195,12 @@ upload_wm_state(struct brw_context *brw)
    OUT_BATCH(dw5);
    OUT_BATCH(dw6);
    OUT_BATCH(0); /* kernel 1 pointer */
-   OUT_BATCH(0); /* kernel 2 pointer */
+   if (brw->wm.prog_data->prog_offset_16) {
+      OUT_RELOC(brw->wm.prog_bo, I915_GEM_DOMAIN_INSTRUCTION, 0,
+		brw->wm.prog_data->prog_offset_16);
+   } else {
+      OUT_BATCH(0); /* kernel 2 pointer */
+   }
    ADVANCE_BATCH();
 }
 
diff --git a/src/mesa/drivers/dri/intel/intel_batchbuffer.c b/src/mesa/drivers/dri/intel/intel_batchbuffer.c
index 53d6e7c6acc..377989bcc14 100644
--- a/src/mesa/drivers/dri/intel/intel_batchbuffer.c
+++ b/src/mesa/drivers/dri/intel/intel_batchbuffer.c
@@ -128,6 +128,11 @@ _intel_batchbuffer_flush(struct intel_context *intel,
    if (intel->batch.used == 0)
       return;
 
+   if (intel->first_post_swapbuffers_batch == NULL) {
+      intel->first_post_swapbuffers_batch = intel->batch.bo;
+      drm_intel_bo_reference(intel->first_post_swapbuffers_batch);
+   }
+
    if (unlikely(INTEL_DEBUG & DEBUG_BATCH))
       fprintf(stderr, "%s:%d: Batchbuffer flush with %db used\n", file, line,
 	      4*intel->batch.used);
diff --git a/src/mesa/drivers/dri/intel/intel_clear.c b/src/mesa/drivers/dri/intel/intel_clear.c
index 82d29e76712..5a96232107e 100644
--- a/src/mesa/drivers/dri/intel/intel_clear.c
+++ b/src/mesa/drivers/dri/intel/intel_clear.c
@@ -28,6 +28,7 @@
 
 #include "main/glheader.h"
 #include "main/mtypes.h"
+#include "main/condrender.h"
 #include "swrast/swrast.h"
 #include "drivers/common/meta.h"
 
@@ -88,6 +89,9 @@ intelClear(struct gl_context *ctx, GLbitfield mask)
    struct intel_renderbuffer *irb;
    int i;
 
+   if (!_mesa_check_conditional_render(ctx))
+      return;
+
    if (mask & (BUFFER_BIT_FRONT_LEFT | BUFFER_BIT_FRONT_RIGHT)) {
       intel->front_buffer_dirty = GL_TRUE;
    }
diff --git a/src/mesa/drivers/dri/intel/intel_context.c b/src/mesa/drivers/dri/intel/intel_context.c
index 02e7f7717fc..acdf35fc71b 100644
--- a/src/mesa/drivers/dri/intel/intel_context.c
+++ b/src/mesa/drivers/dri/intel/intel_context.c
@@ -466,9 +466,11 @@ intel_prepare_render(struct intel_context *intel)
     * the swap, and getting our hands on that doesn't seem worth it,
     * so we just us the first batch we emitted after the last swap.
     */
-   if (intel->need_throttle) {
-       drmCommandNone(intel->driFd, DRM_I915_GEM_THROTTLE);
-       intel->need_throttle = GL_FALSE;
+   if (intel->need_throttle && intel->first_post_swapbuffers_batch) {
+      drm_intel_bo_wait_rendering(intel->first_post_swapbuffers_batch);
+      drm_intel_bo_unreference(intel->first_post_swapbuffers_batch);
+      intel->first_post_swapbuffers_batch = NULL;
+      intel->need_throttle = GL_FALSE;
    }
 }
 
@@ -650,27 +652,23 @@ intelInitContext(struct intel_context *intel,
    intel->driFd = sPriv->fd;
 
    intel->has_xrgb_textures = GL_TRUE;
+   intel->gen = intelScreen->gen;
    if (IS_GEN6(intel->intelScreen->deviceID)) {
-      intel->gen = 6;
       intel->needs_ff_sync = GL_TRUE;
       intel->has_luminance_srgb = GL_TRUE;
    } else if (IS_GEN5(intel->intelScreen->deviceID)) {
-      intel->gen = 5;
       intel->needs_ff_sync = GL_TRUE;
       intel->has_luminance_srgb = GL_TRUE;
    } else if (IS_965(intel->intelScreen->deviceID)) {
-      intel->gen = 4;
       if (IS_G4X(intel->intelScreen->deviceID)) {
 	  intel->has_luminance_srgb = GL_TRUE;
 	  intel->is_g4x = GL_TRUE;
       }
    } else if (IS_9XX(intel->intelScreen->deviceID)) {
-      intel->gen = 3;
       if (IS_945(intel->intelScreen->deviceID)) {
 	 intel->is_945 = GL_TRUE;
       }
    } else {
-      intel->gen = 2;
       if (intel->intelScreen->deviceID == PCI_CHIP_I830_M ||
 	  intel->intelScreen->deviceID == PCI_CHIP_845_G) {
 	 intel->has_xrgb_textures = GL_FALSE;
@@ -718,6 +716,12 @@ intelInitContext(struct intel_context *intel,
    ctx->TextureFormatSupported[MESA_FORMAT_RGBA_DXT5] = GL_TRUE;
 
 #ifndef I915
+   /* GL_ARB_texture_compression_rgtc */
+   ctx->TextureFormatSupported[MESA_FORMAT_RED_RGTC1] = GL_TRUE;
+   ctx->TextureFormatSupported[MESA_FORMAT_SIGNED_RED_RGTC1] = GL_TRUE;
+   ctx->TextureFormatSupported[MESA_FORMAT_RG_RGTC2] = GL_TRUE;
+   ctx->TextureFormatSupported[MESA_FORMAT_SIGNED_RG_RGTC2] = GL_TRUE;
+
    /* GL_ARB_texture_rg */
    ctx->TextureFormatSupported[MESA_FORMAT_R8] = GL_TRUE;
    ctx->TextureFormatSupported[MESA_FORMAT_R16] = GL_TRUE;
@@ -936,6 +940,8 @@ intelDestroyContext(__DRIcontext * driContextPriv)
       intel->prim.vb = NULL;
       drm_intel_bo_unreference(intel->prim.vb_bo);
       intel->prim.vb_bo = NULL;
+      drm_intel_bo_unreference(intel->first_post_swapbuffers_batch);
+      intel->first_post_swapbuffers_batch = NULL;
 
       driDestroyOptionCache(&intel->optionCache);
 
diff --git a/src/mesa/drivers/dri/intel/intel_context.h b/src/mesa/drivers/dri/intel/intel_context.h
index c59119373da..d3a8a659caa 100644
--- a/src/mesa/drivers/dri/intel/intel_context.h
+++ b/src/mesa/drivers/dri/intel/intel_context.h
@@ -182,6 +182,7 @@ struct intel_context
       bool is_blit;
    } batch;
 
+   drm_intel_bo *first_post_swapbuffers_batch;
    GLboolean need_throttle;
    GLboolean no_batch_wrap;
 
diff --git a/src/mesa/drivers/dri/intel/intel_extensions.c b/src/mesa/drivers/dri/intel/intel_extensions.c
index e107534a4da..3fd987abd8c 100644
--- a/src/mesa/drivers/dri/intel/intel_extensions.c
+++ b/src/mesa/drivers/dri/intel/intel_extensions.c
@@ -91,6 +91,7 @@ static const struct dri_extension card_extensions[] = {
    { "GL_ARB_pixel_buffer_object",      NULL },
    { "GL_ARB_point_parameters",           GL_ARB_point_parameters_functions },
    { "GL_ARB_point_sprite",               NULL },
+   { "GL_ARB_sampler_objects",            NULL },
    { "GL_ARB_shader_objects",             GL_ARB_shader_objects_functions },
    { "GL_ARB_shading_language_100",       GL_VERSION_2_0_functions },
    { "GL_ARB_sync",                       GL_ARB_sync_functions },
@@ -176,6 +177,7 @@ static const struct dri_extension brw_extensions[] = {
    { "GL_ARB_texture_float",              NULL },
 #endif
    { "GL_MESA_texture_signed_rgba",       NULL },
+   { "GL_ARB_texture_compression_rgtc",   NULL },
    { "GL_ARB_texture_non_power_of_two",   NULL },
    { "GL_ARB_texture_rg",                 NULL },
    { "GL_EXT_draw_buffers2",              GL_EXT_draw_buffers2_functions },
@@ -189,6 +191,7 @@ static const struct dri_extension brw_extensions[] = {
    { "GL_ATI_envmap_bumpmap",             GL_ATI_envmap_bumpmap_functions },
    { "GL_ATI_separate_stencil",           GL_ATI_separate_stencil_functions },
    { "GL_ATI_texture_env_combine3",       NULL },
+   { "GL_NV_conditional_render",          NULL },
    { "GL_NV_texture_env_combine4",        NULL },
    { NULL,                                NULL }
 };
diff --git a/src/mesa/drivers/dri/intel/intel_pixel_bitmap.c b/src/mesa/drivers/dri/intel/intel_pixel_bitmap.c
index 43cdd0d2bac..64c7acce1e9 100644
--- a/src/mesa/drivers/dri/intel/intel_pixel_bitmap.c
+++ b/src/mesa/drivers/dri/intel/intel_pixel_bitmap.c
@@ -29,6 +29,7 @@
 #include "main/enums.h"
 #include "main/image.h"
 #include "main/colormac.h"
+#include "main/condrender.h"
 #include "main/mtypes.h"
 #include "main/macros.h"
 #include "main/pbo.h"
@@ -68,7 +69,7 @@ static const GLubyte *map_pbo( struct gl_context *ctx,
 
    if (!_mesa_validate_pbo_access(2, unpack, width, height, 1,
 				  GL_COLOR_INDEX, GL_BITMAP,
-				  (GLvoid *) bitmap)) {
+				  INT_MAX, (const GLvoid *) bitmap)) {
       _mesa_error(ctx, GL_INVALID_OPERATION,"glBitmap(invalid PBO access)");
       return NULL;
    }
@@ -329,6 +330,9 @@ intelBitmap(struct gl_context * ctx,
 {
    struct intel_context *intel = intel_context(ctx);
 
+   if (!_mesa_check_conditional_render(ctx))
+      return;
+
    if (do_blit_bitmap(ctx, x, y, width, height,
                           unpack, pixels))
       return;
diff --git a/src/mesa/drivers/dri/intel/intel_pixel_copy.c b/src/mesa/drivers/dri/intel/intel_pixel_copy.c
index a7ca780e944..e83f1bfab94 100644
--- a/src/mesa/drivers/dri/intel/intel_pixel_copy.c
+++ b/src/mesa/drivers/dri/intel/intel_pixel_copy.c
@@ -29,6 +29,7 @@
 #include "main/image.h"
 #include "main/state.h"
 #include "main/mtypes.h"
+#include "main/condrender.h"
 #include "drivers/common/meta.h"
 
 #include "intel_context.h"
@@ -204,6 +205,9 @@ intelCopyPixels(struct gl_context * ctx,
 {
    DBG("%s\n", __FUNCTION__);
 
+   if (!_mesa_check_conditional_render(ctx))
+      return;
+
    if (do_blit_copypixels(ctx, srcx, srcy, width, height, destx, desty, type))
       return;
 
diff --git a/src/mesa/drivers/dri/intel/intel_screen.c b/src/mesa/drivers/dri/intel/intel_screen.c
index 64a21a147f0..5dacbb06633 100644
--- a/src/mesa/drivers/dri/intel/intel_screen.c
+++ b/src/mesa/drivers/dri/intel/intel_screen.c
@@ -216,8 +216,16 @@ intel_create_image(__DRIscreen *screen,
 {
    __DRIimage *image;
    struct intel_screen *intelScreen = screen->private;
+   uint32_t tiling;
    int cpp;
 
+   tiling = I915_TILING_X;
+   if (use & __DRI_IMAGE_USE_CURSOR) {
+      if (width != 64 || height != 64)
+	 return NULL;
+      tiling = I915_TILING_NONE;
+   }
+
    image = CALLOC(sizeof *image);
    if (image == NULL)
       return NULL;
@@ -247,7 +255,7 @@ intel_create_image(__DRIscreen *screen,
    cpp = _mesa_get_format_bytes(image->format);
 
    image->region =
-      intel_region_alloc(intelScreen, I915_TILING_NONE,
+      intel_region_alloc(intelScreen, tiling,
 			 cpp, width, height, GL_TRUE);
    if (image->region == NULL) {
       FREE(image);
@@ -548,6 +556,18 @@ __DRIconfig **intelInitScreen2(__DRIscreen *psp)
       intelScreen->deviceID = strtod(devid_override, NULL);
    }
 
+   if (IS_GEN6(intelScreen->deviceID)) {
+      intelScreen->gen = 6;
+   } else if (IS_GEN5(intelScreen->deviceID)) {
+      intelScreen->gen = 5;
+   } else if (IS_965(intelScreen->deviceID)) {
+      intelScreen->gen = 4;
+   } else if (IS_9XX(intelScreen->deviceID)) {
+      intelScreen->gen = 3;
+   } else {
+      intelScreen->gen = 2;
+   }
+
    api_mask = (1 << __DRI_API_OPENGL);
 #if FEATURE_ES1
    api_mask |= (1 << __DRI_API_GLES);
@@ -660,12 +680,21 @@ intelAllocateBuffer(__DRIscreen *screen,
 {
    struct intel_buffer *intelBuffer;
    struct intel_screen *intelScreen = screen->private;
+   uint32_t tiling;
 
    intelBuffer = CALLOC(sizeof *intelBuffer);
    if (intelBuffer == NULL)
       return NULL;
 
-   intelBuffer->region = intel_region_alloc(intelScreen, I915_TILING_NONE,
+   if ((attachment == __DRI_BUFFER_DEPTH ||
+	attachment == __DRI_BUFFER_STENCIL ||
+	attachment == __DRI_BUFFER_DEPTH_STENCIL) &&
+       intelScreen->gen >= 4)
+      tiling = I915_TILING_Y;
+   else
+      tiling = I915_TILING_X;
+
+   intelBuffer->region = intel_region_alloc(intelScreen, tiling,
 					    format / 8, width, height, GL_TRUE);
    
    if (intelBuffer->region == NULL) {
diff --git a/src/mesa/drivers/dri/intel/intel_screen.h b/src/mesa/drivers/dri/intel/intel_screen.h
index 0f0b5be56dc..4613c9858c4 100644
--- a/src/mesa/drivers/dri/intel/intel_screen.h
+++ b/src/mesa/drivers/dri/intel/intel_screen.h
@@ -37,6 +37,7 @@
 struct intel_screen
 {
    int deviceID;
+   int gen;
 
    int logTextureGranularity;
 
diff --git a/src/mesa/drivers/dri/intel/intel_tex_validate.c b/src/mesa/drivers/dri/intel/intel_tex_validate.c
index 5e705c93619..27f2646ebf5 100644
--- a/src/mesa/drivers/dri/intel/intel_tex_validate.c
+++ b/src/mesa/drivers/dri/intel/intel_tex_validate.c
@@ -1,5 +1,6 @@
 #include "main/mtypes.h"
 #include "main/macros.h"
+#include "main/samplerobj.h"
 
 #include "intel_context.h"
 #include "intel_mipmap_tree.h"
@@ -14,11 +15,13 @@
  */
 static void
 intel_update_max_level(struct intel_context *intel,
-		       struct intel_texture_object *intelObj)
+		       struct intel_texture_object *intelObj,
+		       struct gl_sampler_object *sampler)
 {
    struct gl_texture_object *tObj = &intelObj->base;
 
-   if (tObj->Sampler.MinFilter == GL_NEAREST || tObj->Sampler.MinFilter == GL_LINEAR) {
+   if (sampler->MinFilter == GL_NEAREST ||
+       sampler->MinFilter == GL_LINEAR) {
       intelObj->_MaxLevel = tObj->BaseLevel;
    } else {
       intelObj->_MaxLevel = tObj->_MaxLevel;
@@ -70,8 +73,10 @@ copy_image_data_to_tree(struct intel_context *intel,
 GLuint
 intel_finalize_mipmap_tree(struct intel_context *intel, GLuint unit)
 {
+   struct gl_context *ctx = &intel->ctx;
    struct gl_texture_object *tObj = intel->ctx.Texture.Unit[unit]._Current;
    struct intel_texture_object *intelObj = intel_texture_object(tObj);
+   struct gl_sampler_object *sampler = _mesa_get_samplerobj(ctx, unit);
    int comp_byte = 0;
    int cpp;
    GLuint face, i;
@@ -84,7 +89,7 @@ intel_finalize_mipmap_tree(struct intel_context *intel, GLuint unit)
 
    /* What levels must the tree include at a minimum?
     */
-   intel_update_max_level(intel, intelObj);
+   intel_update_max_level(intel, intelObj, sampler);
    firstImage = intel_texture_image(tObj->Image[0][tObj->BaseLevel]);
 
    /* Fallback case:
diff --git a/src/mesa/drivers/dri/r300/compiler/Makefile b/src/mesa/drivers/dri/r300/compiler/Makefile
index 51b896ae91f..5c9f57b4eac 100644
--- a/src/mesa/drivers/dri/r300/compiler/Makefile
+++ b/src/mesa/drivers/dri/r300/compiler/Makefile
@@ -20,12 +20,15 @@ C_SOURCES = \
 		radeon_pair_translate.c \
 		radeon_pair_schedule.c \
 		radeon_pair_regalloc.c \
+		radeon_pair_dead_sources.c \
 		radeon_dataflow.c \
 		radeon_dataflow_deadcode.c \
 		radeon_dataflow_swizzles.c \
+		radeon_list.c \
 		radeon_optimize.c \
 		radeon_remove_constants.c \
 		radeon_rename_regs.c \
+		radeon_variable.c \
 		r3xx_fragprog.c \
 		r300_fragprog.c \
 		r300_fragprog_swizzle.c \
@@ -48,6 +51,7 @@ INCLUDES = \
 	-I. \
 	-I$(TOP)/include \
 	-I$(TOP)/src/mesa \
+	-I$(TOP)/src/glsl \
 
 
 ##### TARGETS #####
diff --git a/src/mesa/drivers/dri/r300/compiler/SConscript b/src/mesa/drivers/dri/r300/compiler/SConscript
index 2b4bce1c08c..9931537492e 100755
--- a/src/mesa/drivers/dri/r300/compiler/SConscript
+++ b/src/mesa/drivers/dri/r300/compiler/SConscript
@@ -3,6 +3,7 @@ Import('*')
 env = env.Clone()
 env.Append(CPPPATH = '#/include')
 env.Append(CPPPATH = '#/src/mesa')
+env.Append(CPPPATH = '#/src/glsl')
 
 # temporary fix
 env['CFLAGS'] = str(env['CFLAGS']).replace('-Werror=declaration-after-statement', '')
@@ -22,6 +23,7 @@ r300compiler = env.ConvenienceLibrary(
         'radeon_pair_translate.c',
         'radeon_pair_schedule.c',
         'radeon_pair_regalloc.c',
+	'radeon_pair_dead_sources.c',
         'radeon_optimize.c',
         'radeon_remove_constants.c',
         'radeon_rename_regs.c',
@@ -30,6 +32,8 @@ r300compiler = env.ConvenienceLibrary(
         'radeon_dataflow.c',
         'radeon_dataflow_deadcode.c',
         'radeon_dataflow_swizzles.c',
+        'radeon_variable.c',
+	'radeon_list.c',
         'r3xx_fragprog.c',
         'r300_fragprog.c',
         'r300_fragprog_swizzle.c',
diff --git a/src/mesa/drivers/dri/r300/compiler/r300_fragprog_emit.c b/src/mesa/drivers/dri/r300/compiler/r300_fragprog_emit.c
index 8b73409136f..e6fd1fde62d 100644
--- a/src/mesa/drivers/dri/r300/compiler/r300_fragprog_emit.c
+++ b/src/mesa/drivers/dri/r300/compiler/r300_fragprog_emit.c
@@ -93,7 +93,7 @@ static unsigned int use_source(struct r300_fragment_program_code* code, struct r
 
 	if (src.File == RC_FILE_CONSTANT) {
 		return src.Index | (1 << 5);
-	} else if (src.File == RC_FILE_TEMPORARY) {
+	} else if (src.File == RC_FILE_TEMPORARY || src.File == RC_FILE_INPUT) {
 		use_temporary(code, src.Index);
 		return src.Index & 0x1f;
 	}
diff --git a/src/mesa/drivers/dri/r300/compiler/r300_fragprog_swizzle.c b/src/mesa/drivers/dri/r300/compiler/r300_fragprog_swizzle.c
index 5223aaa71a4..b7bca8c0cfa 100644
--- a/src/mesa/drivers/dri/r300/compiler/r300_fragprog_swizzle.c
+++ b/src/mesa/drivers/dri/r300/compiler/r300_fragprog_swizzle.c
@@ -87,6 +87,18 @@ static const struct swizzle_data* lookup_native_swizzle(unsigned int swizzle)
 	return 0;
 }
 
+/**
+ * Determines if the given swizzle is valid for r300/r400.  In most situations
+ * it is better to use r300_swizzle_is_native() which can be accessed via
+ * struct radeon_compiler *c; c->SwizzleCaps->IsNative().
+ */
+int r300_swizzle_is_native_basic(unsigned int swizzle)
+{
+	const struct swizzle_data * sd = lookup_native_swizzle(swizzle);
+
+	/* Callers only need a yes/no answer, not the table entry. */
+	return sd != 0;
+}
 
 /**
  * Check whether the given instruction supports the swizzle and negate
@@ -140,7 +152,6 @@ static void r300_swizzle_split(
 	split->NumPhases = 0;
 
 	while(mask) {
-		const struct swizzle_data *best_swizzle = 0;
 		unsigned int best_matchcount = 0;
 		unsigned int best_matchmask = 0;
 		int i, comp;
@@ -167,7 +178,6 @@ static void r300_swizzle_split(
 				}
 			}
 			if (matchcount > best_matchcount) {
-				best_swizzle = sd;
 				best_matchcount = matchcount;
 				best_matchmask = matchmask;
 				if (matchmask == (mask & RC_MASK_XYZ))
diff --git a/src/mesa/drivers/dri/r300/compiler/r300_fragprog_swizzle.h b/src/mesa/drivers/dri/r300/compiler/r300_fragprog_swizzle.h
index 118476af132..f2635be140d 100644
--- a/src/mesa/drivers/dri/r300/compiler/r300_fragprog_swizzle.h
+++ b/src/mesa/drivers/dri/r300/compiler/r300_fragprog_swizzle.h
@@ -34,5 +34,6 @@ extern struct rc_swizzle_caps r300_swizzle_caps;
 
 unsigned int r300FPTranslateRGBSwizzle(unsigned int src, unsigned int swizzle);
 unsigned int r300FPTranslateAlphaSwizzle(unsigned int src, unsigned int swizzle);
+int r300_swizzle_is_native_basic(unsigned int swizzle);
 
 #endif /* __R300_FRAGPROG_SWIZZLE_H_ */
diff --git a/src/mesa/drivers/dri/r300/compiler/r3xx_fragprog.c b/src/mesa/drivers/dri/r300/compiler/r3xx_fragprog.c
index 9286733635f..e2441e97d87 100644
--- a/src/mesa/drivers/dri/r300/compiler/r3xx_fragprog.c
+++ b/src/mesa/drivers/dri/r300/compiler/r3xx_fragprog.c
@@ -148,8 +148,8 @@ void r3xx_compile_fragment_program(struct r300_fragment_program_compiler* c)
 		{"register rename",		1, !is_r500,	rc_rename_regs,			NULL},
 		{"pair translate",		1, 1,		rc_pair_translate,		NULL},
 		{"pair scheduling",		1, 1,		rc_pair_schedule,		NULL},
-		{"register allocation",		1, opt,		rc_pair_regalloc,		NULL},
-		{"dumb register allocation",	1, !opt,	rc_pair_regalloc_inputs_only,	NULL},
+		{"dead sources",		1, 1,		rc_pair_remove_dead_sources, NULL},
+		{"register allocation",		1, 1,		rc_pair_regalloc,		&opt},
 		{"final code validation",	0, 1,		rc_validate_final_shader,	NULL},
 		{"machine code generation",	0, is_r500,	r500BuildFragmentProgramHwCode,	NULL},
 		{"machine code generation",	0, !is_r500,	r300BuildFragmentProgramHwCode,	NULL},
diff --git a/src/mesa/drivers/dri/r300/compiler/r500_fragprog.c b/src/mesa/drivers/dri/r300/compiler/r500_fragprog.c
index 140eeed3de3..5e0be6b8881 100644
--- a/src/mesa/drivers/dri/r300/compiler/r500_fragprog.c
+++ b/src/mesa/drivers/dri/r300/compiler/r500_fragprog.c
@@ -70,6 +70,8 @@ static int r500_swizzle_is_native(rc_opcode opcode, struct rc_src_register reg)
 	if (opcode == RC_OPCODE_TEX ||
 	    opcode == RC_OPCODE_TXB ||
 	    opcode == RC_OPCODE_TXP ||
+	    opcode == RC_OPCODE_TXD ||
+	    opcode == RC_OPCODE_TXL ||
 	    opcode == RC_OPCODE_KIL) {
 		if (reg.Abs)
 			return 0;
diff --git a/src/mesa/drivers/dri/r300/compiler/r500_fragprog_emit.c b/src/mesa/drivers/dri/r300/compiler/r500_fragprog_emit.c
index c7f79bc53c7..c30cd753d15 100644
--- a/src/mesa/drivers/dri/r300/compiler/r500_fragprog_emit.c
+++ b/src/mesa/drivers/dri/r300/compiler/r500_fragprog_emit.c
@@ -207,7 +207,7 @@ static unsigned int use_source(struct r500_fragment_program_code* code, struct r
 
 	if (src.File == RC_FILE_CONSTANT) {
 		return src.Index | R500_RGB_ADDR0_CONST;
-	} else if (src.File == RC_FILE_TEMPORARY) {
+	} else if (src.File == RC_FILE_TEMPORARY || src.File == RC_FILE_INPUT) {
 		use_temporary(code, src.Index);
 		return src.Index;
 	}
@@ -396,6 +396,12 @@ static int emit_tex(struct r300_fragment_program_compiler *c, struct rc_sub_inst
 	case RC_OPCODE_TXP:
 		code->inst[ip].inst1 |= R500_TEX_INST_PROJ;
 		break;
+	case RC_OPCODE_TXD:
+		code->inst[ip].inst1 |= R500_TEX_INST_DXDY;
+		break;
+	case RC_OPCODE_TXL:
+		code->inst[ip].inst1 |= R500_TEX_INST_LOD;
+		break;
 	default:
 		error("emit_tex can't handle opcode %s\n", rc_get_opcode_info(inst->Opcode)->Name);
 	}
@@ -407,8 +413,23 @@ static int emit_tex(struct r300_fragment_program_compiler *c, struct rc_sub_inst
 	code->inst[ip].inst2 = R500_TEX_SRC_ADDR(inst->SrcReg[0].Index)
 		| (translate_strq_swizzle(inst->SrcReg[0].Swizzle) << 8)
 		| R500_TEX_DST_ADDR(inst->DstReg.Index)
-		| R500_TEX_DST_R_SWIZ_R | R500_TEX_DST_G_SWIZ_G
-		| R500_TEX_DST_B_SWIZ_B | R500_TEX_DST_A_SWIZ_A;
+		| (GET_SWZ(inst->TexSwizzle, 0) << 24)
+		| (GET_SWZ(inst->TexSwizzle, 1) << 26)
+		| (GET_SWZ(inst->TexSwizzle, 2) << 28)
+		| (GET_SWZ(inst->TexSwizzle, 3) << 30)
+		;
+
+	if (inst->Opcode == RC_OPCODE_TXD) {
+		use_temporary(code, inst->SrcReg[1].Index);
+		use_temporary(code, inst->SrcReg[2].Index);
+
+		/* DX and DY parameters are specified in a separate register. */
+		code->inst[ip].inst3 =
+			R500_DX_ADDR(inst->SrcReg[1].Index) |
+			(translate_strq_swizzle(inst->SrcReg[1].Swizzle) << 8) |
+			R500_DY_ADDR(inst->SrcReg[2].Index) |
+			(translate_strq_swizzle(inst->SrcReg[2].Swizzle) << 24);
+	}
 
 	return 1;
 }
diff --git a/src/mesa/drivers/dri/r300/compiler/radeon_compiler_util.c b/src/mesa/drivers/dri/r300/compiler/radeon_compiler_util.c
index 15ec4418cb8..b077e7b7d65 100644
--- a/src/mesa/drivers/dri/r300/compiler/radeon_compiler_util.c
+++ b/src/mesa/drivers/dri/r300/compiler/radeon_compiler_util.c
@@ -124,6 +124,165 @@ unsigned swizzle_mask(unsigned swizzle, unsigned mask)
 	return ret;
 }
 
+static unsigned int srcs_need_rewrite(const struct rc_opcode_info * info)
+{
+	if (info->HasTexture) {
+		return 0;
+	}
+	switch (info->Opcode) {
+		case RC_OPCODE_DP2:
+		case RC_OPCODE_DP3:
+		case RC_OPCODE_DP4:
+		case RC_OPCODE_DDX:
+		case RC_OPCODE_DDY:
+			return 0;
+		default:
+			return 1;
+	}
+}
+
+/**
+ * @return A swizzle that results from converting old_swizzle using
+ * conversion_swizzle
+ */
+unsigned int rc_adjust_channels(
+	unsigned int old_swizzle,
+	unsigned int conversion_swizzle)
+{
+	unsigned int i;
+	unsigned int new_swizzle = rc_init_swizzle(RC_SWIZZLE_UNUSED, 0);
+	for (i = 0; i < 4; i++) {
+		unsigned int new_chan = get_swz(conversion_swizzle, i);
+		if (new_chan == RC_SWIZZLE_UNUSED) {
+			continue;
+		}
+		SET_SWZ(new_swizzle, new_chan, GET_SWZ(old_swizzle, i));
+	}
+	return new_swizzle;
+}
+
+/* Remap every set bit of old_mask through conversion_swizzle. */
+static unsigned int rewrite_writemask(
+	unsigned int old_mask,
+	unsigned int conversion_swizzle)
+{
+	unsigned int result = 0;
+	unsigned int chan;
+
+	for (chan = 0; chan < 4; chan++) {
+		unsigned int target = GET_SWZ(conversion_swizzle, chan);
+		if (GET_BIT(old_mask, chan) && target != RC_SWIZZLE_UNUSED) {
+			result |= (1 << target);
+		}
+	}
+
+	return result;
+}
+
+/**
+ * This function rewrites the writemask of sub and adjusts the swizzles
+ * of all its source registers based on the conversion_swizzle.
+ * conversion_swizzle represents a mapping of the old writemask to the
+ * new writemask.  For a detailed description of how conversion swizzles
+ * work see rc_rewrite_swizzle().
+ */
+void rc_pair_rewrite_writemask(
+	struct rc_pair_sub_instruction * sub,
+	unsigned int conversion_swizzle)
+{
+	const struct rc_opcode_info * info = rc_get_opcode_info(sub->Opcode);
+	unsigned int i;
+
+	sub->WriteMask = rewrite_writemask(sub->WriteMask, conversion_swizzle);
+
+	if (!srcs_need_rewrite(info)) {
+		return ;
+	}
+
+	for (i = 0; i < info->NumSrcRegs; i++) {
+		sub->Arg[i].Swizzle =
+			rc_adjust_channels(sub->Arg[i].Swizzle,
+						conversion_swizzle);
+	}
+}
+
+static void normal_rewrite_writemask_cb(
+	void * userdata,
+	struct rc_instruction * inst,
+	struct rc_src_register * src)
+{
+	unsigned int * conversion_swizzle = (unsigned int *)userdata;
+	src->Swizzle = rc_adjust_channels(src->Swizzle, *conversion_swizzle);
+}
+
+/**
+ * Like rc_pair_rewrite_writemask(), but for normal instructions: remaps the
+ * destination writemask, TexSwizzle (texture ops) and the source swizzles.
+ */
+void rc_normal_rewrite_writemask(
+	struct rc_instruction * inst,
+	unsigned int conversion_swizzle)
+{
+	struct rc_sub_instruction * sub = &inst->U.I;
+	const struct rc_opcode_info * info = rc_get_opcode_info(sub->Opcode);
+	sub->DstReg.WriteMask =
+		rewrite_writemask(sub->DstReg.WriteMask, conversion_swizzle);
+
+	if (info->HasTexture) {
+		unsigned int i;
+		assert(sub->TexSwizzle == RC_SWIZZLE_XYZW);
+		for (i = 0; i < 4; i++) {
+			unsigned int swz = GET_SWZ(conversion_swizzle, i);
+			if (swz > 3)
+				continue;
+			SET_SWZ(sub->TexSwizzle, swz, i);
+		}
+	}
+
+	if (!srcs_need_rewrite(info)) {
+		return;
+	}
+
+	/* The callback expects a conversion swizzle, not a writemask. */
+	rc_for_all_reads_src(inst, normal_rewrite_writemask_cb,
+							&conversion_swizzle);
+}
+
+/**
+ * This function replaces each value 'swz' in swizzle with the value of
+ * GET_SWZ(conversion_swizzle, swz).  So, if you want to change all the X's
+ * in swizzle to Y, then conversion_swizzle should be Y___ (0xff9).  If you want
+ * to change all the Y's in swizzle to X, then conversion_swizzle should be
+ * _X__ (0xfc7).  If you want to change the Y's to X and the X's to Y, then
+ * conversion swizzle should be YX__ (0xfc1).
+ * @param swizzle The swizzle to change
+ * @param conversion_swizzle Describes the conversion to perform on the swizzle
+ * @return A converted swizzle
+ */
+unsigned int rc_rewrite_swizzle(
+	unsigned int swizzle,
+	unsigned int conversion_swizzle)
+{
+	unsigned int chan;
+	unsigned int out_swizzle = swizzle;
+
+	for (chan = 0; chan < 4; chan++) {
+		unsigned int swz = GET_SWZ(swizzle, chan);
+		unsigned int new_swz;
+		if (swz > 3) {
+			SET_SWZ(out_swizzle, chan, swz);
+		} else {
+			new_swz = GET_SWZ(conversion_swizzle, swz);
+			if (new_swz != RC_SWIZZLE_UNUSED) {
+				SET_SWZ(out_swizzle, chan, new_swz);
+			} else {
+				SET_SWZ(out_swizzle, chan, swz);
+			}
+		}
+	}
+	return out_swizzle;
+}
+
 /**
  * Left multiplication of a register with a swizzle
  */
@@ -281,3 +440,197 @@ unsigned int rc_inst_can_use_presub(
 	return 1;
 }
 
+struct max_data {
+	unsigned int Max;
+	unsigned int HasFileType;
+	rc_register_file File;
+};
+
+static void max_callback(
+	void * userdata,
+	struct rc_instruction * inst,
+	rc_register_file file,
+	unsigned int index,
+	unsigned int mask)
+{
+	struct max_data * d = (struct max_data*)userdata;
+	if (file == d->File && (!d->HasFileType || index > d->Max)) {
+		d->Max = index;
+		d->HasFileType = 1;
+	}
+}
+
+/**
+ * @return The maximum index of the specified register file that is read or
+ * written by the program, or -1 if the file is never used.
+ */
+int rc_get_max_index(
+	struct radeon_compiler * c,
+	rc_register_file file)
+{
+	struct max_data data;
+	struct rc_instruction * inst;
+
+	data.Max = 0;
+	data.HasFileType = 0;
+	data.File = file;
+	for (inst = c->Program.Instructions.Next;
+					inst != &c->Program.Instructions;
+					inst = inst->Next) {
+		rc_for_all_reads_mask(inst, max_callback, &data);
+		rc_for_all_writes_mask(inst, max_callback, &data);
+	}
+	if (!data.HasFileType) {
+		return -1;
+	}
+	return data.Max;
+}
+
+static unsigned int get_source_readmask(
+	struct rc_pair_sub_instruction * sub,
+	unsigned int source,
+	unsigned int src_type)
+{
+	unsigned int i;
+	unsigned int readmask = 0;
+	const struct rc_opcode_info * info = rc_get_opcode_info(sub->Opcode);
+
+	for (i = 0; i < info->NumSrcRegs; i++) {
+		if (sub->Arg[i].Source != source
+		    || src_type != rc_source_type_swz(sub->Arg[i].Swizzle)) {
+			continue;
+		}
+		readmask |= rc_swizzle_to_writemask(sub->Arg[i].Swizzle);
+	}
+	return readmask;
+}
+
+/**
+ * This function attempts to remove a source from a pair instruction.
+ * @param inst
+ * @param src_type RC_SOURCE_RGB, RC_SOURCE_ALPHA, or both bitwise or'd
+ * @param source The index of the source to remove
+ * @param new_readmask A mask representing the components that are read by
+ * the source that is intended to replace the one you are removing.  If you
+ * want to remove a source only and not replace it, this parameter should be
+ * zero.
+ * @return 1 if the source was successfully removed, 0 if it was not
+ */
+unsigned int rc_pair_remove_src(
+	struct rc_instruction * inst,
+	unsigned int src_type,
+	unsigned int source,
+	unsigned int new_readmask)
+{
+	unsigned int readmask = 0;
+
+	readmask |= get_source_readmask(&inst->U.P.RGB, source, src_type);
+	readmask |= get_source_readmask(&inst->U.P.Alpha, source, src_type);
+
+	if ((new_readmask & readmask) != readmask)
+		return 0;
+
+	if (src_type & RC_SOURCE_RGB) {
+		memset(&inst->U.P.RGB.Src[source], 0,
+			sizeof(struct rc_pair_instruction_source));
+	}
+
+	if (src_type & RC_SOURCE_ALPHA) {
+		memset(&inst->U.P.Alpha.Src[source], 0,
+			sizeof(struct rc_pair_instruction_source));
+	}
+
+	return 1;
+}
+
+/**
+ * @return RC_OPCODE_NOP if inst is not a flow control instruction.
+ * @return The opcode of inst if it is a flow control instruction.
+ */
+rc_opcode rc_get_flow_control_inst(struct rc_instruction * inst)
+{
+	const struct rc_opcode_info * info;
+	if (inst->Type == RC_INSTRUCTION_NORMAL) {
+		info = rc_get_opcode_info(inst->U.I.Opcode);
+	} else {
+		info = rc_get_opcode_info(inst->U.P.RGB.Opcode);
+		/*A flow control instruction shouldn't have an alpha
+		 * instruction.*/
+		assert(!info->IsFlowControl ||
+				inst->U.P.Alpha.Opcode == RC_OPCODE_NOP);
+	}
+
+	if (info->IsFlowControl)
+		return info->Opcode;
+	else
+		return RC_OPCODE_NOP;
+
+}
+
+/**
+ * @return The BGNLOOP instruction that starts the loop ended by endloop.
+ */
+struct rc_instruction * rc_match_endloop(struct rc_instruction * endloop)
+{
+	unsigned int endloop_count = 0;
+	struct rc_instruction * inst;
+	for (inst = endloop->Prev; inst != endloop; inst = inst->Prev) {
+		rc_opcode op = rc_get_flow_control_inst(inst);
+		if (op == RC_OPCODE_ENDLOOP) {
+			endloop_count++;
+		} else if (op == RC_OPCODE_BGNLOOP) {
+			if (endloop_count == 0) {
+				return inst;
+			} else {
+				endloop_count--;
+			}
+		}
+	}
+	return NULL;
+}
+
+/**
+ * @return The ENDLOOP instruction that ends the loop started by bgnloop.
+ */
+struct rc_instruction * rc_match_bgnloop(struct rc_instruction * bgnloop)
+{
+	unsigned int bgnloop_count = 0;
+	struct rc_instruction * inst;
+	for (inst = bgnloop->Next; inst!=bgnloop; inst = inst->Next) {
+		rc_opcode op = rc_get_flow_control_inst(inst);
+		if (op == RC_OPCODE_BGNLOOP) {
+			bgnloop_count++;
+		} else if (op == RC_OPCODE_ENDLOOP) {
+			if (bgnloop_count == 0) {
+				return inst;
+			} else {
+				bgnloop_count--;
+			}
+		}
+	}
+	return NULL;
+}
+
+/**
+ * @return A conversion swizzle for converting from old_mask->new_mask
+ */
+unsigned int rc_make_conversion_swizzle(
+	unsigned int old_mask,
+	unsigned int new_mask)
+{
+	unsigned int conversion_swizzle = rc_init_swizzle(RC_SWIZZLE_UNUSED, 0);
+	unsigned int old_idx;
+	unsigned int new_idx = 0;
+	for (old_idx = 0; old_idx < 4; old_idx++) {
+		if (!GET_BIT(old_mask, old_idx))
+			continue;
+		for ( ; new_idx < 4; new_idx++) {
+			if (GET_BIT(new_mask, new_idx)) {
+				SET_SWZ(conversion_swizzle, old_idx, new_idx);
+				new_idx++;
+				break;
+			}
+		}
+	}
+	return conversion_swizzle;
+}
diff --git a/src/mesa/drivers/dri/r300/compiler/radeon_compiler_util.h b/src/mesa/drivers/dri/r300/compiler/radeon_compiler_util.h
index dd0f6c66156..2af289dfabd 100644
--- a/src/mesa/drivers/dri/r300/compiler/radeon_compiler_util.h
+++ b/src/mesa/drivers/dri/r300/compiler/radeon_compiler_util.h
@@ -3,7 +3,12 @@
 #ifndef RADEON_PROGRAM_UTIL_H
 #define RADEON_PROGRAM_UTIL_H
 
+#include "radeon_opcodes.h"
+
+struct radeon_compiler;
 struct rc_instruction;
+struct rc_pair_instruction;
+struct rc_pair_sub_instruction;
 struct rc_src_register;
 
 unsigned int rc_swizzle_to_writemask(unsigned int swz);
@@ -22,6 +27,22 @@ rc_swizzle rc_mask_to_swizzle(unsigned int mask);
 
 unsigned swizzle_mask(unsigned swizzle, unsigned mask);
 
+unsigned int rc_adjust_channels(
+	unsigned int old_swizzle,
+	unsigned int conversion_swizzle);
+
+void rc_pair_rewrite_writemask(
+	struct rc_pair_sub_instruction * sub,
+	unsigned int conversion_swizzle);
+
+void rc_normal_rewrite_writemask(
+	struct rc_instruction * inst,
+	unsigned int conversion_swizzle);
+
+unsigned int rc_rewrite_swizzle(
+	unsigned int swizzle,
+	unsigned int conversion_swizzle);
+
 struct rc_src_register lmul_swizzle(unsigned int swizzle, struct rc_src_register srcreg);
 
 void reset_srcreg(struct rc_src_register* reg);
@@ -46,4 +67,23 @@ unsigned int rc_inst_can_use_presub(
 	struct rc_src_register presub_src0,
 	struct rc_src_register presub_src1);
 
+int rc_get_max_index(
+	struct radeon_compiler * c,
+	rc_register_file file);
+
+unsigned int rc_pair_remove_src(
+	struct rc_instruction * inst,
+	unsigned int src_type,
+	unsigned int source,
+	unsigned int new_readmask);
+
+rc_opcode rc_get_flow_control_inst(struct rc_instruction * inst);
+
+struct rc_instruction * rc_match_endloop(struct rc_instruction * endloop);
+struct rc_instruction * rc_match_bgnloop(struct rc_instruction * bgnloop);
+
+unsigned int rc_make_conversion_swizzle(
+	unsigned int old_mask,
+	unsigned int new_mask);
+
 #endif /* RADEON_PROGRAM_UTIL_H */
diff --git a/src/mesa/drivers/dri/r300/compiler/radeon_dataflow.c b/src/mesa/drivers/dri/r300/compiler/radeon_dataflow.c
index d1a7eab50f7..b0deb751be0 100644
--- a/src/mesa/drivers/dri/r300/compiler/radeon_dataflow.c
+++ b/src/mesa/drivers/dri/r300/compiler/radeon_dataflow.c
@@ -151,6 +151,7 @@ static void pair_sub_for_all_args(
 			unsigned int presub_src_count;
 			struct rc_pair_instruction_source * src_array;
 			unsigned int j;
+
 			if (src_type & RC_SOURCE_RGB) {
 				presub_type = fullinst->
 					U.P.RGB.Src[RC_PAIR_PRESUB_SRC].Index;
@@ -446,30 +447,6 @@ void rc_remap_registers(struct rc_instruction * inst, rc_remap_register_fn cb, v
 		remap_pair_instruction(inst, cb, userdata);
 }
 
-/**
- * @return RC_OPCODE_NOOP if inst is not a flow control instruction.
- * @return The opcode of inst if it is a flow control instruction.
- */
-static rc_opcode get_flow_control_inst(struct rc_instruction * inst)
-{
-	const struct rc_opcode_info * info;
-	if (inst->Type == RC_INSTRUCTION_NORMAL) {
-		info = rc_get_opcode_info(inst->U.I.Opcode);
-	} else {
-		info = rc_get_opcode_info(inst->U.P.RGB.Opcode);
-		/*A flow control instruction shouldn't have an alpha
-		 * instruction.*/
-		assert(!info->IsFlowControl ||
-				inst->U.P.Alpha.Opcode == RC_OPCODE_NOP);
-	}
-
-	if (info->IsFlowControl)
-		return info->Opcode;
-	else
-		return RC_OPCODE_NOP;
-
-}
-
 struct branch_write_mask {
 	unsigned int IfWriteMask:4;
 	unsigned int ElseWriteMask:4;
@@ -495,12 +472,11 @@ struct get_readers_callback_data {
 	struct branch_write_mask BranchMasks[R500_PFS_MAX_BRANCH_DEPTH_FULL + 1];
 };
 
-static void add_reader(
+static struct rc_reader * add_reader(
 	struct memory_pool * pool,
 	struct rc_reader_data * data,
 	struct rc_instruction * inst,
-	unsigned int mask,
-	void * arg_or_src)
+	unsigned int mask)
 {
 	struct rc_reader * new;
 	memory_pool_array_reserve(pool, struct rc_reader, data->Readers,
@@ -508,11 +484,32 @@ static void add_reader(
 	new = &data->Readers[data->ReaderCount++];
 	new->Inst = inst;
 	new->WriteMask = mask;
-	if (inst->Type == RC_INSTRUCTION_NORMAL) {
-		new->U.Src = arg_or_src;
-	} else {
-		new->U.Arg = arg_or_src;
-	}
+	return new;
+}
+
+static void add_reader_normal(
+	struct memory_pool * pool,
+	struct rc_reader_data * data,
+	struct rc_instruction * inst,
+	unsigned int mask,
+	struct rc_src_register * src)
+{
+	struct rc_reader * new = add_reader(pool, data, inst, mask);
+	new->U.I.Src = src;
+}
+
+
+static void add_reader_pair(
+	struct memory_pool * pool,
+	struct rc_reader_data * data,
+	struct rc_instruction * inst,
+	unsigned int mask,
+	struct rc_pair_instruction_arg * arg,
+	struct rc_pair_instruction_source * src)
+{
+	struct rc_reader * new = add_reader(pool, data, inst, mask);
+	new->U.P.Src = src;
+	new->U.P.Arg = arg;
 }
 
 static unsigned int get_readers_read_callback(
@@ -544,6 +541,11 @@ static unsigned int get_readers_read_callback(
 		return shared_mask;
 	}
 
+	if (cb_data->ReaderData->LoopDepth > 0) {
+		cb_data->ReaderData->AbortOnWrite |=
+				(read_mask & cb_data->AliveWriteMask);
+	}
+
 	/* XXX The behavior in this case should be configurable. */
 	if ((read_mask & cb_data->AliveWriteMask) != read_mask) {
 		cb_data->ReaderData->Abort = 1;
@@ -572,10 +574,10 @@ static void get_readers_pair_read_callback(
 	if (d->ReadPairCB)
 		d->ReadPairCB(d->ReaderData, inst, arg, src);
 
-	if (d->ReaderData->Abort)
+	if (d->ReaderData->ExitOnAbort && d->ReaderData->Abort)
 		return;
 
-	add_reader(&d->C->Pool, d->ReaderData, inst, shared_mask, arg);
+	add_reader_pair(&d->C->Pool, d->ReaderData, inst, shared_mask, arg, src);
 }
 
 /**
@@ -600,10 +602,10 @@ static void get_readers_normal_read_callback(
 	if (d->ReadNormalCB)
 		d->ReadNormalCB(d->ReaderData, inst, src);
 
-	if (d->ReaderData->Abort)
+	if (d->ReaderData->ExitOnAbort && d->ReaderData->Abort)
 		return;
 
-	add_reader(&d->C->Pool, d->ReaderData, inst, shared_mask, src);
+	add_reader_normal(&d->C->Pool, d->ReaderData, inst, shared_mask, src);
 }
 
 /**
@@ -624,12 +626,57 @@ static void get_readers_write_callback(
 		unsigned int shared_mask = mask & d->DstMask;
 		d->ReaderData->AbortOnRead &= ~shared_mask;
 		d->AliveWriteMask &= ~shared_mask;
+		if (d->ReaderData->AbortOnWrite & shared_mask) {
+			d->ReaderData->Abort = 1;
+		}
 	}
 
 	if(d->WriteCB)
 		d->WriteCB(d->ReaderData, inst, file, index, mask);
 }
 
+static void push_branch_mask(
+	struct get_readers_callback_data * d,
+	unsigned int * branch_depth)
+{
+	(*branch_depth)++;
+	if (*branch_depth > R500_PFS_MAX_BRANCH_DEPTH_FULL) {
+		d->ReaderData->Abort = 1;
+		return;
+	}
+	d->BranchMasks[*branch_depth].IfWriteMask =
+					d->AliveWriteMask;
+}
+
+static void pop_branch_mask(
+	struct get_readers_callback_data * d,
+	unsigned int * branch_depth)
+{
+	struct branch_write_mask * masks = &d->BranchMasks[*branch_depth];
+
+	if (masks->HasElse) {
+		/* Abort on read for components that were written in the IF
+		 * block. */
+		d->ReaderData->AbortOnRead |=
+				masks->IfWriteMask & ~masks->ElseWriteMask;
+		/* Abort on read for components that were written in the ELSE
+		 * block. */
+		d->ReaderData->AbortOnRead |=
+				masks->ElseWriteMask & ~d->AliveWriteMask;
+
+		d->AliveWriteMask = masks->IfWriteMask
+			^ ((masks->IfWriteMask ^ masks->ElseWriteMask)
+			& (masks->IfWriteMask ^ d->AliveWriteMask));
+	} else {
+		d->ReaderData->AbortOnRead |=
+				masks->IfWriteMask & ~d->AliveWriteMask;
+		d->AliveWriteMask = masks->IfWriteMask;
+
+	}
+	memset(masks, 0, sizeof(struct branch_write_mask));
+	(*branch_depth)--;
+}
+
 static void get_readers_for_single_write(
 	void * userdata,
 	struct rc_instruction * writer,
@@ -639,10 +686,14 @@ static void get_readers_for_single_write(
 {
 	struct rc_instruction * tmp;
 	unsigned int branch_depth = 0;
+	struct rc_instruction * endloop = NULL;
+	unsigned int abort_on_read_at_endloop;
 	struct get_readers_callback_data * d = userdata;
 
 	d->ReaderData->Writer = writer;
 	d->ReaderData->AbortOnRead = 0;
+	d->ReaderData->AbortOnWrite = 0;
+	d->ReaderData->LoopDepth = 0;
 	d->ReaderData->InElse = 0;
 	d->DstFile = dst_file;
 	d->DstIndex = dst_index;
@@ -655,32 +706,43 @@ static void get_readers_for_single_write(
 
 	for(tmp = writer->Next; tmp != &d->C->Program.Instructions;
 							tmp = tmp->Next){
-		rc_opcode opcode = get_flow_control_inst(tmp);
+		rc_opcode opcode = rc_get_flow_control_inst(tmp);
 		switch(opcode) {
 		case RC_OPCODE_BGNLOOP:
-			/* XXX We can do better when we see a BGNLOOP if we
-			 * add a flag called AbortOnWrite to struct
-			 * rc_reader_data and leave it set until the next
-			 * ENDLOOP. */
+			d->ReaderData->LoopDepth++;
+			push_branch_mask(d, &branch_depth);
+			break;
 		case RC_OPCODE_ENDLOOP:
-			/* XXX We can do better when we see an ENDLOOP by
-			 * searching backwards from writer and looking for
-			 * readers of writer's destination index.  If we find a
-			 * reader before we get to the BGNLOOP, we must abort
-			 * unless there is another writer between that reader
-			 * and the BGNLOOP. */
-		case RC_OPCODE_BRK:
-		case RC_OPCODE_CONT:
-			d->ReaderData->Abort = 1;
-			return;
-		case RC_OPCODE_IF:
-			branch_depth++;
-			if (branch_depth > R500_PFS_MAX_BRANCH_DEPTH_FULL) {
-				d->ReaderData->Abort = 1;
-				return;
+			if (d->ReaderData->LoopDepth > 0) {
+				d->ReaderData->LoopDepth--;
+				if (d->ReaderData->LoopDepth == 0) {
+					d->ReaderData->AbortOnWrite = 0;
+				}
+				pop_branch_mask(d, &branch_depth);
+			} else {
+				/* Here we have reached an ENDLOOP without
+				 * seeing its BGNLOOP.  This means that
+				 * the writer was written inside of a loop,
+				 * so it could have readers that are above it
+				 * (i.e. they have a lower IP).  To find these
+				 * readers we jump to the BGNLOOP instruction
+				 * and check each instruction until we get
+				 * back to the writer.
+				 */
+				endloop = tmp;
+				tmp = rc_match_endloop(tmp);
+				if (!tmp) {
+					rc_error(d->C, "Failed to match endloop.\n");
+					d->ReaderData->Abort = 1;
+					return;
+				}
+				abort_on_read_at_endloop = d->ReaderData->AbortOnRead;
+				d->ReaderData->AbortOnRead |= d->AliveWriteMask;
+				continue;
 			}
-			d->BranchMasks[branch_depth].IfWriteMask =
-							d->AliveWriteMask;
+			break;
+		case RC_OPCODE_IF:
+			push_branch_mask(d, &branch_depth);
 			break;
 		case RC_OPCODE_ELSE:
 			if (branch_depth == 0) {
@@ -700,35 +762,7 @@ static void get_readers_for_single_write(
 				d->ReaderData->InElse = 0;
 			}
 			else {
-				struct branch_write_mask * masks =
-					&d->BranchMasks[branch_depth];
-
-				if (masks->HasElse) {
-					/* Abort on read for components that
-					 * were written in the IF block. */
-					d->ReaderData->AbortOnRead |=
-						masks->IfWriteMask
-							& ~masks->ElseWriteMask;
-					/* Abort on read for components that
-					 * were written in the ELSE block. */
-					d->ReaderData->AbortOnRead |=
-						masks->ElseWriteMask
-							& ~d->AliveWriteMask;
-					d->AliveWriteMask = masks->IfWriteMask
-						^ ((masks->IfWriteMask ^
-							masks->ElseWriteMask)
-						& (masks->IfWriteMask
-							^ d->AliveWriteMask));
-				} else {
-					d->ReaderData->AbortOnRead |=
-						masks->IfWriteMask
-							& ~d->AliveWriteMask;
-					d->AliveWriteMask = masks->IfWriteMask;
-
-				}
-				memset(masks, 0,
-					sizeof(struct branch_write_mask));
-				branch_depth--;
+				pop_branch_mask(d, &branch_depth);
 			}
 			break;
 		default:
@@ -745,9 +779,17 @@ static void get_readers_for_single_write(
 			rc_pair_for_all_reads_arg(tmp,
 				get_readers_pair_read_callback, d);
 		}
+
+		/* This can happen when we jump from an ENDLOOP to BGNLOOP */
+		if (tmp == writer) {
+			tmp = endloop;
+			endloop = NULL;
+			d->ReaderData->AbortOnRead = abort_on_read_at_endloop;
+			continue;
+		}
 		rc_for_all_writes_mask(tmp, get_readers_write_callback, d);
 
-		if (d->ReaderData->Abort)
+		if (d->ReaderData->ExitOnAbort && d->ReaderData->Abort)
 			return;
 
 		if (branch_depth == 0 && !d->AliveWriteMask)
@@ -755,6 +797,26 @@ static void get_readers_for_single_write(
 	}
 }
 
+static void init_get_readers_callback_data(
+	struct get_readers_callback_data * d,
+	struct rc_reader_data * reader_data,
+	struct radeon_compiler * c,
+	rc_read_src_fn read_normal_cb,
+	rc_pair_read_arg_fn read_pair_cb,
+	rc_read_write_mask_fn write_cb)
+{
+	reader_data->Abort = 0;
+	reader_data->ReaderCount = 0;
+	reader_data->ReadersReserved = 0;
+	reader_data->Readers = NULL;
+
+	d->C = c;
+	d->ReaderData = reader_data;
+	d->ReadNormalCB = read_normal_cb;
+	d->ReadPairCB = read_pair_cb;
+	d->WriteCB = write_cb;
+}
+
 /**
  * This function will create a list of readers via the rc_reader_data struct.
  * This function will abort (set the flag data->Abort) and return if it
@@ -803,16 +865,28 @@ void rc_get_readers(
 {
 	struct get_readers_callback_data d;
 
-	data->Abort = 0;
-	data->ReaderCount = 0;
-	data->ReadersReserved = 0;
-	data->Readers = NULL;
-
-	d.C = c;
-	d.ReaderData = data;
-	d.ReadNormalCB = read_normal_cb;
-	d.ReadPairCB = read_pair_cb;
-	d.WriteCB = write_cb;
+	init_get_readers_callback_data(&d, data, c, read_normal_cb,
+						read_pair_cb, write_cb);
 
 	rc_for_all_writes_mask(writer, get_readers_for_single_write, &d);
 }
+
+void rc_get_readers_sub(
+	struct radeon_compiler * c,
+	struct rc_instruction * writer,
+	struct rc_pair_sub_instruction * sub_writer,
+	struct rc_reader_data * data,
+	rc_read_src_fn read_normal_cb,
+	rc_pair_read_arg_fn read_pair_cb,
+	rc_read_write_mask_fn write_cb)
+{
+	struct get_readers_callback_data d;
+
+	init_get_readers_callback_data(&d, data, c, read_normal_cb,
+						read_pair_cb, write_cb);
+
+	if (sub_writer->WriteMask) {
+		get_readers_for_single_write(&d, writer, RC_FILE_TEMPORARY,
+			sub_writer->DestIndex, sub_writer->WriteMask);
+	}
+}
diff --git a/src/mesa/drivers/dri/r300/compiler/radeon_dataflow.h b/src/mesa/drivers/dri/r300/compiler/radeon_dataflow.h
index ef971c5b234..d8a627258ea 100644
--- a/src/mesa/drivers/dri/r300/compiler/radeon_dataflow.h
+++ b/src/mesa/drivers/dri/r300/compiler/radeon_dataflow.h
@@ -37,6 +37,7 @@ struct rc_swizzle_caps;
 struct rc_src_register;
 struct rc_pair_instruction_arg;
 struct rc_pair_instruction_source;
+struct rc_pair_sub_instruction;
 struct rc_compiler;
 
 
@@ -74,14 +75,21 @@ struct rc_reader {
 	struct rc_instruction * Inst;
 	unsigned int WriteMask;
 	union {
-		struct rc_src_register * Src;
-		struct rc_pair_instruction_arg * Arg;
+		struct {
+			struct rc_src_register * Src;
+		} I;
+		struct {
+			struct rc_pair_instruction_arg * Arg;
+			struct rc_pair_instruction_source * Src;
+		} P;
 	} U;
 };
 
 struct rc_reader_data {
 	unsigned int Abort;
 	unsigned int AbortOnRead;
+	unsigned int AbortOnWrite;
+	unsigned int LoopDepth;
 	unsigned int InElse;
 	struct rc_instruction * Writer;
 
@@ -89,6 +97,9 @@ struct rc_reader_data {
 	unsigned int ReadersReserved;
 	struct rc_reader * Readers;
 
+	/* If this flag is enabled, rc_get_readers will exit as soon as possible
+	 * after the Abort flag is set.*/
+	unsigned int ExitOnAbort;
 	void * CbData;
 };
 
@@ -99,6 +110,15 @@ void rc_get_readers(
 	rc_read_src_fn read_normal_cb,
 	rc_pair_read_arg_fn read_pair_cb,
 	rc_read_write_mask_fn write_cb);
+
+void rc_get_readers_sub(
+	struct radeon_compiler * c,
+	struct rc_instruction * writer,
+	struct rc_pair_sub_instruction * sub_writer,
+	struct rc_reader_data * data,
+	rc_read_src_fn read_normal_cb,
+	rc_pair_read_arg_fn read_pair_cb,
+	rc_read_write_mask_fn write_cb);
 /**
  * Compiler passes based on dataflow analysis.
  */
diff --git a/src/mesa/drivers/dri/r300/compiler/radeon_list.c b/src/mesa/drivers/dri/r300/compiler/radeon_list.c
new file mode 100644
index 00000000000..811c908a81a
--- /dev/null
+++ b/src/mesa/drivers/dri/r300/compiler/radeon_list.c
@@ -0,0 +1,90 @@
+/*
+ * Copyright 2011 Tom Stellard <[email protected]>
+ *
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial
+ * portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+
+#include "radeon_list.h"
+
+#include <stdlib.h>
+#include <stdio.h>
+
+#include "memory_pool.h"
+
+struct rc_list * rc_list(struct memory_pool * pool, void * item)
+{
+	struct rc_list * new = memory_pool_malloc(pool, sizeof(struct rc_list));
+	new->Item = item;
+	new->Next = NULL;
+	new->Prev = NULL;
+
+	return new;
+}
+
+void rc_list_add(struct rc_list ** list, struct rc_list * new_value)
+{
+	struct rc_list * temp;
+
+	if (*list == NULL) {
+		*list = new_value;
+		return;
+	}
+
+	for (temp = *list; temp->Next; temp = temp->Next);
+
+	temp->Next = new_value;
+	new_value->Prev = temp;
+}
+
+void rc_list_remove(struct rc_list ** list, struct rc_list * rm_value)
+{
+	if (*list == rm_value) {
+		*list = rm_value->Next;
+		return;
+	}
+
+	rm_value->Prev->Next = rm_value->Next;
+	if (rm_value->Next) {
+		rm_value->Next->Prev = rm_value->Prev;
+	}
+}
+
+unsigned int rc_list_count(struct rc_list * list)
+{
+	unsigned int count = 0;
+	while (list) {
+		count++;
+		list = list->Next;
+	}
+	return count;
+}
+
+void rc_list_print(struct rc_list * list)
+{
+	while(list) {
+		fprintf(stderr, "%p->", list->Item);
+		list = list->Next;
+	}
+	fprintf(stderr, "\n");
+}
diff --git a/src/mesa/drivers/dri/r300/compiler/radeon_list.h b/src/mesa/drivers/dri/r300/compiler/radeon_list.h
new file mode 100644
index 00000000000..b3c8f89cc68
--- /dev/null
+++ b/src/mesa/drivers/dri/r300/compiler/radeon_list.h
@@ -0,0 +1,46 @@
+/*
+ * Copyright 2011 Tom Stellard <[email protected]>
+ *
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial
+ * portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+
+#ifndef RADEON_LIST_H
+#define RADEON_LIST_H
+
+struct memory_pool;
+
+struct rc_list {
+	void * Item;
+	struct rc_list * Prev;
+	struct rc_list * Next;
+};
+
+struct rc_list * rc_list(struct memory_pool * pool, void * item);
+void rc_list_add(struct rc_list ** list, struct rc_list * new_value);
+void rc_list_remove(struct rc_list ** list, struct rc_list * rm_value);
+unsigned int rc_list_count(struct rc_list * list);
+void rc_list_print(struct rc_list * list);
+
+#endif /* RADEON_LIST_H */
+
diff --git a/src/mesa/drivers/dri/r300/compiler/radeon_opcodes.c b/src/mesa/drivers/dri/r300/compiler/radeon_opcodes.c
index e3e498e8fb4..afd78ad79dd 100644
--- a/src/mesa/drivers/dri/r300/compiler/radeon_opcodes.c
+++ b/src/mesa/drivers/dri/r300/compiler/radeon_opcodes.c
@@ -481,6 +481,7 @@ void rc_compute_sources_for_writemask(
 			break;
 		case RC_OPCODE_TXB:
 		case RC_OPCODE_TXP:
+		case RC_OPCODE_TXL:
 			srcmasks[0] |= RC_MASK_W;
 			/* Fall through */
 		case RC_OPCODE_TEX:
@@ -500,6 +501,33 @@ void rc_compute_sources_for_writemask(
 					break;
 			}
 			break;
+		case RC_OPCODE_TXD:
+			switch (inst->U.I.TexSrcTarget) {
+				case RC_TEXTURE_1D_ARRAY:
+					srcmasks[0] |= RC_MASK_Y;
+					/* Fall through. */
+				case RC_TEXTURE_1D:
+					srcmasks[0] |= RC_MASK_X;
+					srcmasks[1] |= RC_MASK_X;
+					srcmasks[2] |= RC_MASK_X;
+					break;
+				case RC_TEXTURE_2D_ARRAY:
+					srcmasks[0] |= RC_MASK_Z;
+					/* Fall through. */
+				case RC_TEXTURE_2D:
+				case RC_TEXTURE_RECT:
+					srcmasks[0] |= RC_MASK_XY;
+					srcmasks[1] |= RC_MASK_XY;
+					srcmasks[2] |= RC_MASK_XY;
+					break;
+				case RC_TEXTURE_3D:
+				case RC_TEXTURE_CUBE:
+					srcmasks[0] |= RC_MASK_XYZ;
+					srcmasks[1] |= RC_MASK_XYZ;
+					srcmasks[2] |= RC_MASK_XYZ;
+					break;
+			}
+			break;
 		case RC_OPCODE_DST:
 			srcmasks[0] |= RC_MASK_Y | RC_MASK_Z;
 			srcmasks[1] |= RC_MASK_Y | RC_MASK_W;
diff --git a/src/mesa/drivers/dri/r300/compiler/radeon_optimize.c b/src/mesa/drivers/dri/r300/compiler/radeon_optimize.c
index 79898e1047e..5b4fba80873 100644
--- a/src/mesa/drivers/dri/r300/compiler/radeon_optimize.c
+++ b/src/mesa/drivers/dri/r300/compiler/radeon_optimize.c
@@ -91,6 +91,8 @@ static void copy_propagate_scan_read(void * data, struct rc_instruction * inst,
 				(inst->U.I.Opcode == RC_OPCODE_TEX ||
 				inst->U.I.Opcode == RC_OPCODE_TXB ||
 				inst->U.I.Opcode == RC_OPCODE_TXP ||
+				inst->U.I.Opcode == RC_OPCODE_TXD ||
+				inst->U.I.Opcode == RC_OPCODE_TXL ||
 				inst->U.I.Opcode == RC_OPCODE_KIL)){
 		reader_data->Abort = 1;
 		return;
@@ -144,6 +146,7 @@ static void copy_propagate(struct radeon_compiler * c, struct rc_instruction * i
 		return;
 
 	/* Get a list of all the readers of this MOV instruction. */
+	reader_data.ExitOnAbort = 1;
 	rc_get_readers(c, inst_mov, &reader_data,
 		       copy_propagate_scan_read, NULL,
 		       is_src_clobbered_scan_write);
@@ -154,7 +157,7 @@ static void copy_propagate(struct radeon_compiler * c, struct rc_instruction * i
 	/* Propagate the MOV instruction. */
 	for (i = 0; i < reader_data.ReaderCount; i++) {
 		struct rc_instruction * inst = reader_data.Readers[i].Inst;
-		*reader_data.Readers[i].U.Src = chain_srcregs(*reader_data.Readers[i].U.Src, inst_mov->U.I.SrcReg[0]);
+		*reader_data.Readers[i].U.I.Src = chain_srcregs(*reader_data.Readers[i].U.I.Src, inst_mov->U.I.SrcReg[0]);
 
 		if (inst_mov->U.I.SrcReg[0].File == RC_FILE_PRESUB)
 			inst->U.I.PreSub = inst_mov->U.I.PreSub;
@@ -453,6 +456,7 @@ static int presub_helper(
 	rc_presubtract_op cb_op = presub_opcode;
 
 	reader_data.CbData = &cb_op;
+	reader_data.ExitOnAbort = 1;
 	rc_get_readers(c, inst_add, &reader_data, presub_scan_read, NULL,
 						is_src_clobbered_scan_write);
 
@@ -466,7 +470,7 @@ static int presub_helper(
 				rc_get_opcode_info(reader.Inst->U.I.Opcode);
 
 		for (src_index = 0; src_index < info->NumSrcRegs; src_index++) {
-			if (&reader.Inst->U.I.SrcReg[src_index] == reader.U.Src)
+			if (&reader.Inst->U.I.SrcReg[src_index] == reader.U.I.Src)
 				presub_replace(inst_add, reader.Inst, src_index);
 		}
 	}
@@ -619,13 +623,11 @@ static int peephole_add_presub_inv(
 	struct radeon_compiler * c,
 	struct rc_instruction * inst_add)
 {
-	unsigned int i, swz, mask;
+	unsigned int i, swz;
 
 	if (!is_presub_candidate(c, inst_add))
 		return 0;
 
-	mask = inst_add->U.I.DstReg.WriteMask;
-
 	/* Check if src0 is 1. */
 	/* XXX It would be nice to use is_src_uniform_constant here, but that
 	 * function only works if the register's file is RC_FILE_NONE */
diff --git a/src/mesa/drivers/dri/r300/compiler/radeon_pair_dead_sources.c b/src/mesa/drivers/dri/r300/compiler/radeon_pair_dead_sources.c
new file mode 100644
index 00000000000..1e9a2c09d44
--- /dev/null
+++ b/src/mesa/drivers/dri/r300/compiler/radeon_pair_dead_sources.c
@@ -0,0 +1,62 @@
+
+#include "radeon_compiler.h"
+#include "radeon_compiler_util.h"
+#include "radeon_opcodes.h"
+#include "radeon_program_pair.h"
+
+static void mark_used_presub(struct rc_pair_sub_instruction * sub)
+{
+	if (sub->Src[RC_PAIR_PRESUB_SRC].Used) {
+		unsigned int presub_reg_count = rc_presubtract_src_reg_count(
+					sub->Src[RC_PAIR_PRESUB_SRC].Index);
+		unsigned int i;
+		for (i = 0; i < presub_reg_count; i++) {
+			sub->Src[i].Used = 1;
+		}
+	}
+}
+
+static void mark_used(
+	struct rc_instruction * inst,
+	struct rc_pair_sub_instruction * sub)
+{
+	unsigned int i;
+	const struct rc_opcode_info * info = rc_get_opcode_info(sub->Opcode);
+	for (i = 0; i < info->NumSrcRegs; i++) {
+		unsigned int src_type = rc_source_type_swz(sub->Arg[i].Swizzle);
+		if (src_type & RC_SOURCE_RGB) {
+			inst->U.P.RGB.Src[sub->Arg[i].Source].Used = 1;
+		}
+
+		if (src_type & RC_SOURCE_ALPHA) {
+			inst->U.P.Alpha.Src[sub->Arg[i].Source].Used = 1;
+		}
+	}
+}
+
+/**
+ * This pass finds sources that are not used by their instruction and marks
+ * them as unused.
+ */
+void rc_pair_remove_dead_sources(struct radeon_compiler * c, void *user)
+{
+	struct rc_instruction * inst;
+	for (inst = c->Program.Instructions.Next;
+					inst != &c->Program.Instructions;
+					inst = inst->Next) {
+		unsigned int i;
+		if (inst->Type == RC_INSTRUCTION_NORMAL)
+			continue;
+
+		/* Mark all sources as unused */
+		for (i = 0; i < 4; i++) {
+			inst->U.P.RGB.Src[i].Used = 0;
+			inst->U.P.Alpha.Src[i].Used = 0;
+		}
+		mark_used(inst, &inst->U.P.RGB);
+		mark_used(inst, &inst->U.P.Alpha);
+
+		mark_used_presub(&inst->U.P.RGB);
+		mark_used_presub(&inst->U.P.Alpha);
+	}
+}
diff --git a/src/mesa/drivers/dri/r300/compiler/radeon_pair_regalloc.c b/src/mesa/drivers/dri/r300/compiler/radeon_pair_regalloc.c
index d53181e1f75..49983d6ce75 100644
--- a/src/mesa/drivers/dri/r300/compiler/radeon_pair_regalloc.c
+++ b/src/mesa/drivers/dri/r300/compiler/radeon_pair_regalloc.c
@@ -1,5 +1,6 @@
 /*
  * Copyright (C) 2009 Nicolai Haehnle.
+ * Copyright 2011 Tom Stellard <[email protected]>
  *
  * All Rights Reserved.
  *
@@ -29,125 +30,126 @@
 
 #include <stdio.h>
 
+#include "main/glheader.h"
+#include "program/register_allocate.h"
+#include "ralloc.h"
+
+#include "r300_fragprog_swizzle.h"
 #include "radeon_compiler.h"
+#include "radeon_compiler_util.h"
 #include "radeon_dataflow.h"
-
+#include "radeon_list.h"
+#include "radeon_variable.h"
 
 #define VERBOSE 0
 
 #define DBG(...) do { if (VERBOSE) fprintf(stderr, __VA_ARGS__); } while(0)
 
 
-struct live_intervals {
-	int Start;
-	int End;
-	struct live_intervals * Next;
-};
 
 struct register_info {
-	struct live_intervals Live;
+	struct live_intervals Live[4];
 
 	unsigned int Used:1;
 	unsigned int Allocated:1;
 	unsigned int File:3;
 	unsigned int Index:RC_REGISTER_INDEX_BITS;
-};
-
-struct hardware_register {
-	struct live_intervals * Used;
+	unsigned int Writemask;
 };
 
 struct regalloc_state {
 	struct radeon_compiler * C;
 
-	struct register_info Input[RC_REGISTER_MAX_INDEX];
-	struct register_info Temporary[RC_REGISTER_MAX_INDEX];
-
-	struct hardware_register * HwTemporary;
-	unsigned int NumHwTemporaries;
-	/**
-	 * If an instruction is inside of a loop, EndLoop will be the
-	 * IP of the ENDLOOP instruction, and BeginLoop will be the IP
-	 * of the BGNLOOP instruction.  Otherwise, EndLoop and BeginLoop
-	 * will be -1.
-	 */
-	int EndLoop;
-	int BeginLoop;
+	struct register_info * Input;
+	unsigned int NumInputs;
+
+	struct register_info * Temporary;
+	unsigned int NumTemporaries;
+
+	unsigned int Simple;
+	int LoopEnd;
+};
+
+enum rc_reg_class {
+	RC_REG_CLASS_SINGLE,
+	RC_REG_CLASS_DOUBLE,
+	RC_REG_CLASS_TRIPLE,
+	RC_REG_CLASS_ALPHA,
+	RC_REG_CLASS_SINGLE_PLUS_ALPHA,
+	RC_REG_CLASS_DOUBLE_PLUS_ALPHA,
+	RC_REG_CLASS_TRIPLE_PLUS_ALPHA,
+	RC_REG_CLASS_X,
+	RC_REG_CLASS_Y,
+	RC_REG_CLASS_Z,
+	RC_REG_CLASS_XY,
+	RC_REG_CLASS_YZ,
+	RC_REG_CLASS_XZ,
+	RC_REG_CLASS_XW,
+	RC_REG_CLASS_YW,
+	RC_REG_CLASS_ZW,
+	RC_REG_CLASS_XYW,
+	RC_REG_CLASS_YZW,
+	RC_REG_CLASS_XZW,
+	RC_REG_CLASS_COUNT
+};
+
+struct rc_class {
+	enum rc_reg_class Class;
+
+	unsigned int WritemaskCount;
+
+	/** This is 1 if this class is being used by the register allocator
+	 * and 0 otherwise */
+	unsigned int Used;
+
+	/** This is the ID number assigned to this class by ra. */
+	unsigned int Id;
+
+	/** List of writemasks that belong to this class */
+	unsigned int Writemasks[3];
+
+
 };
 
 static void print_live_intervals(struct live_intervals * src)
 {
-	if (!src) {
+	if (!src || !src->Used) {
 		DBG("(null)");
 		return;
 	}
 
-	while(src) {
-		DBG("(%i,%i)", src->Start, src->End);
-		src = src->Next;
-	}
+	DBG("(%i,%i)", src->Start, src->End);
 }
 
-static void add_live_intervals(struct regalloc_state * s,
-		struct live_intervals ** dst, struct live_intervals * src)
+static int overlap_live_intervals(struct live_intervals * a, struct live_intervals * b)
 {
-	struct live_intervals ** dst_backup = dst;
-
 	if (VERBOSE) {
-		DBG("add_live_intervals: ");
-		print_live_intervals(*dst);
+		DBG("overlap_live_intervals: ");
+		print_live_intervals(a);
 		DBG(" to ");
-		print_live_intervals(src);
-		DBG("\n");
-	}
-
-	while(src) {
-		if (*dst && (*dst)->End < src->Start) {
-			dst = &(*dst)->Next;
-		} else if (!*dst || (*dst)->Start > src->End) {
-			struct live_intervals * li = memory_pool_malloc(&s->C->Pool, sizeof(*li));
-			li->Start = src->Start;
-			li->End = src->End;
-			li->Next = *dst;
-			*dst = li;
-			src = src->Next;
-		} else {
-			if (src->End > (*dst)->End)
-				(*dst)->End = src->End;
-			if (src->Start < (*dst)->Start)
-				(*dst)->Start = src->Start;
-			src = src->Next;
-		}
-	}
-
-	if (VERBOSE) {
-		DBG("    result: ");
-		print_live_intervals(*dst_backup);
+		print_live_intervals(b);
 		DBG("\n");
 	}
-}
 
-static int overlap_live_intervals(struct live_intervals * dst, struct live_intervals * src)
-{
-	if (VERBOSE) {
-		DBG("overlap_live_intervals: ");
-		print_live_intervals(dst);
-		DBG(" to ");
-		print_live_intervals(src);
-		DBG("\n");
+	if (!a->Used || !b->Used) {
+		DBG("    unused interval\n");
+		return 0;
 	}
 
-	while(src && dst) {
-		if (dst->End <= src->Start) {
-			dst = dst->Next;
-		} else if (dst->End <= src->End) {
+	if (a->Start > b->Start) {
+		if (a->Start < b->End) {
+			DBG("    overlap\n");
+			return 1;
+		}
+	} else if (b->Start > a->Start) {
+		if (b->Start < a->End) {
 			DBG("    overlap\n");
 			return 1;
-		} else if (dst->Start < src->End) {
+		}
+	} else { /* a->Start == b->Start */
+		if (a->Start != a->End && b->Start != b->End) {
 			DBG("    overlap\n");
 			return 1;
-		} else {
-			src = src->Next;
 		}
 	}
 
@@ -156,92 +158,27 @@ static int overlap_live_intervals(struct live_intervals * dst, struct live_inter
 	return 0;
 }
 
-static int try_add_live_intervals(struct regalloc_state * s,
-		struct live_intervals ** dst, struct live_intervals * src)
-{
-	if (overlap_live_intervals(*dst, src))
-		return 0;
-
-	add_live_intervals(s, dst, src);
-	return 1;
-}
-
-static void scan_callback(void * data, struct rc_instruction * inst,
+static void scan_read_callback(void * data, struct rc_instruction * inst,
 		rc_register_file file, unsigned int index, unsigned int mask)
 {
 	struct regalloc_state * s = data;
 	struct register_info * reg;
+	unsigned int i;
 
-	if (file == RC_FILE_TEMPORARY)
-		reg = &s->Temporary[index];
-	else if (file == RC_FILE_INPUT)
-		reg = &s->Input[index];
-	else
+	if (file != RC_FILE_INPUT)
 		return;
 
-	if (!reg->Used) {
-		reg->Used = 1;
-		if (file == RC_FILE_INPUT)
-			reg->Live.Start = -1;
-		else if (s->BeginLoop >= 0)
-			reg->Live.Start = s->BeginLoop;
-		else
-			reg->Live.Start = inst->IP;
-		reg->Live.End = inst->IP;
-	} else if (s->EndLoop >= 0)
-		reg->Live.End = s->EndLoop;
-	else if (inst->IP > reg->Live.End)
-		reg->Live.End = inst->IP;
-}
+	s->Input[index].Used = 1;
+	reg = &s->Input[index];
 
-static void compute_live_intervals(struct radeon_compiler *c,
-				   struct regalloc_state *s)
-{
-	memset(s, 0, sizeof(*s));
-	s->C = c;
-	s->NumHwTemporaries = c->max_temp_regs;
-	s->BeginLoop = -1;
-	s->EndLoop = -1;
-	s->HwTemporary =
-		memory_pool_malloc(&c->Pool,
-				   s->NumHwTemporaries * sizeof(struct hardware_register));
-	memset(s->HwTemporary, 0, s->NumHwTemporaries * sizeof(struct hardware_register));
-
-	rc_recompute_ips(s->C);
-
-	for(struct rc_instruction * inst = s->C->Program.Instructions.Next;
-	    inst != &s->C->Program.Instructions;
-	    inst = inst->Next) {
-
-		/* For all instructions inside of a loop, the ENDLOOP
-		 * instruction is used as the end of the live interval and
-		 * the BGNLOOP instruction is used as the beginning. */
-		if (inst->U.I.Opcode == RC_OPCODE_BGNLOOP && s->EndLoop < 0) {
-			int loops = 1;
-			struct rc_instruction * tmp;
-			s->BeginLoop = inst->IP;
-			for(tmp = inst->Next;
-					tmp != &s->C->Program.Instructions;
-					tmp = tmp->Next) {
-				if (tmp->U.I.Opcode == RC_OPCODE_BGNLOOP) {
-					loops++;
-				} else if (tmp->U.I.Opcode
-							== RC_OPCODE_ENDLOOP) {
-					if(!--loops) {
-						s->EndLoop = tmp->IP;
-						break;
-					}
-				}
-			}
-		}
-
-		if (inst->IP == s->EndLoop) {
-			s->EndLoop = -1;
-			s->BeginLoop = -1;
+	for (i = 0; i < 4; i++) {
+		if (!((mask >> i) & 0x1)) {
+			continue;
 		}
-
-		rc_for_all_reads_mask(inst, scan_callback, s);
-		rc_for_all_writes_mask(inst, scan_callback, s);
+		reg->Live[i].Used = 1;
+		reg->Live[i].Start = 0;
+		reg->Live[i].End =
+			s->LoopEnd > inst->IP ? s->LoopEnd : inst->IP;
 	}
 }
 
@@ -251,7 +188,7 @@ static void remap_register(void * data, struct rc_instruction * inst,
 	struct regalloc_state * s = data;
 	const struct register_info * reg;
 
-	if (*file == RC_FILE_TEMPORARY)
+	if (*file == RC_FILE_TEMPORARY && s->Simple)
 		reg = &s->Temporary[*index];
 	else if (*file == RC_FILE_INPUT)
 		reg = &s->Input[*index];
@@ -259,106 +196,511 @@ static void remap_register(void * data, struct rc_instruction * inst,
 		return;
 
 	if (reg->Allocated) {
-		*file = reg->File;
 		*index = reg->Index;
 	}
 }
 
-static void do_regalloc(struct regalloc_state * s)
+static void alloc_input_simple(void * data, unsigned int input,
+							unsigned int hwreg)
+{
+	struct regalloc_state * s = data;
+
+	if (input >= s->NumInputs)
+		return;
+
+	s->Input[input].Allocated = 1;
+	s->Input[input].File = RC_FILE_TEMPORARY;
+	s->Input[input].Index = hwreg;
+}
+
+/* This function offsets the temporary register indices by the number
+ * of input registers, because input registers are actually temporaries and
+ * should not occupy the same space.
+ *
+ * This pass is supposed to be used to maintain correct allocation of inputs
+ * if the standard register allocation is disabled. */
+static void do_regalloc_inputs_only(struct regalloc_state * s)
+{
+	for (unsigned i = 0; i < s->NumTemporaries; i++) {
+		s->Temporary[i].Allocated = 1;
+		s->Temporary[i].File = RC_FILE_TEMPORARY;
+		s->Temporary[i].Index = i + s->NumInputs;
+	}
+}
+
+static unsigned int is_derivative(rc_opcode op)
 {
-	/* Simple and stupid greedy register allocation */
-	for(unsigned int index = 0; index < RC_REGISTER_MAX_INDEX; ++index) {
-		struct register_info * reg = &s->Temporary[index];
+	return (op == RC_OPCODE_DDX || op == RC_OPCODE_DDY);
+}
 
-		if (!reg->Used)
+static int find_class(
+	struct rc_class * classes,
+	unsigned int writemask,
+	unsigned int max_writemask_count)
+{
+	unsigned int i;
+	for (i = 0; i < RC_REG_CLASS_COUNT; i++) {
+		unsigned int j;
+		if (classes[i].WritemaskCount > max_writemask_count) {
 			continue;
+		}
+		for (j = 0; j < 3; j++) {
+			if (classes[i].Writemasks[j] == writemask) {
+				return i;
+			}
+		}
+	}
+	return -1;
+}
+
+static enum rc_reg_class variable_get_class(
+	struct rc_variable * variable,
+	struct rc_class * classes)
+{
+	unsigned int i;
+	unsigned int can_change_writemask= 1;
+	unsigned int writemask = rc_variable_writemask_sum(variable);
+	struct rc_list * readers = rc_variable_readers_union(variable);
+	int class_index;
+
+	if (!variable->C->is_r500) {
+		struct rc_class c;
+		/* The assumption here is that if an instruction has type
+		 * RC_INSTRUCTION_NORMAL then it is a TEX instruction.
+		 * r300 and r400 can't swizzle the result of a TEX lookup. */
+		if (variable->Inst->Type == RC_INSTRUCTION_NORMAL) {
+			writemask = RC_MASK_XYZW;
+		}
 
-		for(unsigned int hwreg = 0; hwreg < s->NumHwTemporaries; ++hwreg) {
-			if (try_add_live_intervals(s, &s->HwTemporary[hwreg].Used, &reg->Live)) {
-				reg->Allocated = 1;
-				reg->File = RC_FILE_TEMPORARY;
-				reg->Index = hwreg;
-				goto success;
+		/* Check if it is possible to do swizzle packing for r300/r400
+		 * without creating non-native swizzles. */
+		class_index = find_class(classes, writemask, 3);
+		if (class_index < 0) {
+			goto error;
+		}
+		c = classes[class_index];
+		for (i = 0; i < c.WritemaskCount; i++) {
+			int j;
+			unsigned int conversion_swizzle =
+						rc_make_conversion_swizzle(
+						writemask, c.Writemasks[i]);
+			for (j = 0; j < variable->ReaderCount; j++) {
+				unsigned int old_swizzle;
+				unsigned int new_swizzle;
+				struct rc_reader r = variable->Readers[j];
+				if (r.Inst->Type == RC_INSTRUCTION_PAIR ) {
+					old_swizzle = r.U.P.Arg->Swizzle;
+				} else {
+					old_swizzle = r.U.I.Src->Swizzle;
+				}
+				new_swizzle = rc_adjust_channels(
+					old_swizzle, conversion_swizzle);
+				if (!r300_swizzle_is_native_basic(new_swizzle)) {
+					can_change_writemask = 0;
+					break;
+				}
+			}
+			if (!can_change_writemask) {
+				break;
 			}
 		}
+	}
 
-		rc_error(s->C, "Ran out of hardware temporaries\n");
-		return;
+	if (variable->Inst->Type == RC_INSTRUCTION_PAIR) {
+		/* DDX/DDY seem to always fail when their writemasks are
+		 * changed.*/
+		if (is_derivative(variable->Inst->U.P.RGB.Opcode)
+		    || is_derivative(variable->Inst->U.P.Alpha.Opcode)) {
+			can_change_writemask = 0;
+		}
+	}
+	for ( ; readers; readers = readers->Next) {
+		struct rc_reader * r = readers->Item;
+		if (r->Inst->Type == RC_INSTRUCTION_PAIR) {
+			if (r->U.P.Arg->Source == RC_PAIR_PRESUB_SRC) {
+				can_change_writemask = 0;
+				break;
+			}
+			/* DDX/DDY also fail when their swizzles are changed. */
+			if (is_derivative(r->Inst->U.P.RGB.Opcode)
+			    || is_derivative(r->Inst->U.P.Alpha.Opcode)) {
+				can_change_writemask = 0;
+				break;
+			}
+		}
+	}
 
-	success:;
+	class_index = find_class(classes, writemask,
+						can_change_writemask ? 3 : 1);
+	if (class_index > -1) {
+		return classes[class_index].Class;
+	} else {
+error:
+		rc_error(variable->C,
+				"Could not find class for index=%u mask=%u\n",
+				variable->Dst.Index, writemask);
+		return 0;
 	}
+}
 
-	/* Rewrite all instructions based on the translation table we built */
-	for(struct rc_instruction * inst = s->C->Program.Instructions.Next;
-	    inst != &s->C->Program.Instructions;
-	    inst = inst->Next) {
-		rc_remap_registers(inst, &remap_register, s);
+static unsigned int overlap_live_intervals_array(
+	struct live_intervals * a,
+	struct live_intervals * b)
+{
+	unsigned int a_chan, b_chan;
+	for (a_chan = 0; a_chan < 4; a_chan++) {
+		for (b_chan = 0; b_chan < 4; b_chan++) {
+			if (overlap_live_intervals(&a[a_chan], &b[b_chan])) {
+					return 1;
+			}
+		}
 	}
+	return 0;
 }
 
-static void alloc_input(void * data, unsigned int input, unsigned int hwreg)
+static unsigned int reg_get_index(int reg)
 {
-	struct regalloc_state * s = data;
+	return reg / RC_MASK_XYZW;
+}
 
-	if (!s->Input[input].Used)
-		return;
+static unsigned int reg_get_writemask(int reg)
+{
+	return (reg % RC_MASK_XYZW) + 1;
+}
 
-	add_live_intervals(s, &s->HwTemporary[hwreg].Used, &s->Input[input].Live);
+static int get_reg_id(unsigned int index, unsigned int writemask)
+{
+	assert(writemask);
+	if (writemask == 0) {
+		return 0;
+	}
+	return (index * RC_MASK_XYZW) + (writemask - 1);
+}
 
-	s->Input[input].Allocated = 1;
-	s->Input[input].File = RC_FILE_TEMPORARY;
-	s->Input[input].Index = hwreg;
+#if VERBOSE
+static void print_reg(int reg)
+{
+	unsigned int index = reg_get_index(reg);
+	unsigned int mask = reg_get_writemask(reg);
+	fprintf(stderr, "Temp[%u].%c%c%c%c", index,
+		mask & RC_MASK_X ? 'x' : '_',
+		mask & RC_MASK_Y ? 'y' : '_',
+		mask & RC_MASK_Z ? 'z' : '_',
+		mask & RC_MASK_W ? 'w' : '_');
+}
+#endif
 
+static void add_register_conflicts(
+	struct ra_regs * regs,
+	unsigned int max_temp_regs)
+{
+	unsigned int index, a_mask, b_mask;
+	for (index = 0; index < max_temp_regs; index++) {
+		for(a_mask = 1; a_mask <= RC_MASK_XYZW; a_mask++) {
+			for (b_mask = a_mask + 1; b_mask <= RC_MASK_XYZW;
+								b_mask++) {
+				if (a_mask & b_mask) {
+					ra_add_reg_conflict(regs,
+						get_reg_id(index, a_mask),
+						get_reg_id(index, b_mask));
+				}
+			}
+		}
+	}
 }
 
-void rc_pair_regalloc(struct radeon_compiler *cc, void *user)
+static void do_advanced_regalloc(struct regalloc_state * s)
 {
-	struct r300_fragment_program_compiler *c = (struct r300_fragment_program_compiler*)cc;
-	struct regalloc_state s;
+	struct rc_class rc_class_list [] = {
+		{RC_REG_CLASS_SINGLE, 3, 0, 0,
+			{RC_MASK_X,
+			 RC_MASK_Y,
+			 RC_MASK_Z}},
+		{RC_REG_CLASS_DOUBLE, 3, 0, 0,
+			{RC_MASK_X | RC_MASK_Y,
+			 RC_MASK_X | RC_MASK_Z,
+			 RC_MASK_Y | RC_MASK_Z}},
+		{RC_REG_CLASS_TRIPLE, 1, 0, 0,
+			{RC_MASK_X | RC_MASK_Y | RC_MASK_Z,
+			 RC_MASK_NONE,
+			 RC_MASK_NONE}},
+		{RC_REG_CLASS_ALPHA, 1, 0, 0,
+			{RC_MASK_W,
+			 RC_MASK_NONE,
+			 RC_MASK_NONE}},
+		{RC_REG_CLASS_SINGLE_PLUS_ALPHA, 3, 0, 0,
+			{RC_MASK_X | RC_MASK_W,
+			 RC_MASK_Y | RC_MASK_W,
+			 RC_MASK_Z | RC_MASK_W}},
+		{RC_REG_CLASS_DOUBLE_PLUS_ALPHA, 3, 0, 0,
+			{RC_MASK_X | RC_MASK_Y | RC_MASK_W,
+			 RC_MASK_X | RC_MASK_Z | RC_MASK_W,
+			 RC_MASK_Y | RC_MASK_Z | RC_MASK_W}},
+		{RC_REG_CLASS_TRIPLE_PLUS_ALPHA, 1, 0, 0,
+			{RC_MASK_X | RC_MASK_Y | RC_MASK_Z | RC_MASK_W,
+			RC_MASK_NONE,
+			RC_MASK_NONE}},
+		{RC_REG_CLASS_X, 1, 0, 0,
+			{RC_MASK_X,
+			RC_MASK_NONE,
+			RC_MASK_NONE}},
+		{RC_REG_CLASS_Y, 1, 0, 0,
+			{RC_MASK_Y,
+			RC_MASK_NONE,
+			RC_MASK_NONE}},
+		{RC_REG_CLASS_Z, 1, 0, 0,
+			{RC_MASK_Z,
+			RC_MASK_NONE,
+			RC_MASK_NONE}},
+		{RC_REG_CLASS_XY, 1, 0, 0,
+			{RC_MASK_X | RC_MASK_Y,
+			RC_MASK_NONE,
+			RC_MASK_NONE}},
+		{RC_REG_CLASS_YZ, 1, 0, 0,
+			{RC_MASK_Y | RC_MASK_Z,
+			RC_MASK_NONE,
+			RC_MASK_NONE}},
+		{RC_REG_CLASS_XZ, 1, 0, 0,
+			{RC_MASK_X | RC_MASK_Z,
+			RC_MASK_NONE,
+			RC_MASK_NONE}},
+		{RC_REG_CLASS_XW, 1, 0, 0,
+			{RC_MASK_X | RC_MASK_W,
+			RC_MASK_NONE,
+			RC_MASK_NONE}},
+		{RC_REG_CLASS_YW, 1, 0, 0,
+			{RC_MASK_Y | RC_MASK_W,
+			RC_MASK_NONE,
+			RC_MASK_NONE}},
+		{RC_REG_CLASS_ZW, 1, 0, 0,
+			{RC_MASK_Z | RC_MASK_W,
+			RC_MASK_NONE,
+			RC_MASK_NONE}},
+		{RC_REG_CLASS_XYW, 1, 0, 0,
+			{RC_MASK_X | RC_MASK_Y | RC_MASK_W,
+			RC_MASK_NONE,
+			RC_MASK_NONE}},
+		{RC_REG_CLASS_YZW, 1, 0, 0,
+			{RC_MASK_Y | RC_MASK_Z | RC_MASK_W,
+			RC_MASK_NONE,
+			RC_MASK_NONE}},
+		{RC_REG_CLASS_XZW, 1, 0, 0,
+			{RC_MASK_X | RC_MASK_Z | RC_MASK_W,
+			RC_MASK_NONE,
+			RC_MASK_NONE}}
+	};
+
+	unsigned int i, j, index, input_node, node_count, node_index;
+	unsigned int * node_classes;
+	unsigned int * input_classes;
+	struct rc_instruction * inst;
+	struct rc_list * var_ptr;
+	struct rc_list * variables;
+	struct ra_regs * regs;
+	struct ra_graph * graph;
+
+	/* Allocate the main ra data structure */
+	regs = ra_alloc_reg_set(s->C->max_temp_regs * RC_MASK_XYZW);
+
+	/* Get list of program variables */
+	variables = rc_get_variables(s->C);
+	node_count = rc_list_count(variables);
+	node_classes = memory_pool_malloc(&s->C->Pool,
+			node_count * sizeof(unsigned int));
+	input_classes = memory_pool_malloc(&s->C->Pool,
+			s->NumInputs * sizeof(unsigned int));
+
+	for (var_ptr = variables, node_index = 0; var_ptr;
+					var_ptr = var_ptr->Next, node_index++) {
+		unsigned int class_index;
+		/* Compute the live intervals */
+		rc_variable_compute_live_intervals(var_ptr->Item);
+
+		class_index = variable_get_class(var_ptr->Item,	rc_class_list);
+
+		/* If we haven't used this register class yet, mark it
+		 * as used and allocate space for it. */
+		if (!rc_class_list[class_index].Used) {
+			rc_class_list[class_index].Used = 1;
+			rc_class_list[class_index].Id = ra_alloc_reg_class(regs);
+		}
 
-	compute_live_intervals(cc, &s);
+		node_classes[node_index] = rc_class_list[class_index].Id;
+	}
 
-	c->AllocateHwInputs(c, &alloc_input, &s);
 
-	do_regalloc(&s);
-}
+	/* Assign registers to the classes */
+	for (i = 0; i < RC_REG_CLASS_COUNT; i++) {
+		struct rc_class class = rc_class_list[i];
+		if (!class.Used) {
+			continue;
+		}
 
-/* This functions offsets the temporary register indices by the number
- * of input registers, because input registers are actually temporaries and
- * should not occupy the same space.
- *
- * This pass is supposed to be used to maintain correct allocation of inputs
- * if the standard register allocation is disabled. */
-void rc_pair_regalloc_inputs_only(struct radeon_compiler *cc, void *user)
-{
-	struct r300_fragment_program_compiler *c = (struct r300_fragment_program_compiler*)cc;
-	struct regalloc_state s;
-	int temp_reg_offset;
+		for (index = 0; index < s->C->max_temp_regs; index++) {
+			for (j = 0; j < class.WritemaskCount; j++) {
+				int reg_id = get_reg_id(index,
+							class.Writemasks[j]);
+				ra_class_add_reg(regs, class.Id, reg_id);
+			}
+		}
+	}
+
+	/* Add register conflicts */
+	add_register_conflicts(regs, s->C->max_temp_regs);
+
+	/* Calculate live intervals for input registers */
+	for (inst = s->C->Program.Instructions.Next;
+					inst != &s->C->Program.Instructions;
+					inst = inst->Next) {
+		rc_opcode op = rc_get_flow_control_inst(inst);
+		if (op == RC_OPCODE_BGNLOOP) {
+			struct rc_instruction * endloop =
+							rc_match_bgnloop(inst);
+			if (endloop->IP > s->LoopEnd) {
+				s->LoopEnd = endloop->IP;
+			}
+		}
+		rc_for_all_reads_mask(inst, scan_read_callback, s);
+	}
 
-	compute_live_intervals(cc, &s);
+	/* Create classes for input registers */
+	for (i = 0; i < s->NumInputs; i++) {
+		unsigned int chan, class_id, writemask = 0;
+		for (chan = 0; chan < 4; chan++) {
+			if (s->Input[i].Live[chan].Used) {
+				writemask |= (1 << chan);
+			}
+		}
+		s->Input[i].Writemask = writemask;
+		if (!writemask) {
+			continue;
+		}
+
+		class_id = ra_alloc_reg_class(regs);
+		input_classes[i] = class_id;
+		ra_class_add_reg(regs, class_id,
+				get_reg_id(s->Input[i].Index, writemask));
+	}
 
-	c->AllocateHwInputs(c, &alloc_input, &s);
+	ra_set_finalize(regs);
 
-	temp_reg_offset = 0;
-	for (unsigned i = 0; i < RC_REGISTER_MAX_INDEX; i++) {
-		if (s.Input[i].Allocated && temp_reg_offset <= s.Input[i].Index)
-			temp_reg_offset = s.Input[i].Index + 1;
+	graph = ra_alloc_interference_graph(regs, node_count + s->NumInputs);
+
+	/* Build the interference graph */
+	for (var_ptr = variables, node_index = 0; var_ptr;
+					var_ptr = var_ptr->Next,node_index++) {
+		struct rc_list * a, * b;
+		unsigned int b_index;
+
+		ra_set_node_class(graph, node_index, node_classes[node_index]);
+
+		for (a = var_ptr, b = var_ptr->Next, b_index = node_index + 1;
+						b; b = b->Next, b_index++) {
+			struct rc_variable * var_a = a->Item;
+			while (var_a) {
+				struct rc_variable * var_b = b->Item;
+				while (var_b) {
+					if (overlap_live_intervals_array(var_a->Live, var_b->Live)) {
+						ra_add_node_interference(graph,
+							node_index, b_index);
+					}
+					var_b = var_b->Friend;
+				}
+				var_a = var_a->Friend;
+			}
+		}
 	}
 
-	if (temp_reg_offset) {
-		for (unsigned i = 0; i < RC_REGISTER_MAX_INDEX; i++) {
-			if (s.Temporary[i].Used) {
-				s.Temporary[i].Allocated = 1;
-				s.Temporary[i].File = RC_FILE_TEMPORARY;
-				s.Temporary[i].Index = i + temp_reg_offset;
+	/* Add input registers to the interference graph */
+	for (i = 0, input_node = 0; i< s->NumInputs; i++) {
+		if (!s->Input[i].Writemask) {
+			continue;
+		}
+		ra_set_node_class(graph, node_count + input_node,
+							input_classes[i]);
+		for (var_ptr = variables, node_index = 0;
+				var_ptr; var_ptr = var_ptr->Next, node_index++) {
+			struct rc_variable * var = var_ptr->Item;
+			if (overlap_live_intervals_array(s->Input[i].Live,
+								var->Live)) {
+				ra_add_node_interference(graph, node_index,
+						node_count + input_node);
 			}
 		}
+		/* Manually allocate a register for this input */
+		ra_set_node_reg(graph, node_count + input_node, get_reg_id(
+				s->Input[i].Index, s->Input[i].Writemask));
+		input_node++;
+	}
+
+	if (!ra_allocate_no_spills(graph)) {
+		rc_error(s->C, "Ran out of hardware temporaries\n");
+		return;
+	}
+
+	/* Rewrite the registers */
+	for (var_ptr = variables, node_index = 0; var_ptr;
+				var_ptr = var_ptr->Next, node_index++) {
+		int reg = ra_get_node_reg(graph, node_index);
+		unsigned int writemask = reg_get_writemask(reg);
+		unsigned int index = reg_get_index(reg);
+		struct rc_variable * var = var_ptr->Item;
+
+		if (!s->C->is_r500 && var->Inst->Type == RC_INSTRUCTION_NORMAL) {
+			writemask = rc_variable_writemask_sum(var);
+		}
 
-		/* Rewrite all registers. */
-		for (struct rc_instruction *inst = cc->Program.Instructions.Next;
-		    inst != &cc->Program.Instructions;
-		    inst = inst->Next) {
-			rc_remap_registers(inst, &remap_register, &s);
+		if (var->Dst.File == RC_FILE_INPUT) {
+			continue;
 		}
+		rc_variable_change_dst(var, index, writemask);
+	}
+
+	ralloc_free(graph);
+	ralloc_free(regs);
+}
+
+/**
+ * Register allocation pass: allocates hardware inputs, runs either the full
+ * or the inputs-only allocator, and finally remaps every instruction.
+ * @param user Must point to an int (it is dereferenced unconditionally).  If
+ * that int is zero, only inputs are allocated (\sa do_regalloc_inputs_only);
+ * otherwise the full allocator is used (\sa do_advanced_regalloc).
+ */
+void rc_pair_regalloc(struct radeon_compiler *cc, void *user)
+{
+	struct r300_fragment_program_compiler *c =
+				(struct r300_fragment_program_compiler*)cc;
+	struct regalloc_state s;
+	int * do_full_regalloc = (int*)user;
+
+	memset(&s, 0, sizeof(s));
+	s.C = cc;
+	s.NumInputs = rc_get_max_index(cc, RC_FILE_INPUT) + 1;
+	s.Input = memory_pool_malloc(&cc->Pool,
+			s.NumInputs * sizeof(struct register_info));
+	memset(s.Input, 0, s.NumInputs * sizeof(struct register_info));
+
+	s.NumTemporaries = rc_get_max_index(cc, RC_FILE_TEMPORARY) + 1;
+	s.Temporary = memory_pool_malloc(&cc->Pool,
+			s.NumTemporaries * sizeof(struct register_info));
+	memset(s.Temporary, 0, s.NumTemporaries * sizeof(struct register_info));
+
+	rc_recompute_ips(s.C);
+
+	c->AllocateHwInputs(c, &alloc_input_simple, &s);
+	if (*do_full_regalloc) {
+		do_advanced_regalloc(&s);
+	} else {
+		s.Simple = 1;
+		do_regalloc_inputs_only(&s);
+	}
+
+	/* Rewrite inputs and, if we are doing the simple allocation, rewrite
+	 * temporaries too. */
+	for (struct rc_instruction *inst = s.C->Program.Instructions.Next;
+					inst != &s.C->Program.Instructions;
+					inst = inst->Next) {
+		rc_remap_registers(inst, &remap_register, &s);
 	}
 }
diff --git a/src/mesa/drivers/dri/r300/compiler/radeon_pair_schedule.c b/src/mesa/drivers/dri/r300/compiler/radeon_pair_schedule.c
index 8e10813ff06..25cd52c9cd4 100644
--- a/src/mesa/drivers/dri/r300/compiler/radeon_pair_schedule.c
+++ b/src/mesa/drivers/dri/r300/compiler/radeon_pair_schedule.c
@@ -709,7 +709,7 @@ static int convert_rgb_to_alpha(
 
 	pair_inst->Alpha.Opcode = pair_inst->RGB.Opcode;
 	pair_inst->Alpha.DestIndex = new_index;
-	pair_inst->Alpha.WriteMask = 1;
+	pair_inst->Alpha.WriteMask = RC_MASK_W;
 	pair_inst->Alpha.Target = pair_inst->RGB.Target;
 	pair_inst->Alpha.OutputWriteMask = pair_inst->RGB.OutputWriteMask;
 	pair_inst->Alpha.DepthWriteMask = pair_inst->RGB.DepthWriteMask;
@@ -739,7 +739,7 @@ static int convert_rgb_to_alpha(
 
 	for(i = 0; i < sched_inst->GlobalReaders.ReaderCount; i++) {
 		struct rc_reader reader = sched_inst->GlobalReaders.Readers[i];
-		rgb_to_alpha_remap(reader.Inst, reader.U.Arg,
+		rgb_to_alpha_remap(reader.Inst, reader.U.P.Arg,
 					RC_FILE_TEMPORARY, old_swz, new_index);
 	}
 	return 1;
@@ -952,6 +952,7 @@ static void schedule_block(struct r300_fragment_program_compiler * c,
 			instruction_ready(&s, s.Current);
 
 		/* Get global readers for possible RGB->Alpha conversion. */
+		s.Current->GlobalReaders.ExitOnAbort = 1;
 		rc_get_readers(s.C, inst, &s.Current->GlobalReaders,
 				is_rgb_to_alpha_possible_normal,
 				is_rgb_to_alpha_possible, NULL);
diff --git a/src/mesa/drivers/dri/r300/compiler/radeon_program.h b/src/mesa/drivers/dri/r300/compiler/radeon_program.h
index a07f6b63c6e..b899eccbf53 100644
--- a/src/mesa/drivers/dri/r300/compiler/radeon_program.h
+++ b/src/mesa/drivers/dri/r300/compiler/radeon_program.h
@@ -108,6 +108,9 @@ struct rc_sub_instruction {
 
 	/** True if tex instruction should do shadow comparison */
 	unsigned int TexShadow:1;
+
+	/** R500 only.  How to swizzle the result of a TEX lookup. */
+	unsigned int TexSwizzle:12;
 	/*@}*/
 
 	/** This holds information about the presubtract operation used by
diff --git a/src/mesa/drivers/dri/r300/compiler/radeon_program_constants.h b/src/mesa/drivers/dri/r300/compiler/radeon_program_constants.h
index 45f79ece5ba..24577333450 100644
--- a/src/mesa/drivers/dri/r300/compiler/radeon_program_constants.h
+++ b/src/mesa/drivers/dri/r300/compiler/radeon_program_constants.h
@@ -129,6 +129,7 @@ typedef enum {
 #define RC_SWIZZLE_0000 RC_MAKE_SWIZZLE_SMEAR(RC_SWIZZLE_ZERO)
 #define RC_SWIZZLE_1111 RC_MAKE_SWIZZLE_SMEAR(RC_SWIZZLE_ONE)
 #define RC_SWIZZLE_HHHH RC_MAKE_SWIZZLE_SMEAR(RC_SWIZZLE_HALF)
+#define RC_SWIZZLE_UUUU RC_MAKE_SWIZZLE_SMEAR(RC_SWIZZLE_UNUSED)
 
 /**
  * \name Bitmasks for components of vectors.
diff --git a/src/mesa/drivers/dri/r300/compiler/radeon_program_pair.c b/src/mesa/drivers/dri/r300/compiler/radeon_program_pair.c
index 68874795b8a..52315957520 100644
--- a/src/mesa/drivers/dri/r300/compiler/radeon_program_pair.c
+++ b/src/mesa/drivers/dri/r300/compiler/radeon_program_pair.c
@@ -223,3 +223,17 @@ struct rc_pair_instruction_source * rc_pair_get_src(
 		return NULL;
 	}
 }
+
+int rc_pair_get_src_index(
+	struct rc_pair_instruction * pair_inst,
+	struct rc_pair_instruction_source * src)
+{	/* Find which source slot of pair_inst the pointer src refers to. */
+	int i;
+	for (i = 0; i < 3; i++) {	/* only slots 0-2 of each array are checked */
+		if (&pair_inst->RGB.Src[i] == src
+			|| &pair_inst->Alpha.Src[i] == src) {
+			return i;	/* same slot number for RGB and Alpha */
+		}
+	}
+	return -1;	/* src is none of the RGB/Alpha sources checked above */
+}
diff --git a/src/mesa/drivers/dri/r300/compiler/radeon_program_pair.h b/src/mesa/drivers/dri/r300/compiler/radeon_program_pair.h
index d1a435fc530..a957ea9f7a0 100644
--- a/src/mesa/drivers/dri/r300/compiler/radeon_program_pair.h
+++ b/src/mesa/drivers/dri/r300/compiler/radeon_program_pair.h
@@ -114,6 +114,10 @@ void rc_pair_foreach_source_that_rgb_reads(
 struct rc_pair_instruction_source * rc_pair_get_src(
 	struct rc_pair_instruction * pair_inst,
 	struct rc_pair_instruction_arg * arg);
+
+int rc_pair_get_src_index(
+	struct rc_pair_instruction * pair_inst,
+	struct rc_pair_instruction_source * src);
 /*@}*/
 
 
@@ -127,6 +131,7 @@ void rc_pair_translate(struct radeon_compiler *cc, void *user);
 void rc_pair_schedule(struct radeon_compiler *cc, void *user);
 void rc_pair_regalloc(struct radeon_compiler *cc, void *user);
 void rc_pair_regalloc_inputs_only(struct radeon_compiler *cc, void *user);
+void rc_pair_remove_dead_sources(struct radeon_compiler *c, void *user);
 /*@}*/
 
 #endif /* __RADEON_PROGRAM_PAIR_H_ */
diff --git a/src/mesa/drivers/dri/r300/compiler/radeon_program_tex.c b/src/mesa/drivers/dri/r300/compiler/radeon_program_tex.c
index cef448ee4e1..8d16b2cf9ec 100644
--- a/src/mesa/drivers/dri/r300/compiler/radeon_program_tex.c
+++ b/src/mesa/drivers/dri/r300/compiler/radeon_program_tex.c
@@ -142,6 +142,8 @@ int radeonTransformTEX(
 	if (inst->U.I.Opcode != RC_OPCODE_TEX &&
 		inst->U.I.Opcode != RC_OPCODE_TXB &&
 		inst->U.I.Opcode != RC_OPCODE_TXP &&
+		inst->U.I.Opcode != RC_OPCODE_TXD &&
+		inst->U.I.Opcode != RC_OPCODE_TXL &&
 		inst->U.I.Opcode != RC_OPCODE_KIL)
 		return 0;
 
diff --git a/src/mesa/drivers/dri/r300/compiler/radeon_rename_regs.c b/src/mesa/drivers/dri/r300/compiler/radeon_rename_regs.c
index 5bd19c0b9c6..cafa0579734 100644
--- a/src/mesa/drivers/dri/r300/compiler/radeon_rename_regs.c
+++ b/src/mesa/drivers/dri/r300/compiler/radeon_rename_regs.c
@@ -71,6 +71,7 @@ void rc_rename_regs(struct radeon_compiler *c, void *user)
 		if (inst->U.I.DstReg.File != RC_FILE_TEMPORARY)
 			continue;
 
+		reader_data.ExitOnAbort = 1;
 		rc_get_readers(c, inst, &reader_data, NULL, NULL, NULL);
 
 		if (reader_data.Abort || reader_data.ReaderCount == 0)
@@ -85,7 +86,7 @@ void rc_rename_regs(struct radeon_compiler *c, void *user)
 
 		reader_data.Writer->U.I.DstReg.Index = new_index;
 		for(i = 0; i < reader_data.ReaderCount; i++) {
-			reader_data.Readers[i].U.Src->Index = new_index;
+			reader_data.Readers[i].U.I.Src->Index = new_index;
 		}
 	}
 }
diff --git a/src/mesa/drivers/dri/r300/compiler/radeon_variable.c b/src/mesa/drivers/dri/r300/compiler/radeon_variable.c
new file mode 100644
index 00000000000..16fa5d28902
--- /dev/null
+++ b/src/mesa/drivers/dri/r300/compiler/radeon_variable.c
@@ -0,0 +1,484 @@
+/*
+ * Copyright 2011 Tom Stellard <[email protected]>
+ *
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial
+ * portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+
+#include "radeon_variable.h"
+
+#include "memory_pool.h"
+#include "radeon_compiler_util.h"
+#include "radeon_dataflow.h"
+#include "radeon_list.h"
+#include "radeon_opcodes.h"
+#include "radeon_program.h"
+
+/**
+ * Rewrite the index and writemask for the destination register of var
+ * and its friends to new_index and new_writemask.  The source index and
+ * swizzle of every reader of var (and its friends) are rewritten to match.
+ */
+void rc_variable_change_dst(
+	struct rc_variable * var,
+	unsigned int new_index,
+	unsigned int new_writemask)
+{
+	struct rc_variable * var_ptr;
+	struct rc_list * readers;
+	unsigned int old_mask = rc_variable_writemask_sum(var);
+	unsigned int conversion_swizzle =
+			rc_make_conversion_swizzle(old_mask, new_writemask);
+
+	for (var_ptr = var; var_ptr; var_ptr = var_ptr->Friend) {
+		if (var_ptr->Inst->Type == RC_INSTRUCTION_NORMAL) {
+			rc_normal_rewrite_writemask(var_ptr->Inst,
+							conversion_swizzle);
+			var_ptr->Inst->U.I.DstReg.Index = new_index;
+		} else {
+			struct rc_pair_sub_instruction * sub;
+			if (var_ptr->Dst.WriteMask == RC_MASK_W) {
+				assert(new_writemask & RC_MASK_W);
+				sub = &var_ptr->Inst->U.P.Alpha;	/* only DestIndex changes */
+			} else {
+				sub = &var_ptr->Inst->U.P.RGB;
+				rc_pair_rewrite_writemask(sub,
+							conversion_swizzle);
+			}
+			sub->DestIndex = new_index;
+		}
+	}
+
+	readers = rc_variable_readers_union(var);
+
+	for ( ; readers; readers = readers->Next) {
+		struct rc_reader * reader = readers->Item;
+		if (reader->Inst->Type == RC_INSTRUCTION_NORMAL) {
+			reader->U.I.Src->Index = new_index;
+			reader->U.I.Src->Swizzle = rc_rewrite_swizzle(
+				reader->U.I.Src->Swizzle, conversion_swizzle);
+		} else {
+			struct rc_pair_instruction * pair_inst =
+							&reader->Inst->U.P;
+			unsigned int src_type = rc_source_type_swz(
+							reader->U.P.Arg->Swizzle);
+
+			int src_index = reader->U.P.Arg->Source;
+			if (src_index == RC_PAIR_PRESUB_SRC) {
+				src_index = rc_pair_get_src_index(
+						pair_inst, reader->U.P.Src);
+			}
+			/* Try to delete the old src, it is OK if this fails,
+			 * because rc_pair_alloc_source might be able to
+			 * find a source that can be reused.
+			 */
+			if (rc_pair_remove_src(reader->Inst, src_type,
+							src_index, old_mask)) {
+				/* Reuse the source index of the source that
+				 * was just deleted and set its register
+				 * index.  We can't use rc_pair_alloc_source
+				 * for this because it might return a source
+				 * index that is already being used. */
+				if (src_type & RC_SOURCE_RGB) {
+					pair_inst->RGB.Src[src_index]
+						.Used =	1;
+					pair_inst->RGB.Src[src_index]
+						.Index = new_index;
+					pair_inst->RGB.Src[src_index]
+						.File = RC_FILE_TEMPORARY;
+				}
+				if (src_type & RC_SOURCE_ALPHA) {
+					pair_inst->Alpha.Src[src_index]
+						.Used = 1;
+					pair_inst->Alpha.Src[src_index]
+						.Index = new_index;
+					pair_inst->Alpha.Src[src_index]
+						.File = RC_FILE_TEMPORARY;
+				}
+			} else {
+				src_index = rc_pair_alloc_source(
+						&reader->Inst->U.P,
+						src_type & RC_SOURCE_RGB,
+						src_type & RC_SOURCE_ALPHA,
+						RC_FILE_TEMPORARY,
+						new_index);
+				if (src_index < 0) {
+					rc_error(var->C, "Rewrite of inst %u failed "
+						"Can't allocate source for "
+						"Inst %u src_type=%x "
+						"new_index=%u new_mask=%u\n",
+						var->Inst->IP, reader->Inst->IP, src_type, new_index, new_writemask);
+						continue;	/* leave this reader's Arg untouched */
+				}
+			}
+			reader->U.P.Arg->Swizzle = rc_rewrite_swizzle(
+				reader->U.P.Arg->Swizzle, conversion_swizzle);
+			if (reader->U.P.Arg->Source != RC_PAIR_PRESUB_SRC) {	/* presub args keep their Source value */
+				reader->U.P.Arg->Source = src_index;
+			}
+		}
+	}
+}
+
+/**
+ * Compute the per-channel live intervals (Live[0..3]) of var and its friends.
+ */
+void rc_variable_compute_live_intervals(struct rc_variable * var)
+{
+	while(var) {
+		unsigned int i;
+		unsigned int start = var->Inst->IP;	/* IP of the writing instruction */
+
+		for (i = 0; i < var->ReaderCount; i++) {
+			unsigned int chan;
+			unsigned int chan_start = start;
+			unsigned int chan_end = var->Readers[i].Inst->IP;
+			unsigned int mask = var->Readers[i].WriteMask;	/* channels this reader keeps live */
+			struct rc_instruction * inst;
+
+			/* Extend the live interval of T0 to the start of the
+			 * loop for sequences like:
+			 * BGNLOOP
+			 * read T0
+			 * ...
+			 * write T0
+			 * ENDLOOP
+			 */
+			if (var->Readers[i].Inst->IP < start) {
+				struct rc_instruction * bgnloop =
+					rc_match_endloop(var->Readers[i].Inst);
+				chan_start = bgnloop->IP;
+			}
+
+			/* Extend the live interval of T0 to the start of the
+			 * loop in case there is a BRK instruction in the loop
+			 * (we don't actually check for a BRK instruction we
+			 * assume there is one somewhere in the loop, which
+			 * there usually is) for sequences like:
+			 * BGNLOOP
+			 * ...
+			 * conditional BRK
+			 * ...
+			 * write T0
+			 * ENDLOOP
+			 * read T0
+			 ***************************************************
+			 * Extend the live interval of T0 to the end of the
+			 * loop for sequences like:
+			 * write T0
+			 * BGNLOOP
+			 * ...
+			 * read T0
+			 * ENDLOOP
+			 */
+			for (inst = var->Inst; inst != var->Readers[i].Inst;
+							inst = inst->Next) {
+				rc_opcode op = rc_get_flow_control_inst(inst);
+				if (op == RC_OPCODE_ENDLOOP) {
+					struct rc_instruction * bgnloop =
+						rc_match_endloop(inst);
+					if (bgnloop->IP < chan_start) {
+						chan_start = bgnloop->IP;
+					}
+				} else if (op == RC_OPCODE_BGNLOOP) {
+					struct rc_instruction * endloop =
+						rc_match_bgnloop(inst);
+					if (endloop->IP > chan_end) {
+						chan_end = endloop->IP;
+					}
+				}
+			}
+
+			for (chan = 0; chan < 4; chan++) {
+				if ((mask >> chan) & 0x1) {
+					if (!var->Live[chan].Used
+					|| chan_start < var->Live[chan].Start) {
+						var->Live[chan].Start =
+								chan_start;
+					}
+					if (!var->Live[chan].Used
+					|| chan_end > var->Live[chan].End) {
+						var->Live[chan].End = chan_end;
+					}
+					var->Live[chan].Used = 1;	/* set last: both tests above read the old value */
+				}
+			}
+		}
+		var = var->Friend;
+	}
+}
+
+/**
+ * @return 1 if a reader of a and a reader of b use the same source pointer
+ * (U.I.Src for normal instructions, U.P.Src for pair ones); 0 otherwise.
+ */
+static unsigned int readers_intersect(
+	struct rc_variable * a,
+	struct rc_variable * b)
+{
+	unsigned int a_index, b_index;
+	for (a_index = 0; a_index < a->ReaderCount; a_index++) {
+		struct rc_reader reader_a = a->Readers[a_index];
+		for (b_index = 0; b_index < b->ReaderCount; b_index++) {
+			struct rc_reader reader_b = b->Readers[b_index];
+			if (reader_a.Inst->Type == RC_INSTRUCTION_NORMAL
+				&& reader_b.Inst->Type == RC_INSTRUCTION_NORMAL
+				&& reader_a.U.I.Src == reader_b.U.I.Src) {
+
+				return 1;
+			}
+			if (reader_a.Inst->Type == RC_INSTRUCTION_PAIR
+				&& reader_b.Inst->Type == RC_INSTRUCTION_PAIR
+				&& reader_a.U.P.Src == reader_b.U.P.Src) {
+
+				return 1;
+			}
+		}
+	}
+	return 0;
+}
+
+void rc_variable_add_friend(
+	struct rc_variable * var,
+	struct rc_variable * friend)
+{	/* Append friend to the end of var's Friend chain. */
+	assert(var->Dst.Index == friend->Dst.Index);	/* friends must write the same register index */
+	while(var->Friend) {	/* walk to the last variable in the chain */
+		var = var->Friend;
+	}
+	var->Friend = friend;
+}
+
+struct rc_variable * rc_variable(
+	struct radeon_compiler * c,
+	unsigned int DstFile,
+	unsigned int DstIndex,
+	unsigned int DstWriteMask,
+	struct rc_reader_data * reader_data)
+{	/* Allocate a zeroed rc_variable from c->Pool and fill in its fields. */
+	struct rc_variable * new =
+			memory_pool_malloc(&c->Pool, sizeof(struct rc_variable));
+	memset(new, 0, sizeof(struct rc_variable));
+	new->C = c;
+	new->Dst.File = DstFile;
+	new->Dst.Index = DstIndex;
+	new->Dst.WriteMask = DstWriteMask;
+	if (reader_data) {	/* optional: without it Inst/Readers stay zeroed */
+		new->Inst = reader_data->Writer;
+		new->ReaderCount = reader_data->ReaderCount;
+		new->Readers = reader_data->Readers;	/* pointer is shared, not copied */
+	}
+	return new;
+}
+
+static void get_variable_helper(
+	struct rc_list ** aborted_list,
+	struct rc_list ** variable_list,
+	unsigned int aborted,
+	struct rc_variable * variable)
+{	/* Add variable to aborted_list if aborted is non-zero, else to variable_list. */
+	if (aborted) {
+		rc_list_add(aborted_list, rc_list(&variable->C->Pool, variable));
+	} else {
+		rc_list_add(variable_list, rc_list(&variable->C->Pool, variable));
+	}
+}
+
+static void get_variable_pair_helper(
+	struct rc_list ** aborted_list,
+	struct rc_list ** variable_list,
+	struct radeon_compiler * c,
+	struct rc_instruction * inst,
+	struct rc_pair_sub_instruction * sub_inst)
+{	/* Create a variable for one sub-instruction of the pair instruction inst. */
+	struct rc_reader_data reader_data;
+	struct rc_variable * new_var;
+	rc_register_file file;
+	unsigned int writemask;
+
+	if (sub_inst->Opcode == RC_OPCODE_NOP) {
+		return;
+	}
+	memset(&reader_data, 0, sizeof(struct rc_reader_data));
+	rc_get_readers_sub(c, inst, sub_inst, &reader_data, NULL, NULL, NULL);
+
+	if (reader_data.ReaderCount == 0) {	/* nothing reads the result: no variable */
+		return;
+	}
+
+	if (sub_inst->WriteMask) {	/* checked first, so it wins over OutputWriteMask */
+		file = RC_FILE_TEMPORARY;
+		writemask = sub_inst->WriteMask;
+	} else if (sub_inst->OutputWriteMask) {
+		file = RC_FILE_OUTPUT;
+		writemask = sub_inst->OutputWriteMask;
+	} else {
+		writemask = 0;
+		file = RC_FILE_NONE;
+	}
+	new_var = rc_variable(c, file, sub_inst->DestIndex, writemask,
+								&reader_data);
+	get_variable_helper(aborted_list, variable_list, reader_data.Abort,
+								new_var);
+}
+
+/**
+ * Generate a list of variables used by the shader program.  Each instruction
+ * (or pair sub-instruction) whose result has at least one reader becomes a
+ * variable.  struct rc_variable holds the writer and its list of readers, so
+ * it is essentially a definition-use chain.  Variables that share a reader
+ * are "friends" and are linked together via the Friend attribute.
+ */
+struct rc_list * rc_get_variables(struct radeon_compiler * c)
+{
+	struct rc_instruction * inst;
+	struct rc_list * aborted_list = NULL;
+	struct rc_list * variable_list = NULL;
+	struct rc_list * var_ptr;
+	struct rc_list * search_ptr;
+
+	for (inst = c->Program.Instructions.Next;
+					inst != &c->Program.Instructions;
+					inst = inst->Next) {
+		struct rc_reader_data reader_data;
+		struct rc_variable * new_var;
+		memset(&reader_data, 0, sizeof(reader_data));
+
+		if (inst->Type == RC_INSTRUCTION_NORMAL) {
+			rc_get_readers(c, inst, &reader_data, NULL, NULL, NULL);
+			if (reader_data.ReaderCount == 0) {	/* result is never read: skip */
+				continue;
+			}
+			new_var = rc_variable(c, inst->U.I.DstReg.File,
+				inst->U.I.DstReg.Index,
+				inst->U.I.DstReg.WriteMask, &reader_data);
+			get_variable_helper(&aborted_list, &variable_list,
+						reader_data.Abort, new_var);
+		} else {
+			get_variable_pair_helper(&aborted_list, &variable_list,
+					c, inst, &inst->U.P.RGB);
+			get_variable_pair_helper(&aborted_list, &variable_list,
+					c, inst, &inst->U.P.Alpha);
+		}
+	}
+
+	/* The aborted_list contains a list of variables that might share a
+	 * reader with another variable.  We need to search through this list
+	 * and pair together variables that do share the same reader.
+	 */
+	while (aborted_list) {
+		struct rc_list * search_ptr_next;
+		var_ptr = aborted_list;
+
+		search_ptr = var_ptr->Next;
+		while(search_ptr) {
+			search_ptr_next = search_ptr->Next;	/* saved before search_ptr may be unlinked */
+			if (readers_intersect(var_ptr->Item, search_ptr->Item)){
+				rc_list_remove(&aborted_list, search_ptr);
+				rc_variable_add_friend(var_ptr->Item,
+							search_ptr->Item);
+			}
+			search_ptr = search_ptr_next;
+		}
+		rc_list_remove(&aborted_list, var_ptr);	/* head is done: move it to variable_list */
+		rc_list_add(&variable_list, rc_list(
+			&((struct rc_variable*)(var_ptr->Item))->C->Pool,
+			var_ptr->Item));
+	}
+	return variable_list;
+}
+
+/**
+ * @return The bitwise OR of Dst.WriteMask over var and every variable on
+ * its Friend chain (0 if var is NULL).
+ */
+unsigned int rc_variable_writemask_sum(struct rc_variable * var)
+{
+	unsigned int writemask = 0;
+	while(var) {
+		writemask |= var->Dst.WriteMask;
+		var = var->Friend;
+	}
+	return writemask;
+}
+
+/**
+ * @return A list of readers for a variable and its friends.  A reader is
+ * listed once even when several friends have it: normal readers are compared
+ * by U.I.Src, pair readers by both U.P.Arg and U.P.Src.
+ */
+struct rc_list * rc_variable_readers_union(struct rc_variable * var)
+{
+	struct rc_list * list = NULL;
+	while (var) {
+		unsigned int i;
+		for (i = 0; i < var->ReaderCount; i++) {
+			struct rc_list * temp;
+			struct rc_reader * a = &var->Readers[i];
+			unsigned int match = 0;
+			for (temp = list; temp; temp = temp->Next) {
+				struct rc_reader * b = temp->Item;
+				if (a->Inst->Type != b->Inst->Type) {
+					continue;
+				}
+				if (a->Inst->Type == RC_INSTRUCTION_NORMAL) {
+					if (a->U.I.Src == b->U.I.Src) {
+						match = 1;
+						break;
+					}
+				}
+				if (a->Inst->Type == RC_INSTRUCTION_PAIR) {
+					if (a->U.P.Arg == b->U.P.Arg
+					    && a->U.P.Src == b->U.P.Src) {
+						match = 1;
+						break;
+					}
+				}
+			}
+			if (match) {
+				continue;
+			}
+			rc_list_add(&list, rc_list(&var->C->Pool, a));	/* item points into var->Readers, not a copy */
+		}
+		var = var->Friend;
+	}
+	return list;
+}
+
+void rc_variable_print(struct rc_variable * var)
+{	/* Print var and every variable on its Friend chain to stderr. */
+	unsigned int i;
+	while (var) {
+		fprintf(stderr, "%u: TEMP[%u].%u: ",
+			var->Inst->IP, var->Dst.Index, var->Dst.WriteMask);	/* writemask is printed as a number */
+		for (i = 0; i < 4; i++) {
+			fprintf(stderr, "chan %u: start=%u end=%u ", i,
+					var->Live[i].Start, var->Live[i].End);
+		}
+		fprintf(stderr, "%u readers\n", var->ReaderCount);
+		if (var->Friend) {
+			fprintf(stderr, "Friend: \n\t");
+		}
+		var = var->Friend;
+	}
+}
diff --git a/src/mesa/drivers/dri/r300/compiler/radeon_variable.h b/src/mesa/drivers/dri/r300/compiler/radeon_variable.h
new file mode 100644
index 00000000000..b8fbcaa4029
--- /dev/null
+++ b/src/mesa/drivers/dri/r300/compiler/radeon_variable.h
@@ -0,0 +1,84 @@
+/*
+ * Copyright 2011 Tom Stellard <[email protected]>
+ *
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial
+ * portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+
+#ifndef RADEON_VARIABLE_H
+#define RADEON_VARIABLE_H
+
+#include "radeon_compiler.h"
+
+struct radeon_compiler;
+struct rc_list;
+struct rc_reader_data;
+struct rc_readers;
+
+struct live_intervals {
+	int Start;	/* lowest instruction IP at which the channel is live */
+	int End;	/* highest instruction IP at which the channel is live */
+	int Used;	/* non-zero once Start and End have been set */
+};
+
+struct rc_variable {
+	struct radeon_compiler * C;
+	struct rc_dst_register Dst;	/* register file, index and writemask being written */
+
+	struct rc_instruction * Inst;	/* the writer (reader_data->Writer) */
+	unsigned int ReaderCount;	/* number of entries in Readers */
+	struct rc_reader * Readers;
+	struct live_intervals Live[4];	/* one interval per channel */
+
+	/* A friend is a variable that shares a reader with another variable.
+	 * Friends form a singly linked chain that ends with NULL. */
+	struct rc_variable * Friend;
+};
+
+void rc_variable_change_dst(
+	struct rc_variable * var,
+	unsigned int new_index,
+	unsigned int new_writemask);
+
+void rc_variable_compute_live_intervals(struct rc_variable * var);
+
+void rc_variable_add_friend(
+	struct rc_variable * var,
+	struct rc_variable * friend);
+
+struct rc_variable * rc_variable(
+	struct radeon_compiler * c,
+	unsigned int DstFile,
+	unsigned int DstIndex,
+	unsigned int DstWriteMask,
+	struct rc_reader_data * reader_data);
+
+struct rc_list * rc_get_variables(struct radeon_compiler * c);
+
+unsigned int rc_variable_writemask_sum(struct rc_variable * var);
+
+struct rc_list * rc_variable_readers_union(struct rc_variable * var);
+
+void rc_variable_print(struct rc_variable * var);
+
+#endif /* RADEON_VARIABLE_H */
diff --git a/src/mesa/drivers/dri/radeon/radeon_chipset.h b/src/mesa/drivers/dri/radeon/radeon_chipset.h
index 88b68e3d191..9145023826e 100644
--- a/src/mesa/drivers/dri/radeon/radeon_chipset.h
+++ b/src/mesa/drivers/dri/radeon/radeon_chipset.h
@@ -406,6 +406,7 @@
 #define PCI_CHIP_CEDAR_68E8             0x68E8
 #define PCI_CHIP_CEDAR_68E9             0x68E9
 #define PCI_CHIP_CEDAR_68F1             0x68F1
+#define PCI_CHIP_CEDAR_68F2             0x68F2
 #define PCI_CHIP_CEDAR_68F8             0x68F8
 #define PCI_CHIP_CEDAR_68F9             0x68F9
 #define PCI_CHIP_CEDAR_68FE             0x68FE
@@ -426,7 +427,9 @@
 #define PCI_CHIP_JUNIPER_68B0           0x68B0
 #define PCI_CHIP_JUNIPER_68B8           0x68B8
 #define PCI_CHIP_JUNIPER_68B9           0x68B9
+#define PCI_CHIP_JUNIPER_68BA           0x68BA
 #define PCI_CHIP_JUNIPER_68BE           0x68BE
+#define PCI_CHIP_JUNIPER_68BF           0x68BF
 
 #define PCI_CHIP_CYPRESS_6880           0x6880
 #define PCI_CHIP_CYPRESS_6888           0x6888
@@ -434,6 +437,7 @@
 #define PCI_CHIP_CYPRESS_688A           0x688A
 #define PCI_CHIP_CYPRESS_6898           0x6898
 #define PCI_CHIP_CYPRESS_6899           0x6899
+#define PCI_CHIP_CYPRESS_689B           0x689B
 #define PCI_CHIP_CYPRESS_689E           0x689E
 
 #define PCI_CHIP_HEMLOCK_689C           0x689C
@@ -458,6 +462,7 @@
 #define PCI_CHIP_BARTS_6729             0x6729
 #define PCI_CHIP_BARTS_6738             0x6738
 #define PCI_CHIP_BARTS_6739             0x6739
+#define PCI_CHIP_BARTS_673E             0x673E
 
 #define PCI_CHIP_TURKS_6740             0x6740
 #define PCI_CHIP_TURKS_6741             0x6741
diff --git a/src/mesa/drivers/dri/radeon/radeon_screen.c b/src/mesa/drivers/dri/radeon/radeon_screen.c
index 732efe8bd85..6449229e088 100644
--- a/src/mesa/drivers/dri/radeon/radeon_screen.c
+++ b/src/mesa/drivers/dri/radeon/radeon_screen.c
@@ -1106,6 +1106,7 @@ static int radeon_set_screen_flags(radeonScreenPtr screen, int device_id)
     case PCI_CHIP_CEDAR_68E8:
     case PCI_CHIP_CEDAR_68E9:
     case PCI_CHIP_CEDAR_68F1:
+    case PCI_CHIP_CEDAR_68F2:
     case PCI_CHIP_CEDAR_68F8:
     case PCI_CHIP_CEDAR_68F9:
     case PCI_CHIP_CEDAR_68FE:
@@ -1132,7 +1133,9 @@ static int radeon_set_screen_flags(radeonScreenPtr screen, int device_id)
     case PCI_CHIP_JUNIPER_68B0:
     case PCI_CHIP_JUNIPER_68B8:
     case PCI_CHIP_JUNIPER_68B9:
+    case PCI_CHIP_JUNIPER_68BA:
     case PCI_CHIP_JUNIPER_68BE:
+    case PCI_CHIP_JUNIPER_68BF:
        screen->chip_family = CHIP_FAMILY_JUNIPER;
        screen->chip_flags = RADEON_CHIPSET_TCL;
        break;
@@ -1143,6 +1146,7 @@ static int radeon_set_screen_flags(radeonScreenPtr screen, int device_id)
     case PCI_CHIP_CYPRESS_688A:
     case PCI_CHIP_CYPRESS_6898:
     case PCI_CHIP_CYPRESS_6899:
+    case PCI_CHIP_CYPRESS_689B:
     case PCI_CHIP_CYPRESS_689E:
        screen->chip_family = CHIP_FAMILY_CYPRESS;
        screen->chip_flags = RADEON_CHIPSET_TCL;
@@ -1176,6 +1180,7 @@ static int radeon_set_screen_flags(radeonScreenPtr screen, int device_id)
    case PCI_CHIP_BARTS_6729:
    case PCI_CHIP_BARTS_6738:
    case PCI_CHIP_BARTS_6739:
+   case PCI_CHIP_BARTS_673E:
        screen->chip_family = CHIP_FAMILY_BARTS;
        screen->chip_flags = RADEON_CHIPSET_TCL;
        break;
diff --git a/src/mesa/drivers/windows/fx/fx.rc b/src/mesa/drivers/windows/fx/fx.rc
deleted file mode 100644
index f920b8768dd..00000000000
--- a/src/mesa/drivers/windows/fx/fx.rc
+++ /dev/null
@@ -1,39 +0,0 @@
-#include <windows.h>
-
-#define PRODNAME                "Mesa 6.x"
-#define CONTACTSTR              "http://www.mesa3d.org"
-#define HWSTR                   "3dfx Voodoo Graphics, Voodoo Rush, Voodoo^2, Voodoo Banshee, Velocity 100/200, Voodoo3, Voodoo4, Voodoo5"
-#define COPYRIGHTSTR            "Copyright \251 Brian E. Paul"
-
-#define VERSIONSTR              "6.3.0.1"
-#define MANVERSION              6
-#define MANREVISION             3
-#define BUILD_NUMBER            1
-
-VS_VERSION_INFO VERSIONINFO
- FILEVERSION MANVERSION, MANREVISION, 0, BUILD_NUMBER
- PRODUCTVERSION MANVERSION, MANREVISION, 0, BUILD_NUMBER
- FILEFLAGSMASK 0x0030003FL
-
- FILEOS VOS_DOS_WINDOWS32
- FILETYPE VFT_DRV
- FILESUBTYPE VFT2_DRV_INSTALLABLE
-BEGIN
-    BLOCK "StringFileInfo"
-    BEGIN
-        BLOCK "040904E4"
-        BEGIN
-            VALUE "FileDescription", PRODNAME
-            VALUE "FileVersion", VERSIONSTR
-            VALUE "LegalCopyright", COPYRIGHTSTR
-            VALUE "ProductName", PRODNAME
-            VALUE "Graphics Subsystem", HWSTR
-            VALUE "Contact", CONTACTSTR
-        END
-    END
-    BLOCK "VarFileInfo"
-    BEGIN
-        /* the following line should be extended for localized versions */
-        VALUE "Translation", 0x409, 1252
-    END
-END
diff --git a/src/mesa/drivers/windows/fx/fxopengl.def b/src/mesa/drivers/windows/fx/fxopengl.def
deleted file mode 100644
index bc615e93ae6..00000000000
--- a/src/mesa/drivers/windows/fx/fxopengl.def
+++ /dev/null
@@ -1,953 +0,0 @@
-LIBRARY OpenGL32
-DESCRIPTION "Mesa 5.1"
-EXPORTS
- glAccum
- glActiveStencilFaceEXT
- glActiveTexture
- glActiveTextureARB
- glAlphaFunc
- glAreProgramsResidentNV
- glAreTexturesResident
- glAreTexturesResidentEXT
- glArrayElement
- glArrayElementEXT
- glBegin
- glBeginQueryARB
- glBindBufferARB
- glBindProgramARB
- glBindProgramNV
- glBindTexture
- glBindTextureEXT
- glBitmap
-;glBlendColor
-;glBlendColorEXT
- glBlendEquation
- glBlendEquationEXT
- glBlendFunc
- glBlendFuncSeparate
- glBlendFuncSeparateEXT
- glBlendFuncSeparateINGR
- glBufferDataARB
- glBufferSubDataARB
- glCallList
- glCallLists
- glClear
- glClearAccum
- glClearColor
- glClearDepth
- glClearIndex
- glClearStencil
- glClientActiveTexture
- glClientActiveTextureARB
- glClipPlane
- glColorMask
- glColorMaterial
- glColorPointer
- glColorPointerEXT
- glColorSubTable
- glColorSubTableEXT
- glColorTable
- glColorTableEXT
- glColorTableParameterfv
- glColorTableParameterfvSGI
- glColorTableParameteriv
- glColorTableParameterivSGI
- glColorTableSGI
- glColor3b
- glColor3bv
- glColor3d
- glColor3dv
- glColor3f
- glColor3fv
- glColor3i
- glColor3iv
- glColor3s
- glColor3sv
- glColor3ub
- glColor3ubv
- glColor3ui
- glColor3uiv
- glColor3us
- glColor3usv
- glColor4b
- glColor4bv
- glColor4d
- glColor4dv
- glColor4f
- glColor4fv
- glColor4i
- glColor4iv
- glColor4s
- glColor4sv
- glColor4ub
- glColor4ubv
- glColor4ui
- glColor4uiv
- glColor4us
- glColor4usv
- glCombinerInputNV
- glCombinerOutputNV
- glCombinerParameterfNV
- glCombinerParameterfvNV
- glCombinerParameteriNV
- glCombinerParameterivNV
- glCompressedTexImage1D
- glCompressedTexImage1DARB
- glCompressedTexImage2D
- glCompressedTexImage2DARB
- glCompressedTexImage3D
- glCompressedTexImage3DARB
- glCompressedTexSubImage1D
- glCompressedTexSubImage1DARB
- glCompressedTexSubImage2D
- glCompressedTexSubImage2DARB
- glCompressedTexSubImage3D
- glCompressedTexSubImage3DARB
- glConvolutionFilter1D
- glConvolutionFilter1DEXT
- glConvolutionFilter2D
- glConvolutionFilter2DEXT
- glConvolutionParameterf
- glConvolutionParameterfEXT
- glConvolutionParameterfv
- glConvolutionParameterfvEXT
- glConvolutionParameteri
- glConvolutionParameteriEXT
- glConvolutionParameteriv
- glConvolutionParameterivEXT
- glCopyColorSubTable
- glCopyColorSubTableEXT
- glCopyColorTable
- glCopyColorTableSGI
- glCopyConvolutionFilter1D
- glCopyConvolutionFilter1DEXT
- glCopyConvolutionFilter2D
- glCopyConvolutionFilter2DEXT
- glCopyPixels
- glCopyTexImage1D
- glCopyTexImage1DEXT
- glCopyTexImage2D
- glCopyTexImage2DEXT
- glCopyTexSubImage1D
- glCopyTexSubImage1DEXT
- glCopyTexSubImage2D
- glCopyTexSubImage2DEXT
- glCopyTexSubImage3D
- glCopyTexSubImage3DEXT
- glCullFace
- glDeleteBuffersARB
- glDeleteFencesNV
- glDeleteLists
- glDeleteProgramsARB
- glDeleteProgramsNV
- glDeleteQueriesARB
- glDeleteTextures
- glDeleteTexturesEXT
- glDepthBoundsEXT
- glDepthFunc
- glDepthMask
- glDepthRange
- glDetailTexFuncSGIS
- glDisable
- glDisableClientState
- glDisableVertexAttribArrayARB
- glDrawArrays
- glDrawArraysEXT
- glDrawBuffer
- glDrawElements
- glDrawPixels
- glDrawRangeElements
- glDrawRangeElementsEXT
- glEdgeFlag
- glEdgeFlagPointer
- glEdgeFlagPointerEXT
- glEdgeFlagv
- glEnable
- glEnableClientState
- glEnableVertexAttribArrayARB
- glEnd
- glEndList
- glEndQueryARB
- glEvalCoord1d
- glEvalCoord1dv
- glEvalCoord1f
- glEvalCoord1fv
- glEvalCoord2d
- glEvalCoord2dv
- glEvalCoord2f
- glEvalCoord2fv
- glEvalMesh1
- glEvalMesh2
- glEvalPoint1
- glEvalPoint2
- glExecuteProgramNV
- glFeedbackBuffer
- glFinalCombinerInputNV
- glFinish
- glFinishFenceNV
- glFlush
- glFlushRasterSGIX
- glFlushVertexArrayRangeNV
- glFogCoordd
- glFogCoorddEXT
- glFogCoorddv
- glFogCoorddvEXT
- glFogCoordf
- glFogCoordfEXT
- glFogCoordfv
- glFogCoordfvEXT
- glFogCoordPointer
- glFogCoordPointerEXT
- glFogf
- glFogfv
- glFogi
- glFogiv
- glFragmentColorMaterialSGIX
- glFragmentLightfSGIX
- glFragmentLightfvSGIX
- glFragmentLightiSGIX
- glFragmentLightivSGIX
- glFragmentLightModelfSGIX
- glFragmentLightModelfvSGIX
- glFragmentLightModeliSGIX
- glFragmentLightModelivSGIX
- glFragmentMaterialfSGIX
- glFragmentMaterialfvSGIX
- glFragmentMaterialiSGIX
- glFragmentMaterialivSGIX
- glFrameZoomSGIX
- glFrontFace
- glFrustum
- glGenBuffersARB
- glGenFencesNV
- glGenLists
- glGenProgramsARB
- glGenProgramsNV
- glGenQueriesARB
- glGenTextures
- glGenTexturesEXT
- glGetBooleanv
- glGetBufferParameterivARB
- glGetBufferPointervARB
- glGetBufferSubDataARB
- glGetClipPlane
- glGetColorTable
- glGetColorTableEXT
- glGetColorTableParameterfv
- glGetColorTableParameterfvEXT
- glGetColorTableParameterfvSGI
- glGetColorTableParameteriv
- glGetColorTableParameterivEXT
- glGetColorTableParameterivSGI
- glGetColorTableSGI
- glGetCombinerInputParameterfvNV
- glGetCombinerInputParameterivNV
- glGetCombinerOutputParameterfvNV
- glGetCombinerOutputParameterivNV
- glGetCompressedTexImage
- glGetCompressedTexImageARB
- glGetConvolutionFilter
- glGetConvolutionFilterEXT
- glGetConvolutionParameterfv
- glGetConvolutionParameterfvEXT
- glGetConvolutionParameteriv
- glGetConvolutionParameterivEXT
- glGetDetailTexFuncSGIS
- glGetDoublev
- glGetError
- glGetFenceivNV
- glGetFinalCombinerInputParameterfvNV
- glGetFinalCombinerInputParameterivNV
- glGetFloatv
- glGetFragmentLightfvSGIX
- glGetFragmentLightivSGIX
- glGetFragmentMaterialfvSGIX
- glGetFragmentMaterialivSGIX
- glGetHistogram
- glGetHistogramEXT
- glGetHistogramParameterfv
- glGetHistogramParameterfvEXT
- glGetHistogramParameteriv
- glGetHistogramParameterivEXT
- glGetInstrumentsSGIX
- glGetIntegerv
- glGetLightfv
- glGetLightiv
- glGetListParameterfvSGIX
- glGetListParameterivSGIX
- glGetMapdv
- glGetMapfv
- glGetMapiv
- glGetMaterialfv
- glGetMaterialiv
- glGetMinmax
- glGetMinmaxEXT
- glGetMinmaxParameterfv
- glGetMinmaxParameterfvEXT
- glGetMinmaxParameteriv
- glGetMinmaxParameterivEXT
- glGetPixelMapfv
- glGetPixelMapuiv
- glGetPixelMapusv
- glGetPixelTexGenParameterfvSGIS
- glGetPixelTexGenParameterivSGIS
- glGetPointerv
- glGetPointervEXT
- glGetPolygonStipple
- glGetProgramEnvParameterdvARB
- glGetProgramEnvParameterfvARB
- glGetProgramivARB
- glGetProgramivNV
- glGetProgramLocalParameterdvARB
- glGetProgramLocalParameterfvARB
- glGetProgramNamedParameterdvNV
- glGetProgramNamedParameterfvNV
- glGetProgramParameterdvNV
- glGetProgramParameterfvNV
- glGetProgramStringARB
- glGetProgramStringNV
- glGetQueryivARB
- glGetQueryObjectivARB
- glGetQueryObjectuivARB
- glGetSeparableFilter
- glGetSeparableFilterEXT
- glGetSharpenTexFuncSGIS
- glGetString
- glGetTexEnvfv
- glGetTexEnviv
- glGetTexFilterFuncSGIS
- glGetTexGendv
- glGetTexGenfv
- glGetTexGeniv
- glGetTexImage
- glGetTexLevelParameterfv
- glGetTexLevelParameteriv
- glGetTexParameterfv
- glGetTexParameteriv
- glGetTrackMatrixivNV
- glGetVertexAttribdvARB
- glGetVertexAttribdvNV
- glGetVertexAttribfvARB
- glGetVertexAttribfvNV
- glGetVertexAttribivARB
- glGetVertexAttribivNV
- glGetVertexAttribPointervARB
- glGetVertexAttribPointervNV
- glHint
- glHintPGI
- glHistogram
- glHistogramEXT
- glIndexd
- glIndexdv
- glIndexf
- glIndexFuncEXT
- glIndexfv
- glIndexi
- glIndexiv
- glIndexMask
- glIndexMaterialEXT
- glIndexPointer
- glIndexPointerEXT
- glIndexs
- glIndexsv
- glIndexub
- glIndexubv
- glInitNames
- glInstrumentsBufferSGIX
- glInterleavedArrays
- glIsBufferARB
- glIsEnabled
- glIsFenceNV
- glIsList
- glIsProgramARB
- glIsProgramNV
- glIsQueryARB
- glIsTexture
- glIsTextureEXT
- glLightEnviSGIX
- glLightf
- glLightfv
- glLighti
- glLightiv
- glLightModelf
- glLightModelfv
- glLightModeli
- glLightModeliv
- glLineStipple
- glLineWidth
- glListBase
- glListParameterfSGIX
- glListParameterfvSGIX
- glListParameteriSGIX
- glListParameterivSGIX
- glLoadIdentity
- glLoadMatrixd
- glLoadMatrixf
- glLoadName
- glLoadProgramNV
- glLoadTransposeMatrixd
- glLoadTransposeMatrixdARB
- glLoadTransposeMatrixf
- glLoadTransposeMatrixfARB
- glLockArraysEXT
- glLogicOp
- glMapBufferARB
- glMapGrid1d
- glMapGrid1f
- glMapGrid2d
- glMapGrid2f
- glMap1d
- glMap1f
- glMap2d
- glMap2f
- glMaterialf
- glMaterialfv
- glMateriali
- glMaterialiv
- glMatrixMode
- glMinmax
- glMinmaxEXT
- glMultiDrawArrays
- glMultiDrawArraysEXT
- glMultiDrawElements
- glMultiDrawElementsEXT
- glMultiModeDrawArraysIBM
- glMultiModeDrawElementsIBM
- glMultiTexCoord1d
- glMultiTexCoord1dARB
- glMultiTexCoord1dv
- glMultiTexCoord1dvARB
- glMultiTexCoord1f
- glMultiTexCoord1fARB
- glMultiTexCoord1fv
- glMultiTexCoord1fvARB
- glMultiTexCoord1i
- glMultiTexCoord1iARB
- glMultiTexCoord1iv
- glMultiTexCoord1ivARB
- glMultiTexCoord1s
- glMultiTexCoord1sARB
- glMultiTexCoord1sv
- glMultiTexCoord1svARB
- glMultiTexCoord2d
- glMultiTexCoord2dARB
- glMultiTexCoord2dv
- glMultiTexCoord2dvARB
- glMultiTexCoord2f
- glMultiTexCoord2fARB
- glMultiTexCoord2fv
- glMultiTexCoord2fvARB
- glMultiTexCoord2i
- glMultiTexCoord2iARB
- glMultiTexCoord2iv
- glMultiTexCoord2ivARB
- glMultiTexCoord2s
- glMultiTexCoord2sARB
- glMultiTexCoord2sv
- glMultiTexCoord2svARB
- glMultiTexCoord3d
- glMultiTexCoord3dARB
- glMultiTexCoord3dv
- glMultiTexCoord3dvARB
- glMultiTexCoord3f
- glMultiTexCoord3fARB
- glMultiTexCoord3fv
- glMultiTexCoord3fvARB
- glMultiTexCoord3i
- glMultiTexCoord3iARB
- glMultiTexCoord3iv
- glMultiTexCoord3ivARB
- glMultiTexCoord3s
- glMultiTexCoord3sARB
- glMultiTexCoord3sv
- glMultiTexCoord3svARB
- glMultiTexCoord4d
- glMultiTexCoord4dARB
- glMultiTexCoord4dv
- glMultiTexCoord4dvARB
- glMultiTexCoord4f
- glMultiTexCoord4fARB
- glMultiTexCoord4fv
- glMultiTexCoord4fvARB
- glMultiTexCoord4i
- glMultiTexCoord4iARB
- glMultiTexCoord4iv
- glMultiTexCoord4ivARB
- glMultiTexCoord4s
- glMultiTexCoord4sARB
- glMultiTexCoord4sv
- glMultiTexCoord4svARB
- glMultMatrixd
- glMultMatrixf
- glMultTransposeMatrixd
- glMultTransposeMatrixdARB
- glMultTransposeMatrixf
- glMultTransposeMatrixfARB
- glNewList
- glNormalPointer
- glNormalPointerEXT
- glNormal3b
- glNormal3bv
- glNormal3d
- glNormal3dv
- glNormal3f
- glNormal3fv
- glNormal3i
- glNormal3iv
- glNormal3s
- glNormal3sv
- glOrtho
- glPassThrough
- glPixelMapfv
- glPixelMapuiv
- glPixelMapusv
- glPixelStoref
- glPixelStorei
- glPixelTexGenParameterfSGIS
- glPixelTexGenParameterfvSGIS
- glPixelTexGenParameteriSGIS
- glPixelTexGenParameterivSGIS
- glPixelTexGenSGIX
- glPixelTransferf
- glPixelTransferi
- glPixelZoom
- glPointParameterf
- glPointParameterfARB
- glPointParameterfEXT
- glPointParameterfSGIS
- glPointParameterfv
- glPointParameterfvARB
- glPointParameterfvEXT
- glPointParameterfvSGIS
- glPointParameteri
- glPointParameteriNV
- glPointParameteriv
- glPointParameterivNV
- glPointSize
- glPollInstrumentsSGIX
- glPolygonMode
- glPolygonOffset
- glPolygonOffsetEXT
- glPolygonStipple
- glPopAttrib
- glPopClientAttrib
- glPopMatrix
- glPopName
- glPrioritizeTextures
- glPrioritizeTexturesEXT
- glProgramEnvParameter4dARB
- glProgramEnvParameter4dvARB
- glProgramEnvParameter4fARB
- glProgramEnvParameter4fvARB
- glProgramLocalParameter4dARB
- glProgramLocalParameter4dvARB
- glProgramLocalParameter4fARB
- glProgramLocalParameter4fvARB
- glProgramNamedParameter4dNV
- glProgramNamedParameter4dvNV
- glProgramNamedParameter4fNV
- glProgramNamedParameter4fvNV
- glProgramParameters4dvNV
- glProgramParameters4fvNV
- glProgramParameter4dNV
- glProgramParameter4dvNV
- glProgramParameter4fNV
- glProgramParameter4fvNV
- glProgramStringARB
- glPushAttrib
- glPushClientAttrib
- glPushMatrix
- glPushName
- glRasterPos2d
- glRasterPos2dv
- glRasterPos2f
- glRasterPos2fv
- glRasterPos2i
- glRasterPos2iv
- glRasterPos2s
- glRasterPos2sv
- glRasterPos3d
- glRasterPos3dv
- glRasterPos3f
- glRasterPos3fv
- glRasterPos3i
- glRasterPos3iv
- glRasterPos3s
- glRasterPos3sv
- glRasterPos4d
- glRasterPos4dv
- glRasterPos4f
- glRasterPos4fv
- glRasterPos4i
- glRasterPos4iv
- glRasterPos4s
- glRasterPos4sv
- glReadBuffer
- glReadInstrumentsSGIX
- glReadPixels
- glRectd
- glRectdv
- glRectf
- glRectfv
- glRecti
- glRectiv
- glRects
- glRectsv
- glReferencePlaneSGIX
- glRenderMode
- glRequestResidentProgramsNV
- glResetHistogram
- glResetHistogramEXT
- glResetMinmax
- glResetMinmaxEXT
- glResizeBuffersMESA
- glRotated
- glRotatef
- glSampleCoverage
- glSampleCoverageARB
- glSampleMaskEXT
- glSampleMaskSGIS
- glSamplePatternEXT
- glSamplePatternSGIS
- glScaled
- glScalef
- glScissor
- glSecondaryColorPointer
- glSecondaryColorPointerEXT
- glSecondaryColor3b
- glSecondaryColor3bEXT
- glSecondaryColor3bv
- glSecondaryColor3bvEXT
- glSecondaryColor3d
- glSecondaryColor3dEXT
- glSecondaryColor3dv
- glSecondaryColor3dvEXT
- glSecondaryColor3f
- glSecondaryColor3fEXT
- glSecondaryColor3fv
- glSecondaryColor3fvEXT
- glSecondaryColor3i
- glSecondaryColor3iEXT
- glSecondaryColor3iv
- glSecondaryColor3ivEXT
- glSecondaryColor3s
- glSecondaryColor3sEXT
- glSecondaryColor3sv
- glSecondaryColor3svEXT
- glSecondaryColor3ub
- glSecondaryColor3ubEXT
- glSecondaryColor3ubv
- glSecondaryColor3ubvEXT
- glSecondaryColor3ui
- glSecondaryColor3uiEXT
- glSecondaryColor3uiv
- glSecondaryColor3uivEXT
- glSecondaryColor3us
- glSecondaryColor3usEXT
- glSecondaryColor3usv
- glSecondaryColor3usvEXT
- glSelectBuffer
- glSeparableFilter2D
- glSeparableFilter2DEXT
- glSetFenceNV
- glShadeModel
- glSharpenTexFuncSGIS
- glSpriteParameterfSGIX
- glSpriteParameterfvSGIX
- glSpriteParameteriSGIX
- glSpriteParameterivSGIX
- glStartInstrumentsSGIX
- glStencilFunc
- glStencilMask
- glStencilOp
- glStopInstrumentsSGIX
- glTagSampleBufferSGIX
- glTbufferMask3DFX
- glTestFenceNV
- glTexCoordPointer
- glTexCoordPointerEXT
- glTexCoord1d
- glTexCoord1dv
- glTexCoord1f
- glTexCoord1fv
- glTexCoord1i
- glTexCoord1iv
- glTexCoord1s
- glTexCoord1sv
- glTexCoord2d
- glTexCoord2dv
- glTexCoord2f
- glTexCoord2fv
- glTexCoord2i
- glTexCoord2iv
- glTexCoord2s
- glTexCoord2sv
- glTexCoord3d
- glTexCoord3dv
- glTexCoord3f
- glTexCoord3fv
- glTexCoord3i
- glTexCoord3iv
- glTexCoord3s
- glTexCoord3sv
- glTexCoord4d
- glTexCoord4dv
- glTexCoord4f
- glTexCoord4fv
- glTexCoord4i
- glTexCoord4iv
- glTexCoord4s
- glTexCoord4sv
- glTexEnvf
- glTexEnvfv
- glTexEnvi
- glTexEnviv
- glTexFilterFuncSGIS
- glTexGend
- glTexGendv
- glTexGenf
- glTexGenfv
- glTexGeni
- glTexGeniv
- glTexImage1D
- glTexImage2D
- glTexImage3D
- glTexImage3DEXT
- glTexImage4DSGIS
- glTexParameterf
- glTexParameterfv
- glTexParameteri
- glTexParameteriv
- glTexSubImage1D
- glTexSubImage1DEXT
- glTexSubImage2D
- glTexSubImage2DEXT
- glTexSubImage3D
- glTexSubImage3DEXT
- glTexSubImage4DSGIS
- glTrackMatrixNV
- glTranslated
- glTranslatef
- glUnlockArraysEXT
- glUnmapBufferARB
- glVertexArrayRangeNV
- glVertexAttribPointerARB
- glVertexAttribPointerNV
- glVertexAttribs1dvNV
- glVertexAttribs1fvNV
- glVertexAttribs1svNV
- glVertexAttribs2dvNV
- glVertexAttribs2fvNV
- glVertexAttribs2svNV
- glVertexAttribs3dvNV
- glVertexAttribs3fvNV
- glVertexAttribs3svNV
- glVertexAttribs4dvNV
- glVertexAttribs4fvNV
- glVertexAttribs4svNV
- glVertexAttribs4ubvNV
- glVertexAttrib1dARB
- glVertexAttrib1dNV
- glVertexAttrib1dvARB
- glVertexAttrib1dvNV
- glVertexAttrib1fARB
- glVertexAttrib1fNV
- glVertexAttrib1fvARB
- glVertexAttrib1fvNV
- glVertexAttrib1sARB
- glVertexAttrib1sNV
- glVertexAttrib1svARB
- glVertexAttrib1svNV
- glVertexAttrib2dARB
- glVertexAttrib2dNV
- glVertexAttrib2dvARB
- glVertexAttrib2dvNV
- glVertexAttrib2fARB
- glVertexAttrib2fNV
- glVertexAttrib2fvARB
- glVertexAttrib2fvNV
- glVertexAttrib2sARB
- glVertexAttrib2sNV
- glVertexAttrib2svARB
- glVertexAttrib2svNV
- glVertexAttrib3dARB
- glVertexAttrib3dNV
- glVertexAttrib3dvARB
- glVertexAttrib3dvNV
- glVertexAttrib3fARB
- glVertexAttrib3fNV
- glVertexAttrib3fvARB
- glVertexAttrib3fvNV
- glVertexAttrib3sARB
- glVertexAttrib3sNV
- glVertexAttrib3svARB
- glVertexAttrib3svNV
- glVertexAttrib4bvARB
- glVertexAttrib4dARB
- glVertexAttrib4dNV
- glVertexAttrib4dvARB
- glVertexAttrib4dvNV
- glVertexAttrib4fARB
- glVertexAttrib4fNV
- glVertexAttrib4fvARB
- glVertexAttrib4fvNV
- glVertexAttrib4ivARB
- glVertexAttrib4NbvARB
- glVertexAttrib4NivARB
- glVertexAttrib4NsvARB
- glVertexAttrib4NubARB
- glVertexAttrib4NubvARB
- glVertexAttrib4NuivARB
- glVertexAttrib4NusvARB
- glVertexAttrib4sARB
- glVertexAttrib4sNV
- glVertexAttrib4svARB
- glVertexAttrib4svNV
- glVertexAttrib4ubNV
- glVertexAttrib4ubvARB
- glVertexAttrib4ubvNV
- glVertexAttrib4uivARB
- glVertexAttrib4usvARB
- glVertexPointer
- glVertexPointerEXT
- glVertexWeightfEXT
- glVertexWeightfvEXT
- glVertexWeightPointerEXT
- glVertex2d
- glVertex2dv
- glVertex2f
- glVertex2fv
- glVertex2i
- glVertex2iv
- glVertex2s
- glVertex2sv
- glVertex3d
- glVertex3dv
- glVertex3f
- glVertex3fv
- glVertex3i
- glVertex3iv
- glVertex3s
- glVertex3sv
- glVertex4d
- glVertex4dv
- glVertex4f
- glVertex4fv
- glVertex4i
- glVertex4iv
- glVertex4s
- glVertex4sv
- glViewport
- glWindowPos2d
- glWindowPos2dARB
- glWindowPos2dMESA
- glWindowPos2dv
- glWindowPos2dvARB
- glWindowPos2dvMESA
- glWindowPos2f
- glWindowPos2fARB
- glWindowPos2fMESA
- glWindowPos2fv
- glWindowPos2fvARB
- glWindowPos2fvMESA
- glWindowPos2i
- glWindowPos2iARB
- glWindowPos2iMESA
- glWindowPos2iv
- glWindowPos2ivARB
- glWindowPos2ivMESA
- glWindowPos2s
- glWindowPos2sARB
- glWindowPos2sMESA
- glWindowPos2sv
- glWindowPos2svARB
- glWindowPos2svMESA
- glWindowPos3d
- glWindowPos3dARB
- glWindowPos3dMESA
- glWindowPos3dv
- glWindowPos3dvARB
- glWindowPos3dvMESA
- glWindowPos3f
- glWindowPos3fARB
- glWindowPos3fMESA
- glWindowPos3fv
- glWindowPos3fvARB
- glWindowPos3fvMESA
- glWindowPos3i
- glWindowPos3iARB
- glWindowPos3iMESA
- glWindowPos3iv
- glWindowPos3ivARB
- glWindowPos3ivMESA
- glWindowPos3s
- glWindowPos3sARB
- glWindowPos3sMESA
- glWindowPos3sv
- glWindowPos3svARB
- glWindowPos3svMESA
- glWindowPos4dMESA
- glWindowPos4dvMESA
- glWindowPos4fMESA
- glWindowPos4fvMESA
- glWindowPos4iMESA
- glWindowPos4ivMESA
- glWindowPos4sMESA
- glWindowPos4svMESA
- fxCloseHardware
-;fxGetScreenGeometry
- fxMesaCreateBestContext
- fxMesaCreateContext
- fxMesaDestroyContext
- fxMesaGetCurrentContext
- fxMesaMakeCurrent
- fxMesaSelectCurrentBoard
-;fxMesaSetNearFar
- fxMesaSwapBuffers
- fxMesaUpdateScreenSize
- wglChoosePixelFormat
- wglCopyContext
- wglCreateContext
- wglCreateLayerContext
- wglDeleteContext
- wglDescribeLayerPlane
- wglDescribePixelFormat
- wglGetCurrentContext
- wglGetCurrentDC
- wglGetDefaultProcAddress
- wglGetLayerPaletteEntries
- wglGetPixelFormat
- wglGetProcAddress
- wglMakeCurrent
- wglRealizeLayerPalette
- wglSetLayerPaletteEntries
- wglSetPixelFormat
- wglShareLists
- wglSwapBuffers
- wglSwapLayerBuffers
- wglUseFontBitmapsA
- wglUseFontBitmapsW
- wglUseFontOutlinesA
- wglUseFontOutlinesW
- ChoosePixelFormat
- DescribePixelFormat
- GetPixelFormat
- SetPixelFormat
- SwapBuffers
- DrvCopyContext
- DrvCreateContext
- DrvCreateLayerContext
- DrvDeleteContext
- DrvDescribeLayerPlane
- DrvDescribePixelFormat
- DrvGetLayerPaletteEntries
- DrvGetProcAddress
- DrvReleaseContext
- DrvRealizeLayerPalette
- DrvSetContext
- DrvSetLayerPaletteEntries
- DrvSetPixelFormat
- DrvShareLists
- DrvSwapBuffers
- DrvSwapLayerBuffers
- DrvValidateVersion
diff --git a/src/mesa/drivers/windows/fx/fxwgl.c b/src/mesa/drivers/windows/fx/fxwgl.c
deleted file mode 100644
index ce76ecd1568..00000000000
--- a/src/mesa/drivers/windows/fx/fxwgl.c
+++ /dev/null
@@ -1,1307 +0,0 @@
-/*
- * Mesa 3-D graphics library
- * Version:  4.0
- *
- * Copyright (C) 1999-2001  Brian Paul   All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included
- * in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
- * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- */
-
-/* Authors:
- *    David Bucciarelli
- *    Brian Paul
- *    Keith Whitwell
- *    Hiroshi Morii
- *    Daniel Borca
- */
-
-/* fxwgl.c - Microsoft wgl functions emulation for
- *           3Dfx VooDoo/Mesa interface
- */
-
-
-#ifdef _WIN32
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#include <windows.h>
-#define GL_GLEXT_PROTOTYPES
-#include "GL/gl.h"
-#include "GL/glext.h"
-
-#ifdef __cplusplus
-}
-#endif
-
-#include "GL/fxmesa.h"
-#include "glheader.h"
-#include "glapi.h"
-#include "imports.h"
-#include "../../glide/fxdrv.h"
-
-#define MAX_MESA_ATTRS  20
-
-#if (_MSC_VER >= 1200)
-#pragma warning( push )
-#pragma warning( disable : 4273 )
-#endif
-
-struct __extensions__ {
-   PROC proc;
-   char *name;
-};
-
-struct __pixelformat__ {
-   PIXELFORMATDESCRIPTOR pfd;
-   GLint mesaAttr[MAX_MESA_ATTRS];
-};
-
-WINGDIAPI void GLAPIENTRY gl3DfxSetPaletteEXT(GLuint *);
-static GLushort gammaTable[3 * 256];
-
-struct __pixelformat__ pix[] = {
-   /* 16bit RGB565 single buffer with depth */
-   {
-    {sizeof(PIXELFORMATDESCRIPTOR), 1,
-     PFD_DRAW_TO_WINDOW | PFD_SUPPORT_OPENGL,
-     PFD_TYPE_RGBA,
-     16,
-     5, 0, 6, 5, 5, 11, 0, 0,
-     0, 0, 0, 0, 0,
-     16,
-     0,
-     0,
-     PFD_MAIN_PLANE,
-     0, 0, 0, 0}
-    ,
-    {FXMESA_COLORDEPTH, 16,
-     FXMESA_ALPHA_SIZE, 0,
-     FXMESA_DEPTH_SIZE, 16,
-     FXMESA_STENCIL_SIZE, 0,
-     FXMESA_ACCUM_SIZE, 0,
-     FXMESA_NONE}
-   }
-   ,
-   /* 16bit RGB565 double buffer with depth */
-   {
-    {sizeof(PIXELFORMATDESCRIPTOR), 1,
-     PFD_DRAW_TO_WINDOW | PFD_SUPPORT_OPENGL |
-     PFD_DOUBLEBUFFER | PFD_SWAP_COPY,
-     PFD_TYPE_RGBA,
-     16,
-     5, 0, 6, 5, 5, 11, 0, 0,
-     0, 0, 0, 0, 0,
-     16,
-     0,
-     0,
-     PFD_MAIN_PLANE,
-     0, 0, 0, 0}
-    ,
-    {FXMESA_COLORDEPTH, 16,
-     FXMESA_DOUBLEBUFFER,
-     FXMESA_ALPHA_SIZE, 0,
-     FXMESA_DEPTH_SIZE, 16,
-     FXMESA_STENCIL_SIZE, 0,
-     FXMESA_ACCUM_SIZE, 0,
-     FXMESA_NONE}
-   }
-   ,
-   /* 16bit ARGB1555 single buffer with depth */
-   {
-    {sizeof(PIXELFORMATDESCRIPTOR), 1,
-     PFD_DRAW_TO_WINDOW | PFD_SUPPORT_OPENGL,
-     PFD_TYPE_RGBA,
-     16,
-     5, 0, 5, 5, 5, 10, 1, 15,
-     0, 0, 0, 0, 0,
-     16,
-     0,
-     0,
-     PFD_MAIN_PLANE,
-     0, 0, 0, 0}
-    ,
-    {FXMESA_COLORDEPTH, 15,
-     FXMESA_ALPHA_SIZE, 1,
-     FXMESA_DEPTH_SIZE, 16,
-     FXMESA_STENCIL_SIZE, 0,
-     FXMESA_ACCUM_SIZE, 0,
-     FXMESA_NONE}
-   }
-   ,
-   /* 16bit ARGB1555 double buffer with depth */
-   {
-    {sizeof(PIXELFORMATDESCRIPTOR), 1,
-     PFD_DRAW_TO_WINDOW | PFD_SUPPORT_OPENGL |
-     PFD_DOUBLEBUFFER | PFD_SWAP_COPY,
-     PFD_TYPE_RGBA,
-     16,
-     5, 0, 5, 5, 5, 10, 1, 15,
-     0, 0, 0, 0, 0,
-     16,
-     0,
-     0,
-     PFD_MAIN_PLANE,
-     0, 0, 0, 0}
-    ,
-    {FXMESA_COLORDEPTH, 15,
-     FXMESA_DOUBLEBUFFER,
-     FXMESA_ALPHA_SIZE, 1,
-     FXMESA_DEPTH_SIZE, 16,
-     FXMESA_STENCIL_SIZE, 0,
-     FXMESA_ACCUM_SIZE, 0,
-     FXMESA_NONE}
-   }
-   ,
-   /* 32bit ARGB8888 single buffer with depth */
-   {
-    {sizeof(PIXELFORMATDESCRIPTOR), 1,
-     PFD_DRAW_TO_WINDOW | PFD_SUPPORT_OPENGL,
-     PFD_TYPE_RGBA,
-     32,
-     8, 0, 8, 8, 8, 16, 8, 24,
-     0, 0, 0, 0, 0,
-     24,
-     8,
-     0,
-     PFD_MAIN_PLANE,
-     0, 0, 0, 0}
-    ,
-    {FXMESA_COLORDEPTH, 32,
-     FXMESA_ALPHA_SIZE, 8,
-     FXMESA_DEPTH_SIZE, 24,
-     FXMESA_STENCIL_SIZE, 8,
-     FXMESA_ACCUM_SIZE, 0,
-     FXMESA_NONE}
-   }
-   ,
-   /* 32bit ARGB8888 double buffer with depth */
-   {
-    {sizeof(PIXELFORMATDESCRIPTOR), 1,
-     PFD_DRAW_TO_WINDOW | PFD_SUPPORT_OPENGL |
-     PFD_DOUBLEBUFFER | PFD_SWAP_COPY,
-     PFD_TYPE_RGBA,
-     32,
-     8, 0, 8, 8, 8, 16, 8, 24,
-     0, 0, 0, 0, 0,
-     24,
-     8,
-     0,
-     PFD_MAIN_PLANE,
-     0, 0, 0, 0}
-    ,
-    {FXMESA_COLORDEPTH, 32,
-     FXMESA_DOUBLEBUFFER,
-     FXMESA_ALPHA_SIZE, 8,
-     FXMESA_DEPTH_SIZE, 24,
-     FXMESA_STENCIL_SIZE, 8,
-     FXMESA_ACCUM_SIZE, 0,
-     FXMESA_NONE}
-   }
-};
-
-static fxMesaContext ctx = NULL;
-static WNDPROC hWNDOldProc;
-static int curPFD = 0;
-static HDC hDC;
-static HWND hWND;
-
-static GLboolean haveDualHead;
-
-/* For the in-window-rendering hack */
-
-#ifndef GR_CONTROL_RESIZE
-/* Apparently GR_CONTROL_RESIZE can be ignored. OK? */
-#define GR_CONTROL_RESIZE -1
-#endif
-
-static GLboolean gdiWindowHack;
-static void *dibSurfacePtr;
-static BITMAPINFO *dibBMI;
-static HBITMAP dibHBM;
-static HWND dibWnd;
-
-static int
-env_check (const char *var, int val)
-{
-   const char *env = getenv(var);
-   return (env && (env[0] == val));
-}
-
-static LRESULT APIENTRY
-__wglMonitor (HWND hwnd, UINT message, UINT wParam, LONG lParam)
-{
-   long ret;                    /* Now gives the resized window at the end to hWNDOldProc */
-
-   if (ctx && hwnd == hWND) {
-      switch (message) {
-         case WM_PAINT:
-         case WM_MOVE:
-            break;
-         case WM_DISPLAYCHANGE:
-         case WM_SIZE:
-#if 0
-            if (wParam != SIZE_MINIMIZED) {
-               static int moving = 0;
-               if (!moving) {
-                  if (!FX_grSstControl(GR_CONTROL_RESIZE)) {
-                     moving = 1;
-                     SetWindowPos(hwnd, 0, 0, 0, 300, 300, SWP_NOMOVE | SWP_NOZORDER);
-                     moving = 0;
-                     if (!FX_grSstControl(GR_CONTROL_RESIZE)) {
-                        /*MessageBox(0,_T("Error changing windowsize"),_T("fxMESA"),MB_OK);*/
-                        PostMessage(hWND, WM_CLOSE, 0, 0);
-                     }
-                  }
-                  /* Do the clipping in the glide library */
-                  grClipWindow(0, 0, FX_grSstScreenWidth(), FX_grSstScreenHeight());
-                  /* And let the new size set in the context */
-                  fxMesaUpdateScreenSize(ctx);
-               }
-            }
-#endif
-            break;
-         case WM_ACTIVATE:
-            break;
-         case WM_SHOWWINDOW:
-            break;
-         case WM_SYSKEYDOWN:
-         case WM_SYSCHAR:
-            break;
-      }
-   }
-
-   /* Finally call the hWNDOldProc, which handles the resize with the
-    * now changed window sizes */
-   ret = CallWindowProc(hWNDOldProc, hwnd, message, wParam, lParam);
-
-   return ret;
-}
-
-static void
-wgl_error (long error)
-{
-#define WGL_INVALID_PIXELFORMAT ERROR_INVALID_PIXEL_FORMAT
-   SetLastError(0xC0000000      /* error severity */
-               |0x00070000      /* error facility (who we are) */
-               |error);
-}
-
-GLAPI BOOL GLAPIENTRY
-wglCopyContext (HGLRC hglrcSrc, HGLRC hglrcDst, UINT mask)
-{
-   return FALSE;
-}
-
-GLAPI HGLRC GLAPIENTRY
-wglCreateContext (HDC hdc)
-{
-   HWND hWnd;
-   WNDPROC oldProc;
-   int error;
-
-   if (ctx) {
-      SetLastError(0);
-      return NULL;
-   }
-
-   if (!(hWnd = WindowFromDC(hdc))) {
-      SetLastError(0);
-      return NULL;
-   }
-
-   if (curPFD == 0) {
-      wgl_error(WGL_INVALID_PIXELFORMAT);
-      return NULL;
-   }
-
-   if ((oldProc = (WNDPROC)GetWindowLong(hWnd, GWL_WNDPROC)) != __wglMonitor) {
-      hWNDOldProc = oldProc;
-      SetWindowLong(hWnd, GWL_WNDPROC, (LONG)__wglMonitor);
-   }
-
-   /* always log when debugging, or if user demands */
-   if (TDFX_DEBUG || env_check("MESA_FX_INFO", 'r')) {
-      freopen("MESA.LOG", "w", stderr);
-   }
-
-   {
-      RECT cliRect;
-      ShowWindow(hWnd, SW_SHOWNORMAL);
-      SetForegroundWindow(hWnd);
-      Sleep(100);               /* a hack for win95 */
-      if (env_check("MESA_GLX_FX", 'w') && !(GetWindowLong(hWnd, GWL_STYLE) & WS_POPUP)) {
-         /* XXX todo - windowed modes */
-         error = !(ctx = fxMesaCreateContext((GLuint) hWnd, GR_RESOLUTION_NONE, GR_REFRESH_NONE, pix[curPFD - 1].mesaAttr));
-      } else {
-         GetClientRect(hWnd, &cliRect);
-         error = !(ctx = fxMesaCreateBestContext((GLuint) hWnd, cliRect.right, cliRect.bottom, pix[curPFD - 1].mesaAttr));
-      }
-   }
-
-   /*if (getenv("SST_DUALHEAD"))
-      haveDualHead =
-         ((atoi(getenv("SST_DUALHEAD")) == 1) ? GL_TRUE : GL_FALSE);
-   else
-      haveDualHead = GL_FALSE;*/
-
-   if (error) {
-      SetLastError(0);
-      return NULL;
-   }
-
-   hDC = hdc;
-   hWND = hWnd;
-
-   /* Required by the OpenGL Optimizer 1.1 (is it a Optimizer bug ?) */
-   wglMakeCurrent(hdc, (HGLRC)1);
-
-   return (HGLRC)1;
-}
-
-GLAPI HGLRC GLAPIENTRY
-wglCreateLayerContext (HDC hdc, int iLayerPlane)
-{
-   SetLastError(0);
-   return NULL;
-}
-
-GLAPI BOOL GLAPIENTRY
-wglDeleteContext (HGLRC hglrc)
-{
-   if (ctx && hglrc == (HGLRC)1) {
-
-      fxMesaDestroyContext(ctx);
-
-      SetWindowLong(WindowFromDC(hDC), GWL_WNDPROC, (LONG) hWNDOldProc);
-
-      ctx = NULL;
-      hDC = 0;
-      return TRUE;
-   }
-
-   SetLastError(0);
-
-   return FALSE;
-}
-
-GLAPI HGLRC GLAPIENTRY
-wglGetCurrentContext (VOID)
-{
-   if (ctx)
-      return (HGLRC)1;
-
-   SetLastError(0);
-   return NULL;
-}
-
-GLAPI HDC GLAPIENTRY
-wglGetCurrentDC (VOID)
-{
-   if (ctx)
-      return hDC;
-
-   SetLastError(0);
-   return NULL;
-}
-
-GLAPI BOOL GLAPIENTRY
-wglSwapIntervalEXT (int interval)
-{
-   if (ctx == NULL) {
-      return FALSE;
-   }
-   if (interval < 0) {
-      interval = 0;
-   } else if (interval > 3) {
-      interval = 3;
-   }
-   ctx->swapInterval = interval;
-   return TRUE;
-}
-
-GLAPI int GLAPIENTRY
-wglGetSwapIntervalEXT (void)
-{
-   return (ctx == NULL) ? -1 : ctx->swapInterval;
-}
-
-GLAPI BOOL GLAPIENTRY
-wglGetDeviceGammaRamp3DFX (HDC hdc, LPVOID arrays)
-{
-   /* gammaTable should be per-context */
-   memcpy(arrays, gammaTable, 3 * 256 * sizeof(GLushort));
-   return TRUE;
-}
-
-GLAPI BOOL GLAPIENTRY
-wglSetDeviceGammaRamp3DFX (HDC hdc, LPVOID arrays)
-{
-   GLint i, tableSize, inc, index;
-   GLushort *red, *green, *blue;
-   FxU32 gammaTableR[256], gammaTableG[256], gammaTableB[256];
-
-   /* gammaTable should be per-context */
-   memcpy(gammaTable, arrays, 3 * 256 * sizeof(GLushort));
-
-   tableSize = FX_grGetInteger(GR_GAMMA_TABLE_ENTRIES);
-   inc = 256 / tableSize;
-   red = (GLushort *)arrays;
-   green = (GLushort *)arrays + 256;
-   blue = (GLushort *)arrays + 512;
-   for (i = 0, index = 0; i < tableSize; i++, index += inc) {
-      gammaTableR[i] = red[index] >> 8;
-      gammaTableG[i] = green[index] >> 8;
-      gammaTableB[i] = blue[index] >> 8;
-   }
-
-   grLoadGammaTable(tableSize, gammaTableR, gammaTableG, gammaTableB);
-
-   return TRUE;
-}
-
-typedef void *HPBUFFERARB;
-
-/* WGL_ARB_pixel_format */
-GLAPI BOOL GLAPIENTRY
-wglGetPixelFormatAttribivARB (HDC hdc,
-                              int iPixelFormat,
-                              int iLayerPlane,
-                              UINT nAttributes,
-                              const int *piAttributes,
-                              int *piValues)
-{
-   SetLastError(0);
-   return FALSE;
-}
-
-GLAPI BOOL GLAPIENTRY
-wglGetPixelFormatAttribfvARB (HDC hdc,
-                              int iPixelFormat,
-                              int iLayerPlane,
-                              UINT nAttributes,
-                              const int *piAttributes,
-                              FLOAT *pfValues)
-{
-   SetLastError(0);
-   return FALSE;
-}
-
-GLAPI BOOL GLAPIENTRY
-wglChoosePixelFormatARB (HDC hdc,
-                         const int *piAttribIList,
-                         const FLOAT *pfAttribFList,
-                         UINT nMaxFormats,
-                         int *piFormats,
-                         UINT *nNumFormats)
-{
-   SetLastError(0);
-   return FALSE;
-}
-
-/* WGL_ARB_render_texture */
-GLAPI BOOL GLAPIENTRY
-wglBindTexImageARB (HPBUFFERARB hPbuffer, int iBuffer)
-{
-   SetLastError(0);
-   return FALSE;
-}
-
-GLAPI BOOL GLAPIENTRY
-wglReleaseTexImageARB (HPBUFFERARB hPbuffer, int iBuffer)
-{
-   SetLastError(0);
-   return FALSE;
-}
-
-GLAPI BOOL GLAPIENTRY
-wglSetPbufferAttribARB (HPBUFFERARB hPbuffer,
-                        const int *piAttribList)
-{
-   SetLastError(0);
-   return FALSE;
-}
-
-/* WGL_ARB_pbuffer */
-GLAPI HPBUFFERARB GLAPIENTRY
-wglCreatePbufferARB (HDC hDC,
-                     int iPixelFormat,
-                     int iWidth,
-                     int iHeight,
-                     const int *piAttribList)
-{
-   SetLastError(0);
-   return NULL;
-}
-
-GLAPI HDC GLAPIENTRY
-wglGetPbufferDCARB (HPBUFFERARB hPbuffer)
-{
-   SetLastError(0);
-   return NULL;
-}
-
-GLAPI int GLAPIENTRY
-wglReleasePbufferDCARB (HPBUFFERARB hPbuffer, HDC hDC)
-{
-   SetLastError(0);
-   return -1;
-}
-
-GLAPI BOOL GLAPIENTRY
-wglDestroyPbufferARB (HPBUFFERARB hPbuffer)
-{
-   SetLastError(0);
-   return FALSE;
-}
-
-GLAPI BOOL GLAPIENTRY
-wglQueryPbufferARB (HPBUFFERARB hPbuffer,
-                    int iAttribute,
-                    int *piValue)
-{
-   SetLastError(0);
-   return FALSE;
-}
-
-GLAPI const char * GLAPIENTRY
-wglGetExtensionsStringEXT (void)
-{
-   return "WGL_3DFX_gamma_control "
-          "WGL_EXT_swap_control "
-          "WGL_EXT_extensions_string WGL_ARB_extensions_string"
-         /*WGL_ARB_pixel_format WGL_ARB_render_texture WGL_ARB_pbuffer*/;
-}
-
-GLAPI const char * GLAPIENTRY
-wglGetExtensionsStringARB (HDC hdc)
-{
-   return wglGetExtensionsStringEXT();
-}
-
-static struct {
-   const char *name;
-   PROC func;
-} wgl_ext[] = {
-       {"wglGetExtensionsStringARB",    (PROC)wglGetExtensionsStringARB},
-       {"wglGetExtensionsStringEXT",    (PROC)wglGetExtensionsStringEXT},
-       {"wglSwapIntervalEXT",           (PROC)wglSwapIntervalEXT},
-       {"wglGetSwapIntervalEXT",        (PROC)wglGetSwapIntervalEXT},
-       {"wglGetDeviceGammaRamp3DFX",    (PROC)wglGetDeviceGammaRamp3DFX},
-       {"wglSetDeviceGammaRamp3DFX",    (PROC)wglSetDeviceGammaRamp3DFX},
-       /* WGL_ARB_pixel_format */
-       {"wglGetPixelFormatAttribivARB", (PROC)wglGetPixelFormatAttribivARB},
-       {"wglGetPixelFormatAttribfvARB", (PROC)wglGetPixelFormatAttribfvARB},
-       {"wglChoosePixelFormatARB",      (PROC)wglChoosePixelFormatARB},
-       /* WGL_ARB_render_texture */
-       {"wglBindTexImageARB",           (PROC)wglBindTexImageARB},
-       {"wglReleaseTexImageARB",        (PROC)wglReleaseTexImageARB},
-       {"wglSetPbufferAttribARB",       (PROC)wglSetPbufferAttribARB},
-       /* WGL_ARB_pbuffer */
-       {"wglCreatePbufferARB",          (PROC)wglCreatePbufferARB},
-       {"wglGetPbufferDCARB",           (PROC)wglGetPbufferDCARB},
-       {"wglReleasePbufferDCARB",       (PROC)wglReleasePbufferDCARB},
-       {"wglDestroyPbufferARB",         (PROC)wglDestroyPbufferARB},
-       {"wglQueryPbufferARB",           (PROC)wglQueryPbufferARB},
-       {NULL, NULL}
-};
-
-GLAPI PROC GLAPIENTRY
-wglGetProcAddress (LPCSTR lpszProc)
-{
-   int i;
-   PROC p = (PROC)_glapi_get_proc_address((const char *)lpszProc);
-
-   /* we can't BlendColor. work around buggy applications */
-   if (p && strcmp(lpszProc, "glBlendColor")
-         && strcmp(lpszProc, "glBlendColorEXT"))
-      return p;
-
-   for (i = 0; wgl_ext[i].name; i++) {
-      if (!strcmp(lpszProc, wgl_ext[i].name)) {
-         return wgl_ext[i].func;
-      }
-   }
-
-   SetLastError(0);
-   return NULL;
-}
-
-GLAPI PROC GLAPIENTRY
-wglGetDefaultProcAddress (LPCSTR lpszProc)
-{
-   SetLastError(0);
-   return NULL;
-}
-
-GLAPI BOOL GLAPIENTRY
-wglMakeCurrent (HDC hdc, HGLRC hglrc)
-{
-   if ((hdc == NULL) && (hglrc == NULL))
-      return TRUE;
-
-   if (!ctx || hglrc != (HGLRC)1 || WindowFromDC(hdc) != hWND) {
-      SetLastError(0);
-      return FALSE;
-   }
-
-   hDC = hdc;
-
-   fxMesaMakeCurrent(ctx);
-
-   return TRUE;
-}
-
-GLAPI BOOL GLAPIENTRY
-wglShareLists (HGLRC hglrc1, HGLRC hglrc2)
-{
-   if (!ctx || hglrc1 != (HGLRC)1 || hglrc1 != hglrc2) {
-      SetLastError(0);
-      return FALSE;
-   }
-
-   return TRUE;
-}
-
-static BOOL
-wglUseFontBitmaps_FX (HDC fontDevice, DWORD firstChar, DWORD numChars,
-                      DWORD listBase)
-{
-   TEXTMETRIC metric;
-   BITMAPINFO *dibInfo;
-   HDC bitDevice;
-   COLORREF tempColor;
-   int i;
-
-   GetTextMetrics(fontDevice, &metric);
-
-   dibInfo = (BITMAPINFO *)calloc(sizeof(BITMAPINFO) + sizeof(RGBQUAD), 1);
-   dibInfo->bmiHeader.biSize = sizeof(BITMAPINFOHEADER);
-   dibInfo->bmiHeader.biPlanes = 1;
-   dibInfo->bmiHeader.biBitCount = 1;
-   dibInfo->bmiHeader.biCompression = BI_RGB;
-
-   bitDevice = CreateCompatibleDC(fontDevice);
-
-   /* Swap fore and back colors so the bitmap has the right polarity */
-   tempColor = GetBkColor(bitDevice);
-   SetBkColor(bitDevice, GetTextColor(bitDevice));
-   SetTextColor(bitDevice, tempColor);
-
-   /* Place chars based on base line */
-   SetTextAlign(bitDevice, TA_BASELINE);
-
-   for (i = 0; i < (int)numChars; i++) {
-      SIZE size;
-      char curChar;
-      int charWidth, charHeight, bmapWidth, bmapHeight, numBytes, res;
-      HBITMAP bitObject;
-      HGDIOBJ origBmap;
-      unsigned char *bmap;
-
-      curChar = (char)(i + firstChar); /* [koolsmoky] explicit cast */
-
-      /* Find how high/wide this character is */
-      GetTextExtentPoint32(bitDevice, &curChar, 1, &size);
-
-      /* Create the output bitmap */
-      charWidth = size.cx;
-      charHeight = size.cy;
-      bmapWidth = ((charWidth + 31) / 32) * 32; /* Round up to the next multiple of 32 bits */
-      bmapHeight = charHeight;
-      bitObject = CreateCompatibleBitmap(bitDevice, bmapWidth, bmapHeight);
-      /*VERIFY(bitObject);*/
-
-      /* Assign the output bitmap to the device */
-      origBmap = SelectObject(bitDevice, bitObject);
-
-      PatBlt(bitDevice, 0, 0, bmapWidth, bmapHeight, BLACKNESS);
-
-      /* Use our source font on the device */
-      SelectObject(bitDevice, GetCurrentObject(fontDevice, OBJ_FONT));
-
-      /* Draw the character */
-      TextOut(bitDevice, 0, metric.tmAscent, &curChar, 1);
-
-      /* Unselect our bmap object */
-      SelectObject(bitDevice, origBmap);
-
-      /* Convert the display dependant representation to a 1 bit deep DIB */
-      numBytes = (bmapWidth * bmapHeight) / 8;
-      bmap = MALLOC(numBytes);
-      dibInfo->bmiHeader.biWidth = bmapWidth;
-      dibInfo->bmiHeader.biHeight = bmapHeight;
-      res = GetDIBits(bitDevice, bitObject, 0, bmapHeight, bmap,
-                      dibInfo, DIB_RGB_COLORS);
-
-      /* Create the GL object */
-      glNewList(i + listBase, GL_COMPILE);
-      glBitmap(bmapWidth, bmapHeight, 0.0, metric.tmDescent,
-               charWidth, 0.0, bmap);
-      glEndList();
-      /* CheckGL(); */
-
-      /* Destroy the bmap object */
-      DeleteObject(bitObject);
-
-      /* Deallocate the bitmap data */
-      FREE(bmap);
-   }
-
-   /* Destroy the DC */
-   DeleteDC(bitDevice);
-
-   FREE(dibInfo);
-
-   return TRUE;
-}
-
-GLAPI BOOL GLAPIENTRY
-wglUseFontBitmapsW (HDC hdc, DWORD first, DWORD count, DWORD listBase)
-{
-   return FALSE;
-}
-
-GLAPI BOOL GLAPIENTRY
-wglUseFontOutlinesA (HDC hdc, DWORD first, DWORD count,
-                     DWORD listBase, FLOAT deviation,
-                     FLOAT extrusion, int format, LPGLYPHMETRICSFLOAT lpgmf)
-{
-   SetLastError(0);
-   return FALSE;
-}
-
-GLAPI BOOL GLAPIENTRY
-wglUseFontOutlinesW (HDC hdc, DWORD first, DWORD count,
-                     DWORD listBase, FLOAT deviation,
-                     FLOAT extrusion, int format, LPGLYPHMETRICSFLOAT lpgmf)
-{
-   SetLastError(0);
-   return FALSE;
-}
-
-
-GLAPI BOOL GLAPIENTRY
-wglSwapLayerBuffers (HDC hdc, UINT fuPlanes)
-{
-   if (ctx && WindowFromDC(hdc) == hWND) {
-      fxMesaSwapBuffers();
-
-      return TRUE;
-   }
-
-   SetLastError(0);
-   return FALSE;
-}
-
-static int
-pfd_tablen (void)
-{
-   /* we should take an envvar for `fxMesaSelectCurrentBoard' */
-   return (fxMesaSelectCurrentBoard(0) < GR_SSTTYPE_Voodoo4)
-         ? 2                      /* only 16bit entries */
-         : sizeof(pix) / sizeof(pix[0]);  /* full table */
-}
-
-GLAPI int GLAPIENTRY
-wglChoosePixelFormat (HDC hdc, const PIXELFORMATDESCRIPTOR *ppfd)
-{
-   int i, best = -1, qt_valid_pix;
-   PIXELFORMATDESCRIPTOR pfd = *ppfd;
-
-   qt_valid_pix = pfd_tablen();
-
-#if 1 || QUAKE2 || GORE
-   /* QUAKE2: 24+32 */
-   /* GORE  : 24+16 */
-   if ((pfd.cColorBits == 24) || (pfd.cColorBits == 32)) {
-      /* the first 2 entries are 16bit */
-      pfd.cColorBits = (qt_valid_pix > 2) ? 32 : 16;
-   }
-   if (pfd.cColorBits == 32) {
-      pfd.cDepthBits = 24;
-   } else if (pfd.cColorBits == 16) {
-      pfd.cDepthBits = 16;
-   }
-#endif
-
-   if (pfd.nSize != sizeof(PIXELFORMATDESCRIPTOR) || pfd.nVersion != 1) {
-      SetLastError(0);
-      return 0;
-   }
-
-   for (i = 0; i < qt_valid_pix; i++) {
-      if (pfd.cColorBits > 0 && pix[i].pfd.cColorBits != pfd.cColorBits)
-         continue;
-
-      if ((pfd.dwFlags & PFD_DRAW_TO_WINDOW)
-          && !(pix[i].pfd.dwFlags & PFD_DRAW_TO_WINDOW)) continue;
-      if ((pfd.dwFlags & PFD_DRAW_TO_BITMAP)
-          && !(pix[i].pfd.dwFlags & PFD_DRAW_TO_BITMAP)) continue;
-      if ((pfd.dwFlags & PFD_SUPPORT_GDI)
-          && !(pix[i].pfd.dwFlags & PFD_SUPPORT_GDI)) continue;
-      if ((pfd.dwFlags & PFD_SUPPORT_OPENGL)
-          && !(pix[i].pfd.dwFlags & PFD_SUPPORT_OPENGL)) continue;
-      if (!(pfd.dwFlags & PFD_DOUBLEBUFFER_DONTCARE)
-          && ((pfd.dwFlags & PFD_DOUBLEBUFFER) !=
-              (pix[i].pfd.dwFlags & PFD_DOUBLEBUFFER))) continue;
-#if 1 /* Doom3 fails here! */
-      if (!(pfd.dwFlags & PFD_STEREO_DONTCARE)
-          && ((pfd.dwFlags & PFD_STEREO) !=
-              (pix[i].pfd.dwFlags & PFD_STEREO))) continue;
-#endif
-
-      if (pfd.cDepthBits > 0 && pix[i].pfd.cDepthBits == 0)
-         continue;              /* need depth buffer */
-
-      if (pfd.cAlphaBits > 0 && pix[i].pfd.cAlphaBits == 0)
-         continue;              /* need alpha buffer */
-
-#if 0                           /* regression bug? */
-      if (pfd.cStencilBits > 0 && pix[i].pfd.cStencilBits == 0)
-         continue;              /* need stencil buffer */
-#endif
-
-      if (pfd.iPixelType == pix[i].pfd.iPixelType) {
-         best = i + 1;
-         break;
-      }
-   }
-
-   if (best == -1) {
-      FILE *err = fopen("MESA.LOG", "w");
-      if (err != NULL) {
-         fprintf(err, "wglChoosePixelFormat failed\n");
-         fprintf(err, "\tnSize           = %d\n", ppfd->nSize);
-         fprintf(err, "\tnVersion        = %d\n", ppfd->nVersion);
-         fprintf(err, "\tdwFlags         = %lu\n", ppfd->dwFlags);
-         fprintf(err, "\tiPixelType      = %d\n", ppfd->iPixelType);
-         fprintf(err, "\tcColorBits      = %d\n", ppfd->cColorBits);
-         fprintf(err, "\tcRedBits        = %d\n", ppfd->cRedBits);
-         fprintf(err, "\tcRedShift       = %d\n", ppfd->cRedShift);
-         fprintf(err, "\tcGreenBits      = %d\n", ppfd->cGreenBits);
-         fprintf(err, "\tcGreenShift     = %d\n", ppfd->cGreenShift);
-         fprintf(err, "\tcBlueBits       = %d\n", ppfd->cBlueBits);
-         fprintf(err, "\tcBlueShift      = %d\n", ppfd->cBlueShift);
-         fprintf(err, "\tcAlphaBits      = %d\n", ppfd->cAlphaBits);
-         fprintf(err, "\tcAlphaShift     = %d\n", ppfd->cAlphaShift);
-         fprintf(err, "\tcAccumBits      = %d\n", ppfd->cAccumBits);
-         fprintf(err, "\tcAccumRedBits   = %d\n", ppfd->cAccumRedBits);
-         fprintf(err, "\tcAccumGreenBits = %d\n", ppfd->cAccumGreenBits);
-         fprintf(err, "\tcAccumBlueBits  = %d\n", ppfd->cAccumBlueBits);
-         fprintf(err, "\tcAccumAlphaBits = %d\n", ppfd->cAccumAlphaBits);
-         fprintf(err, "\tcDepthBits      = %d\n", ppfd->cDepthBits);
-         fprintf(err, "\tcStencilBits    = %d\n", ppfd->cStencilBits);
-         fprintf(err, "\tcAuxBuffers     = %d\n", ppfd->cAuxBuffers);
-         fprintf(err, "\tiLayerType      = %d\n", ppfd->iLayerType);
-         fprintf(err, "\tbReserved       = %d\n", ppfd->bReserved);
-         fprintf(err, "\tdwLayerMask     = %lu\n", ppfd->dwLayerMask);
-         fprintf(err, "\tdwVisibleMask   = %lu\n", ppfd->dwVisibleMask);
-         fprintf(err, "\tdwDamageMask    = %lu\n", ppfd->dwDamageMask);
-         fclose(err);
-      }
-
-      SetLastError(0);
-      return 0;
-   }
-
-   return best;
-}
-
-GLAPI int GLAPIENTRY
-ChoosePixelFormat (HDC hdc, const PIXELFORMATDESCRIPTOR *ppfd)
-{
-
-   return wglChoosePixelFormat(hdc, ppfd);
-}
-
-GLAPI int GLAPIENTRY
-wglDescribePixelFormat (HDC hdc, int iPixelFormat, UINT nBytes,
-                        LPPIXELFORMATDESCRIPTOR ppfd)
-{
-   int qt_valid_pix;
-
-   qt_valid_pix = pfd_tablen();
-
-   if (iPixelFormat < 1 || iPixelFormat > qt_valid_pix ||
-       ((nBytes != sizeof(PIXELFORMATDESCRIPTOR)) && (nBytes != 0))) {
-      SetLastError(0);
-      return qt_valid_pix;
-   }
-
-   if (nBytes != 0)
-      *ppfd = pix[iPixelFormat - 1].pfd;
-
-   return qt_valid_pix;
-}
-
-GLAPI int GLAPIENTRY
-DescribePixelFormat (HDC hdc, int iPixelFormat, UINT nBytes,
-                     LPPIXELFORMATDESCRIPTOR ppfd)
-{
-   return wglDescribePixelFormat(hdc, iPixelFormat, nBytes, ppfd);
-}
-
-GLAPI int GLAPIENTRY
-wglGetPixelFormat (HDC hdc)
-{
-   if (curPFD == 0) {
-      SetLastError(0);
-      return 0;
-   }
-
-   return curPFD;
-}
-
-GLAPI int GLAPIENTRY
-GetPixelFormat (HDC hdc)
-{
-   return wglGetPixelFormat(hdc);
-}
-
-GLAPI BOOL GLAPIENTRY
-wglSetPixelFormat (HDC hdc, int iPixelFormat, const PIXELFORMATDESCRIPTOR *ppfd)
-{
-   int qt_valid_pix;
-
-   qt_valid_pix = pfd_tablen();
-
-   if (iPixelFormat < 1 || iPixelFormat > qt_valid_pix) {
-      if (ppfd == NULL) {
-         PIXELFORMATDESCRIPTOR my_pfd;
-         if (!wglDescribePixelFormat(hdc, iPixelFormat, sizeof(PIXELFORMATDESCRIPTOR), &my_pfd)) {
-            SetLastError(0);
-            return FALSE;
-         }
-      } else if (ppfd->nSize != sizeof(PIXELFORMATDESCRIPTOR)) {
-         SetLastError(0);
-         return FALSE;
-      }
-   }
-   curPFD = iPixelFormat;
-
-   return TRUE;
-}
-
-GLAPI BOOL GLAPIENTRY
-wglSwapBuffers (HDC hdc)
-{
-   if (!ctx) {
-      SetLastError(0);
-      return FALSE;
-   }
-
-   fxMesaSwapBuffers();
-
-   return TRUE;
-}
-
-GLAPI BOOL GLAPIENTRY
-SetPixelFormat (HDC hdc, int iPixelFormat, const PIXELFORMATDESCRIPTOR *ppfd)
-{
-   return wglSetPixelFormat(hdc, iPixelFormat, ppfd);
-}
-
-GLAPI BOOL GLAPIENTRY
-SwapBuffers(HDC hdc)
-{
-   return wglSwapBuffers(hdc);
-}
-
-static FIXED
-FixedFromDouble (double d)
-{
-   struct {
-      FIXED f;
-      long l;
-   } pun;
-   pun.l = (long)(d * 65536L);
-   return pun.f;
-}
-
-/*
-** This was yanked from windows/gdi/wgl.c
-*/
-GLAPI BOOL GLAPIENTRY
-wglUseFontBitmapsA (HDC hdc, DWORD first, DWORD count, DWORD listBase)
-{
-   int i;
-   GLuint font_list;
-   DWORD size;
-   GLYPHMETRICS gm;
-   HANDLE hBits;
-   LPSTR lpBits;
-   MAT2 mat;
-   int success = TRUE;
-
-   font_list = listBase;
-
-   mat.eM11 = FixedFromDouble(1);
-   mat.eM12 = FixedFromDouble(0);
-   mat.eM21 = FixedFromDouble(0);
-   mat.eM22 = FixedFromDouble(-1);
-
-   memset(&gm, 0, sizeof(gm));
-
-   /*
-    ** If we can't get the glyph outline, it may be because this is a fixed
-    ** font.  Try processing it that way.
-    */
-   if (GetGlyphOutline(hdc, first, GGO_BITMAP, &gm, 0, NULL, &mat) == GDI_ERROR) {
-      return wglUseFontBitmaps_FX(hdc, first, count, listBase);
-   }
-
-   /*
-    ** Otherwise process all desired characters.
-    */
-   for (i = 0; i < count; i++) {
-      DWORD err;
-
-      glNewList(font_list + i, GL_COMPILE);
-
-      /* allocate space for the bitmap/outline */
-      size = GetGlyphOutline(hdc, first + i, GGO_BITMAP, &gm, 0, NULL, &mat);
-      if (size == GDI_ERROR) {
-         glEndList();
-         err = GetLastError();
-         success = FALSE;
-         continue;
-      }
-
-      hBits = GlobalAlloc(GHND, size + 1);
-      lpBits = GlobalLock(hBits);
-
-      err = GetGlyphOutline(hdc,        /* handle to device context */
-                            first + i,  /* character to query */
-                            GGO_BITMAP, /* format of data to return */
-                            &gm,        /* pointer to structure for metrics */
-                            size,       /* size of buffer for data */
-                            lpBits,     /* pointer to buffer for data */
-                            &mat        /* pointer to transformation */
-                                        /* matrix structure */
-          );
-
-      if (err == GDI_ERROR) {
-         GlobalUnlock(hBits);
-         GlobalFree(hBits);
-
-         glEndList();
-         err = GetLastError();
-         success = FALSE;
-         continue;
-      }
-
-      glBitmap(gm.gmBlackBoxX, gm.gmBlackBoxY,
-               -gm.gmptGlyphOrigin.x,
-               gm.gmptGlyphOrigin.y,
-               gm.gmCellIncX, gm.gmCellIncY,
-               (const GLubyte *)lpBits);
-
-      GlobalUnlock(hBits);
-      GlobalFree(hBits);
-
-      glEndList();
-   }
-
-   return success;
-}
-
-GLAPI BOOL GLAPIENTRY
-wglDescribeLayerPlane (HDC hdc, int iPixelFormat, int iLayerPlane,
-                       UINT nBytes, LPLAYERPLANEDESCRIPTOR ppfd)
-{
-   SetLastError(0);
-   return FALSE;
-}
-
-GLAPI int GLAPIENTRY
-wglGetLayerPaletteEntries (HDC hdc, int iLayerPlane, int iStart,
-                           int cEntries, COLORREF *pcr)
-{
-   SetLastError(0);
-   return FALSE;
-}
-
-GLAPI BOOL GLAPIENTRY
-wglRealizeLayerPalette (HDC hdc, int iLayerPlane, BOOL bRealize)
-{
-   SetLastError(0);
-   return FALSE;
-}
-
-GLAPI int GLAPIENTRY
-wglSetLayerPaletteEntries (HDC hdc, int iLayerPlane, int iStart,
-                           int cEntries, CONST COLORREF *pcr)
-{
-   SetLastError(0);
-   return FALSE;
-}
-
-
-/***************************************************************************
- * [dBorca] simplistic ICD implementation, based on ICD code by Gregor Anich
- */
-
-typedef struct _icdTable {
-   DWORD size;
-   PROC table[336];
-} ICDTABLE, *PICDTABLE;
-
-#ifdef USE_MGL_NAMESPACE
-#define GL_FUNC(func) mgl##func
-#else
-#define GL_FUNC(func) gl##func
-#endif
-
-static ICDTABLE icdTable = { 336, {
-#define ICD_ENTRY(func) (PROC)GL_FUNC(func),
-#include "../icd/icdlist.h"
-#undef ICD_ENTRY
-} };
-
-
-GLAPI BOOL GLAPIENTRY
-DrvCopyContext (HGLRC hglrcSrc, HGLRC hglrcDst, UINT mask)
-{
-   return wglCopyContext(hglrcSrc, hglrcDst, mask);
-}
-
-
-GLAPI HGLRC GLAPIENTRY
-DrvCreateContext (HDC hdc)
-{
-   return wglCreateContext(hdc);
-}
-
-
-GLAPI BOOL GLAPIENTRY
-DrvDeleteContext (HGLRC hglrc)
-{
-   return wglDeleteContext(hglrc);
-}
-
-
-GLAPI HGLRC GLAPIENTRY
-DrvCreateLayerContext (HDC hdc, int iLayerPlane)
-{
-   return wglCreateContext(hdc);
-}
-
-
-GLAPI PICDTABLE GLAPIENTRY
-DrvSetContext (HDC hdc, HGLRC hglrc, void *callback)
-{
-   return wglMakeCurrent(hdc, hglrc) ? &icdTable : NULL;
-}
-
-
-GLAPI BOOL GLAPIENTRY
-DrvReleaseContext (HGLRC hglrc)
-{
-   return TRUE;
-}
-
-
-GLAPI BOOL GLAPIENTRY
-DrvShareLists (HGLRC hglrc1, HGLRC hglrc2)
-{
-   return wglShareLists(hglrc1, hglrc2);
-}
-
-
-GLAPI BOOL GLAPIENTRY
-DrvDescribeLayerPlane (HDC hdc, int iPixelFormat,
-                       int iLayerPlane, UINT nBytes,
-                       LPLAYERPLANEDESCRIPTOR plpd)
-{
-   return wglDescribeLayerPlane(hdc, iPixelFormat, iLayerPlane, nBytes, plpd);
-}
-
-
-GLAPI int GLAPIENTRY
-DrvSetLayerPaletteEntries (HDC hdc, int iLayerPlane,
-                           int iStart, int cEntries, CONST COLORREF *pcr)
-{
-   return wglSetLayerPaletteEntries(hdc, iLayerPlane, iStart, cEntries, pcr);
-}
-
-
-GLAPI int GLAPIENTRY
-DrvGetLayerPaletteEntries (HDC hdc, int iLayerPlane,
-                           int iStart, int cEntries, COLORREF *pcr)
-{
-   return wglGetLayerPaletteEntries(hdc, iLayerPlane, iStart, cEntries, pcr);
-}
-
-
-GLAPI BOOL GLAPIENTRY
-DrvRealizeLayerPalette (HDC hdc, int iLayerPlane, BOOL bRealize)
-{
-   return wglRealizeLayerPalette(hdc, iLayerPlane, bRealize);
-}
-
-
-GLAPI BOOL GLAPIENTRY
-DrvSwapLayerBuffers (HDC hdc, UINT fuPlanes)
-{
-   return wglSwapLayerBuffers(hdc, fuPlanes);
-}
-
-GLAPI int GLAPIENTRY
-DrvDescribePixelFormat (HDC hdc, int iPixelFormat, UINT nBytes,
-                        LPPIXELFORMATDESCRIPTOR ppfd)
-{
-   return wglDescribePixelFormat(hdc, iPixelFormat, nBytes, ppfd);
-}
-
-
-GLAPI PROC GLAPIENTRY
-DrvGetProcAddress (LPCSTR lpszProc)
-{
-   return wglGetProcAddress(lpszProc);
-}
-
-
-GLAPI BOOL GLAPIENTRY
-DrvSetPixelFormat (HDC hdc, int iPixelFormat)
-{
-   return wglSetPixelFormat(hdc, iPixelFormat, NULL);
-}
-
-
-GLAPI BOOL GLAPIENTRY
-DrvSwapBuffers (HDC hdc)
-{
-   return wglSwapBuffers(hdc);
-}
-
-
-GLAPI BOOL GLAPIENTRY
-DrvValidateVersion (DWORD version)
-{
-   (void)version;
-   return TRUE;
-}
-
-
-#if (_MSC_VER >= 1200)
-#pragma warning( pop )
-#endif
-
-#endif /* FX */
diff --git a/src/mesa/drivers/windows/gdi/InitCritSections.cpp b/src/mesa/drivers/windows/gdi/InitCritSections.cpp
deleted file mode 100644
index 69f03b8e47c..00000000000
--- a/src/mesa/drivers/windows/gdi/InitCritSections.cpp
+++ /dev/null
@@ -1,33 +0,0 @@
-#include "glapi.h"
-#include "glThread.h"
-
-#ifdef WIN32
-
-extern "C" _glthread_Mutex OneTimeLock;
-extern "C" _glthread_Mutex GenTexturesLock;
-
-extern "C" void FreeAllTSD(void);
-
-class _CriticalSectionInit
-{
-public:
-	static _CriticalSectionInit	m_inst;
-
-	_CriticalSectionInit()
-	{
-		_glthread_INIT_MUTEX(OneTimeLock);
-		_glthread_INIT_MUTEX(GenTexturesLock);
-	}
-
-	~_CriticalSectionInit()
-	{
-		_glthread_DESTROY_MUTEX(OneTimeLock);
-		_glthread_DESTROY_MUTEX(GenTexturesLock);
-		FreeAllTSD();
-	}
-};
-
-_CriticalSectionInit _CriticalSectionInit::m_inst;
-
-
-#endif /* WIN32 */
diff --git a/src/mesa/drivers/windows/gdi/wgl.c b/src/mesa/drivers/windows/gdi/wgl.c
index 1dafe6e2952..bf4ca9c908f 100644
--- a/src/mesa/drivers/windows/gdi/wgl.c
+++ b/src/mesa/drivers/windows/gdi/wgl.c
@@ -390,7 +390,7 @@ static FIXED FixedFromDouble(double d)
 static BOOL wglUseFontBitmaps_FX(HDC fontDevice, DWORD firstChar,
                                  DWORD numChars, DWORD listBase)
 {
-#define VERIFY(a) a
+#define VERIFY(a) (void)(a)
     
     TEXTMETRIC metric;
     BITMAPINFO *dibInfo;
diff --git a/src/mesa/drivers/windows/gdi/wmesa.c b/src/mesa/drivers/windows/gdi/wmesa.c
index e3a37eb1ace..35a150d0687 100644
--- a/src/mesa/drivers/windows/gdi/wmesa.c
+++ b/src/mesa/drivers/windows/gdi/wmesa.c
@@ -5,7 +5,7 @@
 
 #include "wmesadef.h"
 #include "colors.h"
-#include <GL/wmesa.h>
+#include "GL/wmesa.h"
 #include <winuser.h>
 #include "main/context.h"
 #include "main/extensions.h"
@@ -30,7 +30,7 @@ static WMesaFramebuffer FirstFramebuffer = NULL;
  * Create a new WMesaFramebuffer object which will correspond to the
  * given HDC (Window handle).
  */
-WMesaFramebuffer
+static WMesaFramebuffer
 wmesa_new_framebuffer(HDC hdc, struct gl_config *visual)
 {
     WMesaFramebuffer pwfb
@@ -48,7 +48,7 @@ wmesa_new_framebuffer(HDC hdc, struct gl_config *visual)
 /**
  * Given an hdc, free the corresponding WMesaFramebuffer
  */
-void
+static void
 wmesa_free_framebuffer(HDC hdc)
 {
     WMesaFramebuffer pwfb, prev;
@@ -71,7 +71,7 @@ wmesa_free_framebuffer(HDC hdc)
 /**
  * Given an hdc, return the corresponding WMesaFramebuffer
  */
-WMesaFramebuffer
+static WMesaFramebuffer
 wmesa_lookup_framebuffer(HDC hdc)
 {
     WMesaFramebuffer pwfb;
@@ -147,9 +147,8 @@ static void wmSetPixelFormat(WMesaFramebuffer pwfb, HDC hDC)
  * We write into this memory with the span routines and then blit it
  * to the window on a buffer swap.
  */
-BOOL wmCreateBackingStore(WMesaFramebuffer pwfb, long lxSize, long lySize)
+static BOOL wmCreateBackingStore(WMesaFramebuffer pwfb, long lxSize, long lySize)
 {
-    HDC          hdc = pwfb->hDC;
     LPBITMAPINFO pbmi = &(pwfb->bmi);
     HDC          hic;
 
@@ -227,7 +226,6 @@ wmesa_get_buffer_size(struct gl_framebuffer *buffer, GLuint *width, GLuint *heig
 
 static void wmesa_flush(struct gl_context *ctx)
 {
-    WMesaContext pwc = wmesa_context(ctx);
     WMesaFramebuffer pwfb = wmesa_framebuffer(ctx->WinSysDrawBuffer);
 
     if (ctx->Visual.doubleBufferMode == 1) {
@@ -254,9 +252,7 @@ static void wmesa_flush(struct gl_context *ctx)
 static void clear_color(struct gl_context *ctx, const GLfloat color[4])
 {
     WMesaContext pwc = wmesa_context(ctx);
-    WMesaFramebuffer pwfb = wmesa_framebuffer(ctx->DrawBuffer);
     GLubyte col[3];
-    UINT    bytesPerPixel = pwfb->cColorBits / 8; 
 
     CLAMPED_FLOAT_TO_UBYTE(col[0], color[0]);
     CLAMPED_FLOAT_TO_UBYTE(col[1], color[1]);
@@ -448,21 +444,15 @@ static void clear(struct gl_context *ctx, GLbitfield mask)
  **/
 
 /* Write a horizontal span of RGBA color pixels with a boolean mask. */
-static void write_rgba_span_front(const struct gl_context *ctx, 
-				   struct gl_renderbuffer *rb, 
-				   GLuint n, GLint x, GLint y,
-				   const GLubyte rgba[][4], 
-				   const GLubyte mask[] )
+static void write_rgba_span_front(struct gl_context *ctx, 
+				  struct gl_renderbuffer *rb, 
+				  GLuint n, GLint x, GLint y,
+				  const void *values, 
+				  const GLubyte *mask)
 {
+   const GLubyte (*rgba)[4] = (const GLubyte (*)[4])values;
    WMesaContext pwc = wmesa_context(ctx);
    WMesaFramebuffer pwfb = wmesa_lookup_framebuffer(pwc->hDC);
-   CONST BITMAPINFO bmi=
-   {
-      {
-         sizeof(BITMAPINFOHEADER),
-         n, 1, 1, 32, BI_RGB, 0, 1, 1, 0, 0
-      }
-   };
    HBITMAP bmp=0;
    HDC mdc=0;
    typedef union
@@ -535,12 +525,13 @@ static void write_rgba_span_front(const struct gl_context *ctx,
 }
 
 /* Write a horizontal span of RGB color pixels with a boolean mask. */
-static void write_rgb_span_front(const struct gl_context *ctx, 
-				  struct gl_renderbuffer *rb, 
-				  GLuint n, GLint x, GLint y,
-				  const GLubyte rgb[][3], 
-				  const GLubyte mask[] )
+static void write_rgb_span_front(struct gl_context *ctx, 
+				 struct gl_renderbuffer *rb, 
+				 GLuint n, GLint x, GLint y,
+				 const void *values, 
+				 const GLubyte *mask)
 {
+    const GLubyte (*rgb)[3] = (const GLubyte (*)[3])values;
     WMesaContext pwc = wmesa_context(ctx);
     GLuint i;
     
@@ -564,12 +555,13 @@ static void write_rgb_span_front(const struct gl_context *ctx,
  * Write a horizontal span of pixels with a boolean mask.  The current color
  * is used for all pixels.
  */
-static void write_mono_rgba_span_front(const struct gl_context *ctx, 
-					struct gl_renderbuffer *rb,
-					GLuint n, GLint x, GLint y,
-					const GLchan color[4], 
-					const GLubyte mask[])
+static void write_mono_rgba_span_front(struct gl_context *ctx, 
+                                       struct gl_renderbuffer *rb,
+                                       GLuint n, GLint x, GLint y,
+                                       const void *value, 
+                                       const GLubyte *mask)
 {
+    const GLchan *color = (const GLchan *)value;
     GLuint i;
     WMesaContext pwc = wmesa_context(ctx);
     COLORREF colorref;
@@ -589,13 +581,14 @@ static void write_mono_rgba_span_front(const struct gl_context *ctx,
 }
 
 /* Write an array of RGBA pixels with a boolean mask. */
-static void write_rgba_pixels_front(const struct gl_context *ctx, 
-				     struct gl_renderbuffer *rb,
-				     GLuint n, 
-				     const GLint x[], const GLint y[],
-				     const GLubyte rgba[][4], 
-				     const GLubyte mask[] )
+static void write_rgba_pixels_front(struct gl_context *ctx, 
+                                    struct gl_renderbuffer *rb,
+                                    GLuint n, 
+                                    const GLint x[], const GLint y[],
+                                    const void *values, 
+                                    const GLubyte *mask)
 {
+    const GLubyte (*rgba)[4] = (const GLubyte (*)[4])values;
     GLuint i;
     WMesaContext pwc = wmesa_context(ctx);
     (void) ctx;
@@ -612,13 +605,14 @@ static void write_rgba_pixels_front(const struct gl_context *ctx,
  * Write an array of pixels with a boolean mask.  The current color
  * is used for all pixels.
  */
-static void write_mono_rgba_pixels_front(const struct gl_context *ctx, 
-					  struct gl_renderbuffer *rb,
-					  GLuint n,
-					  const GLint x[], const GLint y[],
-					  const GLchan color[4],
-					  const GLubyte mask[] )
+static void write_mono_rgba_pixels_front(struct gl_context *ctx, 
+                                         struct gl_renderbuffer *rb,
+                                         GLuint n,
+                                         const GLint x[], const GLint y[],
+                                         const void *value,
+                                         const GLubyte *mask)
 {
+    const GLchan *color = (const GLchan *)value;
     GLuint i;
     WMesaContext pwc = wmesa_context(ctx);
     COLORREF colorref;
@@ -630,11 +624,12 @@ static void write_mono_rgba_pixels_front(const struct gl_context *ctx,
 }
 
 /* Read a horizontal span of color pixels. */
-static void read_rgba_span_front(const struct gl_context *ctx, 
-				  struct gl_renderbuffer *rb,
-				  GLuint n, GLint x, GLint y,
-				  GLubyte rgba[][4] )
+static void read_rgba_span_front(struct gl_context *ctx, 
+                                 struct gl_renderbuffer *rb,
+                                 GLuint n, GLint x, GLint y,
+                                 void *values)
 {
+    GLubyte (*rgba)[4] = (GLubyte (*)[4])values;
     WMesaContext pwc = wmesa_context(ctx);
     GLuint i;
     COLORREF Color;
@@ -650,11 +645,12 @@ static void read_rgba_span_front(const struct gl_context *ctx,
 
 
 /* Read an array of color pixels. */
-static void read_rgba_pixels_front(const struct gl_context *ctx, 
-				    struct gl_renderbuffer *rb,
-				    GLuint n, const GLint x[], const GLint y[],
-				    GLubyte rgba[][4])
+static void read_rgba_pixels_front(struct gl_context *ctx, 
+                                   struct gl_renderbuffer *rb,
+                                   GLuint n, const GLint x[], const GLint y[],
+                                   void *values)
 {
+    GLubyte (*rgba)[4] = (GLubyte (*)[4])values;
     WMesaContext pwc = wmesa_context(ctx);
     GLuint i;
     COLORREF Color;
@@ -679,13 +675,13 @@ LPDWORD lpdw = ((LPDWORD)((pwc)->pbPixels + (pwc)->ScanWidth * (y)) + (x)); \
 
 
 /* Write a horizontal span of RGBA color pixels with a boolean mask. */
-static void write_rgba_span_32(const struct gl_context *ctx, 
+static void write_rgba_span_32(struct gl_context *ctx, 
 			       struct gl_renderbuffer *rb, 
 			       GLuint n, GLint x, GLint y,
-			       const GLubyte rgba[][4], 
-			       const GLubyte mask[] )
+			       const void *values, 
+			       const GLubyte *mask)
 {
-    WMesaContext pwc = wmesa_context(ctx);
+    const GLubyte (*rgba)[4] = (const GLubyte (*)[4])values;
     WMesaFramebuffer pwfb = wmesa_framebuffer(ctx->DrawBuffer);
     GLuint i;
     LPDWORD lpdw;
@@ -709,13 +705,13 @@ static void write_rgba_span_32(const struct gl_context *ctx,
 
 
 /* Write a horizontal span of RGB color pixels with a boolean mask. */
-static void write_rgb_span_32(const struct gl_context *ctx, 
+static void write_rgb_span_32(struct gl_context *ctx, 
 			      struct gl_renderbuffer *rb, 
 			      GLuint n, GLint x, GLint y,
-			      const GLubyte rgb[][3], 
-			      const GLubyte mask[] )
+			      const void *values, 
+			      const GLubyte *mask)
 {
-    WMesaContext pwc = wmesa_context(ctx);
+    const GLubyte (*rgb)[3] = (const GLubyte (*)[3])values;
     WMesaFramebuffer pwfb = wmesa_framebuffer(ctx->DrawBuffer);
     GLuint i;
     LPDWORD lpdw;
@@ -741,16 +737,16 @@ static void write_rgb_span_32(const struct gl_context *ctx,
  * Write a horizontal span of pixels with a boolean mask.  The current color
  * is used for all pixels.
  */
-static void write_mono_rgba_span_32(const struct gl_context *ctx, 
+static void write_mono_rgba_span_32(struct gl_context *ctx, 
 				    struct gl_renderbuffer *rb,
 				    GLuint n, GLint x, GLint y,
-				    const GLchan color[4], 
-				    const GLubyte mask[])
+				    const void *value, 
+				    const GLubyte *mask)
 {
+    const GLchan *color = (const GLchan *)value;
     LPDWORD lpdw;
     DWORD pixel;
     GLuint i;
-    WMesaContext pwc = wmesa_context(ctx);
     WMesaFramebuffer pwfb = wmesa_framebuffer(ctx->DrawBuffer);
     lpdw = ((LPDWORD)(pwfb->pbPixels + pwfb->ScanWidth * y)) + x;
     y=FLIP(y);
@@ -767,14 +763,14 @@ static void write_mono_rgba_span_32(const struct gl_context *ctx,
 }
 
 /* Write an array of RGBA pixels with a boolean mask. */
-static void write_rgba_pixels_32(const struct gl_context *ctx, 
+static void write_rgba_pixels_32(struct gl_context *ctx, 
 				 struct gl_renderbuffer *rb,
 				 GLuint n, const GLint x[], const GLint y[],
-				 const GLubyte rgba[][4], 
-				 const GLubyte mask[])
+				 const void *values, 
+				 const GLubyte *mask)
 {
+    const GLubyte (*rgba)[4] = (const GLubyte (*)[4])values;
     GLuint i;
-    WMesaContext pwc = wmesa_context(ctx);
     WMesaFramebuffer pwfb = wmesa_framebuffer(ctx->DrawBuffer);
     for (i=0; i<n; i++)
 	if (mask[i])
@@ -786,15 +782,15 @@ static void write_rgba_pixels_32(const struct gl_context *ctx,
  * Write an array of pixels with a boolean mask.  The current color
  * is used for all pixels.
  */
-static void write_mono_rgba_pixels_32(const struct gl_context *ctx, 
+static void write_mono_rgba_pixels_32(struct gl_context *ctx, 
 				      struct gl_renderbuffer *rb,
 				      GLuint n,
 				      const GLint x[], const GLint y[],
-				      const GLchan color[4],
-				      const GLubyte mask[])
+				      const void *value,
+				      const GLubyte *mask)
 {
+    const GLchan *color = (const GLchan *)value;
     GLuint i;
-    WMesaContext pwc = wmesa_context(ctx);
     WMesaFramebuffer pwfb = wmesa_framebuffer(ctx->DrawBuffer);
     for (i=0; i<n; i++)
 	if (mask[i])
@@ -803,15 +799,15 @@ static void write_mono_rgba_pixels_32(const struct gl_context *ctx,
 }
 
 /* Read a horizontal span of color pixels. */
-static void read_rgba_span_32(const struct gl_context *ctx, 
+static void read_rgba_span_32(struct gl_context *ctx, 
 			      struct gl_renderbuffer *rb,
 			      GLuint n, GLint x, GLint y,
-			      GLubyte rgba[][4] )
+			      void *values)
 {
+    GLubyte (*rgba)[4] = (GLubyte (*)[4])values;
     GLuint i;
     DWORD pixel;
     LPDWORD lpdw;
-    WMesaContext pwc = wmesa_context(ctx);
     WMesaFramebuffer pwfb = wmesa_framebuffer(ctx->DrawBuffer);
     
     y = FLIP(y);
@@ -827,15 +823,15 @@ static void read_rgba_span_32(const struct gl_context *ctx,
 
 
 /* Read an array of color pixels. */
-static void read_rgba_pixels_32(const struct gl_context *ctx, 
+static void read_rgba_pixels_32(struct gl_context *ctx, 
 				struct gl_renderbuffer *rb,
 				GLuint n, const GLint x[], const GLint y[],
-				GLubyte rgba[][4])
+				void *values)
 {
+    GLubyte (*rgba)[4] = (GLubyte (*)[4])values;
     GLuint i;
     DWORD pixel;
     LPDWORD lpdw;
-    WMesaContext pwc = wmesa_context(ctx);
     WMesaFramebuffer pwfb = wmesa_framebuffer(ctx->DrawBuffer);
 
     for (i=0; i<n; i++) {
@@ -861,13 +857,13 @@ lpb[1] = (g); \
 lpb[2] = (r); }
 
 /* Write a horizontal span of RGBA color pixels with a boolean mask. */
-static void write_rgba_span_24(const struct gl_context *ctx, 
+static void write_rgba_span_24(struct gl_context *ctx, 
 			       struct gl_renderbuffer *rb, 
 			       GLuint n, GLint x, GLint y,
-			       const GLubyte rgba[][4], 
-			       const GLubyte mask[] )
+			       const void *values, 
+			       const GLubyte *mask)
 {
-    WMesaContext pwc = wmesa_context(ctx);
+    const GLubyte (*rgba)[4] = (const GLubyte (*)[4])values;
     WMesaFramebuffer pwfb = wmesa_framebuffer(ctx->DrawBuffer);
     GLuint i;
     LPBYTE lpb;
@@ -895,13 +891,13 @@ static void write_rgba_span_24(const struct gl_context *ctx,
 
 
 /* Write a horizontal span of RGB color pixels with a boolean mask. */
-static void write_rgb_span_24(const struct gl_context *ctx, 
+static void write_rgb_span_24(struct gl_context *ctx, 
 			      struct gl_renderbuffer *rb, 
 			      GLuint n, GLint x, GLint y,
-			      const GLubyte rgb[][3], 
-			      const GLubyte mask[] )
+			      const void *values, 
+			      const GLubyte *mask)
 {
-    WMesaContext pwc = wmesa_context(ctx);
+    const GLubyte (*rgb)[3] = (const GLubyte (*)[3])values;
     WMesaFramebuffer pwfb = wmesa_framebuffer(ctx->DrawBuffer);
     GLuint i;
     LPBYTE lpb;
@@ -931,15 +927,15 @@ static void write_rgb_span_24(const struct gl_context *ctx,
  * Write a horizontal span of pixels with a boolean mask.  The current color
  * is used for all pixels.
  */
-static void write_mono_rgba_span_24(const struct gl_context *ctx, 
+static void write_mono_rgba_span_24(struct gl_context *ctx, 
 				    struct gl_renderbuffer *rb,
 				    GLuint n, GLint x, GLint y,
-				    const GLchan color[4], 
-				    const GLubyte mask[])
+				    const void *value, 
+				    const GLubyte *mask)
 {
+    const GLchan *color = (const GLchan *)value;
     LPBYTE lpb;
     GLuint i;
-    WMesaContext pwc = wmesa_context(ctx);
     WMesaFramebuffer pwfb = wmesa_framebuffer(ctx->DrawBuffer);
     lpb = ((LPBYTE)(pwfb->pbPixels + pwfb->ScanWidth * y)) + (3 * x);
     y=FLIP(y);
@@ -960,14 +956,14 @@ static void write_mono_rgba_span_24(const struct gl_context *ctx,
 }
 
 /* Write an array of RGBA pixels with a boolean mask. */
-static void write_rgba_pixels_24(const struct gl_context *ctx, 
+static void write_rgba_pixels_24(struct gl_context *ctx, 
 				 struct gl_renderbuffer *rb,
 				 GLuint n, const GLint x[], const GLint y[],
-				 const GLubyte rgba[][4], 
-				 const GLubyte mask[])
+				 const void *values, 
+				 const GLubyte *mask)
 {
+    const GLubyte (*rgba)[4] = (const GLubyte (*)[4])values;
     GLuint i;
-    WMesaContext pwc = wmesa_context(ctx);
     WMesaFramebuffer pwfb = wmesa_framebuffer(ctx->DrawBuffer);
     for (i=0; i<n; i++)
 	if (mask[i])
@@ -979,15 +975,15 @@ static void write_rgba_pixels_24(const struct gl_context *ctx,
  * Write an array of pixels with a boolean mask.  The current color
  * is used for all pixels.
  */
-static void write_mono_rgba_pixels_24(const struct gl_context *ctx, 
+static void write_mono_rgba_pixels_24(struct gl_context *ctx, 
 				      struct gl_renderbuffer *rb,
 				      GLuint n,
 				      const GLint x[], const GLint y[],
-				      const GLchan color[4],
-				      const GLubyte mask[])
+				      const void *value,
+				      const GLubyte *mask)
 {
+    const GLchan *color = (const GLchan *)value;
     GLuint i;
-    WMesaContext pwc = wmesa_context(ctx);
     WMesaFramebuffer pwfb = wmesa_framebuffer(ctx->DrawBuffer);
     for (i=0; i<n; i++)
 	if (mask[i])
@@ -996,14 +992,14 @@ static void write_mono_rgba_pixels_24(const struct gl_context *ctx,
 }
 
 /* Read a horizontal span of color pixels. */
-static void read_rgba_span_24(const struct gl_context *ctx, 
+static void read_rgba_span_24(struct gl_context *ctx, 
 			      struct gl_renderbuffer *rb,
 			      GLuint n, GLint x, GLint y,
-			      GLubyte rgba[][4] )
+			      void *values)
 {
+    GLubyte (*rgba)[4] = (GLubyte (*)[4])values;
     GLuint i;
     LPBYTE lpb;
-    WMesaContext pwc = wmesa_context(ctx);
     WMesaFramebuffer pwfb = wmesa_framebuffer(ctx->DrawBuffer);
     
     y = FLIP(y);
@@ -1018,14 +1014,14 @@ static void read_rgba_span_24(const struct gl_context *ctx,
 
 
 /* Read an array of color pixels. */
-static void read_rgba_pixels_24(const struct gl_context *ctx, 
+static void read_rgba_pixels_24(struct gl_context *ctx, 
 				struct gl_renderbuffer *rb,
 				GLuint n, const GLint x[], const GLint y[],
-				GLubyte rgba[][4])
+				void *values)
 {
+    GLubyte (*rgba)[4] = (GLubyte (*)[4])values;
     GLuint i;
     LPBYTE lpb;
-    WMesaContext pwc = wmesa_context(ctx);
     WMesaFramebuffer pwfb = wmesa_framebuffer(ctx->DrawBuffer);
 
     for (i=0; i<n; i++) {
@@ -1050,13 +1046,13 @@ LPWORD lpw = ((LPWORD)((pwc)->pbPixels + (pwc)->ScanWidth * (y)) + (x)); \
 
 
 /* Write a horizontal span of RGBA color pixels with a boolean mask. */
-static void write_rgba_span_16(const struct gl_context *ctx, 
+static void write_rgba_span_16(struct gl_context *ctx, 
 			       struct gl_renderbuffer *rb, 
 			       GLuint n, GLint x, GLint y,
-			       const GLubyte rgba[][4], 
-			       const GLubyte mask[] )
+			       const void *values, 
+			       const GLubyte *mask)
 {
-    WMesaContext pwc = wmesa_context(ctx);
+    const GLubyte (*rgba)[4] = (const GLubyte (*)[4])values;
     WMesaFramebuffer pwfb = wmesa_framebuffer(ctx->DrawBuffer);
     GLuint i;
     LPWORD lpw;
@@ -1080,13 +1076,13 @@ static void write_rgba_span_16(const struct gl_context *ctx,
 
 
 /* Write a horizontal span of RGB color pixels with a boolean mask. */
-static void write_rgb_span_16(const struct gl_context *ctx, 
+static void write_rgb_span_16(struct gl_context *ctx, 
 			      struct gl_renderbuffer *rb, 
 			      GLuint n, GLint x, GLint y,
-			      const GLubyte rgb[][3], 
-			      const GLubyte mask[] )
+			      const void *values, 
+			      const GLubyte *mask)
 {
-    WMesaContext pwc = wmesa_context(ctx);
+    const GLubyte (*rgb)[3] = (const GLubyte (*)[3])values;
     WMesaFramebuffer pwfb = wmesa_framebuffer(ctx->DrawBuffer);
     GLuint i;
     LPWORD lpw;
@@ -1112,16 +1108,16 @@ static void write_rgb_span_16(const struct gl_context *ctx,
  * Write a horizontal span of pixels with a boolean mask.  The current color
  * is used for all pixels.
  */
-static void write_mono_rgba_span_16(const struct gl_context *ctx, 
+static void write_mono_rgba_span_16(struct gl_context *ctx, 
 				    struct gl_renderbuffer *rb,
 				    GLuint n, GLint x, GLint y,
-				    const GLchan color[4], 
-				    const GLubyte mask[])
+				    const void *value, 
+				    const GLubyte *mask)
 {
+    const GLchan *color = (const GLchan *)value;
     LPWORD lpw;
     WORD pixel;
     GLuint i;
-    WMesaContext pwc = wmesa_context(ctx);
     WMesaFramebuffer pwfb = wmesa_framebuffer(ctx->DrawBuffer);
     (void) ctx;
     lpw = ((LPWORD)(pwfb->pbPixels + pwfb->ScanWidth * y)) + x;
@@ -1139,14 +1135,14 @@ static void write_mono_rgba_span_16(const struct gl_context *ctx,
 }
 
 /* Write an array of RGBA pixels with a boolean mask. */
-static void write_rgba_pixels_16(const struct gl_context *ctx, 
+static void write_rgba_pixels_16(struct gl_context *ctx, 
 				 struct gl_renderbuffer *rb,
 				 GLuint n, const GLint x[], const GLint y[],
-				 const GLubyte rgba[][4], 
-				 const GLubyte mask[])
+				 const void *values, 
+				 const GLubyte *mask)
 {
+    const GLubyte (*rgba)[4] = (const GLubyte (*)[4])values;
     GLuint i;
-    WMesaContext pwc = wmesa_context(ctx);
     WMesaFramebuffer pwfb = wmesa_framebuffer(ctx->DrawBuffer);
     (void) ctx;
     for (i=0; i<n; i++)
@@ -1159,15 +1155,15 @@ static void write_rgba_pixels_16(const struct gl_context *ctx,
  * Write an array of pixels with a boolean mask.  The current color
  * is used for all pixels.
  */
-static void write_mono_rgba_pixels_16(const struct gl_context *ctx, 
+static void write_mono_rgba_pixels_16(struct gl_context *ctx, 
 				      struct gl_renderbuffer *rb,
 				      GLuint n,
 				      const GLint x[], const GLint y[],
-				      const GLchan color[4],
-				      const GLubyte mask[])
+				      const void *value,
+				      const GLubyte *mask)
 {
+    const GLchan *color = (const GLchan *)value;
     GLuint i;
-    WMesaContext pwc = wmesa_context(ctx);
     WMesaFramebuffer pwfb = wmesa_framebuffer(ctx->DrawBuffer);
     (void) ctx;
     for (i=0; i<n; i++)
@@ -1177,14 +1173,14 @@ static void write_mono_rgba_pixels_16(const struct gl_context *ctx,
 }
 
 /* Read a horizontal span of color pixels. */
-static void read_rgba_span_16(const struct gl_context *ctx, 
+static void read_rgba_span_16(struct gl_context *ctx, 
 			      struct gl_renderbuffer *rb,
 			      GLuint n, GLint x, GLint y,
-			      GLubyte rgba[][4] )
+			      void *values)
 {
+    GLubyte (*rgba)[4] = (GLubyte (*)[4])values;
     GLuint i, pixel;
     LPWORD lpw;
-    WMesaContext pwc = wmesa_context(ctx);
     WMesaFramebuffer pwfb = wmesa_framebuffer(ctx->DrawBuffer);
     
     y = FLIP(y);
@@ -1201,14 +1197,14 @@ static void read_rgba_span_16(const struct gl_context *ctx,
 
 
 /* Read an array of color pixels. */
-static void read_rgba_pixels_16(const struct gl_context *ctx, 
+static void read_rgba_pixels_16(struct gl_context *ctx, 
 				struct gl_renderbuffer *rb,
 				GLuint n, const GLint x[], const GLint y[],
-				GLubyte rgba[][4])
+				void *values)
 {
+    GLubyte (*rgba)[4] = (GLubyte (*)[4])values;
     GLuint i, pixel;
     LPWORD lpw;
-    WMesaContext pwc = wmesa_context(ctx);
     WMesaFramebuffer pwfb = wmesa_framebuffer(ctx->DrawBuffer);
 
     for (i=0; i<n; i++) {
@@ -1261,8 +1257,9 @@ wmesa_renderbuffer_storage(struct gl_context *ctx,
  * Plug in the Get/PutRow/Values functions for a renderbuffer depending
  * on if we're drawing to the front or back color buffer.
  */
-void wmesa_set_renderbuffer_funcs(struct gl_renderbuffer *rb, int pixelformat,
-                                  int cColorBits, int double_buffer)
+static void
+wmesa_set_renderbuffer_funcs(struct gl_renderbuffer *rb, int pixelformat,
+                             int cColorBits, int double_buffer)
 {
     if (double_buffer) {
         /* back buffer */
@@ -1324,7 +1321,6 @@ static void
 wmesa_resize_buffers(struct gl_context *ctx, struct gl_framebuffer *buffer,
                      GLuint width, GLuint height)
 {
-    WMesaContext pwc = wmesa_context(ctx);
     WMesaFramebuffer pwfb = wmesa_framebuffer(buffer);
 
     if (pwfb->Base.Width != width || pwfb->Base.Height != height) {
@@ -1353,7 +1349,6 @@ static void wmesa_viewport(struct gl_context *ctx,
 			   GLint x, GLint y, 
 			   GLsizei width, GLsizei height)
 {
-    WMesaContext pwc = wmesa_context(ctx);
     GLuint new_width, new_height;
 
     wmesa_get_buffer_size(ctx->WinSysDrawBuffer, &new_width, &new_height);
@@ -1553,7 +1548,7 @@ void WMesaDestroyContext( WMesaContext pwc )
 /**
  * Create a new color renderbuffer.
  */
-struct gl_renderbuffer *
+static struct gl_renderbuffer *
 wmesa_new_renderbuffer(void)
 {
     struct gl_renderbuffer *rb = CALLOC_STRUCT(gl_renderbuffer);
diff --git a/src/mesa/drivers/windows/gdi/wmesadef.h b/src/mesa/drivers/windows/gdi/wmesadef.h
index 32289ebc700..9fda8839014 100644
--- a/src/mesa/drivers/windows/gdi/wmesadef.h
+++ b/src/mesa/drivers/windows/gdi/wmesadef.h
@@ -1,8 +1,8 @@
 #ifndef WMESADEF_H
 #define WMESADEF_H
-#ifdef __MINGW32__
+
 #include <windows.h>
-#endif
+
 #include "main/context.h"
 
 
diff --git a/src/mesa/drivers/x11/xm_dd.c b/src/mesa/drivers/x11/xm_dd.c
index 3031b7b3273..81f48f9d95a 100644
--- a/src/mesa/drivers/x11/xm_dd.c
+++ b/src/mesa/drivers/x11/xm_dd.c
@@ -445,11 +445,11 @@ xmesa_DrawPixels_8R8G8B( struct gl_context *ctx,
       if (swrast->NewState)
          _swrast_validate_derived( ctx );
 
-      if (unpack->BufferObj->Name) {
+      if (_mesa_is_bufferobj(unpack->BufferObj)) {
          /* unpack from PBO */
          GLubyte *buf;
          if (!_mesa_validate_pbo_access(2, unpack, width, height, 1,
-                                        format, type, pixels)) {
+                                        format, type, INT_MAX, pixels)) {
             _mesa_error(ctx, GL_INVALID_OPERATION,
                         "glDrawPixels(invalid PBO access)");
             return;
@@ -507,7 +507,7 @@ xmesa_DrawPixels_8R8G8B( struct gl_context *ctx,
          XPutImage(dpy, xrb->pixmap, gc, &ximage, 0, 0, dstX, dstY, w, h);
       }
 
-      if (unpack->BufferObj->Name) {
+      if (_mesa_is_bufferobj(unpack->BufferObj)) {
          ctx->Driver.UnmapBuffer(ctx, GL_PIXEL_UNPACK_BUFFER_EXT,
                                  unpack->BufferObj);
       }
@@ -580,11 +580,11 @@ xmesa_DrawPixels_5R6G5B( struct gl_context *ctx,
       if (swrast->NewState)
          _swrast_validate_derived( ctx );
       
-      if (unpack->BufferObj->Name) {
+      if (_mesa_is_bufferobj(unpack->BufferObj)) {
          /* unpack from PBO */
          GLubyte *buf;
          if (!_mesa_validate_pbo_access(2, unpack, width, height, 1,
-                                        format, type, pixels)) {
+                                        format, type, INT_MAX, pixels)) {
             _mesa_error(ctx, GL_INVALID_OPERATION,
                         "glDrawPixels(invalid PBO access)");
             return;