53 files changed, 1225 insertions, 875 deletions
diff --git a/src/mesa/drivers/common/meta.c b/src/mesa/drivers/common/meta.c
index 629ec0ffec5..c548e104203 100644
--- a/src/mesa/drivers/common/meta.c
+++ b/src/mesa/drivers/common/meta.c
@@ -2400,6 +2400,9 @@ _mesa_meta_GenerateMipmap(GLcontext *ctx, GLenum target,
          break;
       }
 
+      /* Set MaxLevel large enough to hold the new level when we allocate it */
+      _mesa_TexParameteri(target, GL_TEXTURE_MAX_LEVEL, dstLevel);
+
       /* Create empty dest image */
       if (target == GL_TEXTURE_1D) {
          _mesa_TexImage1D(target, dstLevel, srcImage->InternalFormat,
diff --git a/src/mesa/drivers/dri/i965/Makefile b/src/mesa/drivers/dri/i965/Makefile
index a0039e800d2..831981558d8 100644
--- a/src/mesa/drivers/dri/i965/Makefile
+++ b/src/mesa/drivers/dri/i965/Makefile
@@ -61,6 +61,7 @@ DRIVER_SOURCES = \
 	brw_sf.c \
 	brw_sf_emit.c \
 	brw_sf_state.c \
+	brw_state.c \
 	brw_state_batch.c \
 	brw_state_cache.c \
 	brw_state_dump.c \
diff --git a/src/mesa/drivers/dri/i965/brw_cc.c b/src/mesa/drivers/dri/i965/brw_cc.c
index c9e42a1529b..cfce5d31405 100644
--- a/src/mesa/drivers/dri/i965/brw_cc.c
+++ b/src/mesa/drivers/dri/i965/brw_cc.c
@@ -36,7 +36,8 @@
 #include "brw_util.h"
 #include "main/macros.h"
 
-static void prepare_cc_vp( struct brw_context *brw )
+void
+brw_update_cc_vp(struct brw_context *brw)
 {
    GLcontext *ctx = &brw->intel.ctx;
    struct brw_cc_viewport ccv;
@@ -54,40 +55,9 @@ static void prepare_cc_vp( struct brw_context *brw )
    }
 
    drm_intel_bo_unreference(brw->cc.vp_bo);
-   brw->cc.vp_bo = brw_cache_data(&brw->cache, BRW_CC_VP, &ccv, sizeof(ccv),
-				  NULL, 0);
+   brw->cc.vp_bo = brw_cache_data(&brw->cache, BRW_CC_VP, &ccv, sizeof(ccv));
 }
 
-const struct brw_tracked_state brw_cc_vp = {
-   .dirty = {
-      .mesa = _NEW_VIEWPORT | _NEW_TRANSFORM,
-      .brw = BRW_NEW_CONTEXT,
-      .cache = 0
-   },
-   .prepare = prepare_cc_vp
-};
-
-struct brw_cc_unit_key {
-   GLboolean stencil, stencil_two_side, color_blend, alpha_enabled;
-
-   GLenum stencil_func[2], stencil_fail_op[2];
-   GLenum stencil_pass_depth_fail_op[2], stencil_pass_depth_pass_op[2];
-   GLubyte stencil_ref[2], stencil_write_mask[2], stencil_test_mask[2];
-   GLenum logic_op;
-
-   GLenum blend_eq_rgb, blend_eq_a;
-   GLenum blend_src_rgb, blend_src_a;
-   GLenum blend_dst_rgb, blend_dst_a;
-
-   GLenum alpha_func;
-   GLclampf alpha_ref;
-
-   GLboolean dither;
-
-   GLboolean depth_test, depth_write;
-   GLenum depth_func;
-};
-
 /**
  * Modify blend function to force destination alpha to 1.0
  *
@@ -110,136 +80,83 @@ fix_xRGB_alpha(GLenum function)
    return function;
 }
 
-static void
-cc_unit_populate_key(struct brw_context *brw, struct brw_cc_unit_key *key)
+static void prepare_cc_unit(struct brw_context *brw)
 {
-   GLcontext *ctx = &brw->intel.ctx;
-   const unsigned back = ctx->Stencil._BackFace;
-
-   memset(key, 0, sizeof(*key));
-
-   key->stencil = ctx->Stencil._Enabled;
-   key->stencil_two_side = ctx->Stencil._TestTwoSide;
-
-   if (key->stencil) {
-      key->stencil_func[0] = ctx->Stencil.Function[0];
-      key->stencil_fail_op[0] = ctx->Stencil.FailFunc[0];
-      key->stencil_pass_depth_fail_op[0] = ctx->Stencil.ZFailFunc[0];
-      key->stencil_pass_depth_pass_op[0] = ctx->Stencil.ZPassFunc[0];
-      key->stencil_ref[0] = ctx->Stencil.Ref[0];
-      key->stencil_write_mask[0] = ctx->Stencil.WriteMask[0];
-      key->stencil_test_mask[0] = ctx->Stencil.ValueMask[0];
-   }
-   if (key->stencil_two_side) {
-      key->stencil_func[1] = ctx->Stencil.Function[back];
-      key->stencil_fail_op[1] = ctx->Stencil.FailFunc[back];
-      key->stencil_pass_depth_fail_op[1] = ctx->Stencil.ZFailFunc[back];
-      key->stencil_pass_depth_pass_op[1] = ctx->Stencil.ZPassFunc[back];
-      key->stencil_ref[1] = ctx->Stencil.Ref[back];
-      key->stencil_write_mask[1] = ctx->Stencil.WriteMask[back];
-      key->stencil_test_mask[1] = ctx->Stencil.ValueMask[back];
-   }
-
-   if (ctx->Color._LogicOpEnabled)
-      key->logic_op = ctx->Color.LogicOp;
-   else
-      key->logic_op = GL_COPY;
-
-   key->color_blend = ctx->Color.BlendEnabled;
-   if (key->color_blend) {
-      key->blend_eq_rgb = ctx->Color.BlendEquationRGB;
-      key->blend_eq_a = ctx->Color.BlendEquationA;
-      key->blend_src_rgb = ctx->Color.BlendSrcRGB;
-      key->blend_dst_rgb = ctx->Color.BlendDstRGB;
-      key->blend_src_a = ctx->Color.BlendSrcA;
-      key->blend_dst_a = ctx->Color.BlendDstA;
-
-      /* If the renderbuffer is XRGB, we have to frob the blend function to
-       * force the destination alpha to 1.0.  This means replacing GL_DST_ALPHA
-       * with GL_ONE and GL_ONE_MINUS_DST_ALPHA with GL_ZERO.
-       */
-      if (ctx->DrawBuffer->Visual.alphaBits == 0) {
-	 key->blend_src_rgb = fix_xRGB_alpha(key->blend_src_rgb);
-	 key->blend_src_a   = fix_xRGB_alpha(key->blend_src_a);
-	 key->blend_dst_rgb = fix_xRGB_alpha(key->blend_dst_rgb);
-	 key->blend_dst_a   = fix_xRGB_alpha(key->blend_dst_a);
-      }
-   }
-
-   key->alpha_enabled = ctx->Color.AlphaEnabled;
-   if (key->alpha_enabled) {
-      key->alpha_func = ctx->Color.AlphaFunc;
-      key->alpha_ref = ctx->Color.AlphaRef;
-   }
-
-   key->dither = ctx->Color.DitherFlag;
-
-   key->depth_test = ctx->Depth.Test;
-   if (key->depth_test) {
-      key->depth_func = ctx->Depth.Func;
-      key->depth_write = ctx->Depth.Mask;
-   }
+   brw_add_validated_bo(brw, brw->cc.vp_bo);
 }
 
 /**
- * Creates the state cache entry for the given CC unit key.
+ * Build the CC unit from GL context state and upload it via brw_state_batch().
  */
-static drm_intel_bo *
-cc_unit_create_from_key(struct brw_context *brw, struct brw_cc_unit_key *key)
+static void upload_cc_unit(struct brw_context *brw)
 {
+   GLcontext *ctx = &brw->intel.ctx;
    struct brw_cc_unit_state cc;
-   drm_intel_bo *bo;
+   void *map;
 
    memset(&cc, 0, sizeof(cc));
 
    /* _NEW_STENCIL */
-   if (key->stencil) {
+   if (ctx->Stencil._Enabled) {
+      const unsigned back = ctx->Stencil._BackFace;
+
       cc.cc0.stencil_enable = 1;
       cc.cc0.stencil_func =
-	 intel_translate_compare_func(key->stencil_func[0]);
+	 intel_translate_compare_func(ctx->Stencil.Function[0]);
       cc.cc0.stencil_fail_op =
-	 intel_translate_stencil_op(key->stencil_fail_op[0]);
+	 intel_translate_stencil_op(ctx->Stencil.FailFunc[0]);
       cc.cc0.stencil_pass_depth_fail_op =
-	 intel_translate_stencil_op(key->stencil_pass_depth_fail_op[0]);
+	 intel_translate_stencil_op(ctx->Stencil.ZFailFunc[0]);
       cc.cc0.stencil_pass_depth_pass_op =
-	 intel_translate_stencil_op(key->stencil_pass_depth_pass_op[0]);
-      cc.cc1.stencil_ref = key->stencil_ref[0];
-      cc.cc1.stencil_write_mask = key->stencil_write_mask[0];
-      cc.cc1.stencil_test_mask = key->stencil_test_mask[0];
+	 intel_translate_stencil_op(ctx->Stencil.ZPassFunc[0]);
+      cc.cc1.stencil_ref = ctx->Stencil.Ref[0];
+      cc.cc1.stencil_write_mask = ctx->Stencil.WriteMask[0];
+      cc.cc1.stencil_test_mask = ctx->Stencil.ValueMask[0];
 
-      if (key->stencil_two_side) {
+      if (ctx->Stencil._TestTwoSide) {
 	 cc.cc0.bf_stencil_enable = 1;
 	 cc.cc0.bf_stencil_func =
-	    intel_translate_compare_func(key->stencil_func[1]);
+	    intel_translate_compare_func(ctx->Stencil.Function[back]);
 	 cc.cc0.bf_stencil_fail_op =
-	    intel_translate_stencil_op(key->stencil_fail_op[1]);
+	    intel_translate_stencil_op(ctx->Stencil.FailFunc[back]);
 	 cc.cc0.bf_stencil_pass_depth_fail_op =
-	    intel_translate_stencil_op(key->stencil_pass_depth_fail_op[1]);
+	    intel_translate_stencil_op(ctx->Stencil.ZFailFunc[back]);
 	 cc.cc0.bf_stencil_pass_depth_pass_op =
-	    intel_translate_stencil_op(key->stencil_pass_depth_pass_op[1]);
-	 cc.cc1.bf_stencil_ref = key->stencil_ref[1];
-	 cc.cc2.bf_stencil_write_mask = key->stencil_write_mask[1];
-	 cc.cc2.bf_stencil_test_mask = key->stencil_test_mask[1];
+	    intel_translate_stencil_op(ctx->Stencil.ZPassFunc[back]);
+	 cc.cc1.bf_stencil_ref = ctx->Stencil.Ref[back];
+	 cc.cc2.bf_stencil_write_mask = ctx->Stencil.WriteMask[back];
+	 cc.cc2.bf_stencil_test_mask = ctx->Stencil.ValueMask[back];
       }
 
       /* Not really sure about this:
        */
-      if (key->stencil_write_mask[0] ||
-	  (key->stencil_two_side && key->stencil_write_mask[1]))
+      if (ctx->Stencil.WriteMask[0] ||
+	  (ctx->Stencil._TestTwoSide && ctx->Stencil.WriteMask[back]))
 	 cc.cc0.stencil_write_enable = 1;
    }
 
    /* _NEW_COLOR */
-   if (key->logic_op != GL_COPY) {
+   if (ctx->Color._LogicOpEnabled && ctx->Color.LogicOp != GL_COPY) {
       cc.cc2.logicop_enable = 1;
-      cc.cc5.logicop_func = intel_translate_logic_op(key->logic_op);
-   } else if (key->color_blend) {
-      GLenum eqRGB = key->blend_eq_rgb;
-      GLenum eqA = key->blend_eq_a;
-      GLenum srcRGB = key->blend_src_rgb;
-      GLenum dstRGB = key->blend_dst_rgb;
-      GLenum srcA = key->blend_src_a;
-      GLenum dstA = key->blend_dst_a;
+      cc.cc5.logicop_func = intel_translate_logic_op(ctx->Color.LogicOp);
+   } else if (ctx->Color.BlendEnabled) {
+      GLenum eqRGB = ctx->Color.BlendEquationRGB;
+      GLenum eqA = ctx->Color.BlendEquationA;
+      GLenum srcRGB = ctx->Color.BlendSrcRGB;
+      GLenum dstRGB = ctx->Color.BlendDstRGB;
+      GLenum srcA = ctx->Color.BlendSrcA;
+      GLenum dstA = ctx->Color.BlendDstA;
+
+      /* If the renderbuffer is XRGB, we have to frob the blend function to
+       * force the destination alpha to 1.0.  This means replacing GL_DST_ALPHA
+       * with GL_ONE and GL_ONE_MINUS_DST_ALPHA with GL_ZERO.
+       */
+      if (ctx->DrawBuffer->Visual.alphaBits == 0) {
+	 srcRGB = fix_xRGB_alpha(srcRGB);
+	 srcA   = fix_xRGB_alpha(srcA);
+	 dstRGB = fix_xRGB_alpha(dstRGB);
+	 dstA   = fix_xRGB_alpha(dstA);
+      }
 
       if (eqRGB == GL_MIN || eqRGB == GL_MAX) {
 	 srcRGB = dstRGB = GL_ONE;
@@ -263,25 +180,27 @@ cc_unit_create_from_key(struct brw_context *brw, struct brw_cc_unit_key *key)
 				eqA != eqRGB);
    }
 
-   if (key->alpha_enabled) {
+   if (ctx->Color.AlphaEnabled) {
       cc.cc3.alpha_test = 1;
-      cc.cc3.alpha_test_func = intel_translate_compare_func(key->alpha_func);
+      cc.cc3.alpha_test_func =
+	 intel_translate_compare_func(ctx->Color.AlphaFunc);
       cc.cc3.alpha_test_format = BRW_ALPHATEST_FORMAT_UNORM8;
 
-      UNCLAMPED_FLOAT_TO_UBYTE(cc.cc7.alpha_ref.ub[0], key->alpha_ref);
+      UNCLAMPED_FLOAT_TO_UBYTE(cc.cc7.alpha_ref.ub[0], ctx->Color.AlphaRef);
    }
 
-   if (key->dither) {
+   if (ctx->Color.DitherFlag) {
       cc.cc5.dither_enable = 1;
       cc.cc6.y_dither_offset = 0;
       cc.cc6.x_dither_offset = 0;
    }
 
    /* _NEW_DEPTH */
-   if (key->depth_test) {
+   if (ctx->Depth.Test) {
       cc.cc2.depth_test = 1;
-      cc.cc2.depth_test_function = intel_translate_compare_func(key->depth_func);
-      cc.cc2.depth_write_enable = key->depth_write;
+      cc.cc2.depth_test_function =
+	 intel_translate_compare_func(ctx->Depth.Func);
+      cc.cc2.depth_write_enable = ctx->Depth.Mask;
    }
 
    /* CACHE_NEW_CC_VP */
@@ -290,43 +209,25 @@ cc_unit_create_from_key(struct brw_context *brw, struct brw_cc_unit_key *key)
    if (INTEL_DEBUG & DEBUG_STATS)
       cc.cc5.statistics_enable = 1;
 
-   bo = brw_upload_cache(&brw->cache, BRW_CC_UNIT,
-			 key, sizeof(*key),
-			 &brw->cc.vp_bo, 1,
-			 &cc, sizeof(cc));
+   map = brw_state_batch(brw, sizeof(cc), 64,
+			 &brw->cc.state_bo, &brw->cc.state_offset);
+   memcpy(map, &cc, sizeof(cc));
+   brw->state.dirty.cache |= CACHE_NEW_CC_UNIT;
 
    /* Emit CC viewport relocation */
-   drm_intel_bo_emit_reloc(bo, offsetof(struct brw_cc_unit_state, cc4),
+   drm_intel_bo_emit_reloc(brw->cc.state_bo, (brw->cc.state_offset +
+					      offsetof(struct brw_cc_unit_state,
+						       cc4)),
 			   brw->cc.vp_bo, 0,
 			   I915_GEM_DOMAIN_INSTRUCTION, 0);
-
-   return bo;
-}
-
-static void prepare_cc_unit( struct brw_context *brw )
-{
-   struct brw_cc_unit_key key;
-
-   cc_unit_populate_key(brw, &key);
-
-   drm_intel_bo_unreference(brw->cc.state_bo);
-   brw->cc.state_bo = brw_search_cache(&brw->cache, BRW_CC_UNIT,
-				       &key, sizeof(key),
-				       &brw->cc.vp_bo, 1,
-				       NULL);
-
-   if (brw->cc.state_bo == NULL)
-      brw->cc.state_bo = cc_unit_create_from_key(brw, &key);
 }
 
 const struct brw_tracked_state brw_cc_unit = {
    .dirty = {
       .mesa = _NEW_STENCIL | _NEW_COLOR | _NEW_DEPTH,
-      .brw = 0,
+      .brw = BRW_NEW_BATCH,
       .cache = CACHE_NEW_CC_VP
    },
    .prepare = prepare_cc_unit,
+   .emit = upload_cc_unit,
 };
-
-
-
diff --git a/src/mesa/drivers/dri/i965/brw_context.c b/src/mesa/drivers/dri/i965/brw_context.c
index e688431b126..6d064b822e5 100644
--- a/src/mesa/drivers/dri/i965/brw_context.c
+++ b/src/mesa/drivers/dri/i965/brw_context.c
@@ -34,8 +34,6 @@
 #include "main/api_noop.h"
 #include "main/macros.h"
 #include "main/simple_list.h"
-#include "program/shader_api.h"
-
 #include "brw_context.h"
 #include "brw_defines.h"
 #include "brw_draw.h"
@@ -54,6 +52,9 @@ static void brwInitDriverFunctions( struct dd_function_table *functions )
 
    brwInitFragProgFuncs( functions );
    brw_init_queryobj_functions(functions);
+
+   functions->Enable = brw_enable;
+   functions->DepthRange = brw_depth_range;
 }
 
 GLboolean brwCreateContext( int api,
@@ -187,6 +188,11 @@ GLboolean brwCreateContext( int api,
 
    brw_draw_init( brw );
 
+   /* Now that most driver functions are hooked up, initialize some of the
+    * immediate state.
+    */
+   brw_update_cc_vp(brw);
+
    return GL_TRUE;
 }
 
diff --git a/src/mesa/drivers/dri/i965/brw_context.h b/src/mesa/drivers/dri/i965/brw_context.h
index d97634c1c60..cc4e6638e8b 100644
--- a/src/mesa/drivers/dri/i965/brw_context.h
+++ b/src/mesa/drivers/dri/i965/brw_context.h
@@ -131,6 +131,7 @@ struct brw_context;
 #define BRW_NEW_WM_INPUT_DIMENSIONS     0x100
 #define BRW_NEW_PSP                     0x800
 #define BRW_NEW_WM_SURFACES		0x1000
+#define BRW_NEW_BINDING_TABLE		0x2000
 #define BRW_NEW_INDICES			0x4000
 #define BRW_NEW_VERTICES		0x8000
 /**
@@ -143,6 +144,8 @@ struct brw_context;
 #define BRW_NEW_NR_WM_SURFACES		0x40000
 #define BRW_NEW_NR_VS_SURFACES		0x80000
 #define BRW_NEW_INDEX_BUFFER		0x100000
+#define BRW_NEW_VS_CONSTBUF		0x200000
+#define BRW_NEW_WM_CONSTBUF		0x400000
 
 struct brw_state_flags {
    /** State update flags signalled by mesa internals */
@@ -160,7 +163,6 @@ struct brw_state_flags {
 struct brw_vertex_program {
    struct gl_vertex_program program;
    GLuint id;
-   drm_intel_bo *const_buffer;    /** Program constant buffer/surface */
    GLboolean use_const_buffer;
 };
 
@@ -172,7 +174,6 @@ struct brw_fragment_program {
    GLboolean isGLSL;  /**< really, any IF/LOOP/CONT/BREAK instructions */
 
    GLboolean use_const_buffer;
-   drm_intel_bo *const_buffer;    /** Program constant buffer/surface */
 
    /** for debugging, which texture units are referenced */
    GLbitfield tex_units_used;
@@ -301,8 +302,6 @@ enum brw_cache_id {
    BRW_CLIP_VP,
    BRW_CLIP_UNIT,
    BRW_CLIP_PROG,
-   BRW_SS_SURFACE,
-   BRW_SS_SURF_BIND,
 
    BRW_MAX_CACHE
 };
@@ -376,8 +375,6 @@ struct brw_tracked_state {
 #define CACHE_NEW_CLIP_VP                (1<<BRW_CLIP_VP)
 #define CACHE_NEW_CLIP_UNIT              (1<<BRW_CLIP_UNIT)
 #define CACHE_NEW_CLIP_PROG              (1<<BRW_CLIP_PROG)
-#define CACHE_NEW_SURFACE                (1<<BRW_SS_SURFACE)
-#define CACHE_NEW_SURF_BIND              (1<<BRW_SS_SURF_BIND)
 
 struct brw_cached_batch_item {
    struct header *header;
@@ -460,12 +457,11 @@ struct brw_context
        * consisting of the vertex buffers, pipelined state pointers,
        * the CURBE, the depth buffer, and a query BO.
        */
-      drm_intel_bo *validated_bos[VERT_ATTRIB_MAX + 16];
+      drm_intel_bo *validated_bos[VERT_ATTRIB_MAX + BRW_WM_MAX_SURF + 16];
       int validated_bo_count;
    } state;
 
-   struct brw_cache cache;  /** non-surface items */
-   struct brw_cache surface_cache;  /* surface items */
+   struct brw_cache cache;
    struct brw_cached_batch_item *cached_batch_items;
 
    struct {
@@ -594,10 +590,13 @@ struct brw_context
 
       drm_intel_bo *prog_bo;
       drm_intel_bo *state_bo;
+      drm_intel_bo *const_bo;
 
       /** Binding table of pointers to surf_bo entries */
       drm_intel_bo *bind_bo;
+      uint32_t bind_bo_offset;
       drm_intel_bo *surf_bo[BRW_VS_MAX_SURF];
+      uint32_t surf_offset[BRW_VS_MAX_SURF];
       GLuint nr_surfaces;      
    } vs;
 
@@ -649,10 +648,13 @@ struct brw_context
 
       /** Binding table of pointers to surf_bo entries */
       drm_intel_bo *bind_bo;
+      uint32_t bind_bo_offset;
       drm_intel_bo *surf_bo[BRW_WM_MAX_SURF];
+      uint32_t surf_offset[BRW_WM_MAX_SURF];
 
       drm_intel_bo *prog_bo;
       drm_intel_bo *state_bo;
+      drm_intel_bo *const_bo;
    } wm;
 
 
@@ -667,6 +669,7 @@ struct brw_context
       drm_intel_bo *color_calc_state_bo;
 
       drm_intel_bo *state_bo;
+      uint32_t state_offset;
    } cc;
 
    struct {
@@ -727,6 +730,9 @@ void brwInitFragProgFuncs( struct dd_function_table *functions );
  */
 void brw_upload_urb_fence(struct brw_context *brw);
 
+/* brw_cc.c */
+void brw_update_cc_vp(struct brw_context *brw);
+
 /* brw_curbe.c
  */
 void brw_upload_cs_urb_state(struct brw_context *brw);
@@ -734,6 +740,10 @@ void brw_upload_cs_urb_state(struct brw_context *brw);
 /* brw_disasm.c */
 int brw_disasm (FILE *file, struct brw_instruction *inst, int gen);
 
+/* brw_state.c */
+void brw_enable(GLcontext * ctx, GLenum cap, GLboolean state);
+void brw_depth_range(GLcontext *ctx, GLclampd nearval, GLclampd farval);
+
 /*======================================================================
  * Inline conversion functions.  These are better-typed than the
  * macros used previously:
diff --git a/src/mesa/drivers/dri/i965/brw_curbe.c b/src/mesa/drivers/dri/i965/brw_curbe.c
index 3d52f6f6047..8196d8ca625 100644
--- a/src/mesa/drivers/dri/i965/brw_curbe.c
+++ b/src/mesa/drivers/dri/i965/brw_curbe.c
@@ -182,8 +182,6 @@ static void prepare_constant_buffer(struct brw_context *brw)
    GLcontext *ctx = &brw->intel.ctx;
    const struct brw_vertex_program *vp =
       brw_vertex_program_const(brw->vertex_program);
-   const struct brw_fragment_program *fp =
-      brw_fragment_program_const(brw->fragment_program);
    const GLuint sz = brw->curbe.total_size;
    const GLuint bufsz = sz * 16 * sizeof(GLfloat);
    GLfloat *buf;
@@ -200,8 +198,6 @@ static void prepare_constant_buffer(struct brw_context *brw)
    if (brw->curbe.wm_size) {
       GLuint offset = brw->curbe.wm_start * 16;
 
-      _mesa_load_state_parameters(ctx, fp->program.Base.Parameters); 
-
       /* copy float constants */
       for (i = 0; i < brw->wm.prog_data->nr_params; i++) 
 	 buf[offset + i] = *brw->wm.prog_data->param[i];
@@ -244,14 +240,6 @@ static void prepare_constant_buffer(struct brw_context *brw)
       GLuint offset = brw->curbe.vs_start * 16;
       GLuint nr = brw->vs.prog_data->nr_params / 4;
 
-      if (brw->vertex_program->IsNVProgram)
-	 _mesa_load_tracked_matrices(ctx);
-
-      /* Updates the ParamaterValues[i] pointers for all parameters of the
-       * basic type of PROGRAM_STATE_VAR.
-       */
-      _mesa_load_state_parameters(ctx, vp->program.Base.Parameters); 
-
       if (vp->use_const_buffer) {
 	 /* Load the subset of push constants that will get used when
 	  * we also have a pull constant buffer.
diff --git a/src/mesa/drivers/dri/i965/brw_defines.h b/src/mesa/drivers/dri/i965/brw_defines.h
index 2d3556b8054..39bf5b63fc2 100644
--- a/src/mesa/drivers/dri/i965/brw_defines.h
+++ b/src/mesa/drivers/dri/i965/brw_defines.h
@@ -998,7 +998,7 @@
 # define GEN6_WM_LINE_AA_WIDTH_2_0			(2 << 14)
 # define GEN6_WM_LINE_AA_WIDTH_4_0			(3 << 14)
 # define GEN6_WM_POLYGON_STIPPLE_ENABLE			(1 << 13)
-# define GEN6_WM_LINE_STIPPLE_ENABLE			(1 << 12)
+# define GEN6_WM_LINE_STIPPLE_ENABLE			(1 << 11)
 # define GEN6_WM_OMASK_TO_RENDER_TARGET			(1 << 9)
 # define GEN6_WM_USES_SOURCE_W				(1 << 8)
 # define GEN6_WM_DUAL_SOURCE_BLEND_ENABLE		(1 << 7)
diff --git a/src/mesa/drivers/dri/i965/brw_draw.c b/src/mesa/drivers/dri/i965/brw_draw.c
index 3e305c89686..16331cc3ac0 100644
--- a/src/mesa/drivers/dri/i965/brw_draw.c
+++ b/src/mesa/drivers/dri/i965/brw_draw.c
@@ -151,9 +151,6 @@ static void brw_emit_prim(struct brw_context *brw,
    prim_packet.start_instance_location = 0;
    prim_packet.base_vert_location = prim->basevertex;
 
-   /* Can't wrap here, since we rely on the validated state. */
-   intel->no_batch_wrap = GL_TRUE;
-
    /* If we're set to always flush, do it before and after the primitive emit.
     * We want to catch both missed flushes that hurt instruction/state cache
     * and missed flushes of the render cache as it heads to other parts of
@@ -169,8 +166,6 @@ static void brw_emit_prim(struct brw_context *brw,
    if (intel->always_flush_cache) {
       intel_batchbuffer_emit_mi_flush(intel->batch);
    }
-
-   intel->no_batch_wrap = GL_FALSE;
 }
 
 static void brw_merge_inputs( struct brw_context *brw,
@@ -394,11 +389,14 @@ static GLboolean brw_try_draw_prims( GLcontext *ctx,
 	    }
 	 }
 
+	 intel->no_batch_wrap = GL_TRUE;
 	 brw_upload_state(brw);
       }
 
       brw_emit_prim(brw, &prim[i], hw_prim);
 
+      intel->no_batch_wrap = GL_FALSE;
+
       retval = GL_TRUE;
    }
 
diff --git a/src/mesa/drivers/dri/i965/brw_eu_emit.c b/src/mesa/drivers/dri/i965/brw_eu_emit.c
index 175899b0268..34dfe10cb93 100644
--- a/src/mesa/drivers/dri/i965/brw_eu_emit.c
+++ b/src/mesa/drivers/dri/i965/brw_eu_emit.c
@@ -286,6 +286,7 @@ static void brw_set_ff_sync_message(struct brw_context *brw,
 				    GLuint response_length,
 				    GLboolean end_of_thread)
 {
+	struct intel_context *intel = &brw->intel;
 	brw_set_src1(insn, brw_imm_d(0));
 
 	insn->bits3.urb_gen5.opcode = 1; /* FF_SYNC */
@@ -298,8 +299,12 @@ static void brw_set_ff_sync_message(struct brw_context *brw,
 	insn->bits3.urb_gen5.response_length = response_length; /* may be 1 or 0 */
 	insn->bits3.urb_gen5.msg_length = 1;
 	insn->bits3.urb_gen5.end_of_thread = end_of_thread;
-	insn->bits2.send_gen5.sfid = BRW_MESSAGE_TARGET_URB;
-	insn->bits2.send_gen5.end_of_thread = end_of_thread;
+	if (intel->gen >= 6) {
+	   insn->header.destreg__conditionalmod = BRW_MESSAGE_TARGET_URB;
+	} else {
+	   insn->bits2.send_gen5.sfid = BRW_MESSAGE_TARGET_URB;
+	   insn->bits2.send_gen5.end_of_thread = end_of_thread;
+	}
 }
 
 static void brw_set_urb_message( struct brw_context *brw,
@@ -966,10 +971,25 @@ void brw_math_16( struct brw_compile *p,
 		  struct brw_reg src,
 		  GLuint precision )
 {
+   struct intel_context *intel = &p->brw->intel;
    struct brw_instruction *insn;
    GLuint msg_length = (function == BRW_MATH_FUNCTION_POW) ? 2 : 1; 
    GLuint response_length = (function == BRW_MATH_FUNCTION_SINCOS) ? 2 : 1; 
 
+   if (intel->gen >= 6) {
+      insn = next_insn(p, BRW_OPCODE_MATH);
+
+      /* Math is the same ISA format as other opcodes, except that CondModifier
+       * becomes FC[3:0] and ThreadCtrl becomes FC[5:4].
+       */
+      insn->header.destreg__conditionalmod = function;
+
+      brw_set_dest(insn, dest);
+      brw_set_src0(insn, src);
+      brw_set_src1(insn, brw_null_reg());
+      return;
+   }
+
    /* First instruction:
     */
    brw_push_insn_state(p);
diff --git a/src/mesa/drivers/dri/i965/brw_misc_state.c b/src/mesa/drivers/dri/i965/brw_misc_state.c
index 35908ee7b69..572175f463e 100644
--- a/src/mesa/drivers/dri/i965/brw_misc_state.c
+++ b/src/mesa/drivers/dri/i965/brw_misc_state.c
@@ -96,18 +96,12 @@ const struct brw_tracked_state brw_drawing_rect = {
    .emit = upload_drawing_rect
 };
 
-static void prepare_binding_table_pointers(struct brw_context *brw)
-{
-   brw_add_validated_bo(brw, brw->vs.bind_bo);
-   brw_add_validated_bo(brw, brw->wm.bind_bo);
-}
-
 /**
  * Upload the binding table pointers, which point each stage's array of surface
  * state pointers.
  *
  * The binding table pointers are relative to the surface state base address,
- * which is 0.
+ * which points at the batchbuffer containing the streamed batch state.
  */
 static void upload_binding_table_pointers(struct brw_context *brw)
 {
@@ -115,24 +109,20 @@ static void upload_binding_table_pointers(struct brw_context *brw)
 
    BEGIN_BATCH(6);
    OUT_BATCH(CMD_BINDING_TABLE_PTRS << 16 | (6 - 2));
-   if (brw->vs.bind_bo != NULL)
-      OUT_RELOC(brw->vs.bind_bo, I915_GEM_DOMAIN_SAMPLER, 0, 0); /* vs */
-   else
-      OUT_BATCH(0);
+   OUT_BATCH(brw->vs.bind_bo_offset);
    OUT_BATCH(0); /* gs */
    OUT_BATCH(0); /* clip */
    OUT_BATCH(0); /* sf */
-   OUT_RELOC(brw->wm.bind_bo, I915_GEM_DOMAIN_SAMPLER, 0, 0); /* wm/ps */
+   OUT_BATCH(brw->wm.bind_bo_offset);
    ADVANCE_BATCH();
 }
 
 const struct brw_tracked_state brw_binding_table_pointers = {
    .dirty = {
       .mesa = 0,
-      .brw = BRW_NEW_BATCH,
-      .cache = CACHE_NEW_SURF_BIND,
+      .brw = BRW_NEW_BATCH | BRW_NEW_BINDING_TABLE,
+      .cache = 0,
    },
-   .prepare = prepare_binding_table_pointers,
    .emit = upload_binding_table_pointers,
 };
 
@@ -141,7 +131,7 @@ const struct brw_tracked_state brw_binding_table_pointers = {
  * state pointers.
  *
  * The binding table pointers are relative to the surface state base address,
- * which is 0.
+ * which points at the batchbuffer containing the streamed batch state.
  */
 static void upload_gen6_binding_table_pointers(struct brw_context *brw)
 {
@@ -153,22 +143,18 @@ static void upload_gen6_binding_table_pointers(struct brw_context *brw)
 	     GEN6_BINDING_TABLE_MODIFY_GS |
 	     GEN6_BINDING_TABLE_MODIFY_PS |
 	     (4 - 2));
-   if (brw->vs.bind_bo != NULL)
-      OUT_RELOC(brw->vs.bind_bo, I915_GEM_DOMAIN_SAMPLER, 0, 0); /* vs */
-   else
-      OUT_BATCH(0);
+   OUT_BATCH(brw->vs.bind_bo_offset); /* vs */
    OUT_BATCH(0); /* gs */
-   OUT_RELOC(brw->wm.bind_bo, I915_GEM_DOMAIN_SAMPLER, 0, 0); /* wm/ps */
+   OUT_BATCH(brw->wm.bind_bo_offset); /* wm/ps */
    ADVANCE_BATCH();
 }
 
 const struct brw_tracked_state gen6_binding_table_pointers = {
    .dirty = {
       .mesa = 0,
-      .brw = BRW_NEW_BATCH,
-      .cache = CACHE_NEW_SURF_BIND,
+      .brw = BRW_NEW_BATCH | BRW_NEW_BINDING_TABLE,
+      .cache = 0,
    },
-   .prepare = prepare_binding_table_pointers,
    .emit = upload_gen6_binding_table_pointers,
 };
 
@@ -199,7 +185,8 @@ static void upload_pipelined_state_pointers(struct brw_context *brw )
    OUT_RELOC(brw->clip.state_bo, I915_GEM_DOMAIN_INSTRUCTION, 0, 1);
    OUT_RELOC(brw->sf.state_bo, I915_GEM_DOMAIN_INSTRUCTION, 0, 0);
    OUT_RELOC(brw->wm.state_bo, I915_GEM_DOMAIN_INSTRUCTION, 0, 0);
-   OUT_RELOC(brw->cc.state_bo, I915_GEM_DOMAIN_INSTRUCTION, 0, 0);
+   OUT_RELOC(brw->cc.state_bo, I915_GEM_DOMAIN_INSTRUCTION, 0,
+	     brw->cc.state_offset);
    ADVANCE_BATCH();
 
    brw->state.dirty.brw |= BRW_NEW_PSP;
@@ -213,7 +200,6 @@ static void prepare_psp_urb_cbs(struct brw_context *brw)
    brw_add_validated_bo(brw, brw->clip.state_bo);
    brw_add_validated_bo(brw, brw->sf.state_bo);
    brw_add_validated_bo(brw, brw->wm.state_bo);
-   brw_add_validated_bo(brw, brw->cc.state_bo);
 }
 
 static void upload_psp_urb_cbs(struct brw_context *brw )
@@ -590,23 +576,23 @@ const struct brw_tracked_state brw_invarient_state = {
 /**
  * Define the base addresses which some state is referenced from.
  *
- * This allows us to avoid having to emit relocations in many places for
- * cached state, and instead emit pointers inside of large, mostly-static
- * state pools.  This comes at the expense of memory, and more expensive cache
- * misses.
+ * This allows us to avoid having to emit relocations for the objects,
+ * and is actually required for binding table pointers on gen6.
+ *
+ * Surface state base address covers binding table pointers and
+ * surface state objects, but not the surfaces that the surface state
+ * objects point to.
  */
 static void upload_state_base_address( struct brw_context *brw )
 {
    struct intel_context *intel = &brw->intel;
 
-   /* Output the structure (brw_state_base_address) directly to the
-    * batchbuffer, so we can emit relocations inline.
-    */
    if (intel->gen >= 6) {
        BEGIN_BATCH(10);
        OUT_BATCH(CMD_STATE_BASE_ADDRESS << 16 | (10 - 2));
        OUT_BATCH(1); /* General state base address */
-       OUT_BATCH(1); /* Surface state base address */
+       OUT_RELOC(intel->batch->buf, I915_GEM_DOMAIN_SAMPLER, 0,
+		 1); /* Surface state base address */
        OUT_BATCH(1); /* Dynamic state base address */
        OUT_BATCH(1); /* Indirect object base address */
        OUT_BATCH(1); /* Instruction base address */
@@ -619,7 +605,8 @@ static void upload_state_base_address( struct brw_context *brw )
        BEGIN_BATCH(8);
        OUT_BATCH(CMD_STATE_BASE_ADDRESS << 16 | (8 - 2));
        OUT_BATCH(1); /* General state base address */
-       OUT_BATCH(1); /* Surface state base address */
+       OUT_RELOC(intel->batch->buf, I915_GEM_DOMAIN_SAMPLER, 0,
+		 1); /* Surface state base address */
        OUT_BATCH(1); /* Indirect object base address */
        OUT_BATCH(1); /* Instruction base address */
        OUT_BATCH(1); /* General state upper bound */
@@ -630,7 +617,8 @@ static void upload_state_base_address( struct brw_context *brw )
        BEGIN_BATCH(6);
        OUT_BATCH(CMD_STATE_BASE_ADDRESS << 16 | (6 - 2));
        OUT_BATCH(1); /* General state base address */
-       OUT_BATCH(1); /* Surface state base address */
+       OUT_RELOC(intel->batch->buf, I915_GEM_DOMAIN_SAMPLER, 0,
+		 1); /* Surface state base address */
        OUT_BATCH(1); /* Indirect object base address */
        OUT_BATCH(1); /* General state upper bound */
        OUT_BATCH(1); /* Indirect object upper bound */
@@ -641,7 +629,7 @@ static void upload_state_base_address( struct brw_context *brw )
 const struct brw_tracked_state brw_state_base_address = {
    .dirty = {
       .mesa = 0,
-      .brw = BRW_NEW_CONTEXT,
+      .brw = BRW_NEW_BATCH,
       .cache = 0,
    },
    .emit = upload_state_base_address
diff --git a/src/mesa/drivers/dri/i965/brw_program.c b/src/mesa/drivers/dri/i965/brw_program.c
index cc9ac6d5749..aeed24d4e14 100644
--- a/src/mesa/drivers/dri/i965/brw_program.c
+++ b/src/mesa/drivers/dri/i965/brw_program.c
@@ -31,10 +31,10 @@
   
 #include "main/imports.h"
 #include "main/enums.h"
+#include "main/shaderobj.h"
 #include "program/prog_parameter.h"
 #include "program/program.h"
 #include "program/programopt.h"
-#include "program/shader_api.h"
 #include "tnl/tnl.h"
 
 #include "brw_context.h"
@@ -95,20 +95,6 @@ static struct gl_program *brwNewProgram( GLcontext *ctx,
 static void brwDeleteProgram( GLcontext *ctx,
 			      struct gl_program *prog )
 {
-   if (prog->Target == GL_FRAGMENT_PROGRAM_ARB) {
-      struct gl_fragment_program *fp = (struct gl_fragment_program *) prog;
-      struct brw_fragment_program *brw_fp = brw_fragment_program(fp);
-
-      drm_intel_bo_unreference(brw_fp->const_buffer);
-   }
-
-   if (prog->Target == GL_VERTEX_PROGRAM_ARB) {
-      struct gl_vertex_program *vp = (struct gl_vertex_program *) prog;
-      struct brw_vertex_program *brw_vp = brw_vertex_program(vp);
-
-      drm_intel_bo_unreference(brw_vp->const_buffer);
-   }
-
    _mesa_delete_program( ctx, prog );
 }
 
diff --git a/src/mesa/drivers/dri/i965/brw_sf_state.c b/src/mesa/drivers/dri/i965/brw_sf_state.c
index a95acb4cf82..e290ca92f60 100644
--- a/src/mesa/drivers/dri/i965/brw_sf_state.c
+++ b/src/mesa/drivers/dri/i965/brw_sf_state.c
@@ -105,8 +105,7 @@ static void upload_sf_vp(struct brw_context *brw)
    }
 
    drm_intel_bo_unreference(brw->sf.vp_bo);
-   brw->sf.vp_bo = brw_cache_data(&brw->cache, BRW_SF_VP, &sfv, sizeof(sfv),
-				  NULL, 0);
+   brw->sf.vp_bo = brw_cache_data(&brw->cache, BRW_SF_VP, &sfv, sizeof(sfv));
 }
 
 const struct brw_tracked_state brw_sf_vp = {
diff --git a/src/mesa/drivers/dri/i965/brw_state.c b/src/mesa/drivers/dri/i965/brw_state.c
new file mode 100644
index 00000000000..1e77e427d38
--- /dev/null
+++ b/src/mesa/drivers/dri/i965/brw_state.c
@@ -0,0 +1,49 @@
+/*
+ * Copyright © 2010 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * Authors:
+ *    Eric Anholt <[email protected]>
+ *
+ */
+
+#include "brw_context.h"
+
+void
+brw_enable(GLcontext *ctx, GLenum cap, GLboolean state)
+{
+   struct brw_context *brw = brw_context(ctx);
+   /* Only GL_DEPTH_CLAMP is acted on; 'state' itself is not read here. */
+   switch (cap) {
+   case GL_DEPTH_CLAMP:
+      brw_update_cc_vp(brw);
+      break;
+   }
+}
+
+void
+brw_depth_range(GLcontext *ctx, GLclampd nearval, GLclampd farval)
+{
+   struct brw_context *brw = brw_context(ctx);
+   /* Refresh the CC viewport only while depth clamp is enabled. */
+   if (ctx->Transform.DepthClamp)
+      brw_update_cc_vp(brw);
+}
diff --git a/src/mesa/drivers/dri/i965/brw_state.h b/src/mesa/drivers/dri/i965/brw_state.h
index 85949215e82..40eece276b7 100644
--- a/src/mesa/drivers/dri/i965/brw_state.h
+++ b/src/mesa/drivers/dri/i965/brw_state.h
@@ -48,10 +48,11 @@ brw_add_validated_bo(struct brw_context *brw, drm_intel_bo *bo)
 
 const struct brw_tracked_state brw_blend_constant_color;
 const struct brw_tracked_state brw_cc_unit;
-const struct brw_tracked_state brw_cc_vp;
 const struct brw_tracked_state brw_check_fallback;
 const struct brw_tracked_state brw_clip_prog;
 const struct brw_tracked_state brw_clip_unit;
+const struct brw_tracked_state brw_vs_constants;
+const struct brw_tracked_state brw_wm_constants;
 const struct brw_tracked_state brw_constant_buffer;
 const struct brw_tracked_state brw_curbe_offsets;
 const struct brw_tracked_state brw_invarient_state;
@@ -80,6 +81,7 @@ const struct brw_tracked_state brw_wm_prog;
 const struct brw_tracked_state brw_wm_samplers;
 const struct brw_tracked_state brw_wm_constant_surface;
 const struct brw_tracked_state brw_wm_surfaces;
+const struct brw_tracked_state brw_wm_binding_table;
 const struct brw_tracked_state brw_wm_unit;
 
 const struct brw_tracked_state brw_psp_urb_cbs;
@@ -93,7 +95,6 @@ const struct brw_tracked_state brw_index_buffer;
 const struct brw_tracked_state gen6_binding_table_pointers;
 const struct brw_tracked_state gen6_blend_state;
 const struct brw_tracked_state gen6_cc_state_pointers;
-const struct brw_tracked_state gen6_cc_vp;
 const struct brw_tracked_state gen6_clip_state;
 const struct brw_tracked_state gen6_clip_vp;
 const struct brw_tracked_state gen6_color_calc_state;
@@ -108,20 +109,6 @@ const struct brw_tracked_state gen6_viewport_state;
 const struct brw_tracked_state gen6_vs_state;
 const struct brw_tracked_state gen6_wm_state;
 
-/**
- * Use same key for WM and VS surfaces.
- */
-struct brw_surface_key {
-   GLenum target, depthmode;
-   drm_intel_bo *bo;
-   GLint format, internal_format;
-   GLint first_level, last_level;
-   GLint width, height, depth;
-   GLint pitch, cpp;
-   uint32_t tiling;
-   GLuint offset;
-};
-
 /***********************************************************************
  * brw_state.c
  */
@@ -137,9 +124,7 @@ void brw_clear_validated_bos(struct brw_context *brw);
 drm_intel_bo *brw_cache_data(struct brw_cache *cache,
 		       enum brw_cache_id cache_id,
 		       const void *data,
-		       GLuint size,
-		       drm_intel_bo **reloc_bufs,
-		       GLuint nr_reloc_bufs);
+		       GLuint size);
 
 drm_intel_bo *brw_upload_cache(struct brw_cache *cache,
 			       enum brw_cache_id cache_id,
@@ -173,7 +158,6 @@ void brw_state_cache_check_size( struct brw_context *brw );
 
 void brw_init_caches( struct brw_context *brw );
 void brw_destroy_caches( struct brw_context *brw );
-void brw_state_cache_bo_delete(struct brw_cache *cache, drm_intel_bo *bo);
 
 /***********************************************************************
  * brw_state_batch.c
@@ -186,10 +170,17 @@ GLboolean brw_cached_batch_struct( struct brw_context *brw,
 				   GLuint sz );
 void brw_destroy_batch_cache( struct brw_context *brw );
 void brw_clear_batch_cache( struct brw_context *brw );
+void *brw_state_batch(struct brw_context *brw,
+		      int size,
+		      int alignment,
+		      drm_intel_bo **out_bo,
+		      uint32_t *out_offset);
 
 /* brw_wm_surface_state.c */
-drm_intel_bo *
-brw_create_constant_surface( struct brw_context *brw,
-                             struct brw_surface_key *key );
+void brw_create_constant_surface(struct brw_context *brw,
+				 drm_intel_bo *bo,
+				 int width,
+				 drm_intel_bo **out_bo,
+				 uint32_t *out_offset);
 
 #endif
diff --git a/src/mesa/drivers/dri/i965/brw_state_batch.c b/src/mesa/drivers/dri/i965/brw_state_batch.c
index 39019412fda..be3989eb7db 100644
--- a/src/mesa/drivers/dri/i965/brw_state_batch.c
+++ b/src/mesa/drivers/dri/i965/brw_state_batch.c
@@ -97,3 +97,52 @@ void brw_destroy_batch_cache( struct brw_context *brw )
 {
    brw_clear_batch_cache(brw);
 }
+
+/**
+ * Allocates a block of space in the batchbuffer for indirect state.
+ *
+ * We don't want to allocate separate BOs for every bit of indirect
+ * state in the driver.  It means overallocating by a significant
+ * margin (4096 bytes, even if the object is just a 20-byte surface
+ * state), and more buffers to walk and count for aperture size checking.
+ *
+ * However, due to the restrictions imposed by the aperture size
+ * checking performance hacks, we can't have the batch point at a
+ * separate indirect state buffer, because once the batch points at
+ * it, no more relocations can be added to it.  So, we sneak these
+ * buffers in at the top of the batchbuffer.
+ */
+void *
+brw_state_batch(struct brw_context *brw,
+		int size,
+		int alignment,
+		drm_intel_bo **out_bo,
+		uint32_t *out_offset)
+{
+   struct intel_batchbuffer *batch = brw->intel.batch;
+   uint32_t offset;
+   /* A request must be strictly smaller than the batch BO itself. */
+   assert(size < batch->buf->size);
+   offset = ROUND_DOWN_TO(batch->state_batch_offset - size, alignment);
+
+   /* If allocating from the top would wrap below the batchbuffer, or
+    * if the batch's used space (plus the reserved pad) collides with our
+    * space, then flush and try again.
+    */
+   if (batch->state_batch_offset < size ||
+       offset < batch->ptr - batch->map + batch->reserved_space) {
+      intel_batchbuffer_flush(batch);
+      offset = ROUND_DOWN_TO(batch->state_batch_offset - size, alignment);
+   }
+   /* Commit the allocation: the next request is carved from below it. */
+   batch->state_batch_offset = offset;
+   /* Make *out_bo reference the batch BO, dropping its previous BO. */
+   if (*out_bo != batch->buf) {
+      drm_intel_bo_unreference(*out_bo);
+      drm_intel_bo_reference(batch->buf);
+      *out_bo = batch->buf;
+   }
+   /* Report the offset in the BO; return the same spot in batch->map. */
+   *out_offset = offset;
+   return batch->map + offset;
+}
diff --git a/src/mesa/drivers/dri/i965/brw_state_cache.c b/src/mesa/drivers/dri/i965/brw_state_cache.c
index ea81ad13417..b31d84953a1 100644
--- a/src/mesa/drivers/dri/i965/brw_state_cache.c
+++ b/src/mesa/drivers/dri/i965/brw_state_cache.c
@@ -310,9 +310,7 @@ drm_intel_bo *
 brw_cache_data(struct brw_cache *cache,
 	       enum brw_cache_id cache_id,
 	       const void *data,
-	       GLuint data_size,
-	       drm_intel_bo **reloc_bufs,
-	       GLuint nr_reloc_bufs)
+	       GLuint data_size)
 {
    drm_intel_bo *bo;
    struct brw_cache_item *item, lookup;
@@ -321,8 +319,8 @@ brw_cache_data(struct brw_cache *cache,
    lookup.cache_id = cache_id;
    lookup.key = data;
    lookup.key_size = data_size;
-   lookup.reloc_bufs = reloc_bufs;
-   lookup.nr_reloc_bufs = nr_reloc_bufs;
+   lookup.reloc_bufs = NULL;
+   lookup.nr_reloc_bufs = 0;
    hash = hash_key(&lookup);
    lookup.hash = hash;
 
@@ -335,7 +333,7 @@ brw_cache_data(struct brw_cache *cache,
 
    bo = brw_upload_cache(cache, cache_id,
 			 data, data_size,
-			 reloc_bufs, nr_reloc_bufs,
+			 NULL, 0,
 			 data, data_size);
 
    return bo;
@@ -396,29 +394,10 @@ brw_init_non_surface_cache(struct brw_context *brw)
    brw_init_cache_id(cache, "DEPTH_STENCIL_STATE", BRW_DEPTH_STENCIL_STATE);
 }
 
-
-static void
-brw_init_surface_cache(struct brw_context *brw)
-{
-   struct brw_cache *cache = &brw->surface_cache;
-
-   cache->brw = brw;
-
-   cache->size = 7;
-   cache->n_items = 0;
-   cache->items = (struct brw_cache_item **)
-      calloc(1, cache->size * sizeof(struct brw_cache_item));
-
-   brw_init_cache_id(cache, "SS_SURFACE", BRW_SS_SURFACE);
-   brw_init_cache_id(cache, "SS_SURF_BIND", BRW_SS_SURF_BIND);
-}
-
-
 void
 brw_init_caches(struct brw_context *brw)
 {
    brw_init_non_surface_cache(brw);
-   brw_init_surface_cache(brw);
 }
 
 
@@ -452,56 +431,17 @@ brw_clear_cache(struct brw_context *brw, struct brw_cache *cache)
    brw->state.dirty.cache |= ~0;
 }
 
-/* Clear all entries from the cache that point to the given bo.
- *
- * This lets us release memory for reuse earlier for known-dead buffers,
- * at the cost of walking the entire hash table.
- */
-void
-brw_state_cache_bo_delete(struct brw_cache *cache, drm_intel_bo *bo)
-{
-   struct brw_cache_item **prev;
-   GLuint i;
-
-   if (INTEL_DEBUG & DEBUG_STATE)
-      printf("%s\n", __FUNCTION__);
-
-   for (i = 0; i < cache->size; i++) {
-      for (prev = &cache->items[i]; *prev;) {
-	 struct brw_cache_item *c = *prev;
-
-	 if (drm_intel_bo_references(c->bo, bo)) {
-	    int j;
-
-	    *prev = c->next;
-
-	    for (j = 0; j < c->nr_reloc_bufs; j++)
-	       drm_intel_bo_unreference(c->reloc_bufs[j]);
-	    drm_intel_bo_unreference(c->bo);
-	    free((void *)c->key);
-	    free(c);
-	    cache->n_items--;
-	 } else {
-	    prev = &c->next;
-	 }
-      }
-   }
-}
-
 void
 brw_state_cache_check_size(struct brw_context *brw)
 {
    if (INTEL_DEBUG & DEBUG_STATE)
       printf("%s (n_items=%d)\n", __FUNCTION__, brw->cache.n_items);
 
-   /* un-tuned guess.  We've got around 20 state objects for a total of around
-    * 32k, so 1000 of them is around 1.5MB.
+   /* un-tuned guess.  Each object is generally a page, so 1000 of them is 4 MB of
+    * state cache.
     */
    if (brw->cache.n_items > 1000)
       brw_clear_cache(brw, &brw->cache);
-
-   if (brw->surface_cache.n_items > 1000)
-      brw_clear_cache(brw, &brw->surface_cache);
 }
 
 
@@ -528,5 +468,4 @@ void
 brw_destroy_caches(struct brw_context *brw)
 {
    brw_destroy_cache(brw, &brw->cache);
-   brw_destroy_cache(brw, &brw->surface_cache);
 }
diff --git a/src/mesa/drivers/dri/i965/brw_state_dump.c b/src/mesa/drivers/dri/i965/brw_state_dump.c
index cb66806ebf3..d410861bdf6 100644
--- a/src/mesa/drivers/dri/i965/brw_state_dump.c
+++ b/src/mesa/drivers/dri/i965/brw_state_dump.c
@@ -111,8 +111,8 @@ static void dump_wm_surface_state(struct brw_context *brw)
 	 continue;
       }
       drm_intel_bo_map(surf_bo, GL_FALSE);
-      surfoff = surf_bo->offset;
-      surf = (struct brw_surface_state *)(surf_bo->virtual);
+      surfoff = surf_bo->offset + brw->wm.surf_offset[i];
+      surf = (struct brw_surface_state *)(surf_bo->virtual + brw->wm.surf_offset[i]);
 
       sprintf(name, "WM SS%d", i);
       state_out(name, surf, surfoff, 0, "%s %s\n",
diff --git a/src/mesa/drivers/dri/i965/brw_state_upload.c b/src/mesa/drivers/dri/i965/brw_state_upload.c
index 49629ba2289..f92a19c2aa0 100644
--- a/src/mesa/drivers/dri/i965/brw_state_upload.c
+++ b/src/mesa/drivers/dri/i965/brw_state_upload.c
@@ -61,12 +61,15 @@ static const struct brw_tracked_state *gen4_atoms[] =
    &brw_curbe_offsets,
    &brw_recalculate_urb_fence,
 
-   &brw_cc_vp,
    &brw_cc_unit,
 
+   &brw_vs_constants, /* Before vs_surfaces and constant_buffer */
+   &brw_wm_constants, /* Before wm_surfaces and constant_buffer */
+
    &brw_vs_surfaces,		/* must do before unit */
    &brw_wm_constant_surface,	/* must do before wm surfaces/bind bo */
    &brw_wm_surfaces,		/* must do before samplers and unit */
+   &brw_wm_binding_table,
    &brw_wm_samplers,
 
    &brw_wm_unit,
@@ -113,7 +116,6 @@ const struct brw_tracked_state *gen6_atoms[] =
 
    &gen6_clip_vp,
    &gen6_sf_vp,
-   &gen6_cc_vp,
 
    /* Command packets: */
    &brw_invarient_state,
@@ -126,9 +128,13 @@ const struct brw_tracked_state *gen6_atoms[] =
    &gen6_depth_stencil_state,	/* must do before cc unit */
    &gen6_cc_state_pointers,
 
+   &brw_vs_constants, /* Before vs_surfaces and constant_buffer */
+   &brw_wm_constants, /* Before wm_surfaces and constant_buffer */
+
    &brw_vs_surfaces,		/* must do before unit */
    &brw_wm_constant_surface,	/* must do before wm surfaces/bind bo */
    &brw_wm_surfaces,		/* must do before samplers and unit */
+   &brw_wm_binding_table,
 
    &brw_wm_samplers,
    &gen6_sampler_state,
@@ -266,6 +272,8 @@ static struct dirty_bit_map brw_bits[] = {
    DEFINE_BIT(BRW_NEW_CONTEXT),
    DEFINE_BIT(BRW_NEW_WM_INPUT_DIMENSIONS),
    DEFINE_BIT(BRW_NEW_PSP),
+   DEFINE_BIT(BRW_NEW_WM_SURFACES),
+   DEFINE_BIT(BRW_NEW_BINDING_TABLE),
    DEFINE_BIT(BRW_NEW_INDICES),
    DEFINE_BIT(BRW_NEW_INDEX_BUFFER),
    DEFINE_BIT(BRW_NEW_VERTICES),
@@ -292,8 +300,6 @@ static struct dirty_bit_map cache_bits[] = {
    DEFINE_BIT(CACHE_NEW_CLIP_VP),
    DEFINE_BIT(CACHE_NEW_CLIP_UNIT),
    DEFINE_BIT(CACHE_NEW_CLIP_PROG),
-   DEFINE_BIT(CACHE_NEW_SURFACE),
-   DEFINE_BIT(CACHE_NEW_SURF_BIND),
    {0, 0, 0}
 };
 
diff --git a/src/mesa/drivers/dri/i965/brw_structs.h b/src/mesa/drivers/dri/i965/brw_structs.h
index f17fe485306..2a7fa5b6997 100644
--- a/src/mesa/drivers/dri/i965/brw_structs.h
+++ b/src/mesa/drivers/dri/i965/brw_structs.h
@@ -278,7 +278,7 @@ struct brw_aa_line_parameters
    struct header header;
 
    struct {
-      GLuint aa_coverage_scope:8;
+      GLuint aa_coverage_slope:8;
       GLuint pad0:8;
       GLuint aa_coverage_bias:8;
       GLuint pad1:8;
diff --git a/src/mesa/drivers/dri/i965/brw_vs_surface_state.c b/src/mesa/drivers/dri/i965/brw_vs_surface_state.c
index 568c2e3b030..0250a68d292 100644
--- a/src/mesa/drivers/dri/i965/brw_vs_surface_state.c
+++ b/src/mesa/drivers/dri/i965/brw_vs_surface_state.c
@@ -42,42 +42,59 @@
  * Otherwise, constants go through the CURBEs using the brw_constant_buffer
  * state atom.
  */
-static drm_intel_bo *
-brw_vs_update_constant_buffer(struct brw_context *brw)
+static void
+prepare_vs_constants(struct brw_context *brw)
 {
+   GLcontext *ctx = &brw->intel.ctx;
    struct intel_context *intel = &brw->intel;
    struct brw_vertex_program *vp =
       (struct brw_vertex_program *) brw->vertex_program;
    const struct gl_program_parameter_list *params = vp->program.Base.Parameters;
    const int size = params->NumParameters * 4 * sizeof(GLfloat);
-   drm_intel_bo *const_buffer;
    int i;
 
-   /* BRW_NEW_VERTEX_PROGRAM */
-   if (!vp->use_const_buffer)
-      return NULL;
-
-   const_buffer = drm_intel_bo_alloc(intel->bufmgr, "vp_const_buffer",
-				     size, 64);
-
-   /* _NEW_PROGRAM_CONSTANTS */
+   if (vp->program.IsNVProgram)
+      _mesa_load_tracked_matrices(ctx);
 
    /* Updates the ParamaterValues[i] pointers for all parameters of the
     * basic type of PROGRAM_STATE_VAR.
     */
    _mesa_load_state_parameters(&brw->intel.ctx, vp->program.Base.Parameters);
 
-   drm_intel_gem_bo_map_gtt(const_buffer);
+   /* BRW_NEW_VERTEX_PROGRAM */
+   if (!vp->use_const_buffer) {
+      if (brw->vs.const_bo) {
+	 drm_intel_bo_unreference(brw->vs.const_bo);
+	 brw->vs.const_bo = NULL;
+	 brw->state.dirty.brw |= BRW_NEW_VS_CONSTBUF;
+      }
+      return;
+   }
+
+   /* _NEW_PROGRAM_CONSTANTS */
+   drm_intel_bo_unreference(brw->vs.const_bo);
+   brw->vs.const_bo = drm_intel_bo_alloc(intel->bufmgr, "vp_const_buffer",
+					 size, 64);
+
+   drm_intel_gem_bo_map_gtt(brw->vs.const_bo);
    for (i = 0; i < params->NumParameters; i++) {
-      memcpy(const_buffer->virtual + i * 4 * sizeof(float),
+      memcpy(brw->vs.const_bo->virtual + i * 4 * sizeof(float),
 	     params->ParameterValues[i],
 	     4 * sizeof(float));
    }
-   drm_intel_gem_bo_unmap_gtt(const_buffer);
-
-   return const_buffer;
+   drm_intel_gem_bo_unmap_gtt(brw->vs.const_bo);
+   brw->state.dirty.brw |= BRW_NEW_VS_CONSTBUF;
 }
 
+const struct brw_tracked_state brw_vs_constants = {
+   .dirty = {
+      .mesa = (_NEW_PROGRAM_CONSTANTS),
+      .brw = (BRW_NEW_VERTEX_PROGRAM),
+      .cache = 0
+   },
+   .prepare = prepare_vs_constants,
+};
+
 /**
  * Update the surface state for a VS constant buffer.
  *
@@ -88,101 +105,41 @@ brw_update_vs_constant_surface( GLcontext *ctx,
                                 GLuint surf)
 {
    struct brw_context *brw = brw_context(ctx);
-   struct brw_surface_key key;
    struct brw_vertex_program *vp =
       (struct brw_vertex_program *) brw->vertex_program;
    const struct gl_program_parameter_list *params = vp->program.Base.Parameters;
 
    assert(surf == 0);
 
-   /* If we're in this state update atom, we need to update VS constants, so
-    * free the old buffer and create a new one for the new contents.
-    */
-   drm_intel_bo_unreference(vp->const_buffer);
-   vp->const_buffer = brw_vs_update_constant_buffer(brw);
-
    /* If there's no constant buffer, then no surface BO is needed to point at
     * it.
     */
-   if (vp->const_buffer == NULL) {
+   if (brw->vs.const_bo == NULL) {
       drm_intel_bo_unreference(brw->vs.surf_bo[surf]);
       brw->vs.surf_bo[surf] = NULL;
       return;
    }
 
-   memset(&key, 0, sizeof(key));
-
-   key.format = MESA_FORMAT_RGBA_FLOAT32;
-   key.internal_format = GL_RGBA;
-   key.bo = vp->const_buffer;
-   key.depthmode = GL_NONE;
-   key.pitch = params->NumParameters;
-   key.width = params->NumParameters;
-   key.height = 1;
-   key.depth = 1;
-   key.cpp = 16;
-
-   /*
-   printf("%s:\n", __FUNCTION__);
-   printf("  width %d  height %d  depth %d  cpp %d  pitch %d\n",
-          key.width, key.height, key.depth, key.cpp, key.pitch);
-   */
-
-   drm_intel_bo_unreference(brw->vs.surf_bo[surf]);
-   brw->vs.surf_bo[surf] = brw_search_cache(&brw->surface_cache,
-                                            BRW_SS_SURFACE,
-                                            &key, sizeof(key),
-                                            &key.bo, 1,
-                                            NULL);
-   if (brw->vs.surf_bo[surf] == NULL) {
-      brw->vs.surf_bo[surf] = brw_create_constant_surface(brw, &key);
-   }
+   brw_create_constant_surface(brw, brw->vs.const_bo, params->NumParameters,
+			       &brw->vs.surf_bo[surf],
+			       &brw->vs.surf_offset[surf]);
 }
 
 
-/**
- * Constructs the binding table for the VS surface state.
- */
-static drm_intel_bo *
-brw_vs_get_binding_table(struct brw_context *brw)
+static void
+prepare_vs_surfaces(struct brw_context *brw)
 {
-   drm_intel_bo *bind_bo;
-
-   bind_bo = brw_search_cache(&brw->surface_cache, BRW_SS_SURF_BIND,
-			      NULL, 0,
-			      brw->vs.surf_bo, BRW_VS_MAX_SURF,
-			      NULL);
-
-   if (bind_bo == NULL) {
-      GLuint data_size = BRW_VS_MAX_SURF * sizeof(GLuint);
-      uint32_t data[BRW_VS_MAX_SURF];
-      int i;
-
-      for (i = 0; i < BRW_VS_MAX_SURF; i++)
-         if (brw->vs.surf_bo[i])
-            data[i] = brw->vs.surf_bo[i]->offset;
-         else
-            data[i] = 0;
-
-      bind_bo = brw_upload_cache( &brw->surface_cache, BRW_SS_SURF_BIND,
-				  NULL, 0,
-				  brw->vs.surf_bo, BRW_VS_MAX_SURF,
-				  data, data_size);
-
-      /* Emit binding table relocations to surface state */
-      for (i = 0; i < BRW_VS_MAX_SURF; i++) {
-	 if (brw->vs.surf_bo[i] != NULL) {
-	    /* The presumed offsets were set in the data values for
-	     * brw_upload_cache.
-	     */
-	    drm_intel_bo_emit_reloc(bind_bo, i * 4,
-				    brw->vs.surf_bo[i], 0,
-				    I915_GEM_DOMAIN_INSTRUCTION, 0);
-	 }
-      }
+   int nr_surfaces = 0;
+
+   if (brw->vs.const_bo) {
+      brw_add_validated_bo(brw, brw->vs.const_bo);
+      nr_surfaces = 1;
    }
 
-   return bind_bo;
+   if (brw->vs.nr_surfaces != nr_surfaces) {
+      brw->state.dirty.brw |= BRW_NEW_NR_VS_SURFACES;
+      brw->vs.nr_surfaces = nr_surfaces;
+   }
 }
 
 /**
@@ -192,43 +149,50 @@ brw_vs_get_binding_table(struct brw_context *brw)
  * to be updated, and produces BRW_NEW_NR_VS_SURFACES for the VS unit and
  * CACHE_NEW_SURF_BIND for the binding table upload.
  */
-static void prepare_vs_surfaces(struct brw_context *brw )
+static void upload_vs_surfaces(struct brw_context *brw)
 {
    GLcontext *ctx = &brw->intel.ctx;
+   uint32_t *bind;
    int i;
-   int nr_surfaces = 0;
-
-   brw_update_vs_constant_surface(ctx, SURF_INDEX_VERT_CONST_BUFFER);
 
-   for (i = 0; i < BRW_VS_MAX_SURF; i++) {
-      if (brw->vs.surf_bo[i] != NULL) {
-	 nr_surfaces = i + 1;
+   /* BRW_NEW_NR_VS_SURFACES */
+   if (brw->vs.nr_surfaces == 0) {
+      if (brw->vs.bind_bo) {
+	 drm_intel_bo_unreference(brw->vs.bind_bo);
+	 brw->vs.bind_bo = NULL;
+	 brw->state.dirty.brw |= BRW_NEW_BINDING_TABLE;
       }
+      return;
    }
 
-   if (brw->vs.nr_surfaces != nr_surfaces) {
-      brw->state.dirty.brw |= BRW_NEW_NR_VS_SURFACES;
-      brw->vs.nr_surfaces = nr_surfaces;
-   }
+   brw_update_vs_constant_surface(ctx, SURF_INDEX_VERT_CONST_BUFFER);
 
-   /* Note that we don't end up updating the bind_bo if we don't have a
-    * surface to be pointing at.  This should be relatively harmless, as it
-    * just slightly increases our working set size.
+   /* Might want to calculate nr_surfaces first, to avoid taking up so much
+    * space for the binding table. (once we have vs samplers)
     */
-   if (brw->vs.nr_surfaces != 0) {
-      drm_intel_bo_unreference(brw->vs.bind_bo);
-      brw->vs.bind_bo = brw_vs_get_binding_table(brw);
+   bind = brw_state_batch(brw, sizeof(uint32_t) * BRW_VS_MAX_SURF,
+			  32, &brw->vs.bind_bo, &brw->vs.bind_bo_offset);
+
+   for (i = 0; i < BRW_VS_MAX_SURF; i++) {
+      /* BRW_NEW_VS_CONSTBUF */
+      if (brw->vs.surf_bo[i]) {
+	 bind[i] = brw->vs.surf_offset[i];
+      } else {
+	 bind[i] = 0;
+      }
    }
+
+   brw->state.dirty.brw |= BRW_NEW_BINDING_TABLE;
 }
 
 const struct brw_tracked_state brw_vs_surfaces = {
    .dirty = {
-      .mesa = (_NEW_PROGRAM_CONSTANTS),
-      .brw = (BRW_NEW_VERTEX_PROGRAM),
+      .mesa = 0,
+      .brw = (BRW_NEW_VS_CONSTBUF |
+	      BRW_NEW_NR_VS_SURFACES |
+	      BRW_NEW_BATCH),
       .cache = 0
    },
    .prepare = prepare_vs_surfaces,
+   .emit = upload_vs_surfaces,
 };
-
-
-
diff --git a/src/mesa/drivers/dri/i965/brw_vtbl.c b/src/mesa/drivers/dri/i965/brw_vtbl.c
index a02e958c5e6..14227a51332 100644
--- a/src/mesa/drivers/dri/i965/brw_vtbl.c
+++ b/src/mesa/drivers/dri/i965/brw_vtbl.c
@@ -83,6 +83,7 @@ static void brw_destroy_context( struct intel_context *intel )
    dri_bo_release(&brw->vs.prog_bo);
    dri_bo_release(&brw->vs.state_bo);
    dri_bo_release(&brw->vs.bind_bo);
+   dri_bo_release(&brw->vs.const_bo);
    dri_bo_release(&brw->gs.prog_bo);
    dri_bo_release(&brw->gs.state_bo);
    dri_bo_release(&brw->clip.prog_bo);
@@ -99,6 +100,7 @@ static void brw_destroy_context( struct intel_context *intel )
    dri_bo_release(&brw->wm.sampler_bo);
    dri_bo_release(&brw->wm.prog_bo);
    dri_bo_release(&brw->wm.state_bo);
+   dri_bo_release(&brw->wm.const_bo);
    dri_bo_release(&brw->cc.prog_bo);
    dri_bo_release(&brw->cc.state_bo);
    dri_bo_release(&brw->cc.vp_bo);
diff --git a/src/mesa/drivers/dri/i965/brw_wm_sampler_state.c b/src/mesa/drivers/dri/i965/brw_wm_sampler_state.c
index 9fbabdc2852..1fc802cfa65 100644
--- a/src/mesa/drivers/dri/i965/brw_wm_sampler_state.c
+++ b/src/mesa/drivers/dri/i965/brw_wm_sampler_state.c
@@ -74,7 +74,7 @@ static drm_intel_bo *upload_default_color( struct brw_context *brw,
    COPY_4V(sdc.color, color); 
    
    return brw_cache_data(&brw->cache, BRW_SAMPLER_DEFAULT_COLOR,
-			 &sdc, sizeof(sdc), NULL, 0);
+			 &sdc, sizeof(sdc));
 }
 
 
diff --git a/src/mesa/drivers/dri/i965/brw_wm_surface_state.c b/src/mesa/drivers/dri/i965/brw_wm_surface_state.c
index c7b61240e75..17b016b569b 100644
--- a/src/mesa/drivers/dri/i965/brw_wm_surface_state.c
+++ b/src/mesa/drivers/dri/i965/brw_wm_surface_state.c
@@ -196,36 +196,40 @@ brw_set_surface_tiling(struct brw_surface_state *surf, uint32_t tiling)
    }
 }
 
-static drm_intel_bo *
-brw_create_texture_surface( struct brw_context *brw,
-			    struct brw_surface_key *key )
+static void
+brw_update_texture_surface( GLcontext *ctx, GLuint unit )
 {
+   struct brw_context *brw = brw_context(ctx);
+   struct gl_texture_object *tObj = ctx->Texture.Unit[unit]._Current;
+   struct intel_texture_object *intelObj = intel_texture_object(tObj);
+   struct gl_texture_image *firstImage = tObj->Image[0][intelObj->firstLevel];
+   const GLuint surf_index = SURF_INDEX_TEXTURE(unit);
    struct brw_surface_state surf;
-   drm_intel_bo *bo;
+   void *map;
 
    memset(&surf, 0, sizeof(surf));
 
    surf.ss0.mipmap_layout_mode = BRW_SURFACE_MIPMAPLAYOUT_BELOW;
-   surf.ss0.surface_type = translate_tex_target(key->target);
-   surf.ss0.surface_format = translate_tex_format(key->format,
-						  key->internal_format,
-						  key->depthmode);
+   surf.ss0.surface_type = translate_tex_target(tObj->Target);
+   surf.ss0.surface_format = translate_tex_format(firstImage->TexFormat,
+						  firstImage->InternalFormat,
+						  tObj->DepthMode);
 
    /* This is ok for all textures with channel width 8bit or less:
     */
 /*    surf.ss0.data_return_format = BRW_SURFACERETURNFORMAT_S1; */
-   surf.ss1.base_addr = key->bo->offset; /* reloc */
+   surf.ss1.base_addr = intelObj->mt->region->buffer->offset; /* reloc */
 
-   surf.ss2.mip_count = key->last_level - key->first_level;
-   surf.ss2.width = key->width - 1;
-   surf.ss2.height = key->height - 1;
-   brw_set_surface_tiling(&surf, key->tiling);
-   surf.ss3.pitch = (key->pitch * key->cpp) - 1;
-   surf.ss3.depth = key->depth - 1;
+   surf.ss2.mip_count = intelObj->lastLevel - intelObj->firstLevel;
+   surf.ss2.width = firstImage->Width - 1;
+   surf.ss2.height = firstImage->Height - 1;
+   brw_set_surface_tiling(&surf, intelObj->mt->region->tiling);
+   surf.ss3.pitch = (intelObj->mt->region->pitch * intelObj->mt->cpp) - 1;
+   surf.ss3.depth = firstImage->Depth - 1;
 
    surf.ss4.min_lod = 0;
  
-   if (key->target == GL_TEXTURE_CUBE_MAP) {
+   if (tObj->Target == GL_TEXTURE_CUBE_MAP) {
       surf.ss0.cube_pos_x = 1;
       surf.ss0.cube_pos_y = 1;
       surf.ss0.cube_pos_z = 1;
@@ -234,71 +238,33 @@ brw_create_texture_surface( struct brw_context *brw,
       surf.ss0.cube_neg_z = 1;
    }
 
-   bo = brw_upload_cache(&brw->surface_cache, BRW_SS_SURFACE,
-			 key, sizeof(*key),
-			 &key->bo, 1,
-			 &surf, sizeof(surf));
+   map = brw_state_batch(brw, sizeof(surf), 32,
+			 &brw->wm.surf_bo[surf_index],
+			 &brw->wm.surf_offset[surf_index]);
+   memcpy(map, &surf, sizeof(surf));
 
    /* Emit relocation to surface contents */
-   drm_intel_bo_emit_reloc(bo, offsetof(struct brw_surface_state, ss1),
-			   key->bo, 0,
+   drm_intel_bo_emit_reloc(brw->wm.surf_bo[surf_index],
+			   brw->wm.surf_offset[surf_index] +
+			   offsetof(struct brw_surface_state, ss1),
+			   intelObj->mt->region->buffer, 0,
 			   I915_GEM_DOMAIN_SAMPLER, 0);
-
-   return bo;
-}
-
-static void
-brw_update_texture_surface( GLcontext *ctx, GLuint unit )
-{
-   struct brw_context *brw = brw_context(ctx);
-   struct gl_texture_object *tObj = ctx->Texture.Unit[unit]._Current;
-   struct intel_texture_object *intelObj = intel_texture_object(tObj);
-   struct gl_texture_image *firstImage = tObj->Image[0][intelObj->firstLevel];
-   struct brw_surface_key key;
-   const GLuint surf = SURF_INDEX_TEXTURE(unit);
-
-   memset(&key, 0, sizeof(key));
-
-   key.format = firstImage->TexFormat;
-   key.internal_format = firstImage->InternalFormat;
-   key.pitch = intelObj->mt->region->pitch;
-   key.depth = firstImage->Depth;
-   key.bo = intelObj->mt->region->buffer;
-   key.offset = 0;
-
-   key.target = tObj->Target;
-   key.depthmode = tObj->DepthMode;
-   key.first_level = intelObj->firstLevel;
-   key.last_level = intelObj->lastLevel;
-   key.width = firstImage->Width;
-   key.height = firstImage->Height;
-   key.cpp = intelObj->mt->cpp;
-   key.tiling = intelObj->mt->region->tiling;
-
-   drm_intel_bo_unreference(brw->wm.surf_bo[surf]);
-   brw->wm.surf_bo[surf] = brw_search_cache(&brw->surface_cache,
-                                            BRW_SS_SURFACE,
-                                            &key, sizeof(key),
-                                            &key.bo, 1,
-                                            NULL);
-   if (brw->wm.surf_bo[surf] == NULL) {
-      brw->wm.surf_bo[surf] = brw_create_texture_surface(brw, &key);
-   }
 }
 
-
-
 /**
  * Create the constant buffer surface.  Vertex/fragment shader constants will be
  * read from this buffer with Data Port Read instructions/messages.
  */
-drm_intel_bo *
-brw_create_constant_surface( struct brw_context *brw,
-                             struct brw_surface_key *key )
+void
+brw_create_constant_surface(struct brw_context *brw,
+			    drm_intel_bo *bo,
+			    int width,
+			    drm_intel_bo **out_bo,
+			    uint32_t *out_offset)
 {
-   const GLint w = key->width - 1;
+   const GLint w = width - 1;
    struct brw_surface_state surf;
-   drm_intel_bo *bo;
+   void *map;
 
    memset(&surf, 0, sizeof(surf));
 
@@ -306,29 +272,26 @@ brw_create_constant_surface( struct brw_context *brw,
    surf.ss0.surface_type = BRW_SURFACE_BUFFER;
    surf.ss0.surface_format = BRW_SURFACEFORMAT_R32G32B32A32_FLOAT;
 
-   assert(key->bo);
-   surf.ss1.base_addr = key->bo->offset; /* reloc */
+   assert(bo);
+   surf.ss1.base_addr = bo->offset; /* reloc */
 
    surf.ss2.width = w & 0x7f;            /* bits 6:0 of size or width */
    surf.ss2.height = (w >> 7) & 0x1fff;  /* bits 19:7 of size or width */
    surf.ss3.depth = (w >> 20) & 0x7f;    /* bits 26:20 of size or width */
-   surf.ss3.pitch = (key->pitch * key->cpp) - 1; /* ignored?? */
-   brw_set_surface_tiling(&surf, key->tiling); /* tiling now allowed */
- 
-   bo = brw_upload_cache(&brw->surface_cache, BRW_SS_SURFACE,
-			 key, sizeof(*key),
-			 &key->bo, 1,
-			 &surf, sizeof(surf));
+   surf.ss3.pitch = (width * 16) - 1; /* ignored?? */
+   brw_set_surface_tiling(&surf, I915_TILING_NONE); /* tiling now allowed */
+
+   map = brw_state_batch(brw, sizeof(surf), 32, out_bo, out_offset);
+   memcpy(map, &surf, sizeof(surf));
 
    /* Emit relocation to surface contents.  Section 5.1.1 of the gen4
     * bspec ("Data Cache") says that the data cache does not exist as
     * a separate cache and is just the sampler cache.
     */
-   drm_intel_bo_emit_reloc(bo, offsetof(struct brw_surface_state, ss1),
-			   key->bo, 0,
+   drm_intel_bo_emit_reloc(*out_bo, (*out_offset +
+				     offsetof(struct brw_surface_state, ss1)),
+			   bo, 0,
 			   I915_GEM_DOMAIN_SAMPLER, 0);
-
-   return bo;
 }
 
 /* Creates a new WM constant buffer reflecting the current fragment program's
@@ -337,89 +300,45 @@ brw_create_constant_surface( struct brw_context *brw,
  * Otherwise, constants go through the CURBEs using the brw_constant_buffer
  * state atom.
  */
-static drm_intel_bo *
-brw_wm_update_constant_buffer(struct brw_context *brw)
+static void
+prepare_wm_constants(struct brw_context *brw)
 {
+   GLcontext *ctx = &brw->intel.ctx;
    struct intel_context *intel = &brw->intel;
    struct brw_fragment_program *fp =
       (struct brw_fragment_program *) brw->fragment_program;
    const struct gl_program_parameter_list *params = fp->program.Base.Parameters;
    const int size = params->NumParameters * 4 * sizeof(GLfloat);
-   drm_intel_bo *const_buffer;
-
-   /* BRW_NEW_FRAGMENT_PROGRAM */
-   if (!fp->use_const_buffer)
-      return NULL;
-
-   const_buffer = drm_intel_bo_alloc(intel->bufmgr, "fp_const_buffer",
-				     size, 64);
 
-   /* _NEW_PROGRAM_CONSTANTS */
-   drm_intel_bo_subdata(const_buffer, 0, size, params->ParameterValues);
-
-   return const_buffer;
-}
+   _mesa_load_state_parameters(ctx, fp->program.Base.Parameters);
 
-/**
- * Update the surface state for a WM constant buffer.
- * The constant buffer will be (re)allocated here if needed.
- */
-static void
-brw_update_wm_constant_surface( GLcontext *ctx,
-                                GLuint surf)
-{
-   struct brw_context *brw = brw_context(ctx);
-   struct brw_surface_key key;
-   struct brw_fragment_program *fp =
-      (struct brw_fragment_program *) brw->fragment_program;
-   const struct gl_program_parameter_list *params =
-      fp->program.Base.Parameters;
-
-   /* If we're in this state update atom, we need to update WM constants, so
-    * free the old buffer and create a new one for the new contents.
-    */
-   drm_intel_bo_unreference(fp->const_buffer);
-   fp->const_buffer = brw_wm_update_constant_buffer(brw);
-
-   /* If there's no constant buffer, then no surface BO is needed to point at
-    * it.
-    */
-   if (fp->const_buffer == NULL) {
-      drm_intel_bo_unreference(brw->wm.surf_bo[surf]);
-      brw->wm.surf_bo[surf] = NULL;
+   /* BRW_NEW_FRAGMENT_PROGRAM */
+   if (!fp->use_const_buffer) {
+      if (brw->wm.const_bo) {
+	 drm_intel_bo_unreference(brw->wm.const_bo);
+	 brw->wm.const_bo = NULL;
+	 brw->state.dirty.brw |= BRW_NEW_WM_CONSTBUF;
+      }
       return;
    }
 
-   memset(&key, 0, sizeof(key));
+   drm_intel_bo_unreference(brw->wm.const_bo);
+   brw->wm.const_bo = drm_intel_bo_alloc(intel->bufmgr, "fp_const_buffer",
+					 size, 64);
 
-   key.format = MESA_FORMAT_RGBA_FLOAT32;
-   key.internal_format = GL_RGBA;
-   key.bo = fp->const_buffer;
-   key.depthmode = GL_NONE;
-   key.pitch = params->NumParameters;
-   key.width = params->NumParameters;
-   key.height = 1;
-   key.depth = 1;
-   key.cpp = 16;
-
-   /*
-   printf("%s:\n", __FUNCTION__);
-   printf("  width %d  height %d  depth %d  cpp %d  pitch %d\n",
-          key.width, key.height, key.depth, key.cpp, key.pitch);
-   */
-
-   drm_intel_bo_unreference(brw->wm.surf_bo[surf]);
-   brw->wm.surf_bo[surf] = brw_search_cache(&brw->surface_cache,
-                                            BRW_SS_SURFACE,
-                                            &key, sizeof(key),
-                                            &key.bo, 1,
-                                            NULL);
-   if (brw->wm.surf_bo[surf] == NULL) {
-      brw->wm.surf_bo[surf] = brw_create_constant_surface(brw, &key);
-   }
-   brw->state.dirty.brw |= BRW_NEW_WM_SURFACES;
+   /* _NEW_PROGRAM_CONSTANTS */
+   drm_intel_bo_subdata(brw->wm.const_bo, 0, size, params->ParameterValues);
 }
 
+const struct brw_tracked_state brw_wm_constants = {
+   .dirty = {
+      .mesa = (_NEW_PROGRAM_CONSTANTS),
+      .brw = (BRW_NEW_FRAGMENT_PROGRAM),
+      .cache = 0
+   },
+   .prepare = prepare_wm_constants,
+};
+
 /**
  * Updates surface / buffer for fragment shader constant buffer, if
  * one is required.
@@ -428,20 +347,18 @@ brw_update_wm_constant_surface( GLcontext *ctx,
  * BRW_NEW_WM_SURFACES to get picked up by brw_prepare_wm_surfaces for
  * inclusion in the binding table.
  */
-static void prepare_wm_constant_surface(struct brw_context *brw )
+static void upload_wm_constant_surface(struct brw_context *brw )
 {
-   GLcontext *ctx = &brw->intel.ctx;
+   GLuint surf = SURF_INDEX_FRAG_CONST_BUFFER;
    struct brw_fragment_program *fp =
       (struct brw_fragment_program *) brw->fragment_program;
-   GLuint surf = SURF_INDEX_FRAG_CONST_BUFFER;
-
-   drm_intel_bo_unreference(fp->const_buffer);
-   fp->const_buffer = brw_wm_update_constant_buffer(brw);
+   const struct gl_program_parameter_list *params =
+      fp->program.Base.Parameters;
 
    /* If there's no constant buffer, then no surface BO is needed to point at
     * it.
     */
-   if (fp->const_buffer == 0) {
+   if (brw->wm.const_bo == 0) {
       if (brw->wm.surf_bo[surf] != NULL) {
 	 drm_intel_bo_unreference(brw->wm.surf_bo[surf]);
 	 brw->wm.surf_bo[surf] = NULL;
@@ -450,16 +367,20 @@ static void prepare_wm_constant_surface(struct brw_context *brw )
       return;
    }
 
-   brw_update_wm_constant_surface(ctx, surf);
+   brw_create_constant_surface(brw, brw->wm.const_bo, params->NumParameters,
+			       &brw->wm.surf_bo[surf],
+			       &brw->wm.surf_offset[surf]);
+   brw->state.dirty.brw |= BRW_NEW_WM_SURFACES;
 }
 
 const struct brw_tracked_state brw_wm_constant_surface = {
    .dirty = {
-      .mesa = (_NEW_PROGRAM_CONSTANTS),
-      .brw = (BRW_NEW_FRAGMENT_PROGRAM),
+      .mesa = 0,
+      .brw = (BRW_NEW_WM_CONSTBUF |
+	      BRW_NEW_BATCH),
       .cache = 0
    },
-   .prepare = prepare_wm_constant_surface,
+   .emit = upload_wm_constant_surface,
 };
 
 
@@ -488,6 +409,8 @@ brw_update_renderbuffer_surface(struct brw_context *brw,
       uint32_t draw_x;
       uint32_t draw_y;
    } key;
+   struct brw_surface_state surf;
+   void *map;
 
    memset(&key, 0, sizeof(key));
 
@@ -554,137 +477,123 @@ brw_update_renderbuffer_surface(struct brw_context *brw,
 			 (ctx->Color.BlendEnabled & (1 << unit)));
    }
 
-   drm_intel_bo_unreference(brw->wm.surf_bo[unit]);
-   brw->wm.surf_bo[unit] = brw_search_cache(&brw->surface_cache,
-					    BRW_SS_SURFACE,
-					    &key, sizeof(key),
-					    &region_bo, 1,
-					    NULL);
-
-   if (brw->wm.surf_bo[unit] == NULL) {
-      struct brw_surface_state surf;
-
-      memset(&surf, 0, sizeof(surf));
+   memset(&surf, 0, sizeof(surf));
 
-      surf.ss0.surface_format = key.surface_format;
-      surf.ss0.surface_type = key.surface_type;
-      if (key.tiling == I915_TILING_NONE) {
-	 surf.ss1.base_addr = (key.draw_x + key.draw_y * key.pitch) * key.cpp;
+   surf.ss0.surface_format = key.surface_format;
+   surf.ss0.surface_type = key.surface_type;
+   if (key.tiling == I915_TILING_NONE) {
+      surf.ss1.base_addr = (key.draw_x + key.draw_y * key.pitch) * key.cpp;
+   } else {
+      uint32_t tile_base, tile_x, tile_y;
+      uint32_t pitch = key.pitch * key.cpp;
+
+      if (key.tiling == I915_TILING_X) {
+	 tile_x = key.draw_x % (512 / key.cpp);
+	 tile_y = key.draw_y % 8;
+	 tile_base = ((key.draw_y / 8) * (8 * pitch));
+	 tile_base += (key.draw_x - tile_x) / (512 / key.cpp) * 4096;
       } else {
-	 uint32_t tile_base, tile_x, tile_y;
-	 uint32_t pitch = key.pitch * key.cpp;
-
-	 if (key.tiling == I915_TILING_X) {
-	    tile_x = key.draw_x % (512 / key.cpp);
-	    tile_y = key.draw_y % 8;
-	    tile_base = ((key.draw_y / 8) * (8 * pitch));
-	    tile_base += (key.draw_x - tile_x) / (512 / key.cpp) * 4096;
-	 } else {
-	    /* Y */
-	    tile_x = key.draw_x % (128 / key.cpp);
-	    tile_y = key.draw_y % 32;
-	    tile_base = ((key.draw_y / 32) * (32 * pitch));
-	    tile_base += (key.draw_x - tile_x) / (128 / key.cpp) * 4096;
-	 }
-	 assert(brw->has_surface_tile_offset || (tile_x == 0 && tile_y == 0));
-	 assert(tile_x % 4 == 0);
-	 assert(tile_y % 2 == 0);
-	 /* Note that the low bits of these fields are missing, so
-	  * there's the possibility of getting in trouble.
-	  */
-	 surf.ss1.base_addr = tile_base;
-	 surf.ss5.x_offset = tile_x / 4;
-	 surf.ss5.y_offset = tile_y / 2;
-      }
-      if (region_bo != NULL)
-	 surf.ss1.base_addr += region_bo->offset; /* reloc */
-
-      surf.ss2.width = key.width - 1;
-      surf.ss2.height = key.height - 1;
-      brw_set_surface_tiling(&surf, key.tiling);
-      surf.ss3.pitch = (key.pitch * key.cpp) - 1;
-
-      if (intel->gen < 6) {
-	 /* _NEW_COLOR */
-	 surf.ss0.color_blend = key.color_blend;
-	 surf.ss0.writedisable_red =   !key.color_mask[0];
-	 surf.ss0.writedisable_green = !key.color_mask[1];
-	 surf.ss0.writedisable_blue =  !key.color_mask[2];
-	 surf.ss0.writedisable_alpha = !key.color_mask[3];
+	 /* Y */
+	 tile_x = key.draw_x % (128 / key.cpp);
+	 tile_y = key.draw_y % 32;
+	 tile_base = ((key.draw_y / 32) * (32 * pitch));
+	 tile_base += (key.draw_x - tile_x) / (128 / key.cpp) * 4096;
       }
+      assert(brw->has_surface_tile_offset || (tile_x == 0 && tile_y == 0));
+      assert(tile_x % 4 == 0);
+      assert(tile_y % 2 == 0);
+      /* Note that the low bits of these fields are missing, so
+       * there's the possibility of getting in trouble.
+       */
+      surf.ss1.base_addr = tile_base;
+      surf.ss5.x_offset = tile_x / 4;
+      surf.ss5.y_offset = tile_y / 2;
+   }
+   if (region_bo != NULL)
+      surf.ss1.base_addr += region_bo->offset; /* reloc */
 
-      /* Key size will never match key size for textures, so we're safe. */
-      brw->wm.surf_bo[unit] = brw_upload_cache(&brw->surface_cache,
-                                               BRW_SS_SURFACE,
-                                               &key, sizeof(key),
-					       &region_bo, 1,
-					       &surf, sizeof(surf));
-      if (region_bo != NULL) {
-	 /* We might sample from it, and we might render to it, so flag
-	  * them both.  We might be able to figure out from other state
-	  * a more restrictive relocation to emit.
-	  */
-	 drm_intel_bo_emit_reloc(brw->wm.surf_bo[unit],
-				 offsetof(struct brw_surface_state, ss1),
-				 region_bo,
-				 surf.ss1.base_addr - region_bo->offset,
-				 I915_GEM_DOMAIN_RENDER,
-				 I915_GEM_DOMAIN_RENDER);
-      }
+   surf.ss2.width = key.width - 1;
+   surf.ss2.height = key.height - 1;
+   brw_set_surface_tiling(&surf, key.tiling);
+   surf.ss3.pitch = (key.pitch * key.cpp) - 1;
+
+   if (intel->gen < 6) {
+      /* _NEW_COLOR */
+      surf.ss0.color_blend = key.color_blend;
+      surf.ss0.writedisable_red =   !key.color_mask[0];
+      surf.ss0.writedisable_green = !key.color_mask[1];
+      surf.ss0.writedisable_blue =  !key.color_mask[2];
+      surf.ss0.writedisable_alpha = !key.color_mask[3];
    }
-}
 
+   map = brw_state_batch(brw, sizeof(surf), 32,
+			 &brw->wm.surf_bo[unit],
+			 &brw->wm.surf_offset[unit]);
+   memcpy(map, &surf, sizeof(surf));
+
+   if (region_bo != NULL) {
+      drm_intel_bo_emit_reloc(brw->wm.surf_bo[unit],
+			      brw->wm.surf_offset[unit] +
+			      offsetof(struct brw_surface_state, ss1),
+			      region_bo,
+			      surf.ss1.base_addr - region_bo->offset,
+			      I915_GEM_DOMAIN_RENDER,
+			      I915_GEM_DOMAIN_RENDER);
+   }
+}
 
-/**
- * Constructs the binding table for the WM surface state, which maps unit
- * numbers to surface state objects.
- */
-static drm_intel_bo *
-brw_wm_get_binding_table(struct brw_context *brw)
+static void
+prepare_wm_surfaces(struct brw_context *brw)
 {
-   drm_intel_bo *bind_bo;
-
-   assert(brw->wm.nr_surfaces <= BRW_WM_MAX_SURF);
-
-   bind_bo = brw_search_cache(&brw->surface_cache, BRW_SS_SURF_BIND,
-			      NULL, 0,
-			      brw->wm.surf_bo, brw->wm.nr_surfaces,
-			      NULL);
-
-   if (bind_bo == NULL) {
-      GLuint data_size = brw->wm.nr_surfaces * sizeof(GLuint);
-      uint32_t data[BRW_WM_MAX_SURF];
-      int i;
-
-      for (i = 0; i < brw->wm.nr_surfaces; i++)
-         if (brw->wm.surf_bo[i])
-            data[i] = brw->wm.surf_bo[i]->offset;
-         else
-            data[i] = 0;
-
-      bind_bo = brw_upload_cache( &brw->surface_cache, BRW_SS_SURF_BIND,
-				  NULL, 0,
-				  brw->wm.surf_bo, brw->wm.nr_surfaces,
-				  data, data_size);
-
-      /* Emit binding table relocations to surface state */
-      for (i = 0; i < BRW_WM_MAX_SURF; i++) {
-	 if (brw->wm.surf_bo[i] != NULL) {
-	    drm_intel_bo_emit_reloc(bind_bo, i * sizeof(GLuint),
-				    brw->wm.surf_bo[i], 0,
-				    I915_GEM_DOMAIN_INSTRUCTION, 0);
-	 }
+   GLcontext *ctx = &brw->intel.ctx;
+   int i;
+   int nr_surfaces = 0;
+
+   if (ctx->DrawBuffer->_NumColorDrawBuffers >= 1) {
+      for (i = 0; i < ctx->DrawBuffer->_NumColorDrawBuffers; i++) {
+	 struct gl_renderbuffer *rb = ctx->DrawBuffer->_ColorDrawBuffers[i];
+	 struct intel_renderbuffer *irb = intel_renderbuffer(rb);
+	 struct intel_region *region = irb ? irb->region : NULL;
+	 if (region != NULL) /* irb, and so region, may be NULL: no BO to add */
+	    brw_add_validated_bo(brw, region->buffer);
+	 nr_surfaces = SURF_INDEX_DRAW(i) + 1;
+      }
+   }
+
+   if (brw->wm.const_bo) {
+      brw_add_validated_bo(brw, brw->wm.const_bo);
+      nr_surfaces = SURF_INDEX_FRAG_CONST_BUFFER + 1;
+   }
+
+   for (i = 0; i < BRW_MAX_TEX_UNIT; i++) {
+      const struct gl_texture_unit *texUnit = &ctx->Texture.Unit[i];
+      struct gl_texture_object *tObj = texUnit->_Current;
+      struct intel_texture_object *intelObj = intel_texture_object(tObj);
+
+      if (texUnit->_ReallyEnabled) {
+	 brw_add_validated_bo(brw, intelObj->mt->region->buffer);
+	 nr_surfaces = SURF_INDEX_TEXTURE(i) + 1;
       }
    }
 
-   return bind_bo;
+   /* Have to update this in our prepare, since the unit's prepare
+    * relies on it.
+    */
+   if (brw->wm.nr_surfaces != nr_surfaces) {
+      brw->wm.nr_surfaces = nr_surfaces;
+      brw->state.dirty.brw |= BRW_NEW_NR_WM_SURFACES;
+   }
 }
 
-static void prepare_wm_surfaces(struct brw_context *brw )
+/**
+ * Constructs the set of surface state objects pointed to by the
+ * binding table.
+ */
+static void
+upload_wm_surfaces(struct brw_context *brw)
 {
    GLcontext *ctx = &brw->intel.ctx;
    GLuint i;
-   int old_nr_surfaces;
 
    /* _NEW_BUFFERS | _NEW_COLOR */
    /* Update surfaces for drawing buffers */
@@ -698,32 +607,21 @@ static void prepare_wm_surfaces(struct brw_context *brw )
       brw_update_renderbuffer_surface(brw, NULL, 0);
    }
 
-   old_nr_surfaces = brw->wm.nr_surfaces;
-   brw->wm.nr_surfaces = BRW_MAX_DRAW_BUFFERS;
-
-   if (brw->wm.surf_bo[SURF_INDEX_FRAG_CONST_BUFFER] != NULL)
-       brw->wm.nr_surfaces = SURF_INDEX_FRAG_CONST_BUFFER + 1;
-
    /* Update surfaces for textures */
    for (i = 0; i < BRW_MAX_TEX_UNIT; i++) {
       const struct gl_texture_unit *texUnit = &ctx->Texture.Unit[i];
       const GLuint surf = SURF_INDEX_TEXTURE(i);
 
-      /* _NEW_TEXTURE, BRW_NEW_TEXDATA */
+      /* _NEW_TEXTURE */
       if (texUnit->_ReallyEnabled) {
 	 brw_update_texture_surface(ctx, i);
-	 brw->wm.nr_surfaces = surf + 1;
       } else {
          drm_intel_bo_unreference(brw->wm.surf_bo[surf]);
          brw->wm.surf_bo[surf] = NULL;
       }
    }
 
-   drm_intel_bo_unreference(brw->wm.bind_bo);
-   brw->wm.bind_bo = brw_wm_get_binding_table(brw);
-
-   if (brw->wm.nr_surfaces != old_nr_surfaces)
-      brw->state.dirty.brw |= BRW_NEW_NR_WM_SURFACES;
+   brw->state.dirty.brw |= BRW_NEW_WM_SURFACES;
 }
 
 const struct brw_tracked_state brw_wm_surfaces = {
@@ -731,12 +629,48 @@ const struct brw_tracked_state brw_wm_surfaces = {
       .mesa = (_NEW_COLOR |
                _NEW_TEXTURE |
                _NEW_BUFFERS),
-      .brw = (BRW_NEW_CONTEXT |
-	      BRW_NEW_WM_SURFACES),
+      .brw = (BRW_NEW_BATCH),
       .cache = 0
    },
    .prepare = prepare_wm_surfaces,
+   .emit = upload_wm_surfaces,
 };
 
+/**
+ * Writes the WM binding table through brw_state_batch(): one uint32_t per
+ * surface slot, holding that slot's surface state offset, or 0 if it is unused.
+ */
+static void
+brw_wm_upload_binding_table(struct brw_context *brw)
+{
+   uint32_t *bind;
+   int i;
+
+   /* Might want to calculate nr_surfaces first, to avoid taking up so much
+    * space for the binding table.
+    */
+   bind = brw_state_batch(brw, sizeof(uint32_t) * BRW_WM_MAX_SURF,
+			  32, &brw->wm.bind_bo, &brw->wm.bind_bo_offset);
+
+   for (i = 0; i < BRW_WM_MAX_SURF; i++) {
+      /* BRW_NEW_WM_SURFACES: a slot's offset is only used while the slot
+       * has a surface state BO; every other entry is written as zero.
+       */
+      if (brw->wm.surf_bo[i])
+	 bind[i] = brw->wm.surf_offset[i];
+      else
+	 bind[i] = 0;
+   }
 
+   brw->state.dirty.brw |= BRW_NEW_BINDING_TABLE;
+}
 
+/* Atom for the WM binding table; dirty on a new batch or new WM surfaces. */
+const struct brw_tracked_state brw_wm_binding_table = {
+   .dirty = {
+      .mesa = 0,
+      .brw = BRW_NEW_BATCH | BRW_NEW_WM_SURFACES,
+      .cache = 0
+   },
+   .emit = brw_wm_upload_binding_table,
+};
diff --git a/src/mesa/drivers/dri/i965/gen6_scissor_state.c b/src/mesa/drivers/dri/i965/gen6_scissor_state.c
index 2e21e5f7335..34a9dc234c2 100644
--- a/src/mesa/drivers/dri/i965/gen6_scissor_state.c
+++ b/src/mesa/drivers/dri/i965/gen6_scissor_state.c
@@ -63,8 +63,7 @@ prepare_scissor_state(struct brw_context *brw)
 
    drm_intel_bo_unreference(brw->sf.state_bo);
    brw->sf.state_bo = brw_cache_data(&brw->cache, BRW_SF_UNIT,
-				     &scissor, sizeof(scissor),
-				     NULL, 0);
+				     &scissor, sizeof(scissor));
 }
 
 const struct brw_tracked_state gen6_scissor_state = {
diff --git a/src/mesa/drivers/dri/i965/gen6_sf_state.c b/src/mesa/drivers/dri/i965/gen6_sf_state.c
index 8d96b44f1dc..51940efb443 100644
--- a/src/mesa/drivers/dri/i965/gen6_sf_state.c
+++ b/src/mesa/drivers/dri/i965/gen6_sf_state.c
@@ -87,7 +87,7 @@ upload_sf_state(struct brw_context *brw)
    if (ctx->Polygon.CullFlag) {
       switch (ctx->Polygon.CullFaceMode) {
       case GL_FRONT:
-	 dw3 |= GEN6_SF_CULL_BOTH;
+	 dw3 |= GEN6_SF_CULL_FRONT;
 	 break;
       case GL_BACK:
 	 dw3 |= GEN6_SF_CULL_BACK;
diff --git a/src/mesa/drivers/dri/i965/gen6_viewport_state.c b/src/mesa/drivers/dri/i965/gen6_viewport_state.c
index 0c2aa4206c6..301c68e7f9e 100644
--- a/src/mesa/drivers/dri/i965/gen6_viewport_state.c
+++ b/src/mesa/drivers/dri/i965/gen6_viewport_state.c
@@ -29,7 +29,6 @@
 #include "brw_state.h"
 #include "brw_defines.h"
 #include "intel_batchbuffer.h"
-#include "main/macros.h"
 
 /* The clip VP defines the guardband region where expensive clipping is skipped
  * and fragments are allowed to be generated and clipped out cheaply by the SF.
@@ -51,8 +50,7 @@ prepare_clip_vp(struct brw_context *brw)
 
    drm_intel_bo_unreference(brw->clip.vp_bo);
    brw->clip.vp_bo = brw_cache_data(&brw->cache, BRW_CLIP_VP,
-				    &vp, sizeof(vp),
-				    NULL, 0);
+				    &vp, sizeof(vp));
 }
 
 const struct brw_tracked_state gen6_clip_vp = {
@@ -95,8 +93,7 @@ prepare_sf_vp(struct brw_context *brw)
 
    drm_intel_bo_unreference(brw->sf.vp_bo);
    brw->sf.vp_bo = brw_cache_data(&brw->cache, BRW_SF_VP,
-				  &sfv, sizeof(sfv),
-				  NULL, 0);
+				  &sfv, sizeof(sfv));
 }
 
 const struct brw_tracked_state gen6_sf_vp = {
@@ -108,36 +105,6 @@ const struct brw_tracked_state gen6_sf_vp = {
    .prepare = prepare_sf_vp,
 };
 
-static void
-prepare_cc_vp(struct brw_context *brw)
-{
-   GLcontext *ctx = &brw->intel.ctx;
-   struct brw_cc_viewport ccv;
-
-   /* _NEW_TRANSOFORM */
-   if (ctx->Transform.DepthClamp) {
-      /* _NEW_VIEWPORT */
-      ccv.min_depth = MIN2(ctx->Viewport.Near, ctx->Viewport.Far);
-      ccv.max_depth = MAX2(ctx->Viewport.Near, ctx->Viewport.Far);
-   } else {
-      ccv.min_depth = 0.0;
-      ccv.max_depth = 1.0;
-   }
-
-   drm_intel_bo_unreference(brw->cc.vp_bo);
-   brw->cc.vp_bo = brw_cache_data(&brw->cache, BRW_CC_VP, &ccv, sizeof(ccv),
-				  NULL, 0);
-}
-
-const struct brw_tracked_state gen6_cc_vp = {
-   .dirty = {
-      .mesa = _NEW_VIEWPORT | _NEW_TRANSFORM,
-      .brw = 0,
-      .cache = 0,
-   },
-   .prepare = prepare_cc_vp,
-};
-
 static void prepare_viewport_state_pointers(struct brw_context *brw)
 {
    brw_add_validated_bo(brw, brw->sf.state_bo);
diff --git a/src/mesa/drivers/dri/i965/gen6_wm_state.c b/src/mesa/drivers/dri/i965/gen6_wm_state.c
index 325f6b43d30..863c85449d9 100644
--- a/src/mesa/drivers/dri/i965/gen6_wm_state.c
+++ b/src/mesa/drivers/dri/i965/gen6_wm_state.c
@@ -98,7 +98,8 @@ upload_wm_state(struct brw_context *brw)
 
    /* CACHE_NEW_SAMPLER */
    dw2 |= (ALIGN(brw->wm.sampler_count, 4) / 4) << GEN6_WM_SAMPLER_COUNT_SHIFT;
-   dw4 |= (1 << GEN6_WM_DISPATCH_START_GRF_SHIFT_0);
+   dw4 |= (brw->wm.prog_data->first_curbe_grf <<
+	   GEN6_WM_DISPATCH_START_GRF_SHIFT_0);
 
    dw5 |= (40 - 1) << GEN6_WM_MAX_THREADS_SHIFT;
    dw5 |= GEN6_WM_DISPATCH_ENABLE;
diff --git a/src/mesa/drivers/dri/intel/intel_batchbuffer.c b/src/mesa/drivers/dri/intel/intel_batchbuffer.c
index 1116bccd8e7..698445c5268 100644
--- a/src/mesa/drivers/dri/intel/intel_batchbuffer.c
+++ b/src/mesa/drivers/dri/intel/intel_batchbuffer.c
@@ -49,6 +49,7 @@ intel_batchbuffer_reset(struct intel_batchbuffer *batch)
    batch->ptr = batch->map;
    batch->reserved_space = BATCH_RESERVED;
    batch->dirty_state = ~0;
+   batch->state_batch_offset = batch->size;
 }
 
 struct intel_batchbuffer *
@@ -84,6 +85,12 @@ do_flush_locked(struct intel_batchbuffer *batch, GLuint used)
    int x_off = 0, y_off = 0;
 
    drm_intel_bo_subdata(batch->buf, 0, used, batch->buffer);
+   if (batch->state_batch_offset != batch->size) {
+      drm_intel_bo_subdata(batch->buf,
+			   batch->state_batch_offset,
+			   batch->size - batch->state_batch_offset,
+			   batch->buffer + batch->state_batch_offset);
+   }
 
    batch->ptr = NULL;
 
diff --git a/src/mesa/drivers/dri/intel/intel_batchbuffer.h b/src/mesa/drivers/dri/intel/intel_batchbuffer.h
index f4ac1825cd1..ae53f455117 100644
--- a/src/mesa/drivers/dri/intel/intel_batchbuffer.h
+++ b/src/mesa/drivers/dri/intel/intel_batchbuffer.h
@@ -23,6 +23,7 @@ struct intel_batchbuffer
    GLubyte *ptr;
 
    GLuint size;
+   uint32_t state_batch_offset;
 
 #ifdef DEBUG
    /** Tracking of BEGIN_BATCH()/OUT_BATCH()/ADVANCE_BATCH() debugging */
@@ -92,7 +93,8 @@ static INLINE uint32_t float_as_int(float f)
 static INLINE GLint
 intel_batchbuffer_space(struct intel_batchbuffer *batch)
 {
-   return (batch->size - batch->reserved_space) - (batch->ptr - batch->map);
+   return (batch->state_batch_offset - batch->reserved_space) -
+      (batch->ptr - batch->map);
 }
 
 
diff --git a/src/mesa/drivers/dri/intel/intel_context.c b/src/mesa/drivers/dri/intel/intel_context.c
index dec47974f2a..5f2035d79c9 100644
--- a/src/mesa/drivers/dri/intel/intel_context.c
+++ b/src/mesa/drivers/dri/intel/intel_context.c
@@ -827,8 +827,6 @@ intelDestroyContext(__DRIcontext * driContextPriv)
 
    assert(intel);               /* should never be null */
    if (intel) {
-      GLboolean release_texture_heaps;
-
       INTEL_FIREVERTICES(intel);
 
       _mesa_meta_free(&intel->ctx);
@@ -837,7 +835,6 @@ intelDestroyContext(__DRIcontext * driContextPriv)
 
       intel->vtbl.destroy(intel);
 
-      release_texture_heaps = (intel->ctx.Shared->RefCount == 1);
       _swsetup_DestroyContext(&intel->ctx);
       _tnl_DestroyContext(&intel->ctx);
       _vbo_DestroyContext(&intel->ctx);
@@ -855,18 +852,6 @@ intelDestroyContext(__DRIcontext * driContextPriv)
       drm_intel_bo_unreference(intel->first_post_swapbuffers_batch);
       intel->first_post_swapbuffers_batch = NULL;
 
-      if (release_texture_heaps) {
-         /* Nothing is currently done here to free texture heaps;
-          * but we're not using the texture heap utilities, so I
-          * rather think we shouldn't.  I've taken a look, and can't
-          * find any private texture data hanging around anywhere, but
-          * I'm not yet certain there isn't any at all...
-          */
-         /* if (INTEL_DEBUG & DEBUG_TEXTURE)
-            fprintf(stderr, "do something to free texture heaps\n");
-          */
-      }
-
       driDestroyOptionCache(&intel->optionCache);
 
       /* free the Mesa context */
diff --git a/src/mesa/drivers/dri/intel/intel_context.h b/src/mesa/drivers/dri/intel/intel_context.h
index 14ff4a96950..c7ac2de01e6 100644
--- a/src/mesa/drivers/dri/intel/intel_context.h
+++ b/src/mesa/drivers/dri/intel/intel_context.h
@@ -261,6 +261,8 @@ extern char *__progname;
 
 #define ARRAY_SIZE(x) (sizeof(x) / sizeof(x[0]))
 #define ALIGN(value, alignment)  ((value + alignment - 1) & ~(alignment - 1))
+/* Rounds value down to a multiple of alignment (must be a power of two). */
+#define ROUND_DOWN_TO(value, alignment) ((value) & ~((alignment) - 1))
 #define IS_POWER_OF_TWO(val) (((val) & (val - 1)) == 0)
 
 static INLINE uint32_t
diff --git a/src/mesa/drivers/dri/intel/intel_fbo.c b/src/mesa/drivers/dri/intel/intel_fbo.c
index 8f61f1f5b24..4a83886fc16 100644
--- a/src/mesa/drivers/dri/intel/intel_fbo.c
+++ b/src/mesa/drivers/dri/intel/intel_fbo.c
@@ -42,9 +42,6 @@
 #include "intel_fbo.h"
 #include "intel_mipmap_tree.h"
 #include "intel_regions.h"
-#ifndef I915
-#include "brw_state.h"
-#endif
 
 #define FILE_DEBUG_FLAG DEBUG_FBO
 
@@ -296,12 +293,6 @@ intel_renderbuffer_set_region(struct intel_context *intel,
    old = rb->region;
    rb->region = NULL;
    intel_region_reference(&rb->region, region);
-#ifndef I915
-   if (old) {
-      brw_state_cache_bo_delete(&brw_context(&intel->ctx)->surface_cache,
-				old->buffer);
-   }
-#endif
    intel_region_release(&old);
 }
 
diff --git a/src/mesa/drivers/dri/intel/intel_mipmap_tree.c b/src/mesa/drivers/dri/intel/intel_mipmap_tree.c
index 71ef7a8e39b..39ac0205fa1 100644
--- a/src/mesa/drivers/dri/intel/intel_mipmap_tree.c
+++ b/src/mesa/drivers/dri/intel/intel_mipmap_tree.c
@@ -29,9 +29,6 @@
 #include "intel_mipmap_tree.h"
 #include "intel_regions.h"
 #include "intel_tex_layout.h"
-#ifndef I915
-#include "brw_state.h"
-#endif
 #include "main/enums.h"
 
 #define FILE_DEBUG_FLAG DEBUG_MIPTREE
@@ -203,19 +200,6 @@ intel_miptree_release(struct intel_context *intel,
 
       DBG("%s deleting %p\n", __FUNCTION__, *mt);
 
-#ifndef I915
-      /* Free up cached binding tables holding a reference on our buffer, to
-       * avoid excessive memory consumption.
-       *
-       * This isn't as aggressive as we could be, as we'd like to do
-       * it from any time we free the last ref on a region.  But intel_region.c
-       * is context-agnostic.  Perhaps our constant state cache should be, as
-       * well.
-       */
-      brw_state_cache_bo_delete(&brw_context(&intel->ctx)->surface_cache,
-				(*mt)->region->buffer);
-#endif
-
       intel_region_release(&((*mt)->region));
 
       for (i = 0; i < MAX_TEXTURE_LEVELS; i++) {
diff --git a/src/mesa/drivers/dri/intel/intel_span.c b/src/mesa/drivers/dri/intel/intel_span.c
index c30552c5a79..fb840c1020d 100644
--- a/src/mesa/drivers/dri/intel/intel_span.c
+++ b/src/mesa/drivers/dri/intel/intel_span.c
@@ -257,6 +257,8 @@ intelSpanRenderStart(GLcontext * ctx)
    for (i = 0; i < ctx->Const.MaxTextureImageUnits; i++) {
       if (ctx->Texture.Unit[i]._ReallyEnabled) {
          struct gl_texture_object *texObj = ctx->Texture.Unit[i]._Current;
+
+         intel_finalize_mipmap_tree(intel, i);
          intel_tex_map_images(intel, intel_texture_object(texObj));
       }
    }
diff --git a/src/mesa/drivers/dri/r300/compiler/Makefile b/src/mesa/drivers/dri/r300/compiler/Makefile
index 34d22b45591..ff3801dc676 100644
--- a/src/mesa/drivers/dri/r300/compiler/Makefile
+++ b/src/mesa/drivers/dri/r300/compiler/Makefile
@@ -9,6 +9,7 @@ C_SOURCES = \
 		radeon_code.c \
 		radeon_compiler.c \
 		radeon_emulate_branches.c \
+		radeon_emulate_loops.c \
 		radeon_program.c \
 		radeon_program_print.c \
 		radeon_opcodes.c \
diff --git a/src/mesa/drivers/dri/r300/compiler/SConscript b/src/mesa/drivers/dri/r300/compiler/SConscript
index 663926e3216..50d9cdb7f2d 100755
--- a/src/mesa/drivers/dri/r300/compiler/SConscript
+++ b/src/mesa/drivers/dri/r300/compiler/SConscript
@@ -23,6 +23,7 @@ r300compiler = env.ConvenienceLibrary(
         'radeon_pair_regalloc.c',
         'radeon_optimize.c',
         'radeon_emulate_branches.c',
+        'radeon_emulate_loops.c',
         'radeon_dataflow.c',
         'radeon_dataflow_deadcode.c',
         'radeon_dataflow_swizzles.c',
diff --git a/src/mesa/drivers/dri/r300/compiler/r3xx_fragprog.c b/src/mesa/drivers/dri/r300/compiler/r3xx_fragprog.c
index 7f3b88ed759..bbdfa0d56f9 100644
--- a/src/mesa/drivers/dri/r300/compiler/r3xx_fragprog.c
+++ b/src/mesa/drivers/dri/r300/compiler/r3xx_fragprog.c
@@ -26,6 +26,7 @@
 
 #include "radeon_dataflow.h"
 #include "radeon_emulate_branches.h"
+#include "radeon_emulate_loops.h"
 #include "radeon_program_alu.h"
 #include "radeon_program_tex.h"
 #include "r300_fragprog.h"
@@ -103,6 +104,14 @@ void r3xx_compile_fragment_program(struct r300_fragment_program_compiler* c)
 	/* XXX Ideally this should be done only for r3xx, but since
 	 * we don't have branching support for r5xx, we use the emulation
 	 * on all chipsets. */
+	
+	if (c->Base.is_r500) {
+		rc_emulate_loops(&c->Base, R500_PFS_MAX_INST);
+	} else {
+		rc_emulate_loops(&c->Base, R300_PFS_MAX_ALU_INST);
+	}
+	debug_program_log(c, "after emulate loops");
+
 	rc_emulate_branches(&c->Base);
 
 	debug_program_log(c, "after emulate branches");
diff --git a/src/mesa/drivers/dri/r300/compiler/r3xx_vertprog.c b/src/mesa/drivers/dri/r300/compiler/r3xx_vertprog.c
index 507b2e532fe..e984797e2d3 100644
--- a/src/mesa/drivers/dri/r300/compiler/r3xx_vertprog.c
+++ b/src/mesa/drivers/dri/r300/compiler/r3xx_vertprog.c
@@ -30,6 +30,7 @@
 #include "radeon_program_alu.h"
 #include "radeon_swizzle.h"
 #include "radeon_emulate_branches.h"
+#include "radeon_emulate_loops.h"
 
 /*
  * Take an already-setup and valid source then swizzle it appropriately to
@@ -348,7 +349,8 @@ static void translate_vertex_program(struct r300_vertex_program_compiler * compi
 		if (!valid_dst(compiler->code, &vpi->DstReg))
 			continue;
 
-		if (compiler->code->length >= VSF_MAX_FRAGMENT_LENGTH) {
+		if (compiler->code->length >= R500_VS_MAX_ALU_DWORDS ||
+		    (compiler->code->length >= R300_VS_MAX_ALU_DWORDS && !compiler->Base.is_r500)) {
 			rc_error(&compiler->Base, "Vertex program has too many instructions\n");
 			return;
 		}
@@ -404,7 +406,7 @@ static void allocate_temporary_registers(struct r300_vertex_program_compiler * c
 {
 	struct rc_instruction *inst;
 	unsigned int num_orig_temps = 0;
-	char hwtemps[VSF_MAX_FRAGMENT_TEMPS];
+	char hwtemps[R300_VS_MAX_TEMPS];
 	struct temporary_allocation * ta;
 	unsigned int i, j;
 
@@ -463,11 +465,11 @@ static void allocate_temporary_registers(struct r300_vertex_program_compiler * c
 				unsigned int orig = inst->U.I.DstReg.Index;
 
 				if (!ta[orig].Allocated) {
-					for(j = 0; j < VSF_MAX_FRAGMENT_TEMPS; ++j) {
+					for(j = 0; j < R300_VS_MAX_TEMPS; ++j) {
 						if (!hwtemps[j])
 							break;
 					}
-					if (j >= VSF_MAX_FRAGMENT_TEMPS) {
+					if (j >= R300_VS_MAX_TEMPS) {
 						fprintf(stderr, "Out of hw temporaries\n");
 					} else {
 						ta[orig].Allocated = 1;
@@ -600,6 +602,13 @@ void r3xx_compile_vertex_program(struct r300_vertex_program_compiler* compiler)
 	/* XXX Ideally this should be done only for r3xx, but since
 	 * we don't have branching support for r5xx, we use the emulation
 	 * on all chipsets. */
+	if (compiler->Base.is_r500){
+		rc_emulate_loops(&compiler->Base, R500_VS_MAX_ALU);
+	} else {
+		rc_emulate_loops(&compiler->Base, R300_VS_MAX_ALU);
+	}
+	debug_program_log(compiler, "after emulate loops");
+
 	rc_emulate_branches(&compiler->Base);
 
 	debug_program_log(compiler, "after emulate branches");
diff --git a/src/mesa/drivers/dri/r300/compiler/radeon_code.h b/src/mesa/drivers/dri/r300/compiler/radeon_code.h
index 1979e7e4e49..d03689763bc 100644
--- a/src/mesa/drivers/dri/r300/compiler/radeon_code.h
+++ b/src/mesa/drivers/dri/r300/compiler/radeon_code.h
@@ -235,8 +235,11 @@ struct rX00_fragment_program_code {
 };
 
 
-#define VSF_MAX_FRAGMENT_LENGTH (255*4)
-#define VSF_MAX_FRAGMENT_TEMPS (14)
+#define R300_VS_MAX_ALU		256
+#define R300_VS_MAX_ALU_DWORDS  (R300_VS_MAX_ALU * 4)
+#define R500_VS_MAX_ALU	        1024
+#define R500_VS_MAX_ALU_DWORDS  (R500_VS_MAX_ALU * 4)
+#define R300_VS_MAX_TEMPS	32
 
 #define VSF_MAX_INPUTS 32
 #define VSF_MAX_OUTPUTS 32
@@ -244,8 +247,8 @@ struct rX00_fragment_program_code {
 struct r300_vertex_program_code {
 	int length;
 	union {
-		uint32_t d[VSF_MAX_FRAGMENT_LENGTH];
-		float f[VSF_MAX_FRAGMENT_LENGTH];
+		uint32_t d[R500_VS_MAX_ALU_DWORDS];
+		float f[R500_VS_MAX_ALU_DWORDS];
 	} body;
 
 	int pos_end;
diff --git a/src/mesa/drivers/dri/r300/compiler/radeon_emulate_loops.c b/src/mesa/drivers/dri/r300/compiler/radeon_emulate_loops.c
new file mode 100644
index 00000000000..4c5d29f4217
--- /dev/null
+++ b/src/mesa/drivers/dri/r300/compiler/radeon_emulate_loops.c
@@ -0,0 +1,474 @@
+/*
+ * Copyright 2010 Tom Stellard <[email protected]>
+ *
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial
+ * portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+
+/**
+ * \file
+ */
+
+#include "radeon_emulate_loops.h"
+
+#include "radeon_compiler.h"
+#include "radeon_dataflow.h"
+
+#define VERBOSE 0
+
+#define DBG(...) do { if (VERBOSE) fprintf(stderr, __VA_ARGS__); } while(0)
+
+struct emulate_loop_state {
+	struct radeon_compiler * C;	/* compiler the pass works on */
+	struct loop_info * Loops;	/* one entry per transform_loop() call */
+	unsigned int LoopCount;		/* entries of Loops in use */
+	unsigned int LoopReserved;	/* passed to memory_pool_array_reserve() */
+};
+
+struct loop_info {
+	struct rc_instruction * BeginLoop;	/* the BGNLOOP instruction */
+	struct rc_instruction * Cond;		/* SLT/SGE/... right before If */
+	struct rc_instruction * If;		/* IF right before Brk */
+	struct rc_instruction * Brk;		/* the BRK instruction */
+	struct rc_instruction * EndIf;		/* ENDIF right after Brk */
+	struct rc_instruction * EndLoop;	/* ENDLOOP; NULL once const-unrolled */
+};
+
+struct const_value {
+	/* Context handed to update_const_value() through its data pointer. */
+	struct radeon_compiler * C;
+	struct rc_src_register * Src;	/* register whose value is tracked */
+	float Value;			/* immediate last MOVed into Src */
+	int HasValue;			/* nonzero once Value has been set */
+};
+
+struct count_inst {
+	struct radeon_compiler * C;
+	int Index;	/* index of the counter register */
+	rc_swizzle Swz;	/* swizzle the loop condition reads the counter with */
+	float Amount;	/* sum of the ADD/SUB steps seen by get_incr_amount() */
+	int Unknown;	/* set when a counter write cannot be analyzed */
+};
+
+/* Read channel `chan` of the constant named by src as a float, negation applied. */
+static float get_constant_value(struct radeon_compiler * c,
+						struct rc_src_register * src,
+						int chan)
+{
+	const int comp = GET_SWZ(src->Swizzle, chan);
+	float sign;
+	if(comp >= 4 || src->Index >= c->Program.Constants.Count){
+		rc_error(c, "get_constant_value: Can't find a value.\n");
+		return 0.0f;
+	}
+	/* Swizzles 0-3 pick a component; anything else was rejected above. */
+	sign = GET_BIT(src->Negate, chan) ? -1.0f : 1.0f;
+	return sign *
+		c->Program.Constants.Constants[src->Index].u.Immediate[comp];
+}
+
+static int src_reg_is_immediate(struct rc_src_register * src,
+						struct radeon_compiler * c)
+{
+	return src->File == RC_FILE_CONSTANT && /* table lookup only for constants */
+	c->Program.Constants.Constants[src->Index].Type==RC_CONSTANT_IMMEDIATE;
+}
+
+/* Count the instructions between loop->BeginLoop and loop->EndLoop (both excluded). */
+static unsigned int loop_count_instructions(struct loop_info * loop)
+{
+	const struct rc_instruction * cur;
+	unsigned int total = 0;
+	for(cur = loop->BeginLoop->Next; cur != loop->EndLoop; cur = cur->Next){
+		total++;
+	}
+	return total;
+}
+
+static unsigned int loop_calc_iterations(struct loop_info * loop,
+		unsigned int loop_count, unsigned int max_instructions)
+{
+	unsigned int divisor = loop_count * loop_count_instructions(loop);
+	return divisor ? max_instructions / divisor : 0; /* empty body: no /0 */
+}
+
+static void loop_unroll(struct emulate_loop_state * s,
+			struct loop_info *loop, unsigned int iterations)
+{
+	unsigned int i;
+	struct rc_instruction * ptr;
+	struct rc_instruction * first = loop->BeginLoop->Next; /* body start */
+	struct rc_instruction * last = loop->EndLoop->Prev; /* body end */
+	struct rc_instruction * append_to = last;
+	rc_remove_instruction(loop->BeginLoop);
+	rc_remove_instruction(loop->EndLoop);
+	for( i = 1; i < iterations; i++){ /* iterations 0 or 1: nothing is copied */
+		for(ptr = first; ptr != last->Next; ptr = ptr->Next){
+			struct rc_instruction *new = rc_alloc_instruction(s->C);
+			memcpy(new, ptr, sizeof(struct rc_instruction));
+			rc_insert_instruction(append_to, new);
+			append_to = new;
+		}
+	}
+}
+
+
+/* Register-write callback: when a MOV from an immediate writes the tracked
+ * register (the const_value passed as data), remember that immediate. */
+static void update_const_value(void * data, struct rc_instruction * inst,
+		rc_register_file file, unsigned int index, unsigned int mask)
+{
+	struct const_value * tracked = data;
+	const unsigned int chan_bit = 1 << GET_SWZ(tracked->Src->Swizzle, 0);
+
+	if(tracked->Src->File != file || tracked->Src->Index != index ||
+	   !(chan_bit & mask)){
+		return;
+	}
+	if(inst->U.I.Opcode != RC_OPCODE_MOV ||
+	   !src_reg_is_immediate(&inst->U.I.SrcReg[0], tracked->C)){
+		return;
+	}
+	tracked->HasValue = 1;
+	tracked->Value =
+		get_constant_value(tracked->C, &inst->U.I.SrcReg[0], 0);
+}
+
+/* Write callback: sums the immediate steps ADD/SUB apply to the counter into
+ * count_inst->Amount; sets Unknown for a counter write it cannot analyze. */
+static void get_incr_amount(void * data, struct rc_instruction * inst,
+		rc_register_file file, unsigned int index, unsigned int mask)
+{
+	struct count_inst * count_inst = data;
+	int amnt_src_index;
+	const struct rc_opcode_info * opcode;
+	float amount;
+
+	if(file != RC_FILE_TEMPORARY ||
+	   count_inst->Index != index ||
+	   (1 << GET_SWZ(count_inst->Swz,0) != mask)){
+		return;
+	}
+	/* Find which source is the counter; the other one is the step. */
+	opcode = rc_get_opcode_info(inst->U.I.Opcode);
+	if(opcode->NumSrcRegs != 2){
+		count_inst->Unknown = 1;
+		return;
+	}
+	if(inst->U.I.SrcReg[0].File == RC_FILE_TEMPORARY &&
+	   inst->U.I.SrcReg[0].Index == count_inst->Index &&
+	   inst->U.I.SrcReg[0].Swizzle == count_inst->Swz){
+		amnt_src_index = 1;
+	} else if( inst->U.I.SrcReg[1].File == RC_FILE_TEMPORARY &&
+		   inst->U.I.SrcReg[1].Index == count_inst->Index &&
+		   inst->U.I.SrcReg[1].Swizzle == count_inst->Swz){
+		amnt_src_index = 0;
+	} else {
+		count_inst->Unknown = 1;
+		return;
+	}
+	if(src_reg_is_immediate(&inst->U.I.SrcReg[amnt_src_index],
+							count_inst->C)){
+		amount = get_constant_value(count_inst->C,
+				&inst->U.I.SrcReg[amnt_src_index], 0);
+	} else {
+		count_inst->Unknown = 1;
+		return;
+	}
+	switch(inst->U.I.Opcode){
+	case RC_OPCODE_ADD:
+		count_inst->Amount += amount;
+		break;
+	case RC_OPCODE_SUB:
+		if(amnt_src_index == 0){
+			/* immediate - counter is not a fixed step. */
+			count_inst->Unknown = 1;
+			return;
+		}
+		count_inst->Amount -= amount;
+		break;
+	default:
+		count_inst->Unknown = 1;
+		return;
+	}
+}
+
+/* Try to fully unroll a loop whose trip count is known at compile time.
+ * Returns 1 if the loop was unrolled, 0 if the caller must handle it. */
+static int transform_const_loop(struct emulate_loop_state * s,
+						struct loop_info * loop,
+						struct rc_instruction * cond)
+{
+	int end_loops = 1;
+	int iterations;
+	struct count_inst count_inst;
+	float limit_value;
+	struct rc_src_register * counter;
+	struct rc_src_register * limit;
+	struct const_value counter_value;
+	struct rc_instruction * inst;
+
+	/* Find the counter and the upper limit */
+	if(src_reg_is_immediate(&cond->U.I.SrcReg[0], s->C)){
+		limit = &cond->U.I.SrcReg[0];
+		counter = &cond->U.I.SrcReg[1];
+	} else if(src_reg_is_immediate(&cond->U.I.SrcReg[1], s->C)){
+		limit = &cond->U.I.SrcReg[1];
+		counter = &cond->U.I.SrcReg[0];
+	} else {
+		DBG("No constant limit.\n");
+		return 0;
+	}
+
+	/* Find the initial value of the counter */
+	counter_value.Src = counter;
+	counter_value.Value = 0.0f;
+	counter_value.HasValue = 0;
+	counter_value.C = s->C;
+	for(inst = s->C->Program.Instructions.Next; inst != loop->BeginLoop;
+							inst = inst->Next){
+		rc_for_all_writes_mask(inst, update_const_value, &counter_value);
+	}
+	if(!counter_value.HasValue){
+		DBG("Initial counter value cannot be determined.\n");
+		return 0;
+	}
+	DBG("Initial counter value is %f\n", counter_value.Value);
+	/* Determine how the counter is modified each loop */
+	count_inst.C = s->C;
+	count_inst.Index = counter->Index;
+	count_inst.Swz = counter->Swizzle;
+	count_inst.Amount = 0.0f;
+	count_inst.Unknown = 0;
+	for(inst = loop->BeginLoop->Next; end_loops > 0; inst = inst->Next){
+		switch(inst->U.I.Opcode){
+		/* XXX In the future we might want to try to unroll nested
+		 * loops here.*/
+		case RC_OPCODE_BGNLOOP:
+			end_loops++;
+			break;
+		case RC_OPCODE_ENDLOOP:
+			loop->EndLoop = inst;
+			end_loops--;
+			break;
+		/* XXX Check if the counter is modified within an if statement.
+		 */
+		case RC_OPCODE_IF:
+			break;
+		default:
+			rc_for_all_writes_mask(inst, get_incr_amount, &count_inst);
+			if(count_inst.Unknown){
+				return 0;
+			}
+			break;
+		}
+	}
+	/* Infinite loop */
+	if(count_inst.Amount == 0.0f){
+		return 0;
+	}
+	DBG("Counter is increased by %f each iteration.\n", count_inst.Amount);
+	/* Calculate the number of iterations of this loop.  Only increment and
+	 * decrement loops are supported; a count <= 0 is left to the caller
+	 * because loop_unroll() takes an unsigned count. */
+	limit_value = get_constant_value(s->C, limit, 0);
+	iterations = (int) ((limit_value - counter_value.Value) /
+							count_inst.Amount);
+
+	DBG("Loop will have %d iterations.\n", iterations);
+	if(iterations <= 0){
+		return 0;
+	}
+	/* Prepare loop for unrolling */
+	rc_remove_instruction(loop->Cond);
+	rc_remove_instruction(loop->If);
+	rc_remove_instruction(loop->Brk);
+	rc_remove_instruction(loop->EndIf);
+	loop_unroll(s, loop, iterations);
+	loop->EndLoop = NULL;
+	return 1;
+}
+
+/**
+ * This function prepares a loop to be unrolled by converting it into an if
+ * statement.  Here is an outline of the conversion process:
+ * BGNLOOP;                         	-> BGNLOOP;
+ * <Additional conditional code>	-> <Additional conditional code>
+ * SGE/SLT temp[0], temp[1], temp[2];	-> SLT/SGE temp[0], temp[1], temp[2];
+ * IF temp[0];                      	-> IF temp[0];
+ * BRK;                             	->
+ * ENDIF;                           	-> <Loop Body>
+ * <Loop Body>                      	-> ENDIF;
+ * ENDLOOP;                         	-> ENDLOOP
+ *
+ * @param inst A pointer to a BGNLOOP instruction.
+ * @return The first instruction of the unrolled loop if it could be unrolled,
+ *         otherwise the ENDLOOP instruction.  NULL if there is an error.
+ */
+static struct rc_instruction * transform_loop(struct emulate_loop_state * s,
+						struct rc_instruction * inst)
+{
+	struct loop_info *loop;
+	struct rc_instruction * ptr;
+	unsigned int slot = s->LoopCount;	/* index of this loop in s->Loops */
+
+	memory_pool_array_reserve(&s->C->Pool, struct loop_info,
+			s->Loops, s->LoopCount, s->LoopReserved, 1);
+	loop = &s->Loops[s->LoopCount++];
+	memset(loop, 0, sizeof(struct loop_info));
+	if(inst->U.I.Opcode != RC_OPCODE_BGNLOOP){
+		rc_error(s->C, "%s: expected BGNLOOP\n", __FUNCTION__);
+		return NULL;
+	}
+	loop->BeginLoop = inst;
+	for(ptr = loop->BeginLoop->Next; !loop->EndLoop; ptr = ptr->Next){
+		switch(ptr->U.I.Opcode){
+		case RC_OPCODE_BGNLOOP:
+			ptr = transform_loop(s, ptr);
+			if(!ptr){
+				return NULL;
+			}
+			/* The nested call reserves a slot too, which may relocate s->Loops. */
+			loop = &s->Loops[slot];
+			break;
+		case RC_OPCODE_BRK:
+			loop->Brk = ptr;
+			if(ptr->Next->U.I.Opcode != RC_OPCODE_ENDIF){
+				rc_error(s->C,
+					"%s: expected ENDIF\n",__FUNCTION__);
+				return NULL;
+			}
+			loop->EndIf = ptr->Next;
+			if(ptr->Prev->U.I.Opcode != RC_OPCODE_IF){
+				rc_error(s->C,
+					"%s: expected IF\n", __FUNCTION__);
+				return NULL;
+			}
+			loop->If = ptr->Prev;
+			switch(loop->If->Prev->U.I.Opcode){
+			case RC_OPCODE_SLT:
+			case RC_OPCODE_SGE:
+			case RC_OPCODE_SGT:
+			case RC_OPCODE_SLE:
+			case RC_OPCODE_SEQ:
+			case RC_OPCODE_SNE:
+				break;
+			default:
+				rc_error(s->C, "%s expected conditional\n",
+								__FUNCTION__);
+				return NULL;
+			}
+			loop->Cond = loop->If->Prev;
+			ptr = loop->EndIf;
+			break;
+		case RC_OPCODE_ENDLOOP:
+			loop->EndLoop = ptr;
+			break;
+		}
+	}
+	if(!loop->Cond){
+		rc_error(s->C, "%s: expected BRK\n", __FUNCTION__);
+		return NULL;
+	}
+	/* Reverse the conditional instruction */
+	switch(loop->Cond->U.I.Opcode){
+	case RC_OPCODE_SGE:
+		loop->Cond->U.I.Opcode = RC_OPCODE_SLT;
+		break;
+	case RC_OPCODE_SLT:
+		loop->Cond->U.I.Opcode = RC_OPCODE_SGE;
+		break;
+	case RC_OPCODE_SLE:
+		loop->Cond->U.I.Opcode = RC_OPCODE_SGT;
+		break;
+	case RC_OPCODE_SGT:
+		loop->Cond->U.I.Opcode = RC_OPCODE_SLE;
+		break;
+	case RC_OPCODE_SEQ:
+		loop->Cond->U.I.Opcode = RC_OPCODE_SNE;
+		break;
+	case RC_OPCODE_SNE:
+		loop->Cond->U.I.Opcode = RC_OPCODE_SEQ;
+		break;
+	default:
+		rc_error(s->C, "loop->Cond is not a conditional.\n");
+		return NULL;
+	}
+	/* Check if the number of loops is known at compile time. */
+	if(transform_const_loop(s, loop, loop->Cond)){
+		return loop->BeginLoop->Next;
+	}
+	/* Prepare the loop to be unrolled */
+	rc_remove_instruction(loop->Brk);
+	rc_remove_instruction(loop->EndIf);
+	rc_insert_instruction(loop->EndLoop->Prev, loop->EndIf);
+	return loop->EndLoop;
+}
+
+/* Transform every top-level BGNLOOP in program order; stop on the first error. */
+static void rc_transform_loops(struct emulate_loop_state * s)
+{
+	struct rc_instruction * const end = &s->C->Program.Instructions;
+	struct rc_instruction * cur;
+
+	for(cur = end->Next; cur != end; cur = cur->Next){
+		const int is_loop = cur->Type == RC_INSTRUCTION_NORMAL &&
+					cur->U.I.Opcode == RC_OPCODE_BGNLOOP;
+		if(is_loop && !(cur = transform_loop(s, cur))){
+			return;
+		}
+	}
+}
+
+static void rc_unroll_loops(struct emulate_loop_state *s,
+						unsigned int max_instructions)
+{
+	int i;
+	/* Iterate backwards over the list of loops so that nested loops are
+	 * unrolled first.
+	 */
+	for( i = s->LoopCount - 1; i >= 0; i-- ){
+		if(!s->Loops[i].EndLoop){ /* nothing to unroll, e.g. const-unrolled */
+			continue;
+		}
+		unsigned int iterations = loop_calc_iterations(&s->Loops[i],
+						s->LoopCount, max_instructions);
+		loop_unroll(s, &s->Loops[i], iterations);
+	}
+}
+
+/* Loop emulation pass: rewrite every loop, then unroll the remaining ones
+ * within the max_instructions budget. */
+void rc_emulate_loops(struct radeon_compiler *c, unsigned int max_instructions)
+{
+	struct emulate_loop_state state;
+
+	memset(&state, 0, sizeof(state));
+	state.C = c;
+
+	/* These two steps may have to move into r3xx_(vert|frag)prog.c so that
+	 * the optimization passes can run between them and allow more unrolls
+	 * per loop. */
+	rc_transform_loops(&state);
+	rc_unroll_loops(&state, max_instructions);
+}
diff --git a/src/mesa/drivers/dri/r300/compiler/radeon_emulate_loops.h b/src/mesa/drivers/dri/r300/compiler/radeon_emulate_loops.h
new file mode 100644
index 00000000000..ddcf1c0fabe
--- /dev/null
+++ b/src/mesa/drivers/dri/r300/compiler/radeon_emulate_loops.h
@@ -0,0 +1,12 @@
+/* Loop emulation: rc_emulate_loops() removes BGNLOOP/ENDLOOP by unrolling,
+ * using max_instructions as the instruction budget for the unrolled loops. */
+#ifndef RADEON_EMULATE_LOOPS_H
+#define RADEON_EMULATE_LOOPS_H
+
+#define MAX_ITERATIONS 8 /* NOTE(review): unused by radeon_emulate_loops.c */
+
+struct radeon_compiler;
+
+void rc_emulate_loops(struct radeon_compiler *c, unsigned int max_instructions);
+
+#endif /* RADEON_EMULATE_LOOPS_H */
diff --git a/src/mesa/drivers/dri/r300/compiler/radeon_opcodes.c b/src/mesa/drivers/dri/r300/compiler/radeon_opcodes.c
index d593b3e81ae..1dc16855dc1 100644
--- a/src/mesa/drivers/dri/r300/compiler/radeon_opcodes.c
+++ b/src/mesa/drivers/dri/r300/compiler/radeon_opcodes.c
@@ -368,6 +368,24 @@ struct rc_opcode_info rc_opcodes[MAX_RC_OPCODE] = {
 		.NumSrcRegs = 0
 	},
 	{
+		.Opcode = RC_OPCODE_BGNLOOP,
+		.Name = "BGNLOOP",
+		.IsFlowControl = 1,
+		.NumSrcRegs = 0
+	},
+	{
+		.Opcode = RC_OPCODE_BRK,
+		.Name = "BRK",
+		.IsFlowControl = 1,
+		.NumSrcRegs = 0
+	},
+	{
+		.Opcode = RC_OPCODE_ENDLOOP,
+		.Name = "ENDLOOP",
+		.IsFlowControl = 1,
+		.NumSrcRegs = 0,
+	},
+	{
 		.Opcode = RC_OPCODE_REPL_ALPHA,
 		.Name = "REPL_ALPHA",
 		.HasDstReg = 1
diff --git a/src/mesa/drivers/dri/r300/compiler/radeon_opcodes.h b/src/mesa/drivers/dri/r300/compiler/radeon_opcodes.h
index 87a2e23084c..91c82ac0890 100644
--- a/src/mesa/drivers/dri/r300/compiler/radeon_opcodes.h
+++ b/src/mesa/drivers/dri/r300/compiler/radeon_opcodes.h
@@ -180,6 +180,12 @@ typedef enum {
 
 	/** branch instruction: has no effect */
 	RC_OPCODE_ENDIF,
+	
+	RC_OPCODE_BGNLOOP,
+
+	RC_OPCODE_BRK,
+
+	RC_OPCODE_ENDLOOP,
 
 	/** special instruction, used in R300-R500 fragment program pair instructions
 	 * indicates that the result of the alpha operation shall be replicated
diff --git a/src/mesa/drivers/dri/r300/r300_context.c b/src/mesa/drivers/dri/r300/r300_context.c
index 6992ca59dbf..e4b302bbad9 100644
--- a/src/mesa/drivers/dri/r300/r300_context.c
+++ b/src/mesa/drivers/dri/r300/r300_context.c
@@ -376,13 +376,12 @@ static void r300InitConstValues(GLcontext *ctx, radeonScreenPtr screen)
 	ctx->Const.MaxDrawBuffers = 1;
 	ctx->Const.MaxColorAttachments = 1;
 
-	/* currently bogus data */
 	if (r300->options.hw_tcl_enabled) {
-		ctx->Const.VertexProgram.MaxNativeInstructions = VSF_MAX_FRAGMENT_LENGTH / 4;
-		ctx->Const.VertexProgram.MaxNativeAluInstructions = VSF_MAX_FRAGMENT_LENGTH / 4;
-		ctx->Const.VertexProgram.MaxNativeAttribs = 16;	/* r420 */
+		ctx->Const.VertexProgram.MaxNativeInstructions = 255;
+		ctx->Const.VertexProgram.MaxNativeAluInstructions = 255;
+		ctx->Const.VertexProgram.MaxNativeAttribs = 16;
 		ctx->Const.VertexProgram.MaxNativeTemps = 32;
-		ctx->Const.VertexProgram.MaxNativeParameters = 256;	/* r420 */
+		ctx->Const.VertexProgram.MaxNativeParameters = 256;
 		ctx->Const.VertexProgram.MaxNativeAddressRegs = 1;
 	}
 
diff --git a/src/mesa/drivers/dri/r600/r700_assembler.c b/src/mesa/drivers/dri/r600/r700_assembler.c
index 61133e686f1..88d6b06df56 100644
--- a/src/mesa/drivers/dri/r600/r700_assembler.c
+++ b/src/mesa/drivers/dri/r600/r700_assembler.c
@@ -6159,7 +6159,7 @@ GLboolean callPreSub(r700_AssemblerBase* pAsm,
     }
     if(uNumValidSrc > 0)
     {
-        prelude_cf_ptr     = pAsm->cf_current_alu_clause_ptr;
+        prelude_cf_ptr     = (R700ControlFlowGenericClause*) pAsm->cf_current_alu_clause_ptr;
         pAsm->alu_x_opcode = SQ_CF_INST_ALU;
     }
 
@@ -6279,7 +6279,7 @@ GLboolean callPreSub(r700_AssemblerBase* pAsm,
 
         next_ins(pAsm);        
 
-        pAsm->callers[pAsm->unCallerArrayPointer - 1].finale_cf_ptr  = pAsm->cf_current_alu_clause_ptr;
+        pAsm->callers[pAsm->unCallerArrayPointer - 1].finale_cf_ptr  = (R700ControlFlowGenericClause*) pAsm->cf_current_alu_clause_ptr;
         pAsm->callers[pAsm->unCallerArrayPointer - 1].prelude_cf_ptr = prelude_cf_ptr;
         pAsm->alu_x_opcode = SQ_CF_INST_ALU;
     }
diff --git a/src/mesa/drivers/dri/r600/r700_fragprog.c b/src/mesa/drivers/dri/r600/r700_fragprog.c
index 5a90f729e68..aab1a7947ab 100644
--- a/src/mesa/drivers/dri/r600/r700_fragprog.c
+++ b/src/mesa/drivers/dri/r600/r700_fragprog.c
@@ -563,11 +563,15 @@ GLboolean r700SetupFragmentProgram(GLcontext * ctx)
 
     /* see if we need any point_sprite replacements, also increase num_interp
      * as there's no vp output for them */
-    for (i = FRAG_ATTRIB_TEX0; i<= FRAG_ATTRIB_TEX7; i++)
+    if (ctx->Point.PointSprite)
     {
-        if(ctx->Point.CoordReplace[i - FRAG_ATTRIB_TEX0] == GL_TRUE) {
-            ui++;
-            point_sprite = GL_TRUE;
+        for (i = FRAG_ATTRIB_TEX0; i<= FRAG_ATTRIB_TEX7; i++)
+        {
+            if (ctx->Point.CoordReplace[i - FRAG_ATTRIB_TEX0] == GL_TRUE)
+            {
+                ui++;
+                point_sprite = GL_TRUE;
+            }
         }
     }
 
@@ -670,8 +674,9 @@ GLboolean r700SetupFragmentProgram(GLcontext * ctx)
 
     for(i=0; i<8; i++)
     {
+	    GLboolean coord_replace = ctx->Point.PointSprite && ctx->Point.CoordReplace[i];
 	    unBit = 1 << (VERT_RESULT_TEX0 + i);
-	    if((OutputsWritten & unBit) || (ctx->Point.CoordReplace[i] == GL_TRUE))
+	    if ((OutputsWritten & unBit) || coord_replace)
 	    {
 		    ui = pAsm->uiFP_AttributeMap[FRAG_ATTRIB_TEX0 + i];
 		    SETbit(r700->SPI_PS_INPUT_CNTL[ui].u32All, SEL_CENTROID_bit);
@@ -679,7 +684,7 @@ GLboolean r700SetupFragmentProgram(GLcontext * ctx)
 			     SEMANTIC_shift, SEMANTIC_mask);
 		    CLEARbit(r700->SPI_PS_INPUT_CNTL[ui].u32All, FLAT_SHADE_bit);
 		    /* ARB_point_sprite */
-		    if(ctx->Point.CoordReplace[i] == GL_TRUE)
+		    if (coord_replace)
 		    {
 			     SETbit(r700->SPI_PS_INPUT_CNTL[ui].u32All, PT_SPRITE_TEX_bit);
 		    }
diff --git a/src/mesa/drivers/dri/radeon/radeon_texture.c b/src/mesa/drivers/dri/radeon/radeon_texture.c
index bcac125baf4..d2b190e42e0 100644
--- a/src/mesa/drivers/dri/radeon/radeon_texture.c
+++ b/src/mesa/drivers/dri/radeon/radeon_texture.c
@@ -593,12 +593,7 @@ static int image_matches_texture_obj(struct gl_texture_object *texObj,
 	if (!baseImage)
 		return 0;
 
-	/* Check image level against object BaseLevel, but not MaxLevel. MaxLevel is not
-	 * the highest level that can be assigned to the miptree.
-	 */
-	const unsigned maxLevel = texObj->BaseLevel + baseImage->MaxLog2;
-	if (level < texObj->BaseLevel || level > maxLevel
-			|| level > RADEON_MIPTREE_MAX_TEXTURE_LEVELS)
+	if (level < texObj->BaseLevel || level > texObj->MaxLevel)
 		return 0;
 
 	const unsigned levelDiff = level - texObj->BaseLevel;
@@ -620,7 +615,9 @@ static void teximage_assign_miptree(radeonContextPtr rmesa,
 	radeonTexObj *t = radeon_tex_obj(texObj);
 	radeon_texture_image* image = get_radeon_texture_image(texImage);
 
-	/* check image for dimension and level compatibility with texture */
+	/* Since miptree holds only images for levels <BaseLevel..MaxLevel>
+	 * don't allocate the miptree if the teximage won't fit.
+	 */
 	if (!image_matches_texture_obj(texObj, texImage, level))
 		return;
 
diff --git a/src/mesa/drivers/osmesa/Makefile b/src/mesa/drivers/osmesa/Makefile
index ea49a896590..c6b4a040851 100644
--- a/src/mesa/drivers/osmesa/Makefile
+++ b/src/mesa/drivers/osmesa/Makefile
@@ -20,17 +20,11 @@ INCLUDE_DIRS = \
 	-I$(TOP)/src/mesa \
 	-I$(TOP)/src/mesa/main
 
-# Standalone osmesa needs to be linked with core Mesa APIs
-ifeq ($(DRIVER_DIRS), osmesa)
 CORE_MESA = \
 	$(TOP)/src/mesa/libmesa.a \
 	$(TOP)/src/mapi/glapi/libglapi.a \
 	$(TOP)/src/glsl/cl/libglslcl.a \
 	$(TOP)/src/glsl/pp/libglslpp.a
-else
-CORE_MESA =
-endif
-
 
 .c.o:
 	$(CC) -c $(INCLUDE_DIRS) $(CFLAGS) $< -o $@
diff --git a/src/mesa/main/arbprogram.h b/src/mesa/main/arbprogram.h
index df16513e398..787ffd62f4b 100644
--- a/src/mesa/main/arbprogram.h
+++ b/src/mesa/main/arbprogram.h
@@ -27,6 +27,10 @@
 #define ARBPROGRAM_H
 
 
+#include "compiler.h"
+#include "glheader.h"
+
+
 extern void GLAPIENTRY
 _mesa_BindProgram(GLenum target, GLuint id);
 
diff --git a/src/mesa/main/fbobject.c b/src/mesa/main/fbobject.c
index 1a2e9b1da6f..48b9904642a 100644
--- a/src/mesa/main/fbobject.c
+++ b/src/mesa/main/fbobject.c
@@ -147,6 +147,8 @@ invalidate_framebuffer(struct gl_framebuffer *fb)
 /**
  * Given a GL_*_ATTACHMENTn token, return a pointer to the corresponding
  * gl_renderbuffer_attachment object.
+ * This function is only used for user-created FB objects, not the
+ * default / window-system FB object.
  * If \p attachment is GL_DEPTH_STENCIL_ATTACHMENT, return a pointer to
  * the depth buffer attachment point.
  */
@@ -156,6 +158,8 @@ _mesa_get_attachment(GLcontext *ctx, struct gl_framebuffer *fb,
 {
    GLuint i;
 
+   assert(fb->Name > 0);
+
    switch (attachment) {
    case GL_COLOR_ATTACHMENT0_EXT:
    case GL_COLOR_ATTACHMENT1_EXT:
@@ -195,6 +199,45 @@ _mesa_get_attachment(GLcontext *ctx, struct gl_framebuffer *fb,
 
 
 /**
+ * As above, but only used for getting attachments of the default /
+ * window-system framebuffer (not user-created framebuffer objects).
+ */
+static struct gl_renderbuffer_attachment *
+_mesa_get_fb0_attachment(GLcontext *ctx, struct gl_framebuffer *fb,
+                         GLenum attachment)
+{
+   assert(fb->Name == 0); /* only the default / window-system FB */
+
+   switch (attachment) {
+   case GL_FRONT_LEFT:
+      return &fb->Attachment[BUFFER_FRONT_LEFT];
+   case GL_FRONT_RIGHT:
+      return &fb->Attachment[BUFFER_FRONT_RIGHT];
+   case GL_BACK_LEFT:
+      return &fb->Attachment[BUFFER_BACK_LEFT];
+   case GL_BACK_RIGHT:
+      return &fb->Attachment[BUFFER_BACK_RIGHT];
+   case GL_AUX0:
+      if (fb->Visual.numAuxBuffers == 1) { /* NOTE(review): ">= 1"? confirm */
+         return &fb->Attachment[BUFFER_AUX0];
+      }
+      return NULL;
+   case GL_DEPTH_BUFFER:
+      /* fall-through / new in GL 3.0 */
+   case GL_DEPTH_ATTACHMENT_EXT:
+      return &fb->Attachment[BUFFER_DEPTH];
+   case GL_STENCIL_BUFFER:
+      /* fall-through / new in GL 3.0 */
+   case GL_STENCIL_ATTACHMENT_EXT:
+      return &fb->Attachment[BUFFER_STENCIL];
+   default:
+      return NULL; /* unrecognized token */
+   }
+}
+
+
+
+/**
  * Remove any texture or renderbuffer attached to the given attachment
  * point.  Update reference counts, etc.
  */
@@ -1878,12 +1921,14 @@ _mesa_GetFramebufferAttachmentParameterivEXT(GLenum target, GLenum attachment,
    }
 
    if (buffer->Name == 0) {
-      _mesa_error(ctx, GL_INVALID_OPERATION,
-                  "glGetFramebufferAttachmentParameterivEXT");
-      return;
+      /* the default / window-system FBO */
+      att = _mesa_get_fb0_attachment(ctx, buffer, attachment);
+   }
+   else {
+      /* user-created framebuffer FBO */
+      att = _mesa_get_attachment(ctx, buffer, attachment);
    }
 
-   att = _mesa_get_attachment(ctx, buffer, attachment);
    if (att == NULL) {
       _mesa_error(ctx, GL_INVALID_ENUM,
                   "glGetFramebufferAttachmentParameterivEXT(attachment)");
diff --git a/src/mesa/main/framebuffer.c b/src/mesa/main/framebuffer.c
index 01f84180af7..56558cfcc1e 100644
--- a/src/mesa/main/framebuffer.c
+++ b/src/mesa/main/framebuffer.c
@@ -879,6 +879,7 @@ _mesa_source_buffer_exists(GLcontext *ctx, GLenum format)
          return GL_FALSE;
       }
       ASSERT(_mesa_get_format_bits(ctx->ReadBuffer->_ColorReadBuffer->Format, GL_RED_BITS) > 0 ||
+             _mesa_get_format_bits(ctx->ReadBuffer->_ColorReadBuffer->Format, GL_ALPHA_BITS) > 0 ||
              _mesa_get_format_bits(ctx->ReadBuffer->_ColorReadBuffer->Format, GL_INDEX_BITS) > 0);
       break;
    case GL_DEPTH:
diff --git a/src/mesa/state_tracker/st_cb_texture.c b/src/mesa/state_tracker/st_cb_texture.c
index 2101b9bc18d..8f7ebeed976 100644
--- a/src/mesa/state_tracker/st_cb_texture.c
+++ b/src/mesa/state_tracker/st_cb_texture.c
@@ -1242,8 +1242,6 @@ st_CompressedTexSubImage2D(GLcontext *ctx, GLenum target, GLint level,
 
    assert(xoffset % util_format_get_blockwidth(pformat) == 0);
    assert(yoffset % util_format_get_blockheight(pformat) == 0);
-   assert(width % util_format_get_blockwidth(pformat) == 0);
-   assert(height % util_format_get_blockheight(pformat) == 0);
 
    for (y = 0; y < height; y += util_format_get_blockheight(pformat)) {
       /* don't need to adjust for xoffset and yoffset as st_texture_image_map does that */
diff --git a/src/mesa/swrast_setup/ss_triangle.c b/src/mesa/swrast_setup/ss_triangle.c
index bad0d819460..f22bc52f0a8 100644
--- a/src/mesa/swrast_setup/ss_triangle.c
+++ b/src/mesa/swrast_setup/ss_triangle.c
@@ -159,7 +159,7 @@ static void _swsetup_render_tri(GLcontext *ctx,
 }
 
 #define SS_COLOR(a,b) UNCLAMPED_FLOAT_TO_RGBA_CHAN(a,b)
-#define SS_SPEC(a,b) UNCLAMPED_FLOAT_TO_RGB_CHAN(a,b)
+#define SS_SPEC(a,b) COPY_4V(a,b)
 #define SS_IND(a,b) (a = b)
 
 #define IND (0)