99 files changed, 1723 insertions, 1175 deletions
diff --git a/src/mesa/drivers/dri/common/utils.c b/src/mesa/drivers/dri/common/utils.c
index c195c4fd8f5..42be77fd7c4 100644
--- a/src/mesa/drivers/dri/common/utils.c
+++ b/src/mesa/drivers/dri/common/utils.c
@@ -738,12 +738,18 @@ static const struct { unsigned int attrib, offset; } attribMap[] = {
 
 #define ARRAY_SIZE(a) (sizeof (a) / sizeof ((a)[0]))
 
+
+/**
+ * Return the value of a configuration attribute.  The attribute is
+ * indicated by the index.
+ */
 static int
 driGetConfigAttribIndex(const __DRIconfig *config,
 			unsigned int index, unsigned int *value)
 {
     switch (attribMap[index].attrib) {
     case __DRI_ATTRIB_RENDER_TYPE:
+        /* no support for color index mode */
 	*value = __DRI_ATTRIB_RGBA_BIT;
 	break;
     case __DRI_ATTRIB_CONFIG_CAVEAT:
@@ -755,13 +761,16 @@ driGetConfigAttribIndex(const __DRIconfig *config,
 	    *value = 0;
 	break;
     case __DRI_ATTRIB_SWAP_METHOD:
+        /* XXX no return value??? */
 	break;
 
     case __DRI_ATTRIB_FLOAT_MODE:
+        /* this field is not int-sized */
         *value = config->modes.floatMode;
         break;
 
     default:
+        /* any other int-sized field */
 	*value = *(unsigned int *)
 	    ((char *) &config->modes + attribMap[index].offset);
 	
@@ -771,6 +780,13 @@ driGetConfigAttribIndex(const __DRIconfig *config,
     return GL_TRUE;
 }
 
+
+/**
+ * Get the value of a configuration attribute.
+ * \param attrib  the attribute (one of the _DRI_ATTRIB_x tokens)
+ * \param value  returns the attribute's value
+ * \return 1 for success, 0 for failure
+ */
 int
 driGetConfigAttrib(const __DRIconfig *config,
 		   unsigned int attrib, unsigned int *value)
@@ -784,6 +800,14 @@ driGetConfigAttrib(const __DRIconfig *config,
     return GL_FALSE;
 }
 
+
+/**
+ * Get a configuration attribute name and value, given an index.
+ * \param index  which field of the __DRIconfig to query
+ * \param attrib  returns the attribute name (one of the _DRI_ATTRIB_x tokens)
+ * \param value  returns the attribute's value
+ * \return 1 for success, 0 for failure
+ */
 int
 driIndexConfigAttrib(const __DRIconfig *config, int index,
 		     unsigned int *attrib, unsigned int *value)
diff --git a/src/mesa/drivers/dri/i915/intel_structs.h b/src/mesa/drivers/dri/i915/intel_structs.h
deleted file mode 100644
index 522e3bd92c2..00000000000
--- a/src/mesa/drivers/dri/i915/intel_structs.h
+++ /dev/null
@@ -1,132 +0,0 @@
-#ifndef INTEL_STRUCTS_H
-#define INTEL_STRUCTS_H
-
-struct br0 {
-   GLuint length:8;
-   GLuint pad0:3;
-   GLuint dst_tiled:1;
-   GLuint pad1:8;
-   GLuint write_rgb:1;
-   GLuint write_alpha:1;
-   GLuint opcode:7;
-   GLuint client:3;
-};
-
-   
-struct br13 {
-   GLint dest_pitch:16;
-   GLuint rop:8;
-   GLuint color_depth:2;
-   GLuint pad1:3;
-   GLuint mono_source_transparency:1;
-   GLuint clipping_enable:1;
-   GLuint pad0:1;
-};
-
-
-
-/* This is an attempt to move some of the 2D interaction in this
- * driver to using structs for packets rather than a bunch of #defines
- * and dwords.
- */
-struct xy_color_blit {
-   struct br0 br0;
-   struct br13 br13;
-
-   struct {
-      GLuint dest_x1:16;
-      GLuint dest_y1:16;
-   } dw2;
-
-   struct {
-      GLuint dest_x2:16;
-      GLuint dest_y2:16;
-   } dw3;
-   
-   GLuint dest_base_addr;
-   GLuint color;
-};
-
-struct xy_src_copy_blit {
-   struct br0 br0;
-   struct br13 br13;
-
-   struct {
-      GLuint dest_x1:16;
-      GLuint dest_y1:16;
-   } dw2;
-
-   struct {
-      GLuint dest_x2:16;
-      GLuint dest_y2:16;
-   } dw3;
-   
-   GLuint dest_base_addr;
-
-   struct {
-      GLuint src_x1:16;
-      GLuint src_y1:16;
-   } dw5;
-
-   struct {
-      GLint src_pitch:16;
-      GLuint pad:16;
-   } dw6;
-   
-   GLuint src_base_addr;
-};
-
-struct xy_setup_blit {
-   struct br0 br0;
-   struct br13 br13;
-
-   struct {
-      GLuint clip_x1:16;
-      GLuint clip_y1:16;
-   } dw2;
-
-   struct {
-      GLuint clip_x2:16;
-      GLuint clip_y2:16;
-   } dw3;
-      
-   GLuint dest_base_addr;
-   GLuint background_color;
-   GLuint foreground_color;
-   GLuint pattern_base_addr;
-};
-
-
-struct xy_text_immediate_blit {
-   struct {
-      GLuint length:8;
-      GLuint pad2:3;
-      GLuint dst_tiled:1;
-      GLuint pad1:4;
-      GLuint byte_packed:1;
-      GLuint pad0:5;
-      GLuint opcode:7;
-      GLuint client:3;
-   } dw0;
-
-   struct {
-      GLuint dest_x1:16;
-      GLuint dest_y1:16;
-   } dw1;
-
-   struct {
-      GLuint dest_x2:16;
-      GLuint dest_y2:16;
-   } dw2;   
-
-   /* Src bitmap data follows as inline dwords.
-    */
-};
-
-
-#define CLIENT_2D 0x2
-#define OPCODE_XY_SETUP_BLT 0x1
-#define OPCODE_XY_COLOR_BLT 0x50
-#define OPCODE_XY_TEXT_IMMEDIATE_BLT 0x31
-
-#endif
diff --git a/src/mesa/drivers/dri/i965/brw_cc.c b/src/mesa/drivers/dri/i965/brw_cc.c
index 00418760da3..a8369b07c35 100644
--- a/src/mesa/drivers/dri/i965/brw_cc.c
+++ b/src/mesa/drivers/dri/i965/brw_cc.c
@@ -204,7 +204,7 @@ static void upload_cc_unit(struct brw_context *brw)
       cc.cc2.depth_write_enable = ctx->Depth.Mask;
    }
 
-   if (intel->stats_wm || (INTEL_DEBUG & DEBUG_STATS))
+   if (intel->stats_wm || unlikely(INTEL_DEBUG & DEBUG_STATS))
       cc.cc5.statistics_enable = 1;
 
    /* CACHE_NEW_CC_VP */
diff --git a/src/mesa/drivers/dri/i965/brw_clip.c b/src/mesa/drivers/dri/i965/brw_clip.c
index 15e60bf3ce3..1be165cc9a1 100644
--- a/src/mesa/drivers/dri/i965/brw_clip.c
+++ b/src/mesa/drivers/dri/i965/brw_clip.c
@@ -133,13 +133,13 @@ static void compile_clip_prog( struct brw_context *brw,
     */
    program = brw_get_program(&c.func, &program_size);
 
-    if (INTEL_DEBUG & DEBUG_CLIP) {
+   if (unlikely(INTEL_DEBUG & DEBUG_CLIP)) {
       printf("clip:\n");
       for (i = 0; i < program_size / sizeof(struct brw_instruction); i++)
 	 brw_disasm(stdout, &((struct brw_instruction *)program)[i],
 		    intel->gen);
       printf("\n");
-    }
+   }
 
    /* Upload
     */
diff --git a/src/mesa/drivers/dri/i965/brw_clip_state.c b/src/mesa/drivers/dri/i965/brw_clip_state.c
index 885167da908..60fd5fa7d9e 100644
--- a/src/mesa/drivers/dri/i965/brw_clip_state.c
+++ b/src/mesa/drivers/dri/i965/brw_clip_state.c
@@ -114,10 +114,10 @@ clip_unit_create_from_key(struct brw_context *brw,
       clip.thread4.max_threads = 1 - 1;
    }
 
-   if (INTEL_DEBUG & DEBUG_SINGLE_THREAD)
+   if (unlikely(INTEL_DEBUG & DEBUG_SINGLE_THREAD))
       clip.thread4.max_threads = 0;
 
-   if (INTEL_DEBUG & DEBUG_STATS)
+   if (unlikely(INTEL_DEBUG & DEBUG_STATS))
       clip.thread4.stats_enable = 1;
 
    clip.clip5.userclip_enable_flags = 0x7f;
diff --git a/src/mesa/drivers/dri/i965/brw_context.c b/src/mesa/drivers/dri/i965/brw_context.c
index 3c4ae8a7a4f..cb0a8b96c9c 100644
--- a/src/mesa/drivers/dri/i965/brw_context.c
+++ b/src/mesa/drivers/dri/i965/brw_context.c
@@ -124,7 +124,7 @@ GLboolean brwCreateContext( int api,
 	 (i == MESA_SHADER_FRAGMENT);
 
       if (intel->gen == 6)
-	 ctx->ShaderCompilerOptions[i].EmitNoIfs = GL_TRUE;
+	 ctx->ShaderCompilerOptions[i].EmitNoIfs = (i == MESA_SHADER_VERTEX);
    }
 
    ctx->Const.VertexProgram.MaxNativeInstructions = (16 * 1024);
diff --git a/src/mesa/drivers/dri/i965/brw_defines.h b/src/mesa/drivers/dri/i965/brw_defines.h
index 6c3db61035a..239586a0366 100644
--- a/src/mesa/drivers/dri/i965/brw_defines.h
+++ b/src/mesa/drivers/dri/i965/brw_defines.h
@@ -930,6 +930,11 @@
 #define CMD_3D_CLIP_STATE		      0x7812 /* GEN6+ */
 /* DW1 */
 # define GEN6_CLIP_STATISTICS_ENABLE			(1 << 10)
+/**
+ * Just does cheap culling based on the clip distance.  Bits must be
+ * disjoint with USER_CLIP_CLIP_DISTANCE bits.
+ */
+# define GEN6_USER_CLIP_CULL_DISTANCES_SHIFT		0
 /* DW2 */
 # define GEN6_CLIP_ENABLE				(1 << 31)
 # define GEN6_CLIP_API_OGL				(0 << 30)
@@ -937,6 +942,8 @@
 # define GEN6_CLIP_XY_TEST				(1 << 28)
 # define GEN6_CLIP_Z_TEST				(1 << 27)
 # define GEN6_CLIP_GB_TEST				(1 << 26)
+/** 8-bit field of which user clip distances to clip against. */
+# define GEN6_USER_CLIP_CLIP_DISTANCES_SHIFT		16
 # define GEN6_CLIP_MODE_NORMAL				(0 << 13)
 # define GEN6_CLIP_MODE_REJECT_ALL			(3 << 13)
 # define GEN6_CLIP_MODE_ACCEPT_ALL			(4 << 13)
diff --git a/src/mesa/drivers/dri/i965/brw_draw.c b/src/mesa/drivers/dri/i965/brw_draw.c
index 04bc8cb2db0..a1f403ca4e6 100644
--- a/src/mesa/drivers/dri/i965/brw_draw.c
+++ b/src/mesa/drivers/dri/i965/brw_draw.c
@@ -42,7 +42,7 @@
 
 #include "intel_batchbuffer.h"
 
-#define FILE_DEBUG_FLAG DEBUG_BATCH
+#define FILE_DEBUG_FLAG DEBUG_PRIMS
 
 static GLuint prim_to_hw_prim[GL_POLYGON+1] = {
    _3DPRIM_POINTLIST,
@@ -83,8 +83,7 @@ static GLuint brw_set_prim(struct brw_context *brw,
    struct gl_context *ctx = &brw->intel.ctx;
    GLenum mode = prim->mode;
 
-   if (INTEL_DEBUG & DEBUG_PRIMS)
-      printf("PRIM: %s\n", _mesa_lookup_enum_by_nr(prim->mode));
+   DBG("PRIM: %s\n", _mesa_lookup_enum_by_nr(prim->mode));
 
    /* Slight optimization to avoid the GS program when not needed:
     */
@@ -133,9 +132,8 @@ static void brw_emit_prim(struct brw_context *brw,
    struct brw_3d_primitive prim_packet;
    struct intel_context *intel = &brw->intel;
 
-   if (INTEL_DEBUG & DEBUG_PRIMS)
-      printf("PRIM: %s %d %d\n", _mesa_lookup_enum_by_nr(prim->mode), 
-		   prim->start, prim->count);
+   DBG("PRIM: %s %d %d\n", _mesa_lookup_enum_by_nr(prim->mode),
+       prim->start, prim->count);
 
    prim_packet.header.opcode = CMD_3D_PRIM;
    prim_packet.header.length = sizeof(prim_packet)/4 - 2;
diff --git a/src/mesa/drivers/dri/i965/brw_draw_upload.c b/src/mesa/drivers/dri/i965/brw_draw_upload.c
index c4654360d46..2cefe614dd2 100644
--- a/src/mesa/drivers/dri/i965/brw_draw_upload.c
+++ b/src/mesa/drivers/dri/i965/brw_draw_upload.c
@@ -168,7 +168,7 @@ static GLuint byte_types_scale[5] = {
 static GLuint get_surface_type( GLenum type, GLuint size,
                                 GLenum format, GLboolean normalized )
 {
-   if (INTEL_DEBUG & DEBUG_VERTS)
+   if (unlikely(INTEL_DEBUG & DEBUG_VERTS))
       printf("type %s size %d normalized %d\n", 
 		   _mesa_lookup_enum_by_nr(type), size, normalized);
 
diff --git a/src/mesa/drivers/dri/i965/brw_eu_emit.c b/src/mesa/drivers/dri/i965/brw_eu_emit.c
index 9cb99a2b999..9cb941dacfd 100644
--- a/src/mesa/drivers/dri/i965/brw_eu_emit.c
+++ b/src/mesa/drivers/dri/i965/brw_eu_emit.c
@@ -969,7 +969,7 @@ void brw_ENDIF(struct brw_compile *p,
 	 brw_set_src0(insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
 	 brw_set_src1(insn, brw_imm_d(0x0));
       } else {
-	 brw_set_dest(insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_W));
+	 brw_set_dest(insn, brw_imm_w(0));
 	 brw_set_src0(insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
 	 brw_set_src1(insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
       }
diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp
index 2ed59d3f5d4..283d5aad496 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
@@ -89,8 +89,6 @@ brw_compile_shader(struct gl_context *ctx, struct gl_shader *shader)
 GLboolean
 brw_link_shader(struct gl_context *ctx, struct gl_shader_program *prog)
 {
-   struct intel_context *intel = intel_context(ctx);
-
    struct brw_shader *shader =
       (struct brw_shader *)prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
    if (shader != NULL) {
@@ -132,9 +130,6 @@ brw_link_shader(struct gl_context *ctx, struct gl_shader_program *prog)
 						GL_TRUE, /* temp */
 						GL_TRUE /* uniform */
 						) || progress;
-	 if (intel->gen == 6) {
-	    progress = do_if_to_cond_assign(shader->ir) || progress;
-	 }
       } while (progress);
 
       validate_ir_tree(shader->ir);
@@ -3129,7 +3124,7 @@ fs_visitor::generate_code()
    const char *last_annotation_string = NULL;
    ir_instruction *last_annotation_ir = NULL;
 
-   if (INTEL_DEBUG & DEBUG_WM) {
+   if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
       printf("Native code for fragment shader %d:\n",
 	     ctx->Shader.CurrentFragmentProgram->Name);
    }
@@ -3141,7 +3136,7 @@ fs_visitor::generate_code()
       fs_inst *inst = (fs_inst *)iter.get();
       struct brw_reg src[3], dst;
 
-      if (INTEL_DEBUG & DEBUG_WM) {
+      if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
 	 if (last_annotation_ir != inst->ir) {
 	    last_annotation_ir = inst->ir;
 	    if (last_annotation_ir) {
@@ -3335,7 +3330,7 @@ fs_visitor::generate_code()
 	 this->fail = true;
       }
 
-      if (INTEL_DEBUG & DEBUG_WM) {
+      if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
 	 for (unsigned int i = last_native_inst; i < p->nr_insn; i++) {
 	    if (0) {
 	       printf("0x%08x 0x%08x 0x%08x 0x%08x ",
@@ -3376,7 +3371,7 @@ brw_wm_fs_emit(struct brw_context *brw, struct brw_wm_compile *c)
     */
    c->dispatch_width = 8;
 
-   if (INTEL_DEBUG & DEBUG_WM) {
+   if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
       printf("GLSL IR for native fragment shader %d:\n", prog->Name);
       _mesa_print_ir(shader->ir, NULL);
       printf("\n");
diff --git a/src/mesa/drivers/dri/i965/brw_gs.c b/src/mesa/drivers/dri/i965/brw_gs.c
index cfcc8ea4d6a..b0c76f4094d 100644
--- a/src/mesa/drivers/dri/i965/brw_gs.c
+++ b/src/mesa/drivers/dri/i965/brw_gs.c
@@ -127,8 +127,8 @@ static void compile_gs_prog( struct brw_context *brw,
     */
    program = brw_get_program(&c.func, &program_size);
 
-    if (INTEL_DEBUG & DEBUG_GS) {
-       int i;
+   if (unlikely(INTEL_DEBUG & DEBUG_GS)) {
+      int i;
 
       printf("gs:\n");
       for (i = 0; i < program_size / sizeof(struct brw_instruction); i++)
diff --git a/src/mesa/drivers/dri/i965/brw_gs_state.c b/src/mesa/drivers/dri/i965/brw_gs_state.c
index 63562ebcfc2..69a5f7a6667 100644
--- a/src/mesa/drivers/dri/i965/brw_gs_state.c
+++ b/src/mesa/drivers/dri/i965/brw_gs_state.c
@@ -101,7 +101,7 @@ gs_unit_create_from_key(struct brw_context *brw, struct brw_gs_unit_key *key)
    if (intel->gen == 5)
       gs.thread4.rendering_enable = 1;
 
-   if (INTEL_DEBUG & DEBUG_STATS)
+   if (unlikely(INTEL_DEBUG & DEBUG_STATS))
       gs.thread4.stats_enable = 1;
 
    bo = brw_upload_cache(&brw->cache, BRW_GS_UNIT,
diff --git a/src/mesa/drivers/dri/i965/brw_misc_state.c b/src/mesa/drivers/dri/i965/brw_misc_state.c
index 24041e57b00..1d350bc0413 100644
--- a/src/mesa/drivers/dri/i965/brw_misc_state.c
+++ b/src/mesa/drivers/dri/i965/brw_misc_state.c
@@ -555,7 +555,7 @@ static void upload_invarient_state( struct brw_context *brw )
       memset(&vfs, 0, sizeof(vfs));
 
       vfs.opcode = brw->CMD_VF_STATISTICS;
-      if (INTEL_DEBUG & DEBUG_STATS)
+      if (unlikely(INTEL_DEBUG & DEBUG_STATS))
 	 vfs.statistics_enable = 1; 
 
       BRW_BATCH_STRUCT(brw, &vfs);
diff --git a/src/mesa/drivers/dri/i965/brw_sf.c b/src/mesa/drivers/dri/i965/brw_sf.c
index 7dbd70daaea..6da155b1a9b 100644
--- a/src/mesa/drivers/dri/i965/brw_sf.c
+++ b/src/mesa/drivers/dri/i965/brw_sf.c
@@ -108,7 +108,7 @@ static void compile_sf_prog( struct brw_context *brw,
     */
    program = brw_get_program(&c.func, &program_size);
 
-   if (INTEL_DEBUG & DEBUG_SF) {
+   if (unlikely(INTEL_DEBUG & DEBUG_SF)) {
       printf("sf:\n");
       for (i = 0; i < program_size / sizeof(struct brw_instruction); i++)
 	 brw_disasm(stdout, &((struct brw_instruction *)program)[i],
diff --git a/src/mesa/drivers/dri/i965/brw_sf_state.c b/src/mesa/drivers/dri/i965/brw_sf_state.c
index 6ad9e1b48a4..bd3a21ed9e2 100644
--- a/src/mesa/drivers/dri/i965/brw_sf_state.c
+++ b/src/mesa/drivers/dri/i965/brw_sf_state.c
@@ -210,10 +210,10 @@ sf_unit_create_from_key(struct brw_context *brw, struct brw_sf_unit_key *key,
 
    sf.thread4.max_threads = MIN2(chipset_max_threads, key->nr_urb_entries) - 1;
 
-   if (INTEL_DEBUG & DEBUG_SINGLE_THREAD)
+   if (unlikely(INTEL_DEBUG & DEBUG_SINGLE_THREAD))
       sf.thread4.max_threads = 0;
 
-   if (INTEL_DEBUG & DEBUG_STATS)
+   if (unlikely(INTEL_DEBUG & DEBUG_STATS))
       sf.thread4.stats_enable = 1;
 
    /* CACHE_NEW_SF_VP */
diff --git a/src/mesa/drivers/dri/i965/brw_state_cache.c b/src/mesa/drivers/dri/i965/brw_state_cache.c
index b31d84953a1..58ff528d44b 100644
--- a/src/mesa/drivers/dri/i965/brw_state_cache.c
+++ b/src/mesa/drivers/dri/i965/brw_state_cache.c
@@ -61,6 +61,7 @@
 #include "intel_batchbuffer.h"
 #include "brw_wm.h"
 
+#define FILE_DEBUG_FLAG DEBUG_STATE
 
 static GLuint
 hash_key(struct brw_cache_item *item)
@@ -265,10 +266,9 @@ brw_upload_cache_with_auxdata(struct brw_cache *cache,
       *(void **)aux_return = (void *)((char *)item->key + item->key_size);
    }
 
-   if (INTEL_DEBUG & DEBUG_STATE)
-      printf("upload %s: %d bytes to cache id %d\n",
-		   cache->name[cache_id],
-		   data_size, cache_id);
+   DBG("upload %s: %d bytes to cache id %d\n",
+       cache->name[cache_id],
+       data_size, cache_id);
 
    /* Copy data to the buffer */
    drm_intel_bo_subdata(bo, 0, data_size, data);
@@ -407,8 +407,7 @@ brw_clear_cache(struct brw_context *brw, struct brw_cache *cache)
    struct brw_cache_item *c, *next;
    GLuint i;
 
-   if (INTEL_DEBUG & DEBUG_STATE)
-      printf("%s\n", __FUNCTION__);
+   DBG("%s\n", __FUNCTION__);
 
    for (i = 0; i < cache->size; i++) {
       for (c = cache->items[i]; c; c = next) {
@@ -434,8 +433,7 @@ brw_clear_cache(struct brw_context *brw, struct brw_cache *cache)
 void
 brw_state_cache_check_size(struct brw_context *brw)
 {
-   if (INTEL_DEBUG & DEBUG_STATE)
-      printf("%s (n_items=%d)\n", __FUNCTION__, brw->cache.n_items);
+   DBG("%s (n_items=%d)\n", __FUNCTION__, brw->cache.n_items);
 
    /* un-tuned guess.  Each object is generally a page, so 1000 of them is 4 MB of
     * state cache.
@@ -450,8 +448,7 @@ brw_destroy_cache(struct brw_context *brw, struct brw_cache *cache)
 {
    GLuint i;
 
-   if (INTEL_DEBUG & DEBUG_STATE)
-      printf("%s\n", __FUNCTION__);
+   DBG("%s\n", __FUNCTION__);
 
    brw_clear_cache(brw, cache);
    for (i = 0; i < BRW_MAX_CACHE; i++) {
diff --git a/src/mesa/drivers/dri/i965/brw_state_upload.c b/src/mesa/drivers/dri/i965/brw_state_upload.c
index f3b6a90f61a..338f3876b31 100644
--- a/src/mesa/drivers/dri/i965/brw_state_upload.c
+++ b/src/mesa/drivers/dri/i965/brw_state_upload.c
@@ -435,7 +435,7 @@ void brw_upload_state(struct brw_context *brw)
 
    brw_clear_validated_bos(brw);
 
-   if (INTEL_DEBUG) {
+   if (unlikely(INTEL_DEBUG)) {
       /* Debug version which enforces various sanity checks on the
        * state flags which are generated and checked to help ensure
        * state atoms are ordered correctly in the list.
@@ -487,7 +487,7 @@ void brw_upload_state(struct brw_context *brw)
       }
    }
 
-   if (INTEL_DEBUG & DEBUG_STATE) {
+   if (unlikely(INTEL_DEBUG & DEBUG_STATE)) {
       brw_update_dirty_count(mesa_bits, state->mesa);
       brw_update_dirty_count(brw_bits, state->brw);
       brw_update_dirty_count(cache_bits, state->cache);
diff --git a/src/mesa/drivers/dri/i965/brw_urb.c b/src/mesa/drivers/dri/i965/brw_urb.c
index 0f597184b42..dfc1551aca6 100644
--- a/src/mesa/drivers/dri/i965/brw_urb.c
+++ b/src/mesa/drivers/dri/i965/brw_urb.c
@@ -190,12 +190,12 @@ static void recalculate_urb_fence( struct brw_context *brw )
 	    exit(1);
 	 }
 	 
-	 if (INTEL_DEBUG & (DEBUG_URB|DEBUG_FALLBACKS))
+	 if (unlikely(INTEL_DEBUG & (DEBUG_URB|DEBUG_FALLBACKS)))
 	    printf("URB CONSTRAINED\n");
       }
 
 done:
-      if (INTEL_DEBUG & DEBUG_URB)
+      if (unlikely(INTEL_DEBUG & DEBUG_URB))
 	 printf("URB fence: %d ..VS.. %d ..GS.. %d ..CLP.. %d ..SF.. %d ..CS.. %d\n",
 		      brw->urb.vs_start,
 		      brw->urb.gs_start,
diff --git a/src/mesa/drivers/dri/i965/brw_vs_emit.c b/src/mesa/drivers/dri/i965/brw_vs_emit.c
index ce334799965..7e43324a1f9 100644
--- a/src/mesa/drivers/dri/i965/brw_vs_emit.c
+++ b/src/mesa/drivers/dri/i965/brw_vs_emit.c
@@ -165,13 +165,20 @@ static void brw_vs_alloc_regs( struct brw_vs_compile *c )
    /* User clip planes from curbe: 
     */
    if (c->key.nr_userclip) {
-      for (i = 0; i < c->key.nr_userclip; i++) {
-	 c->userplane[i] = stride( brw_vec4_grf(reg+3+i/2, (i%2) * 4), 0, 4, 1);
-      }     
+      if (intel->gen >= 6) {
+	 for (i = 0; i < c->key.nr_userclip; i++) {
+	    c->userplane[i] = stride(brw_vec4_grf(reg + i / 2,
+						  (i % 2) * 4), 0, 4, 1);
+	 }
+	 reg += ALIGN(c->key.nr_userclip, 2) / 2;
+      } else {
+	 for (i = 0; i < c->key.nr_userclip; i++) {
+	    c->userplane[i] = stride(brw_vec4_grf(reg + (6 + i) / 2,
+						  (i % 2) * 4), 0, 4, 1);
+	 }
+	 reg += (ALIGN(6 + c->key.nr_userclip, 4) / 4) * 2;
+      }
 
-      /* Deal with curbe alignment:
-       */
-      reg += ((6 + c->key.nr_userclip + 3) / 4) * 2;
    }
 
    /* Vertex program parameters from curbe:
@@ -253,9 +260,11 @@ static void brw_vs_alloc_regs( struct brw_vs_compile *c )
    c->first_output = reg;
    c->first_overflow_output = 0;
 
-   if (intel->gen >= 6)
-      mrf = 3; /* no more pos store in attribute */
-   else if (intel->gen == 5)
+   if (intel->gen >= 6) {
+      mrf = 3;
+      if (c->key.nr_userclip)
+	 mrf += 2;
+   } else if (intel->gen == 5)
       mrf = 8;
    else
       mrf = 4;
@@ -372,16 +381,20 @@ static void brw_vs_alloc_regs( struct brw_vs_compile *c )
    /* See emit_vertex_write() for where the VUE's overhead on top of the
     * attributes comes from.
     */
-   if (intel->gen >= 6)
-      c->prog_data.urb_entry_size = (attributes_in_vue + 2 + 7) / 8;
-   else if (intel->gen == 5)
+   if (intel->gen >= 6) {
+      int header_regs = 2;
+      if (c->key.nr_userclip)
+	 header_regs += 2;
+
+      c->prog_data.urb_entry_size = (attributes_in_vue + header_regs + 7) / 8;
+   } else if (intel->gen == 5)
       c->prog_data.urb_entry_size = (attributes_in_vue + 6 + 3) / 4;
    else
       c->prog_data.urb_entry_size = (attributes_in_vue + 2 + 3) / 4;
 
    c->prog_data.total_grf = reg;
 
-   if (INTEL_DEBUG & DEBUG_VS) {
+   if (unlikely(INTEL_DEBUG & DEBUG_VS)) {
       printf("%s NumAddrRegs %d\n", __FUNCTION__, c->vp->program.Base.NumAddressRegs);
       printf("%s NumTemps %d\n", __FUNCTION__, c->vp->program.Base.NumTemporaries);
       printf("%s reg = %d\n", __FUNCTION__, reg);
@@ -576,12 +589,11 @@ static void emit_min( struct brw_compile *p,
    brw_set_predicate_control(p, BRW_PREDICATE_NONE);
 }
 
-
-static void emit_math1( struct brw_vs_compile *c,
-			GLuint function,
-			struct brw_reg dst,
-			struct brw_reg arg0,
-			GLuint precision)
+static void emit_math1_gen4(struct brw_vs_compile *c,
+			    GLuint function,
+			    struct brw_reg dst,
+			    struct brw_reg arg0,
+			    GLuint precision)
 {
    /* There are various odd behaviours with SEND on the simulator.  In
     * addition there are documented issues with the fact that the GEN4
@@ -591,14 +603,11 @@ static void emit_math1( struct brw_vs_compile *c,
     * whether that turns out to be a simulator bug or not:
     */
    struct brw_compile *p = &c->func;
-   struct intel_context *intel = &p->brw->intel;
    struct brw_reg tmp = dst;
    GLboolean need_tmp = GL_FALSE;
 
-   if (dst.file != BRW_GENERAL_REGISTER_FILE)
-      need_tmp = GL_TRUE;
-
-   if (intel->gen < 6 && dst.dw1.bits.writemask != 0xf)
+   if (dst.file != BRW_GENERAL_REGISTER_FILE ||
+       dst.dw1.bits.writemask != 0xf)
       need_tmp = GL_TRUE;
 
    if (need_tmp)
@@ -619,6 +628,57 @@ static void emit_math1( struct brw_vs_compile *c,
    }
 }
 
+static void
+emit_math1_gen6(struct brw_vs_compile *c,
+		GLuint function,
+		struct brw_reg dst,
+		struct brw_reg arg0,
+		GLuint precision)
+{
+   struct brw_compile *p = &c->func;
+   struct brw_reg tmp_src, tmp_dst;
+
+   /* Something is strange on gen6 math in 16-wide mode, though the
+    * docs say it's supposed to work.  Punt to using align1 mode,
+    * which doesn't do writemasking and swizzles.
+    */
+   tmp_src = get_tmp(c);
+   tmp_dst = get_tmp(c);
+
+   brw_MOV(p, tmp_src, arg0);
+
+   brw_set_access_mode(p, BRW_ALIGN_1);
+   brw_math(p,
+	    tmp_dst,
+	    function,
+	    BRW_MATH_SATURATE_NONE,
+	    2,
+	    tmp_src,
+	    BRW_MATH_DATA_SCALAR,
+	    precision);
+   brw_set_access_mode(p, BRW_ALIGN_16);
+
+   brw_MOV(p, dst, tmp_dst);
+
+   release_tmp(c, tmp_src);
+   release_tmp(c, tmp_dst);
+}
+
+static void
+emit_math1(struct brw_vs_compile *c,
+	   GLuint function,
+	   struct brw_reg dst,
+	   struct brw_reg arg0,
+	   GLuint precision)
+{
+   struct brw_compile *p = &c->func;
+   struct intel_context *intel = &p->brw->intel;
+
+   if (intel->gen >= 6)
+      emit_math1_gen6(c, function, dst, arg0, precision);
+   else
+      emit_math1_gen4(c, function, dst, arg0, precision);
+}
 
 static void emit_math2( struct brw_vs_compile *c, 
 			GLuint function,
@@ -1392,9 +1452,33 @@ static void emit_vertex_write( struct brw_vs_compile *c)
    /* Update the header for point size, user clipping flags, and -ve rhw
     * workaround.
     */
-   if ((c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_PSIZ)) ||
-       c->key.nr_userclip || brw->has_negative_rhw_bug)
-   {
+   if (intel->gen >= 6) {
+      struct brw_reg m1 = brw_message_reg(1);
+
+      /* On gen6, m1 has each value in a separate dword, so we never
+       * need to mess with a temporary for computing the m1 value.
+       */
+      brw_MOV(p, retype(m1, BRW_REGISTER_TYPE_UD), brw_imm_ud(0));
+      if (c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_PSIZ)) {
+	 brw_MOV(p, brw_writemask(m1, WRITEMASK_W),
+		 brw_swizzle1(c->regs[PROGRAM_OUTPUT][VERT_RESULT_PSIZ], 0));
+      }
+
+      /* Set the user clip distances in dwords 8-15 (m3-m4), four per MRF. */
+      if (c->key.nr_userclip) {
+	 for (i = 0; i < c->key.nr_userclip; i++) {
+	    struct brw_reg m;
+	    if (i < 4)
+	       m = brw_message_reg(3);
+	    else
+	       m = brw_message_reg(4);
+
+	    brw_DP4(p, brw_writemask(m, (1 << (i & 3))),pos, c->userplane[i]);
+	 }
+      }
+   } else if ((c->prog_data.outputs_written &
+	       BITFIELD64_BIT(VERT_RESULT_PSIZ)) ||
+	      c->key.nr_userclip || brw->has_negative_rhw_bug) {
       struct brw_reg header1 = retype(get_tmp(c), BRW_REGISTER_TYPE_UD);
       GLuint i;
 
@@ -1404,11 +1488,10 @@ static void emit_vertex_write( struct brw_vs_compile *c)
 
       if (c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_PSIZ)) {
 	 struct brw_reg psiz = c->regs[PROGRAM_OUTPUT][VERT_RESULT_PSIZ];
-	 if (intel->gen < 6) {
-	     brw_MUL(p, brw_writemask(header1, WRITEMASK_W), brw_swizzle1(psiz, 0), brw_imm_f(1<<11));
-	     brw_AND(p, brw_writemask(header1, WRITEMASK_W), header1, brw_imm_ud(0x7ff<<8));
-	 } else
-	     brw_MOV(p, brw_writemask(header1, WRITEMASK_W), brw_swizzle1(psiz, 0));
+	 brw_MUL(p, brw_writemask(header1, WRITEMASK_W),
+		 brw_swizzle1(psiz, 0), brw_imm_f(1<<11));
+	 brw_AND(p, brw_writemask(header1, WRITEMASK_W),
+		 header1, brw_imm_ud(0x7ff<<8));
       }
 
       for (i = 0; i < c->key.nr_userclip; i++) {
@@ -1461,12 +1544,14 @@ static void emit_vertex_write( struct brw_vs_compile *c)
        * dword 0-3 (m1) of the header is indices, point width, clip flags.
        * dword 4-7 (m2) is the 4D space position
        * dword 8-15 (m3,m4) of the vertex header is the user clip distance if
-       * enabled.  We don't use it, so skip it.
-       * m3 is the first vertex element data we fill, which is the vertex
-       * position.
+       * enabled.
+       * m3 or 5 is the first vertex element data we fill, which is
+       * the vertex position.
        */
       brw_MOV(p, brw_message_reg(2), pos);
       len_vertex_header = 1;
+      if (c->key.nr_userclip > 0)
+	 len_vertex_header += 2;
    } else if (intel->gen == 5) {
       /* There are 20 DWs (D0-D19) in VUE header on Ironlake:
        * dword 0-3 (m1) of the header is indices, point width, clip flags.
@@ -1640,17 +1725,13 @@ void brw_vs_emit(struct brw_vs_compile *c )
    GLuint index;
    GLuint file;
 
-   if (INTEL_DEBUG & DEBUG_VS) {
+   if (unlikely(INTEL_DEBUG & DEBUG_VS)) {
       printf("vs-mesa:\n");
       _mesa_fprint_program_opt(stdout, &c->vp->program.Base, PROG_PRINT_DEBUG,
 			       GL_TRUE);
       printf("\n");
    }
 
-   /* FIXME Need to fix conditional instruction to remove this */
-   if (intel->gen >= 6)
-       p->single_program_flow = GL_TRUE;
-
    brw_set_compression_control(p, BRW_COMPRESSION_NONE);
    brw_set_access_mode(p, BRW_ALIGN_16);
    if_depth_in_loop[loop_depth] = 0;
@@ -2010,7 +2091,7 @@ void brw_vs_emit(struct brw_vs_compile *c )
 
    brw_optimize(p);
 
-   if (INTEL_DEBUG & DEBUG_VS) {
+   if (unlikely(INTEL_DEBUG & DEBUG_VS)) {
       int i;
 
       printf("vs-native:\n");
diff --git a/src/mesa/drivers/dri/i965/brw_vs_state.c b/src/mesa/drivers/dri/i965/brw_vs_state.c
index ebae94269f9..be923138617 100644
--- a/src/mesa/drivers/dri/i965/brw_vs_state.c
+++ b/src/mesa/drivers/dri/i965/brw_vs_state.c
@@ -154,7 +154,7 @@ vs_unit_create_from_key(struct brw_context *brw, struct brw_vs_unit_key *key)
     */
    vs.vs5.sampler_count = 0;
 
-   if (INTEL_DEBUG & DEBUG_STATS)
+   if (unlikely(INTEL_DEBUG & DEBUG_STATS))
       vs.thread4.stats_enable = 1;
 
    /* Vertex program always enabled:
diff --git a/src/mesa/drivers/dri/i965/brw_wm.c b/src/mesa/drivers/dri/i965/brw_wm.c
index 7f3ba5f0581..a6d2a2377f6 100644
--- a/src/mesa/drivers/dri/i965/brw_wm.c
+++ b/src/mesa/drivers/dri/i965/brw_wm.c
@@ -198,7 +198,7 @@ static void do_wm_prog( struct brw_context *brw,
       c->prog_data.total_scratch = 0;
    }
 
-   if (INTEL_DEBUG & DEBUG_WM)
+   if (unlikely(INTEL_DEBUG & DEBUG_WM))
       fprintf(stderr, "\n");
 
    /* get the program
diff --git a/src/mesa/drivers/dri/i965/brw_wm_emit.c b/src/mesa/drivers/dri/i965/brw_wm_emit.c
index d06c49fd5be..96fecc97ee2 100644
--- a/src/mesa/drivers/dri/i965/brw_wm_emit.c
+++ b/src/mesa/drivers/dri/i965/brw_wm_emit.c
@@ -83,6 +83,7 @@ brw_wm_arg_can_be_immediate(enum prog_opcode opcode, int arg)
       [OPCODE_SLE] = 2,
       [OPCODE_SLT] = 2,
       [OPCODE_SNE] = 2,
+      [OPCODE_SWZ] = 1,
       [OPCODE_XPD] = 2,
    };
 
@@ -895,11 +896,12 @@ void emit_math1(struct brw_wm_compile *c,
 		      BRW_MATH_SATURATE_NONE);
    struct brw_reg src;
 
-   if (intel->gen >= 6 && arg0[0].hstride == BRW_HORIZONTAL_STRIDE_0) {
-      /* Gen6 math requires that source and dst horizontal stride be 1.
-       *
+   if (intel->gen >= 6 && (arg0[0].hstride == BRW_HORIZONTAL_STRIDE_0 ||
+			   arg0[0].file != BRW_GENERAL_REGISTER_FILE)) {
+      /* Gen6 math requires that source and dst horizontal stride be 1,
+       * and that the argument be in the GRF.
        */
-      src = *dst;
+      src = dst[dst_chan];
       brw_MOV(p, src, arg0[0]);
    } else {
       src = arg0[0];
@@ -1920,7 +1922,7 @@ void brw_wm_emit( struct brw_wm_compile *c )
 	brw_remove_grf_to_mrf_moves(p);
    }
 
-   if (INTEL_DEBUG & DEBUG_WM) {
+   if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
       int i;
 
      printf("wm-native:\n");
diff --git a/src/mesa/drivers/dri/i965/brw_wm_fp.c b/src/mesa/drivers/dri/i965/brw_wm_fp.c
index 15a238cda62..2cae6988804 100644
--- a/src/mesa/drivers/dri/i965/brw_wm_fp.c
+++ b/src/mesa/drivers/dri/i965/brw_wm_fp.c
@@ -663,7 +663,7 @@ static void precalc_tex( struct brw_wm_compile *c,
 			 const struct prog_instruction *inst )
 {
    struct prog_src_register coord;
-   struct prog_dst_register tmpcoord;
+   struct prog_dst_register tmpcoord = { 0 };
    const GLuint unit = c->fp->program.Base.SamplerUnits[inst->TexSrcUnit];
 
    assert(unit < BRW_MAX_TEX_UNIT);
@@ -963,7 +963,7 @@ static void emit_render_target_writes( struct brw_wm_compile *c )
    struct prog_src_register outcolor;
    GLuint i;
 
-   struct prog_instruction *inst, *last_inst;
+   struct prog_instruction *inst, *last_inst = NULL;
 
    /* The inst->Aux field is used for FB write target and the EOT marker */
 
@@ -1058,7 +1058,7 @@ void brw_wm_pass_fp( struct brw_wm_compile *c )
    struct brw_fragment_program *fp = c->fp;
    GLuint insn;
 
-   if (INTEL_DEBUG & DEBUG_WM) {
+   if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
       printf("pre-fp:\n");
       _mesa_fprint_program_opt(stdout, &fp->program.Base, PROG_PRINT_DEBUG,
 			       GL_TRUE);
@@ -1174,7 +1174,7 @@ void brw_wm_pass_fp( struct brw_wm_compile *c )
       }
    }
 
-   if (INTEL_DEBUG & DEBUG_WM) {
+   if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
       printf("pass_fp:\n");
       print_insns( c->prog_instructions, c->nr_fp_insns );
       printf("\n");
diff --git a/src/mesa/drivers/dri/i965/brw_wm_glsl.c b/src/mesa/drivers/dri/i965/brw_wm_glsl.c
index d325f85ce00..7fe8ab1f334 100644
--- a/src/mesa/drivers/dri/i965/brw_wm_glsl.c
+++ b/src/mesa/drivers/dri/i965/brw_wm_glsl.c
@@ -19,7 +19,7 @@ GLboolean brw_wm_is_glsl(const struct gl_fragment_program *fp)
 {
     int i;
 
-    if (INTEL_DEBUG & DEBUG_GLSL_FORCE)
+    if (unlikely(INTEL_DEBUG & DEBUG_GLSL_FORCE))
        return GL_TRUE;
 
     for (i = 0; i < fp->Base.NumInstructions; i++) {
@@ -1002,7 +1002,7 @@ static void brw_wm_emit_glsl(struct brw_context *brw, struct brw_wm_compile *c)
     }
     post_wm_emit(c);
 
-    if (INTEL_DEBUG & DEBUG_WM) {
+    if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
       printf("wm-native:\n");
       for (i = 0; i < p->nr_insn; i++)
 	 brw_disasm(stdout, &p->store[i], intel->gen);
@@ -1016,7 +1016,7 @@ static void brw_wm_emit_glsl(struct brw_context *brw, struct brw_wm_compile *c)
  */
 void brw_wm_glsl_emit(struct brw_context *brw, struct brw_wm_compile *c)
 {
-    if (INTEL_DEBUG & DEBUG_WM) {
+    if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
         printf("brw_wm_glsl_emit:\n");
     }
 
@@ -1026,7 +1026,7 @@ void brw_wm_glsl_emit(struct brw_context *brw, struct brw_wm_compile *c)
     /* actual code generation */
     brw_wm_emit_glsl(brw, c);
 
-    if (INTEL_DEBUG & DEBUG_WM) {
+    if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
         brw_wm_print_program(c, "brw_wm_glsl_emit done");
     }
 
diff --git a/src/mesa/drivers/dri/i965/brw_wm_pass0.c b/src/mesa/drivers/dri/i965/brw_wm_pass0.c
index d6aa9f957a2..83152526b3a 100644
--- a/src/mesa/drivers/dri/i965/brw_wm_pass0.c
+++ b/src/mesa/drivers/dri/i965/brw_wm_pass0.c
@@ -440,7 +440,7 @@ void brw_wm_pass0( struct brw_wm_compile *c )
       }
    }
  
-   if (INTEL_DEBUG & DEBUG_WM) {
+   if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
       brw_wm_print_program(c, "pass0");
    }
 }
diff --git a/src/mesa/drivers/dri/i965/brw_wm_pass1.c b/src/mesa/drivers/dri/i965/brw_wm_pass1.c
index 962515a99e9..3a2874b6ddf 100644
--- a/src/mesa/drivers/dri/i965/brw_wm_pass1.c
+++ b/src/mesa/drivers/dri/i965/brw_wm_pass1.c
@@ -291,7 +291,7 @@ void brw_wm_pass1( struct brw_wm_compile *c )
       track_arg(c, inst, 2, read2);
    }
 
-   if (INTEL_DEBUG & DEBUG_WM) {
+   if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
       brw_wm_print_program(c, "pass1");
    }
 }
diff --git a/src/mesa/drivers/dri/i965/brw_wm_pass2.c b/src/mesa/drivers/dri/i965/brw_wm_pass2.c
index 54acb3038b5..44e39538145 100644
--- a/src/mesa/drivers/dri/i965/brw_wm_pass2.c
+++ b/src/mesa/drivers/dri/i965/brw_wm_pass2.c
@@ -331,13 +331,13 @@ void brw_wm_pass2( struct brw_wm_compile *c )
       }
    }
 
-   if (INTEL_DEBUG & DEBUG_WM) {
+   if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
       brw_wm_print_program(c, "pass2");
    }
 
    c->state = PASS2_DONE;
 
-   if (INTEL_DEBUG & DEBUG_WM) {
+   if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
        brw_wm_print_program(c, "pass2/done");
    }
 }
diff --git a/src/mesa/drivers/dri/i965/brw_wm_state.c b/src/mesa/drivers/dri/i965/brw_wm_state.c
index 9a27b937103..76de7b7b6f6 100644
--- a/src/mesa/drivers/dri/i965/brw_wm_state.c
+++ b/src/mesa/drivers/dri/i965/brw_wm_state.c
@@ -249,7 +249,7 @@ wm_unit_create_from_key(struct brw_context *brw, struct brw_wm_unit_key *key,
 
    wm.wm5.line_stipple = key->line_stipple;
 
-   if (INTEL_DEBUG & DEBUG_STATS || key->stats_wm)
+   if (unlikely(INTEL_DEBUG & DEBUG_STATS) || key->stats_wm)
       wm.wm4.stats_enable = 1;
 
    bo = brw_upload_cache(&brw->cache, BRW_WM_UNIT,
diff --git a/src/mesa/drivers/dri/i965/gen6_cc.c b/src/mesa/drivers/dri/i965/gen6_cc.c
index 0d6e923f734..800a2555214 100644
--- a/src/mesa/drivers/dri/i965/gen6_cc.c
+++ b/src/mesa/drivers/dri/i965/gen6_cc.c
@@ -254,7 +254,7 @@ prepare_color_calc_state(struct brw_context *brw)
 
 const struct brw_tracked_state gen6_color_calc_state = {
    .dirty = {
-      .mesa = _NEW_COLOR,
+      .mesa = _NEW_COLOR | _NEW_STENCIL,
       .brw = 0,
       .cache = 0,
    },
diff --git a/src/mesa/drivers/dri/i965/gen6_clip_state.c b/src/mesa/drivers/dri/i965/gen6_clip_state.c
index cd2ac9d92fe..c65b41e2b6b 100644
--- a/src/mesa/drivers/dri/i965/gen6_clip_state.c
+++ b/src/mesa/drivers/dri/i965/gen6_clip_state.c
@@ -28,6 +28,7 @@
 #include "brw_context.h"
 #include "brw_state.h"
 #include "brw_defines.h"
+#include "brw_util.h"
 #include "intel_batchbuffer.h"
 
 static void
@@ -36,7 +37,7 @@ upload_clip_state(struct brw_context *brw)
    struct intel_context *intel = &brw->intel;
    struct gl_context *ctx = &intel->ctx;
    uint32_t depth_clamp = 0;
-   uint32_t provoking;
+   uint32_t provoking, userclip;
 
    if (!ctx->Transform.DepthClamp)
       depth_clamp = GEN6_CLIP_Z_TEST;
@@ -50,6 +51,9 @@ upload_clip_state(struct brw_context *brw)
 	 (1 << GEN6_CLIP_LINE_PROVOKE_SHIFT);
    }
 
+   /* _NEW_TRANSFORM */
+   userclip = (1 << brw_count_bits(ctx->Transform.ClipPlanesEnabled)) - 1;
+
    BEGIN_BATCH(4);
    OUT_BATCH(CMD_3D_CLIP_STATE << 16 | (4 - 2));
    OUT_BATCH(GEN6_CLIP_STATISTICS_ENABLE);
@@ -57,6 +61,7 @@ upload_clip_state(struct brw_context *brw)
 	     GEN6_CLIP_API_OGL |
 	     GEN6_CLIP_MODE_NORMAL |
 	     GEN6_CLIP_XY_TEST |
+	     userclip << GEN6_USER_CLIP_CLIP_DISTANCES_SHIFT |
 	     depth_clamp |
 	     provoking);
    OUT_BATCH(GEN6_CLIP_FORCE_ZERO_RTAINDEX);
diff --git a/src/mesa/drivers/dri/i965/gen6_sf_state.c b/src/mesa/drivers/dri/i965/gen6_sf_state.c
index 55a70bea62f..471067e8f02 100644
--- a/src/mesa/drivers/dri/i965/gen6_sf_state.c
+++ b/src/mesa/drivers/dri/i965/gen6_sf_state.c
@@ -73,12 +73,19 @@ upload_sf_state(struct brw_context *brw)
    /* _NEW_BUFFER */
    GLboolean render_to_fbo = brw->intel.ctx.DrawBuffer->Name != 0;
    int attr = 0;
+   int urb_start;
+
+   /* _NEW_TRANSFORM */
+   if (ctx->Transform.ClipPlanesEnabled)
+      urb_start = 2;
+   else
+      urb_start = 1;
 
    dw1 =
       GEN6_SF_SWIZZLE_ENABLE |
       num_outputs << GEN6_SF_NUM_OUTPUTS_SHIFT |
       (num_inputs + 1) / 2 << GEN6_SF_URB_ENTRY_READ_LENGTH_SHIFT |
-      1 << GEN6_SF_URB_ENTRY_READ_OFFSET_SHIFT;
+      urb_start << GEN6_SF_URB_ENTRY_READ_OFFSET_SHIFT;
    dw2 = GEN6_SF_VIEWPORT_TRANSFORM_ENABLE |
       GEN6_SF_STATISTICS_ENABLE;
    dw3 = 0;
@@ -195,7 +202,9 @@ const struct brw_tracked_state gen6_sf_state = {
 		_NEW_POLYGON |
 		_NEW_LINE |
 		_NEW_SCISSOR |
-		_NEW_BUFFERS),
+		_NEW_BUFFERS |
+		_NEW_POINT |
+		_NEW_TRANSFORM),
       .brw   = BRW_NEW_CONTEXT,
       .cache = CACHE_NEW_VS_PROG
    },
diff --git a/src/mesa/drivers/dri/i965/gen6_vs_state.c b/src/mesa/drivers/dri/i965/gen6_vs_state.c
index 304eaddf409..e94d0c0ddbb 100644
--- a/src/mesa/drivers/dri/i965/gen6_vs_state.c
+++ b/src/mesa/drivers/dri/i965/gen6_vs_state.c
@@ -40,11 +40,11 @@ upload_vs_state(struct brw_context *brw)
    struct gl_context *ctx = &intel->ctx;
    const struct brw_vertex_program *vp =
       brw_vertex_program_const(brw->vertex_program);
-   unsigned int nr_params = vp->program.Base.Parameters->NumParameters;
+   unsigned int nr_params = brw->vs.prog_data->nr_params / 4;
    drm_intel_bo *constant_bo;
    int i;
 
-   if (vp->use_const_buffer || nr_params == 0) {
+   if (brw->vs.prog_data->nr_params == 0 && !ctx->Transform.ClipPlanesEnabled) {
       /* Disable the push constant buffers. */
       BEGIN_BATCH(5);
       OUT_BATCH(CMD_3D_CONSTANT_VS_STATE << 16 | (5 - 2));
@@ -54,6 +54,9 @@ upload_vs_state(struct brw_context *brw)
       OUT_BATCH(0);
       ADVANCE_BATCH();
    } else {
+      int params_uploaded = 0;
+      float *param;
+
       if (brw->vertex_program->IsNVProgram)
 	 _mesa_load_tracked_matrices(ctx);
 
@@ -63,14 +66,55 @@ upload_vs_state(struct brw_context *brw)
       _mesa_load_state_parameters(ctx, vp->program.Base.Parameters);
 
       constant_bo = drm_intel_bo_alloc(intel->bufmgr, "VS constant_bo",
-				       nr_params * 4 * sizeof(float),
+				       (MAX_CLIP_PLANES + nr_params) *
+				       4 * sizeof(float),
 				       4096);
       drm_intel_gem_bo_map_gtt(constant_bo);
-      for (i = 0; i < nr_params; i++) {
-	 memcpy((char *)constant_bo->virtual + i * 4 * sizeof(float),
-		vp->program.Base.Parameters->ParameterValues[i],
-		4 * sizeof(float));
+      param = constant_bo->virtual;
+
+      /* The enabled user clip planes should be loaded like any other
+       * param, but this is ad-hoc until we redo the VS backend.
+       */
+      for (i = 0; i < MAX_CLIP_PLANES; i++) {
+	 if (ctx->Transform.ClipPlanesEnabled & (1 << i)) {
+	    memcpy(param, ctx->Transform._ClipUserPlane[i], 4 * sizeof(float));
+	    param += 4;
+	    params_uploaded++;
+	 }
       }
+      /* Align to a reg for convenience for brw_vs_emit.c */
+      if (params_uploaded & 1) {
+	 param += 4;
+	 params_uploaded++;
+      }
+
+      if (vp->use_const_buffer) {
+	 for (i = 0; i < vp->program.Base.Parameters->NumParameters; i++) {
+	    if (brw->vs.constant_map[i] != -1) {
+	       memcpy(param + brw->vs.constant_map[i] * 4,
+		      vp->program.Base.Parameters->ParameterValues[i],
+		      4 * sizeof(float));
+	       params_uploaded++;
+	    }
+	 }
+      } else {
+	 for (i = 0; i < nr_params; i++) {
+	    memcpy(param, vp->program.Base.Parameters->ParameterValues[i],
+		   4 * sizeof(float));
+	    param += 4;
+	    params_uploaded++;
+	 }
+      }
+
+      if (0) {
+	 printf("VS constant buffer:\n");
+	 for (i = 0; i < params_uploaded; i++) {
+	    float *buf = (float *)constant_bo->virtual + i * 4;
+	    printf("%d: %f %f %f %f\n",
+		   i, buf[0], buf[1], buf[2], buf[3]);
+	 }
+      }
+
       drm_intel_gem_bo_unmap_gtt(constant_bo);
 
       BEGIN_BATCH(5);
@@ -79,7 +123,7 @@ upload_vs_state(struct brw_context *brw)
 		(5 - 2));
       OUT_RELOC(constant_bo,
 		I915_GEM_DOMAIN_RENDER, 0, /* XXX: bad domain */
-		ALIGN(nr_params, 2) / 2 - 1);
+		ALIGN(params_uploaded, 2) / 2 - 1);
       OUT_BATCH(0);
       OUT_BATCH(0);
       OUT_BATCH(0);
@@ -91,7 +135,7 @@ upload_vs_state(struct brw_context *brw)
    BEGIN_BATCH(6);
    OUT_BATCH(CMD_3D_VS_STATE << 16 | (6 - 2));
    OUT_RELOC(brw->vs.prog_bo, I915_GEM_DOMAIN_INSTRUCTION, 0, 0);
-   OUT_BATCH(GEN6_VS_SPF_MODE | (0 << GEN6_VS_SAMPLER_COUNT_SHIFT) |
+   OUT_BATCH((0 << GEN6_VS_SAMPLER_COUNT_SHIFT) |
 	     (brw->vs.nr_surfaces << GEN6_VS_BINDING_TABLE_ENTRY_COUNT_SHIFT));
    OUT_BATCH(0); /* scratch space base offset */
    OUT_BATCH((1 << GEN6_VS_DISPATCH_START_GRF_SHIFT) |
diff --git a/src/mesa/drivers/dri/i965/gen6_wm_state.c b/src/mesa/drivers/dri/i965/gen6_wm_state.c
index 36d4ab93ba9..ea5418bacf1 100644
--- a/src/mesa/drivers/dri/i965/gen6_wm_state.c
+++ b/src/mesa/drivers/dri/i965/gen6_wm_state.c
@@ -73,7 +73,7 @@ prepare_wm_constants(struct brw_context *brw)
 const struct brw_tracked_state gen6_wm_constants = {
    .dirty = {
       .mesa  = _NEW_PROGRAM_CONSTANTS,
-      .brw   = 0,
+      .brw   = BRW_NEW_FRAGMENT_PROGRAM,
       .cache = 0,
    },
    .prepare = prepare_wm_constants,
diff --git a/src/mesa/drivers/dri/i965/intel_structs.h b/src/mesa/drivers/dri/i965/intel_structs.h
deleted file mode 100644
index 522e3bd92c2..00000000000
--- a/src/mesa/drivers/dri/i965/intel_structs.h
+++ /dev/null
@@ -1,132 +0,0 @@
-#ifndef INTEL_STRUCTS_H
-#define INTEL_STRUCTS_H
-
-struct br0 {
-   GLuint length:8;
-   GLuint pad0:3;
-   GLuint dst_tiled:1;
-   GLuint pad1:8;
-   GLuint write_rgb:1;
-   GLuint write_alpha:1;
-   GLuint opcode:7;
-   GLuint client:3;
-};
-
-   
-struct br13 {
-   GLint dest_pitch:16;
-   GLuint rop:8;
-   GLuint color_depth:2;
-   GLuint pad1:3;
-   GLuint mono_source_transparency:1;
-   GLuint clipping_enable:1;
-   GLuint pad0:1;
-};
-
-
-
-/* This is an attempt to move some of the 2D interaction in this
- * driver to using structs for packets rather than a bunch of #defines
- * and dwords.
- */
-struct xy_color_blit {
-   struct br0 br0;
-   struct br13 br13;
-
-   struct {
-      GLuint dest_x1:16;
-      GLuint dest_y1:16;
-   } dw2;
-
-   struct {
-      GLuint dest_x2:16;
-      GLuint dest_y2:16;
-   } dw3;
-   
-   GLuint dest_base_addr;
-   GLuint color;
-};
-
-struct xy_src_copy_blit {
-   struct br0 br0;
-   struct br13 br13;
-
-   struct {
-      GLuint dest_x1:16;
-      GLuint dest_y1:16;
-   } dw2;
-
-   struct {
-      GLuint dest_x2:16;
-      GLuint dest_y2:16;
-   } dw3;
-   
-   GLuint dest_base_addr;
-
-   struct {
-      GLuint src_x1:16;
-      GLuint src_y1:16;
-   } dw5;
-
-   struct {
-      GLint src_pitch:16;
-      GLuint pad:16;
-   } dw6;
-   
-   GLuint src_base_addr;
-};
-
-struct xy_setup_blit {
-   struct br0 br0;
-   struct br13 br13;
-
-   struct {
-      GLuint clip_x1:16;
-      GLuint clip_y1:16;
-   } dw2;
-
-   struct {
-      GLuint clip_x2:16;
-      GLuint clip_y2:16;
-   } dw3;
-      
-   GLuint dest_base_addr;
-   GLuint background_color;
-   GLuint foreground_color;
-   GLuint pattern_base_addr;
-};
-
-
-struct xy_text_immediate_blit {
-   struct {
-      GLuint length:8;
-      GLuint pad2:3;
-      GLuint dst_tiled:1;
-      GLuint pad1:4;
-      GLuint byte_packed:1;
-      GLuint pad0:5;
-      GLuint opcode:7;
-      GLuint client:3;
-   } dw0;
-
-   struct {
-      GLuint dest_x1:16;
-      GLuint dest_y1:16;
-   } dw1;
-
-   struct {
-      GLuint dest_x2:16;
-      GLuint dest_y2:16;
-   } dw2;   
-
-   /* Src bitmap data follows as inline dwords.
-    */
-};
-
-
-#define CLIENT_2D 0x2
-#define OPCODE_XY_SETUP_BLT 0x1
-#define OPCODE_XY_COLOR_BLT 0x50
-#define OPCODE_XY_TEXT_IMMEDIATE_BLT 0x31
-
-#endif
diff --git a/src/mesa/drivers/dri/intel/intel_batchbuffer.c b/src/mesa/drivers/dri/intel/intel_batchbuffer.c
index 9b398239172..4b498f8c5b2 100644
--- a/src/mesa/drivers/dri/intel/intel_batchbuffer.c
+++ b/src/mesa/drivers/dri/intel/intel_batchbuffer.c
@@ -44,7 +44,9 @@ intel_batchbuffer_reset(struct intel_batchbuffer *batch)
 
    batch->buf = drm_intel_bo_alloc(intel->bufmgr, "batchbuffer",
 				   intel->maxBatchSize, 4096);
-   batch->map = batch->buffer;
+   drm_intel_gem_bo_map_gtt(batch->buf);
+   batch->map = batch->buf->virtual;
+
    batch->size = intel->maxBatchSize;
    batch->ptr = batch->map;
    batch->reserved_space = BATCH_RESERVED;
@@ -58,7 +60,6 @@ intel_batchbuffer_alloc(struct intel_context *intel)
    struct intel_batchbuffer *batch = calloc(sizeof(*batch), 1);
 
    batch->intel = intel;
-   batch->buffer = malloc(intel->maxBatchSize);
    intel_batchbuffer_reset(batch);
 
    return batch;
@@ -67,8 +68,11 @@ intel_batchbuffer_alloc(struct intel_context *intel)
 void
 intel_batchbuffer_free(struct intel_batchbuffer *batch)
 {
-   free (batch->buffer);
-   drm_intel_bo_unreference(batch->buf);
+   if (batch->map) {
+      drm_intel_gem_bo_unmap_gtt(batch->buf);
+      batch->map = NULL;
+   }
+   dri_bo_unreference(batch->buf);
    batch->buf = NULL;
    free(batch);
 }
@@ -84,13 +88,7 @@ do_flush_locked(struct intel_batchbuffer *batch, GLuint used)
    int ret = 0;
    int x_off = 0, y_off = 0;
 
-   drm_intel_bo_subdata(batch->buf, 0, used, batch->buffer);
-   if (batch->state_batch_offset != batch->size) {
-      drm_intel_bo_subdata(batch->buf,
-			   batch->state_batch_offset,
-			   batch->size - batch->state_batch_offset,
-			   batch->buffer + batch->state_batch_offset);
-   }
+   drm_intel_gem_bo_unmap_gtt(batch->buf);
 
    batch->ptr = NULL;
 
@@ -99,7 +97,7 @@ do_flush_locked(struct intel_batchbuffer *batch, GLuint used)
 			(x_off & 0xffff) | (y_off << 16));
    }
 
-   if (INTEL_DEBUG & DEBUG_BATCH) {
+   if (unlikely(INTEL_DEBUG & DEBUG_BATCH)) {
       drm_intel_bo_map(batch->buf, GL_FALSE);
       intel_decode(batch->buf->virtual, used / 4, batch->buf->offset,
 		   intel->intelScreen->deviceID, GL_TRUE);
@@ -130,7 +128,7 @@ _intel_batchbuffer_flush(struct intel_batchbuffer *batch, const char *file,
    if (used == 0)
       return;
 
-   if (INTEL_DEBUG & DEBUG_BATCH)
+   if (unlikely(INTEL_DEBUG & DEBUG_BATCH))
       fprintf(stderr, "%s:%d: Batchbuffer flush with %db used\n", file, line,
 	      used);
 
@@ -174,7 +172,7 @@ _intel_batchbuffer_flush(struct intel_batchbuffer *batch, const char *file,
 
    do_flush_locked(batch, used);
 
-   if (INTEL_DEBUG & DEBUG_SYNC) {
+   if (unlikely(INTEL_DEBUG & DEBUG_SYNC)) {
       fprintf(stderr, "waiting for idle\n");
       drm_intel_bo_map(batch->buf, GL_TRUE);
       drm_intel_bo_unmap(batch->buf);
diff --git a/src/mesa/drivers/dri/intel/intel_batchbuffer.h b/src/mesa/drivers/dri/intel/intel_batchbuffer.h
index ae53f455117..428c027c2f1 100644
--- a/src/mesa/drivers/dri/intel/intel_batchbuffer.h
+++ b/src/mesa/drivers/dri/intel/intel_batchbuffer.h
@@ -17,8 +17,6 @@ struct intel_batchbuffer
 
    drm_intel_bo *buf;
 
-   GLubyte *buffer;
-
    GLubyte *map;
    GLubyte *ptr;
 
diff --git a/src/mesa/drivers/dri/intel/intel_blit.c b/src/mesa/drivers/dri/intel/intel_blit.c
index a74e21720fb..c2917e9b07e 100644
--- a/src/mesa/drivers/dri/intel/intel_blit.c
+++ b/src/mesa/drivers/dri/intel/intel_blit.c
@@ -483,8 +483,11 @@ intel_emit_linear_blit(struct intel_context *intel,
    /* Blits are in a different ringbuffer so we don't use them. */
    assert(intel->gen < 6);
 
-   /* The pitch is a signed value. */
-   pitch = MIN2(size, (1 << 15) - 1);
+   /* The pitch given to the GPU must be DWORD aligned, and
+    * we want width to match pitch. Max width is (1 << 15) - 1;
+    * rounding that down to the nearest DWORD gives (1 << 15) - 4.
+    */
+   pitch = MIN2(size, (1 << 15) - 4);
    height = size / pitch;
    ok = intelEmitCopyBlit(intel, 1,
 			  pitch, src_bo, src_offset, I915_TILING_NONE,
@@ -499,6 +502,7 @@ intel_emit_linear_blit(struct intel_context *intel,
    dst_offset += pitch * height;
    size -= pitch * height;
    assert (size < (1 << 15));
+   assert ((size & 3) == 0); /* Pitch must be DWORD aligned */
    if (size != 0) {
       ok = intelEmitCopyBlit(intel, 1,
 			     size, src_bo, src_offset, I915_TILING_NONE,
diff --git a/src/mesa/drivers/dri/intel/intel_chipset.h b/src/mesa/drivers/dri/intel/intel_chipset.h
index 1e7ceed32a2..4fecdbed203 100644
--- a/src/mesa/drivers/dri/intel/intel_chipset.h
+++ b/src/mesa/drivers/dri/intel/intel_chipset.h
@@ -67,6 +67,7 @@
 #define PCI_CHIP_G45_G                  0x2E22
 #define PCI_CHIP_G41_G                  0x2E32
 #define PCI_CHIP_B43_G                  0x2E42
+#define PCI_CHIP_B43_G1                 0x2E92
 
 #define PCI_CHIP_ILD_G                  0x0042
 #define PCI_CHIP_ILM_G                  0x0046
@@ -93,7 +94,8 @@
                                  devid == PCI_CHIP_Q45_G || \
                                  devid == PCI_CHIP_G45_G || \
                                  devid == PCI_CHIP_G41_G || \
-                                 devid == PCI_CHIP_B43_G)
+                                 devid == PCI_CHIP_B43_G || \
+                                 devid == PCI_CHIP_B43_G1)
 #define IS_GM45(devid)          (devid == PCI_CHIP_GM45_GM)
 #define IS_G4X(devid)		(IS_G45(devid) || IS_GM45(devid))
 
diff --git a/src/mesa/drivers/dri/intel/intel_clear.c b/src/mesa/drivers/dri/intel/intel_clear.c
index d7814635b72..fa451f0045e 100644
--- a/src/mesa/drivers/dri/intel/intel_clear.c
+++ b/src/mesa/drivers/dri/intel/intel_clear.c
@@ -58,6 +58,21 @@ static const char *buffer_names[] = {
    [BUFFER_COLOR7] = "color7",
 };
 
+static void
+debug_mask(const char *name, GLbitfield mask)
+{
+   GLuint i;
+
+   if (unlikely(INTEL_DEBUG & DEBUG_BLIT)) {
+      DBG("%s clear:", name);
+      for (i = 0; i < BUFFER_COUNT; i++) {
+	 if (mask & (1 << i))
+	    DBG(" %s", buffer_names[i]);
+      }
+      DBG("\n");
+   }
+}
+
 /**
  * Called by ctx->Driver.Clear.
  */
@@ -70,7 +85,6 @@ intelClear(struct gl_context *ctx, GLbitfield mask)
    GLbitfield blit_mask = 0;
    GLbitfield swrast_mask = 0;
    struct gl_framebuffer *fb = ctx->DrawBuffer;
-   GLuint i;
 
    if (mask & (BUFFER_BIT_FRONT_LEFT | BUFFER_BIT_FRONT_RIGHT)) {
       intel->front_buffer_dirty = GL_TRUE;
@@ -162,39 +176,17 @@ intelClear(struct gl_context *ctx, GLbitfield mask)
    }
 
    if (blit_mask) {
-      if (INTEL_DEBUG & DEBUG_BLIT) {
-	 DBG("blit clear:");
-	 for (i = 0; i < BUFFER_COUNT; i++) {
-	    if (blit_mask & (1 << i))
-	       DBG(" %s", buffer_names[i]);
-	 }
-	 DBG("\n");
-      }
+      debug_mask("blit", blit_mask);
       intelClearWithBlit(ctx, blit_mask);
    }
 
    if (tri_mask) {
-      if (INTEL_DEBUG & DEBUG_BLIT) {
-	 DBG("tri clear:");
-	 for (i = 0; i < BUFFER_COUNT; i++) {
-	    if (tri_mask & (1 << i))
-	       DBG(" %s", buffer_names[i]);
-	 }
-	 DBG("\n");
-      }
-
+      debug_mask("tri", tri_mask);
       _mesa_meta_Clear(&intel->ctx, tri_mask);
    }
 
    if (swrast_mask) {
-      if (INTEL_DEBUG & DEBUG_BLIT) {
-	 DBG("swrast clear:");
-	 for (i = 0; i < BUFFER_COUNT; i++) {
-	    if (swrast_mask & (1 << i))
-	       DBG(" %s", buffer_names[i]);
-	 }
-	 DBG("\n");
-      }
+      debug_mask("swrast", swrast_mask);
       _swrast_Clear(ctx, swrast_mask);
    }
 }
diff --git a/src/mesa/drivers/dri/intel/intel_context.c b/src/mesa/drivers/dri/intel/intel_context.c
index 7ace50bde97..152cdcaf37d 100644
--- a/src/mesa/drivers/dri/intel/intel_context.c
+++ b/src/mesa/drivers/dri/intel/intel_context.c
@@ -155,6 +155,7 @@ intelGetString(struct gl_context * ctx, GLenum name)
          chipset = "Intel(R) G41";
          break;
       case PCI_CHIP_B43_G:
+      case PCI_CHIP_B43_G1:
          chipset = "Intel(R) B43";
          break;
       case PCI_CHIP_ILD_G:
@@ -249,7 +250,7 @@ intel_update_renderbuffers(__DRIcontext *context, __DRIdrawable *drawable)
     * thus ignore the invalidate. */
    drawable->lastStamp = drawable->dri2.stamp;
 
-   if (INTEL_DEBUG & DEBUG_DRI)
+   if (unlikely(INTEL_DEBUG & DEBUG_DRI))
       fprintf(stderr, "enter %s, drawable %p\n", __func__, drawable);
 
    screen = intel->intelScreen->driScrnPriv;
@@ -378,14 +379,14 @@ intel_update_renderbuffers(__DRIcontext *context, __DRIdrawable *drawable)
        if (rb->region && rb->region->name == buffers[i].name)
 	     continue;
 
-       if (INTEL_DEBUG & DEBUG_DRI)
+       if (unlikely(INTEL_DEBUG & DEBUG_DRI))
 	  fprintf(stderr,
 		  "attaching buffer %d, at %d, cpp %d, pitch %d\n",
 		  buffers[i].name, buffers[i].attachment,
 		  buffers[i].cpp, buffers[i].pitch);
        
        if (buffers[i].attachment == __DRI_BUFFER_STENCIL && depth_region) {
-	  if (INTEL_DEBUG & DEBUG_DRI)
+	  if (unlikely(INTEL_DEBUG & DEBUG_DRI))
 	     fprintf(stderr, "(reusing depth buffer as stencil)\n");
 	  intel_region_reference(&region, depth_region);
        }
diff --git a/src/mesa/drivers/dri/intel/intel_context.h b/src/mesa/drivers/dri/intel/intel_context.h
index 46d10d74ba3..9d5139c0000 100644
--- a/src/mesa/drivers/dri/intel/intel_context.h
+++ b/src/mesa/drivers/dri/intel/intel_context.h
@@ -98,6 +98,16 @@ extern void intelFallback(struct intel_context *intel, GLbitfield bit,
 
 #define INTEL_MAX_FIXUP 64
 
+#ifndef likely
+#ifdef __GNUC__
+#define likely(expr) (__builtin_expect(expr, 1))
+#define unlikely(expr) (__builtin_expect(expr, 0))
+#else
+#define likely(expr) (expr)
+#define unlikely(expr) (expr)
+#endif
+#endif
+
 struct intel_sync_object {
    struct gl_sync_object Base;
 
@@ -180,9 +190,6 @@ struct intel_context
    } prim;
 
    GLuint stats_wm;
-   GLboolean locked;
-   char *prevLockFile;
-   int prevLockLine;
 
    /* Offsets of fields within the current vertex:
     */
@@ -359,10 +366,15 @@ extern int INTEL_DEBUG;
 #define DEBUG_CLIP      0x8000000
 
 #define DBG(...) do {						\
-	if (INTEL_DEBUG & FILE_DEBUG_FLAG)			\
+	if (unlikely(INTEL_DEBUG & FILE_DEBUG_FLAG))		\
 		printf(__VA_ARGS__);			\
 } while(0)
 
+#define fallback_debug(...) do {				\
+	if (unlikely(INTEL_DEBUG & DEBUG_FALLBACKS))		\
+		printf(__VA_ARGS__);				\
+} while(0)
+
 #define PCI_CHIP_845_G			0x2562
 #define PCI_CHIP_I830_M			0x3577
 #define PCI_CHIP_I855_GM		0x3582
diff --git a/src/mesa/drivers/dri/intel/intel_pixel.c b/src/mesa/drivers/dri/intel/intel_pixel.c
index 60583ef4c0d..d5c35775ce4 100644
--- a/src/mesa/drivers/dri/intel/intel_pixel.c
+++ b/src/mesa/drivers/dri/intel/intel_pixel.c
@@ -147,10 +147,9 @@ intel_check_blit_format(struct intel_region * region,
       return GL_TRUE;
    }
 
-   if (INTEL_DEBUG & DEBUG_PIXEL)
-      fprintf(stderr, "%s: bad format for blit (cpp %d, type %s format %s)\n",
-              __FUNCTION__, region->cpp,
-              _mesa_lookup_enum_by_nr(type), _mesa_lookup_enum_by_nr(format));
+   DBG("%s: bad format for blit (cpp %d, type %s format %s)\n",
+       __FUNCTION__, region->cpp,
+       _mesa_lookup_enum_by_nr(type), _mesa_lookup_enum_by_nr(format));
 
    return GL_FALSE;
 }
diff --git a/src/mesa/drivers/dri/intel/intel_pixel_bitmap.c b/src/mesa/drivers/dri/intel/intel_pixel_bitmap.c
index 63fb4b37b18..e7356a6da0d 100644
--- a/src/mesa/drivers/dri/intel/intel_pixel_bitmap.c
+++ b/src/mesa/drivers/dri/intel/intel_pixel_bitmap.c
@@ -113,9 +113,8 @@ static GLuint get_bitmap_rect(GLsizei width, GLsizei height,
    GLint incr;
    GLuint count = 0;
 
-   if (INTEL_DEBUG & DEBUG_PIXEL)
-      printf("%s %d,%d %dx%d bitmap %dx%d skip %d src_offset %d mask %d\n",
-		   __FUNCTION__, x,y,w,h,width,height,unpack->SkipPixels, src_offset, mask);
+   DBG("%s %d,%d %dx%d bitmap %dx%d skip %d src_offset %d mask %d\n",
+       __FUNCTION__, x,y,w,h,width,height,unpack->SkipPixels, src_offset, mask);
 
    if (invert) {
       first = h-1;
@@ -285,7 +284,7 @@ do_blit_bitmap( struct gl_context *ctx,
    }
 out:
 
-   if (INTEL_DEBUG & DEBUG_SYNC)
+   if (unlikely(INTEL_DEBUG & DEBUG_SYNC))
       intel_batchbuffer_flush(intel->batch);
 
    if (_mesa_is_bufferobj(unpack->BufferObj)) {
@@ -299,6 +298,7 @@ out:
    return GL_TRUE;
 }
 
+
 /* There are a large number of possible ways to implement bitmap on
  * this hardware, most of them have some sort of drawback.  Here are a
  * few that spring to mind:
diff --git a/src/mesa/drivers/dri/intel/intel_pixel_copy.c b/src/mesa/drivers/dri/intel/intel_pixel_copy.c
index c6b36ed4291..a7ca780e944 100644
--- a/src/mesa/drivers/dri/intel/intel_pixel_copy.c
+++ b/src/mesa/drivers/dri/intel/intel_pixel_copy.c
@@ -119,8 +119,7 @@ do_blit_copypixels(struct gl_context * ctx,
    GLboolean flip = GL_FALSE;
 
    if (type == GL_DEPTH || type == GL_STENCIL) {
-      if (INTEL_DEBUG & DEBUG_FALLBACKS)
-	 fprintf(stderr, "glCopyPixels() fallback: GL_DEPTH || GL_STENCIL\n");
+      fallback_debug("glCopyPixels() fallback: GL_DEPTH || GL_STENCIL\n");
       return GL_FALSE;
    }
 
@@ -203,8 +202,7 @@ intelCopyPixels(struct gl_context * ctx,
                 GLsizei width, GLsizei height,
                 GLint destx, GLint desty, GLenum type)
 {
-   if (INTEL_DEBUG & DEBUG_PIXEL)
-      fprintf(stderr, "%s\n", __FUNCTION__);
+   DBG("%s\n", __FUNCTION__);
 
    if (do_blit_copypixels(ctx, srcx, srcy, width, height, destx, desty, type))
       return;
diff --git a/src/mesa/drivers/dri/intel/intel_pixel_read.c b/src/mesa/drivers/dri/intel/intel_pixel_read.c
index b249f9a5a0b..54da29236d2 100644
--- a/src/mesa/drivers/dri/intel/intel_pixel_read.c
+++ b/src/mesa/drivers/dri/intel/intel_pixel_read.c
@@ -42,6 +42,8 @@
 #include "intel_pixel.h"
 #include "intel_buffer_objects.h"
 
+#define FILE_DEBUG_FLAG DEBUG_PIXEL
+
 /* For many applications, the new ability to pull the source buffers
  * back out of the GTT and then do the packing/conversion operations
  * in software will be as much of an improvement as trying to get the
@@ -79,8 +81,7 @@ do_blit_readpixels(struct gl_context * ctx,
    GLboolean all;
    GLint dst_x, dst_y;
 
-   if (INTEL_DEBUG & DEBUG_PIXEL)
-      printf("%s\n", __FUNCTION__);
+   DBG("%s\n", __FUNCTION__);
 
    if (!src)
       return GL_FALSE;
@@ -88,22 +89,19 @@ do_blit_readpixels(struct gl_context * ctx,
    if (!_mesa_is_bufferobj(pack->BufferObj)) {
       /* PBO only for now:
        */
-      if (INTEL_DEBUG & DEBUG_PIXEL)
-         printf("%s - not PBO\n", __FUNCTION__);
+      DBG("%s - not PBO\n", __FUNCTION__);
       return GL_FALSE;
    }
 
 
    if (ctx->_ImageTransferState ||
        !intel_check_blit_format(src, format, type)) {
-      if (INTEL_DEBUG & DEBUG_PIXEL)
-         printf("%s - bad format for blit\n", __FUNCTION__);
+      DBG("%s - bad format for blit\n", __FUNCTION__);
       return GL_FALSE;
    }
 
    if (pack->Alignment != 1 || pack->SwapBytes || pack->LsbFirst) {
-      if (INTEL_DEBUG & DEBUG_PIXEL)
-         printf("%s: bad packing params\n", __FUNCTION__);
+      DBG("%s: bad packing params\n", __FUNCTION__);
       return GL_FALSE;
    }
 
@@ -113,8 +111,7 @@ do_blit_readpixels(struct gl_context * ctx,
       rowLength = width;
 
    if (pack->Invert) {
-      if (INTEL_DEBUG & DEBUG_PIXEL)
-         printf("%s: MESA_PACK_INVERT not done yet\n", __FUNCTION__);
+      DBG("%s: MESA_PACK_INVERT not done yet\n", __FUNCTION__);
       return GL_FALSE;
    }
    else {
@@ -158,8 +155,7 @@ do_blit_readpixels(struct gl_context * ctx,
       return GL_FALSE;
    }
 
-   if (INTEL_DEBUG & DEBUG_PIXEL)
-      printf("%s - DONE\n", __FUNCTION__);
+   DBG("%s - DONE\n", __FUNCTION__);
 
    return GL_TRUE;
 }
@@ -173,8 +169,7 @@ intelReadPixels(struct gl_context * ctx,
    struct intel_context *intel = intel_context(ctx);
    GLboolean dirty;
 
-   if (INTEL_DEBUG & DEBUG_PIXEL)
-      fprintf(stderr, "%s\n", __FUNCTION__);
+   DBG("%s\n", __FUNCTION__);
 
    intel_flush(ctx);
 
@@ -188,8 +183,7 @@ intelReadPixels(struct gl_context * ctx,
        (ctx, x, y, width, height, format, type, pack, pixels))
       return;
 
-   if (INTEL_DEBUG & DEBUG_PIXEL)
-      printf("%s: fallback to swrast\n", __FUNCTION__);
+   fallback_debug("%s: fallback to swrast\n", __FUNCTION__);
 
    /* Update Mesa state before calling down into _swrast_ReadPixels, as
     * the spans code requires the computed buffer states to be up to date,
diff --git a/src/mesa/drivers/dri/intel/intel_tex.c b/src/mesa/drivers/dri/intel/intel_tex.c
index 3d9a2549db0..2c21ea0576e 100644
--- a/src/mesa/drivers/dri/intel/intel_tex.c
+++ b/src/mesa/drivers/dri/intel/intel_tex.c
@@ -61,88 +61,6 @@ intelFreeTextureImageData(struct gl_context * ctx, struct gl_texture_image *texI
    }
 }
 
-
-/* The system memcpy (at least on ubuntu 5.10) has problems copying
- * to agp (writecombined) memory from a source which isn't 64-byte
- * aligned - there is a 4x performance falloff.
- *
- * The x86 __memcpy is immune to this but is slightly slower
- * (10%-ish) than the system memcpy.
- *
- * The sse_memcpy seems to have a slight cliff at 64/32 bytes, but
- * isn't much faster than x86_memcpy for agp copies.
- * 
- * TODO: switch dynamically.
- */
-static void *
-do_memcpy(void *dest, const void *src, size_t n)
-{
-   if ((((unsigned long) src) & 63) || (((unsigned long) dest) & 63)) {
-      return __memcpy(dest, src, n);
-   }
-   else
-      return memcpy(dest, src, n);
-}
-
-
-#if DO_DEBUG && !defined(__ia64__)
-
-#ifndef __x86_64__
-static unsigned
-fastrdtsc(void)
-{
-   unsigned eax;
-   __asm__ volatile ("\t"
-                     "pushl  %%ebx\n\t"
-                     "cpuid\n\t" ".byte 0x0f, 0x31\n\t"
-                     "popl %%ebx\n":"=a" (eax)
-                     :"0"(0)
-                     :"ecx", "edx", "cc");
-
-   return eax;
-}
-#else
-static unsigned
-fastrdtsc(void)
-{
-   unsigned eax;
-   __asm__ volatile ("\t" "cpuid\n\t" ".byte 0x0f, 0x31\n\t":"=a" (eax)
-                     :"0"(0)
-                     :"ecx", "edx", "ebx", "cc");
-
-   return eax;
-}
-#endif
-
-static unsigned
-time_diff(unsigned t, unsigned t2)
-{
-   return ((t < t2) ? t2 - t : 0xFFFFFFFFU - (t - t2 - 1));
-}
-
-
-static void *
-timed_memcpy(void *dest, const void *src, size_t n)
-{
-   void *ret;
-   unsigned t1, t2;
-   double rate;
-
-   if ((((unsigned) src) & 63) || (((unsigned) dest) & 63))
-      printf("Warning - non-aligned texture copy!\n");
-
-   t1 = fastrdtsc();
-   ret = do_memcpy(dest, src, n);
-   t2 = fastrdtsc();
-
-   rate = time_diff(t1, t2);
-   rate /= (double) n;
-   printf("timed_memcpy: %u %u --> %f clocks/byte\n", t1, t2, rate);
-   return ret;
-}
-#endif /* DO_DEBUG */
-
-
 /**
  * Called via ctx->Driver.GenerateMipmap()
  * This is basically a wrapper for _mesa_meta_GenerateMipmap() which checks
@@ -158,8 +76,7 @@ intelGenerateMipmap(struct gl_context *ctx, GLenum target,
       struct intel_context *intel = intel_context(ctx);
       struct intel_texture_object *intelObj = intel_texture_object(texObj);
 
-      if (INTEL_DEBUG & DEBUG_FALLBACKS)
-	 fprintf(stderr, "%s - fallback to swrast\n", __FUNCTION__);
+      fallback_debug("%s - fallback to swrast\n", __FUNCTION__);
 
       intel_tex_map_level_images(intel, intelObj, texObj->BaseLevel);
       _mesa_generate_mipmap(ctx, target, texObj);
@@ -203,11 +120,4 @@ intelInitTextureFuncs(struct dd_function_table *functions)
    functions->NewTextureImage = intelNewTextureImage;
    functions->DeleteTexture = intelDeleteTextureObject;
    functions->FreeTexImageData = intelFreeTextureImageData;
-
-#if DO_DEBUG && !defined(__ia64__)
-   if (INTEL_DEBUG & DEBUG_BUFMGR)
-      functions->TextureMemCpy = timed_memcpy;
-   else
-#endif
-      functions->TextureMemCpy = do_memcpy;
 }
diff --git a/src/mesa/drivers/dri/intel/intel_tex_copy.c b/src/mesa/drivers/dri/intel/intel_tex_copy.c
index 2d046fd52d9..284ba19e8a3 100644
--- a/src/mesa/drivers/dri/intel/intel_tex_copy.c
+++ b/src/mesa/drivers/dri/intel/intel_tex_copy.c
@@ -105,16 +105,15 @@ do_copy_texsubimage(struct intel_context *intel,
    const struct intel_region *src = get_teximage_source(intel, internalFormat);
 
    if (!intelImage->mt || !src || !src->buffer) {
-      if (INTEL_DEBUG & DEBUG_FALLBACKS)
+      if (unlikely(INTEL_DEBUG & DEBUG_FALLBACKS))
 	 fprintf(stderr, "%s fail %p %p (0x%08x)\n",
 		 __FUNCTION__, intelImage->mt, src, internalFormat);
       return GL_FALSE;
    }
 
    if (intelImage->mt->cpp != src->cpp) {
-      if (INTEL_DEBUG & DEBUG_FALLBACKS)
-	 fprintf(stderr, "%s fail %d vs %d cpp\n",
-		 __FUNCTION__, intelImage->mt->cpp, src->cpp);
+      fallback_debug("%s fail %d vs %d cpp\n",
+		     __FUNCTION__, intelImage->mt->cpp, src->cpp);
       return GL_FALSE;
    }
 
@@ -212,8 +211,7 @@ intelCopyTexImage1D(struct gl_context * ctx, GLenum target, GLint level,
    return;
 
  fail:
-   if (INTEL_DEBUG & DEBUG_FALLBACKS)
-      fprintf(stderr, "%s - fallback to swrast\n", __FUNCTION__);
+   fallback_debug("%s - fallback to swrast\n", __FUNCTION__);
    _mesa_meta_CopyTexImage1D(ctx, target, level, internalFormat, x, y,
                              width, border);
 }
@@ -261,8 +259,7 @@ intelCopyTexImage2D(struct gl_context * ctx, GLenum target, GLint level,
    return;
 
  fail:
-   if (INTEL_DEBUG & DEBUG_FALLBACKS)
-      fprintf(stderr, "%s - fallback to swrast\n", __FUNCTION__);
+   fallback_debug("%s - fallback to swrast\n", __FUNCTION__);
    _mesa_meta_CopyTexImage2D(ctx, target, level, internalFormat, x, y,
                              width, height, border);
 }
@@ -287,8 +284,7 @@ intelCopyTexSubImage1D(struct gl_context * ctx, GLenum target, GLint level,
    if (!do_copy_texsubimage(intel_context(ctx), target,
                             intel_texture_image(texImage),
                             internalFormat, xoffset, 0, x, y, width, 1)) {
-      if (INTEL_DEBUG & DEBUG_FALLBACKS)
-         fprintf(stderr, "%s - fallback to swrast\n", __FUNCTION__);
+      fallback_debug("%s - fallback to swrast\n", __FUNCTION__);
       _mesa_meta_CopyTexSubImage1D(ctx, target, level, xoffset, x, y, width);
    }
 }
@@ -314,8 +310,7 @@ intelCopyTexSubImage2D(struct gl_context * ctx, GLenum target, GLint level,
                             internalFormat,
                             xoffset, yoffset, x, y, width, height)) {
 
-      if (INTEL_DEBUG & DEBUG_FALLBACKS)
-         fprintf(stderr, "%s - fallback to swrast\n", __FUNCTION__);
+      fallback_debug("%s - fallback to swrast\n", __FUNCTION__);
       _mesa_meta_CopyTexSubImage2D(ctx, target, level,
                                    xoffset, yoffset, x, y, width, height);
    }
diff --git a/src/mesa/drivers/dri/intel/intel_tex_image.c b/src/mesa/drivers/dri/intel/intel_tex_image.c
index 35f3d7d3829..50fe9bd9f33 100644
--- a/src/mesa/drivers/dri/intel/intel_tex_image.c
+++ b/src/mesa/drivers/dri/intel/intel_tex_image.c
@@ -66,7 +66,6 @@ guess_and_alloc_mipmap_tree(struct intel_context *intel,
    GLuint width = intelImage->base.Width;
    GLuint height = intelImage->base.Height;
    GLuint depth = intelImage->base.Depth;
-   GLuint l2width, l2height, l2depth;
    GLuint i, comp_byte = 0;
    GLuint texelBytes;
 
@@ -114,10 +113,7 @@ guess_and_alloc_mipmap_tree(struct intel_context *intel,
       lastLevel = firstLevel;
    }
    else {
-      l2width = logbase2(width);
-      l2height = logbase2(height);
-      l2depth = logbase2(depth);
-      lastLevel = firstLevel + MAX2(MAX2(l2width, l2height), l2depth);
+      lastLevel = firstLevel + logbase2(MAX2(MAX2(width, height), depth));
    }
 
    assert(!intelObj->mt);
@@ -347,21 +343,6 @@ intelTexImage(struct gl_context * ctx,
       texImage->Data = NULL;
    }
 
-   /* If this is the only texture image in the tree, could call
-    * bmBufferData with NULL data to free the old block and avoid
-    * waiting on any outstanding fences.
-    */
-   if (intelObj->mt &&
-       intelObj->mt->first_level == level &&
-       intelObj->mt->last_level == level &&
-       intelObj->mt->target != GL_TEXTURE_CUBE_MAP_ARB &&
-       !intel_miptree_match_image(intelObj->mt, &intelImage->base)) {
-
-      DBG("release it\n");
-      intel_miptree_release(intel, &intelObj->mt);
-      assert(!intelObj->mt);
-   }
-
    if (!intelObj->mt) {
       guess_and_alloc_mipmap_tree(intel, intelObj, intelImage, pixels == NULL);
       if (!intelObj->mt) {
diff --git a/src/mesa/drivers/dri/nouveau/Makefile b/src/mesa/drivers/dri/nouveau/Makefile
index 7be19b26fda..3b506a91ffa 100644
--- a/src/mesa/drivers/dri/nouveau/Makefile
+++ b/src/mesa/drivers/dri/nouveau/Makefile
@@ -19,6 +19,8 @@ DRIVER_SOURCES = \
 	nouveau_bo_state.c \
 	nouveau_texture.c \
 	nouveau_surface.c \
+	nouveau_scratch.c \
+	nouveau_array.c \
 	nv04_context.c \
 	nv04_render.c \
 	nv04_state_fb.c \
diff --git a/src/mesa/drivers/dri/nouveau/nouveau_array.c b/src/mesa/drivers/dri/nouveau/nouveau_array.c
new file mode 100644
index 00000000000..17e6d163a02
--- /dev/null
+++ b/src/mesa/drivers/dri/nouveau/nouveau_array.c
@@ -0,0 +1,136 @@
+/*
+ * Copyright (C) 2009-2010 Francisco Jerez.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial
+ * portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+
+#include "main/bufferobj.h"
+#include "nouveau_driver.h"
+#include "nouveau_array.h"
+#include "nouveau_bufferobj.h"
+
+static void
+get_array_extract(struct nouveau_array *a, extract_u_t *extract_u,
+		  extract_f_t *extract_f)
+{
+#define EXTRACT(in_t, out_t, k)						\
+	({								\
+		auto out_t f(struct nouveau_array *, int, int);		\
+		out_t f(struct nouveau_array *a, int i, int j) {	\
+			in_t x = ((in_t *)(a->buf + i * a->stride))[j];	\
+									\
+			return (out_t)x / (k);				\
+		};							\
+		f;							\
+	});
+	/* Store the unsigned and float extractors matching a->type. */
+	switch (a->type) {
+	case GL_BYTE: /* GLbyte is signed; plain char may be unsigned */
+		*extract_u = EXTRACT(signed char, unsigned, 1);
+		*extract_f = EXTRACT(signed char, float, SCHAR_MAX);
+		break;
+	case GL_UNSIGNED_BYTE:
+		*extract_u = EXTRACT(unsigned char, unsigned, 1);
+		*extract_f = EXTRACT(unsigned char, float, UCHAR_MAX);
+		break;
+	case GL_SHORT:
+		*extract_u = EXTRACT(short, unsigned, 1);
+		*extract_f = EXTRACT(short, float, SHRT_MAX);
+		break;
+	case GL_UNSIGNED_SHORT:
+		*extract_u = EXTRACT(unsigned short, unsigned, 1);
+		*extract_f = EXTRACT(unsigned short, float, USHRT_MAX);
+		break;
+	case GL_INT:
+		*extract_u = EXTRACT(int, unsigned, 1);
+		*extract_f = EXTRACT(int, float, INT_MAX);
+		break;
+	case GL_UNSIGNED_INT:
+		*extract_u = EXTRACT(unsigned int, unsigned, 1);
+		*extract_f = EXTRACT(unsigned int, float, UINT_MAX);
+		break;
+	case GL_FLOAT:
+		*extract_u = EXTRACT(float, unsigned, 1.0 / UINT_MAX);
+		*extract_f = EXTRACT(float, float, 1);
+		break;
+	default:
+		assert(0);
+	}
+}
+
+void
+nouveau_init_array(struct nouveau_array *a, int attr, int stride,
+		   int fields, int type, struct gl_buffer_object *obj,
+		   const void *ptr, GLboolean map)
+{
+	a->attr = attr;
+	a->stride = stride;
+	a->fields = fields;
+	a->type = type;
+	a->buf = NULL;
+
+	if (obj) {
+		if (nouveau_bufferobj_hw(obj)) {
+			struct nouveau_bufferobj *nbo =
+				to_nouveau_bufferobj(obj);
+
+			nouveau_bo_ref(nbo->bo, &a->bo);
+			a->offset = (intptr_t)ptr;
+
+			if (map) {
+				nouveau_bo_map(a->bo, NOUVEAU_BO_RD);
+				a->buf = a->bo->map + a->offset;
+			}
+
+		} else {
+			nouveau_bo_ref(NULL, &a->bo);
+			a->offset = 0;
+
+			if (map)
+				a->buf = ADD_POINTERS(
+					nouveau_bufferobj_sys(obj), ptr);
+		}
+	}
+
+	if (a->buf)
+		get_array_extract(a, &a->extract_u, &a->extract_f);
+}
+
+void
+nouveau_deinit_array(struct nouveau_array *a)
+{
+	if (a->bo) {
+		if (a->bo->map)
+			nouveau_bo_unmap(a->bo);
+	}
+
+	a->buf = NULL;
+	a->fields = 0;
+}
+
+void
+nouveau_cleanup_array(struct nouveau_array *a)
+{
+	nouveau_deinit_array(a);
+	nouveau_bo_ref(NULL, &a->bo);
+}
diff --git a/src/mesa/drivers/dri/nouveau/nouveau_array.h b/src/mesa/drivers/dri/nouveau/nouveau_array.h
new file mode 100644
index 00000000000..ad3d69b33d9
--- /dev/null
+++ b/src/mesa/drivers/dri/nouveau/nouveau_array.h
@@ -0,0 +1,58 @@
+/*
+ * Copyright (C) 2009-2010 Francisco Jerez.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial
+ * portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+
+#ifndef __NOUVEAU_ARRAY_H__
+#define __NOUVEAU_ARRAY_H__
+
+struct nouveau_array;
+
+typedef unsigned (*extract_u_t)(struct nouveau_array *, int, int);
+typedef float (*extract_f_t)(struct nouveau_array *, int, int);
+
+struct nouveau_array {
+	int attr;
+	int stride, fields, type;
+
+	struct nouveau_bo *bo;
+	unsigned offset;
+	const void *buf;
+
+	extract_u_t extract_u;
+	extract_f_t extract_f;
+};
+
+void
+nouveau_init_array(struct nouveau_array *a, int attr, int stride,
+		   int fields, int type, struct gl_buffer_object *obj,
+		   const void *ptr, GLboolean map);
+
+void
+nouveau_deinit_array(struct nouveau_array *a);
+
+void
+nouveau_cleanup_array(struct nouveau_array *a);
+
+#endif
diff --git a/src/mesa/drivers/dri/nouveau/nouveau_bo_state.c b/src/mesa/drivers/dri/nouveau/nouveau_bo_state.c
index f31772fe1d1..7eef8c1ee81 100644
--- a/src/mesa/drivers/dri/nouveau/nouveau_bo_state.c
+++ b/src/mesa/drivers/dri/nouveau/nouveau_bo_state.c
@@ -126,13 +126,13 @@ void
 nouveau_bo_context_reset(struct nouveau_bo_context *bctx)
 {
 	struct nouveau_bo_state *s = &to_nouveau_context(bctx->ctx)->bo;
-	int i;
-
-	for (i = 0; i < bctx->count; i++)
-		nouveau_bo_ref(NULL, &bctx->marker[i].bo);
+	int i, n = bctx->count;
 
-	s->count -= bctx->count;
+	s->count -= n;
 	bctx->count = 0;
+
+	for (i = 0; i < n; i++)
+		nouveau_bo_ref(NULL, &bctx->marker[i].bo);
 }
 
 GLboolean
diff --git a/src/mesa/drivers/dri/nouveau/nouveau_bo_state.h b/src/mesa/drivers/dri/nouveau/nouveau_bo_state.h
index 6119a8336e3..388a16a56ea 100644
--- a/src/mesa/drivers/dri/nouveau/nouveau_bo_state.h
+++ b/src/mesa/drivers/dri/nouveau/nouveau_bo_state.h
@@ -29,7 +29,7 @@
 
 enum {
 	NOUVEAU_BO_CONTEXT_FRAMEBUFFER = 0,
-	NOUVEAU_BO_CONTEXT_LMA_DEPTH,
+	NOUVEAU_BO_CONTEXT_HIERZ,
 	NOUVEAU_BO_CONTEXT_SURFACE,
 	NOUVEAU_BO_CONTEXT_TEXTURE0,
 	NOUVEAU_BO_CONTEXT_TEXTURE1,
diff --git a/src/mesa/drivers/dri/nouveau/nouveau_bufferobj.c b/src/mesa/drivers/dri/nouveau/nouveau_bufferobj.c
index ad6e5bd805a..e60b91f64be 100644
--- a/src/mesa/drivers/dri/nouveau/nouveau_bufferobj.c
+++ b/src/mesa/drivers/dri/nouveau/nouveau_bufferobj.c
@@ -30,6 +30,23 @@
 
 #include "main/bufferobj.h"
 
+static inline char *
+get_bufferobj_map(struct gl_buffer_object *obj, unsigned flags)
+{
+	struct nouveau_bufferobj *nbo = to_nouveau_bufferobj(obj);
+	void *map = NULL;
+
+	if (nbo->sys) {
+		map = nbo->sys;
+	} else if (nbo->bo) {
+		nouveau_bo_map(nbo->bo, flags);
+		map = nbo->bo->map;
+		nouveau_bo_unmap(nbo->bo);
+	}
+
+	return map;
+}
+
 static struct gl_buffer_object *
 nouveau_bufferobj_new(struct gl_context *ctx, GLuint buffer, GLenum target)
 {
@@ -50,6 +67,7 @@ nouveau_bufferobj_del(struct gl_context *ctx, struct gl_buffer_object *obj)
 	struct nouveau_bufferobj *nbo = to_nouveau_bufferobj(obj);
 
 	nouveau_bo_ref(NULL, &nbo->bo);
+	FREE(nbo->sys);
 	FREE(nbo);
 }
 
@@ -64,18 +82,27 @@ nouveau_bufferobj_data(struct gl_context *ctx, GLenum target, GLsizeiptrARB size
 	obj->Size = size;
 	obj->Usage = usage;
 
+	/* Free previous storage */
 	nouveau_bo_ref(NULL, &nbo->bo);
-	ret = nouveau_bo_new(context_dev(ctx),
-			     NOUVEAU_BO_GART | NOUVEAU_BO_MAP, 0,
-			     size, &nbo->bo);
-	assert(!ret);
-
-	if (data) {
-		nouveau_bo_map(nbo->bo, NOUVEAU_BO_WR);
-		memcpy(nbo->bo->map, data, size);
-		nouveau_bo_unmap(nbo->bo);
+	FREE(nbo->sys);
+	nbo->sys = NULL; /* the hardware BO path below must not see a stale pointer */
+	if (target == GL_ELEMENT_ARRAY_BUFFER_ARB ||
+	    (size < 512 && usage == GL_DYNAMIC_DRAW_ARB) ||
+	    context_chipset(ctx) < 0x10) {
+		/* Heuristic: keep it in system ram */
+		nbo->sys = MALLOC(size);
+
+	} else {
+		/* Get a hardware BO */
+		ret = nouveau_bo_new(context_dev(ctx),
+				     NOUVEAU_BO_GART | NOUVEAU_BO_MAP, 0,
+				     size, &nbo->bo);
+		assert(!ret);
 	}
 
+	if (data)
+		memcpy(get_bufferobj_map(obj, NOUVEAU_BO_WR), data, size);
+
 	return GL_TRUE;
 }
 
@@ -84,11 +111,7 @@ nouveau_bufferobj_subdata(struct gl_context *ctx, GLenum target, GLintptrARB off
 			  GLsizeiptrARB size, const GLvoid *data,
 			  struct gl_buffer_object *obj)
 {
-	struct nouveau_bufferobj *nbo = to_nouveau_bufferobj(obj);
-
-	nouveau_bo_map(nbo->bo, NOUVEAU_BO_WR);
-	memcpy(nbo->bo->map + offset, data, size);
-	nouveau_bo_unmap(nbo->bo);
+	memcpy(get_bufferobj_map(obj, NOUVEAU_BO_WR) + offset, data, size);
 }
 
 static void
@@ -96,44 +119,48 @@ nouveau_bufferobj_get_subdata(struct gl_context *ctx, GLenum target, GLintptrARB
 			   GLsizeiptrARB size, GLvoid *data,
 			   struct gl_buffer_object *obj)
 {
-	struct nouveau_bufferobj *nbo = to_nouveau_bufferobj(obj);
-
-	nouveau_bo_map(nbo->bo, NOUVEAU_BO_RD);
-	memcpy(data, nbo->bo->map + offset, size);
-	nouveau_bo_unmap(nbo->bo);
+	memcpy(data, get_bufferobj_map(obj, NOUVEAU_BO_RD) + offset, size);
 }
 
 static void *
 nouveau_bufferobj_map(struct gl_context *ctx, GLenum target, GLenum access,
 		   struct gl_buffer_object *obj)
 {
-	return ctx->Driver.MapBufferRange(ctx, target, 0, obj->Size, access,
+	unsigned flags = 0;
+
+	if (access == GL_READ_ONLY_ARB ||
+	    access == GL_READ_WRITE_ARB)
+		flags |= GL_MAP_READ_BIT;
+	if (access == GL_WRITE_ONLY_ARB ||
+	    access == GL_READ_WRITE_ARB)
+		flags |= GL_MAP_WRITE_BIT;
+
+	return ctx->Driver.MapBufferRange(ctx, target, 0, obj->Size, flags,
 					  obj);
 }
 
 static void *
 nouveau_bufferobj_map_range(struct gl_context *ctx, GLenum target, GLintptr offset,
-			    GLsizeiptr length, GLenum access,
+			    GLsizeiptr length, GLbitfield access,
 			    struct gl_buffer_object *obj)
 {
-	struct nouveau_bufferobj *nbo = to_nouveau_bufferobj(obj);
-	uint32_t flags = 0;
+	unsigned flags = 0;
+	char *map;
 
 	assert(!obj->Pointer);
 
-	if (!nbo->bo)
-		return NULL;
-
-	if (access == GL_READ_ONLY_ARB ||
-	    access == GL_READ_WRITE_ARB)
+	if (access & GL_MAP_READ_BIT)
 		flags |= NOUVEAU_BO_RD;
-	if (access == GL_WRITE_ONLY_ARB ||
-	    access == GL_READ_WRITE_ARB)
+	if (access & GL_MAP_WRITE_BIT)
 		flags |= NOUVEAU_BO_WR;
+	if (access & GL_MAP_UNSYNCHRONIZED_BIT)
+		flags |= NOUVEAU_BO_NOSYNC;
 
-	nouveau_bo_map_range(nbo->bo, offset, length, flags);
+	map = get_bufferobj_map(obj, flags);
+	if (!map)
+		return NULL;
 
-	obj->Pointer = nbo->bo->map;
+	obj->Pointer = map + offset;
 	obj->Offset = offset;
 	obj->Length = length;
 	obj->AccessFlags = access;
@@ -144,12 +171,8 @@ nouveau_bufferobj_map_range(struct gl_context *ctx, GLenum target, GLintptr offs
 static GLboolean
 nouveau_bufferobj_unmap(struct gl_context *ctx, GLenum target, struct gl_buffer_object *obj)
 {
-	struct nouveau_bufferobj *nbo = to_nouveau_bufferobj(obj);
-
 	assert(obj->Pointer);
 
-	nouveau_bo_unmap(nbo->bo);
-
 	obj->Pointer = NULL;
 	obj->Offset = 0;
 	obj->Length = 0;
diff --git a/src/mesa/drivers/dri/nouveau/nouveau_bufferobj.h b/src/mesa/drivers/dri/nouveau/nouveau_bufferobj.h
index acfc4cb9a90..01ef0bad0fd 100644
--- a/src/mesa/drivers/dri/nouveau/nouveau_bufferobj.h
+++ b/src/mesa/drivers/dri/nouveau/nouveau_bufferobj.h
@@ -30,9 +30,16 @@
 struct nouveau_bufferobj {
 	struct gl_buffer_object base;
 	struct nouveau_bo *bo;
+	void *sys;
 };
 #define to_nouveau_bufferobj(x) ((struct nouveau_bufferobj *)(x))
 
+#define nouveau_bufferobj_hw(x) \
+	(_mesa_is_bufferobj(x) ? to_nouveau_bufferobj(x)->bo : NULL)
+
+#define nouveau_bufferobj_sys(x) \
+	(_mesa_is_bufferobj(x) ? to_nouveau_bufferobj(x)->sys : NULL)
+
 void
 nouveau_bufferobj_functions_init(struct dd_function_table *functions);
 
diff --git a/src/mesa/drivers/dri/nouveau/nouveau_class.h b/src/mesa/drivers/dri/nouveau/nouveau_class.h
index d41d431f796..687b847797b 100644
--- a/src/mesa/drivers/dri/nouveau/nouveau_class.h
+++ b/src/mesa/drivers/dri/nouveau/nouveau_class.h
@@ -4954,6 +4954,8 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #define  NV25TCL_DMA_IN_MEMORY5								0x000001a0
 #define  NV25TCL_DMA_IN_MEMORY8								0x000001ac
 #define  NV25TCL_DMA_IN_MEMORY9								0x000001b0
+#define  NV25TCL_HIERZ_PITCH								0x0000022c
+#define  NV25TCL_HIERZ_OFFSET								0x00000230
 
 #endif /* NOUVEAU_REG_H */
 
diff --git a/src/mesa/drivers/dri/nouveau/nouveau_context.c b/src/mesa/drivers/dri/nouveau/nouveau_context.c
index 0ace139b886..f80aaedb257 100644
--- a/src/mesa/drivers/dri/nouveau/nouveau_context.c
+++ b/src/mesa/drivers/dri/nouveau/nouveau_context.c
@@ -119,6 +119,7 @@ nouveau_context_init(struct gl_context *ctx, struct nouveau_screen *screen,
 
 	nouveau_state_init(ctx);
 	nouveau_bo_state_init(ctx);
+	nouveau_scratch_init(ctx);
 	_mesa_meta_init(ctx);
 	_swrast_CreateContext(ctx);
 	_vbo_CreateContext(ctx);
@@ -163,6 +164,7 @@ nouveau_context_deinit(struct gl_context *ctx)
 	if (nctx->hw.chan)
 		nouveau_channel_free(&nctx->hw.chan);
 
+	nouveau_scratch_destroy(ctx);
 	nouveau_bo_state_destroy(ctx);
 	_mesa_free_context_data(ctx);
 }
@@ -325,10 +327,12 @@ nouveau_fallback(struct gl_context *ctx, enum nouveau_fallback mode)
 
 	nctx->fallback = MAX2(HWTNL, mode);
 
-	if (mode < SWRAST)
+	if (mode < SWRAST) {
 		nouveau_state_emit(ctx);
-	else
+		nouveau_bo_state_emit(ctx);
+	} else {
 		FIRE_RING(context_chan(ctx));
+	}
 }
 
 static void
@@ -365,5 +369,6 @@ nouveau_validate_framebuffer(struct gl_context *ctx)
 		validate_framebuffer(dri_ctx, dri_read,
 				     &dri_ctx->dri2.read_stamp);
 
-	nouveau_state_emit(ctx);
+	if (ctx->NewState & _NEW_BUFFERS)
+		_mesa_update_state(ctx);
 }
diff --git a/src/mesa/drivers/dri/nouveau/nouveau_context.h b/src/mesa/drivers/dri/nouveau/nouveau_context.h
index 23a87256728..7ebc676379e 100644
--- a/src/mesa/drivers/dri/nouveau/nouveau_context.h
+++ b/src/mesa/drivers/dri/nouveau/nouveau_context.h
@@ -30,6 +30,7 @@
 #include "nouveau_screen.h"
 #include "nouveau_state.h"
 #include "nouveau_bo_state.h"
+#include "nouveau_scratch.h"
 #include "nouveau_render.h"
 
 #include "main/bitset.h"
@@ -67,6 +68,7 @@ struct nouveau_context {
 	struct nouveau_hw_state hw;
 	struct nouveau_bo_state bo;
 	struct nouveau_render_state render;
+	struct nouveau_scratch_state scratch;
 
 	struct {
 		GLboolean clear_blocked;
diff --git a/src/mesa/drivers/dri/nouveau/nouveau_render.h b/src/mesa/drivers/dri/nouveau/nouveau_render.h
index 81c6119fcc6..0539c377585 100644
--- a/src/mesa/drivers/dri/nouveau/nouveau_render.h
+++ b/src/mesa/drivers/dri/nouveau/nouveau_render.h
@@ -28,46 +28,22 @@
 #define __NOUVEAU_RENDER_H__
 
 #include "vbo/vbo_context.h"
-
-struct nouveau_array_state;
+#include "nouveau_array.h"
 
 typedef void (*dispatch_t)(struct gl_context *, unsigned int, int, unsigned int);
-typedef unsigned (*extract_u_t)(struct nouveau_array_state *, int, int);
-typedef float (*extract_f_t)(struct nouveau_array_state *, int, int);
+typedef void (*emit_t)(struct gl_context *, struct nouveau_array *, const void *);
 
 struct nouveau_attr_info {
 	int vbo_index;
 	int imm_method;
 	int imm_fields;
 
-	void (*emit)(struct gl_context *, struct nouveau_array_state *, const void *);
-};
-
-struct nouveau_array_state {
-	int attr;
-	int stride, fields, type;
-
-	struct nouveau_bo *bo;
-	unsigned offset;
-	const void *buf;
-
-	extract_u_t extract_u;
-	extract_f_t extract_f;
-};
-
-#define RENDER_SCRATCH_COUNT 2
-#define RENDER_SCRATCH_SIZE 2*1024*1024
-
-struct nouveau_scratch_state {
-	struct nouveau_bo *bo[RENDER_SCRATCH_COUNT];
-
-	int index;
-	int offset;
-	void *buf;
+	emit_t emit;
 };
 
 struct nouveau_swtnl_state {
 	struct nouveau_bo *vbo;
+	unsigned offset;
 	void *buf;
 	unsigned vertex_count;
 	GLenum primitive;
@@ -79,8 +55,8 @@ struct nouveau_render_state {
 		IMM
 	} mode;
 
-	struct nouveau_array_state ib;
-	struct nouveau_array_state attrs[VERT_ATTRIB_MAX];
+	struct nouveau_array ib;
+	struct nouveau_array attrs[VERT_ATTRIB_MAX];
 
 	/* Maps a HW VBO index or IMM emission order to an index in
 	 * the attrs array above (or -1 if unused). */
@@ -89,10 +65,16 @@ struct nouveau_render_state {
 	int attr_count;
 	int vertex_size;
 
-	struct nouveau_scratch_state scratch;
 	struct nouveau_swtnl_state swtnl;
 };
 
 #define to_render_state(ctx) (&to_nouveau_context(ctx)->render)
 
+#define FOR_EACH_ATTR(render, i, attr)					\
+	for (i = 0; attr = (render)->map[i], i < NUM_VERTEX_ATTRS; i++)
+
+#define FOR_EACH_BOUND_ATTR(render, i, attr) /* skips map slots < 0 */	\
+	for (i = 0; attr = (render)->map[i], i < (render)->attr_count; i++) \
+		if (attr >= 0)
+
 #endif
diff --git a/src/mesa/drivers/dri/nouveau/nouveau_render_t.c b/src/mesa/drivers/dri/nouveau/nouveau_render_t.c
index dd38c14aa7c..e0cf727d11d 100644
--- a/src/mesa/drivers/dri/nouveau/nouveau_render_t.c
+++ b/src/mesa/drivers/dri/nouveau/nouveau_render_t.c
@@ -100,8 +100,8 @@
 /*
  * Select an appropriate dispatch function for the given index buffer.
  */
-static void
-get_array_dispatch(struct nouveau_array_state *a, dispatch_t *dispatch)
+static dispatch_t
+get_array_dispatch(struct nouveau_array *a)
 {
 	if (!a->fields) {
 		auto void f(struct gl_context *, unsigned int, int, unsigned int);
@@ -114,7 +114,7 @@ get_array_dispatch(struct nouveau_array_state *a, dispatch_t *dispatch)
 			EMIT_VBO(L, ctx, start, delta, n);
 		};
 
-		*dispatch = f;
+		return f;
 
 	} else if (a->type == GL_UNSIGNED_INT) {
 		auto void f(struct gl_context *, unsigned int, int, unsigned int);
@@ -127,7 +127,7 @@ get_array_dispatch(struct nouveau_array_state *a, dispatch_t *dispatch)
 			EMIT_VBO(I32, ctx, start, delta, n);
 		};
 
-		*dispatch = f;
+		return f;
 
 	} else {
 		auto void f(struct gl_context *, unsigned int, int, unsigned int);
@@ -141,115 +141,11 @@ get_array_dispatch(struct nouveau_array_state *a, dispatch_t *dispatch)
 			EMIT_VBO(I16, ctx, start, delta, n & ~1);
 		};
 
-		*dispatch = f;
+		return f;
 	}
 }
 
 /*
- * Select appropriate element extraction functions for the given
- * array.
- */
-static void
-get_array_extract(struct nouveau_array_state *a,
-		  extract_u_t *extract_u, extract_f_t *extract_f)
-{
-#define EXTRACT(in_t, out_t, k)						\
-	({								\
-		auto out_t f(struct nouveau_array_state *, int, int);	\
-		out_t f(struct nouveau_array_state *a, int i, int j) {	\
-			in_t x = ((in_t *)(a->buf + i * a->stride))[j];	\
-									\
-			return (out_t)x / (k);				\
-		};							\
-		f;							\
-	});
-
-	switch (a->type) {
-	case GL_BYTE:
-		*extract_u = EXTRACT(char, unsigned, 1);
-		*extract_f = EXTRACT(char, float, SCHAR_MAX);
-		break;
-	case GL_UNSIGNED_BYTE:
-		*extract_u = EXTRACT(unsigned char, unsigned, 1);
-		*extract_f = EXTRACT(unsigned char, float, UCHAR_MAX);
-		break;
-	case GL_SHORT:
-		*extract_u = EXTRACT(short, unsigned, 1);
-		*extract_f = EXTRACT(short, float, SHRT_MAX);
-		break;
-	case GL_UNSIGNED_SHORT:
-		*extract_u = EXTRACT(unsigned short, unsigned, 1);
-		*extract_f = EXTRACT(unsigned short, float, USHRT_MAX);
-		break;
-	case GL_INT:
-		*extract_u = EXTRACT(int, unsigned, 1);
-		*extract_f = EXTRACT(int, float, INT_MAX);
-		break;
-	case GL_UNSIGNED_INT:
-		*extract_u = EXTRACT(unsigned int, unsigned, 1);
-		*extract_f = EXTRACT(unsigned int, float, UINT_MAX);
-		break;
-	case GL_FLOAT:
-		*extract_u = EXTRACT(float, unsigned, 1.0 / UINT_MAX);
-		*extract_f = EXTRACT(float, float, 1);
-		break;
-
-	default:
-		assert(0);
-	}
-}
-
-/*
- * Returns a pointer to a chunk of <size> bytes long GART memory. <bo>
- * will be updated with the buffer object the memory is located in.
- *
- * If <offset> is provided, it will be updated with the offset within
- * <bo> of the allocated memory. Otherwise the returned memory will
- * always be located right at the beginning of <bo>.
- */
-static inline void *
-get_scratch_vbo(struct gl_context *ctx, unsigned size, struct nouveau_bo **bo,
-		unsigned *offset)
-{
-	struct nouveau_scratch_state *scratch = &to_render_state(ctx)->scratch;
-	void *buf;
-
-	if (scratch->buf && offset &&
-	    size <= RENDER_SCRATCH_SIZE - scratch->offset) {
-		nouveau_bo_ref(scratch->bo[scratch->index], bo);
-
-		buf = scratch->buf + scratch->offset;
-		*offset = scratch->offset;
-		scratch->offset += size;
-
-	} else if (size <= RENDER_SCRATCH_SIZE) {
-		scratch->index = (scratch->index + 1) % RENDER_SCRATCH_COUNT;
-		nouveau_bo_ref(scratch->bo[scratch->index], bo);
-
-		nouveau_bo_map(*bo, NOUVEAU_BO_WR);
-		buf = scratch->buf = (*bo)->map;
-		nouveau_bo_unmap(*bo);
-
-		if (offset)
-			*offset = 0;
-		scratch->offset = size;
-
-	} else {
-		nouveau_bo_new(context_dev(ctx),
-			       NOUVEAU_BO_MAP | NOUVEAU_BO_GART, 0, size, bo);
-
-		nouveau_bo_map(*bo, NOUVEAU_BO_WR);
-		buf = (*bo)->map;
-		nouveau_bo_unmap(*bo);
-
-		if (offset)
-			*offset = 0;
-	}
-
-	return buf;
-}
-
-/*
  * Returns how many vertices you can draw using <n> pushbuf dwords.
  */
 static inline unsigned
@@ -277,6 +173,11 @@ get_max_vertices(struct gl_context *ctx, const struct _mesa_index_buffer *ib,
 			case GL_UNSIGNED_BYTE:
 				max_out = MAX_OUT_I16;
 				break;
+
+			default:
+				assert(0);
+				max_out = 0;
+				break;
 			}
 		} else {
 			max_out = MAX_OUT_L;
@@ -286,76 +187,26 @@ get_max_vertices(struct gl_context *ctx, const struct _mesa_index_buffer *ib,
 	}
 }
 
-#include "nouveau_vbo_t.c"
-#include "nouveau_swtnl_t.c"
-
 static void
-TAG(emit_material)(struct gl_context *ctx, struct nouveau_array_state *a,
+TAG(emit_material)(struct gl_context *ctx, struct nouveau_array *a,
 		   const void *v)
 {
-	const int attr = a->attr - VERT_ATTRIB_GENERIC0;
-	const int state = ((int []) {
-				NOUVEAU_STATE_MATERIAL_FRONT_AMBIENT,
-				NOUVEAU_STATE_MATERIAL_BACK_AMBIENT,
-				NOUVEAU_STATE_MATERIAL_FRONT_DIFFUSE,
-				NOUVEAU_STATE_MATERIAL_BACK_DIFFUSE,
-				NOUVEAU_STATE_MATERIAL_FRONT_SPECULAR,
-				NOUVEAU_STATE_MATERIAL_BACK_SPECULAR,
-				NOUVEAU_STATE_MATERIAL_FRONT_AMBIENT,
-				NOUVEAU_STATE_MATERIAL_BACK_AMBIENT,
-				NOUVEAU_STATE_MATERIAL_FRONT_SHININESS,
-				NOUVEAU_STATE_MATERIAL_BACK_SHININESS
-			}) [attr];
+	int attr = a->attr - VERT_ATTRIB_GENERIC0;
+	int state = ((int []) {
+			NOUVEAU_STATE_MATERIAL_FRONT_AMBIENT,
+			NOUVEAU_STATE_MATERIAL_BACK_AMBIENT,
+			NOUVEAU_STATE_MATERIAL_FRONT_DIFFUSE,
+			NOUVEAU_STATE_MATERIAL_BACK_DIFFUSE,
+			NOUVEAU_STATE_MATERIAL_FRONT_SPECULAR,
+			NOUVEAU_STATE_MATERIAL_BACK_SPECULAR,
+			NOUVEAU_STATE_MATERIAL_FRONT_AMBIENT,
+			NOUVEAU_STATE_MATERIAL_BACK_AMBIENT,
+			NOUVEAU_STATE_MATERIAL_FRONT_SHININESS,
+			NOUVEAU_STATE_MATERIAL_BACK_SHININESS
+		}) [attr];
 
 	COPY_4V(ctx->Light.Material.Attrib[attr], (float *)v);
 	_mesa_update_material(ctx, 1 << attr);
 
 	context_drv(ctx)->emit[state](ctx, state);
 }
-
-static void
-TAG(render_prims)(struct gl_context *ctx, const struct gl_client_array **arrays,
-		  const struct _mesa_prim *prims, GLuint nr_prims,
-		  const struct _mesa_index_buffer *ib,
-		  GLboolean index_bounds_valid,
-		  GLuint min_index, GLuint max_index)
-{
-	struct nouveau_context *nctx = to_nouveau_context(ctx);
-
-	nouveau_validate_framebuffer(ctx);
-
-	if (nctx->fallback == HWTNL)
-		TAG(vbo_render_prims)(ctx, arrays, prims, nr_prims, ib,
-				      index_bounds_valid, min_index, max_index);
-
-	if (nctx->fallback == SWTNL)
-		_tnl_vbo_draw_prims(ctx, arrays, prims, nr_prims, ib,
-				    index_bounds_valid, min_index, max_index);
-}
-
-void
-TAG(render_init)(struct gl_context *ctx)
-{
-	struct nouveau_render_state *render = to_render_state(ctx);
-	struct nouveau_scratch_state *scratch = &render->scratch;
-	int ret, i;
-
-	for (i = 0; i < RENDER_SCRATCH_COUNT; i++) {
-		ret = nouveau_bo_new(context_dev(ctx),
-				     NOUVEAU_BO_MAP | NOUVEAU_BO_GART,
-				     0, RENDER_SCRATCH_SIZE, &scratch->bo[i]);
-		assert(!ret);
-	}
-
-	for (i = 0; i < VERT_ATTRIB_MAX; i++)
-		render->map[i] = -1;
-
-	TAG(swtnl_init)(ctx);
-	vbo_set_draw_func(ctx, TAG(render_prims));
-}
-
-void
-TAG(render_destroy)(struct gl_context *ctx)
-{
-	TAG(swtnl_destroy)(ctx);
-}
diff --git a/src/mesa/drivers/dri/nouveau/nouveau_scratch.c b/src/mesa/drivers/dri/nouveau/nouveau_scratch.c
new file mode 100644
index 00000000000..ddda67b2f14
--- /dev/null
+++ b/src/mesa/drivers/dri/nouveau/nouveau_scratch.c
@@ -0,0 +1,98 @@
+/*
+ * Copyright (C) 2009-2010 Francisco Jerez.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial
+ * portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+
+#include "nouveau_driver.h"
+#include "nouveau_context.h"
+
+/*
+ * Returns a CPU pointer to a chunk of 'size' bytes long GART memory.
+ * '*bo' and '*offset' are set to the buffer object and offset backing it.
+ */
+void *
+nouveau_get_scratch(struct gl_context *ctx, unsigned size,
+		    struct nouveau_bo **bo, unsigned *offset)
+{
+	struct nouveau_scratch_state *scratch =
+		&to_nouveau_context(ctx)->scratch;
+	void *buf;
+	int ret;
+
+	if (scratch->buf && size <= NOUVEAU_SCRATCH_SIZE - scratch->offset) {
+		nouveau_bo_ref(scratch->bo[scratch->index], bo);
+		buf = (char *)scratch->buf + scratch->offset;
+		*offset = scratch->offset;
+		scratch->offset += size;
+	} else if (size <= NOUVEAU_SCRATCH_SIZE) {
+		/* Current buffer is full, move on to the next one. */
+		scratch->index = (scratch->index + 1) % NOUVEAU_SCRATCH_COUNT;
+		nouveau_bo_ref(scratch->bo[scratch->index], bo);
+		ret = nouveau_bo_map(*bo, NOUVEAU_BO_WR);
+		assert(!ret);
+		buf = scratch->buf = (*bo)->map;
+		nouveau_bo_unmap(*bo);
+		*offset = 0;
+		scratch->offset = size;
+	} else {
+		/* Too big for the shared buffers, allocate a one-off bo. */
+		ret = nouveau_bo_new(context_dev(ctx),
+				     NOUVEAU_BO_MAP | NOUVEAU_BO_GART,
+				     0, size, bo);
+		assert(!ret);
+		ret = nouveau_bo_map(*bo, NOUVEAU_BO_WR);
+		assert(!ret);
+		buf = (*bo)->map;
+		nouveau_bo_unmap(*bo);
+		*offset = 0;
+	}
+
+	return buf;
+}
+
+/* Allocate the buffer objects nouveau_get_scratch() hands chunks out of. */
+void
+nouveau_scratch_init(struct gl_context *ctx)
+{
+	struct nouveau_bo **bos = to_nouveau_context(ctx)->scratch.bo;
+	int i, ret;
+
+	for (i = 0; i < NOUVEAU_SCRATCH_COUNT; i++) {
+		ret = nouveau_bo_new(context_dev(ctx),
+				     NOUVEAU_BO_MAP | NOUVEAU_BO_GART,
+				     0, NOUVEAU_SCRATCH_SIZE, &bos[i]);
+		assert(!ret);
+	}
+}
+
+/* Release the references held on the scratch buffer objects. */
+void
+nouveau_scratch_destroy(struct gl_context *ctx)
+{
+	struct nouveau_bo **bos = to_nouveau_context(ctx)->scratch.bo;
+	int i;
+
+	for (i = 0; i < NOUVEAU_SCRATCH_COUNT; i++)
+		nouveau_bo_ref(NULL, &bos[i]);
+}
diff --git a/src/mesa/drivers/dri/nouveau/nouveau_scratch.h b/src/mesa/drivers/dri/nouveau/nouveau_scratch.h
new file mode 100644
index 00000000000..b60b33dd1ac
--- /dev/null
+++ b/src/mesa/drivers/dri/nouveau/nouveau_scratch.h
@@ -0,0 +1,51 @@
+/*
+ * Copyright (C) 2009-2010 Francisco Jerez.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial
+ * portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+
+#ifndef __NOUVEAU_SCRATCH_H__
+#define __NOUVEAU_SCRATCH_H__
+
+#define NOUVEAU_SCRATCH_COUNT 2			/* scratch buffer objects */
+#define NOUVEAU_SCRATCH_SIZE (3*1024*1024)	/* bytes in each of them */
+
+struct nouveau_scratch_state {
+	struct nouveau_bo *bo[NOUVEAU_SCRATCH_COUNT]; /* set up by scratch_init */
+
+	int index;	/* entry of bo[] chunks are currently taken from */
+	int offset;	/* bytes of bo[index] already handed out */
+	void *buf;	/* CPU address of bo[index]; NULL is read as "none yet" */
+};
+
+void *
+nouveau_get_scratch(struct gl_context *ctx, unsigned size,
+		    struct nouveau_bo **bo, unsigned *offset);
+
+void
+nouveau_scratch_init(struct gl_context *ctx);
+
+void
+nouveau_scratch_destroy(struct gl_context *ctx);
+
+#endif
diff --git a/src/mesa/drivers/dri/nouveau/nouveau_state.c b/src/mesa/drivers/dri/nouveau/nouveau_state.c
index 7b7ddd2f54d..1579d29efc2 100644
--- a/src/mesa/drivers/dri/nouveau/nouveau_state.c
+++ b/src/mesa/drivers/dri/nouveau/nouveau_state.c
@@ -113,6 +113,12 @@ nouveau_depth_range(struct gl_context *ctx, GLclampd nearval, GLclampd farval)
 }
 
 static void
+nouveau_read_buffer(struct gl_context *ctx, GLenum buffer)
+{
+	nouveau_validate_framebuffer(ctx);
+}
+
+static void
 nouveau_draw_buffers(struct gl_context *ctx, GLsizei n, const GLenum *buffers)
 {
 	nouveau_validate_framebuffer(ctx);
@@ -512,6 +518,7 @@ nouveau_state_init(struct gl_context *ctx)
 	ctx->Driver.DepthFunc = nouveau_depth_func;
 	ctx->Driver.DepthMask = nouveau_depth_mask;
 	ctx->Driver.DepthRange = nouveau_depth_range;
+	ctx->Driver.ReadBuffer = nouveau_read_buffer;
 	ctx->Driver.DrawBuffers = nouveau_draw_buffers;
 	ctx->Driver.Enable = nouveau_enable;
 	ctx->Driver.Fogfv = nouveau_fog;
diff --git a/src/mesa/drivers/dri/nouveau/nouveau_swtnl_t.c b/src/mesa/drivers/dri/nouveau/nouveau_swtnl_t.c
index b3588e8fd39..f084f89d29e 100644
--- a/src/mesa/drivers/dri/nouveau/nouveau_swtnl_t.c
+++ b/src/mesa/drivers/dri/nouveau/nouveau_swtnl_t.c
@@ -28,6 +28,8 @@
 #include "tnl/t_pipeline.h"
 #include "tnl/t_vertex.h"
 
+#define SWTNL_VBO_SIZE 65536
+
 static enum tnl_attr_format
 swtnl_get_format(int type, int fields) {
 	switch (type) {
@@ -105,7 +107,7 @@ swtnl_choose_attrs(struct gl_context *ctx)
 	TNLcontext *tnl = TNL_CONTEXT(ctx);
 	struct tnl_clipspace *vtx = &tnl->clipspace;
 	static struct tnl_attr_map map[NUM_VERTEX_ATTRS];
-	int fields, i, n = 0;
+	int fields, attr, i, n = 0;
 
 	render->mode = VBO;
 	render->attr_count = NUM_VERTEX_ATTRS;
@@ -116,7 +118,7 @@ swtnl_choose_attrs(struct gl_context *ctx)
 	for (i = 0; i < VERT_ATTRIB_MAX; i++) {
 		struct nouveau_attr_info *ha = &TAG(vertex_attrs)[i];
 		struct swtnl_attr_info *sa = &swtnl_attrs[i];
-		struct nouveau_array_state *a = &render->attrs[i];
+		struct nouveau_array *a = &render->attrs[i];
 
 		if (!sa->fields)
 			continue; /* Unsupported attribute. */
@@ -141,13 +143,8 @@ swtnl_choose_attrs(struct gl_context *ctx)
 
 	_tnl_install_attrs(ctx, map, n, NULL, 0);
 
-	for (i = 0; i < vtx->attr_count; i++) {
-		struct tnl_clipspace_attr *ta = &vtx->attr[i];
-		struct nouveau_array_state *a = &render->attrs[ta->attrib];
-
-		a->stride = vtx->vertex_size;
-		a->offset = ta->vertoffset;
-	}
+	FOR_EACH_BOUND_ATTR(render, i, attr)
+		render->attrs[attr].stride = vtx->vertex_size;
 
 	TAG(render_set_format)(ctx);
 }
@@ -158,8 +155,8 @@ swtnl_alloc_vertices(struct gl_context *ctx)
 	struct nouveau_swtnl_state *swtnl = &to_render_state(ctx)->swtnl;
 
 	nouveau_bo_ref(NULL, &swtnl->vbo);
-	swtnl->buf = get_scratch_vbo(ctx, RENDER_SCRATCH_SIZE,
-				     &swtnl->vbo, NULL);
+	swtnl->buf = nouveau_get_scratch(ctx, SWTNL_VBO_SIZE, &swtnl->vbo,
+					 &swtnl->offset);
 	swtnl->vertex_count = 0;
 }
 
@@ -168,14 +165,15 @@ swtnl_bind_vertices(struct gl_context *ctx)
 {
 	struct nouveau_render_state *render = to_render_state(ctx);
 	struct nouveau_swtnl_state *swtnl = &render->swtnl;
+	struct tnl_clipspace *vtx = &TNL_CONTEXT(ctx)->clipspace;
 	int i;
 
-	for (i = 0; i < render->attr_count; i++) {
-		int attr = render->map[i];
+	for (i = 0; i < vtx->attr_count; i++) {
+		struct tnl_clipspace_attr *ta = &vtx->attr[i];
+		struct nouveau_array *a = &render->attrs[ta->attrib];
 
-		if (attr >= 0)
-			nouveau_bo_ref(swtnl->vbo,
-				       &render->attrs[attr].bo);
+		nouveau_bo_ref(swtnl->vbo, &a->bo);
+		a->offset = swtnl->offset + ta->vertoffset;
 	}
 
 	TAG(render_bind_vertices)(ctx);
@@ -185,15 +183,11 @@ static void
 swtnl_unbind_vertices(struct gl_context *ctx)
 {
 	struct nouveau_render_state *render = to_render_state(ctx);
-	int i;
-
-	for (i = 0; i < render->attr_count; i++) {
-		int *attr = &render->map[i];
+	int i, attr;
 
-		if (*attr >= 0) {
-			nouveau_bo_ref(NULL, &render->attrs[*attr].bo);
-			*attr = -1;
-		}
+	FOR_EACH_BOUND_ATTR(render, i, attr) {
+		nouveau_bo_ref(NULL, &render->attrs[attr].bo);
+		render->map[i] = -1;
 	}
 
 	render->attr_count = 0;
@@ -260,7 +254,7 @@ swtnl_reset_stipple(struct gl_context *ctx)
 	struct nouveau_swtnl_state *swtnl = &to_render_state(ctx)->swtnl; \
 	int vertex_len = TNL_CONTEXT(ctx)->clipspace.vertex_size;	\
 									\
-	if (swtnl->vertex_count + (n) > swtnl->vbo->size/vertex_len	\
+	if (swtnl->vertex_count + (n) > SWTNL_VBO_SIZE/vertex_len	\
 	    || (swtnl->vertex_count && swtnl->primitive != p))		\
 		swtnl_flush_vertices(ctx);				\
 									\
@@ -280,7 +274,7 @@ swtnl_points(struct gl_context *ctx, GLuint first, GLuint last)
 	while (first < last) {
 		BEGIN_PRIMITIVE(GL_POINTS, last - first);
 
-		count = MIN2(swtnl->vbo->size / vertex_len, last - first);
+		count = MIN2(SWTNL_VBO_SIZE / vertex_len, last - first);
 		for (i = 0; i < count; i++)
 			OUT_VERTEX(first + i);
 
@@ -316,7 +310,7 @@ swtnl_quad(struct gl_context *ctx, GLuint v1, GLuint v2, GLuint v3, GLuint v4)
 }
 
 /* TnL initialization. */
-static void
+void
 TAG(swtnl_init)(struct gl_context *ctx)
 {
 	TNLcontext *tnl = TNL_CONTEXT(ctx);
@@ -347,7 +341,7 @@ TAG(swtnl_init)(struct gl_context *ctx)
 	swtnl_alloc_vertices(ctx);
 }
 
-static void
+void
 TAG(swtnl_destroy)(struct gl_context *ctx)
 {
 	nouveau_bo_ref(NULL, &to_render_state(ctx)->swtnl.vbo);
diff --git a/src/mesa/drivers/dri/nouveau/nouveau_texture.c b/src/mesa/drivers/dri/nouveau/nouveau_texture.c
index cd063702af0..060c2c5bcc0 100644
--- a/src/mesa/drivers/dri/nouveau/nouveau_texture.c
+++ b/src/mesa/drivers/dri/nouveau/nouveau_texture.c
@@ -79,26 +79,65 @@ nouveau_teximage_free(struct gl_context *ctx, struct gl_texture_image *ti)
 }
 
 static void
-nouveau_teximage_map(struct gl_context *ctx, struct gl_texture_image *ti)
+nouveau_teximage_map(struct gl_context *ctx, struct gl_texture_image *ti,
+		     int access, int x, int y, int w, int h)
 {
-	struct nouveau_surface *s = &to_nouveau_teximage(ti)->surface;
-	int ret;
+	struct nouveau_teximage *nti = to_nouveau_teximage(ti);
+	struct nouveau_surface *s = &nti->surface;
+	struct nouveau_surface *st = &nti->transfer.surface;
 
 	if (s->bo) {
-		ret = nouveau_bo_map(s->bo, NOUVEAU_BO_RDWR);
-		assert(!ret);
-
-		ti->Data = s->bo->map;
+		if (!(access & GL_MAP_READ_BIT) &&
+		    nouveau_bo_pending(s->bo)) {
+			/*
+			 * Heuristic: use a bounce buffer to pipeline
+			 * teximage transfers.
+			 */
+			st->layout = LINEAR;
+			st->format = s->format;
+			st->cpp = s->cpp;
+			st->width = w;
+			st->height = h;
+			st->pitch = s->pitch;
+			nti->transfer.x = x;
+			nti->transfer.y = y;
+
+			ti->Data = nouveau_get_scratch(ctx, st->pitch * h,
+						       &st->bo, &st->offset);
+
+		} else {
+			int ret, flags = 0;
+
+			if (access & GL_MAP_READ_BIT)
+				flags |= NOUVEAU_BO_RD;
+			if (access & GL_MAP_WRITE_BIT)
+				flags |= NOUVEAU_BO_WR;
+
+			ret = nouveau_bo_map(s->bo, flags);
+			assert(!ret);
+
+			ti->Data = s->bo->map + y * s->pitch + x * s->cpp;
+		}
 	}
 }
 
 static void
 nouveau_teximage_unmap(struct gl_context *ctx, struct gl_texture_image *ti)
 {
-	struct nouveau_surface *s = &to_nouveau_teximage(ti)->surface;
+	struct nouveau_teximage *nti = to_nouveau_teximage(ti);
+	struct nouveau_surface *s = &nti->surface;
+	struct nouveau_surface *st = &nti->transfer.surface;
 
-	if (s->bo)
+	if (st->bo) {
+		context_drv(ctx)->surface_copy(ctx, s, st, nti->transfer.x,
+					       nti->transfer.y, 0, 0,
+					       st->width, st->height);
+		nouveau_surface_ref(NULL, st);
+
+	} else if (s->bo) {
 		nouveau_bo_unmap(s->bo);
+	}
+
 	ti->Data = NULL;
 }
 
@@ -115,6 +154,7 @@ nouveau_choose_tex_format(struct gl_context *ctx, GLint internalFormat,
 	case GL_RGBA12:
 	case GL_RGBA16:
 	case GL_RGB10_A2:
+	case GL_COMPRESSED_RGBA:
 		return MESA_FORMAT_ARGB8888;
 	case GL_RGB5_A1:
 		return MESA_FORMAT_ARGB1555;
@@ -124,6 +164,7 @@ nouveau_choose_tex_format(struct gl_context *ctx, GLint internalFormat,
 	case GL_RGB10:
 	case GL_RGB12:
 	case GL_RGB16:
+	case GL_COMPRESSED_RGB:
 		return MESA_FORMAT_XRGB8888;
 	case 3:
 	case GL_R3_G3_B2:
@@ -139,6 +180,7 @@ nouveau_choose_tex_format(struct gl_context *ctx, GLint internalFormat,
 	case GL_LUMINANCE12_ALPHA12:
 	case GL_LUMINANCE16_ALPHA16:
 	case GL_LUMINANCE8_ALPHA8:
+	case GL_COMPRESSED_LUMINANCE_ALPHA:
 		return MESA_FORMAT_ARGB8888;
 
 	case 1:
@@ -147,6 +189,7 @@ nouveau_choose_tex_format(struct gl_context *ctx, GLint internalFormat,
 	case GL_LUMINANCE12:
 	case GL_LUMINANCE16:
 	case GL_LUMINANCE8:
+	case GL_COMPRESSED_LUMINANCE:
 		return MESA_FORMAT_L8;
 
 	case GL_ALPHA:
@@ -154,6 +197,7 @@ nouveau_choose_tex_format(struct gl_context *ctx, GLint internalFormat,
 	case GL_ALPHA12:
 	case GL_ALPHA16:
 	case GL_ALPHA8:
+	case GL_COMPRESSED_ALPHA:
 		return MESA_FORMAT_A8;
 
 	case GL_INTENSITY:
@@ -356,7 +400,8 @@ nouveau_teximage(struct gl_context *ctx, GLint dims, GLenum target, GLint level,
 					     "glTexImage");
 	if (pixels) {
 		/* Store the pixel data. */
-		nouveau_teximage_map(ctx, ti);
+		nouveau_teximage_map(ctx, ti, GL_MAP_WRITE_BIT,
+				     0, 0, width, height);
 
 		ret = _mesa_texstore(ctx, dims, ti->_BaseFormat,
 				     ti->TexFormat, ti->Data,
@@ -443,13 +488,13 @@ nouveau_texsubimage(struct gl_context *ctx, GLint dims, GLenum target, GLint lev
 					     format, type, pixels, packing,
 					     "glTexSubImage");
 	if (pixels) {
-		nouveau_teximage_map(ctx, ti);
+		nouveau_teximage_map(ctx, ti, GL_MAP_WRITE_BIT,
+				     xoffset, yoffset, width, height);
 
 		ret = _mesa_texstore(ctx, 3, ti->_BaseFormat, ti->TexFormat,
-				     ti->Data, xoffset, yoffset, zoffset,
-				     s->pitch, ti->ImageOffsets,
-				     width, height, depth, format, type,
-				     pixels, packing);
+				     ti->Data, 0, 0, 0, s->pitch,
+				     ti->ImageOffsets, width, height, depth,
+				     format, type, pixels, packing);
 		assert(ret);
 
 		nouveau_teximage_unmap(ctx, ti);
@@ -508,7 +553,8 @@ nouveau_get_teximage(struct gl_context *ctx, GLenum target, GLint level,
 		     struct gl_texture_object *t,
 		     struct gl_texture_image *ti)
 {
-	nouveau_teximage_map(ctx, ti);
+	nouveau_teximage_map(ctx, ti, GL_MAP_READ_BIT,
+			     0, 0, ti->Width, ti->Height);
 	_mesa_get_teximage(ctx, target, level, format, type, pixels,
 			   t, ti);
 	nouveau_teximage_unmap(ctx, ti);
@@ -579,8 +625,11 @@ nouveau_texture_map(struct gl_context *ctx, struct gl_texture_object *t)
 	int i;
 
 	for (i = t->BaseLevel; i < t->_MaxLevel; i++) {
-		if (t->Image[0][i])
-			nouveau_teximage_map(ctx, t->Image[0][i]);
+		struct gl_texture_image *ti = t->Image[0][i];
+
+		if (ti)
+			nouveau_teximage_map(ctx, ti, GL_MAP_READ_BIT,
+					     0, 0, ti->Width, ti->Height);
 	}
 }
 
@@ -630,7 +679,8 @@ nouveau_generate_mipmap(struct gl_context *ctx, GLenum target,
 	if (_mesa_meta_check_generate_mipmap_fallback(ctx, target, t)) {
 		struct gl_texture_image *base = t->Image[0][t->BaseLevel];
 
-		nouveau_teximage_map(ctx, base);
+		nouveau_teximage_map(ctx, base, GL_MAP_READ_BIT,
+				     0, 0, base->Width, base->Height);
 		_mesa_generate_mipmap(ctx, target, t);
 		nouveau_teximage_unmap(ctx, base);
 
diff --git a/src/mesa/drivers/dri/nouveau/nouveau_texture.h b/src/mesa/drivers/dri/nouveau/nouveau_texture.h
index fc170215f35..56e61c7337b 100644
--- a/src/mesa/drivers/dri/nouveau/nouveau_texture.h
+++ b/src/mesa/drivers/dri/nouveau/nouveau_texture.h
@@ -30,6 +30,10 @@
 struct nouveau_teximage {
 	struct gl_texture_image base;
 	struct nouveau_surface surface;
+	struct {
+		struct nouveau_surface surface;
+		int x, y;
+	} transfer;
 };
 #define to_nouveau_teximage(x) ((struct nouveau_teximage *)(x))
 
diff --git a/src/mesa/drivers/dri/nouveau/nouveau_vbo_t.c b/src/mesa/drivers/dri/nouveau/nouveau_vbo_t.c
index 394f3c9b500..7a0eb9fc23d 100644
--- a/src/mesa/drivers/dri/nouveau/nouveau_vbo_t.c
+++ b/src/mesa/drivers/dri/nouveau/nouveau_vbo_t.c
@@ -31,59 +31,11 @@
 #include "main/image.h"
 
 /* Arbitrary pushbuf length we can assume we can get with a single
- * WAIT_RING. */
+ * call to WAIT_RING. */
 #define PUSHBUF_DWORDS 65536
 
-/* Functions to set up struct nouveau_array_state from something like
- * a GL array or index buffer. */
-
-static void
-vbo_init_array(struct nouveau_array_state *a, int attr, int stride,
-	       int fields, int type, struct gl_buffer_object *obj,
-	       const void *ptr, GLboolean map)
-{
-	a->attr = attr;
-	a->stride = stride;
-	a->fields = fields;
-	a->type = type;
-
-	if (_mesa_is_bufferobj(obj)) {
-		nouveau_bo_ref(to_nouveau_bufferobj(obj)->bo, &a->bo);
-		a->offset = (intptr_t)ptr;
-
-		if (map) {
-			nouveau_bo_map(a->bo, NOUVEAU_BO_RD);
-			a->buf = a->bo->map + a->offset;
-		} else {
-			a->buf = NULL;
-		}
-
-	} else {
-		nouveau_bo_ref(NULL, &a->bo);
-		a->offset = 0;
-
-		if (map)
-			a->buf = ptr;
-		else
-			a->buf = NULL;
-	}
-
-	if (a->buf)
-		get_array_extract(a, &a->extract_u, &a->extract_f);
-}
-
-static void
-vbo_deinit_array(struct nouveau_array_state *a)
-{
-	if (a->bo) {
-		if (a->bo->map)
-			nouveau_bo_unmap(a->bo);
-		nouveau_bo_ref(NULL, &a->bo);
-	}
-
-	a->buf = NULL;
-	a->fields = 0;
-}
+/* Functions to turn GL arrays or index buffers into nouveau_array
+ * structures. */
 
 static int
 get_array_stride(struct gl_context *ctx, const struct gl_client_array *a)
@@ -102,48 +54,45 @@ vbo_init_arrays(struct gl_context *ctx, const struct _mesa_index_buffer *ib,
 		const struct gl_client_array **arrays)
 {
 	struct nouveau_render_state *render = to_render_state(ctx);
-	int i;
+	GLboolean imm = (render->mode == IMM);
+	int i, attr;
 
 	if (ib)
-		vbo_init_array(&render->ib, 0, 0, ib->count, ib->type,
-			       ib->obj, ib->ptr, GL_TRUE);
+		nouveau_init_array(&render->ib, 0, 0, ib->count, ib->type,
+				   ib->obj, ib->ptr, GL_TRUE);
 
-	for (i = 0; i < render->attr_count; i++) {
-		int attr = render->map[i];
+	FOR_EACH_BOUND_ATTR(render, i, attr) {
+		const struct gl_client_array *array = arrays[attr];
 
-		if (attr >= 0) {
-			const struct gl_client_array *array = arrays[attr];
-
-			vbo_init_array(&render->attrs[attr], attr,
-				       get_array_stride(ctx, array),
-				       array->Size, array->Type,
-				       array->BufferObj, array->Ptr,
-				       render->mode == IMM);
-		}
+		nouveau_init_array(&render->attrs[attr], attr,
+				   get_array_stride(ctx, array),
+				   array->Size, array->Type,
+				   imm ? array->BufferObj : NULL,
+				   array->Ptr, imm);
 	}
 }
 
 static void
 vbo_deinit_arrays(struct gl_context *ctx, const struct _mesa_index_buffer *ib,
-		const struct gl_client_array **arrays)
+		  const struct gl_client_array **arrays)
 {
 	struct nouveau_render_state *render = to_render_state(ctx);
-	int i;
+	int i, attr;
 
 	if (ib)
-		vbo_deinit_array(&render->ib);
+		nouveau_cleanup_array(&render->ib);
 
-	for (i = 0; i < render->attr_count; i++) {
-		int *attr = &render->map[i];
+	FOR_EACH_BOUND_ATTR(render, i, attr) {
+		struct nouveau_array *a = &render->attrs[attr];
 
-		if (*attr >= 0) {
-			vbo_deinit_array(&render->attrs[*attr]);
-			*attr = -1;
-		}
+		if (render->mode == IMM)
+			nouveau_bo_ref(NULL, &a->bo);
+
+		nouveau_deinit_array(a);
+		render->map[i] = -1;
 	}
 
 	render->attr_count = 0;
-	context_bctx(ctx, VERTEX);
 }
 
 /* Make some rendering decisions from the GL context. */
@@ -164,20 +113,16 @@ vbo_choose_render_mode(struct gl_context *ctx, const struct gl_client_array **ar
 			}
 		}
 	}
-
-	if (render->mode == VBO)
-		render->attr_count = NUM_VERTEX_ATTRS;
-	else
-		render->attr_count = 0;
 }
 
 static void
-vbo_emit_attr(struct gl_context *ctx, const struct gl_client_array **arrays, int attr)
+vbo_emit_attr(struct gl_context *ctx, const struct gl_client_array **arrays,
+	      int attr)
 {
 	struct nouveau_channel *chan = context_chan(ctx);
 	struct nouveau_render_state *render = to_render_state(ctx);
 	const struct gl_client_array *array = arrays[attr];
-	struct nouveau_array_state *a = &render->attrs[attr];
+	struct nouveau_array *a = &render->attrs[attr];
 	RENDER_LOCALS(ctx);
 
 	if (!array->StrideB) {
@@ -186,11 +131,11 @@ vbo_emit_attr(struct gl_context *ctx, const struct gl_client_array **arrays, int
 			return;
 
 		/* Constant attribute. */
-		vbo_init_array(a, attr, array->StrideB, array->Size,
-			       array->Type, array->BufferObj, array->Ptr,
-			       GL_TRUE);
+		nouveau_init_array(a, attr, array->StrideB, array->Size,
+				   array->Type, array->BufferObj, array->Ptr,
+				   GL_TRUE);
 		EMIT_IMM(ctx, a, 0);
-		vbo_deinit_array(a);
+		nouveau_deinit_array(a);
 
 	} else {
 		/* Varying attribute. */
@@ -199,10 +144,13 @@ vbo_emit_attr(struct gl_context *ctx, const struct gl_client_array **arrays, int
 		if (render->mode == VBO) {
 			render->map[info->vbo_index] = attr;
 			render->vertex_size += array->_ElementSize;
+			render->attr_count = MAX2(render->attr_count,
+						  info->vbo_index + 1);
 		} else {
 			render->map[render->attr_count++] = attr;
 			render->vertex_size += 4 * info->imm_fields;
 		}
+
 	}
 }
 
@@ -216,6 +164,7 @@ vbo_choose_attrs(struct gl_context *ctx, const struct gl_client_array **arrays)
 
 	/* Reset the vertex size. */
 	render->vertex_size = 0;
+	render->attr_count = 0;
 
 	vbo_emit_attr(ctx, arrays, VERT_ATTRIB_COLOR0);
 	if (ctx->Fog.ColorSumEnabled && !ctx->Light.Enabled)
@@ -233,7 +182,7 @@ vbo_choose_attrs(struct gl_context *ctx, const struct gl_client_array **arrays)
 	    (ctx->Texture._GenFlags & TEXGEN_NEED_NORMALS))
 		vbo_emit_attr(ctx, arrays, VERT_ATTRIB_NORMAL);
 
-	if (ctx->Light.Enabled) {
+	if (ctx->Light.Enabled && render->mode == IMM) {
 		vbo_emit_attr(ctx, arrays, MAT(FRONT_AMBIENT));
 		vbo_emit_attr(ctx, arrays, MAT(FRONT_DIFFUSE));
 		vbo_emit_attr(ctx, arrays, MAT(FRONT_SPECULAR));
@@ -254,17 +203,13 @@ static int
 get_max_client_stride(struct gl_context *ctx, const struct gl_client_array **arrays)
 {
 	struct nouveau_render_state *render = to_render_state(ctx);
-	int i, s = 0;
+	int i, attr, s = 0;
 
-	for (i = 0; i < render->attr_count; i++) {
-		int attr = render->map[i];
+	FOR_EACH_BOUND_ATTR(render, i, attr) {
+		const struct gl_client_array *a = arrays[attr];
 
-		if (attr >= 0) {
-			const struct gl_client_array *a = arrays[attr];
-
-			if (!_mesa_is_bufferobj(a->BufferObj))
-				s = MAX2(s, get_array_stride(ctx, a));
-		}
+		if (!_mesa_is_bufferobj(a->BufferObj))
+			s = MAX2(s, get_array_stride(ctx, a));
 	}
 
 	return s;
@@ -295,7 +240,7 @@ vbo_maybe_split(struct gl_context *ctx, const struct gl_client_array **arrays,
 	if (render->mode == VBO &&
 	    (stride = get_max_client_stride(ctx, arrays)))
 		    vert_avail = MIN2(vert_avail,
-				      RENDER_SCRATCH_SIZE / stride);
+				      NOUVEAU_SCRATCH_SIZE / stride);
 
 	if (max_index - min_index > vert_avail ||
 	    (ib && ib->count > idx_avail)) {
@@ -315,42 +260,93 @@ vbo_maybe_split(struct gl_context *ctx, const struct gl_client_array **arrays,
 
 /* VBO rendering path. */
 
+static GLboolean
+check_update_array(struct nouveau_array *a, unsigned offset,
+		   struct nouveau_bo *bo, int *pdelta)
+{
+	int delta = *pdelta;
+	GLboolean dirty;
+	/* Can 'a' keep its binding if its data now sits at (bo, offset)? */
+	if (a->bo == bo) {
+		if (delta < 0)
+			delta = ((int)offset - (int)a->offset) / a->stride;
+		/* Only a whole, non-negative number of strides is reusable. */
+		dirty = (delta < 0 ||
+			 offset != (a->offset + delta * a->stride));
+	} else {
+		dirty = GL_TRUE;
+	}
+	/* Hand back the element shift, or 0 when a rebind is required. */
+	*pdelta = (dirty ? 0 : delta);
+	return dirty;
+}
+
 static void
 vbo_bind_vertices(struct gl_context *ctx, const struct gl_client_array **arrays,
-		  GLint basevertex, GLuint min_index, GLuint max_index)
+		  int base, unsigned min_index, unsigned max_index, int *pdelta)
 {
 	struct nouveau_render_state *render = to_render_state(ctx);
-	int i;
+	struct nouveau_channel *chan = context_chan(ctx);
+	struct nouveau_bo *bo[NUM_VERTEX_ATTRS];
+	unsigned offset[NUM_VERTEX_ATTRS];
+	GLboolean dirty = GL_FALSE;
+	int i, j, attr;
+	RENDER_LOCALS(ctx);
 
-	for (i = 0; i < NUM_VERTEX_ATTRS; i++) {
-		int attr = render->map[i];
-
-		if (attr >= 0) {
-			const struct gl_client_array *array = arrays[attr];
-			struct nouveau_array_state *a = &render->attrs[attr];
-			unsigned delta = (basevertex + min_index)
-				* array->StrideB;
-
-			if (a->bo) {
-				/* Array in a buffer obj. */
-				a->offset = (intptr_t)array->Ptr + delta;
-			} else {
-				int j, n = max_index - min_index + 1;
-				char *sp = (char *)array->Ptr + delta;
-				char *dp = get_scratch_vbo(ctx, n * a->stride,
-							   &a->bo, &a->offset);
-
-				/* Array in client memory, move it to
-				 * a scratch buffer obj. */
-				for (j = 0; j < n; j++)
-					memcpy(dp + j * a->stride,
-					       sp + j * array->StrideB,
-					       a->stride);
-			}
+	*pdelta = -1;
+
+	FOR_EACH_BOUND_ATTR(render, i, attr) {
+		const struct gl_client_array *array = arrays[attr];
+		struct gl_buffer_object *obj = array->BufferObj;
+		struct nouveau_array *a = &render->attrs[attr];
+		unsigned delta = (base + min_index) * array->StrideB;
+
+		bo[i] = NULL;
+
+		if (nouveau_bufferobj_hw(obj)) {
+			/* Array in a buffer obj. */
+			nouveau_bo_ref(to_nouveau_bufferobj(obj)->bo, &bo[i]);
+			offset[i] = delta + (intptr_t)array->Ptr;
+
+		} else {
+			int n = max_index - min_index + 1;
+			char *sp = (char *)ADD_POINTERS(
+				nouveau_bufferobj_sys(obj), array->Ptr) + delta;
+			char *dp  = nouveau_get_scratch(ctx, n * a->stride,
+							&bo[i], &offset[i]);
+
+			/* Array in client memory, move it to a
+			 * scratch buffer obj. */
+			for (j = 0; j < n; j++)
+				memcpy(dp + j * a->stride,
+				       sp + j * array->StrideB,
+				       a->stride);
 		}
+
+		dirty |= check_update_array(a, offset[i], bo[i], pdelta);
+	}
+
+	*pdelta -= min_index;
+
+	if (dirty) {
+		/* Buffers changed, update the attribute binding. */
+		FOR_EACH_BOUND_ATTR(render, i, attr) {
+			struct nouveau_array *a = &render->attrs[attr];
+
+			nouveau_bo_ref(NULL, &a->bo);
+			a->offset = offset[i];
+			a->bo = bo[i];
+		}
+
+		TAG(render_bind_vertices)(ctx);
+
+	} else {
+		/* Just cleanup. */
+		FOR_EACH_BOUND_ATTR(render, i, attr)
+			nouveau_bo_ref(NULL, &bo[i]);
 	}
 
-	TAG(render_bind_vertices)(ctx);
+	BATCH_VALIDATE();
 }
 
 static void
@@ -360,12 +356,10 @@ vbo_draw_vbo(struct gl_context *ctx, const struct gl_client_array **arrays,
 	     GLuint max_index)
 {
 	struct nouveau_channel *chan = context_chan(ctx);
-	dispatch_t dispatch;
-	int delta = -min_index, basevertex = 0, i;
+	dispatch_t dispatch = get_array_dispatch(&to_render_state(ctx)->ib);
+	int i, delta = 0, basevertex = 0;
 	RENDER_LOCALS(ctx);
 
-	get_array_dispatch(&to_render_state(ctx)->ib, &dispatch);
-
 	TAG(render_set_format)(ctx);
 
 	for (i = 0; i < nr_prims; i++) {
@@ -374,8 +368,8 @@ vbo_draw_vbo(struct gl_context *ctx, const struct gl_client_array **arrays,
 
 		if (i == 0 || basevertex != prims[i].basevertex) {
 			basevertex = prims[i].basevertex;
-			vbo_bind_vertices(ctx, arrays, basevertex,
-					  min_index, max_index);
+			vbo_bind_vertices(ctx, arrays, basevertex, min_index,
+					  max_index, &delta);
 		}
 
 		if (count > get_max_vertices(ctx, ib, AVAIL_RING(chan)))
@@ -390,7 +384,7 @@ vbo_draw_vbo(struct gl_context *ctx, const struct gl_client_array **arrays,
 /* Immediate rendering path. */
 
 static unsigned
-extract_id(struct nouveau_array_state *a, int i, int j)
+extract_id(struct nouveau_array *a, int i, int j)
 {
 	return j;
 }
@@ -404,7 +398,7 @@ vbo_draw_imm(struct gl_context *ctx, const struct gl_client_array **arrays,
 	struct nouveau_render_state *render = to_render_state(ctx);
 	struct nouveau_channel *chan = context_chan(ctx);
 	extract_u_t extract = ib ? render->ib.extract_u : extract_id;
-	int i, j, k;
+	int i, j, k, attr;
 	RENDER_LOCALS(ctx);
 
 	for (i = 0; i < nr_prims; i++) {
@@ -421,9 +415,8 @@ vbo_draw_imm(struct gl_context *ctx, const struct gl_client_array **arrays,
 			j = prims[i].basevertex +
 				extract(&render->ib, 0, start);
 
-			for (k = 0; k < render->attr_count; k++)
-				EMIT_IMM(ctx, &render->attrs[render->map[k]],
-					 j);
+			FOR_EACH_BOUND_ATTR(render, k, attr)
+				EMIT_IMM(ctx, &render->attrs[attr], j);
 		}
 
 		BATCH_END();
@@ -433,7 +426,8 @@ vbo_draw_imm(struct gl_context *ctx, const struct gl_client_array **arrays,
 /* draw_prims entry point when we're doing hw-tnl. */
 
 static void
-TAG(vbo_render_prims)(struct gl_context *ctx, const struct gl_client_array **arrays,
+TAG(vbo_render_prims)(struct gl_context *ctx,
+		      const struct gl_client_array **arrays,
 		      const struct _mesa_prim *prims, GLuint nr_prims,
 		      const struct _mesa_index_buffer *ib,
 		      GLboolean index_bounds_valid,
@@ -462,3 +456,44 @@ TAG(vbo_render_prims)(struct gl_context *ctx, const struct gl_client_array **arr
 
 	vbo_deinit_arrays(ctx, ib, arrays);
 }
+
+/* VBO rendering entry points. */
+
+static void
+TAG(vbo_check_render_prims)(struct gl_context *ctx,
+			    const struct gl_client_array **arrays,
+			    const struct _mesa_prim *prims, GLuint nr_prims,
+			    const struct _mesa_index_buffer *ib,
+			    GLboolean index_bounds_valid,
+			    GLuint min_index, GLuint max_index)
+{
+	struct nouveau_context *nctx = to_nouveau_context(ctx);
+	/* Draw through the HW or SW TnL path, as nctx->fallback selects. */
+	nouveau_validate_framebuffer(ctx);
+
+	if (nctx->fallback == HWTNL)
+		TAG(vbo_render_prims)(ctx, arrays, prims, nr_prims, ib,
+				      index_bounds_valid, min_index, max_index);
+	/* NOTE(review): not an 'else'; fallback is re-read after HWTNL. */
+	if (nctx->fallback == SWTNL)
+		_tnl_vbo_draw_prims(ctx, arrays, prims, nr_prims, ib,
+				    index_bounds_valid, min_index, max_index);
+}
+
+void
+TAG(vbo_init)(struct gl_context *ctx)
+{
+	struct nouveau_render_state *render = to_render_state(ctx);
+	int i;
+	/* Start with every entry of the attribute map set to -1. */
+	for (i = 0; i < VERT_ATTRIB_MAX; i++)
+		render->map[i] = -1;
+	/* Install the draw entry point that checks nctx->fallback. */
+	vbo_set_draw_func(ctx, TAG(vbo_check_render_prims));
+	vbo_use_buffer_objects(ctx);
+}
+
+void
+TAG(vbo_destroy)(struct gl_context *ctx)
+{
+}
diff --git a/src/mesa/drivers/dri/nouveau/nv10_context.c b/src/mesa/drivers/dri/nouveau/nv10_context.c
index fdcb43b7718..de2c93ec815 100644
--- a/src/mesa/drivers/dri/nouveau/nv10_context.c
+++ b/src/mesa/drivers/dri/nouveau/nv10_context.c
@@ -24,6 +24,7 @@
  *
  */
 
+#include "main/state.h"
 #include "nouveau_driver.h"
 #include "nouveau_context.h"
 #include "nouveau_fbo.h"
@@ -184,6 +185,9 @@ nv10_clear(struct gl_context *ctx, GLbitfield buffers)
 			nv17_zclear(ctx, &buffers);
 		else
 			nv10_zclear(ctx, &buffers);
+
+		/* Emit the zclear state if it's dirty */
+		_mesa_update_state(ctx);
 	}
 
 	nouveau_clear(ctx, buffers);
@@ -407,7 +411,8 @@ nv10_context_destroy(struct gl_context *ctx)
 	struct nouveau_context *nctx = to_nouveau_context(ctx);
 
 	nv04_surface_takedown(ctx);
-	nv10_render_destroy(ctx);
+	nv10_swtnl_destroy(ctx);
+	nv10_vbo_destroy(ctx);
 
 	nouveau_grobj_free(&nctx->hw.eng3d);
 
@@ -463,7 +468,8 @@ nv10_context_create(struct nouveau_screen *screen, const struct gl_config *visua
 		goto fail;
 
 	nv10_hwctx_init(ctx);
-	nv10_render_init(ctx);
+	nv10_vbo_init(ctx);
+	nv10_swtnl_init(ctx);
 
 	return ctx;
 
diff --git a/src/mesa/drivers/dri/nouveau/nv10_driver.h b/src/mesa/drivers/dri/nouveau/nv10_driver.h
index dec3d64e7d2..6fdc4641623 100644
--- a/src/mesa/drivers/dri/nouveau/nv10_driver.h
+++ b/src/mesa/drivers/dri/nouveau/nv10_driver.h
@@ -45,10 +45,16 @@ nv10_transform_depth(struct gl_context *ctx, float z);
 
 /* nv10_render.c */
 void
-nv10_render_init(struct gl_context *ctx);
+nv10_vbo_init(struct gl_context *ctx);
 
 void
-nv10_render_destroy(struct gl_context *ctx);
+nv10_vbo_destroy(struct gl_context *ctx);
+
+void
+nv10_swtnl_init(struct gl_context *ctx);
+
+void
+nv10_swtnl_destroy(struct gl_context *ctx);
 
 /* nv10_state_fb.c */
 void
diff --git a/src/mesa/drivers/dri/nouveau/nv10_render.c b/src/mesa/drivers/dri/nouveau/nv10_render.c
index a03ace35366..7115739b5aa 100644
--- a/src/mesa/drivers/dri/nouveau/nv10_render.c
+++ b/src/mesa/drivers/dri/nouveau/nv10_render.c
@@ -32,7 +32,7 @@
 #define NUM_VERTEX_ATTRS 8
 
 static void
-nv10_emit_material(struct gl_context *ctx, struct nouveau_array_state *a,
+nv10_emit_material(struct gl_context *ctx, struct nouveau_array *a,
 		   const void *v);
 
 /* Vertex attribute format. */
@@ -111,13 +111,11 @@ nv10_render_set_format(struct gl_context *ctx)
 	struct nouveau_render_state *render = to_render_state(ctx);
 	struct nouveau_channel *chan = context_chan(ctx);
 	struct nouveau_grobj *celsius = context_eng3d(ctx);
-	int i, hw_format;
-
-	for (i = 0; i < NUM_VERTEX_ATTRS; i++) {
-		int attr = render->map[i];
+	int i, attr, hw_format;
 
+	FOR_EACH_ATTR(render, i, attr) {
 		if (attr >= 0) {
-			struct nouveau_array_state *a = &render->attrs[attr];
+			struct nouveau_array *a = &render->attrs[attr];
 
 			hw_format = a->stride << 8 |
 				a->fields << 4 |
@@ -140,31 +138,27 @@ nv10_render_bind_vertices(struct gl_context *ctx)
 {
 	struct nouveau_render_state *render = to_render_state(ctx);
 	struct nouveau_bo_context *bctx = context_bctx(ctx, VERTEX);
-	struct nouveau_channel *chan = context_chan(ctx);
 	struct nouveau_grobj *celsius = context_eng3d(ctx);
-	int i;
+	int i, attr;
 
-	for (i = 0; i < NUM_VERTEX_ATTRS; i++) {
-		int attr = render->map[i];
-
-		if (attr >= 0) {
-			struct nouveau_array_state *a = &render->attrs[attr];
+	FOR_EACH_BOUND_ATTR(render, i, attr) {
+		struct nouveau_array *a = &render->attrs[attr];
 
-			nouveau_bo_markl(bctx, celsius,
-					 NV10TCL_VTXBUF_ADDRESS(i),
-					 a->bo, a->offset,
-					 NOUVEAU_BO_GART | NOUVEAU_BO_RD);
-		}
+		nouveau_bo_markl(bctx, celsius,
+				 NV10TCL_VTXBUF_ADDRESS(i),
+				 a->bo, a->offset,
+				 NOUVEAU_BO_GART | NOUVEAU_BO_RD);
 	}
-
-	BEGIN_RING(chan, celsius, NV10TCL_VERTEX_ARRAY_VALIDATE, 1);
-	OUT_RING(chan, 0);
 }
 
 /* Vertex array rendering defs. */
 #define RENDER_LOCALS(ctx)					\
 	struct nouveau_grobj *celsius = context_eng3d(ctx)
 
+#define BATCH_VALIDATE()						\
+	BEGIN_RING(chan, celsius, NV10TCL_VERTEX_ARRAY_VALIDATE, 1);	\
+	OUT_RING(chan, 0)
+
 #define BATCH_BEGIN(prim)						\
 	BEGIN_RING(chan, celsius, NV10TCL_VERTEX_BUFFER_BEGIN_END, 1);	\
 	OUT_RING(chan, prim)
@@ -199,3 +193,5 @@ nv10_render_bind_vertices(struct gl_context *ctx)
 
 #define TAG(x) nv10_##x
 #include "nouveau_render_t.c"
+#include "nouveau_vbo_t.c"
+#include "nouveau_swtnl_t.c"
diff --git a/src/mesa/drivers/dri/nouveau/nv10_state_fb.c b/src/mesa/drivers/dri/nouveau/nv10_state_fb.c
index d87fe96b1c0..0fda9faf49b 100644
--- a/src/mesa/drivers/dri/nouveau/nv10_state_fb.c
+++ b/src/mesa/drivers/dri/nouveau/nv10_state_fb.c
@@ -51,11 +51,11 @@ get_rt_format(gl_format format)
 }
 
 static void
-setup_lma_buffer(struct gl_context *ctx)
+setup_hierz_buffer(struct gl_context *ctx)
 {
 	struct nouveau_channel *chan = context_chan(ctx);
 	struct nouveau_grobj *celsius = context_eng3d(ctx);
-	struct nouveau_bo_context *bctx = context_bctx(ctx, LMA_DEPTH);
+	struct nouveau_bo_context *bctx = context_bctx(ctx, HIERZ);
 	struct gl_framebuffer *fb = ctx->DrawBuffer;
 	struct nouveau_framebuffer *nfb = to_nouveau_framebuffer(fb);
 	unsigned pitch = align(fb->Width, 128),
@@ -135,7 +135,7 @@ nv10_emit_framebuffer(struct gl_context *ctx, int emit)
 				 s->bo, 0, bo_flags);
 
 		if (context_chipset(ctx) >= 0x17) {
-			setup_lma_buffer(ctx);
+			setup_hierz_buffer(ctx);
 			context_dirty(ctx, ZCLEAR);
 		}
 	}
diff --git a/src/mesa/drivers/dri/nouveau/nv20_context.c b/src/mesa/drivers/dri/nouveau/nv20_context.c
index c6111a2a9a0..89200fb70da 100644
--- a/src/mesa/drivers/dri/nouveau/nv20_context.c
+++ b/src/mesa/drivers/dri/nouveau/nv20_context.c
@@ -26,6 +26,8 @@
 
 #include "nouveau_driver.h"
 #include "nouveau_context.h"
+#include "nouveau_fbo.h"
+#include "nouveau_util.h"
 #include "nouveau_class.h"
 #include "nv04_driver.h"
 #include "nv10_driver.h"
@@ -40,6 +42,57 @@ static const struct dri_extension nv20_extensions[] = {
 };
 
 static void
+nv20_clear(struct gl_context *ctx, GLbitfield buffers)
+{
+	struct nouveau_channel *chan = context_chan(ctx);
+	struct nouveau_grobj *kelvin = context_eng3d(ctx);
+	struct gl_framebuffer *fb = ctx->DrawBuffer;
+	uint32_t clear = 0;
+	/* Driver.Clear hook: clear color/depth/stencil via CLEAR_BUFFERS. */
+	nouveau_validate_framebuffer(ctx);
+	/* Color: select only the channels enabled in ColorMask[0]. */
+	if (buffers & BUFFER_BITS_COLOR) {
+		struct nouveau_surface *s = &to_nouveau_renderbuffer(
+			fb->_ColorDrawBuffers[0])->surface;
+
+		if (ctx->Color.ColorMask[0][RCOMP])
+			clear |= NV20TCL_CLEAR_BUFFERS_COLOR_R;
+		if (ctx->Color.ColorMask[0][GCOMP])
+			clear |= NV20TCL_CLEAR_BUFFERS_COLOR_G;
+		if (ctx->Color.ColorMask[0][BCOMP])
+			clear |= NV20TCL_CLEAR_BUFFERS_COLOR_B;
+		if (ctx->Color.ColorMask[0][ACOMP])
+			clear |= NV20TCL_CLEAR_BUFFERS_COLOR_A;
+
+		BEGIN_RING(chan, kelvin, NV20TCL_CLEAR_VALUE, 1);
+		OUT_RING(chan, pack_rgba_f(s->format, ctx->Color.ClearColor));
+
+		buffers &= ~BUFFER_BITS_COLOR;
+	}
+	/* Depth/stencil: only the planes both requested and write-enabled. */
+	if (buffers & (BUFFER_BIT_DEPTH | BUFFER_BIT_STENCIL)) {
+		struct nouveau_surface *s = &to_nouveau_renderbuffer(
+			fb->_DepthBuffer->Wrapped)->surface;
+
+		if (buffers & BUFFER_BIT_DEPTH && ctx->Depth.Mask)
+			clear |= NV20TCL_CLEAR_BUFFERS_DEPTH;
+		if (buffers & BUFFER_BIT_STENCIL && ctx->Stencil.WriteMask[0])
+			clear |= NV20TCL_CLEAR_BUFFERS_STENCIL;
+
+		BEGIN_RING(chan, kelvin, NV20TCL_CLEAR_DEPTH_VALUE, 1);
+		OUT_RING(chan, pack_zs_f(s->format, ctx->Depth.Clear,
+					 ctx->Stencil.Clear));
+
+		buffers &= ~(BUFFER_BIT_DEPTH | BUFFER_BIT_STENCIL);
+	}
+	/* Submit the bits gathered above (emitted even when none is set). */
+	BEGIN_RING(chan, kelvin, NV20TCL_CLEAR_BUFFERS, 1);
+	OUT_RING(chan, clear);
+	/* Whatever is still set in 'buffers' is left to nouveau_clear(). */
+	nouveau_clear(ctx, buffers);
+}
+
+static void
 nv20_hwctx_init(struct gl_context *ctx)
 {
 	struct nouveau_channel *chan = context_chan(ctx);
@@ -134,10 +187,6 @@ nv20_hwctx_init(struct gl_context *ctx)
 	OUT_RING  (chan, 2);
 
 	if (context_chipset(ctx) >= 0x25) {
-		BEGIN_RING(chan, kelvin, 0x022c, 2);
-		OUT_RING  (chan, 0x280);
-		OUT_RING  (chan, 0x07d28000);
-
 		BEGIN_RING(chan, kelvin, 0x1da4, 1);
 		OUT_RING  (chan, 0);
 	}
@@ -376,7 +425,8 @@ nv20_context_destroy(struct gl_context *ctx)
 	struct nouveau_context *nctx = to_nouveau_context(ctx);
 
 	nv04_surface_takedown(ctx);
-	nv20_render_destroy(ctx);
+	nv20_swtnl_destroy(ctx);
+	nv20_vbo_destroy(ctx);
 
 	nouveau_grobj_free(&nctx->hw.eng3d);
 
@@ -410,6 +460,7 @@ nv20_context_create(struct nouveau_screen *screen, const struct gl_config *visua
 	ctx->Const.MaxTextureUnits = NV20_TEXTURE_UNITS;
 	ctx->Const.MaxTextureMaxAnisotropy = 8;
 	ctx->Const.MaxTextureLodBias = 15;
+	ctx->Driver.Clear = nv20_clear;
 
 	/* 2D engine. */
 	ret = nv04_surface_init(ctx);
@@ -428,7 +479,8 @@ nv20_context_create(struct nouveau_screen *screen, const struct gl_config *visua
 		goto fail;
 
 	nv20_hwctx_init(ctx);
-	nv20_render_init(ctx);
+	nv20_vbo_init(ctx);
+	nv20_swtnl_init(ctx);
 
 	return ctx;
 
diff --git a/src/mesa/drivers/dri/nouveau/nv20_driver.h b/src/mesa/drivers/dri/nouveau/nv20_driver.h
index 7fbe6ccfa68..f2a6097b937 100644
--- a/src/mesa/drivers/dri/nouveau/nv20_driver.h
+++ b/src/mesa/drivers/dri/nouveau/nv20_driver.h
@@ -39,10 +39,16 @@ extern const struct nouveau_driver nv20_driver;
 
 /* nv20_render.c */
 void
-nv20_render_init(struct gl_context *ctx);
+nv20_vbo_init(struct gl_context *ctx);
 
 void
-nv20_render_destroy(struct gl_context *ctx);
+nv20_vbo_destroy(struct gl_context *ctx);
+
+void
+nv20_swtnl_init(struct gl_context *ctx);
+
+void
+nv20_swtnl_destroy(struct gl_context *ctx);
 
 /* nv20_state_fb.c */
 void
diff --git a/src/mesa/drivers/dri/nouveau/nv20_render.c b/src/mesa/drivers/dri/nouveau/nv20_render.c
index 6b668544627..dbdb85da203 100644
--- a/src/mesa/drivers/dri/nouveau/nv20_render.c
+++ b/src/mesa/drivers/dri/nouveau/nv20_render.c
@@ -32,7 +32,7 @@
 #define NUM_VERTEX_ATTRS 16
 
 static void
-nv20_emit_material(struct gl_context *ctx, struct nouveau_array_state *a,
+nv20_emit_material(struct gl_context *ctx, struct nouveau_array *a,
 		   const void *v);
 
 /* Vertex attribute format. */
@@ -135,13 +135,11 @@ nv20_render_set_format(struct gl_context *ctx)
 	struct nouveau_render_state *render = to_render_state(ctx);
 	struct nouveau_channel *chan = context_chan(ctx);
 	struct nouveau_grobj *kelvin = context_eng3d(ctx);
-	int i, hw_format;
-
-	for (i = 0; i < NUM_VERTEX_ATTRS; i++) {
-		int attr = render->map[i];
+	int i, attr, hw_format;
 
+	FOR_EACH_ATTR(render, i, attr) {
 		if (attr >= 0) {
-			struct nouveau_array_state *a = &render->attrs[attr];
+			struct nouveau_array *a = &render->attrs[attr];
 
 			hw_format = a->stride << 8 |
 				a->fields << 4 |
@@ -162,33 +160,29 @@ nv20_render_bind_vertices(struct gl_context *ctx)
 {
 	struct nouveau_render_state *render = to_render_state(ctx);
 	struct nouveau_bo_context *bctx = context_bctx(ctx, VERTEX);
-	struct nouveau_channel *chan = context_chan(ctx);
 	struct nouveau_grobj *kelvin = context_eng3d(ctx);
-	int i;
+	int i, attr;
 
-	for (i = 0; i < NUM_VERTEX_ATTRS; i++) {
-		int attr = render->map[i];
+	FOR_EACH_BOUND_ATTR(render, i, attr) {
+		struct nouveau_array *a = &render->attrs[attr];
 
-		if (attr >= 0) {
-			struct nouveau_array_state *a = &render->attrs[attr];
-
-			nouveau_bo_mark(bctx, kelvin,
-					NV20TCL_VTXBUF_ADDRESS(i),
-					a->bo, a->offset, 0,
-					0, NV20TCL_VTXBUF_ADDRESS_DMA1,
-					NOUVEAU_BO_LOW | NOUVEAU_BO_OR |
-					NOUVEAU_BO_GART | NOUVEAU_BO_RD);
-		}
+		nouveau_bo_mark(bctx, kelvin,
+				NV20TCL_VTXBUF_ADDRESS(i),
+				a->bo, a->offset, 0,
+				0, NV20TCL_VTXBUF_ADDRESS_DMA1,
+				NOUVEAU_BO_LOW | NOUVEAU_BO_OR |
+				NOUVEAU_BO_GART | NOUVEAU_BO_RD);
 	}
-
-	BEGIN_RING(chan, kelvin, NV20TCL_VTX_CACHE_INVALIDATE, 1);
-	OUT_RING(chan, 0);
 }
 
 /* Vertex array rendering defs. */
 #define RENDER_LOCALS(ctx)					\
 	struct nouveau_grobj *kelvin = context_eng3d(ctx)
 
+#define BATCH_VALIDATE()						\
+	BEGIN_RING(chan, kelvin, NV20TCL_VTX_CACHE_INVALIDATE, 1);	\
+	OUT_RING(chan, 0)
+
 #define BATCH_BEGIN(prim)					\
 	BEGIN_RING(chan, kelvin, NV20TCL_VERTEX_BEGIN_END, 1);	\
 	OUT_RING(chan, prim)
@@ -223,3 +217,5 @@ nv20_render_bind_vertices(struct gl_context *ctx)
 
 #define TAG(x) nv20_##x
 #include "nouveau_render_t.c"
+#include "nouveau_vbo_t.c"
+#include "nouveau_swtnl_t.c"
diff --git a/src/mesa/drivers/dri/nouveau/nv20_state_fb.c b/src/mesa/drivers/dri/nouveau/nv20_state_fb.c
index 7822ca2a098..854392f9ff3 100644
--- a/src/mesa/drivers/dri/nouveau/nv20_state_fb.c
+++ b/src/mesa/drivers/dri/nouveau/nv20_state_fb.c
@@ -51,6 +51,31 @@ get_rt_format(gl_format format)
 	}
 }
 
+static void
+setup_hierz_buffer(struct gl_context *ctx)
+{
+	struct nouveau_channel *chan = context_chan(ctx);
+	struct nouveau_grobj *kelvin = context_eng3d(ctx);
+	struct nouveau_bo_context *bctx = context_bctx(ctx, HIERZ);
+	struct gl_framebuffer *fb = ctx->DrawBuffer;
+	struct nouveau_framebuffer *nfb = to_nouveau_framebuffer(fb);
+	unsigned pitch = align(fb->Width, 128),
+		height = align(fb->Height, 2),
+		size = pitch * height;
+	/* (Re)create the HIERZ bo if there is none or its size is stale. */
+	if (!nfb->hierz.bo || nfb->hierz.bo->size != size) {
+		nouveau_bo_ref(NULL, &nfb->hierz.bo);
+		nouveau_bo_new(context_dev(ctx), NOUVEAU_BO_VRAM, 0, size,
+			       &nfb->hierz.bo); /* NOTE(review): result is not checked */
+	}
+	/* Emit the pitch, then reference the bo for NV25TCL_HIERZ_OFFSET. */
+	BEGIN_RING(chan, kelvin, NV25TCL_HIERZ_PITCH, 1);
+	OUT_RING(chan, pitch);
+
+	nouveau_bo_markl(bctx, kelvin, NV25TCL_HIERZ_OFFSET, nfb->hierz.bo,
+			 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RDWR);
+}
+
 void
 nv20_emit_framebuffer(struct gl_context *ctx, int emit)
 {
@@ -88,6 +113,9 @@ nv20_emit_framebuffer(struct gl_context *ctx, int emit)
 
 		nouveau_bo_markl(bctx, kelvin, NV20TCL_ZETA_OFFSET,
 				 s->bo, 0, bo_flags);
+
+		if (context_chipset(ctx) >= 0x25)
+			setup_hierz_buffer(ctx);
 	} else {
 		rt_format |= get_rt_format(MESA_FORMAT_Z24_S8);
 		zeta_pitch = rt_pitch;
diff --git a/src/mesa/drivers/dri/r200/r200_context.c b/src/mesa/drivers/dri/r200/r200_context.c
index 723e31401de..5abfc9dac51 100644
--- a/src/mesa/drivers/dri/r200/r200_context.c
+++ b/src/mesa/drivers/dri/r200/r200_context.c
@@ -71,6 +71,8 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #define need_GL_NV_vertex_program
 #define need_GL_ARB_point_parameters
 #define need_GL_EXT_framebuffer_object
+#define need_GL_OES_EGL_image
+
 #include "main/remap_helper.h"
 
 #define DRIVER_DATE	"20060602"
@@ -137,6 +139,9 @@ static const struct dri_extension card_extensions[] =
     { "GL_ATI_texture_mirror_once",        NULL },
     { "GL_MESA_pack_invert",               NULL },
     { "GL_NV_blend_square",                NULL },
+#if FEATURE_OES_EGL_image
+    { "GL_OES_EGL_image",                  GL_OES_EGL_image_functions },
+#endif
     { NULL,                                NULL }
 };
 
diff --git a/src/mesa/drivers/dri/r200/r200_swtcl.c b/src/mesa/drivers/dri/r200/r200_swtcl.c
index 38864162ced..c56a49d5ad6 100644
--- a/src/mesa/drivers/dri/r200/r200_swtcl.c
+++ b/src/mesa/drivers/dri/r200/r200_swtcl.c
@@ -319,10 +319,9 @@ static INLINE GLuint reduced_hw_prim( struct gl_context *ctx, GLuint prim)
 {
    switch (prim) {
    case GL_POINTS:
-      return (ctx->Point.PointSprite ||
-	 ((ctx->_TriangleCaps & (DD_POINT_SIZE | DD_POINT_ATTEN)) &&
-	 !(ctx->_TriangleCaps & (DD_POINT_SMOOTH)))) ?
-	 R200_VF_PRIM_POINT_SPRITES : R200_VF_PRIM_POINTS;
+      return (((R200_CONTEXT(ctx))->radeon.radeonScreen->drmSupportsPointSprites &&
+              !(ctx->_TriangleCaps & DD_POINT_SMOOTH)) ?
+	 R200_VF_PRIM_POINT_SPRITES : R200_VF_PRIM_POINTS);
    case GL_LINES:
    /* fallthrough */
    case GL_LINE_LOOP:
diff --git a/src/mesa/drivers/dri/r200/r200_tcl.c b/src/mesa/drivers/dri/r200/r200_tcl.c
index 84db7c9d4eb..7aed116f0b3 100644
--- a/src/mesa/drivers/dri/r200/r200_tcl.c
+++ b/src/mesa/drivers/dri/r200/r200_tcl.c
@@ -68,9 +68,8 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #define HAVE_ELTS        1
 
 
-#define HW_POINTS           ((ctx->Point.PointSprite || \
-				((ctx->_TriangleCaps & (DD_POINT_SIZE | DD_POINT_ATTEN)) && \
-	 			!(ctx->_TriangleCaps & (DD_POINT_SMOOTH)))) ? \
+#define HW_POINTS           (((R200_CONTEXT(ctx))->radeon.radeonScreen->drmSupportsPointSprites && \
+			      !(ctx->_TriangleCaps & DD_POINT_SMOOTH)) ? \
 				R200_VF_PRIM_POINT_SPRITES : R200_VF_PRIM_POINTS)
 #define HW_LINES            R200_VF_PRIM_LINES
 #define HW_LINE_LOOP        0
diff --git a/src/mesa/drivers/dri/r200/r200_tex.c b/src/mesa/drivers/dri/r200/r200_tex.c
index 5207c2901a3..064324731b5 100644
--- a/src/mesa/drivers/dri/r200/r200_tex.c
+++ b/src/mesa/drivers/dri/r200/r200_tex.c
@@ -537,6 +537,10 @@ void r200InitTextureFuncs( radeonContextPtr radeon, struct dd_function_table *fu
    functions->MapTexture = radeonMapTexture;
    functions->UnmapTexture = radeonUnmapTexture;
 
+#if FEATURE_OES_EGL_image
+   functions->EGLImageTargetTexture2D = radeon_image_target_texture_2d;
+#endif
+
    driInitTextureFormats();
 
 }
diff --git a/src/mesa/drivers/dri/r300/compiler/radeon_dataflow.c b/src/mesa/drivers/dri/r300/compiler/radeon_dataflow.c
index 5927498818b..fd94194dc34 100644
--- a/src/mesa/drivers/dri/r300/compiler/radeon_dataflow.c
+++ b/src/mesa/drivers/dri/r300/compiler/radeon_dataflow.c
@@ -465,15 +465,16 @@ static void get_readers_normal_read_callback(
 {
 	struct get_readers_callback_data * d = userdata;
 	unsigned int read_mask;
+	unsigned int shared_mask;
 
 	if (src->RelAddr)
 		d->ReaderData->Abort = 1;
 
-	unsigned int shared_mask = rc_src_reads_dst_mask(src->File, src->Index,
-				src->Swizzle,
-				d->ReaderData->Writer->U.I.DstReg.File,
-				d->ReaderData->Writer->U.I.DstReg.Index,
-				d->AliveWriteMask);
+	shared_mask = rc_src_reads_dst_mask(src->File, src->Index,
+		src->Swizzle,
+		d->ReaderData->Writer->U.I.DstReg.File,
+		d->ReaderData->Writer->U.I.DstReg.Index,
+		d->AliveWriteMask);
 
 	if (shared_mask == RC_MASK_NONE)
 		return;
@@ -624,6 +625,9 @@ void  rc_get_readers_normal(
 			data->Abort = 1;
 			return;
 		case RC_OPCODE_IF:
+			/* XXX We can do better here, but this will have to
+			 * do until this dataflow analysis is more mature. */
+			data->Abort = 1;
 			branch_depth++;
 			break;
 		case RC_OPCODE_ELSE:
diff --git a/src/mesa/drivers/dri/r300/compiler/radeon_optimize.c b/src/mesa/drivers/dri/r300/compiler/radeon_optimize.c
index 5556927357b..15b9c5e7dc3 100644
--- a/src/mesa/drivers/dri/r300/compiler/radeon_optimize.c
+++ b/src/mesa/drivers/dri/r300/compiler/radeon_optimize.c
@@ -32,9 +32,11 @@
 #include "radeon_compiler_util.h"
 #include "radeon_swizzle.h"
 
-struct src_clobbered_data {
-	unsigned int NumSrcRegs;
-	unsigned int SrcMasks[3];
+struct src_clobbered_reads_cb_data {
+	rc_register_file File;
+	unsigned int Index;
+	unsigned int Mask;
+	struct rc_reader_data * ReaderData;
 };
 
 typedef void (*rc_presub_replace_fn)(struct rc_instruction *,
@@ -99,6 +101,25 @@ static void copy_propagate_scan_read(void * data, struct rc_instruction * inst,
 	}
 }
 
+static void src_clobbered_reads_cb(
+	void * data,
+	struct rc_instruction * inst,
+	struct rc_src_register * src)
+{
+	struct src_clobbered_reads_cb_data * sc_data = data;
+	/* Does the write described by sc_data touch what this source reads? */
+	if (src->File == sc_data->File
+	    && src->Index == sc_data->Index
+	    && (rc_swizzle_to_writemask(src->Swizzle) & sc_data->Mask)) {
+
+		sc_data->ReaderData->AbortOnRead = 1;
+	}
+	/* Writes to the address file clobber relatively addressed sources. */
+	if (src->RelAddr && sc_data->File == RC_FILE_ADDRESS) {
+		sc_data->ReaderData->AbortOnRead = 1;
+	}
+}
+
 static void is_src_clobbered_scan_write(
 	void * data,
 	struct rc_instruction * inst,
@@ -106,29 +127,19 @@ static void is_src_clobbered_scan_write(
 	unsigned int index,
 	unsigned int mask)
 {
-	unsigned int i;
+	struct src_clobbered_reads_cb_data sc_data;
 	struct rc_reader_data * reader_data = data;
-	struct src_clobbered_data * d = reader_data->CbData;
-	for (i = 0; i < d->NumSrcRegs; i++) {
-		if (file == reader_data->Writer->U.I.SrcReg[i].File
-			&& index == reader_data->Writer->U.I.SrcReg[i].Index
-			&& (mask & d->SrcMasks[i])){
-
-			reader_data->AbortOnRead = 1;
-			return;
-		}
-		if (reader_data->Writer->U.I.SrcReg[i].RelAddr &&
-						file == RC_FILE_ADDRESS) {
-			reader_data->AbortOnRead = 1;
-			return;
-		}
-	}
+	sc_data.File = file;
+	sc_data.Index = index;
+	sc_data.Mask = mask;
+	sc_data.ReaderData = reader_data;
+	rc_for_all_reads_src(reader_data->Writer,
+					src_clobbered_reads_cb, &sc_data);
 }
 
 static void copy_propagate(struct radeon_compiler * c, struct rc_instruction * inst_mov)
 {
 	struct rc_reader_data reader_data;
-	struct src_clobbered_data sc_data;
 	unsigned int i;
 
 	if (inst_mov->U.I.DstReg.File != RC_FILE_TEMPORARY ||
@@ -137,12 +148,6 @@ static void copy_propagate(struct radeon_compiler * c, struct rc_instruction * i
 	    inst_mov->U.I.SaturateMode)
 		return;
 
-	sc_data.NumSrcRegs = 1;
-	sc_data.SrcMasks[0] = rc_swizzle_to_writemask(
-					inst_mov->U.I.SrcReg[0].Swizzle);
-
-	reader_data.CbData = &sc_data;
-
 	/* Get a list of all the readers of this MOV instruction. */
 	rc_get_readers_normal(c, inst_mov, &reader_data,
 			copy_propagate_scan_read, is_src_clobbered_scan_write);
@@ -203,8 +208,8 @@ static int is_src_uniform_constant(struct rc_src_register src,
 
 static void constant_folding_mad(struct rc_instruction * inst)
 {
-	rc_swizzle swz;
-	unsigned int negate;
+	rc_swizzle swz = 0;
+	unsigned int negate = 0;
 
 	if (is_src_uniform_constant(inst->U.I.SrcReg[2], &swz, &negate)) {
 		if (swz == RC_SWIZZLE_ZERO) {
@@ -244,8 +249,8 @@ static void constant_folding_mad(struct rc_instruction * inst)
 
 static void constant_folding_mul(struct rc_instruction * inst)
 {
-	rc_swizzle swz;
-	unsigned int negate;
+	rc_swizzle swz = 0;
+	unsigned int negate = 0;
 
 	if (is_src_uniform_constant(inst->U.I.SrcReg[0], &swz, &negate)) {
 		if (swz == RC_SWIZZLE_ONE) {
@@ -277,8 +282,8 @@ static void constant_folding_mul(struct rc_instruction * inst)
 
 static void constant_folding_add(struct rc_instruction * inst)
 {
-	rc_swizzle swz;
-	unsigned int negate;
+	rc_swizzle swz = 0;
+	unsigned int negate = 0;
 
 	if (is_src_uniform_constant(inst->U.I.SrcReg[0], &swz, &negate)) {
 		if (swz == RC_SWIZZLE_ZERO) {
@@ -448,15 +453,8 @@ static int presub_helper(
 	rc_presub_replace_fn presub_replace)
 {
 	struct rc_reader_data reader_data;
-	struct src_clobbered_data sc_data;
 	unsigned int i;
 
-	sc_data.NumSrcRegs = 2;
-	sc_data.SrcMasks[0] = rc_swizzle_to_writemask(
-					inst_add->U.I.SrcReg[0].Swizzle);
-	sc_data.SrcMasks[1] = rc_swizzle_to_writemask(
-					inst_add->U.I.SrcReg[1].Swizzle);
-	reader_data.CbData = &sc_data;
 	rc_get_readers_normal(c, inst_add, &reader_data, presub_scan_read,
 						is_src_clobbered_scan_write);
 
diff --git a/src/mesa/drivers/dri/r300/compiler/radeon_pair_schedule.c b/src/mesa/drivers/dri/r300/compiler/radeon_pair_schedule.c
index d4a38607d9e..553e9dcf7c1 100644
--- a/src/mesa/drivers/dri/r300/compiler/radeon_pair_schedule.c
+++ b/src/mesa/drivers/dri/r300/compiler/radeon_pair_schedule.c
@@ -290,6 +290,7 @@ static int merge_presub_sources(
 {
 	unsigned int srcp_src, srcp_regs, is_rgb, is_alpha;
 	struct rc_pair_sub_instruction * dst_sub;
+	const struct rc_opcode_info * info;
 
 	assert(dst_full->Alpha.Opcode == RC_OPCODE_NOP);
 
@@ -309,8 +310,8 @@ static int merge_presub_sources(
 		return 0;
 	}
 
-	const struct rc_opcode_info * info =
-					rc_get_opcode_info(dst_full->RGB.Opcode);
+	info = rc_get_opcode_info(dst_full->RGB.Opcode);
+
 	if (dst_sub->Src[RC_PAIR_PRESUB_SRC].Used)
 		return 0;
 
diff --git a/src/mesa/drivers/dri/r300/r300_context.c b/src/mesa/drivers/dri/r300/r300_context.c
index 9fbd36bfe63..c288834d243 100644
--- a/src/mesa/drivers/dri/r300/r300_context.c
+++ b/src/mesa/drivers/dri/r300/r300_context.c
@@ -86,6 +86,7 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #define need_GL_EXT_stencil_two_side
 #define need_GL_ATI_separate_stencil
 #define need_GL_NV_vertex_program
+#define need_GL_OES_EGL_image
 
 #include "main/remap_helper.h"
 
@@ -134,6 +135,9 @@ static const struct dri_extension card_extensions[] = {
   {"GL_MESAX_texture_float",		NULL},
   {"GL_NV_blend_square",		NULL},
   {"GL_NV_vertex_program",		GL_NV_vertex_program_functions},
+#if FEATURE_OES_EGL_image
+  {"GL_OES_EGL_image",                  GL_OES_EGL_image_functions },
+#endif
   {NULL,				NULL}
   /* *INDENT-ON* */
 };
diff --git a/src/mesa/drivers/dri/r300/r300_draw.c b/src/mesa/drivers/dri/r300/r300_draw.c
index 81769e1ee5f..0c4d8537c61 100644
--- a/src/mesa/drivers/dri/r300/r300_draw.c
+++ b/src/mesa/drivers/dri/r300/r300_draw.c
@@ -717,6 +717,10 @@ static void r300DrawPrims(struct gl_context *ctx,
 			 GLuint max_index)
 {
 	GLboolean retval;
+	struct r300_context *r300 = R300_CONTEXT(ctx);
+	radeonContextPtr radeon = &r300->radeon;
+
+	radeon_prepare_render(radeon);
 
 	/* This check should get folded into just the places that
 	 * min/max index are really needed.
diff --git a/src/mesa/drivers/dri/r300/r300_render.c b/src/mesa/drivers/dri/r300/r300_render.c
index 821318e7a59..44090ec2894 100644
--- a/src/mesa/drivers/dri/r300/r300_render.c
+++ b/src/mesa/drivers/dri/r300/r300_render.c
@@ -327,8 +327,6 @@ void r300RunRenderPrimitive(struct gl_context * ctx, int start, int end, int pri
 	BATCH_LOCALS(&rmesa->radeon);
 	int type, num_verts;
 
-	radeon_prepare_render(&rmesa->radeon);
-
 	type = r300PrimitiveType(rmesa, prim);
 	num_verts = r300NumVerts(rmesa, end - start, prim);
 
diff --git a/src/mesa/drivers/dri/r300/r300_tex.c b/src/mesa/drivers/dri/r300/r300_tex.c
index a6bda0e4990..de662939992 100644
--- a/src/mesa/drivers/dri/r300/r300_tex.c
+++ b/src/mesa/drivers/dri/r300/r300_tex.c
@@ -382,5 +382,9 @@ void r300InitTextureFuncs(radeonContextPtr radeon, struct dd_function_table *fun
 
 	functions->GenerateMipmap = radeonGenerateMipmap;
 
+#if FEATURE_OES_EGL_image
+	functions->EGLImageTargetTexture2D = radeon_image_target_texture_2d;
+#endif
+
 	driInitTextureFormats();
 }
diff --git a/src/mesa/drivers/dri/r600/r600_context.c b/src/mesa/drivers/dri/r600/r600_context.c
index c882a9cce9e..b6443bf0c53 100644
--- a/src/mesa/drivers/dri/r600/r600_context.c
+++ b/src/mesa/drivers/dri/r600/r600_context.c
@@ -94,6 +94,7 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #define need_GL_EXT_stencil_two_side
 #define need_GL_ATI_separate_stencil
 #define need_GL_NV_vertex_program
+#define need_GL_OES_EGL_image
 
 #include "main/remap_helper.h"
 
@@ -148,6 +149,9 @@ static const struct dri_extension card_extensions[] = {
   {"GL_NV_vertex_program",		GL_NV_vertex_program_functions},
   {"GL_ARB_pixel_buffer_object",        NULL},
   {"GL_ARB_draw_elements_base_vertex",	GL_ARB_draw_elements_base_vertex_functions },
+#if FEATURE_OES_EGL_image
+  {"GL_OES_EGL_image",			GL_OES_EGL_image_functions},
+#endif
   {NULL,				NULL}
   /* *INDENT-ON* */
 };
diff --git a/src/mesa/drivers/dri/r600/r600_tex.c b/src/mesa/drivers/dri/r600/r600_tex.c
index d6a58f410cc..c3d68c41e57 100644
--- a/src/mesa/drivers/dri/r600/r600_tex.c
+++ b/src/mesa/drivers/dri/r600/r600_tex.c
@@ -475,5 +475,9 @@ void r600InitTextureFuncs(radeonContextPtr radeon, struct dd_function_table *fun
 
 	functions->GenerateMipmap = radeonGenerateMipmap;
 
+#if FEATURE_OES_EGL_image
+	functions->EGLImageTargetTexture2D = radeon_image_target_texture_2d;
+#endif
+
 	driInitTextureFormats();
 }
diff --git a/src/mesa/drivers/dri/radeon/radeon_common.c b/src/mesa/drivers/dri/radeon/radeon_common.c
index 43a6355ad8b..7361adffcf7 100644
--- a/src/mesa/drivers/dri/radeon/radeon_common.c
+++ b/src/mesa/drivers/dri/radeon/radeon_common.c
@@ -171,6 +171,13 @@ void radeonSetCliprects(radeonContextPtr radeon)
 {
 	__DRIdrawable *const drawable = radeon_get_drawable(radeon);
 	__DRIdrawable *const readable = radeon_get_readable(radeon);
+
+	/* Both drawables are dereferenced unconditionally right below, so
+	 * there is nothing we can do unless both of them are present (a
+	 * context may be current without any drawable). */
+	if (drawable == NULL || readable == NULL)
+		return;
+
 	struct radeon_framebuffer *const draw_rfb = drawable->driverPrivate;
 	struct radeon_framebuffer *const read_rfb = readable->driverPrivate;
 	int x_off, y_off;
diff --git a/src/mesa/drivers/dri/radeon/radeon_common_context.c b/src/mesa/drivers/dri/radeon/radeon_common_context.c
index 40544860b3b..a436ec112cc 100644
--- a/src/mesa/drivers/dri/radeon/radeon_common_context.c
+++ b/src/mesa/drivers/dri/radeon/radeon_common_context.c
@@ -39,6 +39,7 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "drirenderbuffer.h"
 #include "drivers/common/meta.h"
 #include "main/context.h"
+#include "main/framebuffer.h"
 #include "main/renderbuffer.h"
 #include "main/state.h"
 #include "main/simple_list.h"
@@ -251,9 +252,9 @@ GLboolean radeonInitContext(radeonContextPtr radeon,
 			radeon->texture_rect_row_align = 512;
 			radeon->texture_compressed_row_align = 512;
 		} else {
-			radeon->texture_row_align = 256;
-			radeon->texture_rect_row_align = 256;
-			radeon->texture_compressed_row_align = 256;
+			radeon->texture_row_align = radeon->radeonScreen->group_bytes;
+			radeon->texture_rect_row_align = radeon->radeonScreen->group_bytes;
+			radeon->texture_compressed_row_align = radeon->radeonScreen->group_bytes;
 		}
 	} else if (IS_R200_CLASS(radeon->radeonScreen) ||
 		   IS_R100_CLASS(radeon->radeonScreen)) {
@@ -379,12 +380,12 @@ GLboolean radeonUnbindContext(__DRIcontext * driContextPriv)
 
 static void
 radeon_make_kernel_renderbuffer_current(radeonContextPtr radeon,
-					struct radeon_framebuffer *draw)
+					struct gl_framebuffer *draw)
 {
 	/* if radeon->fake */
 	struct radeon_renderbuffer *rb;
 
-	if ((rb = (void *)draw->base.Attachment[BUFFER_FRONT_LEFT].Renderbuffer)) {
+	if ((rb = (void *)draw->Attachment[BUFFER_FRONT_LEFT].Renderbuffer)) {
 		if (!rb->bo) {
 			rb->bo = radeon_bo_open(radeon->radeonScreen->bom,
 						radeon->radeonScreen->frontOffset,
@@ -396,7 +397,7 @@ radeon_make_kernel_renderbuffer_current(radeonContextPtr radeon,
 		rb->cpp = radeon->radeonScreen->cpp;
 		rb->pitch = radeon->radeonScreen->frontPitch * rb->cpp;
 	}
-	if ((rb = (void *)draw->base.Attachment[BUFFER_BACK_LEFT].Renderbuffer)) {
+	if ((rb = (void *)draw->Attachment[BUFFER_BACK_LEFT].Renderbuffer)) {
 		if (!rb->bo) {
 			rb->bo = radeon_bo_open(radeon->radeonScreen->bom,
 						radeon->radeonScreen->backOffset,
@@ -408,7 +409,7 @@ radeon_make_kernel_renderbuffer_current(radeonContextPtr radeon,
 		rb->cpp = radeon->radeonScreen->cpp;
 		rb->pitch = radeon->radeonScreen->backPitch * rb->cpp;
 	}
-	if ((rb = (void *)draw->base.Attachment[BUFFER_DEPTH].Renderbuffer)) {
+	if ((rb = (void *)draw->Attachment[BUFFER_DEPTH].Renderbuffer)) {
 		if (!rb->bo) {
 			rb->bo = radeon_bo_open(radeon->radeonScreen->bom,
 						radeon->radeonScreen->depthOffset,
@@ -420,7 +421,7 @@ radeon_make_kernel_renderbuffer_current(radeonContextPtr radeon,
 		rb->cpp = radeon->radeonScreen->cpp;
 		rb->pitch = radeon->radeonScreen->depthPitch * rb->cpp;
 	}
-	if ((rb = (void *)draw->base.Attachment[BUFFER_STENCIL].Renderbuffer)) {
+	if ((rb = (void *)draw->Attachment[BUFFER_STENCIL].Renderbuffer)) {
 		if (!rb->bo) {
 			rb->bo = radeon_bo_open(radeon->radeonScreen->bom,
 						radeon->radeonScreen->depthOffset,
@@ -436,7 +437,7 @@ radeon_make_kernel_renderbuffer_current(radeonContextPtr radeon,
 
 static void
 radeon_make_renderbuffer_current(radeonContextPtr radeon,
-				 struct radeon_framebuffer *draw)
+				 struct gl_framebuffer *draw)
 {
 	int size = 4096*4096*4;
 	/* if radeon->fake */
@@ -448,7 +449,7 @@ radeon_make_renderbuffer_current(radeonContextPtr radeon,
 	}
 
 
-	if ((rb = (void *)draw->base.Attachment[BUFFER_FRONT_LEFT].Renderbuffer)) {
+	if ((rb = (void *)draw->Attachment[BUFFER_FRONT_LEFT].Renderbuffer)) {
 		if (!rb->bo) {
 			rb->bo = radeon_bo_open(radeon->radeonScreen->bom,
 						radeon->radeonScreen->frontOffset +
@@ -461,7 +462,7 @@ radeon_make_renderbuffer_current(radeonContextPtr radeon,
 		rb->cpp = radeon->radeonScreen->cpp;
 		rb->pitch = radeon->radeonScreen->frontPitch * rb->cpp;
 	}
-	if ((rb = (void *)draw->base.Attachment[BUFFER_BACK_LEFT].Renderbuffer)) {
+	if ((rb = (void *)draw->Attachment[BUFFER_BACK_LEFT].Renderbuffer)) {
 		if (!rb->bo) {
 			rb->bo = radeon_bo_open(radeon->radeonScreen->bom,
 						radeon->radeonScreen->backOffset +
@@ -474,7 +475,7 @@ radeon_make_renderbuffer_current(radeonContextPtr radeon,
 		rb->cpp = radeon->radeonScreen->cpp;
 		rb->pitch = radeon->radeonScreen->backPitch * rb->cpp;
 	}
-	if ((rb = (void *)draw->base.Attachment[BUFFER_DEPTH].Renderbuffer)) {
+	if ((rb = (void *)draw->Attachment[BUFFER_DEPTH].Renderbuffer)) {
 		if (!rb->bo) {
 			rb->bo = radeon_bo_open(radeon->radeonScreen->bom,
 						radeon->radeonScreen->depthOffset +
@@ -487,7 +488,7 @@ radeon_make_renderbuffer_current(radeonContextPtr radeon,
 		rb->cpp = radeon->radeonScreen->cpp;
 		rb->pitch = radeon->radeonScreen->depthPitch * rb->cpp;
 	}
-	if ((rb = (void *)draw->base.Attachment[BUFFER_STENCIL].Renderbuffer)) {
+	if ((rb = (void *)draw->Attachment[BUFFER_STENCIL].Renderbuffer)) {
 		if (!rb->bo) {
 			rb->bo = radeon_bo_open(radeon->radeonScreen->bom,
 						radeon->radeonScreen->depthOffset +
@@ -793,8 +794,8 @@ GLboolean radeonMakeCurrent(__DRIcontext * driContextPriv,
 			    __DRIdrawable * driReadPriv)
 {
 	radeonContextPtr radeon;
-	struct radeon_framebuffer *drfb;
-	struct gl_framebuffer *readfb;
+	struct radeon_framebuffer *rdrfb;
+	struct gl_framebuffer *drfb, *readfb;
 
 	if (!driContextPriv) {
 		if (RADEON_DEBUG & RADEON_DRI)
@@ -804,17 +805,25 @@ GLboolean radeonMakeCurrent(__DRIcontext * driContextPriv,
 	}
 
 	radeon = (radeonContextPtr) driContextPriv->driverPrivate;
-	drfb = driDrawPriv->driverPrivate;
-	readfb = driReadPriv->driverPrivate;
+
+	if(driDrawPriv == NULL && driReadPriv == NULL) {
+		drfb = _mesa_create_framebuffer(&radeon->glCtx->Visual);
+		readfb = drfb;
+	}
+	else {
+		drfb = driDrawPriv->driverPrivate;
+		readfb = driReadPriv->driverPrivate;
+	}
 
 	if (driContextPriv->driScreenPriv->dri2.enabled) {
-		radeon_update_renderbuffers(driContextPriv, driDrawPriv, GL_FALSE);
+		if(driDrawPriv)
+			radeon_update_renderbuffers(driContextPriv, driDrawPriv, GL_FALSE);
 		if (driDrawPriv != driReadPriv)
 			radeon_update_renderbuffers(driContextPriv, driReadPriv, GL_FALSE);
 		_mesa_reference_renderbuffer(&radeon->state.color.rb,
-			&(radeon_get_renderbuffer(&drfb->base, BUFFER_BACK_LEFT)->base));
+			&(radeon_get_renderbuffer(drfb, BUFFER_BACK_LEFT)->base));
 		_mesa_reference_renderbuffer(&radeon->state.depth.rb,
-			&(radeon_get_renderbuffer(&drfb->base, BUFFER_DEPTH)->base));
+			&(radeon_get_renderbuffer(drfb, BUFFER_DEPTH)->base));
 	} else {
 		radeon_make_renderbuffer_current(radeon, drfb);
 	}
@@ -822,35 +831,40 @@ GLboolean radeonMakeCurrent(__DRIcontext * driContextPriv,
 	if (RADEON_DEBUG & RADEON_DRI)
 	     fprintf(stderr, "%s ctx %p dfb %p rfb %p\n", __FUNCTION__, radeon->glCtx, drfb, readfb);
 
-	driUpdateFramebufferSize(radeon->glCtx, driDrawPriv);
+	if(driDrawPriv)
+		driUpdateFramebufferSize(radeon->glCtx, driDrawPriv);
 	if (driReadPriv != driDrawPriv)
 		driUpdateFramebufferSize(radeon->glCtx, driReadPriv);
 
-	_mesa_make_current(radeon->glCtx, &drfb->base, readfb);
+	_mesa_make_current(radeon->glCtx, drfb, readfb);
+	if (driDrawPriv == NULL && driReadPriv == NULL)
+		_mesa_reference_framebuffer(&drfb, NULL);
 
 	_mesa_update_state(radeon->glCtx);
 
-	if (radeon->glCtx->DrawBuffer == &drfb->base) {
-		if (driDrawPriv->swap_interval == (unsigned)-1) {
-			int i;
-			driDrawPriv->vblFlags =
-				(radeon->radeonScreen->irq != 0)
-				? driGetDefaultVBlankFlags(&radeon->
-							   optionCache)
-				: VBLANK_FLAG_NO_IRQ;
-
-			driDrawableInitVBlank(driDrawPriv);
-			drfb->vbl_waited = driDrawPriv->vblSeq;
-
-			for (i = 0; i < 2; i++) {
-				if (drfb->color_rb[i])
-					drfb->color_rb[i]->vbl_pending = driDrawPriv->vblSeq;
+	if (radeon->glCtx->DrawBuffer == drfb) {
+		if(driDrawPriv != NULL) {
+			rdrfb = (struct radeon_framebuffer *)drfb;
+			if (driDrawPriv->swap_interval == (unsigned)-1) {
+				int i;
+				driDrawPriv->vblFlags =
+					(radeon->radeonScreen->irq != 0)
+					? driGetDefaultVBlankFlags(&radeon->
+								   optionCache)
+					: VBLANK_FLAG_NO_IRQ;
+
+				driDrawableInitVBlank(driDrawPriv);
+				rdrfb->vbl_waited = driDrawPriv->vblSeq;
+
+				for (i = 0; i < 2; i++) {
+					if (rdrfb->color_rb[i])
+						rdrfb->color_rb[i]->vbl_pending = driDrawPriv->vblSeq;
+				}
 			}
-
+			radeon_window_moved(radeon);
 		}
 
-		radeon_window_moved(radeon);
-		radeon_draw_buffer(radeon->glCtx, &drfb->base);
+		radeon_draw_buffer(radeon->glCtx, drfb);
 	}
 
 
diff --git a/src/mesa/drivers/dri/radeon/radeon_context.c b/src/mesa/drivers/dri/radeon/radeon_context.c
index cc9590213c4..e3de534b5f7 100644
--- a/src/mesa/drivers/dri/radeon/radeon_context.c
+++ b/src/mesa/drivers/dri/radeon/radeon_context.c
@@ -66,6 +66,7 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #define need_GL_EXT_fog_coord
 #define need_GL_EXT_secondary_color
 #define need_GL_EXT_framebuffer_object
+#define need_GL_OES_EGL_image
 #include "main/remap_helper.h"
 
 #define DRIVER_DATE	"20061018"
@@ -101,6 +102,9 @@ static const struct dri_extension card_extensions[] =
     { "GL_ATI_texture_mirror_once",        NULL },
     { "GL_MESA_ycbcr_texture",             NULL },
     { "GL_NV_blend_square",                NULL },
+#if FEATURE_OES_EGL_image
+    { "GL_OES_EGL_image",                  GL_OES_EGL_image_functions },
+#endif
     { NULL,                                NULL }
 };
 
diff --git a/src/mesa/drivers/dri/radeon/radeon_fbo.c b/src/mesa/drivers/dri/radeon/radeon_fbo.c
index 2a6fbaeaf09..a36a1dc94ac 100644
--- a/src/mesa/drivers/dri/radeon/radeon_fbo.c
+++ b/src/mesa/drivers/dri/radeon/radeon_fbo.c
@@ -199,6 +199,48 @@ radeon_alloc_renderbuffer_storage(struct gl_context * ctx, struct gl_renderbuffe
    
 }
 
+#if FEATURE_OES_EGL_image
+/* Make the buffer object behind an EGLImage the storage of renderbuffer rb. */
+static void
+radeon_image_target_renderbuffer_storage(struct gl_context *ctx,
+                                         struct gl_renderbuffer *rb,
+                                         void *image_handle)
+{
+   radeonContextPtr radeon = RADEON_CONTEXT(ctx);
+   struct radeon_renderbuffer *rrb;
+   __DRIscreen *screen;
+   __DRIimage *image;
+
+   screen = radeon->radeonScreen->driScreen;
+   image = screen->dri2.image->lookupEGLImage(screen, image_handle,
+					      screen->loaderPrivate);
+   if (image == NULL)
+      return;
+
+   rrb = radeon_renderbuffer(rb);
+
+   if (ctx->Driver.Flush)
+      ctx->Driver.Flush(ctx); /* +r6/r7 */
+
+   /* Reference the new bo before releasing the old one, so that re-binding
+    * the bo that is already attached can never drop its last reference. */
+   radeon_bo_ref(image->bo);
+   if (rrb->bo)
+      radeon_bo_unref(rrb->bo);
+   rrb->bo = image->bo;
+
+   rrb->cpp = image->cpp;
+   rrb->pitch = image->pitch * image->cpp;
+
+   rb->Format = image->format;
+   rb->InternalFormat = image->internal_format;
+   rb->Width = image->width;
+   rb->Height = image->height;
+   rb->DataType = image->data_type;
+   rb->_BaseFormat = _mesa_base_fbo_format(radeon->glCtx,
+                                           image->internal_format);
+}
+#endif
 
 /**
  * Called for each hardware renderbuffer when a _window_ is resized.
@@ -622,6 +664,10 @@ void radeon_fbo_init(struct radeon_context *radeon)
 #if FEATURE_EXT_framebuffer_blit
   radeon->glCtx->Driver.BlitFramebuffer = _mesa_meta_BlitFramebuffer;
 #endif
+#if FEATURE_OES_EGL_image
+  radeon->glCtx->Driver.EGLImageTargetRenderbufferStorage =
+	  radeon_image_target_renderbuffer_storage;
+#endif
 }
 
   
diff --git a/src/mesa/drivers/dri/radeon/radeon_screen.c b/src/mesa/drivers/dri/radeon/radeon_screen.c
index 43ebc810939..1ea52f96d7e 100644
--- a/src/mesa/drivers/dri/radeon/radeon_screen.c
+++ b/src/mesa/drivers/dri/radeon/radeon_screen.c
@@ -41,12 +41,14 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "main/mtypes.h"
 #include "main/framebuffer.h"
 #include "main/renderbuffer.h"
+#include "main/fbobject.h"
 
 #define STANDALONE_MMIO
 #include "radeon_chipset.h"
 #include "radeon_macros.h"
 #include "radeon_screen.h"
 #include "radeon_common.h"
+#include "radeon_common_context.h"
 #if defined(RADEON_R100)
 #include "radeon_context.h"
 #include "radeon_tex.h"
@@ -398,6 +400,188 @@ static const struct __DRI2flushExtensionRec radeonFlushExtension = {
     dri2InvalidateDrawable,
 };
 
+/* Create a __DRIimage for the buffer object opened from name; NULL on failure. */
+static __DRIimage *
+radeon_create_image_from_name(__DRIcontext *context,
+                              int width, int height, int format,
+                              int name, int pitch, void *loaderPrivate)
+{
+   __DRIimage *image;
+   radeonContextPtr radeon = context->driverPrivate;
+
+   if (name == 0)
+      return NULL;
+
+   image = CALLOC(sizeof *image);
+   if (image == NULL)
+      return NULL;
+
+   switch (format) {
+   case __DRI_IMAGE_FORMAT_RGB565:
+      image->format = MESA_FORMAT_RGB565;
+      image->internal_format = GL_RGB;
+      image->data_type = GL_UNSIGNED_BYTE;
+      break;
+   case __DRI_IMAGE_FORMAT_XRGB8888:
+      image->format = MESA_FORMAT_XRGB8888;
+      image->internal_format = GL_RGB;
+      image->data_type = GL_UNSIGNED_BYTE;
+      break;
+   case __DRI_IMAGE_FORMAT_ARGB8888:
+      image->format = MESA_FORMAT_ARGB8888;
+      image->internal_format = GL_RGBA;
+      image->data_type = GL_UNSIGNED_BYTE;
+      break;
+   default:
+      FREE(image); /* unsupported format; FREE pairs with CALLOC above */
+      return NULL;
+   }
+
+   image->data = loaderPrivate;
+   image->cpp = _mesa_get_format_bytes(image->format);
+   image->width = width;
+   image->pitch = pitch;
+   image->height = height;
+
+   image->bo = radeon_bo_open(radeon->radeonScreen->bom,
+                              (uint32_t)name,
+                              image->pitch * image->height * image->cpp,
+                              0,
+                              RADEON_GEM_DOMAIN_VRAM,
+                              0);
+   if (image->bo == NULL) {
+      FREE(image);
+      return NULL;
+   }
+
+   return image;
+}
+
+/* Create a __DRIimage sharing the bo of a renderbuffer; NULL on failure. */
+static __DRIimage *
+radeon_create_image_from_renderbuffer(__DRIcontext *context,
+                                      int renderbuffer, void *loaderPrivate)
+{
+   __DRIimage *image;
+   radeonContextPtr radeon = context->driverPrivate;
+   struct gl_renderbuffer *rb;
+   struct radeon_renderbuffer *rrb;
+
+   rb = _mesa_lookup_renderbuffer(radeon->glCtx, renderbuffer);
+   rrb = rb ? radeon_renderbuffer(rb) : NULL;
+   /* no bo means nothing to share; cpp == 0 would divide by zero below */
+   if (rrb == NULL || rrb->bo == NULL || rrb->cpp == 0) {
+      _mesa_error(radeon->glCtx,
+                  GL_INVALID_OPERATION, "glRenderbufferExternalMESA");
+      return NULL;
+   }
+
+   image = CALLOC(sizeof *image);
+   if (image == NULL)
+      return NULL;
+
+   image->internal_format = rb->InternalFormat;
+   image->format = rb->Format;
+   image->cpp = rrb->cpp;
+   image->data_type = rb->DataType;
+   image->data = loaderPrivate;
+   radeon_bo_ref(rrb->bo);
+   image->bo = rrb->bo;
+   image->width = rb->Width;
+   image->height = rb->Height;
+   image->pitch = rrb->pitch / image->cpp;
+   return image;
+}
+
+static void
+radeon_destroy_image(__DRIimage *image)
+{
+   radeon_bo_unref(image->bo); /* release the bo reference stored in the image */
+   FREE(image); /* then free the __DRIimage record itself */
+}
+
+/* Open a new VRAM bo and wrap it in a __DRIimage (use is ignored); NULL on failure. */
+static __DRIimage *
+radeon_create_image(__DRIscreen *screen,
+                    int width, int height, int format,
+                    unsigned int use,
+                    void *loaderPrivate)
+{
+   __DRIimage *image;
+   radeonScreenPtr radeonScreen = screen->private;
+
+   image = CALLOC(sizeof *image);
+   if (image == NULL)
+      return NULL;
+
+   switch (format) {
+   case __DRI_IMAGE_FORMAT_RGB565:
+      image->format = MESA_FORMAT_RGB565;
+      image->internal_format = GL_RGB;
+      image->data_type = GL_UNSIGNED_BYTE;
+      break;
+   case __DRI_IMAGE_FORMAT_XRGB8888:
+      image->format = MESA_FORMAT_XRGB8888;
+      image->internal_format = GL_RGB;
+      image->data_type = GL_UNSIGNED_BYTE;
+      break;
+   case __DRI_IMAGE_FORMAT_ARGB8888:
+      image->format = MESA_FORMAT_ARGB8888;
+      image->internal_format = GL_RGBA;
+      image->data_type = GL_UNSIGNED_BYTE;
+      break;
+   default:
+      FREE(image); /* unsupported format; FREE pairs with CALLOC above */
+      return NULL;
+   }
+
+   image->data = loaderPrivate;
+   image->cpp = _mesa_get_format_bytes(image->format);
+   image->width = width;
+   image->height = height;
+   image->pitch = ((image->cpp * image->width + 255) & ~255) / image->cpp;
+
+   image->bo = radeon_bo_open(radeonScreen->bom,
+                              0,
+                              image->pitch * image->height * image->cpp,
+                              0,
+                              RADEON_GEM_DOMAIN_VRAM,
+                              0);
+   if (image->bo == NULL) {
+      FREE(image);
+      return NULL;
+   }
+
+   return image;
+}
+
+/* Query an image attribute; GL_FALSE for unknown attribs or if naming fails. */
+static GLboolean
+radeon_query_image(__DRIimage *image, int attrib, int *value)
+{
+   switch (attrib) {
+   case __DRI_IMAGE_ATTRIB_STRIDE:
+      *value = image->pitch * image->cpp;
+      return GL_TRUE;
+   case __DRI_IMAGE_ATTRIB_HANDLE:
+      *value = image->bo->handle;
+      return GL_TRUE;
+   case __DRI_IMAGE_ATTRIB_NAME: /* the libdrm helper returns 0 on success */
+      return radeon_gem_get_kernel_name(image->bo, (uint32_t *) value) == 0;
+   default:
+      return GL_FALSE;
+   }
+}
+
+static const struct __DRIimageExtensionRec radeonImageExtension = {
+   { __DRI_IMAGE, __DRI_IMAGE_VERSION }, /* base: extension name and version */
+   radeon_create_image_from_name,
+   radeon_create_image_from_renderbuffer,
+   radeon_destroy_image,
+   radeon_create_image,
+   radeon_query_image
+};
+
 static int radeon_set_screen_flags(radeonScreenPtr screen, int device_id)
 {
    screen->device_id = device_id;
@@ -1138,6 +1322,8 @@ radeonCreateScreen( __DRIscreen *sPriv )
    else
 	   screen->chip_flags |= RADEON_CLASS_R600;
 
+   /* set group bytes for r6xx+ */
+   screen->group_bytes = 256;
    screen->cpp = dri_priv->bpp / 8;
    screen->AGPMode = dri_priv->AGPMode;
 
@@ -1382,7 +1568,8 @@ radeonCreateScreen2(__DRIscreen *sPriv)
    else
 	   screen->chip_flags |= RADEON_CLASS_R600;
 
-   /* r6xx+ tiling */
+   /* r6xx+ tiling, default to 256 group bytes */
+   screen->group_bytes = 256;
    if (IS_R600_CLASS(screen) && (sPriv->drm_version.minor >= 6)) {
 	   ret = radeonGetParam(sPriv, RADEON_INFO_TILE_CONFIG, &temp);
 	   if (ret)
@@ -1507,6 +1694,7 @@ radeonCreateScreen2(__DRIscreen *sPriv)
 #endif
 
    screen->extensions[i++] = &radeonFlushExtension.base;
+   screen->extensions[i++] = &radeonImageExtension.base;
 
    screen->extensions[i++] = NULL;
    sPriv->extensions = screen->extensions;
diff --git a/src/mesa/drivers/dri/radeon/radeon_screen.h b/src/mesa/drivers/dri/radeon/radeon_screen.h
index 2b33201a538..417ebf3b067 100644
--- a/src/mesa/drivers/dri/radeon/radeon_screen.h
+++ b/src/mesa/drivers/dri/radeon/radeon_screen.h
@@ -121,6 +121,17 @@ typedef struct radeon_screen {
    GLint r7xx_bank_op;
 } radeonScreenRec, *radeonScreenPtr;
 
+struct __DRIimageRec {
+   struct radeon_bo *bo; /* buffer object backing the image; a reference is held */
+   GLenum internal_format; /* GL internal format, e.g. GL_RGB or GL_RGBA */
+   GLuint format; /* MESA_FORMAT_* value */
+   GLenum data_type; /* GL data type, e.g. GL_UNSIGNED_BYTE */
+   int width, height;  /* in pixels */
+   int pitch;          /* in pixels */
+   int cpp; /* bytes per pixel */
+   void *data; /* loaderPrivate pointer supplied when the image was created */
+};
+
 #define IS_R100_CLASS(screen) \
 	((screen->chip_flags & RADEON_CLASS_MASK) == RADEON_CLASS_R100)
 #define IS_R200_CLASS(screen) \
diff --git a/src/mesa/drivers/dri/radeon/radeon_tex.c b/src/mesa/drivers/dri/radeon/radeon_tex.c
index d5285e24cd5..83b1d1b1d74 100644
--- a/src/mesa/drivers/dri/radeon/radeon_tex.c
+++ b/src/mesa/drivers/dri/radeon/radeon_tex.c
@@ -465,5 +465,9 @@ void radeonInitTextureFuncs( radeonContextPtr radeon, struct dd_function_table *
    functions->MapTexture = radeonMapTexture;
    functions->UnmapTexture = radeonUnmapTexture;
 
+#if FEATURE_OES_EGL_image
+   functions->EGLImageTargetTexture2D = radeon_image_target_texture_2d;
+#endif
+
    driInitTextureFormats();
 }
diff --git a/src/mesa/drivers/dri/radeon/radeon_texture.c b/src/mesa/drivers/dri/radeon/radeon_texture.c
index 18ccb512d7a..8b1e34fe766 100644
--- a/src/mesa/drivers/dri/radeon/radeon_texture.c
+++ b/src/mesa/drivers/dri/radeon/radeon_texture.c
@@ -1007,3 +1007,67 @@ unsigned radeonIsFormatRenderable(gl_format mesa_format)
 			return 0;
 	}
 }
+
+#if FEATURE_OES_EGL_image
+/* Back level 0 of a texture with the buffer object of an EGLImage. */
+void radeon_image_target_texture_2d(struct gl_context *ctx, GLenum target,
+				    struct gl_texture_object *texObj,
+				    struct gl_texture_image *texImage,
+				    GLeglImageOES image_handle)
+{
+	radeonContextPtr radeon = RADEON_CONTEXT(ctx);
+	radeonTexObj *t = radeon_tex_obj(texObj);
+	radeon_texture_image *radeonImage = get_radeon_texture_image(texImage);
+	__DRIscreen *screen;
+	__DRIimage *image;
+
+	screen = radeon->dri.screen;
+	image = screen->dri2.image->lookupEGLImage(screen, image_handle,
+						   screen->loaderPrivate);
+	if (image == NULL)
+		return;
+
+	radeonFreeTexImageData(ctx, texImage);
+	texImage->Width = image->width;
+	texImage->Height = image->height;
+	texImage->Depth = 1;
+	texImage->_BaseFormat = GL_RGBA;
+	texImage->TexFormat = image->format;
+	texImage->RowStride = image->pitch;
+	texImage->InternalFormat = image->internal_format;
+
+	if(t->mt)
+	{
+		radeon_miptree_unreference(&t->mt);
+		t->mt = NULL;
+	}
+
+	/* NOTE: The following is *very* ugly and will probably break. But
+	   I don't know how to deal with it, without creating a whole new
+	   function like radeon_miptree_from_bo() so I'm going with the
+	   easy but error-prone way. */
+	radeon_try_alloc_miptree(radeon, t);
+
+	/* The allocation may fail: check t->mt before taking a reference on it. */
+	if (t->mt == NULL)
+	{
+		radeon_print(RADEON_TEXTURE, RADEON_VERBOSE,
+			     "%s Failed to allocate miptree.\n", __func__);
+		return;
+	}
+
+	radeonImage->mtface = _mesa_tex_target_to_face(target);
+	radeonImage->mtlevel = 0;
+	radeon_miptree_reference(t->mt, &radeonImage->mt);
+
+	/* Particularly ugly: this is guaranteed to break, if image->bo is
+	   not of the required size for a miptree. */
+	radeon_bo_unref(t->mt->bo);
+	radeon_bo_ref(image->bo);
+	t->mt->bo = image->bo;
+
+	if (!radeon_miptree_matches_image(t->mt, &radeonImage->base,
+					  radeonImage->mtface, 0))
+		fprintf(stderr, "miptree doesn't match image\n");
+}
+#endif
diff --git a/src/mesa/drivers/dri/radeon/radeon_texture.h b/src/mesa/drivers/dri/radeon/radeon_texture.h
index 9138a7d5548..a1908c6bc72 100644
--- a/src/mesa/drivers/dri/radeon/radeon_texture.h
+++ b/src/mesa/drivers/dri/radeon/radeon_texture.h
@@ -137,4 +137,11 @@ void radeonCopyTexSubImage2D(struct gl_context *ctx, GLenum target, GLint level,
 
 unsigned radeonIsFormatRenderable(gl_format mesa_format);
 
+#if FEATURE_OES_EGL_image
+void radeon_image_target_texture_2d(struct gl_context *ctx, GLenum target,
+				    struct gl_texture_object *texObj,
+				    struct gl_texture_image *texImage,
+				    GLeglImageOES image_handle);
+#endif
+
 #endif
diff --git a/src/mesa/drivers/dri/savage/savage_xmesa.c b/src/mesa/drivers/dri/savage/savage_xmesa.c
index b3aaa0e504e..92fb4f44884 100644
--- a/src/mesa/drivers/dri/savage/savage_xmesa.c
+++ b/src/mesa/drivers/dri/savage/savage_xmesa.c
@@ -50,7 +50,6 @@
 #include "savagespan.h"
 #include "savagetris.h"
 #include "savageioctl.h"
-#include "savage_bci.h"
 
 #include "savage_dri.h"