35 files changed, 776 insertions, 59 deletions
diff --git a/src/glu/Makefile b/src/glu/Makefile
index b025a90b675..e519dfeec49 100644
--- a/src/glu/Makefile
+++ b/src/glu/Makefile
@@ -18,7 +18,11 @@ pcedit = sed \
 	-e 's,@INSTALL_DIR@,$(INSTALL_DIR),' \
 	-e 's,@INSTALL_LIB_DIR@,$(INSTALL_LIB_DIR),' \
 	-e 's,@INSTALL_INC_DIR@,$(INSTALL_INC_DIR),' \
-	-e 's,@VERSION@,$(MESA_MAJOR).$(MESA_MINOR).$(MESA_TINY),'
+	-e 's,@VERSION@,$(MESA_MAJOR).$(MESA_MINOR).$(MESA_TINY),' \
+	-e 's,@GLU_PC_REQ@,$(GLU_PC_REQ),' \
+	-e 's,@GLU_PC_REQ_PRIV@,$(GLU_PC_REQ_PRIV),' \
+	-e 's,@GLU_PC_LIB_PRIV@,$(GLU_PC_LIB_PRIV),' \
+	-e 's,@GLU_PC_CFLAGS@,$(GLU_PC_CFLAGS),'
 glu.pc: glu.pc.in
 	$(pcedit) $< > $@
 
diff --git a/src/glu/glu.pc.in b/src/glu/glu.pc.in
index 8606b9b222c..bc2517e90ed 100644
--- a/src/glu/glu.pc.in
+++ b/src/glu/glu.pc.in
@@ -5,7 +5,9 @@ includedir=@INSTALL_INC_DIR@
 
 Name: glu
 Description: Mesa OpenGL Utility library
-Requires: gl
+Requires: @GLU_PC_REQ@
+Requires.private: @GLU_PC_REQ_PRIV@
 Version: @VERSION@
 Libs: -L${libdir} -lGLU
-Cflags: -I${includedir}
+Libs.private: @GLU_PC_LIB_PRIV@
+Cflags: -I${includedir} @GLU_PC_CFLAGS@
diff --git a/src/glut/glx/Makefile b/src/glut/glx/Makefile
index 74f69c3cff0..87301f1b77d 100644
--- a/src/glut/glx/Makefile
+++ b/src/glut/glx/Makefile
@@ -104,7 +104,10 @@ pcedit = sed \
 	-e 's,@INSTALL_DIR@,$(INSTALL_DIR),' \
 	-e 's,@INSTALL_LIB_DIR@,$(INSTALL_LIB_DIR),' \
 	-e 's,@INSTALL_INC_DIR@,$(INSTALL_INC_DIR),' \
-	-e 's,@VERSION@,$(GLUT_MAJOR).$(GLUT_MINOR).$(GLUT_TINY),'
+	-e 's,@VERSION@,$(GLUT_MAJOR).$(GLUT_MINOR).$(GLUT_TINY),' \
+	-e 's,@GLUT_PC_REQ_PRIV@,$(GLUT_PC_REQ_PRIV),' \
+	-e 's,@GLUT_PC_LIB_PRIV@,$(GLUT_PC_LIB_PRIV),' \
+	-e 's,@GLUT_PC_CFLAGS@,$(GLUT_PC_CFLAGS),'
 glut.pc: glut.pc.in
 	$(pcedit) $< > $@
 
diff --git a/src/glut/glx/glut.pc.in b/src/glut/glx/glut.pc.in
index e8638fe1488..ae0689d7e81 100644
--- a/src/glut/glx/glut.pc.in
+++ b/src/glut/glx/glut.pc.in
@@ -6,6 +6,8 @@ includedir=@INSTALL_INC_DIR@
 Name: glut
 Description: Mesa OpenGL Utility Toolkit library
 Requires: gl glu
+Requires.private: @GLUT_PC_REQ_PRIV@
 Version: @VERSION@
 Libs: -L${libdir} -lglut
-Cflags: -I${includedir}
+Libs.private: @GLUT_PC_LIB_PRIV@
+Cflags: -I${includedir} @GLUT_PC_CFLAGS@
diff --git a/src/glut/mini/Makefile b/src/glut/mini/Makefile
index e47d09edb71..112a250bbd8 100644
--- a/src/glut/mini/Makefile
+++ b/src/glut/mini/Makefile
@@ -77,7 +77,10 @@ pcedit = sed \
 	-e 's,@INSTALL_DIR@,$(INSTALL_DIR),' \
 	-e 's,@INSTALL_LIB_DIR@,$(INSTALL_LIB_DIR),' \
 	-e 's,@INSTALL_INC_DIR@,$(INSTALL_INC_DIR),' \
-	-e 's,@VERSION@,$(GLUT_MAJOR).$(GLUT_MINOR).$(GLUT_TINY),'
+	-e 's,@VERSION@,$(GLUT_MAJOR).$(GLUT_MINOR).$(GLUT_TINY),' \
+	-e 's,@GLUT_PC_REQ_PRIV@,$(GLUT_PC_REQ_PRIV),' \
+	-e 's,@GLUT_PC_LIB_PRIV@,$(GLUT_PC_LIB_PRIV),' \
+	-e 's,@GLUT_PC_CFLAGS@,$(GLUT_PC_CFLAGS),'
 glut.pc: glut.pc.in
 	$(pcedit) $< > $@
 
diff --git a/src/glut/mini/glut.pc.in b/src/glut/mini/glut.pc.in
index e8638fe1488..ae0689d7e81 100644
--- a/src/glut/mini/glut.pc.in
+++ b/src/glut/mini/glut.pc.in
@@ -6,6 +6,8 @@ includedir=@INSTALL_INC_DIR@
 Name: glut
 Description: Mesa OpenGL Utility Toolkit library
 Requires: gl glu
+Requires.private: @GLUT_PC_REQ_PRIV@
 Version: @VERSION@
 Libs: -L${libdir} -lglut
-Cflags: -I${includedir}
+Libs.private: @GLUT_PC_LIB_PRIV@
+Cflags: -I${includedir} @GLUT_PC_CFLAGS@
diff --git a/src/glw/Makefile b/src/glw/Makefile
index 753c4b74d4a..cf412b225eb 100644
--- a/src/glw/Makefile
+++ b/src/glw/Makefile
@@ -30,7 +30,10 @@ pcedit = sed \
 	-e 's,@INSTALL_DIR@,$(INSTALL_DIR),' \
 	-e 's,@INSTALL_LIB_DIR@,$(INSTALL_LIB_DIR),' \
 	-e 's,@INSTALL_INC_DIR@,$(INSTALL_INC_DIR),' \
-	-e 's,@VERSION@,$(MAJOR).$(MINOR).$(TINY),'
+	-e 's,@VERSION@,$(MAJOR).$(MINOR).$(TINY),' \
+	-e 's,@GLW_PC_REQ_PRIV@,$(GLW_PC_REQ_PRIV),' \
+	-e 's,@GLW_PC_LIB_PRIV@,$(GLW_PC_LIB_PRIV),' \
+	-e 's,@GLW_PC_CFLAGS@,$(GLW_PC_CFLAGS),'
 glw.pc: glw.pc.in
 	$(pcedit) $< > $@
 
diff --git a/src/glw/glw.pc.in b/src/glw/glw.pc.in
index 25f3f73b8c6..5493093be19 100644
--- a/src/glw/glw.pc.in
+++ b/src/glw/glw.pc.in
@@ -6,6 +6,8 @@ includedir=@INSTALL_INC_DIR@
 Name: glw
 Description: Mesa OpenGL widget library
 Requires: gl
+Requires.private: @GLW_PC_REQ_PRIV@
 Version: @VERSION@
 Libs: -L${libdir} -lGLw
-Cflags: -I${includedir}
+Libs.private: @GLW_PC_LIB_PRIV@
+Cflags: -I${includedir} @GLW_PC_CFLAGS@
diff --git a/src/glx/x11/indirect.c b/src/glx/x11/indirect.c
index 1fcd5ca1bb1..08d52aeea30 100644
--- a/src/glx/x11/indirect.c
+++ b/src/glx/x11/indirect.c
@@ -5397,8 +5397,7 @@ __glx_TexSubImage_1D2D(unsigned opcode, unsigned dim, GLenum target,
             (void) memcpy((void *) (gc->pc + 44), (void *) (&height), 4);
             (void) memcpy((void *) (gc->pc + 48), (void *) (&format), 4);
             (void) memcpy((void *) (gc->pc + 52), (void *) (&type), 4);
-            (void) memcpy((void *) (gc->pc + 56),
-                          (void *) ((pixels == NULL) ? one : zero), 4);
+            (void) memset((void *) (gc->pc + 56), 0, 4);
             if (compsize > 0) {
                 (*gc->fillImage) (gc, dim, width, height, 1, format, type,
                                   pixels, gc->pc + 60, gc->pc + 4);
@@ -5424,7 +5423,7 @@ __glx_TexSubImage_1D2D(unsigned opcode, unsigned dim, GLenum target,
             (void) memcpy((void *) (pc + 48), (void *) (&height), 4);
             (void) memcpy((void *) (pc + 52), (void *) (&format), 4);
             (void) memcpy((void *) (pc + 56), (void *) (&type), 4);
-            (void) memcpy((void *) (pc + 60), zero, 4);
+            (void) memset((void *) (pc + 60), 0, 4);
             __glXSendLargeImage(gc, compsize, dim, width, height, 1, format,
                                 type, pixels, pc + 64, pc + 8);
         }
@@ -6869,8 +6868,7 @@ __glx_TexSubImage_3D4D(unsigned opcode, unsigned dim, GLenum target,
             (void) memcpy((void *) (gc->pc + 76), (void *) (&extent), 4);
             (void) memcpy((void *) (gc->pc + 80), (void *) (&format), 4);
             (void) memcpy((void *) (gc->pc + 84), (void *) (&type), 4);
-            (void) memcpy((void *) (gc->pc + 88),
-                          (void *) ((pixels == NULL) ? one : zero), 4);
+            (void) memset((void *) (gc->pc + 88), 0, 4);
             if (compsize > 0) {
                 (*gc->fillImage) (gc, dim, width, height, depth, format, type,
                                   pixels, gc->pc + 92, gc->pc + 4);
@@ -6900,7 +6898,7 @@ __glx_TexSubImage_3D4D(unsigned opcode, unsigned dim, GLenum target,
             (void) memcpy((void *) (pc + 80), (void *) (&extent), 4);
             (void) memcpy((void *) (pc + 84), (void *) (&format), 4);
             (void) memcpy((void *) (pc + 88), (void *) (&type), 4);
-            (void) memcpy((void *) (pc + 92), zero, 4);
+            (void) memset((void *) (pc + 92), 0, 4);
             __glXSendLargeImage(gc, compsize, dim, width, height, depth,
                                 format, type, pixels, pc + 96, pc + 8);
         }
diff --git a/src/mesa/Makefile b/src/mesa/Makefile
index ba65ce695fb..6b4057030ef 100644
--- a/src/mesa/Makefile
+++ b/src/mesa/Makefile
@@ -88,14 +88,29 @@ install: default
 	done
 
 
-pcedit = sed \
+gl_pcedit = sed \
 	-e 's,@INSTALL_DIR@,$(INSTALL_DIR),' \
 	-e 's,@INSTALL_LIB_DIR@,$(INSTALL_LIB_DIR),' \
 	-e 's,@INSTALL_INC_DIR@,$(INSTALL_INC_DIR),' \
-	-e 's,@VERSION@,$(MESA_MAJOR).$(MESA_MINOR).$(MESA_TINY),'
+	-e 's,@VERSION@,$(MESA_MAJOR).$(MESA_MINOR).$(MESA_TINY),' \
+	-e 's,@GL_PC_REQ_PRIV@,$(GL_PC_REQ_PRIV),' \
+	-e 's,@GL_PC_LIB_PRIV@,$(GL_PC_LIB_PRIV),' \
+	-e 's,@GL_PC_CFLAGS@,$(GL_PC_CFLAGS),'
 
 gl.pc: gl.pc.in
-	$(pcedit) $< > $@
+	$(gl_pcedit) $< > $@
+
+osmesa_pcedit = sed \
+	-e 's,@INSTALL_DIR@,$(INSTALL_DIR),' \
+	-e 's,@INSTALL_LIB_DIR@,$(INSTALL_LIB_DIR),' \
+	-e 's,@INSTALL_INC_DIR@,$(INSTALL_INC_DIR),' \
+	-e 's,@VERSION@,$(MESA_MAJOR).$(MESA_MINOR).$(MESA_TINY),' \
+	-e 's,@OSMESA_LIB@,$(OSMESA_LIB),' \
+	-e 's,@OSMESA_PC_REQ@,$(OSMESA_PC_REQ),' \
+	-e 's,@OSMESA_PC_LIB_PRIV@,$(OSMESA_PC_LIB_PRIV),'
+
+osmesa.pc: osmesa.pc.in
+	$(osmesa_pcedit) $< > $@
 
 install-headers:
 	$(INSTALL) -d $(DESTDIR)$(INSTALL_INC_DIR)/GL
@@ -109,10 +124,12 @@ install-libgl: default gl.pc install-headers
 		$(DESTDIR)$(INSTALL_LIB_DIR)
 	$(INSTALL) -m 644 gl.pc $(DESTDIR)$(INSTALL_LIB_DIR)/pkgconfig
 
-install-osmesa: default
+install-osmesa: default osmesa.pc
 	$(INSTALL) -d $(DESTDIR)$(INSTALL_LIB_DIR)
+	$(INSTALL) -d $(DESTDIR)$(INSTALL_LIB_DIR)/pkgconfig
 	$(INSTALL) $(TOP)/$(LIB_DIR)/$(OSMESA_LIB_GLOB) \
 		$(DESTDIR)$(INSTALL_LIB_DIR)
+	$(INSTALL) -m 644 osmesa.pc $(DESTDIR)$(INSTALL_LIB_DIR)/pkgconfig
 
 install-dri: default
 	cd drivers/dri && $(MAKE) install
diff --git a/src/mesa/drivers/dri/Makefile b/src/mesa/drivers/dri/Makefile
index eef68825bc9..9e49fb16f53 100644
--- a/src/mesa/drivers/dri/Makefile
+++ b/src/mesa/drivers/dri/Makefile
@@ -25,7 +25,8 @@ pcedit = sed \
 	-e 's,@INSTALL_LIB_DIR@,$(INSTALL_LIB_DIR),' \
 	-e 's,@INSTALL_INC_DIR@,$(INSTALL_INC_DIR),' \
 	-e 's,@VERSION@,$(MESA_MAJOR).$(MESA_MINOR).$(MESA_TINY),' \
-	-e 's,@DRI_DRIVER_DIR@,$(DRI_DRIVER_SEARCH_DIR),'
+	-e 's,@DRI_DRIVER_DIR@,$(DRI_DRIVER_SEARCH_DIR),' \
+	-e 's,@DRI_PC_REQ_PRIV@,$(DRI_PC_REQ_PRIV),'
 
 dri.pc: dri.pc.in
 	$(pcedit) $< > $@
diff --git a/src/mesa/drivers/dri/dri.pc.in b/src/mesa/drivers/dri/dri.pc.in
index c47ee9c7e7d..695aa6cfd66 100644
--- a/src/mesa/drivers/dri/dri.pc.in
+++ b/src/mesa/drivers/dri/dri.pc.in
@@ -7,4 +7,5 @@ dridriverdir=@DRI_DRIVER_DIR@
 Name: dri
 Description: Direct Rendering Infrastructure
 Version: @VERSION@
+Requires.private: @DRI_PC_REQ_PRIV@
 Cflags: -I${includedir}
diff --git a/src/mesa/drivers/dri/i915/i915_texstate.c b/src/mesa/drivers/dri/i915/i915_texstate.c
index d1b0dcdf319..d53e2cbd5aa 100644
--- a/src/mesa/drivers/dri/i915/i915_texstate.c
+++ b/src/mesa/drivers/dri/i915/i915_texstate.c
@@ -295,6 +295,13 @@ i915_update_tex_unit(struct intel_context *intel, GLuint unit, GLuint ss3)
            wt == GL_CLAMP_TO_BORDER || wr == GL_CLAMP_TO_BORDER))
          return GL_FALSE;
 
+      /* Cube map faces are addressed with the S and T coordinates only, so
+       * fall back unless both of those wrap modes are a clamp (the only
+       * modes supported here, per TEXCOORDMODE_CLAMP_EDGE); ignore WRAP_R. */
+      if (tObj->Target == GL_TEXTURE_CUBE_MAP_ARB &&
+          (((ws != GL_CLAMP) && (ws != GL_CLAMP_TO_EDGE)) ||
+           ((wt != GL_CLAMP) && (wt != GL_CLAMP_TO_EDGE))))
+          return GL_FALSE;
 
       state[I915_TEXREG_SS3] = ss3;     /* SS3_NORMALIZED_COORDS */
 
diff --git a/src/mesa/drivers/dri/i965/brw_draw.c b/src/mesa/drivers/dri/i965/brw_draw.c
index c3a26fc82eb..785fb784ca9 100644
--- a/src/mesa/drivers/dri/i965/brw_draw.c
+++ b/src/mesa/drivers/dri/i965/brw_draw.c
@@ -309,12 +309,12 @@ static GLboolean brw_try_draw_prims( GLcontext *ctx,
       if (first_time || (brw->state.dirty.brw & BRW_NEW_PRIMITIVE)) {
 	 first_time = GL_FALSE;
 
+	 brw_validate_state(brw);
+
 	 /* Various fallback checks:  */
 	 if (brw->intel.Fallback)
 	    goto out;
 
-	 brw_validate_state(brw);
-
 	 /* Check that we can fit our state in with our existing batchbuffer, or
 	  * flush otherwise.
 	  */
diff --git a/src/mesa/drivers/dri/i965/brw_wm_glsl.c b/src/mesa/drivers/dri/i965/brw_wm_glsl.c
index cb728190f5c..baecfdcb799 100644
--- a/src/mesa/drivers/dri/i965/brw_wm_glsl.c
+++ b/src/mesa/drivers/dri/i965/brw_wm_glsl.c
@@ -1095,7 +1095,7 @@ static void noise1_sub( struct brw_wm_compile *c ) {
     /* Arrange the two end coordinates into scalars (itmp0/itmp1) to
        be hashed.  Also compute the remainder (offset within the unit
        length), interleaved to reduce register dependency penalties. */
-    brw_RNDD( p, itmp[ 0 ], param );
+    brw_RNDD( p, retype( itmp[ 0 ], BRW_REGISTER_TYPE_D ), param );
     brw_FRC( p, param, param );
     brw_ADD( p, itmp[ 1 ], itmp[ 0 ], brw_imm_ud( 1 ) );
     brw_MOV( p, itmp[ 3 ], brw_imm_ud( 0x79D9 ) ); /* constant used later */
@@ -1220,8 +1220,8 @@ static void noise2_sub( struct brw_wm_compile *c ) {
     /* Arrange the four corner coordinates into scalars (itmp0..itmp3) to
        be hashed.  Also compute the remainders (offsets within the unit
        square), interleaved to reduce register dependency penalties. */
-    brw_RNDD( p, itmp[ 0 ], param0 );
-    brw_RNDD( p, itmp[ 1 ], param1 );
+    brw_RNDD( p, retype( itmp[ 0 ], BRW_REGISTER_TYPE_D ), param0 );
+    brw_RNDD( p, retype( itmp[ 1 ], BRW_REGISTER_TYPE_D ), param1 );
     brw_FRC( p, param0, param0 );
     brw_FRC( p, param1, param1 );
     brw_MOV( p, itmp[ 4 ], brw_imm_ud( 0xBA97 ) ); /* constant used later */
@@ -1400,21 +1400,19 @@ static void noise3_sub( struct brw_wm_compile *c ) {
     /* Arrange the eight corner coordinates into scalars (itmp0..itmp3) to
        be hashed.  Also compute the remainders (offsets within the unit
        cube), interleaved to reduce register dependency penalties. */
-    brw_RNDD( p, itmp[ 0 ], param0 );
-    brw_RNDD( p, itmp[ 1 ], param1 );
-    brw_RNDD( p, itmp[ 2 ], param2 );
-    brw_MOV( p, itmp[ 4 ], brw_imm_ud( 0xBC8F ) ); /* constant used later */
-    brw_MOV( p, itmp[ 5 ], brw_imm_ud( 0xD0BD ) ); /* constant used later */
-    brw_MOV( p, itmp[ 6 ], brw_imm_ud( 0x9B93 ) ); /* constant used later */
+    brw_RNDD( p, retype( itmp[ 0 ], BRW_REGISTER_TYPE_D ), param0 );
+    brw_RNDD( p, retype( itmp[ 1 ], BRW_REGISTER_TYPE_D ), param1 );
+    brw_RNDD( p, retype( itmp[ 2 ], BRW_REGISTER_TYPE_D ), param2 );
     brw_FRC( p, param0, param0 );
     brw_FRC( p, param1, param1 );
     brw_FRC( p, param2, param2 );
     /* Since we now have only 16 bits of precision in the hash, we must
        be more careful about thorough mixing to maintain entropy as we
        squash the input vector into a small scalar. */
-    brw_MUL( p, brw_acc_reg(), itmp[ 4 ], itmp[ 0 ] );
-    brw_MAC( p, brw_acc_reg(), itmp[ 5 ], itmp[ 1 ] );
-    brw_MAC( p, itmp[ 0 ], itmp[ 6 ], itmp[ 2 ] );
+    brw_MUL( p, brw_null_reg(), low_words( itmp[ 0 ] ), brw_imm_uw( 0xBC8F ) );
+    brw_MAC( p, brw_null_reg(), low_words( itmp[ 1 ] ), brw_imm_uw( 0xD0BD ) );
+    brw_MAC( p, low_words( itmp[ 0 ] ), low_words( itmp[ 2 ] ),
+	     brw_imm_uw( 0x9B93 ) );
     brw_ADD( p, high_words( itmp[ 0 ] ), low_words( itmp[ 0 ] ),
 	     brw_imm_uw( 0xBC8F ) );
 
@@ -1668,6 +1666,430 @@ static void emit_noise3( struct brw_wm_compile *c,
     release_tmps( c, mark );
 }
     
+/* For the four-dimensional case, the little micro-optimisation benefits
+   we obtain by unrolling all the loops aren't worth the massive bloat it
+   now causes.  Instead, we loop twice around performing a similar operation
+   to noise3, once for the w=0 cube and once for the w=1, with a bit more
+   code to glue it all together. */
+static void noise4_sub( struct brw_wm_compile *c ) {
+
+    struct brw_compile *p = &c->func;
+    struct brw_reg param[ 4 ],
+	x0y0, x0y1, x1y0, x1y1, /* gradients at four of the corners */
+	w0, /* noise for the w=0 cube */
+	floors[ 2 ], /* integer coordinates of base corner of hypercube */
+	interp[ 4 ], /* interpolation coefficients */
+	t, tmp[ 8 ], /* float temporaries */
+	itmp[ 8 ], /* unsigned integer temporaries (aliases of floats above) */
+	wtmp[ 8 ]; /* 16-way unsigned word temporaries (aliases of above) */
+    int i, j;
+    int mark = mark_tmps( c );
+    GLuint loop, origin;
+    
+    x0y0 = alloc_tmp( c );
+    x0y1 = alloc_tmp( c );
+    x1y0 = alloc_tmp( c );
+    x1y1 = alloc_tmp( c );
+    t = alloc_tmp( c );
+    w0 = alloc_tmp( c );    
+    floors[ 0 ] = retype( alloc_tmp( c ), BRW_REGISTER_TYPE_UD );
+    floors[ 1 ] = retype( alloc_tmp( c ), BRW_REGISTER_TYPE_UD );
+
+    for( i = 0; i < 4; i++ ) {
+	param[ i ] = lookup_tmp( c, mark - 5 + i );
+	interp[ i ] = alloc_tmp( c );
+    }
+    
+    for( i = 0; i < 8; i++ ) {
+	tmp[ i ] = alloc_tmp( c );
+	itmp[ i ] = retype( tmp[ i ], BRW_REGISTER_TYPE_UD );
+	wtmp[ i ] = brw_uw16_grf( tmp[ i ].nr, 0 );
+    }
+
+    brw_set_access_mode( p, BRW_ALIGN_1 );
+
+    /* We only want 16 bits of precision from the integral part of each
+       co-ordinate, but unfortunately the RNDD semantics would saturate
+       at 16 bits if we performed the operation directly to a 16-bit
+       destination.  Therefore, we round to 32-bit temporaries where
+       appropriate, and then store only the lower 16 bits. */
+    brw_RNDD( p, retype( floors[ 0 ], BRW_REGISTER_TYPE_D ), param[ 0 ] );
+    brw_RNDD( p, retype( itmp[ 0 ], BRW_REGISTER_TYPE_D ), param[ 1 ] );
+    brw_RNDD( p, retype( floors[ 1 ], BRW_REGISTER_TYPE_D ), param[ 2 ] );
+    brw_RNDD( p, retype( itmp[ 1 ], BRW_REGISTER_TYPE_D ), param[ 3 ] );
+    brw_MOV( p, high_words( floors[ 0 ] ), low_words( itmp[ 0 ] ) );
+    brw_MOV( p, high_words( floors[ 1 ] ), low_words( itmp[ 1 ] ) );
+
+    /* Modify the flag register here, because the side effect is useful
+       later (see below).  We know for certain that all flags will be
+       cleared, since the FRC instruction cannot possibly generate
+       negative results.  Even for exceptional inputs (infinities, denormals,
+       NaNs), the architecture guarantees that the L conditional is false. */
+    brw_set_conditionalmod( p, BRW_CONDITIONAL_L );
+    brw_FRC( p, param[ 0 ], param[ 0 ] );
+    brw_set_predicate_control( p, BRW_PREDICATE_NONE );
+    for( i = 1; i < 4; i++ )	
+	brw_FRC( p, param[ i ], param[ i ] );
+    
+    /* Calculate the interpolation coefficients (6t^5 - 15t^4 + 10t^3) first
+       of all. */
+    for( i = 0; i < 4; i++ )
+	brw_MUL( p, interp[ i ], param[ i ], brw_imm_f( 6.0 ) );
+    for( i = 0; i < 4; i++ )
+	brw_ADD( p, interp[ i ], interp[ i ], brw_imm_f( -15.0 ) );
+    for( i = 0; i < 4; i++ )
+	brw_MUL( p, interp[ i ], interp[ i ], param[ i ] );
+    for( i = 0; i < 4; i++ )
+	brw_ADD( p, interp[ i ], interp[ i ], brw_imm_f( 10.0 ) );
+    for( j = 0; j < 3; j++ )
+	for( i = 0; i < 4; i++ )
+	    brw_MUL( p, interp[ i ], interp[ i ], param[ i ] );
+
+    /* Mark the current address, as it will be a jump destination.  The
+       following code will be executed twice: first, with the flag
+       register clear indicating the w=0 case, and second with flags
+       set for w=1. */
+    loop = p->nr_insn;
+    
+    /* Arrange the eight corner coordinates into scalars (itmp0..itmp3) to
+       be hashed.  Since we have only 16 bits of precision in the hash, we
+       must be careful about thorough mixing to maintain entropy as we
+       squash the input vector into a small scalar. */
+    brw_MUL( p, brw_null_reg(), low_words( floors[ 0 ] ),
+	     brw_imm_uw( 0xBC8F ) );
+    brw_MAC( p, brw_null_reg(), high_words( floors[ 0 ] ),
+	     brw_imm_uw( 0xD0BD ) );
+    brw_MAC( p, brw_null_reg(), low_words( floors[ 1 ] ),
+	     brw_imm_uw( 0x9B93 ) );
+    brw_MAC( p, low_words( itmp[ 0 ] ), high_words( floors[ 1 ] ),
+	     brw_imm_uw( 0xA359 ) );
+    brw_ADD( p, high_words( itmp[ 0 ] ), low_words( itmp[ 0 ] ),
+	     brw_imm_uw( 0xBC8F ) );
+
+    /* Temporarily disable the execution mask while we work with ExecSize=16
+       channels (the mask is set for ExecSize=8 and is probably incorrect).
+       Although this might cause execution of unwanted channels, the code
+       writes only to temporary registers and has no side effects, so
+       disabling the mask is harmless. */
+    brw_push_insn_state( p );
+    brw_set_mask_control( p, BRW_MASK_DISABLE );
+    brw_ADD( p, wtmp[ 1 ], wtmp[ 0 ], brw_imm_uw( 0xD0BD ) );
+    brw_ADD( p, wtmp[ 2 ], wtmp[ 0 ], brw_imm_uw( 0x9B93 ) );
+    brw_ADD( p, wtmp[ 3 ], wtmp[ 1 ], brw_imm_uw( 0x9B93 ) );
+
+    /* We're now ready to perform the hashing.  The eight hashes are
+       interleaved for performance.  The hash function used is
+       designed to rapidly achieve avalanche and require only 16x16
+       bit multiplication, and 8-bit swizzles (which we get for
+       free). */
+    for( i = 0; i < 4; i++ )
+	brw_MUL( p, wtmp[ i ], wtmp[ i ], brw_imm_uw( 0x28D9 ) );
+    for( i = 0; i < 4; i++ )
+	brw_XOR( p, even_bytes( wtmp[ i ] ), even_bytes( wtmp[ i ] ),
+		 odd_bytes( wtmp[ i ] ) );
+    for( i = 0; i < 4; i++ )
+	brw_MUL( p, wtmp[ i ], wtmp[ i ], brw_imm_uw( 0xC6D5 ) );
+    for( i = 0; i < 4; i++ )
+	brw_XOR( p, even_bytes( wtmp[ i ] ), even_bytes( wtmp[ i ] ),
+		 odd_bytes( wtmp[ i ] ) );
+    brw_pop_insn_state( p );
+
+    /* Now we want to initialise the four rear gradients based on the
+       hashes.  Format conversion from signed integer to float leaves
+       everything scaled too high by a factor of pow( 2, 15 ), but
+       we correct for that right at the end. */
+    /* x component */
+    brw_ADD( p, t, param[ 0 ], brw_imm_f( -1.0 ) );
+    brw_MOV( p, x0y0, low_words( tmp[ 0 ] ) );
+    brw_MOV( p, x0y1, low_words( tmp[ 1 ] ) );
+    brw_MOV( p, x1y0, high_words( tmp[ 0 ] ) );
+    brw_MOV( p, x1y1, high_words( tmp[ 1 ] ) );
+
+    brw_push_insn_state( p );
+    brw_set_mask_control( p, BRW_MASK_DISABLE );
+    brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 4 ) );
+    brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 4 ) );
+    brw_pop_insn_state( p );
+    
+    brw_MUL( p, x1y0, x1y0, t );
+    brw_MUL( p, x1y1, x1y1, t );
+    brw_ADD( p, t, param[ 1 ], brw_imm_f( -1.0 ) );
+    brw_MUL( p, x0y0, x0y0, param[ 0 ] );
+    brw_MUL( p, x0y1, x0y1, param[ 0 ] );
+
+    /* y component */
+    brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
+    brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );
+    brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
+    brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );
+    
+    brw_push_insn_state( p );
+    brw_set_mask_control( p, BRW_MASK_DISABLE );
+    brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 4 ) );
+    brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 4 ) );
+    brw_pop_insn_state( p );
+
+    brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
+    brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );    
+    /* Prepare t for the w component (used below): w the first time through
+       the loop, w - 1 the second time. */
+    brw_set_predicate_control( p, BRW_PREDICATE_NORMAL );
+    brw_ADD( p, t, param[ 3 ], brw_imm_f( -1.0 ) );
+    p->current->header.predicate_inverse = 1;
+    brw_MOV( p, t, param[ 3 ] );
+    p->current->header.predicate_inverse = 0;
+    brw_set_predicate_control( p, BRW_PREDICATE_NONE );
+    brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param[ 1 ] );
+    brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param[ 1 ] );
+    
+    brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
+    brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
+    brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
+    brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
+    
+    /* z component */
+    brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
+    brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
+    brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );
+    brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );
+
+    brw_push_insn_state( p );
+    brw_set_mask_control( p, BRW_MASK_DISABLE );
+    brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 4 ) );
+    brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 4 ) );
+    brw_pop_insn_state( p );
+
+    brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param[ 2 ] );
+    brw_MUL( p, tmp[ 5 ], tmp[ 5 ], param[ 2 ] );
+    brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param[ 2 ] );
+    brw_MUL( p, tmp[ 7 ], tmp[ 7 ], param[ 2 ] );
+    
+    brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
+    brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
+    brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
+    brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
+
+    /* w component */
+    brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
+    brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
+    brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );
+    brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );
+
+    brw_MUL( p, tmp[ 4 ], tmp[ 4 ], t );
+    brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
+    brw_MUL( p, tmp[ 6 ], tmp[ 6 ], t );
+    brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
+    brw_ADD( p, t, param[ 0 ], brw_imm_f( -1.0 ) );
+    
+    brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
+    brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
+    brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
+    brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
+
+    /* Here we interpolate in the y dimension... */
+    brw_ADD( p, x0y1, x0y1, negate( x0y0 ) );
+    brw_ADD( p, x1y1, x1y1, negate( x1y0 ) );
+    brw_MUL( p, x0y1, x0y1, interp[ 1 ] );
+    brw_MUL( p, x1y1, x1y1, interp[ 1 ] );
+    brw_ADD( p, x0y0, x0y0, x0y1 );
+    brw_ADD( p, x1y0, x1y0, x1y1 );
+
+    /* And now in x.  Leave the result in tmp[ 0 ] (see below)... */
+    brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
+    brw_MUL( p, x1y0, x1y0, interp[ 0 ] );
+    brw_ADD( p, tmp[ 0 ], x0y0, x1y0 );
+
+    /* Now do the same thing for the front four gradients... */
+    /* x component */
+    brw_MOV( p, x0y0, low_words( tmp[ 2 ] ) );
+    brw_MOV( p, x0y1, low_words( tmp[ 3 ] ) );
+    brw_MOV( p, x1y0, high_words( tmp[ 2 ] ) );
+    brw_MOV( p, x1y1, high_words( tmp[ 3 ] ) );
+
+    brw_push_insn_state( p );
+    brw_set_mask_control( p, BRW_MASK_DISABLE );
+    brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 4 ) );
+    brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 4 ) );
+    brw_pop_insn_state( p );
+
+    brw_MUL( p, x1y0, x1y0, t );
+    brw_MUL( p, x1y1, x1y1, t );
+    brw_ADD( p, t, param[ 1 ], brw_imm_f( -1.0 ) );
+    brw_MUL( p, x0y0, x0y0, param[ 0 ] );
+    brw_MUL( p, x0y1, x0y1, param[ 0 ] );
+
+    /* y component */
+    brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
+    brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );
+    brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
+    brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );
+    
+    brw_push_insn_state( p );
+    brw_set_mask_control( p, BRW_MASK_DISABLE );
+    brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 4 ) );
+    brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 4 ) );
+    brw_pop_insn_state( p );
+
+    brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
+    brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
+    brw_ADD( p, t, param[ 2 ], brw_imm_f( -1.0 ) );
+    brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param[ 1 ] );
+    brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param[ 1 ] );
+    
+    brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
+    brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
+    brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
+    brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
+    
+    /* z component */
+    brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
+    brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
+    brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );
+    brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );
+
+    brw_push_insn_state( p );
+    brw_set_mask_control( p, BRW_MASK_DISABLE );
+    brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 4 ) );
+    brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 4 ) );
+    brw_pop_insn_state( p );
+
+    brw_MUL( p, tmp[ 4 ], tmp[ 4 ], t );
+    brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
+    brw_MUL( p, tmp[ 6 ], tmp[ 6 ], t );
+    brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
+    /* Prepare t for the w component (used below): w the first time through
+       the loop, w - 1 the second time. */
+    brw_set_predicate_control( p, BRW_PREDICATE_NORMAL );
+    brw_ADD( p, t, param[ 3 ], brw_imm_f( -1.0 ) );
+    p->current->header.predicate_inverse = 1;
+    brw_MOV( p, t, param[ 3 ] );
+    p->current->header.predicate_inverse = 0;
+    brw_set_predicate_control( p, BRW_PREDICATE_NONE );
+    
+    brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
+    brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
+    brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
+    brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
+
+    /* w component */
+    brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
+    brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
+    brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );
+    brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );
+
+    brw_MUL( p, tmp[ 4 ], tmp[ 4 ], t );
+    brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
+    brw_MUL( p, tmp[ 6 ], tmp[ 6 ], t );
+    brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
+    
+    brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
+    brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
+    brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
+    brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
+
+    /* Interpolate in the y dimension: */
+    brw_ADD( p, x0y1, x0y1, negate( x0y0 ) );
+    brw_ADD( p, x1y1, x1y1, negate( x1y0 ) );
+    brw_MUL( p, x0y1, x0y1, interp[ 1 ] );
+    brw_MUL( p, x1y1, x1y1, interp[ 1 ] );
+    brw_ADD( p, x0y0, x0y0, x0y1 );
+    brw_ADD( p, x1y0, x1y0, x1y1 );
+
+    /* And now in x.  The rear face is in tmp[ 0 ] (see above), so this
+       time put the front face in tmp[ 1 ] and we're nearly there... */
+    brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
+    brw_MUL( p, x1y0, x1y0, interp[ 0 ] );
+    brw_ADD( p, tmp[ 1 ], x0y0, x1y0 );
+
+    /* Another interpolation, in the z dimension: */
+    brw_ADD( p, tmp[ 1 ], tmp[ 1 ], negate( tmp[ 0 ] ) );    
+    brw_MUL( p, tmp[ 1 ], tmp[ 1 ], interp[ 2 ] );
+    brw_ADD( p, tmp[ 0 ], tmp[ 0 ], tmp[ 1 ] );
+
+    /* Exit the loop if we've computed both cubes... */
+    origin = p->nr_insn;
+    brw_push_insn_state( p );
+    brw_set_predicate_control( p, BRW_PREDICATE_NORMAL );
+    brw_set_mask_control( p, BRW_MASK_DISABLE );
+    brw_ADD( p, brw_ip_reg(), brw_ip_reg(), brw_imm_d( 0 ) );
+    brw_pop_insn_state( p );
+
+    /* Save the result for the w=0 case, and increment the w coordinate: */
+    brw_MOV( p, w0, tmp[ 0 ] );
+    brw_ADD( p, high_words( floors[ 1 ] ), high_words( floors[ 1 ] ),
+	     brw_imm_uw( 1 ) );
+
+    /* Loop around for the other cube.  Explicitly set the flag register
+       (unfortunately we must spend an extra instruction to do this: we
+       can't rely on a side effect of the previous MOV or ADD because
+       conditional modifiers which are normally true might be false in
+       exceptional circumstances, e.g. given a NaN input; the add to
+       brw_ip_reg() is not suitable because the IP is not an 8-vector). */
+    brw_push_insn_state( p );
+    brw_set_mask_control( p, BRW_MASK_DISABLE );
+    brw_MOV( p, brw_flag_reg(), brw_imm_uw( 0xFF ) );
+    brw_ADD( p, brw_ip_reg(), brw_ip_reg(),
+	     brw_imm_d( ( loop - p->nr_insn ) << 4 ) );
+    brw_pop_insn_state( p );
+
+    /* Patch the previous conditional branch now that we know the
+       destination address. */
+    brw_set_src1( p->store + origin,
+		  brw_imm_d( ( p->nr_insn - origin ) << 4 ) );
+
+    /* The very last interpolation. */
+    brw_ADD( p, tmp[ 0 ], tmp[ 0 ], negate( w0 ) );    
+    brw_MUL( p, tmp[ 0 ], tmp[ 0 ], interp[ 3 ] );
+    brw_ADD( p, tmp[ 0 ], tmp[ 0 ], w0 );
+
+    /* scale by pow( 2, -15 ), as described above */
+    brw_MUL( p, param[ 0 ], tmp[ 0 ], brw_imm_f( 0.000030517578125 ) );
+
+    release_tmps( c, mark );
+}
+
+static void emit_noise4( struct brw_wm_compile *c,
+			 struct prog_instruction *inst )
+{
+    struct brw_compile *p = &c->func;
+    struct brw_reg src0, src1, src2, src3, param0, param1, param2, param3, dst;
+    GLuint mask = inst->DstReg.WriteMask; /* destination channels to write */
+    int i;
+    int mark = mark_tmps( c ); /* restored by release_tmps() at the end */
+
+    assert( mark == 0 ); /* NOTE(review): presumably needed because noise4_sub finds its params via lookup_tmp() at fixed offsets -- confirm */
+    
+    src0 = get_src_reg( c, inst->SrcReg, 0, 1 );
+    src1 = get_src_reg( c, inst->SrcReg, 1, 1 );
+    src2 = get_src_reg( c, inst->SrcReg, 2, 1 );
+    src3 = get_src_reg( c, inst->SrcReg, 3, 1 );
+
+    param0 = alloc_tmp( c ); /* the four temporaries carry the arguments; */
+    param1 = alloc_tmp( c ); /* param0 is also read back as the result */
+    param2 = alloc_tmp( c );
+    param3 = alloc_tmp( c );
+
+    brw_MOV( p, param0, src0 ); /* copy the sources into the temporaries */
+    brw_MOV( p, param1, src1 );
+    brw_MOV( p, param2, src2 );
+    brw_MOV( p, param3, src3 );
+
+    invoke_subroutine( c, SUB_NOISE4, noise4_sub );
+    
+    /* Fill in the result: */
+    brw_set_saturate( p, inst->SaturateMode == SATURATE_ZERO_ONE ); /* saturate the MOVs below only if requested */
+    for (i = 0 ; i < 4; i++) {
+	if (mask & (1<<i)) { /* write only channels enabled in the write mask */
+	    dst = get_dst_reg(c, inst, i, 1);
+	    brw_MOV( p, dst, param0 ); /* same scalar result to each channel */
+	}
+    }
+    if( inst->SaturateMode == SATURATE_ZERO_ONE )
+	brw_set_saturate( p, 0 ); /* turn saturation back off afterwards */
+    
+    release_tmps( c, mark );
+}
+    
 static void emit_wpos_xy(struct brw_wm_compile *c,
                 struct prog_instruction *inst)
 {
@@ -1996,8 +2418,9 @@ static void brw_wm_emit_glsl(struct brw_context *brw, struct brw_wm_compile *c)
 	    case OPCODE_NOISE3:
 		emit_noise3(c, inst);
 		break;
-	    /* case OPCODE_NOISE4: */
-		/* not yet implemented */
+	    case OPCODE_NOISE4:
+		emit_noise4(c, inst);
+		break;
 	    case OPCODE_TEX:
 		emit_tex(c, inst);
 		break;
diff --git a/src/mesa/drivers/dri/intel/intel_batchbuffer.h b/src/mesa/drivers/dri/intel/intel_batchbuffer.h
index 8129996979f..51579df09e7 100644
--- a/src/mesa/drivers/dri/intel/intel_batchbuffer.h
+++ b/src/mesa/drivers/dri/intel/intel_batchbuffer.h
@@ -55,6 +55,12 @@ struct intel_batchbuffer
 
    GLuint size;
 
+   /** Tracking of BEGIN_BATCH()/OUT_BATCH()/ADVANCE_BATCH() debugging */
+   struct {
+      GLuint total;
+      GLubyte *start_ptr;
+   } emit;
+
    GLuint dirty_state;
 };
 
@@ -143,9 +149,12 @@ intel_batchbuffer_require_space(struct intel_batchbuffer *batch,
 
 #define BEGIN_BATCH(n, cliprect_mode) do {				\
    intel_batchbuffer_require_space(intel->batch, (n)*4, cliprect_mode); \
+   assert(intel->batch->emit.start_ptr == NULL);			\
+   intel->batch->emit.total = (n) * 4;					\
+   intel->batch->emit.start_ptr = intel->batch->ptr;			\
 } while (0)
 
-#define OUT_BATCH(d)  intel_batchbuffer_emit_dword(intel->batch, d)
+#define OUT_BATCH(d) intel_batchbuffer_emit_dword(intel->batch, d)
 
 #define OUT_RELOC(buf, read_domains, write_domain, delta) do {		\
    assert((delta) >= 0);						\
@@ -153,7 +162,16 @@ intel_batchbuffer_require_space(struct intel_batchbuffer *batch,
 				read_domains, write_domain, delta);	\
 } while (0)
 
-#define ADVANCE_BATCH() do { } while(0)
+#define ADVANCE_BATCH() do { /* check bytes emitted since BEGIN_BATCH */	\
+   unsigned int _n;							\
+   assert(intel->batch->emit.start_ptr != NULL); /* BEGIN_BATCH ran */	\
+   _n = intel->batch->ptr - intel->batch->emit.start_ptr; /* in bytes */	\
+   if (_n != intel->batch->emit.total) {				\
+      fprintf(stderr, "ADVANCE_BATCH: %u of %u bytes emitted\n", _n, intel->batch->emit.total); \
+      abort();								\
+   }									\
+   intel->batch->emit.start_ptr = NULL; /* re-arm for next BEGIN_BATCH */	\
+} while(0)
 
 
 static INLINE void
diff --git a/src/mesa/drivers/dri/intel/intel_decode.c b/src/mesa/drivers/dri/intel/intel_decode.c
index 0e72ca08b22..0b8a287f6fe 100644
--- a/src/mesa/drivers/dri/intel/intel_decode.c
+++ b/src/mesa/drivers/dri/intel/intel_decode.c
@@ -836,10 +836,71 @@ get_965_depthformat(unsigned int depthformat)
     }
 }
 
+static const char *
+get_965_element_component(uint32_t data, int component)
+{
+    /* Channel names reported when the control value is 1. */
+    static const char *const channel[] = { "X", "Y", "Z", "W" };
+    /* Each component owns a 3-bit control field; component 0 sits at
+     * bits 30:28 and component 3 at bits 18:16. */
+    const int shift = 16 + (3 - component) * 4;
+    const uint32_t control = (data >> shift) & 0x7;
+
+    if (control == 0)
+	return "nostore";
+    if (control == 1) {
+	if (component >= 0 && component <= 3)
+	    return channel[component];
+	return "fail";
+    }
+    if (control == 2)
+	return "0.0";
+    if (control == 3)
+	return "1.0";
+    if (control == 4)
+	return "0x1";
+    if (control == 5)
+	return "VID";
+
+    return "fail";
+}
+
+static const char *
+get_965_prim_type(uint32_t data)
+{
+    /* Indexed by the 5-bit type in bits 14:10; other values give "fail". */
+    static const char *const prim_names[] = {
+	[0x01] = "point list",
+	[0x02] = "line list",
+	[0x03] = "line strip",
+	[0x04] = "tri list",
+	[0x05] = "tri strip",
+	[0x06] = "tri fan",
+	[0x07] = "quad list",
+	[0x08] = "quad strip",
+	[0x09] = "line list adj",
+	[0x0a] = "line strip adj",
+	[0x0b] = "tri list adj",
+	[0x0c] = "tri strip adj",
+	[0x0d] = "tri strip reverse",
+	[0x0e] = "polygon",
+	[0x0f] = "rect list",
+	[0x10] = "line loop",
+	[0x11] = "point list bf",
+	[0x12] = "line strip cont",
+	[0x13] = "line strip bf",
+	[0x14] = "line strip cont bf",
+	[0x15] = "tri fan no stipple",
+    };
+    const uint32_t prim = (data >> 10) & 0x1f;
+    return (prim >= 0x01 && prim <= 0x15) ? prim_names[prim] : "fail";
+}
+
 static int
 decode_3d_965(uint32_t *data, int count, uint32_t hw_offset, int *failures)
 {
     unsigned int opcode, len;
+    int i;
 
     struct {
 	uint32_t opcode;
@@ -860,8 +921,7 @@ decode_3d_965(uint32_t *data, int count, uint32_t hw_offset, int *failures)
 	{ 0x780b, 1, 1, "3DSTATE_VF_STATISTICS" },
 	{ 0x7808, 5, 257, "3DSTATE_VERTEX_BUFFERS" },
 	{ 0x7809, 3, 256, "3DSTATE_VERTEX_ELEMENTS" },
-	/* 0x7808: 3DSTATE_VERTEX_BUFFERS */
-	/* 0x7809: 3DSTATE_VERTEX_ELEMENTS */
+	{ 0x780a, 3, 3, "3DSTATE_INDEX_BUFFER" },
 	{ 0x7900, 4, 4, "3DSTATE_DRAWING_RECTANGLE" },
 	{ 0x7901, 5, 5, "3DSTATE_CONSTANT_COLOR" },
 	{ 0x7905, 5, 7, "3DSTATE_DEPTH_BUFFER" },
@@ -947,6 +1007,64 @@ decode_3d_965(uint32_t *data, int count, uint32_t hw_offset, int *failures)
 
 	return len;
 
+    case 0x7808:
+	len = (data[0] & 0xff) + 2;
+	if ((len - 1) % 4 != 0)
+	    fprintf(out, "Bad count in 3DSTATE_VERTEX_BUFFERS\n");
+	if (count < len)
+	    BUFFER_FAIL(count, len, "3DSTATE_VERTEX_BUFFERS");
+	instr_out(data, hw_offset, 0, "3DSTATE_VERTEX_BUFFERS\n");
+
+	for (i = 1; i < len;) {
+	    instr_out(data, hw_offset, i, "buffer %d: %s, pitch %db\n",
+		      data[i] >> 27,
+		      data[i] & (1 << 26) ? "random" : "sequential",
+		      data[i] & 0x07ff);
+	    i++;
+	    instr_out(data, hw_offset, i++, "buffer address\n");
+	    instr_out(data, hw_offset, i++, "max index\n");
+	    instr_out(data, hw_offset, i++, "mbz\n");
+	}
+	return len;
+
+    case 0x7809:
+	len = (data[0] & 0xff) + 2;
+	if ((len + 1) % 2 != 0)
+	    fprintf(out, "Bad count in 3DSTATE_VERTEX_ELEMENTS\n");
+	if (count < len)
+	    BUFFER_FAIL(count, len, "3DSTATE_VERTEX_ELEMENTS");
+	instr_out(data, hw_offset, 0, "3DSTATE_VERTEX_ELEMENTS\n");
+	/* One header dword, then two dwords per element (hence the odd len). */
+	for (i = 1; i < len;) {
+	    instr_out(data, hw_offset, i, "buffer %d: %svalid, type 0x%04x, "
+		      "src offset 0x%04x bytes\n",
+		      data[i] >> 27,
+		      data[i] & (1 << 26) ? "" : "in",
+		      (data[i] >> 16) & 0x1ff,
+		      data[i] & 0x07ff);
+	    i++;
+	    instr_out(data, hw_offset, i, "(%s, %s, %s, %s), "
+		      "dst offset 0x%02x bytes\n",
+		      get_965_element_component(data[i], 0),
+		      get_965_element_component(data[i], 1),
+		      get_965_element_component(data[i], 2),
+		      get_965_element_component(data[i], 3),
+		      (data[i] & 0xff) * 4);
+	    i++;
+	}
+	return len;
+
+    case 0x780a:
+	len = (data[0] & 0xff) + 2;
+	if (len != 3)
+	    fprintf(out, "Bad count in 3DSTATE_INDEX_BUFFER\n");
+	if (count < len)
+	    BUFFER_FAIL(count, len, "3DSTATE_INDEX_BUFFER");
+	instr_out(data, hw_offset, 0, "3DSTATE_INDEX_BUFFER\n");
+	instr_out(data, hw_offset, 1, "beginning buffer address\n");
+	instr_out(data, hw_offset, 2, "ending buffer address\n");
+	return len;
+
     case 0x7900:
 	if (len != 4)
 	    fprintf(out, "Bad count in 3DSTATE_DRAWING_RECTANGLE\n");
@@ -968,9 +1086,9 @@ decode_3d_965(uint32_t *data, int count, uint32_t hw_offset, int *failures)
 	return len;
 
     case 0x7905:
-	if (len != 5)
+	if (len != 5 && len != 6)
 	    fprintf(out, "Bad count in 3DSTATE_DEPTH_BUFFER\n");
-	if (count < 5)
+	if (count < len)
 	    BUFFER_FAIL(count, len, "3DSTATE_DEPTH_BUFFER");
 
 	instr_out(data, hw_offset, 0,
@@ -985,7 +1103,27 @@ decode_3d_965(uint32_t *data, int count, uint32_t hw_offset, int *failures)
 		  ((data[3] & 0x0007ffc0) >> 6) + 1,
 		  ((data[3] & 0xfff80000) >> 19) + 1);
 	instr_out(data, hw_offset, 4, "volume depth\n");
+	if (len == 6)
+	    instr_out(data, hw_offset, 5, "\n");
+
+	return len;
 
+    case 0x7b00:
+	len = (data[0] & 0xff) + 2;
+	if (len != 6)
+	    fprintf(out, "Bad count in 3DPRIMITIVE\n");
+	if (count < len)
+	    BUFFER_FAIL(count, len, "3DPRIMITIVE");
+
+	instr_out(data, hw_offset, 0,
+		  "3DPRIMITIVE: %s %s\n",
+		  get_965_prim_type(data[0]),
+		  (data[0] & (1 << 15)) ? "random" : "sequential");
+	instr_out(data, hw_offset, 1, "primitive count\n");
+	instr_out(data, hw_offset, 2, "start vertex\n");
+	instr_out(data, hw_offset, 3, "instance count\n");
+	instr_out(data, hw_offset, 4, "start instance\n");
+	instr_out(data, hw_offset, 5, "index bias\n");
 	return len;
     }
 
diff --git a/src/mesa/drivers/dri/intel/intel_mipmap_tree.c b/src/mesa/drivers/dri/intel/intel_mipmap_tree.c
index b96ba72853e..bf1c3f03f0e 100644
--- a/src/mesa/drivers/dri/intel/intel_mipmap_tree.c
+++ b/src/mesa/drivers/dri/intel/intel_mipmap_tree.c
@@ -111,9 +111,9 @@ intel_miptree_create(struct intel_context *intel,
 				      first_level, last_level, width0,
 				      height0, depth0, cpp, compress_byte);
    /*
-    * pitch == 0 indicates the null texture
+    * pitch == 0 || height == 0  indicates the null texture
     */
-   if (!mt || !mt->pitch)
+   if (!mt || !mt->pitch || !mt->total_height)
       return NULL;
 
    mt->region = intel_region_alloc(intel,
@@ -163,7 +163,7 @@ intel_miptree_create_for_region(struct intel_context *intel,
    mt->pitch = region->pitch;
 #endif
 
-   mt->region = region;
+   intel_region_reference(&mt->region, region);
 
    return mt;
  }
diff --git a/src/mesa/drivers/dri/intel/intel_tex.c b/src/mesa/drivers/dri/intel/intel_tex.c
index 82f8b870095..e64d8a1556d 100644
--- a/src/mesa/drivers/dri/intel/intel_tex.c
+++ b/src/mesa/drivers/dri/intel/intel_tex.c
@@ -231,6 +231,7 @@ intelInitTextureFuncs(struct dd_function_table *functions)
 
    /* compressed texture functions */
    functions->CompressedTexImage2D = intelCompressedTexImage2D;
+   functions->CompressedTexSubImage2D = intelCompressedTexSubImage2D;
    functions->GetCompressedTexImage = intelGetCompressedTexImage;
 
    functions->NewTextureObject = intelNewTextureObject;
diff --git a/src/mesa/drivers/dri/intel/intel_tex.h b/src/mesa/drivers/dri/intel/intel_tex.h
index 6219c1c953b..742ccc043aa 100644
--- a/src/mesa/drivers/dri/intel/intel_tex.h
+++ b/src/mesa/drivers/dri/intel/intel_tex.h
@@ -130,6 +130,16 @@ void intelCompressedTexImage2D( GLcontext *ctx, GLenum target, GLint level,
 				struct gl_texture_object *texObj,
 				struct gl_texture_image *texImage );
 
+void intelCompressedTexSubImage2D(GLcontext * ctx,
+				  GLenum target,
+				  GLint level,
+				  GLint xoffset, GLint yoffset,
+				  GLsizei width, GLsizei height,
+				  GLenum format, GLsizei imageSize,
+				  const GLvoid * pixels,
+				  struct gl_texture_object *texObj,
+				  struct gl_texture_image *texImage);
+
 void intelGetCompressedTexImage(GLcontext *ctx, GLenum target, GLint level,
 				GLvoid *pixels,
 				struct gl_texture_object *texObj,
diff --git a/src/mesa/drivers/dri/intel/intel_tex_subimage.c b/src/mesa/drivers/dri/intel/intel_tex_subimage.c
index b7523618868..f86de568976 100644
--- a/src/mesa/drivers/dri/intel/intel_tex_subimage.c
+++ b/src/mesa/drivers/dri/intel/intel_tex_subimage.c
@@ -184,3 +184,18 @@ intelTexSubImage1D(GLcontext * ctx,
                     format, type, pixels, packing, texObj, texImage);
 
 }
+
+void
+intelCompressedTexSubImage2D(GLcontext * ctx,
+			     GLenum target,
+			     GLint level,
+			     GLint xoffset, GLint yoffset,
+			     GLsizei width, GLsizei height,
+			     GLenum format, GLsizei imageSize,
+			     const GLvoid * pixels,
+			     struct gl_texture_object *texObj,
+			     struct gl_texture_image *texImage)
+{  /* Stub: only logs the requested region; no texel data is written. */
+   fprintf(stderr, "stubbed CompressedTexSubImage2D: %dx%d@%dx%d\n",
+	   width, height, xoffset, yoffset);
+}
diff --git a/src/mesa/gl.pc.in b/src/mesa/gl.pc.in
index 1927880d5f9..0462b9fca2b 100644
--- a/src/mesa/gl.pc.in
+++ b/src/mesa/gl.pc.in
@@ -5,7 +5,8 @@ includedir=@INSTALL_INC_DIR@
 
 Name: gl
 Description: Mesa OpenGL library
-Requires:
+Requires.private: @GL_PC_REQ_PRIV@
 Version: @VERSION@
 Libs: -L${libdir} -lGL
-Cflags: -I${includedir}
+Libs.private: @GL_PC_LIB_PRIV@
+Cflags: -I${includedir} @GL_PC_CFLAGS@
diff --git a/src/mesa/glapi/extension_helper.py b/src/mesa/glapi/extension_helper.py
index 375e3ea59e7..64f64a2fd86 100644
--- a/src/mesa/glapi/extension_helper.py
+++ b/src/mesa/glapi/extension_helper.py
@@ -174,6 +174,9 @@ class PrintGlExtensionGlue(gl_XML.gl_print_base):
 
 				parameter_signature = ''
 				for p in f.parameterIterator():
+					if p.is_padding:
+						continue
+
 					# FIXME: This is a *really* ugly hack. :(
 
 					tn = p.type_expr.get_base_type_node()
diff --git a/src/mesa/glapi/glX_proto_recv.py b/src/mesa/glapi/glX_proto_recv.py
index 20f75575cf1..923c1958f0f 100644
--- a/src/mesa/glapi/glX_proto_recv.py
+++ b/src/mesa/glapi/glX_proto_recv.py
@@ -89,8 +89,8 @@ class PrintGlxDispatchFunctions(glX_proto_common.glx_print_proto):
 		print '#include "glxbyteorder.h"'
 		print '#include "indirect_util.h"'
 		print '#include "singlesize.h"'
-		print '#include "glapitable.h"'
 		print '#include "glapi.h"'
+		print '#include "glapitable.h"'
 		print '#include "glthread.h"'
 		print '#include "dispatch.h"'
 		print ''
@@ -225,6 +225,8 @@ class PrintGlxDispatchFunctions(glX_proto_common.glx_print_proto):
 		list = []
 
 		for param in f.parameterIterator():
+			if param.is_padding:
+				continue
 
 			if param.is_counter or param.is_image() or param.is_output or param.name in f.count_parameter_list or len(param.count_parameter_list):
 				location = param.name
diff --git a/src/mesa/glapi/glX_proto_send.py b/src/mesa/glapi/glX_proto_send.py
index b00b8a1ba6d..501706acc77 100644
--- a/src/mesa/glapi/glX_proto_send.py
+++ b/src/mesa/glapi/glX_proto_send.py
@@ -333,7 +333,7 @@ const GLuint __glXDefaultPixelStore[9] = { 0, 0, 0, 0, 0, 0, 0, 0, 1 };
 					if image.img_pad_dimensions:
 						do_it = 1
 						break
-			
+
 
 				if do_it:
 					[h, n] = hash_pixel_function(func)
@@ -422,7 +422,10 @@ generic_%u_byte( GLint rop, const void * ptr )
 		else:
 			src_ptr = "&" + p.name
 
-		if not extra_offset:
+		if p.is_padding:
+			print '(void) memset((void *)(%s + %u), 0, %s);' \
+			    % (pc, p.offset + adjust, p.size_string() )
+		elif not extra_offset:
 			print '(void) memcpy((void *)(%s + %u), (void *)(%s), %s);' \
 			    % (pc, p.offset + adjust, src_ptr, p.size_string() )
 		else:
@@ -472,6 +475,10 @@ generic_%u_byte( GLint rop, const void * ptr )
 				else:
 					dim_str = str(dim)
 
+				if param.is_padding:
+					print '(void) memset((void *)(%s + %u), 0, %s);' \
+					% (pc, (param.offset - 4) + adjust, param.size_string() )
+
 				if param.img_null_flag:
 					if large:
 						print '(void) memcpy((void *)(%s + %u), zero, 4);' % (pc, (param.offset - 4) + adjust)
@@ -739,6 +746,9 @@ generic_%u_byte( GLint rop, const void * ptr )
 
 			p_string = ""
 			for param in f.parameterIterateGlxSend():
+				if param.is_padding:
+					continue
+
 				p_string += ", " + param.name
 
 				if param.is_image():
diff --git a/src/mesa/glapi/glX_proto_size.py b/src/mesa/glapi/glX_proto_size.py
index 2b9a6433622..95cb5110cc3 100644
--- a/src/mesa/glapi/glX_proto_size.py
+++ b/src/mesa/glapi/glX_proto_size.py
@@ -581,6 +581,11 @@ class PrintGlxReqSize_c(PrintGlxReqSize_common):
 
 		self.common_emit_fixups(fixup)
 
+		if img.img_null_flag:
+			print ''
+			print '	   if (*(CARD32 *) (pc + %s))' % (img.offset - 4)
+			print '	       return 0;'
+
 		print ''
 		print '    return __glXImageSize(%s, %s, %s, %s, %s, %s,' % (img.img_format, img.img_type, img.img_target, w, h, d )
 		print '                          image_height, row_length, skip_images,'
diff --git a/src/mesa/glapi/gl_API.dtd b/src/mesa/glapi/gl_API.dtd
index f89d3818663..30c646c9244 100644
--- a/src/mesa/glapi/gl_API.dtd
+++ b/src/mesa/glapi/gl_API.dtd
@@ -45,6 +45,7 @@
                    counter             (true | false) "false"
                    count_scale         NMTOKEN "1"
                    output              (true | false) "false"
+                   padding             (true | false) "false"
                    img_width           NMTOKEN #IMPLIED
                    img_height          NMTOKEN #IMPLIED
                    img_depth           NMTOKEN #IMPLIED
diff --git a/src/mesa/glapi/gl_API.xml b/src/mesa/glapi/gl_API.xml
index 6c0367aad79..951fd957994 100644
--- a/src/mesa/glapi/gl_API.xml
+++ b/src/mesa/glapi/gl_API.xml
@@ -3267,7 +3267,8 @@
         <param name="width" type="GLsizei"/>
         <param name="format" type="GLenum"/>
         <param name="type" type="GLenum"/>
-        <param name="pixels" type="const GLvoid *" img_width="width" img_xoff="xoffset" img_format="format" img_type="type" img_target="target" img_null_flag="true" img_pad_dimensions="true"/>
+        <param name="UNUSED" type="GLuint" padding="true"/>
+        <param name="pixels" type="const GLvoid *" img_width="width" img_xoff="xoffset" img_format="format" img_type="type" img_target="target" img_pad_dimensions="true"/>
         <glx rop="4099" large="true"/>
     </function>
 
@@ -3280,7 +3281,8 @@
         <param name="height" type="GLsizei"/>
         <param name="format" type="GLenum"/>
         <param name="type" type="GLenum"/>
-        <param name="pixels" type="const GLvoid *" img_width="width" img_height="height" img_xoff="xoffset" img_yoff="yoffset" img_format="format" img_type="type" img_target="target" img_null_flag="true" img_pad_dimensions="true"/>
+        <param name="UNUSED" type="GLuint" padding="true"/>
+        <param name="pixels" type="const GLvoid *" img_width="width" img_height="height" img_xoff="xoffset" img_yoff="yoffset" img_format="format" img_type="type" img_target="target" img_pad_dimensions="true"/>
         <glx rop="4100" large="true"/>
     </function>
 
@@ -3994,7 +3996,8 @@
         <param name="depth" type="GLsizei"/>
         <param name="format" type="GLenum"/>
         <param name="type" type="GLenum"/>
-        <param name="pixels" type="const GLvoid *" img_width="width" img_height="height" img_depth="depth" img_xoff="xoffset" img_yoff="yoffset" img_zoff="zoffset" img_format="format" img_type="type" img_target="target" img_null_flag="true" img_pad_dimensions="true"/>
+        <param name="UNUSED" type="GLuint" padding="true"/>
+        <param name="pixels" type="const GLvoid *" img_width="width" img_height="height" img_depth="depth" img_xoff="xoffset" img_yoff="yoffset" img_zoff="zoffset" img_format="format" img_type="type" img_target="target" img_pad_dimensions="true"/>
         <glx rop="4115" large="true"/>
     </function>
 
@@ -8061,6 +8064,7 @@
         <param name="depth" type="GLsizei"/>
         <param name="format" type="GLenum"/>
         <param name="type" type="GLenum"/>
+        <param name="UNUSED" type="GLuint" padding="true"/>
         <param name="pixels" type="const GLvoid *"/>
     </function>
 </category>
@@ -8092,6 +8096,7 @@
         <param name="width" type="GLsizei"/>
         <param name="format" type="GLenum"/>
         <param name="type" type="GLenum"/>
+        <param name="UNUSED" type="GLuint" padding="true"/>
         <param name="pixels" type="const GLvoid *"/>
     </function>
 
@@ -8104,6 +8109,7 @@
         <param name="height" type="GLsizei"/>
         <param name="format" type="GLenum"/>
         <param name="type" type="GLenum"/>
+        <param name="UNUSED" type="GLuint" padding="true"/>
         <param name="pixels" type="const GLvoid *"/>
     </function>
 </category>
@@ -8627,7 +8633,8 @@
         <param name="size4d" type="GLsizei"/>
         <param name="format" type="GLenum"/>
         <param name="type" type="GLenum"/>
-        <param name="pixels" type="const GLvoid *" img_width="width" img_height="height" img_depth="depth" img_extent="size4d" img_xoff="xoffset" img_yoff="yoffset" img_zoff="zoffset" img_woff="woffset" img_format="format" img_type="type" img_target="target" img_null_flag="true" img_pad_dimensions="true"/>
+        <param name="UNUSED" type="GLuint" padding="true"/>
+        <param name="pixels" type="const GLvoid *" img_width="width" img_height="height" img_depth="depth" img_extent="size4d" img_xoff="xoffset" img_yoff="yoffset" img_zoff="zoffset" img_woff="woffset" img_format="format" img_type="type" img_target="target" img_pad_dimensions="true"/>
         <glx rop="2058" ignore="true"/>
     </function>
 </category>
diff --git a/src/mesa/glapi/gl_XML.py b/src/mesa/glapi/gl_XML.py
index b7a7388400d..b98919134fb 100644
--- a/src/mesa/glapi/gl_XML.py
+++ b/src/mesa/glapi/gl_XML.py
@@ -309,6 +309,9 @@ def create_parameter_string(parameters, include_names):
 
 	list = []
 	for p in parameters:
+		if p.is_padding:
+			continue
+
 		if include_names:
 			list.append( p.string() )
 		else:
@@ -463,6 +466,7 @@ class gl_parameter:
 		self.img_null_flag      = is_attr_true( element, 'img_null_flag' )
 		self.img_send_null      = is_attr_true( element, 'img_send_null' )
 
+		self.is_padding = is_attr_true( element, 'padding' )
 		return
 
 
diff --git a/src/mesa/glapi/gl_apitemp.py b/src/mesa/glapi/gl_apitemp.py
index 6e35571e143..a37c08d6ce1 100644
--- a/src/mesa/glapi/gl_apitemp.py
+++ b/src/mesa/glapi/gl_apitemp.py
@@ -63,6 +63,9 @@ class PrintGlOffsets(gl_XML.gl_print_base):
 		n = f.static_name(name)
 
 		for p in f.parameterIterator():
+			if p.is_padding:
+				continue
+
 			if p.is_pointer():
 				cast = "(const void *) "
 			else:
diff --git a/src/mesa/glapi/gl_x86_asm.py b/src/mesa/glapi/gl_x86_asm.py
index 651cb03f14d..0dbf3ebe0ab 100644
--- a/src/mesa/glapi/gl_x86_asm.py
+++ b/src/mesa/glapi/gl_x86_asm.py
@@ -44,6 +44,9 @@ class PrintGenericStubs(gl_XML.gl_print_base):
 	def get_stack_size(self, f):
 		size = 0
 		for p in f.parameterIterator():
+			if p.is_padding:
+				continue
+
 			size += p.get_stack_size()
 
 		return size
diff --git a/src/mesa/main/texobj.c b/src/mesa/main/texobj.c
index 2a54ff7ff90..d8e8b559f53 100644
--- a/src/mesa/main/texobj.c
+++ b/src/mesa/main/texobj.c
@@ -383,6 +383,18 @@ _mesa_test_texobj_completeness( const GLcontext *ctx,
 
    t->_Complete = GL_TRUE;  /* be optimistic */
 
+   /* Reject an application-supplied base level outside the image array;
+    * assumes t->Image[0][] has MAX_TEXTURE_LEVELS slots (valid: 0..MAX-1).
+    */
+   if ((baseLevel < 0) || (baseLevel >= MAX_TEXTURE_LEVELS)) {
+      char s[100];
+      _mesa_sprintf(s, "obj %p (%d) base level = %d is invalid",
+              (void *) t, t->Name, baseLevel);
+      incomplete(t, s);
+      t->_Complete = GL_FALSE;
+      return;
+   }
+
    /* Always need the base level image */
    if (!t->Image[0][baseLevel]) {
       char s[100];
diff --git a/src/mesa/osmesa.pc.in b/src/mesa/osmesa.pc.in
new file mode 100644
index 00000000000..05327f40aa0
--- /dev/null
+++ b/src/mesa/osmesa.pc.in
@@ -0,0 +1,12 @@
+prefix=@INSTALL_DIR@
+exec_prefix=${prefix}
+libdir=@INSTALL_LIB_DIR@
+includedir=@INSTALL_INC_DIR@
+
+Name: osmesa
+Description: Mesa Off-screen Rendering library
+Requires: @OSMESA_PC_REQ@
+Version: @VERSION@
+Libs: -L${libdir} -l@OSMESA_LIB@
+Libs.private: @OSMESA_PC_LIB_PRIV@
+Cflags: -I${includedir}
diff --git a/src/mesa/tnl/t_vertex_generic.c b/src/mesa/tnl/t_vertex_generic.c
index f763522f91f..9812f8c8081 100644
--- a/src/mesa/tnl/t_vertex_generic.c
+++ b/src/mesa/tnl/t_vertex_generic.c
@@ -113,7 +113,7 @@ static INLINE void insert_3f_viewport_2( const struct tnl_clipspace_attr *a, GLu
    DEBUG_INSERT;
    out[0] = vp[0] * in[0] + vp[12];
    out[1] = vp[5] * in[1] + vp[13];
-   out[2] = vp[10] * in[2] + vp[14];
+   out[2] = vp[14];
 }
 
 static INLINE void insert_3f_viewport_1( const struct tnl_clipspace_attr *a, GLubyte *v,
diff --git a/src/mesa/tnl/t_vertex_sse.c b/src/mesa/tnl/t_vertex_sse.c
index 76043bd1b54..7a255d680a1 100644
--- a/src/mesa/tnl/t_vertex_sse.c
+++ b/src/mesa/tnl/t_vertex_sse.c
@@ -146,7 +146,8 @@ static void emit_load3f_1( struct x86_program *p,
 			   struct x86_reg dest,
 			   struct x86_reg arg0 )
 {
-   emit_load4f_1(p, dest, arg0);
+   /* Loading from memory erases the upper bits. */
+   sse_movss(&p->func, dest, arg0);
 }
 
 static void emit_load2f_2( struct x86_program *p, 
@@ -160,7 +161,8 @@ static void emit_load2f_1( struct x86_program *p,
 			   struct x86_reg dest,
 			   struct x86_reg arg0 )
 {
-   emit_load4f_1(p, dest, arg0);
+   /* Loading from memory erases the upper bits. */
+   sse_movss(&p->func, dest, arg0);
 }
 
 static void emit_load1f_1( struct x86_program *p, 
@@ -352,6 +354,7 @@ static GLboolean build_vertex_emit( struct x86_program *p )
    struct x86_reg temp = x86_make_reg(file_XMM, 0);
    struct x86_reg vp0 = x86_make_reg(file_XMM, 1);
    struct x86_reg vp1 = x86_make_reg(file_XMM, 2);
+   struct x86_reg temp2 = x86_make_reg(file_XMM, 3);
    GLubyte *fixup, *label;
 
    /* Push a few regs?
@@ -524,7 +527,8 @@ static GLboolean build_vertex_emit( struct x86_program *p )
 	    sse_shufps(&p->func, temp, temp, SHUF(W,X,Y,Z));
 
 	    get_src_ptr(p, srcECX, vtxESI, &a[1]);
-	    emit_load(p, temp, 1, x86_deref(srcECX), a[1].inputsize);
+	    emit_load(p, temp2, 1, x86_deref(srcECX), a[1].inputsize);
+	    sse_movss(&p->func, temp, temp2);
 	    update_src_ptr(p, srcECX, vtxESI, &a[1]);
 
 	    /* Rearrange and possibly do BGR conversion:
@@ -539,8 +543,8 @@ static GLboolean build_vertex_emit( struct x86_program *p )
 	 }
 	 else {
 	    _mesa_printf("Can't emit 3ub\n");
+	    return GL_FALSE;	/* add this later */
 	 }
-	 return GL_FALSE;	/* add this later */
 	 break;
 
       case EMIT_4UB_4F_RGBA: