Merge master and fix conflicts

author: Alex Deucher <[email protected]> 2009-06-12 12:09:34 -0400
committer: Alex Deucher <[email protected]> 2009-06-12 12:09:34 -0400
commit: 1036ef2bf468611d37b5df06fc4424f2002e3837 (patch)
tree: f0859a6d903c2570a0a00c918da88139f8f7d065 /src/mesa/drivers
parent: 917f8bc1a85e61311cef6478127b387df70fba14 (diff)
parent: 1cd0afffc9edbcac690f8ab436aecfced26b0aba (diff)
150 files changed, 6182 insertions, 4201 deletions
diff --git a/src/mesa/drivers/common/driverfuncs.c b/src/mesa/drivers/common/driverfuncs.c
index 276da41f4e4..f31a2a25bf3 100644
--- a/src/mesa/drivers/common/driverfuncs.c
+++ b/src/mesa/drivers/common/driverfuncs.c
@@ -231,12 +231,16 @@ _mesa_init_driver_functions(struct dd_function_table *driver)
    driver->BeginQuery = _mesa_begin_query;
    driver->EndQuery = _mesa_end_query;
    driver->WaitQuery = _mesa_wait_query;
+   driver->CheckQuery = _mesa_check_query;
 
    /* APPLE_vertex_array_object */
    driver->NewArrayObject = _mesa_new_array_object;
    driver->DeleteArrayObject = _mesa_delete_array_object;
    driver->BindArrayObject = NULL;
 
+   /* GL_ARB_copy_buffer */
+   driver->CopyBufferSubData = _mesa_copy_buffer_subdata;
+
    /* T&L stuff */
    driver->NeedValidate = GL_FALSE;
    driver->ValidateTnlModule = NULL;
diff --git a/src/mesa/drivers/dri/Makefile.template b/src/mesa/drivers/dri/Makefile.template
index 2fa36bab3f9..bd38e3be47c 100644
--- a/src/mesa/drivers/dri/Makefile.template
+++ b/src/mesa/drivers/dri/Makefile.template
@@ -72,10 +72,11 @@ $(TOP)/$(LIB_DIR)/$(LIBNAME): $(LIBNAME)
 
 
 depend: $(C_SOURCES) $(ASM_SOURCES) $(SYMLINKS)
-	rm -f depend
-	touch depend
-	$(MKDEP) $(MKDEP_OPTIONS) $(DRIVER_DEFINES) $(INCLUDES) $(C_SOURCES) \
-		$(ASM_SOURCES)
+	@ echo "running $(MKDEP)"
+	@ rm -f depend
+	@ touch depend
+	@ $(MKDEP) $(MKDEP_OPTIONS) $(DRIVER_DEFINES) $(INCLUDES) $(C_SOURCES) \
+		$(ASM_SOURCES) > /dev/null 2>/dev/null
 
 
 # Emacs tags
@@ -91,7 +92,7 @@ clean:
 
 install: $(LIBNAME)
 	$(INSTALL) -d $(DESTDIR)$(DRI_DRIVER_INSTALL_DIR)
-	$(INSTALL) -m 755 $(LIBNAME) $(DESTDIR)$(DRI_DRIVER_INSTALL_DIR)
+	$(MINSTALL) -m 755 $(LIBNAME) $(DESTDIR)$(DRI_DRIVER_INSTALL_DIR)
 
 
 -include depend
diff --git a/src/mesa/drivers/dri/common/dri_util.c b/src/mesa/drivers/dri/common/dri_util.c
index 38c2e7b00d1..ae0e61e515b 100644
--- a/src/mesa/drivers/dri/common/dri_util.c
+++ b/src/mesa/drivers/dri/common/dri_util.c
@@ -37,6 +37,9 @@
 typedef GLboolean ( * PFNGLXGETMSCRATEOMLPROC) (__DRIdrawable *drawable, int32_t *numerator, int32_t *denominator);
 #endif
 
+static void dri_get_drawable(__DRIdrawable *pdp);
+static void dri_put_drawable(__DRIdrawable *pdp);
+
 /**
  * This is just a token extension used to signal that the driver
  * supports setting a read drawable.
@@ -59,7 +62,7 @@ __driUtilMessage(const char *f, ...)
     va_list args;
 
     if (getenv("LIBGL_DEBUG")) {
-        fprintf(stderr, "libGL error: \n");
+        fprintf(stderr, "libGL: ");
         va_start(args, f);
         vfprintf(stderr, f, args);
         va_end(args);
@@ -127,7 +130,7 @@ static int driUnbindContext(__DRIcontext *pcp)
 	return GL_FALSE;
     }
 
-    pdp->refcount--;
+    dri_put_drawable(pdp);
 
     if (prp != pdp) {
         if (prp->refcount == 0) {
@@ -135,7 +138,7 @@ static int driUnbindContext(__DRIcontext *pcp)
 	    return GL_FALSE;
 	}
 
-	prp->refcount--;
+    	dri_put_drawable(prp);
     }
 
 
@@ -170,10 +173,10 @@ static int driBindContext(__DRIcontext *pcp,
 	pcp->driReadablePriv = prp;
 	if (pdp) {
 	    pdp->driContextPriv = pcp;
-	    pdp->refcount++;
+    	    dri_get_drawable(pdp);
 	}
 	if ( prp && pdp != prp ) {
-	    prp->refcount++;
+    	    dri_get_drawable(prp);
 	}
     }
 
@@ -311,12 +314,12 @@ static void driSwapBuffers(__DRIdrawable *dPriv)
     __DRIscreen *psp = dPriv->driScreenPriv;
     drm_clip_rect_t *rects;
     int i;
-    
-    if (!dPriv->numClipRects)
-        return;
 
     psp->DriverAPI.SwapBuffers(dPriv);
 
+    if (!dPriv->numClipRects)
+        return;
+
     rects = _mesa_malloc(sizeof(*rects) * dPriv->numClipRects);
 
     if (!rects)
@@ -430,7 +433,7 @@ driCreateNewDrawable(__DRIscreen *psp, const __DRIconfig *config,
 
     pdp->loaderPrivate = data;
     pdp->hHWDrawable = hwDrawable;
-    pdp->refcount = 0;
+    pdp->refcount = 1;
     pdp->pStamp = NULL;
     pdp->lastStamp = 0;
     pdp->index = 0;
@@ -483,12 +486,19 @@ dri2CreateNewDrawable(__DRIscreen *screen,
     return pdraw;
 }
 
-
-static void
-driDestroyDrawable(__DRIdrawable *pdp)
+static void dri_get_drawable(__DRIdrawable *pdp)
+{
+    pdp->refcount++;
+}
+	
+static void dri_put_drawable(__DRIdrawable *pdp)
 {
     __DRIscreenPrivate *psp;
 
+    pdp->refcount--;
+    if (pdp->refcount)
+	return;
+
     if (pdp) {
 	psp = pdp->driScreenPriv;
         (*psp->DriverAPI.DestroyBuffer)(pdp);
@@ -504,6 +514,12 @@ driDestroyDrawable(__DRIdrawable *pdp)
     }
 }
 
+static void
+driDestroyDrawable(__DRIdrawable *pdp)
+{
+    dri_put_drawable(pdp);
+}
+
 /*@}*/
 
 
diff --git a/src/mesa/drivers/dri/common/extension_helper.h b/src/mesa/drivers/dri/common/extension_helper.h
index 8dcaaee3079..f5e35e41618 100644
--- a/src/mesa/drivers/dri/common/extension_helper.h
+++ b/src/mesa/drivers/dri/common/extension_helper.h
@@ -1763,6 +1763,13 @@ static const char DeleteFencesNV_names[] =
     "";
 #endif
 
+#if defined(need_GL_SGIX_polynomial_ffd)
+static const char DeformationMap3dSGIX_names[] = 
+    "iddiiddiiddiip\0" /* Parameter signature */
+    "glDeformationMap3dSGIX\0"
+    "";
+#endif
+
 #if defined(need_GL_VERSION_2_0)
 static const char IsShader_names[] = 
     "i\0" /* Parameter signature */
@@ -4396,6 +4403,13 @@ static const char WindowPos3ivMESA_names[] =
     "";
 #endif
 
+#if defined(need_GL_ARB_copy_buffer)
+static const char CopyBufferSubData_names[] = 
+    "iiiii\0" /* Parameter signature */
+    "glCopyBufferSubData\0"
+    "";
+#endif
+
 #if defined(need_GL_VERSION_1_5) || defined(need_GL_ARB_vertex_buffer_object)
 static const char IsBufferARB_names[] = 
     "i\0" /* Parameter signature */
@@ -4560,13 +4574,6 @@ static const char Minmax_names[] =
     "";
 #endif
 
-#if defined(need_GL_SGIX_polynomial_ffd)
-static const char DeformationMap3dSGIX_names[] = 
-    "iddiiddiiddiip\0" /* Parameter signature */
-    "glDeformationMap3dSGIX\0"
-    "";
-#endif
-
 #if defined(need_GL_VERSION_1_4) || defined(need_GL_EXT_fog_coord)
 static const char FogCoorddvEXT_names[] = 
     "p\0" /* Parameter signature */
@@ -4939,6 +4946,13 @@ static const struct dri_extension_function GL_APPLE_vertex_array_object_function
 };
 #endif
 
+#if defined(need_GL_ARB_copy_buffer)
+static const struct dri_extension_function GL_ARB_copy_buffer_functions[] = {
+    { CopyBufferSubData_names, CopyBufferSubData_remap_index, -1 },
+    { NULL, 0, 0 }
+};
+#endif
+
 #if defined(need_GL_ARB_draw_buffers)
 static const struct dri_extension_function GL_ARB_draw_buffers_functions[] = {
     { DrawBuffersARB_names, DrawBuffersARB_remap_index, -1 },
@@ -6055,9 +6069,9 @@ static const struct dri_extension_function GL_SGIX_pixel_texture_functions[] = {
 #if defined(need_GL_SGIX_polynomial_ffd)
 static const struct dri_extension_function GL_SGIX_polynomial_ffd_functions[] = {
     { LoadIdentityDeformationMapSGIX_names, LoadIdentityDeformationMapSGIX_remap_index, -1 },
+    { DeformationMap3dSGIX_names, DeformationMap3dSGIX_remap_index, -1 },
     { DeformSGIX_names, DeformSGIX_remap_index, -1 },
     { DeformationMap3fSGIX_names, DeformationMap3fSGIX_remap_index, -1 },
-    { DeformationMap3dSGIX_names, DeformationMap3dSGIX_remap_index, -1 },
     { NULL, 0, 0 }
 };
 #endif
diff --git a/src/mesa/drivers/dri/common/spantmp2.h b/src/mesa/drivers/dri/common/spantmp2.h
index f2868cb58a6..89c815722f6 100644
--- a/src/mesa/drivers/dri/common/spantmp2.h
+++ b/src/mesa/drivers/dri/common/spantmp2.h
@@ -82,6 +82,71 @@
       rgba[3] = 0xff;							\
    } while (0)
 
+#elif (SPANTMP_PIXEL_FMT == GL_BGRA)  && (SPANTMP_PIXEL_TYPE == GL_UNSIGNED_SHORT_4_4_4_4_REV)
+
+/**
+ ** GL_BGRA, GL_UNSIGNED_SHORT_4_4_4_4_REV
+ **/
+
+#ifndef GET_VALUE
+#ifndef GET_PTR
+#define GET_PTR(_x, _y) (buf + (_x) * 2 + (_y) * pitch)
+#endif
+
+#define GET_VALUE(_x, _y) *(volatile GLushort *)(GET_PTR(_x, _y))
+#define PUT_VALUE(_x, _y, _v) *(volatile GLushort *)(GET_PTR(_x, _y)) = (_v)
+#endif /* GET_VALUE */
+
+#define INIT_MONO_PIXEL(p, color) \
+   p = PACK_COLOR_4444(color[3], color[0], color[1], color[2])
+
+#define WRITE_RGBA( _x, _y, r, g, b, a )				\
+   PUT_VALUE(_x, _y, PACK_COLOR_4444(a, r, g, b))			\
+
+#define WRITE_PIXEL( _x, _y, p ) PUT_VALUE(_x, _y, p)
+
+#define READ_RGBA( rgba, _x, _y )					\
+   do {									\
+      GLushort p = GET_VALUE(_x, _y);					\
+      rgba[0] = ((p >> 8) & 0xf) * 0x11;				\
+      rgba[1] = ((p >> 4) & 0xf) * 0x11;				\
+      rgba[2] = ((p >> 0) & 0xf) * 0x11;				\
+      rgba[3] = ((p >> 12) & 0xf) * 0x11;				\
+   } while (0)
+
+
+#elif (SPANTMP_PIXEL_FMT == GL_BGRA)  && (SPANTMP_PIXEL_TYPE == GL_UNSIGNED_SHORT_1_5_5_5_REV)
+
+/**
+ ** GL_BGRA, GL_UNSIGNED_SHORT_1_5_5_5_REV
+ **/
+
+#ifndef GET_VALUE
+#ifndef GET_PTR
+#define GET_PTR(_x, _y) (buf + (_x) * 2 + (_y) * pitch)
+#endif
+
+#define GET_VALUE(_x, _y) *(volatile GLushort *)(GET_PTR(_x, _y))
+#define PUT_VALUE(_x, _y, _v) *(volatile GLushort *)(GET_PTR(_x, _y)) = (_v)
+#endif /* GET_VALUE */
+
+#define INIT_MONO_PIXEL(p, color) \
+   p = PACK_COLOR_1555(color[3], color[0], color[1], color[2])
+
+#define WRITE_RGBA( _x, _y, r, g, b, a )				\
+   PUT_VALUE(_x, _y, PACK_COLOR_1555(a, r, g, b))			\
+
+#define WRITE_PIXEL( _x, _y, p ) PUT_VALUE(_x, _y, p)
+
+#define READ_RGBA( rgba, _x, _y )					\
+   do {									\
+      GLushort p = GET_VALUE(_x, _y);					\
+      rgba[0] = ((p >> 7) & 0xf8) * 255 / 0xf8;				\
+      rgba[1] = ((p >> 2) & 0xf8) * 255 / 0xf8;				\
+      rgba[2] = ((p << 3) & 0xf8) * 255 / 0xf8;				\
+      rgba[3] = ((p >> 15) & 0x1) * 0xff;				\
+   } while (0)
+
 #elif (SPANTMP_PIXEL_FMT == GL_BGRA) && (SPANTMP_PIXEL_TYPE == GL_UNSIGNED_INT_8_8_8_8_REV)
 
 /**
diff --git a/src/mesa/drivers/dri/i915/Makefile b/src/mesa/drivers/dri/i915/Makefile
index 954a7e2af1f..beaf9a4b129 100644
--- a/src/mesa/drivers/dri/i915/Makefile
+++ b/src/mesa/drivers/dri/i915/Makefile
@@ -11,7 +11,6 @@ DRIVER_SOURCES = \
 	i830_metaops.c \
 	i830_state.c \
 	i830_texblend.c \
-	i830_tex.c \
 	i830_texstate.c \
 	i830_vtbl.c \
 	intel_render.c \
@@ -20,6 +19,7 @@ DRIVER_SOURCES = \
 	intel_batchbuffer.c \
 	intel_clear.c \
 	intel_extensions.c \
+	intel_generatemipmap.c \
 	intel_mipmap_tree.c \
 	intel_tex_layout.c \
 	intel_tex_image.c \
@@ -36,7 +36,6 @@ DRIVER_SOURCES = \
 	intel_buffers.c \
 	intel_blit.c \
 	intel_swapbuffers.c \
-	i915_tex.c \
 	i915_tex_layout.c \
 	i915_texstate.c \
 	i915_context.c \
diff --git a/src/mesa/drivers/dri/i915/i830_context.c b/src/mesa/drivers/dri/i915/i830_context.c
index 9c540cb2bb7..840946f908d 100644
--- a/src/mesa/drivers/dri/i915/i830_context.c
+++ b/src/mesa/drivers/dri/i915/i830_context.c
@@ -47,7 +47,6 @@ i830InitDriverFunctions(struct dd_function_table *functions)
 {
    intelInitDriverFunctions(functions);
    i830InitStateFuncs(functions);
-   i830InitTextureFuncs(functions);
 }
 
 extern const struct tnl_pipeline_stage *intel_pipeline[];
@@ -99,6 +98,8 @@ i830CreateContext(const __GLcontextModes * mesaVis,
    ctx->Const.MaxTextureRectSize = (1 << 11);
    ctx->Const.MaxTextureUnits = I830_TEX_UNITS;
 
+   ctx->Const.MaxTextureMaxAnisotropy = 2.0;
+
    ctx->Const.MaxDrawBuffers = 1;
 
    _tnl_init_vertices(ctx, ctx->Const.MaxArrayLockSize + 12,
diff --git a/src/mesa/drivers/dri/i915/i830_reg.h b/src/mesa/drivers/dri/i915/i830_reg.h
index d210c2d08e4..db16871001d 100644
--- a/src/mesa/drivers/dri/i915/i830_reg.h
+++ b/src/mesa/drivers/dri/i915/i830_reg.h
@@ -48,19 +48,6 @@
 #define AA_LINE_ENABLE			((1<<1) | 1)
 #define AA_LINE_DISABLE			(1<<1)
 
-#define _3DSTATE_BUF_INFO_CMD	(CMD_3D | (0x1d<<24) | (0x8e<<16) | 1)
-/* Dword 1 */
-#define BUF_3D_ID_COLOR_BACK	(0x3<<24)
-#define BUF_3D_ID_DEPTH 	(0x7<<24)
-#define BUF_3D_USE_FENCE	(1<<23)
-#define BUF_3D_TILED_SURFACE	(1<<22)
-#define BUF_3D_TILE_WALK_X	0
-#define BUF_3D_TILE_WALK_Y	(1<<21)
-#define BUF_3D_PITCH(x)         (((x)/4)<<2)
-/* Dword 2 */
-#define BUF_3D_ADDR(x)		((x) & ~0x3)
-
-
 #define _3DSTATE_COLOR_FACTOR_CMD	(CMD_3D | (0x1d<<24) | (0x1<<16))
 
 #define _3DSTATE_COLOR_FACTOR_N_CMD(stage)	(CMD_3D | (0x1d<<24) | \
diff --git a/src/mesa/drivers/dri/i915/i830_state.c b/src/mesa/drivers/dri/i915/i830_state.c
index d9cad0c4bf8..8ef6c9144f1 100644
--- a/src/mesa/drivers/dri/i915/i830_state.c
+++ b/src/mesa/drivers/dri/i915/i830_state.c
@@ -39,6 +39,7 @@
 #include "intel_screen.h"
 #include "intel_batchbuffer.h"
 #include "intel_fbo.h"
+#include "intel_buffers.h"
 
 #include "i830_context.h"
 #include "i830_reg.h"
@@ -446,6 +447,24 @@ i830DepthMask(GLcontext * ctx, GLboolean flag)
       i830->state.Ctx[I830_CTXREG_ENABLES_2] |= DISABLE_DEPTH_WRITE;
 }
 
+/** Called from ctx->Driver.Viewport() */
+static void
+i830Viewport(GLcontext * ctx,
+              GLint x, GLint y, GLsizei width, GLsizei height)
+{
+   intelCalcViewport(ctx);
+
+   intel_viewport(ctx, x, y, width, height);
+}
+
+
+/** Called from ctx->Driver.DepthRange() */
+static void
+i830DepthRange(GLcontext * ctx, GLclampd nearval, GLclampd farval)
+{
+   intelCalcViewport(ctx);
+}
+
 /* =============================================================
  * Polygon stipple
  *
@@ -1064,6 +1083,8 @@ i830InitStateFuncs(struct dd_function_table *functions)
    functions->StencilFuncSeparate = i830StencilFuncSeparate;
    functions->StencilMaskSeparate = i830StencilMaskSeparate;
    functions->StencilOpSeparate = i830StencilOpSeparate;
+   functions->DepthRange = i830DepthRange;
+   functions->Viewport = i830Viewport;
 }
 
 void
diff --git a/src/mesa/drivers/dri/i915/i830_tex.c b/src/mesa/drivers/dri/i915/i830_tex.c
deleted file mode 100644
index 34ac42a78e0..00000000000
--- a/src/mesa/drivers/dri/i915/i830_tex.c
+++ /dev/null
@@ -1,100 +0,0 @@
-/**************************************************************************
- * 
- * Copyright 2003 Tungsten Graphics, Inc., Cedar Park, Texas.
- * All Rights Reserved.
- * 
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- * 
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- * 
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- * 
- **************************************************************************/
-
-#include "main/glheader.h"
-#include "main/mtypes.h"
-#include "main/imports.h"
-#include "main/simple_list.h"
-#include "main/enums.h"
-#include "main/image.h"
-#include "main/mm.h"
-#include "main/texstore.h"
-#include "main/texformat.h"
-#include "swrast/swrast.h"
-
-#include "texmem.h"
-
-#include "i830_context.h"
-#include "i830_reg.h"
-
-
-
-static void
-i830TexEnv(GLcontext * ctx, GLenum target,
-           GLenum pname, const GLfloat * param)
-{
-
-   switch (pname) {
-   case GL_TEXTURE_ENV_COLOR:
-   case GL_TEXTURE_ENV_MODE:
-   case GL_COMBINE_RGB:
-   case GL_COMBINE_ALPHA:
-   case GL_SOURCE0_RGB:
-   case GL_SOURCE1_RGB:
-   case GL_SOURCE2_RGB:
-   case GL_SOURCE0_ALPHA:
-   case GL_SOURCE1_ALPHA:
-   case GL_SOURCE2_ALPHA:
-   case GL_OPERAND0_RGB:
-   case GL_OPERAND1_RGB:
-   case GL_OPERAND2_RGB:
-   case GL_OPERAND0_ALPHA:
-   case GL_OPERAND1_ALPHA:
-   case GL_OPERAND2_ALPHA:
-   case GL_RGB_SCALE:
-   case GL_ALPHA_SCALE:
-      break;
-
-   case GL_TEXTURE_LOD_BIAS:{
-         struct i830_context *i830 = i830_context(ctx);
-         GLuint unit = ctx->Texture.CurrentUnit;
-         int b = (int) ((*param) * 16.0);
-         if (b > 63)
-            b = 63;
-         if (b < -64)
-            b = -64;
-         I830_STATECHANGE(i830, I830_UPLOAD_TEX(unit));
-         i830->lodbias_tm0s3[unit] =
-            ((b << TM0S3_LOD_BIAS_SHIFT) & TM0S3_LOD_BIAS_MASK);
-         break;
-      }
-
-   default:
-      break;
-   }
-}
-
-
-
-
-void
-i830InitTextureFuncs(struct dd_function_table *functions)
-{
-/*
-   functions->TexEnv = i830TexEnv;
-*/
-}
diff --git a/src/mesa/drivers/dri/i915/i830_texstate.c b/src/mesa/drivers/dri/i915/i830_texstate.c
index 753c25b57ed..6f998fa6f77 100644
--- a/src/mesa/drivers/dri/i915/i830_texstate.c
+++ b/src/mesa/drivers/dri/i915/i830_texstate.c
@@ -174,14 +174,16 @@ i830_update_tex_unit(struct intel_context *intel, GLuint unit, GLuint ss3)
    state[I830_TEXREG_TM0LI] = (_3DSTATE_LOAD_STATE_IMMEDIATE_2 |
                                (LOAD_TEXTURE_MAP0 << unit) | 4);
 
-/*    state[I830_TEXREG_TM0S0] = (TM0S0_USE_FENCE | */
-/* 			       t->intel.TextureOffset); */
-
-
    state[I830_TEXREG_TM0S1] =
       (((firstImage->Height - 1) << TM0S1_HEIGHT_SHIFT) |
        ((firstImage->Width - 1) << TM0S1_WIDTH_SHIFT) | format);
 
+   if (intelObj->mt->region->tiling != I915_TILING_NONE) {
+      state[I830_TEXREG_TM0S1] |= TM0S1_TILED_SURFACE;
+      if (intelObj->mt->region->tiling == I915_TILING_Y)
+	 state[I830_TEXREG_TM0S1] |= TM0S1_TILE_WALK;
+   }
+
    state[I830_TEXREG_TM0S2] =
       ((((pitch / 4) - 1) << TM0S2_PITCH_SHIFT) | TM0S2_CUBE_FACE_ENA_MASK);
 
diff --git a/src/mesa/drivers/dri/i915/i830_vtbl.c b/src/mesa/drivers/dri/i915/i830_vtbl.c
index 1a949210789..cbee9f9efe9 100644
--- a/src/mesa/drivers/dri/i915/i830_vtbl.c
+++ b/src/mesa/drivers/dri/i915/i830_vtbl.c
@@ -26,12 +26,14 @@
  **************************************************************************/
 
 #include "glapi/glapi.h"
+#include "main/texformat.h"
 
 #include "i830_context.h"
 #include "i830_reg.h"
 #include "intel_batchbuffer.h"
 #include "intel_regions.h"
 #include "intel_tris.h"
+#include "intel_fbo.h"
 #include "tnl/t_context.h"
 #include "tnl/t_vertex.h"
 
@@ -550,7 +552,7 @@ i830_emit_state(struct intel_context *intel)
          if (state->tex_buffer[i]) {
             OUT_RELOC(state->tex_buffer[i],
 		      I915_GEM_DOMAIN_SAMPLER, 0,
-                      state->tex_offset[i] | TM0S0_USE_FENCE);
+                      state->tex_offset[i]);
          }
 	 else if (state == &i830->meta) {
 	    assert(i == 0);
@@ -614,6 +616,8 @@ i830_state_draw_region(struct intel_context *intel,
 {
    struct i830_context *i830 = i830_context(&intel->ctx);
    GLcontext *ctx = &intel->ctx;
+   struct gl_renderbuffer *rb = ctx->DrawBuffer->_ColorDrawBuffers[0];
+   struct intel_renderbuffer *irb = intel_renderbuffer(rb);
    GLuint value;
 
    ASSERT(state == &i830->state || state == &i830->meta);
@@ -630,34 +634,38 @@ i830_state_draw_region(struct intel_context *intel,
    /*
     * Set stride/cpp values
     */
-   if (color_region) {
-      state->Buffer[I830_DESTREG_CBUFADDR0] = _3DSTATE_BUF_INFO_CMD;
-      state->Buffer[I830_DESTREG_CBUFADDR1] =
-         (BUF_3D_ID_COLOR_BACK |
-          BUF_3D_PITCH(color_region->pitch * color_region->cpp) |
-          BUF_3D_USE_FENCE);
-   }
+   i915_set_buf_info_for_region(&state->Buffer[I830_DESTREG_CBUFADDR0],
+				color_region, BUF_3D_ID_COLOR_BACK);
 
-   if (depth_region) {
-      state->Buffer[I830_DESTREG_DBUFADDR0] = _3DSTATE_BUF_INFO_CMD;
-      state->Buffer[I830_DESTREG_DBUFADDR1] =
-         (BUF_3D_ID_DEPTH |
-          BUF_3D_PITCH(depth_region->pitch * depth_region->cpp) |
-          BUF_3D_USE_FENCE);
-   }
+   i915_set_buf_info_for_region(&state->Buffer[I830_DESTREG_DBUFADDR0],
+				depth_region, BUF_3D_ID_DEPTH);
 
    /*
     * Compute/set I830_DESTREG_DV1 value
     */
    value = (DSTORG_HORT_BIAS(0x8) |     /* .5 */
             DSTORG_VERT_BIAS(0x8) | DEPTH_IS_Z);    /* .5 */
-            
-   if (color_region && color_region->cpp == 4) {
-      value |= DV_PF_8888;
-   }
-   else {
-      value |= DV_PF_565;
+
+   if (irb != NULL) {
+      switch (irb->texformat->MesaFormat) {
+      case MESA_FORMAT_ARGB8888:
+	 value |= DV_PF_8888;
+	 break;
+      case MESA_FORMAT_RGB565:
+	 value |= DV_PF_565;
+	 break;
+      case MESA_FORMAT_ARGB1555:
+	 value |= DV_PF_1555;
+	 break;
+      case MESA_FORMAT_ARGB4444:
+	 value |= DV_PF_4444;
+	 break;
+      default:
+	 _mesa_problem(ctx, "Bad renderbuffer format: %d\n",
+		       irb->texformat->MesaFormat);
+      }
    }
+
    if (depth_region && depth_region->cpp == 4) {
       value |= DEPTH_FRMT_24_FIXED_8_OTHER;
    }
@@ -700,26 +708,6 @@ i830_set_draw_region(struct intel_context *intel,
    i830_state_draw_region(intel, &i830->state, color_regions[0], depth_region);
 }
 
-#if 0
-static void
-i830_update_color_z_regions(intelContextPtr intel,
-                            const intelRegion * colorRegion,
-                            const intelRegion * depthRegion)
-{
-   i830ContextPtr i830 = I830_CONTEXT(intel);
-
-   i830->state.Buffer[I830_DESTREG_CBUFADDR1] =
-      (BUF_3D_ID_COLOR_BACK | BUF_3D_PITCH(colorRegion->pitch) |
-       BUF_3D_USE_FENCE);
-   i830->state.Buffer[I830_DESTREG_CBUFADDR2] = colorRegion->offset;
-
-   i830->state.Buffer[I830_DESTREG_DBUFADDR1] =
-      (BUF_3D_ID_DEPTH | BUF_3D_PITCH(depthRegion->pitch) | BUF_3D_USE_FENCE);
-   i830->state.Buffer[I830_DESTREG_DBUFADDR2] = depthRegion->offset;
-}
-#endif
-
-
 /* This isn't really handled at the moment.
  */
 static void
diff --git a/src/mesa/drivers/dri/i915/i915_context.c b/src/mesa/drivers/dri/i915/i915_context.c
index 7549029a1be..367d2a3b648 100644
--- a/src/mesa/drivers/dri/i915/i915_context.c
+++ b/src/mesa/drivers/dri/i915/i915_context.c
@@ -73,7 +73,7 @@ i915InvalidateState(GLcontext * ctx, GLuint new_state)
          p->params_uptodate = 0;
    }
 
-   if (new_state & (_NEW_FOG | _NEW_HINT | _NEW_PROGRAM))
+   if (new_state & (_NEW_FOG | _NEW_HINT | _NEW_PROGRAM | _NEW_PROGRAM_CONSTANTS))
       i915_update_fog(ctx);
 }
 
@@ -83,7 +83,6 @@ i915InitDriverFunctions(struct dd_function_table *functions)
 {
    intelInitDriverFunctions(functions);
    i915InitStateFunctions(functions);
-   i915InitTextureFuncs(functions);
    i915InitFragProgFuncs(functions);
    functions->UpdateState = i915InvalidateState;
 }
@@ -146,6 +145,8 @@ i915CreateContext(const __GLcontextModes * mesaVis,
    ctx->Const.MaxTextureRectSize = (1 << 11);
    ctx->Const.MaxTextureUnits = I915_TEX_UNITS;
 
+   ctx->Const.MaxTextureMaxAnisotropy = 4.0;
+
    /* GL_ARB_fragment_program limits - don't think Mesa actually
     * validates programs against these, and in any case one ARB
     * instruction can translate to more than one HW instruction, so
diff --git a/src/mesa/drivers/dri/i915/i915_fragprog.c b/src/mesa/drivers/dri/i915/i915_fragprog.c
index 52f09a4b1b2..2db10c60e99 100644
--- a/src/mesa/drivers/dri/i915/i915_fragprog.c
+++ b/src/mesa/drivers/dri/i915/i915_fragprog.c
@@ -162,12 +162,12 @@ src_vector(struct i915_fragment_program *p,
                  GET_SWZ(source->Swizzle, 1),
                  GET_SWZ(source->Swizzle, 2), GET_SWZ(source->Swizzle, 3));
 
-   if (source->NegateBase)
+   if (source->Negate)
       src = negate(src,
-                   GET_BIT(source->NegateBase, 0),
-                   GET_BIT(source->NegateBase, 1),
-                   GET_BIT(source->NegateBase, 2),
-                   GET_BIT(source->NegateBase, 3));
+                   GET_BIT(source->Negate, 0),
+                   GET_BIT(source->Negate, 1),
+                   GET_BIT(source->Negate, 2),
+                   GET_BIT(source->Negate, 3));
 
    return src;
 }
@@ -323,7 +323,8 @@ upload_program(struct i915_fragment_program *p)
       p->ctx->FragmentProgram._Current;
    const struct prog_instruction *inst = program->Base.Instructions;
 
-/*    _mesa_debug_fp_inst(program->Base.NumInstructions, inst); */
+   if (INTEL_DEBUG & DEBUG_WM)
+      _mesa_print_program(&program->Base);
 
    /* Is this a parse-failed program?  Ensure a valid program is
     * loaded, as the flagging of an error isn't sufficient to stop
@@ -1049,9 +1050,6 @@ i915ProgramStringNotify(GLcontext * ctx,
          _mesa_append_fog_code(ctx, &p->FragProg);
          p->FragProg.FogOption = GL_NONE;
       }
-
-      if (INTEL_DEBUG & DEBUG_STATE)
-	 _mesa_print_program(prog);
    }
 
    _tnl_program_string(ctx, target, prog);
diff --git a/src/mesa/drivers/dri/i915/i915_reg.h b/src/mesa/drivers/dri/i915/i915_reg.h
index 8891e11c6fd..84db58ea950 100644
--- a/src/mesa/drivers/dri/i915/i915_reg.h
+++ b/src/mesa/drivers/dri/i915/i915_reg.h
@@ -93,20 +93,6 @@
 
 /* 3DSTATE_BIN_CONTROL p141 */
 
-/* p143 */
-#define _3DSTATE_BUF_INFO_CMD	(CMD_3D | (0x1d<<24) | (0x8e<<16) | 1)
-/* Dword 1 */
-#define BUF_3D_ID_COLOR_BACK	(0x3<<24)
-#define BUF_3D_ID_DEPTH 	(0x7<<24)
-#define BUF_3D_USE_FENCE	(1<<23)
-#define BUF_3D_TILED_SURFACE	(1<<22)
-#define BUF_3D_TILE_WALK_X	0
-#define BUF_3D_TILE_WALK_Y	(1<<21)
-#define BUF_3D_PITCH(x)         (((x)/4)<<2)
-/* Dword 2 */
-#define BUF_3D_ADDR(x)		((x) & ~0x3)
-
-
 /* 3DSTATE_CHROMA_KEY */
 
 /* 3DSTATE_CLEAR_PARAMETERS, p150 */
@@ -155,6 +141,7 @@
 /* p161 */
 #define _3DSTATE_DST_BUF_VARS_CMD	(CMD_3D | (0x1d<<24) | (0x85<<16))
 /* Dword 1 */
+#define CLASSIC_EARLY_DEPTH		(1<<31)
 #define TEX_DEFAULT_COLOR_OGL           (0<<30)
 #define TEX_DEFAULT_COLOR_D3D           (1<<30)
 #define ZR_EARLY_DEPTH                  (1<<29)
diff --git a/src/mesa/drivers/dri/i915/i915_tex.c b/src/mesa/drivers/dri/i915/i915_tex.c
deleted file mode 100644
index e38d8fe79d1..00000000000
--- a/src/mesa/drivers/dri/i915/i915_tex.c
+++ /dev/null
@@ -1,78 +0,0 @@
-/**************************************************************************
- * 
- * Copyright 2003 Tungsten Graphics, Inc., Cedar Park, Texas.
- * All Rights Reserved.
- * 
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- * 
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- * 
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- * 
- **************************************************************************/
-
-#include "main/glheader.h"
-#include "main/mtypes.h"
-#include "main/imports.h"
-#include "main/simple_list.h"
-#include "main/enums.h"
-#include "main/image.h"
-#include "main/mm.h"
-#include "main/texstore.h"
-#include "main/texformat.h"
-#include "swrast/swrast.h"
-
-#include "texmem.h"
-
-#include "i915_context.h"
-#include "i915_reg.h"
-
-
-
-static void
-i915TexEnv(GLcontext * ctx, GLenum target,
-           GLenum pname, const GLfloat * param)
-{
-   struct i915_context *i915 = I915_CONTEXT(ctx);
-
-   switch (pname) {
-   case GL_TEXTURE_LOD_BIAS:{
-         GLuint unit = ctx->Texture.CurrentUnit;
-         GLint b = (int) ((*param) * 16.0);
-         if (b > 255)
-            b = 255;
-         if (b < -256)
-            b = -256;
-         I915_STATECHANGE(i915, I915_UPLOAD_TEX(unit));
-         i915->lodbias_ss2[unit] =
-            ((b << SS2_LOD_BIAS_SHIFT) & SS2_LOD_BIAS_MASK);
-         break;
-      }
-
-   default:
-      break;
-   }
-}
-
-
-void
-i915InitTextureFuncs(struct dd_function_table *functions)
-{
-/*
-   functions->TexEnv = i915TexEnv;
-*/
-}
diff --git a/src/mesa/drivers/dri/i915/i915_tex_layout.c b/src/mesa/drivers/dri/i915/i915_tex_layout.c
index d44a2f47b37..d9588e5b56d 100644
--- a/src/mesa/drivers/dri/i915/i915_tex_layout.c
+++ b/src/mesa/drivers/dri/i915/i915_tex_layout.c
@@ -55,6 +55,17 @@ static GLint step_offsets[6][2] = {
    [FACE_NEG_Z] = {-1, 1},
 };
 
+
+static GLint bottom_offsets[6] = {
+   [FACE_POS_X] = 16 + 0 * 8,
+   [FACE_POS_Y] = 16 + 1 * 8,
+   [FACE_POS_Z] = 16 + 2 * 8,
+   [FACE_NEG_X] = 16 + 3 * 8,
+   [FACE_NEG_Y] = 16 + 4 * 8,
+   [FACE_NEG_Z] = 16 + 5 * 8,
+};
+
+
 /**
  * Cube texture map layout for i830M-GM915.
  *
@@ -101,7 +112,8 @@ static GLint step_offsets[6][2] = {
  */
 static void
 i915_miptree_layout_cube(struct intel_context *intel,
-			 struct intel_mipmap_tree * mt)
+			 struct intel_mipmap_tree * mt,
+			 uint32_t tiling)
 {
    const GLuint dim = mt->width0;
    GLuint face;
@@ -111,7 +123,7 @@ i915_miptree_layout_cube(struct intel_context *intel,
    assert(lvlWidth == lvlHeight); /* cubemap images are square */
 
    /* double pitch for cube layouts */
-   mt->pitch = intel_miptree_pitch_align (intel, mt, dim * 2);
+   mt->pitch = intel_miptree_pitch_align (intel, mt, tiling, dim * 2);
    mt->total_height = dim * 4;
 
    for (level = mt->first_level; level <= mt->last_level; level++) {
@@ -145,7 +157,8 @@ i915_miptree_layout_cube(struct intel_context *intel,
 
 static void
 i915_miptree_layout_3d(struct intel_context *intel,
-		       struct intel_mipmap_tree * mt)
+		       struct intel_mipmap_tree * mt,
+		       uint32_t tiling)
 {
    GLuint width = mt->width0;
    GLuint height = mt->height0;
@@ -154,7 +167,7 @@ i915_miptree_layout_3d(struct intel_context *intel,
    GLint level;
 
    /* Calculate the size of a single slice. */
-   mt->pitch = intel_miptree_pitch_align (intel, mt, mt->width0);
+   mt->pitch = intel_miptree_pitch_align (intel, mt, tiling, mt->width0);
 
    /* XXX: hardware expects/requires 9 levels at minimum. */
    for (level = mt->first_level; level <= MAX2(8, mt->last_level); level++) {
@@ -189,14 +202,15 @@ i915_miptree_layout_3d(struct intel_context *intel,
 
 static void
 i915_miptree_layout_2d(struct intel_context *intel,
-		       struct intel_mipmap_tree * mt)
+		       struct intel_mipmap_tree * mt,
+		       uint32_t tiling)
 {
    GLuint width = mt->width0;
    GLuint height = mt->height0;
    GLuint img_height;
    GLint level;
 
-   mt->pitch = intel_miptree_pitch_align (intel, mt, mt->width0);
+   mt->pitch = intel_miptree_pitch_align (intel, mt, tiling, mt->width0);
    mt->total_height = 0;
 
    for (level = mt->first_level; level <= mt->last_level; level++) {
@@ -217,19 +231,20 @@ i915_miptree_layout_2d(struct intel_context *intel,
 }
 
 GLboolean
-i915_miptree_layout(struct intel_context *intel, struct intel_mipmap_tree * mt)
+i915_miptree_layout(struct intel_context *intel, struct intel_mipmap_tree * mt,
+		    uint32_t tiling)
 {
    switch (mt->target) {
    case GL_TEXTURE_CUBE_MAP:
-      i915_miptree_layout_cube(intel, mt);
+      i915_miptree_layout_cube(intel, mt, tiling);
       break;
    case GL_TEXTURE_3D:
-      i915_miptree_layout_3d(intel, mt);
+      i915_miptree_layout_3d(intel, mt, tiling);
       break;
    case GL_TEXTURE_1D:
    case GL_TEXTURE_2D:
    case GL_TEXTURE_RECTANGLE_ARB:
-      i915_miptree_layout_2d(intel, mt);
+      i915_miptree_layout_2d(intel, mt, tiling);
       break;
    default:
       _mesa_problem(NULL, "Unexpected tex target in i915_miptree_layout()");
@@ -297,7 +312,7 @@ i915_miptree_layout(struct intel_context *intel, struct intel_mipmap_tree * mt)
  * +---+   +---+   +---+   +---+   +---+   +---+
  *
  * The bottom row continues with the remaining 2x2 then the 1x1 mip contents
- * in order, with each of them aligned to a 4x4 block boundary.  Thus, for
+ * in order, with each of them aligned to a 8x8 block boundary.  Thus, for
  * 32x32 cube maps and smaller, the bottom row layout is going to dictate the
  * pitch of the tree.  For a tree with 4x4 images, the pitch is at least
  * 14 * 8 = 112 texels, for 2x2 it is at least 12 * 8 texels, and for 1x1
@@ -306,7 +321,8 @@ i915_miptree_layout(struct intel_context *intel, struct intel_mipmap_tree * mt)
 
 static void
 i945_miptree_layout_cube(struct intel_context *intel,
-			 struct intel_mipmap_tree * mt)
+			 struct intel_mipmap_tree * mt,
+			 uint32_t tiling)
 {
    const GLuint dim = mt->width0;
    GLuint face;
@@ -320,9 +336,9 @@ i945_miptree_layout_cube(struct intel_context *intel,
     * or the final row of 4x4, 2x2 and 1x1 faces below this.
     */
    if (dim > 32)
-      mt->pitch = intel_miptree_pitch_align (intel, mt, dim * 2);
+      mt->pitch = intel_miptree_pitch_align (intel, mt, tiling, dim * 2);
    else
-      mt->pitch = intel_miptree_pitch_align (intel, mt, 14 * 8);
+      mt->pitch = intel_miptree_pitch_align (intel, mt, tiling, 14 * 8);
 
    if (dim >= 4)
       mt->total_height = dim * 4 + 4;
@@ -375,10 +391,11 @@ i945_miptree_layout_cube(struct intel_context *intel,
 	       x = (face - 4) * 8;
 	       break;
 	    }
+	    break;
 
 	 case 2:
 	    y = mt->total_height - 4;
-	    x = 16 + face * 8;
+	    x = bottom_offsets[face];
 	    break;
 
 	 case 1:
@@ -396,7 +413,8 @@ i945_miptree_layout_cube(struct intel_context *intel,
 
 static void
 i945_miptree_layout_3d(struct intel_context *intel,
-		       struct intel_mipmap_tree * mt)
+		       struct intel_mipmap_tree * mt,
+		       uint32_t tiling)
 {
    GLuint width = mt->width0;
    GLuint height = mt->height0;
@@ -405,7 +423,7 @@ i945_miptree_layout_3d(struct intel_context *intel,
    GLuint pack_y_pitch;
    GLuint level;
 
-   mt->pitch = intel_miptree_pitch_align (intel, mt, mt->width0);
+   mt->pitch = intel_miptree_pitch_align (intel, mt, tiling, mt->width0);
    mt->total_height = 0;
 
    pack_y_pitch = MAX2(mt->height0, 2);
@@ -450,19 +468,23 @@ i945_miptree_layout_3d(struct intel_context *intel,
 }
 
 GLboolean
-i945_miptree_layout(struct intel_context *intel, struct intel_mipmap_tree * mt)
+i945_miptree_layout(struct intel_context *intel, struct intel_mipmap_tree * mt,
+		    uint32_t tiling)
 {
    switch (mt->target) {
    case GL_TEXTURE_CUBE_MAP:
-      i945_miptree_layout_cube(intel, mt);
+      if (mt->compressed)
+	 i945_miptree_layout_cube(intel, mt, tiling);
+      else
+	 i915_miptree_layout_cube(intel, mt, tiling);
       break;
    case GL_TEXTURE_3D:
-      i945_miptree_layout_3d(intel, mt);
+      i945_miptree_layout_3d(intel, mt, tiling);
       break;
    case GL_TEXTURE_1D:
    case GL_TEXTURE_2D:
    case GL_TEXTURE_RECTANGLE_ARB:
-      i945_miptree_layout_2d(intel, mt);
+      i945_miptree_layout_2d(intel, mt, tiling);
       break;
    default:
       _mesa_problem(NULL, "Unexpected tex target in i945_miptree_layout()");
diff --git a/src/mesa/drivers/dri/i915/i915_texstate.c b/src/mesa/drivers/dri/i915/i915_texstate.c
index 43f65392b56..32d4b30cf9a 100644
--- a/src/mesa/drivers/dri/i915/i915_texstate.c
+++ b/src/mesa/drivers/dri/i915/i915_texstate.c
@@ -132,7 +132,7 @@ i915_update_tex_unit(struct intel_context *intel, GLuint unit, GLuint ss3)
    struct intel_texture_object *intelObj = intel_texture_object(tObj);
    struct gl_texture_image *firstImage;
    GLuint *state = i915->state.Tex[unit], format, pitch;
-   GLint lodbias;
+   GLint lodbias, aniso = 0;
    GLubyte border[4];
 
    memset(state, 0, sizeof(state));
@@ -185,8 +185,13 @@ i915_update_tex_unit(struct intel_context *intel, GLuint unit, GLuint ss3)
 
    state[I915_TEXREG_MS3] =
       (((firstImage->Height - 1) << MS3_HEIGHT_SHIFT) |
-       ((firstImage->Width - 1) << MS3_WIDTH_SHIFT) | format |
-       MS3_USE_FENCE_REGS);
+       ((firstImage->Width - 1) << MS3_WIDTH_SHIFT) | format);
+
+   if (intelObj->mt->region->tiling != I915_TILING_NONE) {
+      state[I915_TEXREG_MS3] |= MS3_TILED_SURFACE;
+      if (intelObj->mt->region->tiling == I915_TILING_Y)
+	 state[I915_TEXREG_MS3] |= MS3_TILE_WALK;
+   }
 
    state[I915_TEXREG_MS4] =
      ((((pitch / 4) - 1) << MS4_PITCH_SHIFT) | MS4_CUBE_FACE_ENA_MASK |
@@ -230,6 +235,10 @@ i915_update_tex_unit(struct intel_context *intel, GLuint unit, GLuint ss3)
       if (tObj->MaxAnisotropy > 1.0) {
          minFilt = FILTER_ANISOTROPIC;
          magFilt = FILTER_ANISOTROPIC;
+         if (tObj->MaxAnisotropy > 2.0)
+            aniso = SS2_MAX_ANISO_4;
+         else
+            aniso = SS2_MAX_ANISO_2;
       }
       else {
          switch (tObj->MagFilter) {
@@ -275,7 +284,8 @@ i915_update_tex_unit(struct intel_context *intel, GLuint unit, GLuint ss3)
 
       state[I915_TEXREG_SS2] |= ((minFilt << SS2_MIN_FILTER_SHIFT) |
                                  (mipFilt << SS2_MIP_FILTER_SHIFT) |
-                                 (magFilt << SS2_MAG_FILTER_SHIFT));
+                                 (magFilt << SS2_MAG_FILTER_SHIFT) |
+                                 aniso);
    }
 
    {
diff --git a/src/mesa/drivers/dri/i915/i915_vtbl.c b/src/mesa/drivers/dri/i915/i915_vtbl.c
index 3f6d282d342..2fca247af19 100644
--- a/src/mesa/drivers/dri/i915/i915_vtbl.c
+++ b/src/mesa/drivers/dri/i915/i915_vtbl.c
@@ -32,6 +32,7 @@
 #include "main/imports.h"
 #include "main/macros.h"
 #include "main/colormac.h"
+#include "main/texformat.h"
 
 #include "tnl/t_context.h"
 #include "tnl/t_vertex.h"
@@ -40,6 +41,8 @@
 #include "intel_tex.h"
 #include "intel_regions.h"
 #include "intel_tris.h"
+#include "intel_fbo.h"
+#include "intel_chipset.h"
 
 #include "i915_reg.h"
 #include "i915_context.h"
@@ -527,6 +530,23 @@ i915_destroy_context(struct intel_context *intel)
    _tnl_free_vertices(&intel->ctx);
 }
 
+void
+i915_set_buf_info_for_region(uint32_t *state, struct intel_region *region,
+			     uint32_t buffer_id)
+{
+   state[0] = _3DSTATE_BUF_INFO_CMD;
+   state[1] = buffer_id;
+
+   if (region != NULL) {
+      state[1] |= BUF_3D_PITCH(region->pitch * region->cpp);
+
+      if (region->tiling != I915_TILING_NONE) {
+	 state[1] |= BUF_3D_TILED_SURFACE;
+	 if (region->tiling == I915_TILING_Y)
+	    state[1] |= BUF_3D_TILE_WALK_Y;
+      }
+   }
+}
 
 /**
  * Set the drawing regions for the color and depth/stencil buffers.
@@ -542,6 +562,8 @@ i915_state_draw_region(struct intel_context *intel,
 {
    struct i915_context *i915 = i915_context(&intel->ctx);
    GLcontext *ctx = &intel->ctx;
+   struct gl_renderbuffer *rb = ctx->DrawBuffer->_ColorDrawBuffers[0];
+   struct intel_renderbuffer *irb = intel_renderbuffer(rb);
    GLuint value;
 
    ASSERT(state == &i915->state || state == &i915->meta);
@@ -558,21 +580,11 @@ i915_state_draw_region(struct intel_context *intel,
    /*
     * Set stride/cpp values
     */
-   if (color_region) {
-      state->Buffer[I915_DESTREG_CBUFADDR0] = _3DSTATE_BUF_INFO_CMD;
-      state->Buffer[I915_DESTREG_CBUFADDR1] =
-         (BUF_3D_ID_COLOR_BACK |
-          BUF_3D_PITCH(color_region->pitch * color_region->cpp) |
-          BUF_3D_USE_FENCE);
-   }
+   i915_set_buf_info_for_region(&state->Buffer[I915_DESTREG_CBUFADDR0],
+				color_region, BUF_3D_ID_COLOR_BACK);
 
-   if (depth_region) {
-      state->Buffer[I915_DESTREG_DBUFADDR0] = _3DSTATE_BUF_INFO_CMD;
-      state->Buffer[I915_DESTREG_DBUFADDR1] =
-         (BUF_3D_ID_DEPTH |
-          BUF_3D_PITCH(depth_region->pitch * depth_region->cpp) |
-          BUF_3D_USE_FENCE);
-   }
+   i915_set_buf_info_for_region(&state->Buffer[I915_DESTREG_DBUFADDR0],
+				depth_region, BUF_3D_ID_DEPTH);
 
    /*
     * Compute/set I915_DESTREG_DV1 value
@@ -580,12 +592,34 @@ i915_state_draw_region(struct intel_context *intel,
    value = (DSTORG_HORT_BIAS(0x8) |     /* .5 */
             DSTORG_VERT_BIAS(0x8) |     /* .5 */
             LOD_PRECLAMP_OGL | TEX_DEFAULT_COLOR_OGL);
-   if (color_region && color_region->cpp == 4) {
-      value |= DV_PF_8888;
-   }
-   else {
-      value |= (DITHER_FULL_ALWAYS | DV_PF_565);
+   if (irb != NULL) {
+      switch (irb->texformat->MesaFormat) {
+      case MESA_FORMAT_ARGB8888:
+	 value |= DV_PF_8888;
+	 break;
+      case MESA_FORMAT_RGB565:
+	 value |= DV_PF_565 | DITHER_FULL_ALWAYS;
+	 break;
+      case MESA_FORMAT_ARGB1555:
+	 value |= DV_PF_1555 | DITHER_FULL_ALWAYS;
+	 break;
+      case MESA_FORMAT_ARGB4444:
+	 value |= DV_PF_4444 | DITHER_FULL_ALWAYS;
+	 break;
+      default:
+	 _mesa_problem(ctx, "Bad renderbuffer format: %d\n",
+		       irb->texformat->MesaFormat);
+      }
    }
+
+   /* This isn't quite safe, thus being hidden behind an option.  When changing
+    * the value of this bit, the pipeline needs to be MI_FLUSHed.  And it
+    * can only be set when a depth buffer is already defined.
+    */
+   if (IS_945(intel->intelScreen->deviceID) && intel->use_early_z &&
+       depth_region->tiling != I915_TILING_NONE)
+      value |= CLASSIC_EARLY_DEPTH;
+
    if (depth_region && depth_region->cpp == 4) {
       value |= DEPTH_FRMT_24_FIXED_8_OTHER;
    }
diff --git a/src/mesa/drivers/dri/i915/intel_generatemipmap.c b/src/mesa/drivers/dri/i915/intel_generatemipmap.c
new file mode 120000
index 00000000000..4c6b37ada01
--- /dev/null
+++ b/src/mesa/drivers/dri/i915/intel_generatemipmap.c
@@ -0,0 +1 @@
+../intel/intel_generatemipmap.c
+\ No newline at end of file
diff --git a/src/mesa/drivers/dri/i965/Makefile b/src/mesa/drivers/dri/i965/Makefile
index 2934414d99a..9712c387254 100644
--- a/src/mesa/drivers/dri/i965/Makefile
+++ b/src/mesa/drivers/dri/i965/Makefile
@@ -14,6 +14,7 @@ DRIVER_SOURCES = \
 	intel_decode.c \
 	intel_extensions.c \
 	intel_fbo.c \
+	intel_generatemipmap.c \
 	intel_mipmap_tree.c \
 	intel_regions.c \
 	intel_screen.c \
@@ -69,6 +70,7 @@ DRIVER_SOURCES = \
 	brw_vs_constval.c \
 	brw_vs_emit.c \
 	brw_vs_state.c \
+	brw_vs_surface_state.c \
 	brw_vtbl.c \
 	brw_wm.c \
 	brw_wm_debug.c \
diff --git a/src/mesa/drivers/dri/i965/brw_context.c b/src/mesa/drivers/dri/i965/brw_context.c
index d96ff293102..4dbe551d832 100644
--- a/src/mesa/drivers/dri/i965/brw_context.c
+++ b/src/mesa/drivers/dri/i965/brw_context.c
@@ -118,6 +118,8 @@ GLboolean brwCreateContext( const __GLcontextModes *mesaVis,
    ctx->Const.MaxCubeTextureLevels = 12;
    ctx->Const.MaxTextureRectSize = (1<<12);
    
+   ctx->Const.MaxTextureMaxAnisotropy = 16.0;
+
    /* if conformance mode is set, swrast can handle any size AA point */
    ctx->Const.MaxPointSizeAA = 255.0;
 
diff --git a/src/mesa/drivers/dri/i965/brw_context.h b/src/mesa/drivers/dri/i965/brw_context.h
index 01e07c967fa..873fc8ffff6 100644
--- a/src/mesa/drivers/dri/i965/brw_context.h
+++ b/src/mesa/drivers/dri/i965/brw_context.h
@@ -131,6 +131,7 @@ struct brw_context;
 #define BRW_NEW_WM_INPUT_DIMENSIONS     0x100
 #define BRW_NEW_INPUT_VARYING           0x200
 #define BRW_NEW_PSP                     0x800
+#define BRW_NEW_WM_SURFACES		0x1000
 #define BRW_NEW_FENCE                   0x2000
 #define BRW_NEW_INDICES			0x4000
 #define BRW_NEW_VERTICES		0x8000
@@ -141,7 +142,8 @@ struct brw_context;
 #define BRW_NEW_BATCH			0x10000
 /** brw->depth_region updated */
 #define BRW_NEW_DEPTH_BUFFER		0x20000
-#define BRW_NEW_NR_SURFACES		0x40000
+#define BRW_NEW_NR_WM_SURFACES		0x40000
+#define BRW_NEW_NR_VS_SURFACES		0x80000
 
 struct brw_state_flags {
    /** State update flags signalled by mesa internals */
@@ -159,6 +161,8 @@ struct brw_state_flags {
 struct brw_vertex_program {
    struct gl_vertex_program program;
    GLuint id;
+   dri_bo *const_buffer;    /** Program constant buffer/surface */
+   GLboolean use_const_buffer;
 };
 
 
@@ -168,8 +172,8 @@ struct brw_fragment_program {
    GLuint id;  /**< serial no. to identify frag progs, never re-used */
    GLboolean isGLSL;  /**< really, any IF/LOOP/CONT/BREAK instructions */
 
-   /** Program constant buffer/surface */
-   dri_bo *const_buffer;
+   dri_bo *const_buffer;    /** Program constant buffer/surface */
+   GLboolean use_const_buffer;
 };
 
 
@@ -186,7 +190,7 @@ struct brw_wm_prog_data {
    GLuint total_grf;
    GLuint total_scratch;
 
-   GLuint nr_params;
+   GLuint nr_params;       /**< number of float params/constants */
    GLboolean error;
 
    /* Pointer to tracked values (only valid once
@@ -225,6 +229,7 @@ struct brw_vs_prog_data {
    GLuint urb_read_length;
    GLuint total_grf;
    GLuint outputs_written;
+   GLuint nr_params;       /**< number of float params/constants */
 
    GLuint inputs_read;
 
@@ -241,16 +246,38 @@ struct brw_vs_ouput_sizes {
 };
 
 
+/** Number of general purpose registers (VS, WM, etc) */
+#define BRW_MAX_GRF 128
+
 /** Number of texture sampler units */
 #define BRW_MAX_TEX_UNIT 16
 
 /**
- * Size of our surface binding table.
+ * Size of our surface binding table for the WM.
  * This contains pointers to the drawing surfaces and current texture
- * objects and shader constant buffer (+1).
+ * objects and shader constant buffers (+2).
  */
 #define BRW_WM_MAX_SURF (MAX_DRAW_BUFFERS + BRW_MAX_TEX_UNIT + 1)
 
+/**
+ * Helpers to convert drawing buffers, textures and constant buffers
+ * to surface binding table indexes, for WM.
+ */
+#define SURF_INDEX_DRAW(d)           (d)
+#define SURF_INDEX_FRAG_CONST_BUFFER (MAX_DRAW_BUFFERS) 
+#define SURF_INDEX_TEXTURE(t)        (MAX_DRAW_BUFFERS + 1 + (t))
+
+/**
+ * Size of surface binding table for the VS.
+ * Only one constant buffer for now.
+ */
+#define BRW_VS_MAX_SURF 1
+
+/**
+ * Only a VS constant buffer
+ */
+#define SURF_INDEX_VERT_CONST_BUFFER 0
+
 
 enum brw_cache_id {
    BRW_CC_VP,
@@ -427,8 +454,6 @@ struct brw_context
 
    struct {
       struct brw_state_flags dirty;
-      struct brw_tracked_state **atoms;
-      GLuint nr_atoms;
 
       GLuint nr_color_regions;
       struct intel_region *color_regions[MAX_DRAW_BUFFERS];
@@ -448,7 +473,8 @@ struct brw_context
       int validated_bo_count;
    } state;
 
-   struct brw_cache cache;
+   struct brw_cache cache;  /** non-surface items */
+   struct brw_cache surface_cache;  /* surface items */
    struct brw_cached_batch_item *cached_batch_items;
 
    struct {
@@ -532,11 +558,6 @@ struct brw_context
       GLuint vs_size;
       GLuint total_size;
 
-      /* Dynamic tracker which changes to reflect the state referenced
-       * by active fp and vp program parameters:
-       */
-      struct brw_tracked_state tracked_state;
-
       dri_bo *curbe_bo;
       /** Offset within curbe_bo of space for current curbe entry */
       GLuint curbe_offset;
@@ -557,6 +578,11 @@ struct brw_context
 
       dri_bo *prog_bo;
       dri_bo *state_bo;
+
+      /** Binding table of pointers to surf_bo entries */
+      dri_bo *bind_bo;
+      dri_bo *surf_bo[BRW_VS_MAX_SURF];
+      GLuint nr_surfaces;      
    } vs;
 
    struct {
diff --git a/src/mesa/drivers/dri/i965/brw_curbe.c b/src/mesa/drivers/dri/i965/brw_curbe.c
index a6bfb7507e7..a1a6c53d0e0 100644
--- a/src/mesa/drivers/dri/i965/brw_curbe.c
+++ b/src/mesa/drivers/dri/i965/brw_curbe.c
@@ -36,6 +36,7 @@
 #include "main/macros.h"
 #include "main/enums.h"
 #include "shader/prog_parameter.h"
+#include "shader/prog_print.h"
 #include "shader/prog_statevars.h"
 #include "intel_batchbuffer.h"
 #include "intel_regions.h"
@@ -45,17 +46,21 @@
 #include "brw_util.h"
 
 
-/* Partition the CURBE between the various users of constant values:
+/**
+ * Partition the CURBE between the various users of constant values:
+ * Note that vertex and fragment shaders can now fetch constants out
+ * of constant buffers.  We no longer allocatea block of the GRF for
+ * constants.  That greatly reduces the demand for space in the CURBE.
+ * Some of the comments within are dated...
  */
 static void calculate_curbe_offsets( struct brw_context *brw )
 {
    GLcontext *ctx = &brw->intel.ctx;
    /* CACHE_NEW_WM_PROG */
-   GLuint nr_fp_regs = (brw->wm.prog_data->nr_params + 15) / 16;
+   const GLuint nr_fp_regs = (brw->wm.prog_data->nr_params + 15) / 16;
    
    /* BRW_NEW_VERTEX_PROGRAM */
-   const struct brw_vertex_program *vp = brw_vertex_program_const(brw->vertex_program);
-   GLuint nr_vp_regs = (vp->program.Base.Parameters->NumParameters * 4 + 15) / 16;
+   const GLuint nr_vp_regs = (brw->vs.prog_data->nr_params + 15) / 16;
    GLuint nr_clip_regs = 0;
    GLuint total_regs;
 
@@ -184,13 +189,6 @@ static void prepare_constant_buffer(struct brw_context *brw)
    GLfloat *buf;
    GLuint i;
 
-   /* Update our own dependency flags.  This works because this
-    * function will also be called whenever fp or vp changes.
-    */
-   brw->curbe.tracked_state.dirty.mesa = (_NEW_TRANSFORM|_NEW_PROJECTION);
-   brw->curbe.tracked_state.dirty.mesa |= vp->program.Base.Parameters->StateFlags;
-   brw->curbe.tracked_state.dirty.mesa |= fp->program.Base.Parameters->StateFlags;
-
    if (sz == 0) {
       if (brw->curbe.last_buf) {
 	 free(brw->curbe.last_buf);
@@ -248,7 +246,7 @@ static void prepare_constant_buffer(struct brw_context *brw)
    /* vertex shader constants */
    if (brw->curbe.vs_size) {
       GLuint offset = brw->curbe.vs_start * 16;
-      GLuint nr = vp->program.Base.Parameters->NumParameters;
+      GLuint nr = brw->vs.prog_data->nr_params / 4;
 
       _mesa_load_state_parameters(ctx, vp->program.Base.Parameters); 
 
@@ -331,40 +329,11 @@ static void prepare_constant_buffer(struct brw_context *brw)
     */
 }
 
-
-/**
- * Vertex/fragment shader constants are stored in a pseudo 1D texture.
- * This function updates the constants in that buffer.
- */
-static void
-update_texture_constant_buffer(struct brw_context *brw)
-{
-   struct brw_fragment_program *fp =
-      (struct brw_fragment_program *) brw->fragment_program;
-   const struct gl_program_parameter_list *params = fp->program.Base.Parameters;
-   const int size = params->NumParameters * 4 * sizeof(GLfloat);
-
-   assert(fp->const_buffer);
-   assert(fp->const_buffer->size >= size);
-
-   /* copy constants into the buffer */
-   if (size > 0) {
-      GLubyte *map;
-      dri_bo_map(fp->const_buffer, GL_TRUE);
-      map = fp->const_buffer->virtual;
-      memcpy(map, params->ParameterValues, size);
-      dri_bo_unmap(fp->const_buffer);
-   }
-}
-
-
 static void emit_constant_buffer(struct brw_context *brw)
 {
    struct intel_context *intel = &brw->intel;
    GLuint sz = brw->curbe.total_size;
 
-   update_texture_constant_buffer(brw);
-
    BEGIN_BATCH(2, IGNORE_CLIPRECTS);
    if (sz == 0) {
       OUT_BATCH((CMD_CONST_BUFFER << 16) | (2 - 2));
@@ -386,7 +355,7 @@ static void emit_constant_buffer(struct brw_context *brw)
  */
 const struct brw_tracked_state brw_constant_buffer = {
    .dirty = {
-      .mesa = (_NEW_TRANSFORM|_NEW_PROJECTION),      /* plus fp and vp flags */
+      .mesa = _NEW_PROGRAM_CONSTANTS,
       .brw  = (BRW_NEW_FRAGMENT_PROGRAM |
 	       BRW_NEW_VERTEX_PROGRAM |
 	       BRW_NEW_URB_FENCE | /* Implicit - hardware requires this, not used above */
diff --git a/src/mesa/drivers/dri/i965/brw_draw_upload.c b/src/mesa/drivers/dri/i965/brw_draw_upload.c
index b91b20bec6f..1b8bcc14ec0 100644
--- a/src/mesa/drivers/dri/i965/brw_draw_upload.c
+++ b/src/mesa/drivers/dri/i965/brw_draw_upload.c
@@ -343,7 +343,7 @@ static void brw_prepare_vertices(struct brw_context *brw)
 {
    GLcontext *ctx = &brw->intel.ctx;
    struct intel_context *intel = intel_context(ctx);
-   GLuint tmp = brw->vs.prog_data->inputs_read; 
+   GLbitfield vs_inputs = brw->vs.prog_data->inputs_read; 
    GLuint i;
    const unsigned char *ptr = NULL;
    GLuint interleave = 0;
@@ -362,11 +362,11 @@ static void brw_prepare_vertices(struct brw_context *brw)
       _mesa_printf("%s %d..%d\n", __FUNCTION__, min_index, max_index);
 
    /* Accumulate the list of enabled arrays. */
-   while (tmp) {
-      GLuint i = _mesa_ffsll(tmp)-1;
+   while (vs_inputs) {
+      GLuint i = _mesa_ffsll(vs_inputs) - 1;
       struct brw_vertex_element *input = &brw->vb.inputs[i];
 
-      tmp &= ~(1<<i);
+      vs_inputs &= ~(1 << i);
       enabled[nr_enabled++] = input;
    }
 
@@ -477,17 +477,17 @@ static void brw_emit_vertices(struct brw_context *brw)
 {
    GLcontext *ctx = &brw->intel.ctx;
    struct intel_context *intel = intel_context(ctx);
-   GLuint tmp = brw->vs.prog_data->inputs_read;
+   GLbitfield vs_inputs = brw->vs.prog_data->inputs_read;
    struct brw_vertex_element *enabled[VERT_ATTRIB_MAX];
    GLuint i;
    GLuint nr_enabled = 0;
 
   /* Accumulate the list of enabled arrays. */
-   while (tmp) {
-      i = _mesa_ffsll(tmp)-1;
+   while (vs_inputs) {
+      i = _mesa_ffsll(vs_inputs) - 1;
       struct brw_vertex_element *input = &brw->vb.inputs[i];
 
-      tmp &= ~(1<<i);
+      vs_inputs &= ~(1 << i);
       enabled[nr_enabled++] = input;
    }
 
diff --git a/src/mesa/drivers/dri/i965/brw_eu.h b/src/mesa/drivers/dri/i965/brw_eu.h
index d05f2e6c410..bc7756ceab4 100644
--- a/src/mesa/drivers/dri/i965/brw_eu.h
+++ b/src/mesa/drivers/dri/i965/brw_eu.h
@@ -97,7 +97,7 @@ struct brw_glsl_call;
 
 
 #define BRW_EU_MAX_INSN_STACK 5
-#define BRW_EU_MAX_INSN 4000
+#define BRW_EU_MAX_INSN 10000
 
 struct brw_compile {
    struct brw_instruction store[BRW_EU_MAX_INSN];
@@ -862,9 +862,17 @@ void brw_dp_READ_4( struct brw_compile *p,
                     struct brw_reg dest,
                     GLuint msg_reg_nr,
                     GLboolean relAddr,
-                    GLuint scratch_offset,
+                    GLuint location,
                     GLuint bind_table_index );
 
+void brw_dp_READ_4_vs( struct brw_compile *p,
+                       struct brw_reg dest,
+                       GLuint oword,
+                       GLboolean relAddr,
+                       struct brw_reg addrReg,
+                       GLuint location,
+                       GLuint bind_table_index );
+
 void brw_dp_WRITE_16( struct brw_compile *p,
 		      struct brw_reg src,
 		      GLuint msg_reg_nr,
diff --git a/src/mesa/drivers/dri/i965/brw_eu_emit.c b/src/mesa/drivers/dri/i965/brw_eu_emit.c
index 21ce8369db4..60ea44f7a96 100644
--- a/src/mesa/drivers/dri/i965/brw_eu_emit.c
+++ b/src/mesa/drivers/dri/i965/brw_eu_emit.c
@@ -952,7 +952,7 @@ void brw_dp_READ_16( struct brw_compile *p,
 
 /**
  * Read a float[4] vector from the data port Data Cache (const buffer).
- * Scratch offset should be a multiple of 16.
+ * Location (in buffer) should be a multiple of 16.
  * Used for fetching shader constants.
  * If relAddr is true, we'll do an indirect fetch using the address register.
  */
@@ -960,7 +960,7 @@ void brw_dp_READ_4( struct brw_compile *p,
                     struct brw_reg dest,
                     GLuint msg_reg_nr,
                     GLboolean relAddr,
-                    GLuint scratch_offset,
+                    GLuint location,
                     GLuint bind_table_index )
 {
    {
@@ -969,18 +969,20 @@ void brw_dp_READ_4( struct brw_compile *p,
       brw_set_mask_control(p, BRW_MASK_DISABLE);
 
       /* set message header global offset field (reg 0, element 2) */
+      /* Note that grf[0] will be copied to mrf[1] implicitly by the SEND instr */
       brw_MOV(p,
 	      retype(brw_vec1_grf(0, 2), BRW_REGISTER_TYPE_UD),
-	      brw_imm_d(scratch_offset));
+	      brw_imm_d(location));
       brw_pop_insn_state(p);
    }
 
    {
       struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
    
-      insn->header.predicate_control = 0; /* XXX */
+      insn->header.predicate_control = BRW_PREDICATE_NONE;
       insn->header.compression_control = BRW_COMPRESSION_NONE; 
       insn->header.destreg__conditonalmod = msg_reg_nr;
+      insn->header.mask_control = BRW_MASK_DISABLE;
   
       /* cast dest to a uword[8] vector */
       dest = retype(vec8(dest), BRW_REGISTER_TYPE_UW);
@@ -989,7 +991,7 @@ void brw_dp_READ_4( struct brw_compile *p,
       brw_set_src0(insn, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW));
 
       brw_set_dp_read_message(insn,
-			      bind_table_index, /* binding table index (255=stateless) */
+			      bind_table_index,
 			      0,  /* msg_control (0 means 1 Oword) */
 			      BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ, /* msg_type */
 			      0, /* source cache = data cache */
@@ -1000,6 +1002,78 @@ void brw_dp_READ_4( struct brw_compile *p,
 }
 
 
+/**
+ * Read float[4] constant(s) from VS constant buffer.
+ * For relative addressing, two float[4] constants will be read into 'dest'.
+ * Otherwise, one float[4] constant will be read into the lower half of 'dest'.
+ */
+void brw_dp_READ_4_vs(struct brw_compile *p,
+                      struct brw_reg dest,
+                      GLuint oword,
+                      GLboolean relAddr,
+                      struct brw_reg addrReg,
+                      GLuint location,
+                      GLuint bind_table_index)
+{
+   GLuint msg_reg_nr = 1;
+
+   assert(oword < 2);
+   /*
+   printf("vs const read msg, location %u, msg_reg_nr %d\n",
+          location, msg_reg_nr);
+   */
+
+   /* Setup MRF[1] with location/offset into const buffer */
+   {
+      struct brw_reg b;
+
+      brw_push_insn_state(p);
+      brw_set_compression_control(p, BRW_COMPRESSION_NONE);
+      brw_set_mask_control(p, BRW_MASK_DISABLE);
+      brw_set_predicate_control(p, BRW_PREDICATE_NONE);
+      /*brw_set_access_mode(p, BRW_ALIGN_16);*/
+
+      /* XXX I think we're setting all the dwords of MRF[1] to 'location'.
+       * when the docs say only dword[2] should be set.  Hmmm.  But it works.
+       */
+      b = brw_message_reg(msg_reg_nr);
+      b = retype(b, BRW_REGISTER_TYPE_UD);
+      /*b = get_element_ud(b, 2);*/
+      if (relAddr) {
+         brw_ADD(p, b, addrReg, brw_imm_ud(location));
+      }
+      else {
+         brw_MOV(p, b, brw_imm_ud(location));
+      }
+
+      brw_pop_insn_state(p);
+   }
+
+   {
+      struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
+   
+      insn->header.predicate_control = BRW_PREDICATE_NONE;
+      insn->header.compression_control = BRW_COMPRESSION_NONE; 
+      insn->header.destreg__conditonalmod = msg_reg_nr;
+      insn->header.mask_control = BRW_MASK_DISABLE;
+      /*insn->header.access_mode = BRW_ALIGN_16;*/
+  
+      brw_set_dest(insn, dest);
+      brw_set_src0(insn, brw_null_reg());
+
+      brw_set_dp_read_message(insn,
+			      bind_table_index,
+			      oword,  /* 0 = lower Oword, 1 = upper Oword */
+			      BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ, /* msg_type */
+			      0, /* source cache = data cache */
+			      1, /* msg_length */
+			      1, /* response_length (1 Oword) */
+			      0); /* eot */
+   }
+}
+
+
+
 void brw_fb_WRITE(struct brw_compile *p,
                   struct brw_reg dest,
                   GLuint msg_reg_nr,
diff --git a/src/mesa/drivers/dri/i965/brw_misc_state.c b/src/mesa/drivers/dri/i965/brw_misc_state.c
index 5c94a49f60a..4784254bc7d 100644
--- a/src/mesa/drivers/dri/i965/brw_misc_state.c
+++ b/src/mesa/drivers/dri/i965/brw_misc_state.c
@@ -101,6 +101,7 @@ const struct brw_tracked_state brw_drawing_rect = {
 
 static void prepare_binding_table_pointers(struct brw_context *brw)
 {
+   brw_add_validated_bo(brw, brw->vs.bind_bo);
    brw_add_validated_bo(brw, brw->wm.bind_bo);
 }
 
@@ -117,13 +118,14 @@ static void upload_binding_table_pointers(struct brw_context *brw)
 
    BEGIN_BATCH(6, IGNORE_CLIPRECTS);
    OUT_BATCH(CMD_BINDING_TABLE_PTRS << 16 | (6 - 2));
-   OUT_BATCH(0); /* vs */
+   if (brw->vs.bind_bo != NULL)
+      OUT_RELOC(brw->vs.bind_bo, I915_GEM_DOMAIN_SAMPLER, 0, 0); /* vs */
+   else
+      OUT_BATCH(0);
    OUT_BATCH(0); /* gs */
    OUT_BATCH(0); /* clip */
    OUT_BATCH(0); /* sf */
-   OUT_RELOC(brw->wm.bind_bo,
-	     I915_GEM_DOMAIN_SAMPLER, 0,
-	     0);
+   OUT_RELOC(brw->wm.bind_bo, I915_GEM_DOMAIN_SAMPLER, 0, 0); /* wm/ps */
    ADVANCE_BATCH();
 }
 
diff --git a/src/mesa/drivers/dri/i965/brw_program.c b/src/mesa/drivers/dri/i965/brw_program.c
index 457bc2fc7f4..bac69187c19 100644
--- a/src/mesa/drivers/dri/i965/brw_program.c
+++ b/src/mesa/drivers/dri/i965/brw_program.c
@@ -95,6 +95,12 @@ static struct gl_program *brwNewProgram( GLcontext *ctx,
 static void brwDeleteProgram( GLcontext *ctx,
 			      struct gl_program *prog )
 {
+   if (prog->Target == GL_FRAGMENT_PROGRAM_ARB) {
+      struct gl_fragment_program *fprog = (struct gl_fragment_program *) prog;
+      struct brw_fragment_program *brw_fprog = brw_fragment_program(fprog);
+      dri_bo_unreference(brw_fprog->const_buffer);
+   }
+
    _mesa_delete_program( ctx, prog );
 }
 
@@ -111,7 +117,6 @@ static void brwProgramStringNotify( GLcontext *ctx,
 				    struct gl_program *prog )
 {
    struct brw_context *brw = brw_context(ctx);
-   struct intel_context *intel = &brw->intel;
 
    if (target == GL_FRAGMENT_PROGRAM_ARB) {
       struct gl_fragment_program *fprog = (struct gl_fragment_program *) prog;
@@ -128,24 +133,6 @@ static void brwProgramStringNotify( GLcontext *ctx,
 	 brw->state.dirty.brw |= BRW_NEW_FRAGMENT_PROGRAM;
       newFP->id = brw->program_id++;      
       newFP->isGLSL = brw_wm_is_glsl(fprog);
-
-      /* alloc constant buffer/surface */
-      {
-         const struct gl_program_parameter_list *params = prog->Parameters;
-         const int size = params->NumParameters * 4 * sizeof(GLfloat);
-
-         /* free old const buffer if too small */
-         if (newFP->const_buffer && newFP->const_buffer->size < size) {
-            dri_bo_unreference(newFP->const_buffer);
-            newFP->const_buffer = NULL;
-         }
-
-         if (!newFP->const_buffer) {
-            newFP->const_buffer = drm_intel_bo_alloc(intel->bufmgr,
-                                                     "fp_const_buffer",
-                                                     size, 64);
-         }
-      }
    }
    else if (target == GL_VERTEX_PROGRAM_ARB) {
       struct gl_vertex_program *vprog = (struct gl_vertex_program *) prog;
diff --git a/src/mesa/drivers/dri/i965/brw_sf_state.c b/src/mesa/drivers/dri/i965/brw_sf_state.c
index fc4eddda0a5..c99918724b3 100644
--- a/src/mesa/drivers/dri/i965/brw_sf_state.c
+++ b/src/mesa/drivers/dri/i965/brw_sf_state.c
@@ -66,7 +66,9 @@ static void upload_sf_vp(struct brw_context *brw)
    sfv.viewport.m31 = v[MAT_TY] * y_scale + y_bias;
    sfv.viewport.m32 = v[MAT_TZ] * depth_scale;
 
-   /* _NEW_SCISSOR */
+   /* _NEW_SCISSOR | _NEW_BUFFERS | _NEW_VIEWPORT
+    * for DrawBuffer->_[XY]{min,max}
+    */
 
    /* The scissor only needs to handle the intersection of drawable and
     * scissor rect.  Clipping to the boundaries of static shared buffers
@@ -97,7 +99,8 @@ static void upload_sf_vp(struct brw_context *brw)
 const struct brw_tracked_state brw_sf_vp = {
    .dirty = {
       .mesa  = (_NEW_VIEWPORT | 
-		_NEW_SCISSOR),
+		_NEW_SCISSOR |
+		_NEW_BUFFERS),
       .brw   = 0,
       .cache = 0
    },
@@ -147,7 +150,7 @@ sf_unit_populate_key(struct brw_context *brw, struct brw_sf_unit_key *key)
    key->line_smooth = ctx->Line.SmoothFlag;
 
    key->point_sprite = ctx->Point.PointSprite;
-   key->point_size = ctx->Point.Size;
+   key->point_size = CLAMP(ctx->Point.Size, ctx->Point.MinSize, ctx->Point.MaxSize);
    key->point_attenuated = ctx->Point._Attenuated;
 
    key->render_to_fbo = brw->intel.ctx.DrawBuffer->Name != 0;
diff --git a/src/mesa/drivers/dri/i965/brw_state.h b/src/mesa/drivers/dri/i965/brw_state.h
index 81b0a45998f..bf9f6cae55e 100644
--- a/src/mesa/drivers/dri/i965/brw_state.h
+++ b/src/mesa/drivers/dri/i965/brw_state.h
@@ -72,11 +72,13 @@ const struct brw_tracked_state brw_sf_vp;
 const struct brw_tracked_state brw_state_base_address;
 const struct brw_tracked_state brw_urb_fence;
 const struct brw_tracked_state brw_vertex_state;
+const struct brw_tracked_state brw_vs_surfaces;
 const struct brw_tracked_state brw_vs_prog;
 const struct brw_tracked_state brw_vs_unit;
 const struct brw_tracked_state brw_wm_input_sizes;
 const struct brw_tracked_state brw_wm_prog;
 const struct brw_tracked_state brw_wm_samplers;
+const struct brw_tracked_state brw_wm_constant_surface;
 const struct brw_tracked_state brw_wm_surfaces;
 const struct brw_tracked_state brw_wm_unit;
 
@@ -91,6 +93,20 @@ const struct brw_tracked_state brw_drawing_rect;
 const struct brw_tracked_state brw_indices;
 const struct brw_tracked_state brw_vertices;
 
+/**
+ * Use same key for WM and VS surfaces.
+ */
+struct brw_surface_key {
+   GLenum target, depthmode;
+   dri_bo *bo;
+   GLint format, internal_format;
+   GLint first_level, last_level;
+   GLint width, height, depth;
+   GLint pitch, cpp;
+   uint32_t tiling;
+   GLuint offset;
+};
+
 /***********************************************************************
  * brw_state.c
  */
@@ -135,8 +151,8 @@ dri_bo *brw_search_cache( struct brw_cache *cache,
 			  void *aux_return);
 void brw_state_cache_check_size( struct brw_context *brw );
 
-void brw_init_cache( struct brw_context *brw );
-void brw_destroy_cache( struct brw_context *brw );
+void brw_init_caches( struct brw_context *brw );
+void brw_destroy_caches( struct brw_context *brw );
 
 /***********************************************************************
  * brw_state_batch.c
@@ -150,4 +166,9 @@ GLboolean brw_cached_batch_struct( struct brw_context *brw,
 void brw_destroy_batch_cache( struct brw_context *brw );
 void brw_clear_batch_cache_flush( struct brw_context *brw );
 
+/* brw_wm_surface_state.c */
+dri_bo *
+brw_create_constant_surface( struct brw_context *brw,
+                             struct brw_surface_key *key );
+
 #endif
diff --git a/src/mesa/drivers/dri/i965/brw_state_cache.c b/src/mesa/drivers/dri/i965/brw_state_cache.c
index d5b51664066..e40d7a04164 100644
--- a/src/mesa/drivers/dri/i965/brw_state_cache.c
+++ b/src/mesa/drivers/dri/i965/brw_state_cache.c
@@ -56,9 +56,9 @@
  * incorrect program is run for the other instance.
  */
 
+#include "main/imports.h"
 #include "brw_state.h"
 #include "intel_batchbuffer.h"
-#include "main/imports.h"
 
 /* XXX: Fixme - have to include these to get the sizes of the prog_key
  * structs:
@@ -69,8 +69,10 @@
 #include "brw_sf.h"
 #include "brw_gs.h"
 
-static GLuint hash_key( const void *key, GLuint key_size,
-			dri_bo **reloc_bufs, GLuint nr_reloc_bufs)
+
+static GLuint
+hash_key(const void *key, GLuint key_size,
+         dri_bo **reloc_bufs, GLuint nr_reloc_bufs)
 {
    GLuint *ikey = (GLuint *)key;
    GLuint hash = 0, i;
@@ -95,6 +97,7 @@ static GLuint hash_key( const void *key, GLuint key_size,
    return hash;
 }
 
+
 /**
  * Marks a new buffer as being chosen for the given cache id.
  */
@@ -111,6 +114,7 @@ update_cache_last(struct brw_cache *cache, enum brw_cache_id cache_id,
    cache->brw->state.dirty.cache |= 1 << cache_id;
 }
 
+
 static struct brw_cache_item *
 search_cache(struct brw_cache *cache, enum brw_cache_id cache_id,
 	     GLuint hash, const void *key, GLuint key_size,
@@ -143,7 +147,8 @@ search_cache(struct brw_cache *cache, enum brw_cache_id cache_id,
 }
 
 
-static void rehash( struct brw_cache *cache )
+static void
+rehash(struct brw_cache *cache)
 {
    struct brw_cache_item **items;
    struct brw_cache_item *c, *next;
@@ -164,15 +169,17 @@ static void rehash( struct brw_cache *cache )
    cache->size = size;
 }
 
+
 /**
  * Returns the buffer object matching cache_id and key, or NULL.
  */
-dri_bo *brw_search_cache( struct brw_cache *cache,
-			  enum brw_cache_id cache_id,
-			  const void *key,
-			  GLuint key_size,
-			  dri_bo **reloc_bufs, GLuint nr_reloc_bufs,
-			  void *aux_return )
+dri_bo *
+brw_search_cache(struct brw_cache *cache,
+                 enum brw_cache_id cache_id,
+                 const void *key,
+                 GLuint key_size,
+                 dri_bo **reloc_bufs, GLuint nr_reloc_bufs,
+                 void *aux_return)
 {
    struct brw_cache_item *item;
    GLuint hash = hash_key(key, key_size, reloc_bufs, nr_reloc_bufs);
@@ -192,6 +199,7 @@ dri_bo *brw_search_cache( struct brw_cache *cache,
    return item->bo;
 }
 
+
 dri_bo *
 brw_upload_cache( struct brw_cache *cache,
 		  enum brw_cache_id cache_id,
@@ -265,7 +273,9 @@ brw_upload_cache( struct brw_cache *cache,
    return bo;
 }
 
-/* This doesn't really work with aux data.  Use search/upload instead
+
+/**
+ * This doesn't really work with aux data.  Use search/upload instead
  */
 dri_bo *
 brw_cache_data_sz(struct brw_cache *cache,
@@ -296,6 +306,7 @@ brw_cache_data_sz(struct brw_cache *cache,
    return bo;
 }
 
+
 /**
  * Wrapper around brw_cache_data_sz using the cache_id's canonical key size.
  *
@@ -319,21 +330,22 @@ enum pool_type {
    DW_GENERAL_STATE
 };
 
+
 static void
-brw_init_cache_id( struct brw_context *brw,
-		const char *name,
-		enum brw_cache_id id,
-		GLuint key_size,
-		GLuint aux_size)
+brw_init_cache_id(struct brw_cache *cache,
+                  const char *name,
+                  enum brw_cache_id id,
+                  GLuint key_size,
+                  GLuint aux_size)
 {
-   struct brw_cache *cache = &brw->cache;
-
    cache->name[id] = strdup(name);
    cache->key_size[id] = key_size;
    cache->aux_size[id] = aux_size;
 }
 
-void brw_init_cache( struct brw_context *brw )
+
+static void
+brw_init_non_surface_cache(struct brw_context *brw)
 {
    struct brw_cache *cache = &brw->cache;
 
@@ -342,114 +354,136 @@ void brw_init_cache( struct brw_context *brw )
    cache->size = 7;
    cache->n_items = 0;
    cache->items = (struct brw_cache_item **)
-      _mesa_calloc(cache->size * 
-		   sizeof(struct brw_cache_item));
+      _mesa_calloc(cache->size * sizeof(struct brw_cache_item));
 
-   brw_init_cache_id(brw,
+   brw_init_cache_id(cache,
 		     "CC_VP",
 		     BRW_CC_VP,
 		     sizeof(struct brw_cc_viewport),
 		     0);
 
-   brw_init_cache_id(brw,
+   brw_init_cache_id(cache,
 		     "CC_UNIT",
 		     BRW_CC_UNIT,
 		     sizeof(struct brw_cc_unit_state),
 		     0);
 
-   brw_init_cache_id(brw,
+   brw_init_cache_id(cache,
 		     "WM_PROG",
 		     BRW_WM_PROG,
 		     sizeof(struct brw_wm_prog_key),
 		     sizeof(struct brw_wm_prog_data));
 
-   brw_init_cache_id(brw,
+   brw_init_cache_id(cache,
 		     "SAMPLER_DEFAULT_COLOR",
 		     BRW_SAMPLER_DEFAULT_COLOR,
 		     sizeof(struct brw_sampler_default_color),
 		     0);
 
-   brw_init_cache_id(brw,
+   brw_init_cache_id(cache,
 		     "SAMPLER",
 		     BRW_SAMPLER,
 		     0,		/* variable key/data size */
 		     0);
 
-   brw_init_cache_id(brw,
+   brw_init_cache_id(cache,
 		     "WM_UNIT",
 		     BRW_WM_UNIT,
 		     sizeof(struct brw_wm_unit_state),
 		     0);
 
-   brw_init_cache_id(brw,
+   brw_init_cache_id(cache,
 		     "SF_PROG",
 		     BRW_SF_PROG,
 		     sizeof(struct brw_sf_prog_key),
 		     sizeof(struct brw_sf_prog_data));
 
-   brw_init_cache_id(brw,
+   brw_init_cache_id(cache,
 		     "SF_VP",
 		     BRW_SF_VP,
 		     sizeof(struct brw_sf_viewport),
 		     0);
 
-   brw_init_cache_id(brw,
+   brw_init_cache_id(cache,
 		     "SF_UNIT",
 		     BRW_SF_UNIT,
 		     sizeof(struct brw_sf_unit_state),
 		     0);
 
-   brw_init_cache_id(brw,
+   brw_init_cache_id(cache,
 		     "VS_UNIT",
 		     BRW_VS_UNIT,
 		     sizeof(struct brw_vs_unit_state),
 		     0);
 
-   brw_init_cache_id(brw,
+   brw_init_cache_id(cache,
 		     "VS_PROG",
 		     BRW_VS_PROG,
 		     sizeof(struct brw_vs_prog_key),
 		     sizeof(struct brw_vs_prog_data));
 
-   brw_init_cache_id(brw,
+   brw_init_cache_id(cache,
 		     "CLIP_UNIT",
 		     BRW_CLIP_UNIT,
 		     sizeof(struct brw_clip_unit_state),
 		     0);
 
-   brw_init_cache_id(brw,
+   brw_init_cache_id(cache,
 		     "CLIP_PROG",
 		     BRW_CLIP_PROG,
 		     sizeof(struct brw_clip_prog_key),
 		     sizeof(struct brw_clip_prog_data));
 
-   brw_init_cache_id(brw,
+   brw_init_cache_id(cache,
 		     "GS_UNIT",
 		     BRW_GS_UNIT,
 		     sizeof(struct brw_gs_unit_state),
 		     0);
 
-   brw_init_cache_id(brw,
+   brw_init_cache_id(cache,
 		     "GS_PROG",
 		     BRW_GS_PROG,
 		     sizeof(struct brw_gs_prog_key),
 		     sizeof(struct brw_gs_prog_data));
+}
+
+
+static void
+brw_init_surface_cache(struct brw_context *brw)
+{
+   struct brw_cache *cache = &brw->surface_cache;
+
+   cache->brw = brw;
 
-   brw_init_cache_id(brw,
+   cache->size = 7;
+   cache->n_items = 0;
+   cache->items = (struct brw_cache_item **)
+      _mesa_calloc(cache->size * sizeof(struct brw_cache_item));
+
+   brw_init_cache_id(cache,
 		     "SS_SURFACE",
 		     BRW_SS_SURFACE,
 		     sizeof(struct brw_surface_state),
 		     0);
 
-   brw_init_cache_id(brw,
+   brw_init_cache_id(cache,
 		     "SS_SURF_BIND",
 		     BRW_SS_SURF_BIND,
 		     0,
 		     0);
 }
 
+
+void
+brw_init_caches(struct brw_context *brw)
+{
+   brw_init_non_surface_cache(brw);
+   brw_init_surface_cache(brw);
+}
+
+
 static void
-brw_clear_cache( struct brw_context *brw )
+brw_clear_cache(struct brw_context *brw, struct brw_cache *cache)
 {
    struct brw_cache_item *c, *next;
    GLuint i;
@@ -457,8 +491,8 @@ brw_clear_cache( struct brw_context *brw )
    if (INTEL_DEBUG & DEBUG_STATE)
       _mesa_printf("%s\n", __FUNCTION__);
 
-   for (i = 0; i < brw->cache.size; i++) {
-      for (c = brw->cache.items[i]; c; c = next) {
+   for (i = 0; i < cache->size; i++) {
+      for (c = cache->items[i]; c; c = next) {
 	 int j;
 
 	 next = c->next;
@@ -468,10 +502,10 @@ brw_clear_cache( struct brw_context *brw )
 	 free((void *)c->key);
 	 free(c);
       }
-      brw->cache.items[i] = NULL;
+      cache->items[i] = NULL;
    }
 
-   brw->cache.n_items = 0;
+   cache->n_items = 0;
 
    if (brw->curbe.last_buf) {
       _mesa_free(brw->curbe.last_buf);
@@ -483,25 +517,46 @@ brw_clear_cache( struct brw_context *brw )
    brw->state.dirty.cache |= ~0;
 }
 
-void brw_state_cache_check_size( struct brw_context *brw )
+
+void
+brw_state_cache_check_size(struct brw_context *brw)
 {
+   if (INTEL_DEBUG & DEBUG_STATE)
+      _mesa_printf("%s (n_items=%d)\n", __FUNCTION__, brw->cache.n_items);
+
    /* un-tuned guess.  We've got around 20 state objects for a total of around
     * 32k, so 1000 of them is around 1.5MB.
     */
    if (brw->cache.n_items > 1000)
-      brw_clear_cache(brw);
+      brw_clear_cache(brw, &brw->cache);
+
+   if (brw->surface_cache.n_items > 1000)
+      brw_clear_cache(brw, &brw->surface_cache);
 }
 
-void brw_destroy_cache( struct brw_context *brw )
+
+static void
+brw_destroy_cache(struct brw_context *brw, struct brw_cache *cache)
 {
    GLuint i;
 
-   brw_clear_cache(brw);
+   if (INTEL_DEBUG & DEBUG_STATE)
+      _mesa_printf("%s\n", __FUNCTION__);
+
+   brw_clear_cache(brw, cache);
    for (i = 0; i < BRW_MAX_CACHE; i++) {
-      dri_bo_unreference(brw->cache.last_bo[i]);
-      free(brw->cache.name[i]);
+      dri_bo_unreference(cache->last_bo[i]);
+      free(cache->name[i]);
    }
-   free(brw->cache.items);
-   brw->cache.items = NULL;
-   brw->cache.size = 0;
+   free(cache->items);
+   cache->items = NULL;
+   cache->size = 0;
+}
+
+
+void
+brw_destroy_caches(struct brw_context *brw)
+{
+   brw_destroy_cache(brw, &brw->cache);
+   brw_destroy_cache(brw, &brw->surface_cache);
 }
diff --git a/src/mesa/drivers/dri/i965/brw_state_dump.c b/src/mesa/drivers/dri/i965/brw_state_dump.c
index 5d332d010c2..a7132622696 100644
--- a/src/mesa/drivers/dri/i965/brw_state_dump.c
+++ b/src/mesa/drivers/dri/i965/brw_state_dump.c
@@ -84,6 +84,19 @@ get_965_surfacetype(unsigned int surfacetype)
     }
 }
 
+static const char *
+get_965_surface_format(unsigned int surface_format)
+{
+    switch (surface_format) {
+    case 0x000: return "r32g32b32a32_float";
+    case 0x0c1: return "b8g8r8a8_unorm";
+    case 0x100: return "b5g6r5_unorm";
+    case 0x102: return "b5g5r5a1_unorm";
+    case 0x104: return "b4g4r4a4_unorm";
+    default: return "unknown";
+    }
+}
+
 static void dump_wm_surface_state(struct brw_context *brw)
 {
    int i;
@@ -95,7 +108,7 @@ static void dump_wm_surface_state(struct brw_context *brw)
       char name[20];
 
       if (surf_bo == NULL) {
-	 fprintf(stderr, "WM SS%d: NULL\n", i);
+	 fprintf(stderr, "  WM SS%d: NULL\n", i);
 	 continue;
       }
       dri_bo_map(surf_bo, GL_FALSE);
@@ -103,8 +116,9 @@ static void dump_wm_surface_state(struct brw_context *brw)
       surf = (struct brw_surface_state *)(surf_bo->virtual);
 
       sprintf(name, "WM SS%d", i);
-      state_out(name, surf, surfoff, 0, "%s\n",
-		get_965_surfacetype(surf->ss0.surface_type));
+      state_out(name, surf, surfoff, 0, "%s %s\n",
+		get_965_surfacetype(surf->ss0.surface_type),
+		get_965_surface_format(surf->ss0.surface_format));
       state_out(name, surf, surfoff, 1, "offset\n");
       state_out(name, surf, surfoff, 2, "%dx%d size, %d mips\n",
 		surf->ss2.width + 1, surf->ss2.height + 1, surf->ss2.mip_count);
diff --git a/src/mesa/drivers/dri/i965/brw_state_upload.c b/src/mesa/drivers/dri/i965/brw_state_upload.c
index 5de1450e612..c6dfea4743c 100644
--- a/src/mesa/drivers/dri/i965/brw_state_upload.c
+++ b/src/mesa/drivers/dri/i965/brw_state_upload.c
@@ -59,11 +59,12 @@ const struct brw_tracked_state *atoms[] =
    &brw_curbe_offsets,
    &brw_recalculate_urb_fence,
 
-
    &brw_cc_vp,
    &brw_cc_unit,
 
-   &brw_wm_surfaces,		/* must do before samplers */
+   &brw_vs_surfaces,		/* must do before unit */
+   &brw_wm_constant_surface,	/* must do before wm surfaces/bind bo */
+   &brw_wm_surfaces,		/* must do before samplers and unit */
    &brw_wm_samplers,
 
    &brw_wm_unit,
@@ -88,54 +89,26 @@ const struct brw_tracked_state *atoms[] =
 
    &brw_line_stipple,
    &brw_aa_line_parameters,
-   /* Ordering of the commands below is documented as fixed.  
-    */
-#if 0
-   &brw_pipelined_state_pointers,
-   &brw_urb_fence,
-   &brw_constant_buffer_state,
-#else
+
    &brw_psp_urb_cbs,
-#endif
 
    &brw_drawing_rect,
    &brw_indices,
    &brw_vertices,
 
-   NULL,			/* brw_constant_buffer */
+   &brw_constant_buffer
 };
 
 
 void brw_init_state( struct brw_context *brw )
 {
-   GLuint i;
-
-   brw_init_cache(brw);
-
-   brw->state.atoms = _mesa_malloc(sizeof(atoms));
-   brw->state.nr_atoms = sizeof(atoms)/sizeof(*atoms);
-   _mesa_memcpy(brw->state.atoms, atoms, sizeof(atoms));
-
-   /* Patch in a pointer to the dynamic state atom:
-    */
-   for (i = 0; i < brw->state.nr_atoms; i++)
-      if (brw->state.atoms[i] == NULL)
-	 brw->state.atoms[i] = &brw->curbe.tracked_state;
-
-   _mesa_memcpy(&brw->curbe.tracked_state, 
-		&brw_constant_buffer,
-		sizeof(brw_constant_buffer));
+   brw_init_caches(brw);
 }
 
 
 void brw_destroy_state( struct brw_context *brw )
 {
-   if (brw->state.atoms) {
-      _mesa_free(brw->state.atoms);
-      brw->state.atoms = NULL;
-   }
-
-   brw_destroy_cache(brw);
+   brw_destroy_caches(brw);
    brw_destroy_batch_cache(brw);
 }
 
@@ -218,6 +191,7 @@ static struct dirty_bit_map mesa_bits[] = {
    DEFINE_BIT(_NEW_MULTISAMPLE),
    DEFINE_BIT(_NEW_TRACK_MATRIX),
    DEFINE_BIT(_NEW_PROGRAM),
+   DEFINE_BIT(_NEW_PROGRAM_CONSTANTS),
    {0, 0, 0}
 };
 
@@ -336,7 +310,7 @@ void brw_validate_state( struct brw_context *brw )
 
    /* do prepare stage for all atoms */
    for (i = 0; i < Elements(atoms); i++) {
-      const struct brw_tracked_state *atom = brw->state.atoms[i];
+      const struct brw_tracked_state *atom = atoms[i];
 
       if (brw->intel.Fallback)
          break;
@@ -367,8 +341,8 @@ void brw_upload_state(struct brw_context *brw)
       _mesa_memset(&examined, 0, sizeof(examined));
       prev = *state;
 
-      for (i = 0; i < brw->state.nr_atoms; i++) {	 
-	 const struct brw_tracked_state *atom = brw->state.atoms[i];
+      for (i = 0; i < Elements(atoms); i++) {	 
+	 const struct brw_tracked_state *atom = atoms[i];
 	 struct brw_state_flags generated;
 
 	 assert(atom->dirty.mesa ||
@@ -397,7 +371,7 @@ void brw_upload_state(struct brw_context *brw)
    }
    else {
       for (i = 0; i < Elements(atoms); i++) {	 
-	 const struct brw_tracked_state *atom = brw->state.atoms[i];
+	 const struct brw_tracked_state *atom = atoms[i];
 
 	 if (brw->intel.Fallback)
 	    break;
diff --git a/src/mesa/drivers/dri/i965/brw_tex_layout.c b/src/mesa/drivers/dri/i965/brw_tex_layout.c
index 51a617fcb40..5c5455813a4 100644
--- a/src/mesa/drivers/dri/i965/brw_tex_layout.c
+++ b/src/mesa/drivers/dri/i965/brw_tex_layout.c
@@ -28,7 +28,6 @@
   * Authors:
   *   Keith Whitwell <[email protected]>
   */
-        
 
 /* Code to layout images in a mipmap tree for i965.
  */
@@ -40,14 +39,15 @@
 
 #define FILE_DEBUG_FLAG DEBUG_MIPTREE
 
-GLboolean brw_miptree_layout( struct intel_context *intel, struct intel_mipmap_tree *mt )
+GLboolean brw_miptree_layout(struct intel_context *intel,
+			     struct intel_mipmap_tree *mt,
+			     uint32_t tiling)
 {
-   /* XXX: these vary depending on image format: 
-    */
-/*    GLint align_w = 4; */
+   /* XXX: these vary depending on image format: */
+   /* GLint align_w = 4; */
 
    switch (mt->target) {
-   case GL_TEXTURE_CUBE_MAP: 
+   case GL_TEXTURE_CUBE_MAP:
    case GL_TEXTURE_3D: {
       GLuint width  = mt->width0;
       GLuint height = mt->height0;
@@ -59,25 +59,25 @@ GLboolean brw_miptree_layout( struct intel_context *intel, struct intel_mipmap_t
       GLuint align_w = 4;
 
       mt->total_height = 0;
-      
+
       if (mt->compressed) {
           align_w = intel_compressed_alignment(mt->internal_format);
           mt->pitch = ALIGN(width, align_w);
           pack_y_pitch = (height + 3) / 4;
       } else {
-          mt->pitch = intel_miptree_pitch_align (intel, mt, mt->width0);
-          pack_y_pitch = ALIGN(mt->height0, align_h);
+	 mt->pitch = intel_miptree_pitch_align (intel, mt, tiling, mt->width0);
+	 pack_y_pitch = ALIGN(mt->height0, align_h);
       }
 
       pack_x_pitch = mt->pitch;
       pack_x_nr = 1;
 
-      for ( level = mt->first_level ; level <= mt->last_level ; level++ ) {
+      for (level = mt->first_level ; level <= mt->last_level ; level++) {
 	 GLuint nr_images = mt->target == GL_TEXTURE_3D ? depth : 6;
 	 GLint x = 0;
 	 GLint y = 0;
 	 GLint q, j;
-	    
+
 	 intel_miptree_set_level_info(mt, level, nr_images,
 				      0, mt->total_height,
 				      width, height, depth);
@@ -89,7 +89,7 @@ GLboolean brw_miptree_layout( struct intel_context *intel, struct intel_mipmap_t
 	    }
 
 	    x = 0;
-	    y += pack_y_pitch;	    
+	    y += pack_y_pitch;
 	 }
 
 
@@ -98,40 +98,40 @@ GLboolean brw_miptree_layout( struct intel_context *intel, struct intel_mipmap_t
 	 height = minify(height);
 	 depth  = minify(depth);
 
-    if (mt->compressed) {
-        pack_y_pitch = (height + 3) / 4;
-        
-        if (pack_x_pitch > ALIGN(width, align_w)) {
-            pack_x_pitch = ALIGN(width, align_w);
-            pack_x_nr <<= 1;
-        }
-    } else {
-        if (pack_x_pitch > 4) {
-            pack_x_pitch >>= 1;
-            pack_x_nr <<= 1;
-            assert(pack_x_pitch * pack_x_nr <= mt->pitch);
-        }
-
-        if (pack_y_pitch > 2) {
-            pack_y_pitch >>= 1;
-            pack_y_pitch = ALIGN(pack_y_pitch, align_h);
-        }
-    }
+	 if (mt->compressed) {
+	    pack_y_pitch = (height + 3) / 4;
+
+	    if (pack_x_pitch > ALIGN(width, align_w)) {
+	       pack_x_pitch = ALIGN(width, align_w);
+	       pack_x_nr <<= 1;
+	    }
+	 } else {
+	    if (pack_x_pitch > 4) {
+	       pack_x_pitch >>= 1;
+	       pack_x_nr <<= 1;
+	       assert(pack_x_pitch * pack_x_nr <= mt->pitch);
+	    }
+
+	    if (pack_y_pitch > 2) {
+	       pack_y_pitch >>= 1;
+	       pack_y_pitch = ALIGN(pack_y_pitch, align_h);
+	    }
+	 }
 
       }
       break;
    }
 
    default:
-      i945_miptree_layout_2d(intel, mt);
+      i945_miptree_layout_2d(intel, mt, tiling);
       break;
    }
-   DBG("%s: %dx%dx%d - sz 0x%x\n", __FUNCTION__, 
-		mt->pitch, 
+   DBG("%s: %dx%dx%d - sz 0x%x\n", __FUNCTION__,
+		mt->pitch,
 		mt->total_height,
 		mt->cpp,
 		mt->pitch * mt->total_height * mt->cpp );
-		
+
    return GL_TRUE;
 }
 
diff --git a/src/mesa/drivers/dri/i965/brw_vs.h b/src/mesa/drivers/dri/i965/brw_vs.h
index 99d0e937226..1e4f66091e3 100644
--- a/src/mesa/drivers/dri/i965/brw_vs.h
+++ b/src/mesa/drivers/dri/i965/brw_vs.h
@@ -75,6 +75,11 @@ struct brw_vs_compile {
 
    struct brw_reg userplane[6];
 
+   /** we may need up to 3 constants per instruction (if use_const_buffer) */
+   struct {
+      GLint index;
+      struct brw_reg reg;
+   } current_const[3];
 };
 
 void brw_vs_emit( struct brw_vs_compile *c );
diff --git a/src/mesa/drivers/dri/i965/brw_vs_constval.c b/src/mesa/drivers/dri/i965/brw_vs_constval.c
index d29eb17f8cf..2637344b482 100644
--- a/src/mesa/drivers/dri/i965/brw_vs_constval.c
+++ b/src/mesa/drivers/dri/i965/brw_vs_constval.c
@@ -96,7 +96,7 @@ static GLubyte get_active( struct tracker *t,
 			   struct prog_src_register src )
 {
    GLuint i;
-   GLubyte active = src.NegateBase; /* NOTE! */
+   GLubyte active = src.Negate; /* NOTE! */
 
    if (src.RelAddr)
       return 0xf;
diff --git a/src/mesa/drivers/dri/i965/brw_vs_emit.c b/src/mesa/drivers/dri/i965/brw_vs_emit.c
index 0d6c6ab9a8a..d7f75e3685e 100644
--- a/src/mesa/drivers/dri/i965/brw_vs_emit.c
+++ b/src/mesa/drivers/dri/i965/brw_vs_emit.c
@@ -38,14 +38,49 @@
 #include "brw_vs.h"
 
 
+static struct brw_reg get_tmp( struct brw_vs_compile *c )
+{
+   struct brw_reg tmp = brw_vec8_grf(c->last_tmp, 0);
+
+   if (++c->last_tmp > c->prog_data.total_grf)
+      c->prog_data.total_grf = c->last_tmp;
+
+   return tmp;
+}
 
-/* Do things as simply as possible.  Allocate and populate all regs
+static void release_tmp( struct brw_vs_compile *c, struct brw_reg tmp )
+{
+   if (tmp.nr == c->last_tmp-1)
+      c->last_tmp--;
+}
+			       
+static void release_tmps( struct brw_vs_compile *c )
+{
+   c->last_tmp = c->first_tmp;
+}
+
+
+/**
+ * Preallocate GRF register before code emit.
+ * Do things as simply as possible.  Allocate and populate all regs
  * ahead of time.
  */
 static void brw_vs_alloc_regs( struct brw_vs_compile *c )
 {
    GLuint i, reg = 0, mrf;
-   GLuint nr_params;
+
+   /* Determine whether to use a real constant buffer or use a block
+    * of GRF registers for constants.  The later is faster but only
+    * works if everything fits in the GRF.
+    * XXX this heuristic/check may need some fine tuning...
+    */
+   if (c->vp->program.Base.Parameters->NumParameters +
+       c->vp->program.Base.NumTemporaries + 20 > BRW_MAX_GRF)
+      c->vp->use_const_buffer = GL_TRUE;
+   else
+      c->vp->use_const_buffer = GL_FALSE;
+
+   /*printf("use_const_buffer = %d\n", c->vp->use_const_buffer);*/
 
    /* r0 -- reserved as usual
     */
@@ -66,13 +101,22 @@ static void brw_vs_alloc_regs( struct brw_vs_compile *c )
 
    /* Vertex program parameters from curbe:
     */
-   nr_params = c->vp->program.Base.Parameters->NumParameters;
-   for (i = 0; i < nr_params; i++) {
-      c->regs[PROGRAM_STATE_VAR][i] = stride( brw_vec4_grf(reg+i/2, (i%2) * 4), 0, 4, 1);
-   }     
-   reg += (nr_params + 1) / 2;
+   if (c->vp->use_const_buffer) {
+      /* get constants from a real constant buffer */
+      c->prog_data.curb_read_length = 0;
+      c->prog_data.nr_params = 4; /* XXX 0 causes a bug elsewhere... */
+   }
+   else {
+      /* use a section of the GRF for constants */
+      GLuint nr_params = c->vp->program.Base.Parameters->NumParameters;
+      for (i = 0; i < nr_params; i++) {
+         c->regs[PROGRAM_STATE_VAR][i] = stride( brw_vec4_grf(reg+i/2, (i%2) * 4), 0, 4, 1);
+      }
+      reg += (nr_params + 1) / 2;
+      c->prog_data.curb_read_length = reg - 1;
 
-   c->prog_data.curb_read_length = reg - 1;
+      c->prog_data.nr_params = nr_params * 4;
+   }
 
    /* Allocate input regs:  
     */
@@ -133,6 +177,14 @@ static void brw_vs_alloc_regs( struct brw_vs_compile *c )
       reg++;
    }
 
+   if (c->vp->use_const_buffer) {
+      for (i = 0; i < 3; i++) {
+         c->current_const[i].index = -1;
+         c->current_const[i].reg = brw_vec8_grf(reg, 0);
+         reg++;
+      }
+   }
+
    for (i = 0; i < 128; i++) {
       if (c->output_regs[i].used_in_src) {
          c->output_regs[i].reg = brw_vec8_grf(reg, 0);
@@ -165,28 +217,6 @@ static void brw_vs_alloc_regs( struct brw_vs_compile *c )
 }
 
 
-static struct brw_reg get_tmp( struct brw_vs_compile *c )
-{
-   struct brw_reg tmp = brw_vec8_grf(c->last_tmp, 0);
-
-   if (++c->last_tmp > c->prog_data.total_grf)
-      c->prog_data.total_grf = c->last_tmp;
-
-   return tmp;
-}
-
-static void release_tmp( struct brw_vs_compile *c, struct brw_reg tmp )
-{
-   if (tmp.nr == c->last_tmp-1)
-      c->last_tmp--;
-}
-			       
-static void release_tmps( struct brw_vs_compile *c )
-{
-   c->last_tmp = c->first_tmp;
-}
-
-
 /**
  * If an instruction uses a temp reg both as a src and the dest, we
  * sometimes need to allocate an intermediate temporary.
@@ -633,6 +663,8 @@ static void emit_lit_noalias( struct brw_vs_compile *c,
    }
 
    brw_ENDIF(p, if_insn);
+
+   release_tmp(c, tmp);
 }
 
 static void emit_lrp_noalias(struct brw_vs_compile *c,
@@ -673,13 +705,84 @@ static void emit_nrm( struct brw_vs_compile *c,
 }
 
 
+static struct brw_reg
+get_constant(struct brw_vs_compile *c,
+             const struct prog_instruction *inst,
+             GLuint argIndex)
+{
+   const struct prog_src_register *src = &inst->SrcReg[argIndex];
+   struct brw_compile *p = &c->func;
+   struct brw_reg const_reg;
+   struct brw_reg const2_reg;
+   const GLboolean relAddr = src->RelAddr;
+
+   assert(argIndex < 3);
+
+   if (c->current_const[argIndex].index != src->Index || relAddr) {
+      struct brw_reg addrReg = c->regs[PROGRAM_ADDRESS][0];
+
+      c->current_const[argIndex].index = src->Index;
+
+#if 0
+      printf("  fetch const[%d] for arg %d into reg %d\n",
+             src->Index, argIndex, c->current_const[argIndex].reg.nr);
+#endif
+      /* need to fetch the constant now */
+      brw_dp_READ_4_vs(p,
+                       c->current_const[argIndex].reg,/* writeback dest */
+                       0,                             /* oword */
+                       relAddr,                       /* relative indexing? */
+                       addrReg,                       /* address register */
+                       16 * src->Index,               /* byte offset */
+                       SURF_INDEX_VERT_CONST_BUFFER   /* binding table index */
+                       );
+
+      if (relAddr) {
+         /* second read */
+         const2_reg = get_tmp(c);
+
+         /* use upper half of address reg for second read */
+         addrReg = stride(addrReg, 0, 4, 0);
+         addrReg.subnr = 16;
+
+         brw_dp_READ_4_vs(p,
+                          const2_reg,              /* writeback dest */
+                          1,                       /* oword */
+                          relAddr,                 /* relative indexing? */
+                          addrReg,                 /* address register */
+                          16 * src->Index,         /* byte offset */
+                          SURF_INDEX_VERT_CONST_BUFFER
+                          );
+      }
+   }
+
+   const_reg = c->current_const[argIndex].reg;
+
+   if (relAddr) {
+      /* merge the two Owords into the constant register */
+      /* const_reg[7..4] = const2_reg[7..4] */
+      brw_MOV(p,
+              suboffset(stride(const_reg, 0, 4, 1), 4),
+              suboffset(stride(const2_reg, 0, 4, 1), 4));
+      release_tmp(c, const2_reg);
+   }
+   else {
+      /* replicate lower four floats into upper half (to get XYZWXYZW) */
+      const_reg = stride(const_reg, 0, 4, 0);
+      const_reg.subnr = 0;
+   }
+
+   return const_reg;
+}
+
+
+
 /* TODO: relative addressing!
  */
 static struct brw_reg get_reg( struct brw_vs_compile *c,
 			       gl_register_file file,
 			       GLuint index )
 {
-
    switch (file) {
    case PROGRAM_TEMPORARY:
    case PROGRAM_INPUT:
@@ -708,13 +811,17 @@ static struct brw_reg get_reg( struct brw_vs_compile *c,
 }
 
 
+/**
+ * Indirect addressing:  get reg[[arg] + offset].
+ */
 static struct brw_reg deref( struct brw_vs_compile *c,
 			     struct brw_reg arg,
 			     GLint offset)
 {
    struct brw_compile *p = &c->func;
    struct brw_reg tmp = vec4(get_tmp(c));
-   struct brw_reg vp_address = retype(vec1(get_reg(c, PROGRAM_ADDRESS, 0)), BRW_REGISTER_TYPE_UW);
+   struct brw_reg addr_reg = c->regs[PROGRAM_ADDRESS][0];
+   struct brw_reg vp_address = retype(vec1(addr_reg), BRW_REGISTER_TYPE_UW);
    GLuint byte_offset = arg.nr * 32 + arg.subnr + offset * 16;
    struct brw_reg indirect = brw_vec4_indirect(0,0);
 
@@ -735,10 +842,67 @@ static struct brw_reg deref( struct brw_vs_compile *c,
       brw_pop_insn_state(p);
    }
    
+   /* NOTE: tmp not released */
    return vec8(tmp);
 }
 
 
+/**
+ * Get brw reg corresponding to the instruction's [argIndex] src reg.
+ * TODO: relative addressing!
+ */
+static struct brw_reg
+get_src_reg( struct brw_vs_compile *c,
+             const struct prog_instruction *inst,
+             GLuint argIndex )
+{
+   const GLuint file = inst->SrcReg[argIndex].File;
+   const GLint index = inst->SrcReg[argIndex].Index;
+   const GLboolean relAddr = inst->SrcReg[argIndex].RelAddr;
+
+   switch (file) {
+   case PROGRAM_TEMPORARY:
+   case PROGRAM_INPUT:
+   case PROGRAM_OUTPUT:
+      if (relAddr) {
+         return deref(c, c->regs[file][0], index);
+      }
+      else {
+         assert(c->regs[file][index].nr != 0);
+         return c->regs[file][index];
+      }
+
+   case PROGRAM_STATE_VAR:
+   case PROGRAM_CONSTANT:
+   case PROGRAM_UNIFORM:
+      if (c->vp->use_const_buffer) {
+         return get_constant(c, inst, argIndex);
+      }
+      else if (relAddr) {
+         return deref(c, c->regs[PROGRAM_STATE_VAR][0], index);
+      }
+      else {
+         assert(c->regs[PROGRAM_STATE_VAR][index].nr != 0);
+         return c->regs[PROGRAM_STATE_VAR][index];
+      }
+   case PROGRAM_ADDRESS:
+      assert(index == 0);
+      return c->regs[file][index];
+
+   case PROGRAM_UNDEFINED:
+      /* this is a normal case since we loop over all three src args */
+      return brw_null_reg();
+
+   case PROGRAM_LOCAL_PARAM: 
+   case PROGRAM_ENV_PARAM: 
+   case PROGRAM_WRITE_ONLY:
+   default:
+      assert(0);
+      return brw_null_reg();
+   }
+}
+
+
 static void emit_arl( struct brw_vs_compile *c,
 		      struct brw_reg dst,
 		      struct brw_reg arg0 )
@@ -750,30 +914,31 @@ static void emit_arl( struct brw_vs_compile *c,
    if (need_tmp) 
       tmp = get_tmp(c);
 
-   brw_RNDD(p, tmp, arg0);
-   brw_MUL(p, dst, tmp, brw_imm_d(16));
+   brw_RNDD(p, tmp, arg0);               /* tmp = round(arg0) */
+   brw_MUL(p, dst, tmp, brw_imm_d(16));  /* dst = tmp * 16 */
 
    if (need_tmp)
       release_tmp(c, tmp);
 }
 
 
-/* Will return mangled results for SWZ op.  The emit_swz() function
+/**
+ * Return the brw reg for the given instruction's src argument.
+ * Will return mangled results for SWZ op.  The emit_swz() function
  * ignores this result and recalculates taking extended swizzles into
  * account.
  */
 static struct brw_reg get_arg( struct brw_vs_compile *c,
-			       struct prog_src_register *src )
+                               const struct prog_instruction *inst,
+                               GLuint argIndex )
 {
+   const struct prog_src_register *src = &inst->SrcReg[argIndex];
    struct brw_reg reg;
 
    if (src->File == PROGRAM_UNDEFINED)
       return brw_null_reg();
 
-   if (src->RelAddr) 
-      reg = deref(c, c->regs[PROGRAM_STATE_VAR][0], src->Index);
-   else
-      reg = get_reg(c, src->File, src->Index);
+   reg = get_src_reg(c, inst, argIndex);
 
    /* Convert 3-bit swizzle to 2-bit.  
     */
@@ -784,16 +949,38 @@ static struct brw_reg get_arg( struct brw_vs_compile *c,
 
    /* Note this is ok for non-swizzle instructions: 
     */
-   reg.negate = src->NegateBase ? 1 : 0;   
+   reg.negate = src->Negate ? 1 : 0;   
 
    return reg;
 }
 
 
+/**
+ * Get brw register for the given program dest register.
+ */
 static struct brw_reg get_dst( struct brw_vs_compile *c,
 			       struct prog_dst_register dst )
 {
-   struct brw_reg reg = get_reg(c, dst.File, dst.Index);
+   struct brw_reg reg;
+
+   switch (dst.File) {
+   case PROGRAM_TEMPORARY:
+   case PROGRAM_OUTPUT:
+      assert(c->regs[dst.File][dst.Index].nr != 0);
+      reg = c->regs[dst.File][dst.Index];
+      break;
+   case PROGRAM_ADDRESS:
+      assert(dst.Index == 0);
+      reg = c->regs[dst.File][dst.Index];
+      break;
+   case PROGRAM_UNDEFINED:
+      /* we may hit this for OPCODE_END, OPCODE_KIL, etc */
+      reg = brw_null_reg();
+      break;
+   default:
+      assert(0);
+      reg = brw_null_reg();
+   }
 
    reg.dw1.bits.writemask = dst.WriteMask;
 
@@ -803,14 +990,16 @@ static struct brw_reg get_dst( struct brw_vs_compile *c,
 
 static void emit_swz( struct brw_vs_compile *c, 
 		      struct brw_reg dst,
-		      struct prog_src_register src )
+                      const struct prog_instruction *inst)
 {
+   const GLuint argIndex = 0;
+   const struct prog_src_register src = inst->SrcReg[argIndex];
    struct brw_compile *p = &c->func;
    GLuint zeros_mask = 0;
    GLuint ones_mask = 0;
    GLuint src_mask = 0;
    GLubyte src_swz[4];
-   GLboolean need_tmp = (src.NegateBase &&
+   GLboolean need_tmp = (src.Negate &&
 			 dst.file != BRW_GENERAL_REGISTER_FILE);
    struct brw_reg tmp = dst;
    GLuint i;
@@ -844,10 +1033,7 @@ static void emit_swz( struct brw_vs_compile *c,
    if (src_mask) {
       struct brw_reg arg0;
 
-      if (src.RelAddr) 
-	 arg0 = deref(c, c->regs[PROGRAM_STATE_VAR][0], src.Index);
-      else
-	 arg0 = get_reg(c, src.File, src.Index);
+      arg0 = get_src_reg(c, inst, argIndex);
 
       arg0 = brw_swizzle(arg0, 
 			 src_swz[0], src_swz[1], 
@@ -862,8 +1048,8 @@ static void emit_swz( struct brw_vs_compile *c,
    if (ones_mask) 
       brw_MOV(p, brw_writemask(tmp, ones_mask), brw_imm_f(1));
 
-   if (src.NegateBase)
-      brw_MOV(p, brw_writemask(tmp, src.NegateBase), negate(tmp));
+   if (src.Negate)
+      brw_MOV(p, brw_writemask(tmp, src.Negate), negate(tmp));
    
    if (need_tmp) {
       brw_MOV(p, dst, tmp);
@@ -1039,21 +1225,26 @@ void brw_vs_emit(struct brw_vs_compile *c )
 
    for (insn = 0; insn < nr_insns; insn++) {
 
-      struct prog_instruction *inst = &c->vp->program.Base.Instructions[insn];
+      const struct prog_instruction *inst = &c->vp->program.Base.Instructions[insn];
       struct brw_reg args[3], dst;
       GLuint i;
       
+#if 0
+      printf("%d: ", insn);
+      _mesa_print_instruction(inst);
+#endif
+
       /* Get argument regs.  SWZ is special and does this itself.
        */
       if (inst->Opcode != OPCODE_SWZ)
 	  for (i = 0; i < 3; i++) {
-	      struct prog_src_register *src = &inst->SrcReg[i];
+	      const struct prog_src_register *src = &inst->SrcReg[i];
 	      index = src->Index;
 	      file = src->File;	
 	      if (file == PROGRAM_OUTPUT && c->output_regs[index].used_in_src)
 		  args[i] = c->output_regs[index].reg;
 	      else
-		  args[i] = get_arg(c, src);
+                  args[i] = get_arg(c, inst, i);
 	  }
 
       /* Get dest regs.  Note that it is possible for a reg to be both
@@ -1181,7 +1372,7 @@ void brw_vs_emit(struct brw_vs_compile *c )
 	 /* The args[0] value can't be used here as it won't have
 	  * correctly encoded the full swizzle:
 	  */
-	 emit_swz(c, dst, inst->SrcReg[0] );
+	 emit_swz(c, dst, inst);
 	 break;
       case OPCODE_TRUNC:
          /* round toward zero */
diff --git a/src/mesa/drivers/dri/i965/brw_vs_state.c b/src/mesa/drivers/dri/i965/brw_vs_state.c
index 1a63766ea1f..3d295388437 100644
--- a/src/mesa/drivers/dri/i965/brw_vs_state.c
+++ b/src/mesa/drivers/dri/i965/brw_vs_state.c
@@ -44,6 +44,8 @@ struct brw_vs_unit_key {
    unsigned int curbe_offset;
 
    unsigned int nr_urb_entries, urb_size;
+
+   unsigned int nr_surfaces;
 };
 
 static void
@@ -62,6 +64,9 @@ vs_unit_populate_key(struct brw_context *brw, struct brw_vs_unit_key *key)
    key->nr_urb_entries = brw->urb.nr_vs_entries;
    key->urb_size = brw->urb.vsize;
 
+   /* BRW_NEW_NR_VS_SURFACES */
+   key->nr_surfaces = brw->vs.nr_surfaces;
+
    /* BRW_NEW_CURBE_OFFSETS, _NEW_TRANSFORM */
    if (ctx->Transform.ClipPlanesEnabled) {
       /* Note that we read in the userclip planes as well, hence
@@ -92,6 +97,8 @@ vs_unit_create_from_key(struct brw_context *brw, struct brw_vs_unit_key *key)
     * brw_urb_WRITE() results.
     */
    vs.thread1.single_program_flow = 0;
+   vs.thread1.binding_table_entry_count = key->nr_surfaces;
+
    vs.thread3.urb_entry_read_length = key->urb_entry_read_length;
    vs.thread3.const_urb_entry_read_length = key->curb_entry_read_length;
    vs.thread3.dispatch_grf_start_reg = 1;
@@ -158,6 +165,7 @@ const struct brw_tracked_state brw_vs_unit = {
    .dirty = {
       .mesa  = _NEW_TRANSFORM,
       .brw   = (BRW_NEW_CURBE_OFFSETS |
+                BRW_NEW_NR_VS_SURFACES |
 		BRW_NEW_URB_FENCE),
       .cache = CACHE_NEW_VS_PROG
    },
diff --git a/src/mesa/drivers/dri/i965/brw_vs_surface_state.c b/src/mesa/drivers/dri/i965/brw_vs_surface_state.c
new file mode 100644
index 00000000000..89f47522a1c
--- /dev/null
+++ b/src/mesa/drivers/dri/i965/brw_vs_surface_state.c
@@ -0,0 +1,226 @@
+/*
+ Copyright (C) Intel Corp.  2006.  All Rights Reserved.
+ Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
+ develop this 3D driver.
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice (including the
+ next paragraph) shall be included in all copies or substantial
+ portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+ **********************************************************************/
+ /*
+  * Authors:
+  *   Keith Whitwell <[email protected]>
+  */
+
+#include "main/mtypes.h"
+#include "main/texformat.h"
+#include "main/texstore.h"
+#include "shader/prog_parameter.h"
+
+#include "brw_context.h"
+#include "brw_state.h"
+#include "brw_defines.h"
+
+/* Creates a new VS constant buffer reflecting the current VS program's
+ * constants, if needed by the VS program.
+ *
+ * Otherwise, constants go through the CURBEs using the brw_constant_buffer
+ * state atom.
+ */
+static drm_intel_bo *
+brw_vs_update_constant_buffer(struct brw_context *brw)
+{
+   struct intel_context *intel = &brw->intel;
+   struct brw_vertex_program *vp =
+      (struct brw_vertex_program *) brw->vertex_program;
+   const struct gl_program_parameter_list *params = vp->program.Base.Parameters;
+   const int size = params->NumParameters * 4 * sizeof(GLfloat);
+   drm_intel_bo *const_buffer;
+
+   /* BRW_NEW_VERTEX_PROGRAM */
+   if (!vp->use_const_buffer)
+      return NULL;
+
+   const_buffer = drm_intel_bo_alloc(intel->bufmgr, "vp_const_buffer",
+				     size, 64);
+
+   /* _NEW_PROGRAM_CONSTANTS */
+   dri_bo_subdata(const_buffer, 0, size, params->ParameterValues);
+
+   return const_buffer;
+}
+
+/**
+ * Update the surface state for a VS constant buffer.
+ *
+ * Sets brw->vs.surf_bo[surf] and brw->vp->const_buffer.
+ */
+static void
+brw_update_vs_constant_surface( GLcontext *ctx,
+                                GLuint surf)
+{
+   struct brw_context *brw = brw_context(ctx);
+   struct brw_surface_key key;
+   struct brw_vertex_program *vp =
+      (struct brw_vertex_program *) brw->vertex_program;
+   const struct gl_program_parameter_list *params = vp->program.Base.Parameters;
+
+   assert(surf == 0);
+
+   /* If we're in this state update atom, we need to update VS constants, so
+    * free the old buffer and create a new one for the new contents.
+    */
+   dri_bo_unreference(vp->const_buffer);
+   vp->const_buffer = brw_vs_update_constant_buffer(brw);
+
+   /* If there's no constant buffer, then no surface BO is needed to point at
+    * it.
+    */
+   if (vp->const_buffer == 0) {
+      drm_intel_bo_unreference(brw->vs.surf_bo[surf]);
+      brw->vs.surf_bo[surf] = NULL;
+      return;
+   }
+
+   memset(&key, 0, sizeof(key));
+
+   key.format = MESA_FORMAT_RGBA_FLOAT32;
+   key.internal_format = GL_RGBA;
+   key.bo = vp->const_buffer;
+   key.depthmode = GL_NONE;
+   key.pitch = params->NumParameters;
+   key.width = params->NumParameters;
+   key.height = 1;
+   key.depth = 1;
+   key.cpp = 16;
+
+   /*
+   printf("%s:\n", __FUNCTION__);
+   printf("  width %d  height %d  depth %d  cpp %d  pitch %d\n",
+          key.width, key.height, key.depth, key.cpp, key.pitch);
+   */
+
+   drm_intel_bo_unreference(brw->vs.surf_bo[surf]);
+   brw->vs.surf_bo[surf] = brw_search_cache(&brw->surface_cache,
+                                            BRW_SS_SURFACE,
+                                            &key, sizeof(key),
+                                            &key.bo, key.bo ? 1 : 0,
+                                            NULL);
+   if (brw->vs.surf_bo[surf] == NULL) {
+      brw->vs.surf_bo[surf] = brw_create_constant_surface(brw, &key);
+   }
+}
+
+
+/**
+ * Constructs the binding table for the VS surface state.
+ */
+static dri_bo *
+brw_vs_get_binding_table(struct brw_context *brw)
+{
+   dri_bo *bind_bo;
+
+   bind_bo = brw_search_cache(&brw->surface_cache, BRW_SS_SURF_BIND,
+			      NULL, 0,
+			      brw->vs.surf_bo, BRW_VS_MAX_SURF,
+			      NULL);
+
+   if (bind_bo == NULL) {
+      GLuint data_size = BRW_VS_MAX_SURF * sizeof(GLuint);
+      uint32_t *data = malloc(data_size);
+      int i;
+
+      for (i = 0; i < BRW_VS_MAX_SURF; i++)
+         if (brw->vs.surf_bo[i])
+            data[i] = brw->vs.surf_bo[i]->offset;
+         else
+            data[i] = 0;
+
+      bind_bo = brw_upload_cache( &brw->surface_cache, BRW_SS_SURF_BIND,
+				  NULL, 0,
+				  brw->vs.surf_bo, BRW_VS_MAX_SURF,
+				  data, data_size,
+				  NULL, NULL);
+
+      /* Emit binding table relocations to surface state */
+      for (i = 0; i < BRW_VS_MAX_SURF; i++) {
+	 if (brw->vs.surf_bo[i] != NULL) {
+	    /* The presumed offsets were set in the data values for
+	     * brw_upload_cache.
+	     */
+	    drm_intel_bo_emit_reloc(bind_bo, i * 4,
+				    brw->vs.surf_bo[i], 0,
+				    I915_GEM_DOMAIN_INSTRUCTION, 0);
+	 }
+      }
+
+      free(data);
+   }
+
+   return bind_bo;
+}
+
+/**
+ * Vertex shader surfaces (constant buffer).
+ *
+ * This consumes the state updates for the constant buffer needing
+ * to be updated, and produces BRW_NEW_NR_VS_SURFACES for the VS unit and
+ * CACHE_NEW_SURF_BIND for the binding table upload.
+ */
+static void prepare_vs_surfaces(struct brw_context *brw )
+{
+   GLcontext *ctx = &brw->intel.ctx;
+   int i;
+   int nr_surfaces = 0;
+
+   brw_update_vs_constant_surface(ctx, SURF_INDEX_VERT_CONST_BUFFER);
+
+   for (i = 0; i < BRW_VS_MAX_SURF; i++) {
+      if (brw->vs.surf_bo[i] != NULL) {
+	 nr_surfaces = i + 1;
+      }
+   }
+
+   if (brw->vs.nr_surfaces != nr_surfaces) {
+      brw->state.dirty.brw |= BRW_NEW_NR_VS_SURFACES;
+      brw->vs.nr_surfaces = nr_surfaces;
+   }
+
+   /* Note that we don't end up updating the bind_bo if we don't have a
+    * surface to be pointing at.  This should be relatively harmless, as it
+    * just slightly increases our working set size.
+    */
+   if (brw->vs.nr_surfaces != 0) {
+      dri_bo_unreference(brw->vs.bind_bo);
+      brw->vs.bind_bo = brw_vs_get_binding_table(brw);
+   }
+}
+
+const struct brw_tracked_state brw_vs_surfaces = {
+   .dirty = {
+      .mesa = (_NEW_PROGRAM_CONSTANTS),
+      .brw = (BRW_NEW_VERTEX_PROGRAM),
+      .cache = 0
+   },
+   .prepare = prepare_vs_surfaces,
+};
+
+
+
diff --git a/src/mesa/drivers/dri/i965/brw_vtbl.c b/src/mesa/drivers/dri/i965/brw_vtbl.c
index 960bbb311e3..ba03afd6c13 100644
--- a/src/mesa/drivers/dri/i965/brw_vtbl.c
+++ b/src/mesa/drivers/dri/i965/brw_vtbl.c
@@ -79,6 +79,7 @@ static void brw_destroy_context( struct intel_context *intel )
    dri_bo_release(&brw->curbe.curbe_bo);
    dri_bo_release(&brw->vs.prog_bo);
    dri_bo_release(&brw->vs.state_bo);
+   dri_bo_release(&brw->vs.bind_bo);
    dri_bo_release(&brw->gs.prog_bo);
    dri_bo_release(&brw->gs.state_bo);
    dri_bo_release(&brw->clip.prog_bo);
diff --git a/src/mesa/drivers/dri/i965/brw_wm.c b/src/mesa/drivers/dri/i965/brw_wm.c
index 90d74c2885c..3e476fd3be5 100644
--- a/src/mesa/drivers/dri/i965/brw_wm.c
+++ b/src/mesa/drivers/dri/i965/brw_wm.c
@@ -146,6 +146,13 @@ static void do_wm_prog( struct brw_context *brw,
    if (c == NULL) {
       brw->wm.compile_data = calloc(1, sizeof(*brw->wm.compile_data));
       c = brw->wm.compile_data;
+      if (c == NULL) {
+         /* Ouch - big out of memory problem.  Can't continue
+          * without triggering a segfault, no way to signal,
+          * so just return.
+          */
+         return;
+      }
    } else {
       memset(c, 0, sizeof(*brw->wm.compile_data));
    }
@@ -312,6 +319,9 @@ static void brw_wm_populate_key( struct brw_context *brw,
       key->drawable_height = brw->intel.driDrawable->h;
    }
 
+   /* CACHE_NEW_VS_PROG */
+   key->vp_outputs_written = brw->vs.prog_data->outputs_written & DO_SETUP_BITS;
+
    /* The unique fragment program ID */
    key->program_string_id = fp->id;
 }
@@ -350,7 +360,7 @@ const struct brw_tracked_state brw_wm_prog = {
       .brw   = (BRW_NEW_FRAGMENT_PROGRAM |
 		BRW_NEW_WM_INPUT_DIMENSIONS |
 		BRW_NEW_REDUCED_PRIMITIVE),
-      .cache = 0
+      .cache = CACHE_NEW_VS_PROG,
    },
    .prepare = brw_prepare_wm_prog
 };
diff --git a/src/mesa/drivers/dri/i965/brw_wm.h b/src/mesa/drivers/dri/i965/brw_wm.h
index d0ab3bdc65f..fb15c03e83d 100644
--- a/src/mesa/drivers/dri/i965/brw_wm.h
+++ b/src/mesa/drivers/dri/i965/brw_wm.h
@@ -75,6 +75,7 @@ struct brw_wm_prog_key {
    GLuint program_string_id:32;
    GLuint origin_x, origin_y;
    GLuint drawable_height;
+   GLuint vp_outputs_written;
 };
 
 
@@ -240,22 +241,25 @@ struct brw_wm_compile {
    GLuint max_wm_grf;
    GLuint last_scratch;
 
+   GLuint cur_inst;  /**< index of current instruction */
+
+   GLboolean out_of_regs;  /**< ran out of GRF registers? */
+
    /** Mapping from Mesa registers to hardware registers */
    struct {
       GLboolean inited;
       struct brw_reg reg;
    } wm_regs[PROGRAM_PAYLOAD+1][256][4];
 
+   GLboolean used_grf[BRW_WM_MAX_GRF];
+   GLuint first_free_grf;
    struct brw_reg stack;
    struct brw_reg emit_mask_reg;
-   GLuint reg_index;  /**< Index of next free GRF register */
    GLuint tmp_regs[BRW_WM_MAX_GRF];
    GLuint tmp_index;
    GLuint tmp_max;
    GLuint subroutines[BRW_WM_MAX_SUBROUTINE];
 
-   /** using a real constant buffer? */
-   GLboolean use_const_buffer;
    /** we may need up to 3 constants per instruction (if use_const_buffer) */
    struct {
       GLint index;
diff --git a/src/mesa/drivers/dri/i965/brw_wm_emit.c b/src/mesa/drivers/dri/i965/brw_wm_emit.c
index d65b1332c61..14ab9042de7 100644
--- a/src/mesa/drivers/dri/i965/brw_wm_emit.c
+++ b/src/mesa/drivers/dri/i965/brw_wm_emit.c
@@ -353,6 +353,19 @@ static void emit_mad( struct brw_compile *p,
    }
 }
 
+static void emit_trunc( struct brw_compile *p,
+		      const struct brw_reg *dst,
+		      GLuint mask,
+		      const struct brw_reg *arg0)
+{
+   GLuint i;
+
+   for (i = 0; i < 4; i++) {
+      if (mask & (1<<i)) {
+	 brw_RNDZ(p, dst[i], arg0[i]);
+      }
+   }
+}
 
 static void emit_lrp( struct brw_compile *p, 
 		      const struct brw_reg *dst,
@@ -742,7 +755,7 @@ static void emit_tex( struct brw_wm_compile *c,
 	      retype(vec16(dst[0]), BRW_REGISTER_TYPE_UW),
 	      1,
 	      retype(c->payload.depth[0].hw_reg, BRW_REGISTER_TYPE_UW),
-	      inst->tex_unit + MAX_DRAW_BUFFERS, /* surface */
+              SURF_INDEX_TEXTURE(inst->tex_unit),
 	      inst->tex_unit,	  /* sampler */
 	      inst->writemask,
 	      (inst->tex_shadow ? 
@@ -791,7 +804,7 @@ static void emit_txb( struct brw_wm_compile *c,
 	      retype(vec16(dst[0]), BRW_REGISTER_TYPE_UW),
 	      1,
 	      retype(c->payload.depth[0].hw_reg, BRW_REGISTER_TYPE_UW),
-	      inst->tex_unit + MAX_DRAW_BUFFERS, /* surface */
+              SURF_INDEX_TEXTURE(inst->tex_unit),
 	      inst->tex_unit,	  /* sampler */
 	      inst->writemask,
 	      BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_BIAS,
@@ -1224,6 +1237,10 @@ void brw_wm_emit( struct brw_wm_compile *c )
 	 emit_dph(p, dst, dst_flags, args[0], args[1]);
 	 break;
 
+      case OPCODE_TRUNC:
+	 emit_trunc(p, dst, dst_flags, args[0]);
+	 break;
+
       case OPCODE_LRP:
 	 emit_lrp(p, dst, dst_flags, args[0], args[1], args[2]);
 	 break;
diff --git a/src/mesa/drivers/dri/i965/brw_wm_fp.c b/src/mesa/drivers/dri/i965/brw_wm_fp.c
index a7f5f1b9a28..1798d842c79 100644
--- a/src/mesa/drivers/dri/i965/brw_wm_fp.c
+++ b/src/mesa/drivers/dri/i965/brw_wm_fp.c
@@ -80,9 +80,8 @@ static struct prog_src_register src_reg(GLuint file, GLuint idx)
    reg.Index = idx;
    reg.Swizzle = SWIZZLE_NOOP;
    reg.RelAddr = 0;
-   reg.NegateBase = 0;
+   reg.Negate = NEGATE_NONE;
    reg.Abs = 0;
-   reg.NegateAbs = 0;
    return reg;
 }
 
@@ -569,7 +568,7 @@ static void precalc_dst( struct brw_wm_compile *c,
 		    src_undef(),
 		    src_undef());
       /* Avoid letting negation flag of src0 affect our 1 constant. */
-      swz->SrcReg[0].NegateBase &= ~NEGATE_X;
+      swz->SrcReg[0].Negate &= ~NEGATE_X;
    }
    if (dst.WriteMask & WRITEMASK_W) {
       /* dst.w = mov src1.w
@@ -604,7 +603,7 @@ static void precalc_lit( struct brw_wm_compile *c,
 		    src_undef(),
 		    src_undef());
       /* Avoid letting the negation flag of src0 affect our 1 constant. */
-      swz->SrcReg[0].NegateBase = 0;
+      swz->SrcReg[0].Negate = NEGATE_NONE;
    }
 
    if (dst.WriteMask & WRITEMASK_YZ) {
@@ -651,7 +650,7 @@ static void precalc_tex( struct brw_wm_compile *c,
                      src0,
                      src_undef(),
                      src_undef());
-       out->SrcReg[0].NegateBase = 0;
+       out->SrcReg[0].Negate = NEGATE_NONE;
        out->SrcReg[0].Abs = 1;
 
        /* tmp0 = MAX(coord.X, coord.Y) */
@@ -1050,14 +1049,14 @@ void brw_wm_pass_fp( struct brw_wm_compile *c )
       case OPCODE_ABS:
 	 out = emit_insn(c, inst);
 	 out->Opcode = OPCODE_MOV;
-	 out->SrcReg[0].NegateBase = 0;
+	 out->SrcReg[0].Negate = NEGATE_NONE;
 	 out->SrcReg[0].Abs = 1;
 	 break;
 
       case OPCODE_SUB: 
 	 out = emit_insn(c, inst);
 	 out->Opcode = OPCODE_ADD;
-	 out->SrcReg[1].NegateBase ^= 0xf;
+	 out->SrcReg[1].Negate ^= NEGATE_XYZW;
 	 break;
 
       case OPCODE_SCS: 
diff --git a/src/mesa/drivers/dri/i965/brw_wm_glsl.c b/src/mesa/drivers/dri/i965/brw_wm_glsl.c
index 575cd45d572..0e6a2f8ef09 100644
--- a/src/mesa/drivers/dri/i965/brw_wm_glsl.c
+++ b/src/mesa/drivers/dri/i965/brw_wm_glsl.c
@@ -1,5 +1,7 @@
 #include "main/macros.h"
 #include "shader/prog_parameter.h"
+#include "shader/prog_print.h"
+#include "shader/prog_optimize.h"
 #include "brw_context.h"
 #include "brw_eu.h"
 #include "brw_wm.h"
@@ -21,7 +23,6 @@ GLboolean brw_wm_is_glsl(const struct gl_fragment_program *fp)
 	const struct prog_instruction *inst = &fp->Base.Instructions[i];
 	switch (inst->Opcode) {
 	    case OPCODE_IF:
-	    case OPCODE_TRUNC:
 	    case OPCODE_ENDIF:
 	    case OPCODE_CAL:
 	    case OPCODE_BRK:
@@ -42,6 +43,83 @@ GLboolean brw_wm_is_glsl(const struct gl_fragment_program *fp)
 }
 
 
+
+static void
+reclaim_temps(struct brw_wm_compile *c);
+
+
+/** Mark GRF register as used. */
+static void
+prealloc_grf(struct brw_wm_compile *c, int r)
+{
+   c->used_grf[r] = GL_TRUE;
+}
+
+
+/** Mark given GRF register as not in use. */
+static void
+release_grf(struct brw_wm_compile *c, int r)
+{
+   /*assert(c->used_grf[r]);*/
+   c->used_grf[r] = GL_FALSE;
+   c->first_free_grf = MIN2(c->first_free_grf, r);
+}
+
+
+/** Return index of a free GRF, mark it as used. */
+static int
+alloc_grf(struct brw_wm_compile *c)
+{
+   GLuint r;
+   for (r = c->first_free_grf; r < BRW_WM_MAX_GRF; r++) {
+      if (!c->used_grf[r]) {
+         c->used_grf[r] = GL_TRUE;
+         c->first_free_grf = r + 1;  /* a guess */
+         return r;
+      }
+   }
+
+   /* no free temps, try to reclaim some */
+   reclaim_temps(c);
+   c->first_free_grf = 0;
+
+   /* try alloc again */
+   for (r = c->first_free_grf; r < BRW_WM_MAX_GRF; r++) {
+      if (!c->used_grf[r]) {
+         c->used_grf[r] = GL_TRUE;
+         c->first_free_grf = r + 1;  /* a guess */
+         return r;
+      }
+   }
+
+   for (r = 0; r < BRW_WM_MAX_GRF; r++) {
+      assert(c->used_grf[r]);
+   }
+
+   /* really, no free GRF regs found */
+   if (!c->out_of_regs) {
+      /* print warning once per compilation */
+      _mesa_warning(NULL, "i965: ran out of registers for fragment program");
+      c->out_of_regs = GL_TRUE;
+   }
+
+   return -1;
+}
+
+
+/** Return number of GRF registers used */
+static int
+num_grf_used(const struct brw_wm_compile *c)
+{
+   int r;
+   for (r = BRW_WM_MAX_GRF - 1; r >= 0; r--)
+      if (c->used_grf[r])
+         return r + 1;
+   return 0;
+}
+
+
+
 /**
  * Record the mapping of a Mesa register to a hardware register.
  */
@@ -56,7 +134,7 @@ static void set_reg(struct brw_wm_compile *c, int file, int index,
  * Examine instruction's write mask to find index of first component
  * enabled for writing.
  */
-static int get_scalar_dst_index(struct prog_instruction *inst)
+static int get_scalar_dst_index(const struct prog_instruction *inst)
 {
     int i;
     for (i = 0; i < 4; i++)
@@ -68,11 +146,23 @@ static int get_scalar_dst_index(struct prog_instruction *inst)
 static struct brw_reg alloc_tmp(struct brw_wm_compile *c)
 {
     struct brw_reg reg;
-    if(c->tmp_index == c->tmp_max)
-	c->tmp_regs[ c->tmp_max++ ] = c->reg_index++;
-    
+
+    /* if we need to allocate another temp, grow the tmp_regs[] array */
+    if (c->tmp_index == c->tmp_max) {
+       int r = alloc_grf(c);
+       if (r < 0) {
+          /*printf("Out of temps in %s\n", __FUNCTION__);*/
+          r = 50; /* XXX random register! */
+       }
+       c->tmp_regs[ c->tmp_max++ ] = r;
+    }
+
+    /* form the GRF register */
     reg = brw_vec8_grf(c->tmp_regs[ c->tmp_index++ ], 0);
+    /*printf("alloc_temp %d\n", reg.nr);*/
+    assert(reg.nr < BRW_WM_MAX_GRF);
     return reg;
+
 }
 
 /**
@@ -130,35 +220,29 @@ get_reg(struct brw_wm_compile *c, int file, int index, int component,
 	    return brw_null_reg();
     }
 
+    assert(index < 256);
+    assert(component < 4);
+
     /* see if we've already allocated a HW register for this Mesa register */
     if (c->wm_regs[file][index][component].inited) {
-	/* yes, re-use */
-	reg = c->wm_regs[file][index][component].reg;
+       /* yes, re-use */
+       reg = c->wm_regs[file][index][component].reg;
     }
     else {
 	/* no, allocate new register */
-	reg = brw_vec8_grf(c->reg_index, 0);
-    }
+       int grf = alloc_grf(c);
+       /*printf("alloc grf %d for reg %d:%d.%d\n", grf, file, index, component);*/
+       if (grf < 0) {
+          /* totally out of temps */
+          grf = 51; /* XXX random register! */
+       }
 
-    /* if this is a new register allocation, record it in the table */
-    if (!c->wm_regs[file][index][component].inited) {
-	set_reg(c, file, index, component, reg);
-	c->reg_index++;
-    }
+       reg = brw_vec8_grf(grf, 0);
+       /*printf("Alloc new grf %d for %d.%d\n", reg.nr, index, component);*/
 
-    if (c->reg_index >= BRW_WM_MAX_GRF - 12) {
-	/* ran out of temporary registers! */
-#if 1
-        /* This is a big hack for now.
-         * Return bad register index, just don't hang the GPU.
-         */
-        _mesa_fprintf(stderr, "out of regs %d\n", c->reg_index);
-        c->reg_index = BRW_WM_MAX_GRF - 13;
-#else
-	return brw_null_reg();
-#endif
+       set_reg(c, file, index, component, reg);
     }
- 
+
     if (neg & (1 << component)) {
 	reg = negate(reg);
     }
@@ -168,6 +252,46 @@ get_reg(struct brw_wm_compile *c, int file, int index, int component,
 }
 
 
+
+/**
+ * This is called if we run out of GRF registers.  Examine the live intervals
+ * of temp regs in the program and free those which won't be used again.
+ */
+static void
+reclaim_temps(struct brw_wm_compile *c)
+{
+   GLint intBegin[MAX_PROGRAM_TEMPS];
+   GLint intEnd[MAX_PROGRAM_TEMPS];
+   int index;
+
+   /*printf("Reclaim temps:\n");*/
+
+   _mesa_find_temp_intervals(c->prog_instructions, c->nr_fp_insns,
+                             intBegin, intEnd);
+
+   for (index = 0; index < MAX_PROGRAM_TEMPS; index++) {
+      if (intEnd[index] != -1 && intEnd[index] < c->cur_inst) {
+         /* program temp[i] can be freed */
+         int component;
+         /*printf("  temp[%d] is dead\n", index);*/
+         for (component = 0; component < 4; component++) {
+            if (c->wm_regs[PROGRAM_TEMPORARY][index][component].inited) {
+               int r = c->wm_regs[PROGRAM_TEMPORARY][index][component].reg.nr;
+               release_grf(c, r);
+               /*
+               printf("  Reclaim temp %d, reg %d at inst %d\n",
+                      index, r, c->cur_inst);
+               */
+               c->wm_regs[PROGRAM_TEMPORARY][index][component].inited = GL_FALSE;
+            }
+         }
+      }
+   }
+}
+
+
+
+
 /**
  * Preallocate registers.  This sets up the Mesa to hardware register
  * mapping for certain registers, such as constants (uniforms/state vars)
@@ -177,8 +301,12 @@ static void prealloc_reg(struct brw_wm_compile *c)
 {
     int i, j;
     struct brw_reg reg;
-    int nr_interp_regs = 0;
+    int urb_read_length = 0;
     GLuint inputs = FRAG_BIT_WPOS | c->fp_interp_emitted | c->fp_deriv_emitted;
+    GLuint reg_index = 0;
+
+    memset(c->used_grf, GL_FALSE, sizeof(c->used_grf));
+    c->first_free_grf = 0;
 
     for (i = 0; i < 4; i++) {
         if (i < c->key.nr_depth_regs) 
@@ -187,16 +315,22 @@ static void prealloc_reg(struct brw_wm_compile *c)
             reg = brw_vec8_grf(0, 0);
 	set_reg(c, PROGRAM_PAYLOAD, PAYLOAD_DEPTH, i, reg);
     }
-    c->reg_index += 2 * c->key.nr_depth_regs;
+    reg_index += 2 * c->key.nr_depth_regs;
 
     /* constants */
     {
-        const int nr_params = c->fp->program.Base.Parameters->NumParameters;
+        const GLuint nr_params = c->fp->program.Base.Parameters->NumParameters;
+        const GLuint nr_temps = c->fp->program.Base.NumTemporaries;
 
         /* use a real constant buffer, or just use a section of the GRF? */
-        c->use_const_buffer = GL_FALSE; /* (nr_params > 8);*/
+        /* XXX this heuristic may need adjustment... */
+        if ((nr_params + nr_temps) * 4 + reg_index > 80)
+           c->fp->use_const_buffer = GL_TRUE;
+        else
+           c->fp->use_const_buffer = GL_FALSE;
+        /*printf("WM use_const_buffer = %d\n", c->fp->use_const_buffer);*/
 
-        if (c->use_const_buffer) {
+        if (c->fp->use_const_buffer) {
            /* We'll use a real constant buffer and fetch constants from
             * it with a dataport read message.
             */
@@ -216,7 +350,7 @@ static void prealloc_reg(struct brw_wm_compile *c)
            for (i = 0; i < nr_params; i++) {
               /* loop over XYZW channels */
               for (j = 0; j < 4; j++, index++) {
-                 reg = brw_vec1_grf(c->reg_index + index / 8, index % 8);
+                 reg = brw_vec1_grf(reg_index + index / 8, index % 8);
                  /* Save pointer to parameter/constant value.
                   * Constants will be copied in prepare_constant_buffer()
                   */
@@ -226,42 +360,62 @@ static void prealloc_reg(struct brw_wm_compile *c)
            }
            /* number of constant regs used (each reg is float[8]) */
            c->nr_creg = 2 * ((4 * nr_params + 15) / 16);
-           c->reg_index += c->nr_creg;
+           reg_index += c->nr_creg;
         }
     }
 
     /* fragment shader inputs */
-    for (i = 0; i < FRAG_ATTRIB_MAX; i++) {
-	if (inputs & (1<<i)) {
-	    nr_interp_regs++;
-	    reg = brw_vec8_grf(c->reg_index, 0);
-	    for (j = 0; j < 4; j++)
-		set_reg(c, PROGRAM_PAYLOAD, i, j, reg);
-	    c->reg_index += 2;
-	}
+    for (i = 0; i < VERT_RESULT_MAX; i++) {
+       int fp_input;
+
+       if (i >= VERT_RESULT_VAR0)
+	  fp_input = i - VERT_RESULT_VAR0 + FRAG_ATTRIB_VAR0;
+       else if (i <= VERT_RESULT_TEX7)
+	  fp_input = i;
+       else
+	  fp_input = -1;
+
+       if (fp_input >= 0 && inputs & (1 << fp_input)) {
+	  urb_read_length = reg_index;
+	  reg = brw_vec8_grf(reg_index, 0);
+	  for (j = 0; j < 4; j++)
+	     set_reg(c, PROGRAM_PAYLOAD, fp_input, j, reg);
+       }
+       if (c->key.vp_outputs_written & (1 << i)) {
+	  reg_index += 2;
+       }
     }
 
     c->prog_data.first_curbe_grf = c->key.nr_depth_regs * 2;
-    c->prog_data.urb_read_length = nr_interp_regs * 2;
+    c->prog_data.urb_read_length = urb_read_length;
     c->prog_data.curb_read_length = c->nr_creg;
-    c->emit_mask_reg = brw_uw1_reg(BRW_GENERAL_REGISTER_FILE, c->reg_index, 0);
-    c->reg_index++;
-    c->stack =  brw_uw16_reg(BRW_GENERAL_REGISTER_FILE, c->reg_index, 0);
-    c->reg_index += 2;
+    c->emit_mask_reg = brw_uw1_reg(BRW_GENERAL_REGISTER_FILE, reg_index, 0);
+    reg_index++;
+    c->stack =  brw_uw16_reg(BRW_GENERAL_REGISTER_FILE, reg_index, 0);
+    reg_index += 2;
+
+    /* mark GRF regs [0..reg_index-1] as in-use */
+    for (i = 0; i < reg_index; i++)
+       prealloc_grf(c, i);
+
+    /* Don't use GRF 126, 127.  Using them seems to lead to GPU lock-ups */
+    prealloc_grf(c, 126);
+    prealloc_grf(c, 127);
 
     /* An instruction may reference up to three constants.
      * They'll be found in these registers.
      * XXX alloc these on demand!
      */
-    if (c->use_const_buffer) {
-       c->current_const[0].reg = alloc_tmp(c);
-       c->current_const[1].reg = alloc_tmp(c);
-       c->current_const[2].reg = alloc_tmp(c);
-    }
-    /*
-    printf("USE CONST BUFFER? %d\n", c->use_const_buffer);
-    printf("AFTER PRE_ALLOC, reg_index = %d\n", c->reg_index);
-    */
+    if (c->fp->use_const_buffer) {
+       for (i = 0; i < 3; i++) {
+          c->current_const[i].index = -1;
+          c->current_const[i].reg = brw_vec8_grf(alloc_grf(c), 0);
+       }
+    }
+#if 0
+    printf("USE CONST BUFFER? %d\n", c->fp->use_const_buffer);
+    printf("AFTER PRE_ALLOC, reg_index = %d\n", reg_index);
+#endif
 }
 
 
@@ -282,45 +436,21 @@ static void fetch_constants(struct brw_wm_compile *c,
       if (src->File == PROGRAM_STATE_VAR ||
           src->File == PROGRAM_CONSTANT ||
           src->File == PROGRAM_UNIFORM) {
-         if (c->current_const[i].index != src->Index) {
-
-            c->current_const[i].index = src->Index;
-            /*c->current_const[i].reg = alloc_tmp(c);*/
-
-            /*
-            printf("  fetch const[%d] for arg %d into reg %d\n",
-                   src->Index, i, c->current_const[i].reg.nr);
-            */
-
-            /* need to fetch the constant now */
-            brw_dp_READ_4(p,
-                          c->current_const[i].reg,  /* writeback dest */
-                          1,                        /* msg_reg */
-                          src->RelAddr,             /* relative indexing? */
-                          16 * src->Index,          /* byte offset */
-                          BRW_WM_MAX_SURF - 1       /* binding table index */
-                          );
+	 c->current_const[i].index = src->Index;
 
 #if 0
-            /* dependency stall */
-            {
-               int response_length = 1;
-               int mark = mark_tmps( c );
-               struct brw_reg src = c->current_const[i].reg;
-               struct brw_reg tmp = alloc_tmp(c);
-
-               /*  mov (8) r9.0<1>:f    r9.0<8;8,1>:f    { Align1 }
-                */
-               brw_push_insn_state(p);
-               brw_set_compression_control(p, BRW_COMPRESSION_NONE);
-               brw_MOV(p, tmp, src);	      
-               brw_MOV(p, src, tmp);	      
-               brw_pop_insn_state(p);
-
-               release_tmps( c, mark );
-            }
+	 printf("  fetch const[%d] for arg %d into reg %d\n",
+		src->Index, i, c->current_const[i].reg.nr);
 #endif
-         }
+
+	 /* need to fetch the constant now */
+	 brw_dp_READ_4(p,
+		       c->current_const[i].reg,  /* writeback dest */
+		       1,                        /* msg_reg */
+		       src->RelAddr,             /* relative indexing? */
+		       16 * src->Index,          /* byte offset */
+		       SURF_INDEX_FRAG_CONST_BUFFER/* binding table index */
+		       );
       }
    }
 }
@@ -361,18 +491,18 @@ get_src_reg_const(struct brw_wm_compile *c,
    const_reg = stride(const_reg, 0, 1, 0);
    const_reg.subnr = component * 4;
 
-   if (src->NegateBase)
+   if (src->Negate & (1 << component))
       const_reg = negate(const_reg);
    if (src->Abs)
       const_reg = brw_abs(const_reg);
 
-   /*
-   printf("  form const[%d] for arg %d, comp %d, reg %d\n",
+#if 0
+   printf("  form const[%d].%d for arg %d, reg %d\n",
           c->current_const[srcRegIndex].index,
-          srcRegIndex,
           component,
+          srcRegIndex,
           const_reg.nr);
-   */
+#endif
 
    return const_reg;
 }
@@ -389,7 +519,15 @@ static struct brw_reg get_src_reg(struct brw_wm_compile *c,
     const GLuint nr = 1;
     const GLuint component = GET_SWZ(src->Swizzle, channel);
 
-    if (c->use_const_buffer &&
+    /* Extended swizzle terms */
+    if (component == SWIZZLE_ZERO) {
+       return brw_imm_f(0.0F);
+    }
+    else if (component == SWIZZLE_ONE) {
+       return brw_imm_f(1.0F);
+    }
+
+    if (c->fp->use_const_buffer &&
         (src->File == PROGRAM_STATE_VAR ||
          src->File == PROGRAM_CONSTANT ||
          src->File == PROGRAM_UNIFORM)) {
@@ -398,7 +536,7 @@ static struct brw_reg get_src_reg(struct brw_wm_compile *c,
     else {
        /* other type of source register */
        return get_reg(c, src->File, src->Index, component, nr, 
-                      src->NegateBase, src->Abs);
+                      src->Negate, src->Abs);
     }
 }
 
@@ -423,11 +561,13 @@ static struct brw_reg get_src_reg_imm(struct brw_wm_compile *c,
        const GLfloat *param =
           c->fp->program.Base.Parameters->ParameterValues[src->Index];
        GLfloat value = param[component];
-       if (src->NegateBase)
+       if (src->Negate & (1 << channel))
           value = -value;
        if (src->Abs)
           value = FABSF(value);
-       /*printf("  form imm reg %f\n", value);*/
+#if 0
+       printf("  form immed value %f for chan %d\n", value, channel);
+#endif
        return brw_imm_f(value);
     }
     else {
@@ -501,7 +641,7 @@ static void invoke_subroutine( struct brw_wm_compile *c,
 }
 
 static void emit_abs( struct brw_wm_compile *c,
-		struct prog_instruction *inst)
+                      const struct prog_instruction *inst)
 {
     int i;
     struct brw_compile *p = &c->func;
@@ -518,7 +658,7 @@ static void emit_abs( struct brw_wm_compile *c,
 }
 
 static void emit_trunc( struct brw_wm_compile *c,
-		struct prog_instruction *inst)
+                        const struct prog_instruction *inst)
 {
     int i;
     struct brw_compile *p = &c->func;
@@ -536,7 +676,7 @@ static void emit_trunc( struct brw_wm_compile *c,
 }
 
 static void emit_mov( struct brw_wm_compile *c,
-		struct prog_instruction *inst)
+                      const struct prog_instruction *inst)
 {
     int i;
     struct brw_compile *p = &c->func;
@@ -546,7 +686,9 @@ static void emit_mov( struct brw_wm_compile *c,
 	if (mask & (1<<i)) {
 	    struct brw_reg src, dst;
 	    dst = get_dst_reg(c, inst, i);
-	    src = get_src_reg_imm(c, inst, 0, i);
+            /* XXX some moves from immediate value don't work reliably!!! */
+            /*src = get_src_reg_imm(c, inst, 0, i);*/
+            src = get_src_reg(c, inst, 0, i);
 	    brw_MOV(p, dst, src);
 	}
     }
@@ -554,7 +696,7 @@ static void emit_mov( struct brw_wm_compile *c,
 }
 
 static void emit_pixel_xy(struct brw_wm_compile *c,
-		struct prog_instruction *inst)
+                          const struct prog_instruction *inst)
 {
     struct brw_reg r1 = brw_vec1_grf(1, 0);
     struct brw_reg r1_uw = retype(r1, BRW_REGISTER_TYPE_UW);
@@ -584,7 +726,7 @@ static void emit_pixel_xy(struct brw_wm_compile *c,
 }
 
 static void emit_delta_xy(struct brw_wm_compile *c,
-                          struct prog_instruction *inst)
+                          const struct prog_instruction *inst)
 {
     struct brw_reg r1 = brw_vec1_grf(1, 0);
     struct brw_reg dst0, dst1, src0, src1;
@@ -644,7 +786,7 @@ static void fire_fb_write( struct brw_wm_compile *c,
 }
 
 static void emit_fb_write(struct brw_wm_compile *c,
-		struct prog_instruction *inst)
+                          const struct prog_instruction *inst)
 {
     struct brw_compile *p = &c->func;
     int nr = 2;
@@ -682,27 +824,26 @@ static void emit_fb_write(struct brw_wm_compile *c,
     }
 
     if (c->key.dest_depth_reg) {
-        GLuint comp = c->key.dest_depth_reg / 2;
-        GLuint off = c->key.dest_depth_reg % 2;
+        const GLuint comp = c->key.dest_depth_reg / 2;
+        const GLuint off = c->key.dest_depth_reg % 2;
 
-        assert(comp == 1);
-        assert(off == 0);
-#if 0
-        /* XXX do we need this code?   comp always 1, off always 0, it seems */
         if (off != 0) {
+            /* XXX this code needs review/testing */
+            struct brw_reg arg1_0 = get_src_reg(c, inst, 1, comp);
+            struct brw_reg arg1_1 = get_src_reg(c, inst, 1, comp+1);
+
             brw_push_insn_state(p);
             brw_set_compression_control(p, BRW_COMPRESSION_NONE);
 
-            brw_MOV(p, brw_message_reg(nr), offset(arg1[comp],1));
+            brw_MOV(p, brw_message_reg(nr), offset(arg1_0, 1));
             /* 2nd half? */
-            brw_MOV(p, brw_message_reg(nr+1), arg1[comp+1]);
+            brw_MOV(p, brw_message_reg(nr+1), arg1_1);
             brw_pop_insn_state(p);
         }
         else
-#endif
         {
-           struct brw_reg src =  get_src_reg(c, inst, 1, 1);
-           brw_MOV(p, brw_message_reg(nr), src);
+            struct brw_reg src =  get_src_reg(c, inst, 1, 1);
+            brw_MOV(p, brw_message_reg(nr), src);
         }
         nr += 2;
    }
@@ -713,7 +854,7 @@ static void emit_fb_write(struct brw_wm_compile *c,
 }
 
 static void emit_pixel_w( struct brw_wm_compile *c,
-		struct prog_instruction *inst)
+                          const struct prog_instruction *inst)
 {
     struct brw_compile *p = &c->func;
     GLuint mask = inst->DstReg.WriteMask;
@@ -743,7 +884,7 @@ static void emit_pixel_w( struct brw_wm_compile *c,
 }
 
 static void emit_linterp(struct brw_wm_compile *c,
-		struct prog_instruction *inst)
+                         const struct prog_instruction *inst)
 {
     struct brw_compile *p = &c->func;
     GLuint mask = inst->DstReg.WriteMask;
@@ -772,7 +913,7 @@ static void emit_linterp(struct brw_wm_compile *c,
 }
 
 static void emit_cinterp(struct brw_wm_compile *c,
-		struct prog_instruction *inst)
+                         const struct prog_instruction *inst)
 {
     struct brw_compile *p = &c->func;
     GLuint mask = inst->DstReg.WriteMask;
@@ -798,7 +939,7 @@ static void emit_cinterp(struct brw_wm_compile *c,
 }
 
 static void emit_pinterp(struct brw_wm_compile *c,
-		struct prog_instruction *inst)
+                         const struct prog_instruction *inst)
 {
     struct brw_compile *p = &c->func;
     GLuint mask = inst->DstReg.WriteMask;
@@ -832,7 +973,7 @@ static void emit_pinterp(struct brw_wm_compile *c,
 
 /* Sets the destination channels to 1.0 or 0.0 according to glFrontFacing. */
 static void emit_frontfacing(struct brw_wm_compile *c,
-			     struct prog_instruction *inst)
+			     const struct prog_instruction *inst)
 {
     struct brw_compile *p = &c->func;
     struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD);
@@ -861,7 +1002,7 @@ static void emit_frontfacing(struct brw_wm_compile *c,
 }
 
 static void emit_xpd(struct brw_wm_compile *c,
-		struct prog_instruction *inst)
+                     const struct prog_instruction *inst)
 {
     int i;
     struct brw_compile *p = &c->func;
@@ -886,7 +1027,7 @@ static void emit_xpd(struct brw_wm_compile *c,
 }
 
 static void emit_dp3(struct brw_wm_compile *c,
-		struct prog_instruction *inst)
+                     const struct prog_instruction *inst)
 {
     struct brw_reg src0[3], src1[3], dst;
     int i;
@@ -905,7 +1046,7 @@ static void emit_dp3(struct brw_wm_compile *c,
 }
 
 static void emit_dp4(struct brw_wm_compile *c,
-		struct prog_instruction *inst)
+                     const struct prog_instruction *inst)
 {
     struct brw_reg src0[4], src1[4], dst;
     int i;
@@ -924,7 +1065,7 @@ static void emit_dp4(struct brw_wm_compile *c,
 }
 
 static void emit_dph(struct brw_wm_compile *c,
-		struct prog_instruction *inst)
+                     const struct prog_instruction *inst)
 {
     struct brw_reg src0[4], src1[4], dst;
     int i;
@@ -948,7 +1089,7 @@ static void emit_dph(struct brw_wm_compile *c,
  * register's X, Y, Z and W channels (subject to writemasking of course).
  */
 static void emit_math1(struct brw_wm_compile *c,
-		struct prog_instruction *inst, GLuint func)
+                       const struct prog_instruction *inst, GLuint func)
 {
     struct brw_compile *p = &c->func;
     struct brw_reg src0, dst, tmp;
@@ -985,43 +1126,43 @@ static void emit_math1(struct brw_wm_compile *c,
 }
 
 static void emit_rcp(struct brw_wm_compile *c,
-		struct prog_instruction *inst)
+                     const struct prog_instruction *inst)
 {
     emit_math1(c, inst, BRW_MATH_FUNCTION_INV);
 }
 
 static void emit_rsq(struct brw_wm_compile *c,
-		struct prog_instruction *inst)
+                     const struct prog_instruction *inst)
 {
     emit_math1(c, inst, BRW_MATH_FUNCTION_RSQ);
 }
 
 static void emit_sin(struct brw_wm_compile *c,
-		struct prog_instruction *inst)
+                     const struct prog_instruction *inst)
 {
     emit_math1(c, inst, BRW_MATH_FUNCTION_SIN);
 }
 
 static void emit_cos(struct brw_wm_compile *c,
-		struct prog_instruction *inst)
+                     const struct prog_instruction *inst)
 {
     emit_math1(c, inst, BRW_MATH_FUNCTION_COS);
 }
 
 static void emit_ex2(struct brw_wm_compile *c,
-		struct prog_instruction *inst)
+                     const struct prog_instruction *inst)
 {
     emit_math1(c, inst, BRW_MATH_FUNCTION_EXP);
 }
 
 static void emit_lg2(struct brw_wm_compile *c,
-		struct prog_instruction *inst)
+                     const struct prog_instruction *inst)
 {
     emit_math1(c, inst, BRW_MATH_FUNCTION_LOG);
 }
 
 static void emit_add(struct brw_wm_compile *c,
-		struct prog_instruction *inst)
+                     const struct prog_instruction *inst)
 {
     struct brw_compile *p = &c->func;
     struct brw_reg src0, src1, dst;
@@ -1040,7 +1181,7 @@ static void emit_add(struct brw_wm_compile *c,
 }
 
 static void emit_arl(struct brw_wm_compile *c,
-                     struct prog_instruction *inst)
+                     const struct prog_instruction *inst)
 {
     struct brw_compile *p = &c->func;
     struct brw_reg src0, addr_reg;
@@ -1053,7 +1194,7 @@ static void emit_arl(struct brw_wm_compile *c,
 }
 
 static void emit_sub(struct brw_wm_compile *c,
-		struct prog_instruction *inst)
+                     const struct prog_instruction *inst)
 {
     struct brw_compile *p = &c->func;
     struct brw_reg src0, src1, dst;
@@ -1072,7 +1213,7 @@ static void emit_sub(struct brw_wm_compile *c,
 }
 
 static void emit_mul(struct brw_wm_compile *c,
-		struct prog_instruction *inst)
+                     const struct prog_instruction *inst)
 {
     struct brw_compile *p = &c->func;
     struct brw_reg src0, src1, dst;
@@ -1091,7 +1232,7 @@ static void emit_mul(struct brw_wm_compile *c,
 }
 
 static void emit_frc(struct brw_wm_compile *c,
-		struct prog_instruction *inst)
+                     const struct prog_instruction *inst)
 {
     struct brw_compile *p = &c->func;
     struct brw_reg src0, dst;
@@ -1110,7 +1251,7 @@ static void emit_frc(struct brw_wm_compile *c,
 }
 
 static void emit_flr(struct brw_wm_compile *c,
-		struct prog_instruction *inst)
+                     const struct prog_instruction *inst)
 {
     struct brw_compile *p = &c->func;
     struct brw_reg src0, dst;
@@ -1177,7 +1318,7 @@ static void emit_min_max(struct brw_wm_compile *c,
 }
 
 static void emit_pow(struct brw_wm_compile *c,
-		struct prog_instruction *inst)
+                     const struct prog_instruction *inst)
 {
     struct brw_compile *p = &c->func;
     struct brw_reg dst, src0, src1;
@@ -1199,7 +1340,7 @@ static void emit_pow(struct brw_wm_compile *c,
 }
 
 static void emit_lrp(struct brw_wm_compile *c,
-		struct prog_instruction *inst)
+                     const struct prog_instruction *inst)
 {
     struct brw_compile *p = &c->func;
     GLuint mask = inst->DstReg.WriteMask;
@@ -1252,7 +1393,7 @@ static void emit_kil(struct brw_wm_compile *c)
 }
 
 static void emit_mad(struct brw_wm_compile *c,
-		struct prog_instruction *inst)
+                     const struct prog_instruction *inst)
 {
     struct brw_compile *p = &c->func;
     GLuint mask = inst->DstReg.WriteMask;
@@ -1275,7 +1416,7 @@ static void emit_mad(struct brw_wm_compile *c,
 }
 
 static void emit_sop(struct brw_wm_compile *c,
-		struct prog_instruction *inst, GLuint cond)
+                     const struct prog_instruction *inst, GLuint cond)
 {
     struct brw_compile *p = &c->func;
     GLuint mask = inst->DstReg.WriteMask;
@@ -1299,43 +1440,43 @@ static void emit_sop(struct brw_wm_compile *c,
 }
 
 static void emit_slt(struct brw_wm_compile *c,
-		struct prog_instruction *inst)
+                     const struct prog_instruction *inst)
 {
     emit_sop(c, inst, BRW_CONDITIONAL_L);
 }
 
 static void emit_sle(struct brw_wm_compile *c,
-		struct prog_instruction *inst)
+                     const struct prog_instruction *inst)
 {
     emit_sop(c, inst, BRW_CONDITIONAL_LE);
 }
 
 static void emit_sgt(struct brw_wm_compile *c,
-		struct prog_instruction *inst)
+                     const struct prog_instruction *inst)
 {
     emit_sop(c, inst, BRW_CONDITIONAL_G);
 }
 
 static void emit_sge(struct brw_wm_compile *c,
-		struct prog_instruction *inst)
+                     const struct prog_instruction *inst)
 {
     emit_sop(c, inst, BRW_CONDITIONAL_GE);
 }
 
 static void emit_seq(struct brw_wm_compile *c,
-		struct prog_instruction *inst)
+                     const struct prog_instruction *inst)
 {
     emit_sop(c, inst, BRW_CONDITIONAL_EQ);
 }
 
 static void emit_sne(struct brw_wm_compile *c,
-		struct prog_instruction *inst)
+                     const struct prog_instruction *inst)
 {
     emit_sop(c, inst, BRW_CONDITIONAL_NEQ);
 }
 
 static void emit_ddx(struct brw_wm_compile *c,
-                struct prog_instruction *inst)
+                     const struct prog_instruction *inst)
 {
     struct brw_compile *p = &c->func;
     GLuint mask = inst->DstReg.WriteMask;
@@ -1362,7 +1503,7 @@ static void emit_ddx(struct brw_wm_compile *c,
 }
 
 static void emit_ddy(struct brw_wm_compile *c,
-                struct prog_instruction *inst)
+                     const struct prog_instruction *inst)
 {
     struct brw_compile *p = &c->func;
     GLuint mask = inst->DstReg.WriteMask;
@@ -1505,7 +1646,7 @@ static void noise1_sub( struct brw_wm_compile *c ) {
 }
 
 static void emit_noise1( struct brw_wm_compile *c,
-			 struct prog_instruction *inst )
+			 const struct prog_instruction *inst )
 {
     struct brw_compile *p = &c->func;
     struct brw_reg src, param, dst;
@@ -1675,7 +1816,7 @@ static void noise2_sub( struct brw_wm_compile *c ) {
 }
 
 static void emit_noise2( struct brw_wm_compile *c,
-			 struct prog_instruction *inst )
+			 const struct prog_instruction *inst )
 {
     struct brw_compile *p = &c->func;
     struct brw_reg src0, src1, param0, param1, dst;
@@ -1978,7 +2119,7 @@ static void noise3_sub( struct brw_wm_compile *c ) {
 }
 
 static void emit_noise3( struct brw_wm_compile *c,
-			 struct prog_instruction *inst )
+			 const struct prog_instruction *inst )
 {
     struct brw_compile *p = &c->func;
     struct brw_reg src0, src1, src2, param0, param1, param2, dst;
@@ -2401,7 +2542,7 @@ static void noise4_sub( struct brw_wm_compile *c )
 }
 
 static void emit_noise4( struct brw_wm_compile *c,
-			 struct prog_instruction *inst )
+			 const struct prog_instruction *inst )
 {
     struct brw_compile *p = &c->func;
     struct brw_reg src0, src1, src2, src3, param0, param1, param2, param3, dst;
@@ -2443,7 +2584,7 @@ static void emit_noise4( struct brw_wm_compile *c,
 }
     
 static void emit_wpos_xy(struct brw_wm_compile *c,
-                struct prog_instruction *inst)
+                         const struct prog_instruction *inst)
 {
     struct brw_compile *p = &c->func;
     GLuint mask = inst->DstReg.WriteMask;
@@ -2479,7 +2620,7 @@ static void emit_wpos_xy(struct brw_wm_compile *c,
    BIAS on SIMD8 not working yet...
  */	
 static void emit_txb(struct brw_wm_compile *c,
-		struct prog_instruction *inst)
+                     const struct prog_instruction *inst)
 {
     struct brw_compile *p = &c->func;
     struct brw_reg dst[4], src[4], payload_reg;
@@ -2517,7 +2658,7 @@ static void emit_txb(struct brw_wm_compile *c,
                retype(vec8(dst[0]), BRW_REGISTER_TYPE_UW),  /* dest */
                1,                                           /* msg_reg_nr */
                retype(payload_reg, BRW_REGISTER_TYPE_UW),   /* src0 */
-               unit + MAX_DRAW_BUFFERS,                     /* surface */
+               SURF_INDEX_TEXTURE(unit),
                unit,                                        /* sampler */
                inst->DstReg.WriteMask,                      /* writemask */
                BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_BIAS,      /* msg_type */
@@ -2528,7 +2669,7 @@ static void emit_txb(struct brw_wm_compile *c,
 
 
 static void emit_tex(struct brw_wm_compile *c,
-		struct prog_instruction *inst)
+                     const struct prog_instruction *inst)
 {
     struct brw_compile *p = &c->func;
     struct brw_reg dst[4], src[4], payload_reg;
@@ -2581,7 +2722,7 @@ static void emit_tex(struct brw_wm_compile *c,
                retype(vec8(dst[0]), BRW_REGISTER_TYPE_UW), /* dest */
                1,                                          /* msg_reg_nr */
                retype(payload_reg, BRW_REGISTER_TYPE_UW),  /* src0 */
-               unit + MAX_DRAW_BUFFERS,                    /* surface */
+               SURF_INDEX_TEXTURE(unit),
                unit,                                       /* sampler */
                inst->DstReg.WriteMask,                     /* writemask */
                BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE,           /* msg_type */
@@ -2612,28 +2753,31 @@ static void brw_wm_emit_glsl(struct brw_context *brw, struct brw_wm_compile *c)
     struct brw_compile *p = &c->func;
     struct brw_indirect stack_index = brw_indirect(0, 0);
 
-    c->reg_index = 0;
+    c->out_of_regs = GL_FALSE;
+
     prealloc_reg(c);
     brw_set_compression_control(p, BRW_COMPRESSION_NONE);
     brw_MOV(p, get_addr_reg(stack_index), brw_address(c->stack));
 
     for (i = 0; i < c->nr_fp_insns; i++) {
-	struct prog_instruction *inst = &c->prog_instructions[i];
+        const struct prog_instruction *inst = &c->prog_instructions[i];
 
-	if (inst->CondUpdate)
-	    brw_set_conditionalmod(p, BRW_CONDITIONAL_NZ);
-	else
-	    brw_set_conditionalmod(p, BRW_CONDITIONAL_NONE);
+        c->cur_inst = i;
 
-        /*
+#if 0
         _mesa_printf("Inst %d: ", i);
         _mesa_print_instruction(inst);
-        */
+#endif
 
         /* fetch any constants that this instruction needs */
-        if (c->use_const_buffer)
+        if (c->fp->use_const_buffer)
            fetch_constants(c, inst);
 
+	if (inst->CondUpdate)
+	    brw_set_conditionalmod(p, BRW_CONDITIONAL_NZ);
+	else
+	    brw_set_conditionalmod(p, BRW_CONDITIONAL_NONE);
+
 	switch (inst->Opcode) {
 	    case WM_PIXELXY:
 		emit_pixel_xy(c, inst);
@@ -2819,6 +2963,7 @@ static void brw_wm_emit_glsl(struct brw_context *brw, struct brw_wm_compile *c)
 
 		break;
 	    case OPCODE_BGNLOOP:
+                /* XXX may need to invalidate the current_constant regs */
 		loop_inst[loop_insn++] = brw_DO(p, BRW_EXECUTE_8);
 		break;
 	    case OPCODE_BRK:
@@ -2849,17 +2994,13 @@ static void brw_wm_emit_glsl(struct brw_context *brw, struct brw_wm_compile *c)
 		_mesa_printf("unsupported IR in fragment shader %d\n",
 			inst->Opcode);
 	}
+
 	if (inst->CondUpdate)
 	    brw_set_predicate_control(p, BRW_PREDICATE_NORMAL);
 	else
 	    brw_set_predicate_control(p, BRW_PREDICATE_NONE);
     }
     post_wm_emit(c);
-
-    if (c->reg_index >= BRW_WM_MAX_GRF) {
-        _mesa_problem(NULL, "Ran out of registers in brw_wm_emit_glsl()");
-        /* XXX we need to do some proper error recovery here */
-    }
 }
 
 
@@ -2883,6 +3024,6 @@ void brw_wm_glsl_emit(struct brw_context *brw, struct brw_wm_compile *c)
         brw_wm_print_program(c, "brw_wm_glsl_emit done");
     }
 
-    c->prog_data.total_grf = c->reg_index;
+    c->prog_data.total_grf = num_grf_used(c);
     c->prog_data.total_scratch = 0;
 }
diff --git a/src/mesa/drivers/dri/i965/brw_wm_iz.c b/src/mesa/drivers/dri/i965/brw_wm_iz.c
index bd60ac9b315..8fd067abe7d 100644
--- a/src/mesa/drivers/dri/i965/brw_wm_iz.c
+++ b/src/mesa/drivers/dri/i965/brw_wm_iz.c
@@ -116,6 +116,10 @@ const struct {
  { C, 0, 1, 1, 1 } 
 };
 
+/**
+ * \param line_aa  AA_NEVER, AA_ALWAYS or AA_SOMETIMES
+ * \param lookup  bitmask of IZ_* flags
+ */
 void brw_wm_lookup_iz( GLuint line_aa,
 		       GLuint lookup,
 		       struct brw_wm_prog_key *key )
diff --git a/src/mesa/drivers/dri/i965/brw_wm_pass0.c b/src/mesa/drivers/dri/i965/brw_wm_pass0.c
index 2debd0678a5..92142764f5d 100644
--- a/src/mesa/drivers/dri/i965/brw_wm_pass0.c
+++ b/src/mesa/drivers/dri/i965/brw_wm_pass0.c
@@ -322,7 +322,7 @@ static struct brw_wm_ref *get_new_ref( struct brw_wm_compile *c,
       newref->value->lastuse = newref;
    }
 
-   if (src.NegateBase & (1<<i))
+   if (src.Negate & (1 << i))
       newref->hw_reg.negate ^= 1;
 
    if (src.Abs) {
diff --git a/src/mesa/drivers/dri/i965/brw_wm_pass1.c b/src/mesa/drivers/dri/i965/brw_wm_pass1.c
index ab9aa2f10d0..3436a247170 100644
--- a/src/mesa/drivers/dri/i965/brw_wm_pass1.c
+++ b/src/mesa/drivers/dri/i965/brw_wm_pass1.c
@@ -159,6 +159,7 @@ void brw_wm_pass1( struct brw_wm_compile *c )
       case OPCODE_FRC:
       case OPCODE_MOV:
       case OPCODE_SWZ:
+      case OPCODE_TRUNC:
 	 read0 = writemask;
 	 break;
 
diff --git a/src/mesa/drivers/dri/i965/brw_wm_pass2.c b/src/mesa/drivers/dri/i965/brw_wm_pass2.c
index 780edbc42e6..6faea018fbc 100644
--- a/src/mesa/drivers/dri/i965/brw_wm_pass2.c
+++ b/src/mesa/drivers/dri/i965/brw_wm_pass2.c
@@ -69,8 +69,6 @@ static void prealloc_reg(struct brw_wm_compile *c,
  */
 static void init_registers( struct brw_wm_compile *c )
 {
-   struct brw_context *brw = c->func.brw;
-   GLuint inputs = (brw->vs.prog_data->outputs_written & DO_SETUP_BITS);
    GLuint nr_interp_regs = 0;
    GLuint i = 0;
    GLuint j;
@@ -85,16 +83,19 @@ static void init_registers( struct brw_wm_compile *c )
       prealloc_reg(c, &c->creg[j], i++);
 
    for (j = 0; j < FRAG_ATTRIB_MAX; j++) {
-      if (inputs & (1<<j)) {
-	 /* index for vs output and ps input are not the same 
-	    in shader varying */
-	 GLuint index;
-	 if (j > FRAG_ATTRIB_VAR0)
-	     index = j - (VERT_RESULT_VAR0 - FRAG_ATTRIB_VAR0);
+      if (c->key.vp_outputs_written & (1<<j)) {
+	 int fp_index;
+
+	 if (j >= VERT_RESULT_VAR0)
+	    fp_index = j - (VERT_RESULT_VAR0 - FRAG_ATTRIB_VAR0);
+	 else if (j <= VERT_RESULT_TEX7)
+	    fp_index = j;
 	 else
-	     index = j;
+	    fp_index = -1;
+
 	 nr_interp_regs++;
-	 prealloc_reg(c, &c->payload.input_interp[index], i++);
+	 if (fp_index >= 0)
+	    prealloc_reg(c, &c->payload.input_interp[fp_index], i++);
       }
    }
 
diff --git a/src/mesa/drivers/dri/i965/brw_wm_sampler_state.c b/src/mesa/drivers/dri/i965/brw_wm_sampler_state.c
index 1fc9f013727..3fc18ff1f3a 100644
--- a/src/mesa/drivers/dri/i965/brw_wm_sampler_state.c
+++ b/src/mesa/drivers/dri/i965/brw_wm_sampler_state.c
@@ -152,7 +152,7 @@ static void brw_update_sampler_state(struct wm_sampler_entry *key,
       sampler->ss0.mag_filter = BRW_MAPFILTER_ANISOTROPIC;
 
       if (key->max_aniso > 2.0) {
-	 sampler->ss3.max_aniso = MAX2((key->max_aniso - 2) / 2,
+	 sampler->ss3.max_aniso = MIN2((key->max_aniso - 2) / 2,
 				       BRW_ANISORATIO_16);
       }
    }
@@ -178,6 +178,16 @@ static void brw_update_sampler_state(struct wm_sampler_entry *key,
       sampler->ss1.s_wrap_mode = BRW_TEXCOORDMODE_CUBE;
       sampler->ss1.t_wrap_mode = BRW_TEXCOORDMODE_CUBE;
    }
+   else if (key->tex_target == GL_TEXTURE_1D) {
+      /* There's a bug in 1D texture sampling - it actually pays
+       * attention to the wrap_t value, though it should not.
+       * Override the wrap_t value here to GL_REPEAT to keep
+       * any nonexistent border pixels from floating in.
+       */
+      sampler->ss1.r_wrap_mode = translate_wrap_mode(key->wrap_r);
+      sampler->ss1.s_wrap_mode = translate_wrap_mode(key->wrap_s);
+      sampler->ss1.t_wrap_mode = BRW_TEXCOORDMODE_WRAP;
+   }
    else {
       sampler->ss1.r_wrap_mode = translate_wrap_mode(key->wrap_r);
       sampler->ss1.s_wrap_mode = translate_wrap_mode(key->wrap_s);
diff --git a/src/mesa/drivers/dri/i965/brw_wm_state.c b/src/mesa/drivers/dri/i965/brw_wm_state.c
index 58fa6aaf8f9..67b41173fb2 100644
--- a/src/mesa/drivers/dri/i965/brw_wm_state.c
+++ b/src/mesa/drivers/dri/i965/brw_wm_state.c
@@ -290,7 +290,7 @@ const struct brw_tracked_state brw_wm_unit = {
 
       .brw = (BRW_NEW_FRAGMENT_PROGRAM | 
 	      BRW_NEW_CURBE_OFFSETS |
-	      BRW_NEW_NR_SURFACES),
+	      BRW_NEW_NR_WM_SURFACES),
 
       .cache = (CACHE_NEW_WM_PROG |
 		CACHE_NEW_SAMPLER)
diff --git a/src/mesa/drivers/dri/i965/brw_wm_surface_state.c b/src/mesa/drivers/dri/i965/brw_wm_surface_state.c
index e7d55d5dbd1..c49a5f6b4ec 100644
--- a/src/mesa/drivers/dri/i965/brw_wm_surface_state.c
+++ b/src/mesa/drivers/dri/i965/brw_wm_surface_state.c
@@ -38,7 +38,7 @@
 #include "intel_mipmap_tree.h"
 #include "intel_batchbuffer.h"
 #include "intel_tex.h"
-
+#include "intel_fbo.h"
 
 #include "brw_context.h"
 #include "brw_state.h"
@@ -176,17 +176,6 @@ static GLuint translate_tex_format( GLuint mesa_format, GLenum internal_format,
    }
 }
 
-struct brw_wm_surface_key {
-   GLenum target, depthmode;
-   dri_bo *bo;
-   GLint format, internal_format;
-   GLint first_level, last_level;
-   GLint width, height, depth;
-   GLint pitch, cpp;
-   uint32_t tiling;
-   GLuint offset;
-};
-
 static void
 brw_set_surface_tiling(struct brw_surface_state *surf, uint32_t tiling)
 {
@@ -208,7 +197,7 @@ brw_set_surface_tiling(struct brw_surface_state *surf, uint32_t tiling)
 
 static dri_bo *
 brw_create_texture_surface( struct brw_context *brw,
-			    struct brw_wm_surface_key *key )
+			    struct brw_surface_key *key )
 {
    struct brw_surface_state surf;
    dri_bo *bo;
@@ -263,7 +252,7 @@ brw_create_texture_surface( struct brw_context *brw,
       surf.ss0.cube_neg_z = 1;
    }
 
-   bo = brw_upload_cache(&brw->cache, BRW_SS_SURFACE,
+   bo = brw_upload_cache(&brw->surface_cache, BRW_SS_SURFACE,
 			 key, sizeof(*key),
 			 &key->bo, key->bo ? 1 : 0,
 			 &surf, sizeof(surf),
@@ -287,8 +276,8 @@ brw_update_texture_surface( GLcontext *ctx, GLuint unit )
    struct gl_texture_object *tObj = ctx->Texture.Unit[unit]._Current;
    struct intel_texture_object *intelObj = intel_texture_object(tObj);
    struct gl_texture_image *firstImage = tObj->Image[0][intelObj->firstLevel];
-   struct brw_wm_surface_key key;
-   const GLuint j = MAX_DRAW_BUFFERS + unit;
+   struct brw_surface_key key;
+   const GLuint surf = SURF_INDEX_TEXTURE(unit);
 
    memset(&key, 0, sizeof(key));
 
@@ -315,25 +304,26 @@ brw_update_texture_surface( GLcontext *ctx, GLuint unit )
    key.cpp = intelObj->mt->cpp;
    key.tiling = intelObj->mt->region->tiling;
 
-   dri_bo_unreference(brw->wm.surf_bo[j]);
-   brw->wm.surf_bo[j] = brw_search_cache(&brw->cache, BRW_SS_SURFACE,
-                                         &key, sizeof(key),
-                                         &key.bo, key.bo ? 1 : 0,
-                                         NULL);
-   if (brw->wm.surf_bo[j] == NULL) {
-      brw->wm.surf_bo[j] = brw_create_texture_surface(brw, &key);
+   dri_bo_unreference(brw->wm.surf_bo[surf]);
+   brw->wm.surf_bo[surf] = brw_search_cache(&brw->surface_cache,
+                                            BRW_SS_SURFACE,
+                                            &key, sizeof(key),
+                                            &key.bo, key.bo ? 1 : 0,
+                                            NULL);
+   if (brw->wm.surf_bo[surf] == NULL) {
+      brw->wm.surf_bo[surf] = brw_create_texture_surface(brw, &key);
    }
 }
 
 
 
 /**
- * Create the constant buffer surface.  Fragment shader constanst will be
+ * Create the constant buffer surface.  Vertex/fragment shader constants will be
  * read from this buffer with Data Port Read instructions/messages.
  */
-static dri_bo *
+dri_bo *
 brw_create_constant_surface( struct brw_context *brw,
-                             struct brw_wm_surface_key *key )
+                             struct brw_surface_key *key )
 {
    const GLint w = key->width - 1;
    struct brw_surface_state surf;
@@ -345,8 +335,6 @@ brw_create_constant_surface( struct brw_context *brw,
    surf.ss0.surface_type = BRW_SURFACE_BUFFER;
    surf.ss0.surface_format = BRW_SURFACEFORMAT_R32G32B32A32_FLOAT;
 
-   /* This is ok for all textures with channel width 8bit or less:
-    */
    assert(key->bo);
    if (key->bo)
       surf.ss1.base_addr = key->bo->offset; /* reloc */
@@ -356,10 +344,10 @@ brw_create_constant_surface( struct brw_context *brw,
    surf.ss2.width = w & 0x7f;            /* bits 6:0 of size or width */
    surf.ss2.height = (w >> 7) & 0x1fff;  /* bits 19:7 of size or width */
    surf.ss3.depth = (w >> 20) & 0x7f;    /* bits 26:20 of size or width */
-   surf.ss3.pitch = (key->pitch * key->cpp) - 1;
-   brw_set_surface_tiling(&surf, key->tiling);
+   surf.ss3.pitch = (key->pitch * key->cpp) - 1; /* ignored?? */
+   brw_set_surface_tiling(&surf, key->tiling); /* tiling now allowed */
  
-   bo = brw_upload_cache(&brw->cache, BRW_SS_SURFACE,
+   bo = brw_upload_cache(&brw->surface_cache, BRW_SS_SURFACE,
 			 key, sizeof(*key),
 			 &key->bo, key->bo ? 1 : 0,
 			 &surf, sizeof(surf),
@@ -377,28 +365,73 @@ brw_create_constant_surface( struct brw_context *brw,
    return bo;
 }
 
+/* Creates a new WM constant buffer reflecting the current fragment program's
+ * constants, if needed by the fragment program.
+ *
+ * Otherwise, constants go through the CURBEs using the brw_constant_buffer
+ * state atom.
+ */
+static drm_intel_bo *
+brw_wm_update_constant_buffer(struct brw_context *brw)
+{
+   struct intel_context *intel = &brw->intel;
+   struct brw_fragment_program *fp =
+      (struct brw_fragment_program *) brw->fragment_program;
+   const struct gl_program_parameter_list *params = fp->program.Base.Parameters;
+   const int size = params->NumParameters * 4 * sizeof(GLfloat);
+   drm_intel_bo *const_buffer;
+
+   /* BRW_NEW_FRAGMENT_PROGRAM */
+   if (!fp->use_const_buffer)
+      return NULL;
+
+   const_buffer = drm_intel_bo_alloc(intel->bufmgr, "fp_const_buffer",
+				     size, 64);
+
+   /* _NEW_PROGRAM_CONSTANTS */
+   dri_bo_subdata(const_buffer, 0, size, params->ParameterValues);
+
+   return const_buffer;
+}
 
 /**
- * Update the constant buffer surface.
+ * Update the surface state for a WM constant buffer.
+ * The constant buffer will be (re)allocated here if needed.
  */
 static void
-brw_update_constant_surface( GLcontext *ctx,
-                             const struct brw_fragment_program *fp )
+brw_update_wm_constant_surface( GLcontext *ctx,
+                                GLuint surf)
 {
    struct brw_context *brw = brw_context(ctx);
-   struct brw_wm_surface_key key;
-   const GLuint j = BRW_WM_MAX_SURF - 1;
-   const GLuint numParams = fp->program.Base.Parameters->NumParameters;
+   struct brw_surface_key key;
+   struct brw_fragment_program *fp =
+      (struct brw_fragment_program *) brw->fragment_program;
+   const struct gl_program_parameter_list *params =
+      fp->program.Base.Parameters;
+
+   /* If we're in this state update atom, we need to update WM constants, so
+    * free the old buffer and create a new one for the new contents.
+    */
+   dri_bo_unreference(fp->const_buffer);
+   fp->const_buffer = brw_wm_update_constant_buffer(brw);
+
+   /* If there's no constant buffer, then no surface BO is needed to point at
+    * it.
+    */
+   if (fp->const_buffer == 0) {
+      drm_intel_bo_unreference(brw->wm.surf_bo[surf]);
+      brw->wm.surf_bo[surf] = NULL;
+      return;
+   }
 
    memset(&key, 0, sizeof(key));
 
    key.format = MESA_FORMAT_RGBA_FLOAT32;
    key.internal_format = GL_RGBA;
    key.bo = fp->const_buffer;
-
    key.depthmode = GL_NONE;
-   key.pitch = numParams;
-   key.width = numParams;
+   key.pitch = params->NumParameters;
+   key.width = params->NumParameters;
    key.height = 1;
    key.depth = 1;
    key.cpp = 16;
@@ -409,16 +442,60 @@ brw_update_constant_surface( GLcontext *ctx,
           key.width, key.height, key.depth, key.cpp, key.pitch);
    */
 
-   dri_bo_unreference(brw->wm.surf_bo[j]);
-   brw->wm.surf_bo[j] = brw_search_cache(&brw->cache, BRW_SS_SURFACE,
-                                         &key, sizeof(key),
-                                         &key.bo, key.bo ? 1 : 0,
-                                         NULL);
-   if (brw->wm.surf_bo[j] == NULL) {
-      brw->wm.surf_bo[j] = brw_create_constant_surface(brw, &key);
+   dri_bo_unreference(brw->wm.surf_bo[surf]);
+   brw->wm.surf_bo[surf] = brw_search_cache(&brw->surface_cache,
+                                            BRW_SS_SURFACE,
+                                            &key, sizeof(key),
+                                            &key.bo, key.bo ? 1 : 0,
+                                            NULL);
+   if (brw->wm.surf_bo[surf] == NULL) {
+      brw->wm.surf_bo[surf] = brw_create_constant_surface(brw, &key);
    }
+   brw->state.dirty.brw |= BRW_NEW_WM_SURFACES;
 }
 
+/**
+ * Updates surface / buffer for fragment shader constant buffer, if
+ * one is required.
+ *
+ * This consumes the state updates for the constant buffer, and produces
+ * BRW_NEW_WM_SURFACES to get picked up by brw_prepare_wm_surfaces for
+ * inclusion in the binding table.
+ */
+static void prepare_wm_constant_surface(struct brw_context *brw )
+{
+   GLcontext *ctx = &brw->intel.ctx;
+   struct brw_fragment_program *fp =
+      (struct brw_fragment_program *) brw->fragment_program;
+   GLuint surf = SURF_INDEX_FRAG_CONST_BUFFER;
+
+   drm_intel_bo_unreference(fp->const_buffer);
+   fp->const_buffer = brw_wm_update_constant_buffer(brw);
+
+   /* If there's no constant buffer, then no surface BO is needed to point at
+    * it.
+    */
+   if (fp->const_buffer == 0) {
+      if (brw->wm.surf_bo[surf] != NULL) {
+	 drm_intel_bo_unreference(brw->wm.surf_bo[surf]);
+	 brw->wm.surf_bo[surf] = NULL;
+	 brw->state.dirty.brw |= BRW_NEW_WM_SURFACES;
+      }
+      return;
+   }
+
+   brw_update_wm_constant_surface(ctx, surf);
+}
+
+const struct brw_tracked_state brw_wm_constant_surface = {
+   .dirty = {
+      .mesa = (_NEW_PROGRAM_CONSTANTS),
+      .brw = (BRW_NEW_FRAGMENT_PROGRAM),
+      .cache = 0
+   },
+   .prepare = prepare_wm_constant_surface,
+};
+
 
 /**
  * Sets up a surface state structure to point at the given region.
@@ -426,18 +503,22 @@ brw_update_constant_surface( GLcontext *ctx,
  * usable for further buffers when doing ARB_draw_buffer support.
  */
 static void
-brw_update_region_surface(struct brw_context *brw, struct intel_region *region,
-			  unsigned int unit, GLboolean cached)
+brw_update_renderbuffer_surface(struct brw_context *brw,
+				struct gl_renderbuffer *rb,
+				unsigned int unit)
 {
    GLcontext *ctx = &brw->intel.ctx;
    dri_bo *region_bo = NULL;
+   struct intel_renderbuffer *irb = intel_renderbuffer(rb);
+   struct intel_region *region = irb ? irb->region : NULL;
    struct {
       unsigned int surface_type;
       unsigned int surface_format;
-      unsigned int width, height, cpp;
+      unsigned int width, height, pitch, cpp;
       GLubyte color_mask[4];
       GLboolean color_blend;
       uint32_t tiling;
+      uint32_t draw_offset;
    } key;
 
    memset(&key, 0, sizeof(key));
@@ -446,14 +527,29 @@ brw_update_region_surface(struct brw_context *brw, struct intel_region *region,
       region_bo = region->buffer;
 
       key.surface_type = BRW_SURFACE_2D;
-      if (region->cpp == 4)
+      switch (irb->texformat->MesaFormat) {
+      case MESA_FORMAT_ARGB8888:
 	 key.surface_format = BRW_SURFACEFORMAT_B8G8R8A8_UNORM;
-      else
+	 break;
+      case MESA_FORMAT_RGB565:
 	 key.surface_format = BRW_SURFACEFORMAT_B5G6R5_UNORM;
+	 break;
+      case MESA_FORMAT_ARGB1555:
+	 key.surface_format = BRW_SURFACEFORMAT_B5G5R5A1_UNORM;
+	 break;
+      case MESA_FORMAT_ARGB4444:
+	 key.surface_format = BRW_SURFACEFORMAT_B4G4R4A4_UNORM;
+	 break;
+      default:
+	 _mesa_problem(ctx, "Bad renderbuffer format: %d\n",
+		       irb->texformat->MesaFormat);
+      }
       key.tiling = region->tiling;
-      key.width = region->pitch; /* XXX: not really! */
+      key.width = region->width;
       key.height = region->height;
+      key.pitch = region->pitch;
       key.cpp = region->cpp;
+      key.draw_offset = region->draw_offset; /* cur 3d or cube face offset */
    } else {
       key.surface_type = BRW_SURFACE_NULL;
       key.surface_format = BRW_SURFACEFORMAT_B8G8R8A8_UNORM;
@@ -461,6 +557,7 @@ brw_update_region_surface(struct brw_context *brw, struct intel_region *region,
       key.width = 1;
       key.height = 1;
       key.cpp = 4;
+      key.draw_offset = 0;
    }
    memcpy(key.color_mask, ctx->Color.ColorMask,
 	  sizeof(key.color_mask));
@@ -468,12 +565,11 @@ brw_update_region_surface(struct brw_context *brw, struct intel_region *region,
 		      ctx->Color.BlendEnabled);
 
    dri_bo_unreference(brw->wm.surf_bo[unit]);
-   brw->wm.surf_bo[unit] = NULL;
-   if (cached) 
-       brw->wm.surf_bo[unit] = brw_search_cache(&brw->cache, BRW_SS_SURFACE,
-	       &key, sizeof(key),
-	       &region_bo, 1,
-	       NULL);
+   brw->wm.surf_bo[unit] = brw_search_cache(&brw->surface_cache,
+					    BRW_SS_SURFACE,
+					    &key, sizeof(key),
+					    &region_bo, 1,
+					    NULL);
 
    if (brw->wm.surf_bo[unit] == NULL) {
       struct brw_surface_state surf;
@@ -482,13 +578,14 @@ brw_update_region_surface(struct brw_context *brw, struct intel_region *region,
 
       surf.ss0.surface_format = key.surface_format;
       surf.ss0.surface_type = key.surface_type;
+      surf.ss1.base_addr =  key.draw_offset;
       if (region_bo != NULL)
-	 surf.ss1.base_addr = region_bo->offset; /* reloc */
+	 surf.ss1.base_addr += region_bo->offset; /* reloc */
 
       surf.ss2.width = key.width - 1;
       surf.ss2.height = key.height - 1;
       brw_set_surface_tiling(&surf, key.tiling);
-      surf.ss3.pitch = (key.width * key.cpp) - 1;
+      surf.ss3.pitch = (key.pitch * key.cpp) - 1;
 
       /* _NEW_COLOR */
       surf.ss0.color_blend = key.color_blend;
@@ -498,8 +595,9 @@ brw_update_region_surface(struct brw_context *brw, struct intel_region *region,
       surf.ss0.writedisable_alpha = !key.color_mask[3];
 
       /* Key size will never match key size for textures, so we're safe. */
-      brw->wm.surf_bo[unit] = brw_upload_cache(&brw->cache, BRW_SS_SURFACE,
-					      &key, sizeof(key),
+      brw->wm.surf_bo[unit] = brw_upload_cache(&brw->surface_cache,
+                                               BRW_SS_SURFACE,
+                                               &key, sizeof(key),
 					       &region_bo, 1,
 					       &surf, sizeof(surf),
 					       NULL, NULL);
@@ -508,12 +606,12 @@ brw_update_region_surface(struct brw_context *brw, struct intel_region *region,
 	  * them both.  We might be able to figure out from other state
 	  * a more restrictive relocation to emit.
 	  */
-	 dri_bo_emit_reloc(brw->wm.surf_bo[unit],
-			   I915_GEM_DOMAIN_RENDER,
-			   I915_GEM_DOMAIN_RENDER,
-			   0,
-			   offsetof(struct brw_surface_state, ss1),
-			   region_bo);
+	 drm_intel_bo_emit_reloc(brw->wm.surf_bo[unit],
+				 offsetof(struct brw_surface_state, ss1),
+				 region_bo,
+				 key.draw_offset,
+				 I915_GEM_DOMAIN_RENDER,
+				 I915_GEM_DOMAIN_RENDER);
       }
    }
 }
@@ -528,7 +626,9 @@ brw_wm_get_binding_table(struct brw_context *brw)
 {
    dri_bo *bind_bo;
 
-   bind_bo = brw_search_cache(&brw->cache, BRW_SS_SURF_BIND,
+   assert(brw->wm.nr_surfaces <= BRW_WM_MAX_SURF);
+
+   bind_bo = brw_search_cache(&brw->surface_cache, BRW_SS_SURF_BIND,
 			      NULL, 0,
 			      brw->wm.surf_bo, brw->wm.nr_surfaces,
 			      NULL);
@@ -544,7 +644,7 @@ brw_wm_get_binding_table(struct brw_context *brw)
          else
             data[i] = 0;
 
-      bind_bo = brw_upload_cache( &brw->cache, BRW_SS_SURF_BIND,
+      bind_bo = brw_upload_cache( &brw->surface_cache, BRW_SS_SURF_BIND,
 				  NULL, 0,
 				  brw->wm.surf_bo, brw->wm.nr_surfaces,
 				  data, data_size,
@@ -574,66 +674,62 @@ static void prepare_wm_surfaces(struct brw_context *brw )
    GLuint i;
    int old_nr_surfaces;
 
+   /* _NEW_BUFFERS */
    /* Update surfaces for drawing buffers */
-   if (brw->state.nr_color_regions > 1) {
-      for (i = 0; i < brw->state.nr_color_regions; i++) {
-         brw_update_region_surface(brw, brw->state.color_regions[i], i,
-				   GL_FALSE);
+   if (ctx->DrawBuffer->_NumColorDrawBuffers >= 1) {
+      for (i = 0; i < ctx->DrawBuffer->_NumColorDrawBuffers; i++) {
+         brw_update_renderbuffer_surface(brw,
+					 ctx->DrawBuffer->_ColorDrawBuffers[i],
+					 i);
       }
    } else {
-      brw_update_region_surface(brw, brw->state.color_regions[0], 0, GL_TRUE);
+      brw_update_renderbuffer_surface(brw, NULL, 0);
    }
 
    old_nr_surfaces = brw->wm.nr_surfaces;
    brw->wm.nr_surfaces = MAX_DRAW_BUFFERS;
 
+   if (brw->wm.surf_bo[SURF_INDEX_FRAG_CONST_BUFFER] != NULL)
+       brw->wm.nr_surfaces = SURF_INDEX_FRAG_CONST_BUFFER + 1;
+
    /* Update surfaces for textures */
    for (i = 0; i < BRW_MAX_TEX_UNIT; i++) {
       const struct gl_texture_unit *texUnit = &ctx->Texture.Unit[i];
-      const GLuint j = MAX_DRAW_BUFFERS + i;
+      const GLuint surf = SURF_INDEX_TEXTURE(i);
 
       /* _NEW_TEXTURE, BRW_NEW_TEXDATA */
       if (texUnit->_ReallyEnabled) {
          if (texUnit->_Current == intel->frame_buffer_texobj) {
             /* render to texture */
-            dri_bo_unreference(brw->wm.surf_bo[j]);
-            brw->wm.surf_bo[j] = brw->wm.surf_bo[0];
-            dri_bo_reference(brw->wm.surf_bo[j]);
-            brw->wm.nr_surfaces = j + 1;
+            dri_bo_unreference(brw->wm.surf_bo[surf]);
+            brw->wm.surf_bo[surf] = brw->wm.surf_bo[0];
+            dri_bo_reference(brw->wm.surf_bo[surf]);
+            brw->wm.nr_surfaces = surf + 1;
          } else {
             /* regular texture */
             brw_update_texture_surface(ctx, i);
-            brw->wm.nr_surfaces = j + 1;
+            brw->wm.nr_surfaces = surf + 1;
          }
       } else {
-         dri_bo_unreference(brw->wm.surf_bo[j]);
-         brw->wm.surf_bo[j] = NULL;
+         dri_bo_unreference(brw->wm.surf_bo[surf]);
+         brw->wm.surf_bo[surf] = NULL;
       }
    }
 
-   /* Update surface for fragment shader constant buffer */
-   {
-      const GLuint j = BRW_WM_MAX_SURF - 1;
-      const struct brw_fragment_program *fp =
-         brw_fragment_program_const(brw->fragment_program);
-
-      brw_update_constant_surface(ctx, fp);
-      brw->wm.nr_surfaces = j + 1;
-   }
-
-
    dri_bo_unreference(brw->wm.bind_bo);
    brw->wm.bind_bo = brw_wm_get_binding_table(brw);
 
    if (brw->wm.nr_surfaces != old_nr_surfaces)
-      brw->state.dirty.brw |= BRW_NEW_NR_SURFACES;
+      brw->state.dirty.brw |= BRW_NEW_NR_WM_SURFACES;
 }
 
-
 const struct brw_tracked_state brw_wm_surfaces = {
    .dirty = {
-      .mesa = _NEW_COLOR | _NEW_TEXTURE | _NEW_BUFFERS,
-      .brw = BRW_NEW_CONTEXT,
+      .mesa = (_NEW_COLOR |
+               _NEW_TEXTURE |
+               _NEW_BUFFERS),
+      .brw = (BRW_NEW_CONTEXT |
+	      BRW_NEW_WM_SURFACES),
       .cache = 0
    },
    .prepare = prepare_wm_surfaces,
diff --git a/src/mesa/drivers/dri/i965/intel_generatemipmap.c b/src/mesa/drivers/dri/i965/intel_generatemipmap.c
new file mode 120000
index 00000000000..4c6b37ada01
--- /dev/null
+++ b/src/mesa/drivers/dri/i965/intel_generatemipmap.c
@@ -0,0 +1 @@
+../intel/intel_generatemipmap.c
+\ No newline at end of file
diff --git a/src/mesa/drivers/dri/intel/intel_blit.c b/src/mesa/drivers/dri/intel/intel_blit.c
index 4ae9b118a3d..49198281316 100644
--- a/src/mesa/drivers/dri/intel/intel_blit.c
+++ b/src/mesa/drivers/dri/intel/intel_blit.c
@@ -32,6 +32,8 @@
 #include "main/mtypes.h"
 #include "main/context.h"
 #include "main/enums.h"
+#include "main/texformat.h"
+#include "main/colormac.h"
 
 #include "intel_blit.h"
 #include "intel_buffers.h"
@@ -484,10 +486,9 @@ intelClearWithBlit(GLcontext *ctx, GLbitfield mask)
             const GLbitfield bufBit = 1 << buf;
             if ((clearMask & bufBit) && !(bufBit & skipBuffers)) {
                /* OK, clear this renderbuffer */
-               struct intel_region *irb_region =
-		  intel_get_rb_region(fb, buf);
+	       struct intel_renderbuffer *irb = intel_get_renderbuffer(fb, buf);
                dri_bo *write_buffer =
-                  intel_region_buffer(intel, irb_region,
+                  intel_region_buffer(intel, irb->region,
                                       all ? INTEL_WRITE_FULL :
                                       INTEL_WRITE_PART);
 
@@ -495,15 +496,13 @@ intelClearWithBlit(GLcontext *ctx, GLbitfield mask)
                GLint pitch, cpp;
                GLuint BR13, CMD;
 
-               ASSERT(irb_region);
-
-               pitch = irb_region->pitch;
-               cpp = irb_region->cpp;
+               pitch = irb->region->pitch;
+               cpp = irb->region->cpp;
 
                DBG("%s dst:buf(%p)/%d+%d %d,%d sz:%dx%d\n",
                    __FUNCTION__,
-                   irb_region->buffer, (pitch * cpp),
-                   irb_region->draw_offset,
+                   irb->region->buffer, (pitch * cpp),
+                   irb->region->draw_offset,
                    b.x1, b.y1, b.x2 - b.x1, b.y2 - b.y1);
 
 	       BR13 = 0xf0 << 16;
@@ -529,7 +528,7 @@ intelClearWithBlit(GLcontext *ctx, GLbitfield mask)
                }
 
 #ifndef I915
-	       if (irb_region->tiling != I915_TILING_NONE) {
+	       if (irb->region->tiling != I915_TILING_NONE) {
 		  CMD |= XY_DST_TILED;
 		  pitch /= 4;
 	       }
@@ -540,9 +539,36 @@ intelClearWithBlit(GLcontext *ctx, GLbitfield mask)
                   clearVal = clear_depth;
                }
                else {
-                  clearVal = (cpp == 4)
-                     ? intel->ClearColor8888 : intel->ClearColor565;
-               }
+		  uint8_t clear[4];
+		  GLclampf *color = ctx->Color.ClearColor;
+
+		  CLAMPED_FLOAT_TO_UBYTE(clear[0], color[0]);
+		  CLAMPED_FLOAT_TO_UBYTE(clear[1], color[1]);
+		  CLAMPED_FLOAT_TO_UBYTE(clear[2], color[2]);
+		  CLAMPED_FLOAT_TO_UBYTE(clear[3], color[3]);
+
+		  switch (irb->texformat->MesaFormat) {
+		  case MESA_FORMAT_ARGB8888:
+		     clearVal = intel->ClearColor8888;
+		     break;
+		  case MESA_FORMAT_RGB565:
+		     clearVal = intel->ClearColor565;
+		     break;
+		  case MESA_FORMAT_ARGB4444:
+		     clearVal = PACK_COLOR_4444(clear[3], clear[0],
+						clear[1], clear[2]);
+		     break;
+		  case MESA_FORMAT_ARGB1555:
+		     clearVal = PACK_COLOR_1555(clear[3], clear[0],
+						clear[1], clear[2]);
+		     break;
+		  default:
+		     _mesa_problem(ctx, "Unexpected renderbuffer format: %d\n",
+				   irb->texformat->MesaFormat);
+		     clearVal = 0;
+		  }
+	       }
+
                /*
                   _mesa_debug(ctx, "hardware blit clear buf %d rb id %d\n",
                   buf, irb->Base.Name);
@@ -558,7 +584,7 @@ intelClearWithBlit(GLcontext *ctx, GLbitfield mask)
                OUT_BATCH((b.y2 << 16) | b.x2);
                OUT_RELOC(write_buffer,
 			 I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER,
-                         irb_region->draw_offset);
+                         irb->region->draw_offset);
                OUT_BATCH(clearVal);
                ADVANCE_BATCH();
                clearMask &= ~bufBit;    /* turn off bit, for faster loop exit */
diff --git a/src/mesa/drivers/dri/intel/intel_buffer_objects.c b/src/mesa/drivers/dri/intel/intel_buffer_objects.c
index b7c7eeb368f..23ba3b9ef67 100644
--- a/src/mesa/drivers/dri/intel/intel_buffer_objects.c
+++ b/src/mesa/drivers/dri/intel/intel_buffer_objects.c
@@ -35,6 +35,9 @@
 #include "intel_batchbuffer.h"
 #include "intel_regions.h"
 
+static GLboolean
+intel_bufferobj_unmap(GLcontext * ctx,
+                      GLenum target, struct gl_buffer_object *obj);
 
 /** Allocates a new dri_bo to store the data for the buffer object. */
 static void
@@ -100,8 +103,15 @@ intel_bufferobj_free(GLcontext * ctx, struct gl_buffer_object *obj)
    struct intel_buffer_object *intel_obj = intel_buffer_object(obj);
 
    assert(intel_obj);
-   assert(!obj->Pointer); /* Mesa should have unmapped it */
 
+   /* Buffer objects are automatically unmapped when deleting according
+    * to the spec, but Mesa doesn't do UnmapBuffer for us at context destroy
+    * (though it does if you call glDeleteBuffers)
+    */
+   if (obj->Pointer)
+      intel_bufferobj_unmap(ctx, 0, obj);
+
+   _mesa_free(intel_obj->sys_buffer);
    if (intel_obj->region) {
       intel_bufferobj_release_region(intel, intel_obj);
    }
@@ -142,7 +152,23 @@ intel_bufferobj_data(GLcontext * ctx,
       dri_bo_unreference(intel_obj->buffer);
       intel_obj->buffer = NULL;
    }
+   _mesa_free(intel_obj->sys_buffer);
+   intel_obj->sys_buffer = NULL;
+
    if (size != 0) {
+#ifdef I915
+      /* On pre-965, stick VBOs in system memory, as we're always doing swtnl
+       * with their contents anyway.
+       */
+      if (target == GL_ARRAY_BUFFER || target == GL_ELEMENT_ARRAY_BUFFER) {
+	 intel_obj->sys_buffer = _mesa_malloc(size);
+	 if (intel_obj->sys_buffer != NULL) {
+	    if (data != NULL)
+	       memcpy(intel_obj->sys_buffer, data, size);
+	    return;
+	 }
+      }
+#endif
       intel_bufferobj_alloc_buffer(intel, intel_obj);
 
       if (data != NULL)
@@ -172,7 +198,10 @@ intel_bufferobj_subdata(GLcontext * ctx,
    if (intel_obj->region)
       intel_bufferobj_cow(intel, intel_obj);
 
-   dri_bo_subdata(intel_obj->buffer, offset, size, data);
+   if (intel_obj->sys_buffer)
+      memcpy((char *)intel_obj->sys_buffer + offset, data, size);
+   else
+      dri_bo_subdata(intel_obj->buffer, offset, size, data);
 }
 
 
@@ -204,11 +233,16 @@ intel_bufferobj_map(GLcontext * ctx,
 {
    struct intel_context *intel = intel_context(ctx);
    struct intel_buffer_object *intel_obj = intel_buffer_object(obj);
+   GLboolean read_only = (access == GL_READ_ONLY_ARB);
+   GLboolean write_only = (access == GL_WRITE_ONLY_ARB);
 
-   /* XXX: Translate access to flags arg below:
-    */
    assert(intel_obj);
 
+   if (intel_obj->sys_buffer) {
+      obj->Pointer = intel_obj->sys_buffer;
+      return obj->Pointer;
+   }
+
    if (intel_obj->region)
       intel_bufferobj_cow(intel, intel_obj);
 
@@ -217,7 +251,14 @@ intel_bufferobj_map(GLcontext * ctx,
       return NULL;
    }
 
-   dri_bo_map(intel_obj->buffer, GL_TRUE);
+   if (write_only && intel->intelScreen->kernel_exec_fencing) {
+      drm_intel_gem_bo_map_gtt(intel_obj->buffer);
+      intel_obj->mapped_gtt = GL_TRUE;
+   } else {
+      drm_intel_bo_map(intel_obj->buffer, !read_only);
+      intel_obj->mapped_gtt = GL_FALSE;
+   }
+
    obj->Pointer = intel_obj->buffer->virtual;
    return obj->Pointer;
 }
@@ -235,7 +276,11 @@ intel_bufferobj_unmap(GLcontext * ctx,
    assert(intel_obj);
    if (intel_obj->buffer != NULL) {
       assert(obj->Pointer);
-      dri_bo_unmap(intel_obj->buffer);
+      if (intel_obj->mapped_gtt) {
+	 drm_intel_gem_bo_unmap_gtt(intel_obj->buffer);
+      } else {
+	 drm_intel_bo_unmap(intel_obj->buffer);
+      }
       obj->Pointer = NULL;
    }
    return GL_TRUE;
@@ -254,6 +299,18 @@ intel_bufferobj_buffer(struct intel_context *intel,
       }
    }
 
+   if (intel_obj->buffer == NULL) {
+      intel_bufferobj_alloc_buffer(intel, intel_obj);
+      intel_bufferobj_subdata(&intel->ctx,
+			      GL_ARRAY_BUFFER_ARB,
+			      0,
+			      intel_obj->Base.Size,
+			      intel_obj->sys_buffer,
+			      &intel_obj->Base);
+      _mesa_free(intel_obj->sys_buffer);
+      intel_obj->sys_buffer = NULL;
+   }
+
    return intel_obj->buffer;
 }
 
diff --git a/src/mesa/drivers/dri/intel/intel_buffer_objects.h b/src/mesa/drivers/dri/intel/intel_buffer_objects.h
index bf6dbd58f27..04310156319 100644
--- a/src/mesa/drivers/dri/intel/intel_buffer_objects.h
+++ b/src/mesa/drivers/dri/intel/intel_buffer_objects.h
@@ -42,10 +42,13 @@ struct intel_buffer_object
 {
    struct gl_buffer_object Base;
    dri_bo *buffer;     /* the low-level buffer manager's buffer handle */
+   /** System memory buffer data, if not using a BO to store the data. */
+   void *sys_buffer;
 
    struct intel_region *region; /* Is there a zero-copy texture
                                    associated with this (pixel)
                                    buffer object? */
+   GLboolean mapped_gtt;
 };
 
 
diff --git a/src/mesa/drivers/dri/intel/intel_buffers.c b/src/mesa/drivers/dri/intel/intel_buffers.c
index 0929a2c223c..df5c3fc1766 100644
--- a/src/mesa/drivers/dri/intel/intel_buffers.c
+++ b/src/mesa/drivers/dri/intel/intel_buffers.c
@@ -157,7 +157,7 @@ intel_draw_buffer(GLcontext * ctx, struct gl_framebuffer *fb)
    /* Do this here, not core Mesa, since this function is called from
     * many places within the driver.
     */
-   if (ctx->NewState & (_NEW_BUFFERS | _NEW_COLOR | _NEW_PIXEL)) {
+   if (ctx->NewState & _NEW_BUFFERS) {
       /* this updates the DrawBuffer->_NumColorDrawBuffers fields, etc */
       _mesa_update_framebuffer(ctx);
       /* this updates the DrawBuffer's Width/Height if it's a FBO */
@@ -202,6 +202,8 @@ intel_draw_buffer(GLcontext * ctx, struct gl_framebuffer *fb)
 	       intel_batchbuffer_flush(intel->batch);
 	    intel->front_cliprects = GL_TRUE;
 	    colorRegions[0] = intel_get_rb_region(fb, BUFFER_FRONT_LEFT);
+
+	    intel->front_buffer_dirty = GL_TRUE;
 	 }
 	 else {
 	    if (!intel->constant_cliprect && intel->front_cliprects)
@@ -274,7 +276,7 @@ intel_draw_buffer(GLcontext * ctx, struct gl_framebuffer *fb)
       ctx->Driver.Enable(ctx, GL_DEPTH_TEST,
                          (ctx->Depth.Test && fb->Visual.depthBits > 0));
       ctx->Driver.Enable(ctx, GL_STENCIL_TEST,
-                         (ctx->Stencil._Enabled && fb->Visual.stencilBits > 0));
+                         (ctx->Stencil.Enabled && fb->Visual.stencilBits > 0));
    }
    else {
       /* Mesa's Stencil._Enabled field is updated when
@@ -319,6 +321,23 @@ intel_draw_buffer(GLcontext * ctx, struct gl_framebuffer *fb)
 static void
 intelDrawBuffer(GLcontext * ctx, GLenum mode)
 {
+   if ((ctx->DrawBuffer != NULL) && (ctx->DrawBuffer->Name == 0)) {
+      struct intel_context *const intel = intel_context(ctx);
+      const GLboolean was_front_buffer_rendering =
+	intel->is_front_buffer_rendering;
+
+      intel->is_front_buffer_rendering = (mode == GL_FRONT_LEFT)
+	|| (mode == GL_FRONT);
+
+      /* If we weren't front-buffer rendering before but we are now, make sure
+       * that the front-buffer has actually been allocated.
+       */
+      if (!was_front_buffer_rendering && intel->is_front_buffer_rendering) {
+	 intel_update_renderbuffers(intel->driContext,
+				    intel->driContext->driDrawablePriv);
+      }
+   }
+
    intel_draw_buffer(ctx, ctx->DrawBuffer);
 }
 
diff --git a/src/mesa/drivers/dri/intel/intel_clear.c b/src/mesa/drivers/dri/intel/intel_clear.c
index 28281b38615..309ac1923bc 100644
--- a/src/mesa/drivers/dri/intel/intel_clear.c
+++ b/src/mesa/drivers/dri/intel/intel_clear.c
@@ -30,6 +30,7 @@
 #include "main/enums.h"
 #include "main/image.h"
 #include "main/mtypes.h"
+#include "main/arrayobj.h"
 #include "main/attrib.h"
 #include "main/blend.h"
 #include "main/bufferobj.h"
@@ -38,6 +39,7 @@
 #include "main/enable.h"
 #include "main/macros.h"
 #include "main/matrix.h"
+#include "main/polygon.h"
 #include "main/texstate.h"
 #include "main/shaders.h"
 #include "main/stencil.h"
@@ -51,6 +53,7 @@
 #include "intel_clear.h"
 #include "intel_fbo.h"
 #include "intel_pixel.h"
+#include "intel_regions.h"
 
 #define FILE_DEBUG_FLAG DEBUG_BLIT
 
@@ -65,6 +68,45 @@
 			      BUFFER_BIT_COLOR6 |			\
 			      BUFFER_BIT_COLOR7)
 
+
+/**
+ * Per-context one-time init of things for intl_clear_tris().
+ * Basically set up a private array object for vertex/color arrays.
+ */
+static void
+init_clear(GLcontext *ctx)
+{
+   struct intel_context *intel = intel_context(ctx);
+   struct gl_array_object *arraySave = NULL;
+   const GLuint arrayBuffer = ctx->Array.ArrayBufferObj->Name;
+   const GLuint elementBuffer = ctx->Array.ElementArrayBufferObj->Name;
+
+   /* create new array object */
+   intel->clear.arrayObj = _mesa_new_array_object(ctx, ~0);
+
+   /* save current array object, bind new one */
+   _mesa_reference_array_object(ctx, &arraySave, ctx->Array.ArrayObj);
+   _mesa_reference_array_object(ctx, &ctx->Array.ArrayObj, intel->clear.arrayObj);
+
+   /* one-time setup of vertex arrays (pos, color) */
+   _mesa_BindBufferARB(GL_ARRAY_BUFFER_ARB, 0);
+   _mesa_BindBufferARB(GL_ELEMENT_ARRAY_BUFFER_ARB, 0);
+   _mesa_ColorPointer(4, GL_FLOAT, 4 * sizeof(GLfloat), intel->clear.color);
+   _mesa_VertexPointer(3, GL_FLOAT, 3 * sizeof(GLfloat), intel->clear.vertices);
+   _mesa_Enable(GL_COLOR_ARRAY);
+   _mesa_Enable(GL_VERTEX_ARRAY);
+
+   /* restore original array object */
+   _mesa_reference_array_object(ctx, &ctx->Array.ArrayObj, arraySave);
+   _mesa_reference_array_object(ctx, &arraySave, NULL);
+
+   /* restore original buffer objects */
+   _mesa_BindBufferARB(GL_ARRAY_BUFFER_ARB, arrayBuffer);
+   _mesa_BindBufferARB(GL_ELEMENT_ARRAY_BUFFER_ARB, elementBuffer);
+}
+
+
+
 /**
  * Perform glClear where mask contains only color, depth, and/or stencil.
  *
@@ -77,14 +119,16 @@ void
 intel_clear_tris(GLcontext *ctx, GLbitfield mask)
 {
    struct intel_context *intel = intel_context(ctx);
-   GLfloat vertices[4][3];
-   GLfloat color[4][4];
    GLfloat dst_z;
    struct gl_framebuffer *fb = ctx->DrawBuffer;
    int i;
    GLboolean saved_fp_enable = GL_FALSE, saved_vp_enable = GL_FALSE;
    GLuint saved_shader_program = 0;
    unsigned int saved_active_texture;
+   struct gl_array_object *arraySave = NULL;
+
+   if (!intel->clear.arrayObj)
+      init_clear(ctx);
 
    assert((mask & ~(TRI_CLEAR_COLOR_BITS | BUFFER_BIT_DEPTH |
 		    BUFFER_BIT_STENCIL)) == 0);
@@ -93,10 +137,10 @@ intel_clear_tris(GLcontext *ctx, GLbitfield mask)
 		    GL_CURRENT_BIT |
 		    GL_DEPTH_BUFFER_BIT |
 		    GL_ENABLE_BIT |
+		    GL_POLYGON_BIT |
 		    GL_STENCIL_BUFFER_BIT |
 		    GL_TRANSFORM_BIT |
 		    GL_CURRENT_BIT);
-   _mesa_PushClientAttrib(GL_CLIENT_VERTEX_ARRAY_BIT);
    saved_active_texture = ctx->Texture.CurrentUnit;
 
    /* Disable existing GL state we don't want to apply to a clear. */
@@ -114,6 +158,7 @@ intel_clear_tris(GLcontext *ctx, GLbitfield mask)
    _mesa_Disable(GL_CLIP_PLANE3);
    _mesa_Disable(GL_CLIP_PLANE4);
    _mesa_Disable(GL_CLIP_PLANE5);
+   _mesa_PolygonMode(GL_FRONT_AND_BACK, GL_FILL);
    if (ctx->Extensions.ARB_fragment_program && ctx->FragmentProgram.Enabled) {
       saved_fp_enable = GL_TRUE;
       _mesa_Disable(GL_FRAGMENT_PROGRAM_ARB);
@@ -146,13 +191,14 @@ intel_clear_tris(GLcontext *ctx, GLbitfield mask)
       }
    }
 
+   /* save current array object, bind our private one */
+   _mesa_reference_array_object(ctx, &arraySave, ctx->Array.ArrayObj);
+   _mesa_reference_array_object(ctx, &ctx->Array.ArrayObj, intel->clear.arrayObj);
+
    intel_meta_set_passthrough_transform(intel);
 
    for (i = 0; i < 4; i++) {
-      color[i][0] = ctx->Color.ClearColor[0];
-      color[i][1] = ctx->Color.ClearColor[1];
-      color[i][2] = ctx->Color.ClearColor[2];
-      color[i][3] = ctx->Color.ClearColor[3];
+      COPY_4FV(intel->clear.color[i], ctx->Color.ClearColor);
    }
 
    /* convert clear Z from [0,1] to NDC coord in [-1,1] */
@@ -161,23 +207,18 @@ intel_clear_tris(GLcontext *ctx, GLbitfield mask)
    /* Prepare the vertices, which are the same regardless of which buffer we're
     * drawing to.
     */
-   vertices[0][0] = fb->_Xmin;
-   vertices[0][1] = fb->_Ymin;
-   vertices[0][2] = dst_z;
-   vertices[1][0] = fb->_Xmax;
-   vertices[1][1] = fb->_Ymin;
-   vertices[1][2] = dst_z;
-   vertices[2][0] = fb->_Xmax;
-   vertices[2][1] = fb->_Ymax;
-   vertices[2][2] = dst_z;
-   vertices[3][0] = fb->_Xmin;
-   vertices[3][1] = fb->_Ymax;
-   vertices[3][2] = dst_z;
-
-   _mesa_ColorPointer(4, GL_FLOAT, 4 * sizeof(GLfloat), &color);
-   _mesa_VertexPointer(3, GL_FLOAT, 3 * sizeof(GLfloat), &vertices);
-   _mesa_Enable(GL_COLOR_ARRAY);
-   _mesa_Enable(GL_VERTEX_ARRAY);
+   intel->clear.vertices[0][0] = fb->_Xmin;
+   intel->clear.vertices[0][1] = fb->_Ymin;
+   intel->clear.vertices[0][2] = dst_z;
+   intel->clear.vertices[1][0] = fb->_Xmax;
+   intel->clear.vertices[1][1] = fb->_Ymin;
+   intel->clear.vertices[1][2] = dst_z;
+   intel->clear.vertices[2][0] = fb->_Xmax;
+   intel->clear.vertices[2][1] = fb->_Ymax;
+   intel->clear.vertices[2][2] = dst_z;
+   intel->clear.vertices[3][0] = fb->_Xmin;
+   intel->clear.vertices[3][1] = fb->_Ymax;
+   intel->clear.vertices[3][2] = dst_z;
 
    while (mask != 0) {
       GLuint this_mask = 0;
@@ -215,8 +256,10 @@ intel_clear_tris(GLcontext *ctx, GLbitfield mask)
       /* Control writing of the stencil clear value to stencil. */
       if (this_mask & BUFFER_BIT_STENCIL) {
 	 _mesa_Enable(GL_STENCIL_TEST);
-	 _mesa_StencilOp(GL_REPLACE, GL_REPLACE, GL_REPLACE);
-	 _mesa_StencilFuncSeparate(GL_FRONT, GL_ALWAYS, ctx->Stencil.Clear,
+	 _mesa_StencilOpSeparate(GL_FRONT_AND_BACK,
+				 GL_REPLACE, GL_REPLACE, GL_REPLACE);
+	 _mesa_StencilFuncSeparate(GL_FRONT_AND_BACK, GL_ALWAYS,
+				   ctx->Stencil.Clear,
 				   ctx->Stencil.WriteMask[0]);
       } else {
 	 _mesa_Disable(GL_STENCIL_TEST);
@@ -238,8 +281,11 @@ intel_clear_tris(GLcontext *ctx, GLbitfield mask)
    if (saved_shader_program)
       _mesa_UseProgramObjectARB(saved_shader_program);
 
-   _mesa_PopClientAttrib();
    _mesa_PopAttrib();
+
+   /* restore current array object */
+   _mesa_reference_array_object(ctx, &ctx->Array.ArrayObj, arraySave);
+   _mesa_reference_array_object(ctx, &arraySave, NULL);
 }
 
 static const char *buffer_names[] = {
@@ -267,7 +313,6 @@ static const char *buffer_names[] = {
 static void
 intelClear(GLcontext *ctx, GLbitfield mask)
 {
-   struct intel_context *intel = intel_context(ctx);
    const GLuint colorMask = *((GLuint *) & ctx->Color.ColorMask);
    GLbitfield tri_mask = 0;
    GLbitfield blit_mask = 0;
@@ -295,7 +340,7 @@ intelClear(GLcontext *ctx, GLbitfield mask)
          = intel_get_rb_region(fb, BUFFER_STENCIL);
       if (stencilRegion) {
          /* have hw stencil */
-         if (IS_965(intel->intelScreen->deviceID) ||
+         if (stencilRegion->tiling == I915_TILING_Y ||
 	     (ctx->Stencil.WriteMask[0] & 0xff) != 0xff) {
 	    /* We have to use the 3D engine if we're clearing a partial mask
 	     * of the stencil buffer, or if we're on a 965 which has a tiled
@@ -312,9 +357,10 @@ intelClear(GLcontext *ctx, GLbitfield mask)
 
    /* HW depth */
    if (mask & BUFFER_BIT_DEPTH) {
+      const struct intel_region *irb = intel_get_rb_region(fb, BUFFER_DEPTH);
+
       /* clear depth with whatever method is used for stencil (see above) */
-      if (IS_965(intel->intelScreen->deviceID) ||
-	  tri_mask & BUFFER_BIT_STENCIL)
+      if (irb->tiling == I915_TILING_Y || tri_mask & BUFFER_BIT_STENCIL)
          tri_mask |= BUFFER_BIT_DEPTH;
       else
          blit_mask |= BUFFER_BIT_DEPTH;
diff --git a/src/mesa/drivers/dri/intel/intel_context.c b/src/mesa/drivers/dri/intel/intel_context.c
index a664e749360..f88b37d0f3a 100644
--- a/src/mesa/drivers/dri/intel/intel_context.c
+++ b/src/mesa/drivers/dri/intel/intel_context.c
@@ -28,6 +28,7 @@
 
 #include "main/glheader.h"
 #include "main/context.h"
+#include "main/arrayobj.h"
 #include "main/extensions.h"
 #include "main/framebuffer.h"
 #include "main/imports.h"
@@ -173,6 +174,24 @@ intelGetString(GLcontext * ctx, GLenum name)
    }
 }
 
+static unsigned
+intel_bits_per_pixel(const struct intel_renderbuffer *rb)
+{
+   switch (rb->Base._ActualFormat) {
+   case GL_RGB5:
+   case GL_DEPTH_COMPONENT16:
+      return 16;
+   case GL_RGB8:
+   case GL_RGBA8:
+   case GL_DEPTH_COMPONENT24:
+   case GL_DEPTH24_STENCIL8_EXT:
+   case GL_STENCIL_INDEX8_EXT:
+      return 32;
+   default:
+      return 0;
+   }
+}
+
 void
 intel_update_renderbuffers(__DRIcontext *context, __DRIdrawable *drawable)
 {
@@ -180,7 +199,7 @@ intel_update_renderbuffers(__DRIcontext *context, __DRIdrawable *drawable)
    struct intel_renderbuffer *rb;
    struct intel_region *region, *depth_region;
    struct intel_context *intel = context->driverPrivate;
-   __DRIbuffer *buffers;
+   __DRIbuffer *buffers = NULL;
    __DRIscreen *screen;
    int i, count;
    unsigned int attachments[10];
@@ -192,22 +211,63 @@ intel_update_renderbuffers(__DRIcontext *context, __DRIdrawable *drawable)
 
    screen = intel->intelScreen->driScrnPriv;
 
-   i = 0;
-   if (intel_fb->color_rb[0])
-      attachments[i++] = __DRI_BUFFER_FRONT_LEFT;
-   if (intel_fb->color_rb[1])
-      attachments[i++] = __DRI_BUFFER_BACK_LEFT;
-   if (intel_get_renderbuffer(&intel_fb->Base, BUFFER_DEPTH))
-      attachments[i++] = __DRI_BUFFER_DEPTH;
-   if (intel_get_renderbuffer(&intel_fb->Base, BUFFER_STENCIL))
-      attachments[i++] = __DRI_BUFFER_STENCIL;
-
-   buffers = (*screen->dri2.loader->getBuffers)(drawable,
-						&drawable->w,
-						&drawable->h,
-						attachments, i,
-						&count,
-						drawable->loaderPrivate);
+   if (screen->dri2.loader
+       && (screen->dri2.loader->base.version > 2)
+       && (screen->dri2.loader->getBuffersWithFormat != NULL)) {
+      struct intel_renderbuffer *depth_rb;
+      struct intel_renderbuffer *stencil_rb;
+
+      i = 0;
+      if ((intel->is_front_buffer_rendering || !intel_fb->color_rb[1])
+	   && intel_fb->color_rb[0]) {
+	 attachments[i++] = __DRI_BUFFER_FRONT_LEFT;
+	 attachments[i++] = intel_bits_per_pixel(intel_fb->color_rb[0]);
+      }
+
+      if (intel_fb->color_rb[1]) {
+	 attachments[i++] = __DRI_BUFFER_BACK_LEFT;
+	 attachments[i++] = intel_bits_per_pixel(intel_fb->color_rb[1]);
+      }
+
+      depth_rb = intel_get_renderbuffer(&intel_fb->Base, BUFFER_DEPTH);
+      stencil_rb = intel_get_renderbuffer(&intel_fb->Base, BUFFER_STENCIL);
+
+      if ((depth_rb != NULL) && (stencil_rb != NULL)) {
+	 attachments[i++] = __DRI_BUFFER_DEPTH_STENCIL;
+	 attachments[i++] = intel_bits_per_pixel(depth_rb);
+      } else if (depth_rb != NULL) {
+	 attachments[i++] = __DRI_BUFFER_DEPTH;
+	 attachments[i++] = intel_bits_per_pixel(depth_rb);
+      } else if (stencil_rb != NULL) {
+	 attachments[i++] = __DRI_BUFFER_STENCIL;
+	 attachments[i++] = intel_bits_per_pixel(stencil_rb);
+      }
+
+      buffers =
+	 (*screen->dri2.loader->getBuffersWithFormat)(drawable,
+						      &drawable->w,
+						      &drawable->h,
+						      attachments, i / 2,
+						      &count,
+						      drawable->loaderPrivate);
+   } else if (screen->dri2.loader) {
+      i = 0;
+      if (intel_fb->color_rb[0])
+	 attachments[i++] = __DRI_BUFFER_FRONT_LEFT;
+      if (intel_fb->color_rb[1])
+	 attachments[i++] = __DRI_BUFFER_BACK_LEFT;
+      if (intel_get_renderbuffer(&intel_fb->Base, BUFFER_DEPTH))
+	 attachments[i++] = __DRI_BUFFER_DEPTH;
+      if (intel_get_renderbuffer(&intel_fb->Base, BUFFER_STENCIL))
+	 attachments[i++] = __DRI_BUFFER_STENCIL;
+
+      buffers = (*screen->dri2.loader->getBuffers)(drawable,
+						   &drawable->w,
+						   &drawable->h,
+						   attachments, i,
+						   &count,
+						   drawable->loaderPrivate);
+   }
 
    if (buffers == NULL)
       return;
@@ -235,6 +295,11 @@ intel_update_renderbuffers(__DRIcontext *context, __DRIdrawable *drawable)
 	   region_name = "dri2 front buffer";
 	   break;
 
+       case __DRI_BUFFER_FAKE_FRONT_LEFT:
+	   rb = intel_fb->color_rb[0];
+	   region_name = "dri2 fake front buffer";
+	   break;
+
        case __DRI_BUFFER_BACK_LEFT:
 	   rb = intel_fb->color_rb[1];
 	   region_name = "dri2 back buffer";
@@ -245,6 +310,11 @@ intel_update_renderbuffers(__DRIcontext *context, __DRIdrawable *drawable)
 	   region_name = "dri2 depth buffer";
 	   break;
 
+       case __DRI_BUFFER_DEPTH_STENCIL:
+	   rb = intel_get_renderbuffer(&intel_fb->Base, BUFFER_DEPTH);
+	   region_name = "dri2 depth / stencil buffer";
+	   break;
+
        case __DRI_BUFFER_STENCIL:
 	   rb = intel_get_renderbuffer(&intel_fb->Base, BUFFER_STENCIL);
 	   region_name = "dri2 stencil buffer";
@@ -291,6 +361,23 @@ intel_update_renderbuffers(__DRIcontext *context, __DRIdrawable *drawable)
 
        intel_renderbuffer_set_region(rb, region);
        intel_region_release(&region);
+
+       if (buffers[i].attachment == __DRI_BUFFER_DEPTH_STENCIL) {
+	  rb = intel_get_renderbuffer(&intel_fb->Base, BUFFER_STENCIL);
+	  if (rb != NULL) {
+	     struct intel_region *stencil_region = NULL;
+
+	     if (rb->region) {
+		dri_bo_flink(rb->region->buffer, &name);
+		if (name == buffers[i].name)
+		   continue;
+	     }
+
+	     intel_region_reference(&stencil_region, region);
+	     intel_renderbuffer_set_region(rb, stencil_region);
+	     intel_region_release(&stencil_region);
+	  }
+       }
    }
 
    driUpdateFramebufferSize(&intel->ctx, drawable);
@@ -307,7 +394,7 @@ intel_viewport(GLcontext *ctx, GLint x, GLint y, GLsizei w, GLsizei h)
     if (!driContext->driScreenPriv->dri2.enabled)
 	return;
 
-    if (!intel->internal_viewport_call) {
+    if (!intel->internal_viewport_call && ctx->DrawBuffer->Name == 0) {
        intel_update_renderbuffers(driContext, driContext->driDrawablePriv);
        if (driContext->driDrawablePriv != driContext->driReadablePriv)
 	  intel_update_renderbuffers(driContext, driContext->driReadablePriv);
@@ -391,6 +478,27 @@ intel_flush(GLcontext *ctx, GLboolean needs_mi_flush)
 
    if (intel->batch->map != intel->batch->ptr)
       intel_batchbuffer_flush(intel->batch);
+
+   if ((ctx->DrawBuffer->Name == 0) && intel->front_buffer_dirty) {
+      __DRIscreen *const screen = intel->intelScreen->driScrnPriv;
+
+      if (screen->dri2.loader &&
+          (screen->dri2.loader->base.version >= 2)
+	  && (screen->dri2.loader->flushFrontBuffer != NULL)) {
+	 (*screen->dri2.loader->flushFrontBuffer)(intel->driDrawable,
+						  intel->driDrawable->loaderPrivate);
+
+	 /* Only clear the dirty bit if front-buffer rendering is no longer
+	  * enabled.  This is done so that the dirty bit can only be set in
+	  * glDrawBuffer.  Otherwise the dirty bit would have to be set at
+	  * each of N places that do rendering.  This has worse performances,
+	  * but it is much easier to get correct.
+	  */
+	 if (intel->is_front_buffer_rendering) {
+	    intel->front_buffer_dirty = GL_FALSE;
+	 }
+      }
+   }
 }
 
 void
@@ -442,6 +550,9 @@ intelInitDriverFunctions(struct dd_function_table *functions)
    functions->CopyConvolutionFilter2D = _swrast_CopyConvolutionFilter2D;
 
    intelInitTextureFuncs(functions);
+   intelInitTextureImageFuncs(functions);
+   intelInitTextureSubImageFuncs(functions);
+   intelInitTextureCopyImageFuncs(functions);
    intelInitStateFuncs(functions);
    intelInitClearFuncs(functions);
    intelInitBufferFuncs(functions);
@@ -502,8 +613,6 @@ intelInitContext(struct intel_context *intel,
       }
    }
 
-   ctx->Const.MaxTextureMaxAnisotropy = 2.0;
-
    /* This doesn't yet catch all non-conformant rendering, but it's a
     * start.
     */
@@ -544,6 +653,13 @@ intelInitContext(struct intel_context *intel,
    _mesa_init_point(ctx);
 
    ctx->Const.MaxColorAttachments = 4;  /* XXX FBO: review this */
+   if (IS_965(intelScreen->deviceID)) {
+      if (MAX_WIDTH > 8192)
+	 ctx->Const.MaxRenderbufferSize = 8192;
+   } else {
+      if (MAX_WIDTH > 2048)
+	 ctx->Const.MaxRenderbufferSize = 2048;
+   }
 
    /* Initialize the software rasterizer and helper modules. */
    _swrast_CreateContext(ctx);
@@ -611,6 +727,9 @@ intelInitContext(struct intel_context *intel,
    else if (driQueryOptionb(&intel->optionCache, "force_s3tc_enable")) {
       _mesa_enable_extension(ctx, "GL_EXT_texture_compression_s3tc");
    }
+   intel->use_texture_tiling = driQueryOptionb(&intel->optionCache,
+					       "texture_tiling");
+   intel->use_early_z = driQueryOptionb(&intel->optionCache, "early_z");
 
    intel->prim.primitive = ~0;
 
@@ -650,6 +769,9 @@ intelDestroyContext(__DRIcontextPrivate * driContextPriv)
 
       INTEL_FIREVERTICES(intel);
 
+      if (intel->clear.arrayObj)
+         _mesa_delete_array_object(&intel->ctx, intel->clear.arrayObj);
+
       intel->vtbl.destroy(intel);
 
       release_texture_heaps = (intel->ctx.Shared->RefCount == 1);
@@ -669,13 +791,64 @@ intelDestroyContext(__DRIcontextPrivate * driContextPriv)
       intel->prim.vb_bo = NULL;
 
       if (release_texture_heaps) {
-         /* This share group is about to go away, free our private
-          * texture object data.
+         /* Nothing is currently done here to free texture heaps;
+          * but we're not using the texture heap utilities, so I
+          * rather think we shouldn't.  I've taken a look, and can't
+          * find any private texture data hanging around anywhere, but
+          * I'm not yet certain there isn't any at all...
           */
-         if (INTEL_DEBUG & DEBUG_TEXTURE)
+         /* if (INTEL_DEBUG & DEBUG_TEXTURE)
             fprintf(stderr, "do something to free texture heaps\n");
+          */
       }
 
+      /* XXX In intelMakeCurrent() below, the context's static regions are 
+       * referenced inside the frame buffer; it's listed as a hack,
+       * with a comment of "XXX FBO temporary fix-ups!", but
+       * as long as it's there, we should release the regions here.
+       * The do/while loop around the block is used to allow the
+       * "continue" statements inside the block to exit the block,
+       * to avoid many layers of "if" constructs.
+       */
+      do {
+         __DRIdrawablePrivate * driDrawPriv = intel->driDrawable;
+         struct intel_framebuffer *intel_fb;
+         struct intel_renderbuffer *irbDepth, *irbStencil;
+         if (!driDrawPriv) {
+            /* We're already detached from the drawable; exit this block. */
+            continue;
+         }
+         intel_fb = (struct intel_framebuffer *) driDrawPriv->driverPrivate;
+         if (!intel_fb) {
+            /* The frame buffer is already gone; exit this block. */
+            continue;
+         }
+         irbDepth = intel_get_renderbuffer(&intel_fb->Base, BUFFER_DEPTH);
+         irbStencil = intel_get_renderbuffer(&intel_fb->Base, BUFFER_STENCIL);
+
+         /* If the regions of the frame buffer still match the regions
+          * of the context, release them.  If they've changed somehow,
+          * leave them alone.
+          */
+         if (intel_fb->color_rb[0] && intel_fb->color_rb[0]->region == intel->front_region) {
+	    intel_renderbuffer_set_region(intel_fb->color_rb[0], NULL);
+         }
+         if (intel_fb->color_rb[1] && intel_fb->color_rb[1]->region == intel->back_region) {
+	    intel_renderbuffer_set_region(intel_fb->color_rb[1], NULL);
+         }
+
+         if (irbDepth && irbDepth->region == intel->depth_region) {
+	    intel_renderbuffer_set_region(irbDepth, NULL);
+         }
+         /* Usually, the stencil buffer is the same as the depth buffer;
+          * but they're handled separately in MakeCurrent, so we'll
+          * handle them separately here.
+          */
+         if (irbStencil && irbStencil->region == intel->depth_region) {
+	    intel_renderbuffer_set_region(irbStencil, NULL);
+         }
+      } while (0);
+
       intel_region_release(&intel->front_region);
       intel_region_release(&intel->back_region);
       intel_region_release(&intel->depth_region);
@@ -684,6 +857,8 @@ intelDestroyContext(__DRIcontextPrivate * driContextPriv)
 
       /* free the Mesa context */
       _mesa_free_context_data(&intel->ctx);
+
+      
    }
 }
 
@@ -712,7 +887,10 @@ intelMakeCurrent(__DRIcontextPrivate * driContextPriv,
           if (driDrawPriv != driReadPriv)
               intel_update_renderbuffers(driContextPriv, driReadPriv);
       } else {
-          /* XXX FBO temporary fix-ups! */
+          /* XXX FBO temporary fix-ups!  These are released in 
+           * intelDextroyContext(), above.  Changes here should be
+           * reflected there.
+           */
           /* if the renderbuffers don't have regions, init them from the context */
          struct intel_renderbuffer *irbDepth
             = intel_get_renderbuffer(&intel_fb->Base, BUFFER_DEPTH);
diff --git a/src/mesa/drivers/dri/intel/intel_context.h b/src/mesa/drivers/dri/intel/intel_context.h
index d635f3f50dc..7d3c80bb21c 100644
--- a/src/mesa/drivers/dri/intel/intel_context.h
+++ b/src/mesa/drivers/dri/intel/intel_context.h
@@ -48,6 +48,8 @@
 #define DV_PF_555  (1<<8)
 #define DV_PF_565  (2<<8)
 #define DV_PF_8888 (3<<8)
+#define DV_PF_4444 (8<<8)
+#define DV_PF_1555 (9<<8)
 
 struct intel_region;
 struct intel_context;
@@ -159,12 +161,22 @@ struct intel_context
    struct {
       struct gl_fragment_program *bitmap_fp;
       struct gl_vertex_program *passthrough_vp;
+      struct gl_buffer_object *texcoord_vbo;
 
       struct gl_fragment_program *saved_fp;
       GLboolean saved_fp_enable;
       struct gl_vertex_program *saved_vp;
       GLboolean saved_vp_enable;
 
+      struct gl_fragment_program *tex2d_fp;
+
+      GLboolean saved_texcoord_enable;
+      struct gl_buffer_object *saved_array_vbo, *saved_texcoord_vbo;
+      GLenum saved_texcoord_type;
+      GLsizei saved_texcoord_size, saved_texcoord_stride;
+      const void *saved_texcoord_ptr;
+      int saved_active_texture;
+
       GLint saved_vp_x, saved_vp_y;
       GLsizei saved_vp_width, saved_vp_height;
       GLenum saved_matrix_mode;
@@ -213,6 +225,14 @@ struct intel_context
    GLuint ClearColor565;
    GLuint ClearColor8888;
 
+   /* info for intel_clear_tris() */
+   struct
+   {
+      struct gl_array_object *arrayObj;
+      GLfloat vertices[4][3];
+      GLfloat color[4][4];
+   } clear;
+
    /* Offsets of fields within the current vertex:
     */
    GLuint coloroffset;
@@ -262,11 +282,32 @@ struct intel_context
     * flush time while the lock is held.
     */
    GLboolean constant_cliprect;
+
    /**
     * In !constant_cliprect mode, set to true if the front cliprects should be
     * used instead of back.
     */
    GLboolean front_cliprects;
+
+   /**
+    * Set if rendering has occured to the drawable's front buffer.
+    *
+    * This is used in the DRI2 case to detect that glFlush should also copy
+    * the contents of the fake front buffer to the real front buffer.
+    */
+   GLboolean front_buffer_dirty;
+
+   /**
+    * Track whether front-buffer rendering is currently enabled
+    *
+    * A separate flag is used to track this in order to support MRT more
+    * easily.
+    */
+   GLboolean is_front_buffer_rendering;
+
+   GLboolean use_texture_tiling;
+   GLboolean use_early_z;
+
    drm_clip_rect_t fboRect;     /**< cliprect for FBO rendering */
 
    int perf_boxes;
@@ -319,6 +360,7 @@ extern char *__progname;
 
 #define ARRAY_SIZE(x) (sizeof(x) / sizeof(x[0]))
 #define ALIGN(value, alignment)  ((value + alignment - 1) & ~(alignment - 1))
+#define IS_POWER_OF_TWO(val) (((val) & (val - 1)) == 0)
 
 #define INTEL_FIREVERTICES(intel)		\
 do {						\
@@ -520,6 +562,9 @@ void intel_viewport(GLcontext * ctx, GLint x, GLint y,
 void intel_update_renderbuffers(__DRIcontext *context,
 				__DRIdrawable *drawable);
 
+void i915_set_buf_info_for_region(uint32_t *state, struct intel_region *region,
+				  uint32_t buffer_id);
+
 /*======================================================================
  * Inline conversion functions.  
  * These are better-typed than the macros used previously:
@@ -530,4 +575,10 @@ intel_context(GLcontext * ctx)
    return (struct intel_context *) ctx;
 }
 
+static INLINE GLboolean
+is_power_of_two(uint32_t value)
+{
+   return (value & (value - 1)) == 0;
+}
+
 #endif
diff --git a/src/mesa/drivers/dri/intel/intel_decode.c b/src/mesa/drivers/dri/intel/intel_decode.c
index f04638206d5..a9dfe281cbd 100644
--- a/src/mesa/drivers/dri/intel/intel_decode.c
+++ b/src/mesa/drivers/dri/intel/intel_decode.c
@@ -800,6 +800,7 @@ static int
 decode_3d_1d(uint32_t *data, int count, uint32_t hw_offset, int *failures, int i830)
 {
     unsigned int len, i, c, opcode, word, map, sampler, instr;
+    char *format;
 
     struct {
 	uint32_t opcode;
@@ -1001,6 +1002,35 @@ decode_3d_1d(uint32_t *data, int count, uint32_t hw_offset, int *failures, int i
 	    (*failures)++;
 	}
 	return len;
+    case 0x85:
+	len = (data[0] & 0x0000000f) + 2;
+
+	if (len != 2)
+	    fprintf(out, "Bad count in 3DSTATE_DEST_BUFFER_VARIABLES\n");
+	if (count < 2)
+	    BUFFER_FAIL(count, len, "3DSTATE_DEST_BUFFER_VARIABLES");
+
+	instr_out(data, hw_offset, 0,
+		  "3DSTATE_DEST_BUFFER_VARIABLES\n");
+
+	switch ((data[1] >> 8) & 0xf) {
+	case 0x0: format = "g8"; break;
+	case 0x1: format = "x1r5g5b5"; break;
+	case 0x2: format = "r5g6b5"; break;
+	case 0x3: format = "a8r8g8b8"; break;
+	case 0x4: format = "ycrcb_swapy"; break;
+	case 0x5: format = "ycrcb_normal"; break;
+	case 0x6: format = "ycrcb_swapuv"; break;
+	case 0x7: format = "ycrcb_swapuvy"; break;
+	case 0x8: format = "a4r4g4b4"; break;
+	case 0x9: format = "a1r5g5b5"; break;
+	case 0xa: format = "a2r10g10b10"; break;
+	default: format = "BAD"; break;
+	}
+	instr_out(data, hw_offset, 1, "%s format, early Z %sabled\n",
+		  format,
+		  (data[1] & (1 << 31)) ? "en" : "dis");
+	return len;
     }
 
     for (opcode = 0; opcode < sizeof(opcodes_3d_1d) / sizeof(opcodes_3d_1d[0]);
diff --git a/src/mesa/drivers/dri/intel/intel_extensions.c b/src/mesa/drivers/dri/intel/intel_extensions.c
index 9ec1b4ec2f4..1e8b1878abe 100644
--- a/src/mesa/drivers/dri/intel/intel_extensions.c
+++ b/src/mesa/drivers/dri/intel/intel_extensions.c
@@ -48,6 +48,7 @@
 #define need_GL_EXT_point_parameters
 #define need_GL_EXT_secondary_color
 #define need_GL_EXT_stencil_two_side
+#define need_GL_APPLE_vertex_array_object
 #define need_GL_ATI_separate_stencil
 #define need_GL_ATI_envmap_bumpmap
 #define need_GL_NV_point_sprite
@@ -95,6 +96,7 @@ static const struct dri_extension card_extensions[] = {
    { "GL_EXT_texture_lod_bias",           NULL },
    { "GL_3DFX_texture_compression_FXT1",  NULL },
    { "GL_APPLE_client_storage",           NULL },
+   { "GL_APPLE_vertex_array_object",      GL_APPLE_vertex_array_object_functions},
    { "GL_MESA_pack_invert",               NULL },
    { "GL_MESA_ycbcr_texture",             NULL },
    { "GL_NV_blend_square",                NULL },
diff --git a/src/mesa/drivers/dri/intel/intel_fbo.c b/src/mesa/drivers/dri/intel/intel_fbo.c
index a401f730ba2..0ea413aee1d 100644
--- a/src/mesa/drivers/dri/intel/intel_fbo.c
+++ b/src/mesa/drivers/dri/intel/intel_fbo.c
@@ -119,6 +119,7 @@ intel_alloc_renderbuffer_storage(GLcontext * ctx, struct gl_renderbuffer *rb,
       rb->RedBits = 5;
       rb->GreenBits = 6;
       rb->BlueBits = 5;
+      irb->texformat = &_mesa_texformat_rgb565;
       cpp = 2;
       break;
    case GL_RGB:
@@ -132,6 +133,7 @@ intel_alloc_renderbuffer_storage(GLcontext * ctx, struct gl_renderbuffer *rb,
       rb->GreenBits = 8;
       rb->BlueBits = 8;
       rb->AlphaBits = 0;
+      irb->texformat = &_mesa_texformat_argb8888; /* XXX: Need xrgb8888 */
       cpp = 4;
       break;
    case GL_RGBA:
@@ -148,6 +150,7 @@ intel_alloc_renderbuffer_storage(GLcontext * ctx, struct gl_renderbuffer *rb,
       rb->GreenBits = 8;
       rb->BlueBits = 8;
       rb->AlphaBits = 8;
+      irb->texformat = &_mesa_texformat_argb8888;
       cpp = 4;
       break;
    case GL_STENCIL_INDEX:
@@ -160,12 +163,14 @@ intel_alloc_renderbuffer_storage(GLcontext * ctx, struct gl_renderbuffer *rb,
       rb->DataType = GL_UNSIGNED_INT_24_8_EXT;
       rb->StencilBits = 8;
       cpp = 4;
+      irb->texformat = &_mesa_texformat_s8_z24;
       break;
    case GL_DEPTH_COMPONENT16:
       rb->_ActualFormat = GL_DEPTH_COMPONENT16;
       rb->DataType = GL_UNSIGNED_SHORT;
       rb->DepthBits = 16;
       cpp = 2;
+      irb->texformat = &_mesa_texformat_z16;
       break;
    case GL_DEPTH_COMPONENT:
    case GL_DEPTH_COMPONENT24:
@@ -174,6 +179,7 @@ intel_alloc_renderbuffer_storage(GLcontext * ctx, struct gl_renderbuffer *rb,
       rb->DataType = GL_UNSIGNED_INT_24_8_EXT;
       rb->DepthBits = 24;
       cpp = 4;
+      irb->texformat = &_mesa_texformat_s8_z24;
       break;
    case GL_DEPTH_STENCIL_EXT:
    case GL_DEPTH24_STENCIL8_EXT:
@@ -182,6 +188,7 @@ intel_alloc_renderbuffer_storage(GLcontext * ctx, struct gl_renderbuffer *rb,
       rb->DepthBits = 24;
       rb->StencilBits = 8;
       cpp = 4;
+      irb->texformat = &_mesa_texformat_s8_z24;
       break;
    default:
       _mesa_problem(ctx,
@@ -210,7 +217,8 @@ intel_alloc_renderbuffer_storage(GLcontext * ctx, struct gl_renderbuffer *rb,
       DBG("Allocating %d x %d Intel RBO (pitch %d)\n", width,
 	  height, pitch);
 
-      irb->region = intel_region_alloc(intel, cpp, width, height, pitch,
+      irb->region = intel_region_alloc(intel, I915_TILING_NONE,
+				       cpp, width, height, pitch,
 				       GL_TRUE);
       if (!irb->region)
          return GL_FALSE;       /* out of memory? */
@@ -322,6 +330,7 @@ intel_create_renderbuffer(GLenum intFormat)
       irb->Base.GreenBits = 6;
       irb->Base.BlueBits = 5;
       irb->Base.DataType = GL_UNSIGNED_BYTE;
+      irb->texformat = &_mesa_texformat_rgb565;
       break;
    case GL_RGB8:
       irb->Base._ActualFormat = GL_RGB8;
@@ -331,6 +340,7 @@ intel_create_renderbuffer(GLenum intFormat)
       irb->Base.BlueBits = 8;
       irb->Base.AlphaBits = 0;
       irb->Base.DataType = GL_UNSIGNED_BYTE;
+      irb->texformat = &_mesa_texformat_argb8888; /* XXX: Need xrgb8888 */
       break;
    case GL_RGBA8:
       irb->Base._ActualFormat = GL_RGBA8;
@@ -340,24 +350,28 @@ intel_create_renderbuffer(GLenum intFormat)
       irb->Base.BlueBits = 8;
       irb->Base.AlphaBits = 8;
       irb->Base.DataType = GL_UNSIGNED_BYTE;
+      irb->texformat = &_mesa_texformat_argb8888;
       break;
    case GL_STENCIL_INDEX8_EXT:
       irb->Base._ActualFormat = GL_STENCIL_INDEX8_EXT;
       irb->Base._BaseFormat = GL_STENCIL_INDEX;
       irb->Base.StencilBits = 8;
       irb->Base.DataType = GL_UNSIGNED_BYTE;
+      irb->texformat = &_mesa_texformat_s8_z24;
       break;
    case GL_DEPTH_COMPONENT16:
       irb->Base._ActualFormat = GL_DEPTH_COMPONENT16;
       irb->Base._BaseFormat = GL_DEPTH_COMPONENT;
       irb->Base.DepthBits = 16;
       irb->Base.DataType = GL_UNSIGNED_SHORT;
+      irb->texformat = &_mesa_texformat_z16;
       break;
    case GL_DEPTH_COMPONENT24:
       irb->Base._ActualFormat = GL_DEPTH24_STENCIL8_EXT;
       irb->Base._BaseFormat = GL_DEPTH_COMPONENT;
       irb->Base.DepthBits = 24;
       irb->Base.DataType = GL_UNSIGNED_INT;
+      irb->texformat = &_mesa_texformat_s8_z24;
       break;
    case GL_DEPTH24_STENCIL8_EXT:
       irb->Base._ActualFormat = GL_DEPTH24_STENCIL8_EXT;
@@ -365,6 +379,7 @@ intel_create_renderbuffer(GLenum intFormat)
       irb->Base.DepthBits = 24;
       irb->Base.StencilBits = 8;
       irb->Base.DataType = GL_UNSIGNED_INT_24_8_EXT;
+      irb->texformat = &_mesa_texformat_s8_z24;
       break;
    default:
       _mesa_problem(NULL,
@@ -449,6 +464,8 @@ static GLboolean
 intel_update_wrapper(GLcontext *ctx, struct intel_renderbuffer *irb, 
 		     struct gl_texture_image *texImage)
 {
+   irb->texformat = texImage->TexFormat;
+
    if (texImage->TexFormat == &_mesa_texformat_argb8888) {
       irb->Base._ActualFormat = GL_RGBA8;
       irb->Base._BaseFormat = GL_RGBA;
@@ -458,9 +475,21 @@ intel_update_wrapper(GLcontext *ctx, struct intel_renderbuffer *irb,
    else if (texImage->TexFormat == &_mesa_texformat_rgb565) {
       irb->Base._ActualFormat = GL_RGB5;
       irb->Base._BaseFormat = GL_RGB;
-      irb->Base.DataType = GL_UNSIGNED_SHORT;
+      irb->Base.DataType = GL_UNSIGNED_BYTE;
       DBG("Render to RGB5 texture OK\n");
    }
+   else if (texImage->TexFormat == &_mesa_texformat_argb1555) {
+      irb->Base._ActualFormat = GL_RGB5_A1;
+      irb->Base._BaseFormat = GL_RGBA;
+      irb->Base.DataType = GL_UNSIGNED_BYTE;
+      DBG("Render to ARGB1555 texture OK\n");
+   }
+   else if (texImage->TexFormat == &_mesa_texformat_argb4444) {
+      irb->Base._ActualFormat = GL_RGBA4;
+      irb->Base._BaseFormat = GL_RGBA;
+      irb->Base.DataType = GL_UNSIGNED_BYTE;
+      DBG("Render to ARGB4444 texture OK\n");
+   }
    else if (texImage->TexFormat == &_mesa_texformat_z16) {
       irb->Base._ActualFormat = GL_DEPTH_COMPONENT16;
       irb->Base._BaseFormat = GL_DEPTH_COMPONENT;
@@ -546,9 +575,10 @@ intel_render_texture(GLcontext * ctx,
 
    ASSERT(newImage);
 
-   if (newImage->Border != 0) {
-      /* Fallback on drawing to a texture with a border, which won't have a
-       * miptree.
+   intel_image = intel_texture_image(newImage);
+   if (!intel_image->mt) {
+      /* Fallback on drawing to a texture that doesn't have a miptree
+       * (has a border, width/height 0, etc.)
        */
       _mesa_reference_renderbuffer(&att->Renderbuffer, NULL);
       _mesa_render_texture(ctx, fb, att);
@@ -579,7 +609,6 @@ intel_render_texture(GLcontext * ctx,
        irb->Base.RefCount);
 
    /* point the renderbufer's region to the texture image region */
-   intel_image = intel_texture_image(newImage);
    if (irb->region != intel_image->mt->region) {
       if (irb->region)
 	 intel_region_release(&irb->region);
@@ -631,11 +660,11 @@ intel_finish_render_texture(GLcontext * ctx,
 static void
 intel_validate_framebuffer(GLcontext *ctx, struct gl_framebuffer *fb)
 {
-   struct intel_context *intel = intel_context(ctx);
    const struct intel_renderbuffer *depthRb =
       intel_get_renderbuffer(fb, BUFFER_DEPTH);
    const struct intel_renderbuffer *stencilRb =
       intel_get_renderbuffer(fb, BUFFER_STENCIL);
+   int i;
 
    if (stencilRb && stencilRb != depthRb) {
       /* we only support combined depth/stencil buffers, not separate
@@ -644,32 +673,26 @@ intel_validate_framebuffer(GLcontext *ctx, struct gl_framebuffer *fb)
       fb->_Status = GL_FRAMEBUFFER_UNSUPPORTED_EXT;
    }
 
-   /* check that texture color buffers are a format we can render into */
-   {
-      const struct gl_texture_format *supportedFormat;
-      GLuint i;
+   for (i = 0; i < ctx->Const.MaxDrawBuffers; i++) {
+      struct gl_renderbuffer *rb = ctx->DrawBuffer->_ColorDrawBuffers[i];
+      struct intel_renderbuffer *irb = intel_renderbuffer(rb);
 
-      /* The texture format we can render into seems to depend on the
-       * screen depth.  There currently seems to be a problem when
-       * rendering into a rgb565 texture when the screen is abgr8888.
-       */
+      if (rb == NULL)
+	 continue;
+
+      if (irb == NULL) {
+	 fb->_Status = GL_FRAMEBUFFER_UNSUPPORTED_EXT;
+	 continue;
+      }
 
-      if (intel->ctx.Visual.rgbBits >= 24)
-         supportedFormat = &_mesa_texformat_argb8888;
-      else 
-         supportedFormat = &_mesa_texformat_rgb565;
-
-      for (i = 0; i < ctx->Const.MaxDrawBuffers; i++) {
-         const struct gl_texture_object *texObj =
-            fb->Attachment[BUFFER_COLOR0 + i].Texture;
-         if (texObj) {
-            const struct gl_texture_image *texImg =
-               texObj->Image[0][texObj->BaseLevel];
-            if (texImg && texImg->TexFormat != supportedFormat) {
-               fb->_Status = GL_FRAMEBUFFER_UNSUPPORTED_EXT;
-               break;
-            }
-         }
+      switch (irb->texformat->MesaFormat) {
+      case MESA_FORMAT_ARGB8888:
+      case MESA_FORMAT_RGB565:
+      case MESA_FORMAT_ARGB1555:
+      case MESA_FORMAT_ARGB4444:
+	 break;
+      default:
+	 fb->_Status = GL_FRAMEBUFFER_UNSUPPORTED_EXT;
       }
    }
 }
diff --git a/src/mesa/drivers/dri/intel/intel_fbo.h b/src/mesa/drivers/dri/intel/intel_fbo.h
index 7226ee026f6..f0665af482e 100644
--- a/src/mesa/drivers/dri/intel/intel_fbo.h
+++ b/src/mesa/drivers/dri/intel/intel_fbo.h
@@ -61,6 +61,8 @@ struct intel_renderbuffer
    struct gl_renderbuffer Base;
    struct intel_region *region;
 
+   const struct gl_texture_format *texformat;
+
    GLuint vbl_pending;   /**< vblank sequence number of pending flip */
 
    uint8_t *span_cache;
diff --git a/src/mesa/drivers/dri/intel/intel_generatemipmap.c b/src/mesa/drivers/dri/intel/intel_generatemipmap.c
new file mode 100644
index 00000000000..1060fbd9e59
--- /dev/null
+++ b/src/mesa/drivers/dri/intel/intel_generatemipmap.c
@@ -0,0 +1,283 @@
+/*
+ * Copyright (C) 1999-2007  Brian Paul   All Rights Reserved.
+ * Copyright © 2009 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * Authors:
+ *    Eric Anholt <[email protected]>
+ *
+ */
+
+#include "main/glheader.h"
+#include "main/enums.h"
+#include "main/image.h"
+#include "main/mtypes.h"
+#include "main/macros.h"
+#include "main/bufferobj.h"
+#include "main/teximage.h"
+#include "main/texenv.h"
+#include "main/texobj.h"
+#include "main/texstate.h"
+#include "main/texparam.h"
+#include "main/varray.h"
+#include "main/attrib.h"
+#include "main/enable.h"
+#include "main/buffers.h"
+#include "main/fbobject.h"
+#include "main/framebuffer.h"
+#include "main/renderbuffer.h"
+#include "main/depth.h"
+#include "main/hash.h"
+#include "main/mipmap.h"
+#include "main/blend.h"
+#include "glapi/dispatch.h"
+#include "swrast/swrast.h"
+
+#include "intel_screen.h"
+#include "intel_context.h"
+#include "intel_batchbuffer.h"
+#include "intel_pixel.h"
+#include "intel_tex.h"
+#include "intel_mipmap_tree.h"
+
+static const char *intel_fp_tex2d =
+      "!!ARBfp1.0\n"
+      "TEX result.color, fragment.texcoord[0], texture[0], 2D;\n"
+      "END\n";
+
+static GLboolean
+intel_generate_mipmap_level(GLcontext *ctx, GLuint tex_name,
+			    int level, int width, int height)
+{
+   struct intel_context *intel = intel_context(ctx);
+   GLfloat vertices[4][2];
+   GLint status;
+
+   /* Set to source from the previous level */
+   _mesa_TexParameterf(GL_TEXTURE_2D, GL_TEXTURE_BASE_LEVEL, level - 1);
+   _mesa_TexParameterf(GL_TEXTURE_2D, GL_TEXTURE_MAX_LEVEL, level - 1);
+
+   /* Set to draw into the current level */
+   _mesa_FramebufferTexture2DEXT(GL_FRAMEBUFFER_EXT,
+				 GL_COLOR_ATTACHMENT0_EXT,
+				 GL_TEXTURE_2D,
+				 tex_name,
+				 level);
+   /* Choose to render to the color attachment. */
+   _mesa_DrawBuffer(GL_COLOR_ATTACHMENT0_EXT);
+
+   status = _mesa_CheckFramebufferStatusEXT (GL_FRAMEBUFFER_EXT);
+   if (status != GL_FRAMEBUFFER_COMPLETE_EXT)
+      return GL_FALSE;
+
+   intel_meta_set_passthrough_transform(intel);
+
+   /* XXX: Doing it right would involve setting up the transformation to do
+    * 0-1 mapping or something, and not changing the vertex data.
+    */
+   vertices[0][0] = 0;
+   vertices[0][1] = 0;
+   vertices[1][0] = width;
+   vertices[1][1] = 0;
+   vertices[2][0] = width;
+   vertices[2][1] = height;
+   vertices[3][0] = 0;
+   vertices[3][1] = height;
+
+   _mesa_VertexPointer(2, GL_FLOAT, 2 * sizeof(GLfloat), &vertices);
+   _mesa_Enable(GL_VERTEX_ARRAY);
+   intel_meta_set_default_texrect(intel);
+
+   CALL_DrawArrays(ctx->Exec, (GL_TRIANGLE_FAN, 0, 4));
+
+   intel_meta_restore_texcoords(intel);
+   intel_meta_restore_transform(intel);
+
+   return GL_TRUE;
+}
+
+static GLboolean
+intel_generate_mipmap_2d(GLcontext *ctx,
+			 GLenum target,
+			 struct gl_texture_object *texObj)
+{
+   struct intel_context *intel = intel_context(ctx);
+   GLint old_active_texture;
+   int level, max_levels, start_level, end_level;
+   GLuint fb_name;
+   GLboolean success = GL_FALSE;
+   struct gl_framebuffer *saved_fbo = NULL;
+
+   _mesa_PushAttrib(GL_ENABLE_BIT | GL_TEXTURE_BIT |
+		    GL_CURRENT_BIT | GL_COLOR_BUFFER_BIT |
+		    GL_DEPTH_BUFFER_BIT);
+   _mesa_PushClientAttrib(GL_CLIENT_VERTEX_ARRAY_BIT);
+   old_active_texture = ctx->Texture.CurrentUnit;
+   _mesa_reference_framebuffer(&saved_fbo, ctx->DrawBuffer);
+
+   _mesa_Disable(GL_POLYGON_STIPPLE);
+   _mesa_Disable(GL_DEPTH_TEST);
+   _mesa_Disable(GL_STENCIL_TEST);
+   _mesa_ColorMask(GL_TRUE, GL_TRUE, GL_TRUE, GL_TRUE);
+   _mesa_DepthMask(GL_FALSE);
+
+   /* Bind the given texture to GL_TEXTURE_2D with linear filtering for our
+    * minification.
+    */
+   _mesa_ActiveTextureARB(GL_TEXTURE0_ARB);
+   _mesa_Enable(GL_TEXTURE_2D);
+   _mesa_BindTexture(GL_TEXTURE_2D, texObj->Name);
+   _mesa_TexParameterf(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER,
+		       GL_LINEAR_MIPMAP_NEAREST);
+   _mesa_TexParameterf(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_LINEAR);
+
+   /* Bind the new renderbuffer to the color attachment point. */
+   _mesa_GenFramebuffersEXT(1, &fb_name);
+   _mesa_BindFramebufferEXT(GL_FRAMEBUFFER_EXT, fb_name);
+
+   intel_meta_set_fragment_program(intel, &intel->meta.tex2d_fp,
+				   intel_fp_tex2d);
+   intel_meta_set_passthrough_vertex_program(intel);
+
+   max_levels = _mesa_max_texture_levels(ctx, texObj->Target);
+   start_level = texObj->BaseLevel;
+   end_level = texObj->MaxLevel;
+
+   /* Loop generating level+1 from level. */
+   for (level = start_level; level < end_level && level < max_levels - 1; level++) {
+      const struct gl_texture_image *srcImage;
+      int width, height;
+
+      srcImage = _mesa_select_tex_image(ctx, texObj, target, level);
+      if (srcImage->Border != 0)
+	 goto fail;
+
+      width = srcImage->Width / 2;
+      if (width < 1)
+	 width = 1;
+      height = srcImage->Height / 2;
+      if (height < 1)
+	 height = 1;
+
+      if (width == srcImage->Width &&
+	  height == srcImage->Height) {
+	 /* Neither _mesa_max_texture_levels nor texObj->MaxLevel are the
+	  * maximum texture level for the object, so break out when we've gone
+	  * over the edge.
+	  */
+	 break;
+      }
+
+      /* Make sure that there's space allocated for the target level.
+       * We could skip this if there's already space allocated and save some
+       * time.
+       */
+      _mesa_TexImage2D(GL_TEXTURE_2D, level + 1, srcImage->InternalFormat,
+		       width, height, 0,
+		       GL_RGBA, GL_UNSIGNED_INT, NULL);
+
+      if (!intel_generate_mipmap_level(ctx, texObj->Name, level + 1,
+				       width, height))
+	 goto fail;
+   }
+
+   success = GL_TRUE;
+
+fail:
+   intel_meta_restore_fragment_program(intel);
+   intel_meta_restore_vertex_program(intel);
+
+   _mesa_DeleteFramebuffersEXT(1, &fb_name);
+   _mesa_ActiveTextureARB(GL_TEXTURE0_ARB + old_active_texture);
+   if (saved_fbo)
+      _mesa_BindFramebufferEXT(GL_FRAMEBUFFER_EXT, saved_fbo->Name);
+   _mesa_reference_framebuffer(&saved_fbo, NULL);
+   _mesa_PopClientAttrib();
+   _mesa_PopAttrib();
+
+   return success;
+}
+
+
+/**
+ * Generate new mipmap data from BASE+1 to BASE+p (the minimally-sized mipmap
+ * level).
+ *
+ * The texture object's miptree must be mapped.
+ *
+ * It would be really nice if this was just called by Mesa whenever mipmaps
+ * needed to be regenerated, rather than us having to remember to do so in
+ * each texture image modification path.
+ *
+ * This function should also include an accelerated path.
+ */
+void
+intel_generate_mipmap(GLcontext *ctx, GLenum target,
+                      struct gl_texture_object *texObj)
+{
+   struct intel_context *intel = intel_context(ctx);
+   struct intel_texture_object *intelObj = intel_texture_object(texObj);
+   GLuint nr_faces = (intelObj->base.Target == GL_TEXTURE_CUBE_MAP) ? 6 : 1;
+   int face, i;
+
+   /* HW path */
+   if (target == GL_TEXTURE_2D &&
+       ctx->Extensions.EXT_framebuffer_object &&
+       ctx->Extensions.ARB_fragment_program &&
+       ctx->Extensions.ARB_vertex_program) {
+      GLboolean success;
+
+      /* We'll be accessing this texture using GL entrypoints, which should
+       * be resilient against other access to this texture.
+       */
+      _mesa_unlock_texture(ctx, texObj);
+      success = intel_generate_mipmap_2d(ctx, target, texObj);
+      _mesa_lock_texture(ctx, texObj);
+
+      if (success)
+	 return;
+   }
+
+   /* SW path */
+   intel_tex_map_level_images(intel, intelObj, texObj->BaseLevel);
+   _mesa_generate_mipmap(ctx, target, texObj);
+   intel_tex_unmap_level_images(intel, intelObj, texObj->BaseLevel);
+
+   /* Update the level information in our private data in the new images, since
+    * it didn't get set as part of a normal TexImage path.
+    */
+   for (face = 0; face < nr_faces; face++) {
+      for (i = texObj->BaseLevel + 1; i < texObj->MaxLevel; i++) {
+         struct intel_texture_image *intelImage;
+
+	 intelImage = intel_texture_image(texObj->Image[face][i]);
+	 if (intelImage == NULL)
+	    break;
+
+	 intelImage->level = i;
+	 intelImage->face = face;
+	 /* Unreference the miptree to signal that the new Data is a bare
+	  * pointer from mesa.
+	  */
+	 intel_miptree_release(intel, &intelImage->mt);
+      }
+   }
+}
diff --git a/src/mesa/drivers/dri/intel/intel_mipmap_tree.c b/src/mesa/drivers/dri/intel/intel_mipmap_tree.c
index 6e1e034e53d..0d34f28311e 100644
--- a/src/mesa/drivers/dri/intel/intel_mipmap_tree.c
+++ b/src/mesa/drivers/dri/intel/intel_mipmap_tree.c
@@ -57,14 +57,16 @@ intel_miptree_create_internal(struct intel_context *intel,
 			      GLuint last_level,
 			      GLuint width0,
 			      GLuint height0,
-			      GLuint depth0, GLuint cpp, GLuint compress_byte)
+			      GLuint depth0, GLuint cpp, GLuint compress_byte,
+			      uint32_t tiling)
 {
    GLboolean ok;
    struct intel_mipmap_tree *mt = calloc(sizeof(*mt), 1);
 
-   DBG("%s target %s format %s level %d..%d\n", __FUNCTION__,
+   DBG("%s target %s format %s level %d..%d <-- %p\n", __FUNCTION__,
        _mesa_lookup_enum_by_nr(target),
-       _mesa_lookup_enum_by_nr(internal_format), first_level, last_level);
+       _mesa_lookup_enum_by_nr(internal_format), 
+       first_level, last_level, mt);
 
    mt->target = target_to_target(target);
    mt->internal_format = internal_format;
@@ -80,15 +82,16 @@ intel_miptree_create_internal(struct intel_context *intel,
 
 #ifdef I915
    if (IS_945(intel->intelScreen->deviceID))
-      ok = i945_miptree_layout(intel, mt);
+      ok = i945_miptree_layout(intel, mt, tiling);
    else
-      ok = i915_miptree_layout(intel, mt);
+      ok = i915_miptree_layout(intel, mt, tiling);
 #else
-   ok = brw_miptree_layout(intel, mt);
+   ok = brw_miptree_layout(intel, mt, tiling);
 #endif
 
    if (!ok) {
       free(mt);
+      DBG("%s not okay - returning NULL\n", __FUNCTION__);
       return NULL;
    }
 
@@ -107,10 +110,18 @@ intel_miptree_create(struct intel_context *intel,
 		     GLboolean expect_accelerated_upload)
 {
    struct intel_mipmap_tree *mt;
+   uint32_t tiling;
+
+   if (intel->use_texture_tiling && compress_byte == 0 &&
+       intel->intelScreen->kernel_exec_fencing)
+      tiling = I915_TILING_X;
+   else
+      tiling = I915_TILING_NONE;
 
    mt = intel_miptree_create_internal(intel, target, internal_format,
 				      first_level, last_level, width0,
-				      height0, depth0, cpp, compress_byte);
+				      height0, depth0, cpp, compress_byte,
+				      tiling);
    /*
     * pitch == 0 || height == 0  indicates the null texture
     */
@@ -118,6 +129,7 @@ intel_miptree_create(struct intel_context *intel,
       return NULL;
 
    mt->region = intel_region_alloc(intel,
+				   tiling,
 				   mt->cpp,
 				   mt->pitch,
 				   mt->total_height,
@@ -147,7 +159,8 @@ intel_miptree_create_for_region(struct intel_context *intel,
    mt = intel_miptree_create_internal(intel, target, internal_format,
 				      first_level, last_level,
 				      region->width, region->height, 1,
-				      region->cpp, compress_byte);
+				      region->cpp, compress_byte,
+				      I915_TILING_NONE);
    if (!mt)
       return mt;
 #if 0
@@ -185,6 +198,7 @@ intel_miptree_create_for_region(struct intel_context *intel,
 
 int intel_miptree_pitch_align (struct intel_context *intel,
 			       struct intel_mipmap_tree *mt,
+			       uint32_t tiling,
 			       int pitch)
 {
 #ifdef I915
@@ -205,6 +219,11 @@ int intel_miptree_pitch_align (struct intel_context *intel,
 	 pitch_align = 4;
       }
 
+      if (tiling == I915_TILING_X)
+	 pitch_align = 512;
+      else if (tiling == I915_TILING_Y)
+	 pitch_align = 128;
+
       pitch = ALIGN(pitch * mt->cpp, pitch_align);
 
 #ifdef I915
diff --git a/src/mesa/drivers/dri/intel/intel_mipmap_tree.h b/src/mesa/drivers/dri/intel/intel_mipmap_tree.h
index 4060b9df78f..3af9966827f 100644
--- a/src/mesa/drivers/dri/intel/intel_mipmap_tree.h
+++ b/src/mesa/drivers/dri/intel/intel_mipmap_tree.h
@@ -148,6 +148,7 @@ intel_miptree_create_for_region(struct intel_context *intel,
 
 int intel_miptree_pitch_align (struct intel_context *intel,
 			       struct intel_mipmap_tree *mt,
+			       uint32_t tiling,
 			       int pitch);
 
 void intel_miptree_reference(struct intel_mipmap_tree **dst,
@@ -218,10 +219,13 @@ void intel_miptree_image_copy(struct intel_context *intel,
 /* i915_mipmap_tree.c:
  */
 GLboolean i915_miptree_layout(struct intel_context *intel,
-			      struct intel_mipmap_tree *mt);
+			      struct intel_mipmap_tree *mt,
+			      uint32_t tiling);
 GLboolean i945_miptree_layout(struct intel_context *intel,
-			      struct intel_mipmap_tree *mt);
+			      struct intel_mipmap_tree *mt,
+			      uint32_t tiling);
 GLboolean brw_miptree_layout(struct intel_context *intel,
-			     struct intel_mipmap_tree *mt);
+			     struct intel_mipmap_tree *mt,
+			     uint32_t tiling);
 
 #endif
diff --git a/src/mesa/drivers/dri/intel/intel_pixel.c b/src/mesa/drivers/dri/intel/intel_pixel.c
index fc0ac0b79c0..36a684b3b85 100644
--- a/src/mesa/drivers/dri/intel/intel_pixel.c
+++ b/src/mesa/drivers/dri/intel/intel_pixel.c
@@ -27,9 +27,12 @@
 
 #include "main/enums.h"
 #include "main/state.h"
+#include "main/bufferobj.h"
 #include "main/context.h"
 #include "main/enable.h"
 #include "main/matrix.h"
+#include "main/texstate.h"
+#include "main/varray.h"
 #include "main/viewport.h"
 #include "swrast/swrast.h"
 #include "shader/arbprogram.h"
@@ -334,6 +337,85 @@ intel_meta_restore_fragment_program(struct intel_context *intel)
       _mesa_Disable(GL_FRAGMENT_PROGRAM_ARB);
 }
 
+static const float default_texcoords[4][2] = { { 0.0, 0.0 },
+					       { 1.0, 0.0 },
+					       { 1.0, 1.0 },
+					       { 0.0, 1.0 } };
+
+void
+intel_meta_set_default_texrect(struct intel_context *intel)
+{
+   GLcontext *ctx = &intel->ctx;
+   struct gl_client_array *old_texcoord_array;
+
+   intel->meta.saved_active_texture = ctx->Texture.CurrentUnit;
+   if (intel->meta.saved_array_vbo == NULL) {
+      _mesa_reference_buffer_object(ctx, &intel->meta.saved_array_vbo,
+				    ctx->Array.ArrayBufferObj);
+   }
+
+   old_texcoord_array = &ctx->Array.ArrayObj->TexCoord[0];
+   intel->meta.saved_texcoord_type = old_texcoord_array->Type;
+   intel->meta.saved_texcoord_size = old_texcoord_array->Size;
+   intel->meta.saved_texcoord_stride = old_texcoord_array->Stride;
+   intel->meta.saved_texcoord_enable = old_texcoord_array->Enabled;
+   intel->meta.saved_texcoord_ptr = old_texcoord_array->Ptr;
+   _mesa_reference_buffer_object(ctx, &intel->meta.saved_texcoord_vbo,
+				 old_texcoord_array->BufferObj);
+
+   _mesa_ClientActiveTextureARB(GL_TEXTURE0);
+
+   if (intel->meta.texcoord_vbo == NULL) {
+      GLuint vbo_name;
+
+      _mesa_GenBuffersARB(1, &vbo_name);
+      _mesa_BindBufferARB(GL_ARRAY_BUFFER_ARB, vbo_name);
+      _mesa_BufferDataARB(GL_ARRAY_BUFFER_ARB, sizeof(default_texcoords),
+			  default_texcoords, GL_STATIC_DRAW_ARB);
+      _mesa_reference_buffer_object(ctx, &intel->meta.texcoord_vbo,
+				    ctx->Array.ArrayBufferObj);
+   } else {
+      _mesa_BindBufferARB(GL_ARRAY_BUFFER_ARB,
+			  intel->meta.texcoord_vbo->Name);
+   }
+   _mesa_TexCoordPointer(2, GL_FLOAT, 2 * sizeof(GLfloat), NULL);
+
+   _mesa_Enable(GL_TEXTURE_COORD_ARRAY);
+}
+
+void
+intel_meta_restore_texcoords(struct intel_context *intel)
+{
+   GLcontext *ctx = &intel->ctx;
+
+   /* Restore the old TexCoordPointer */
+   if (intel->meta.saved_texcoord_vbo) {
+      _mesa_BindBufferARB(GL_ARRAY_BUFFER_ARB,
+			  intel->meta.saved_texcoord_vbo->Name);
+      _mesa_reference_buffer_object(ctx, &intel->meta.saved_texcoord_vbo, NULL);
+   } else {
+      _mesa_BindBufferARB(GL_ARRAY_BUFFER_ARB, 0);
+   }
+
+   _mesa_TexCoordPointer(intel->meta.saved_texcoord_size,
+			 intel->meta.saved_texcoord_type,
+			 intel->meta.saved_texcoord_stride,
+			 intel->meta.saved_texcoord_ptr);
+   if (!intel->meta.saved_texcoord_enable)
+      _mesa_Disable(GL_TEXTURE_COORD_ARRAY);
+
+   _mesa_ClientActiveTextureARB(GL_TEXTURE0 +
+				intel->meta.saved_active_texture);
+
+   if (intel->meta.saved_array_vbo) {
+      _mesa_BindBufferARB(GL_ARRAY_BUFFER_ARB,
+			  intel->meta.saved_array_vbo->Name);
+      _mesa_reference_buffer_object(ctx, &intel->meta.saved_array_vbo, NULL);
+   } else {
+      _mesa_BindBufferARB(GL_ARRAY_BUFFER_ARB, 0);
+   }
+}
+
 void
 intelInitPixelFuncs(struct dd_function_table *functions)
 {
@@ -355,5 +437,7 @@ intel_free_pixel_state(struct intel_context *intel)
 
    _mesa_reference_vertprog(ctx, &intel->meta.passthrough_vp, NULL);
    _mesa_reference_fragprog(ctx, &intel->meta.bitmap_fp, NULL);
+   _mesa_reference_fragprog(ctx, &intel->meta.tex2d_fp, NULL);
+   _mesa_reference_buffer_object(ctx, &intel->meta.texcoord_vbo, NULL);
 }
 
diff --git a/src/mesa/drivers/dri/intel/intel_pixel.h b/src/mesa/drivers/dri/intel/intel_pixel.h
index cb41fa182cb..6acf0813c8c 100644
--- a/src/mesa/drivers/dri/intel/intel_pixel.h
+++ b/src/mesa/drivers/dri/intel/intel_pixel.h
@@ -40,6 +40,9 @@ void intel_meta_set_fragment_program(struct intel_context *intel,
 				     const char *prog_string);
 void intel_meta_restore_fragment_program(struct intel_context *intel);
 void intel_free_pixel_state(struct intel_context *intel);
+void intel_meta_set_default_texrect(struct intel_context *intel);
+void intel_meta_set_default_texrect(struct intel_context *intel);
+void intel_meta_restore_texcoords(struct intel_context *intel);
 
 GLboolean intel_check_blit_fragment_ops(GLcontext * ctx,
 					GLboolean src_alpha_is_one);
diff --git a/src/mesa/drivers/dri/intel/intel_pixel_bitmap.c b/src/mesa/drivers/dri/intel/intel_pixel_bitmap.c
index 1db7f5594e9..80d3239189c 100644
--- a/src/mesa/drivers/dri/intel/intel_pixel_bitmap.c
+++ b/src/mesa/drivers/dri/intel/intel_pixel_bitmap.c
@@ -360,7 +360,6 @@ intel_texture_bitmap(GLcontext * ctx,
       "END\n";
    GLuint texname;
    GLfloat vertices[4][4];
-   GLfloat texcoords[4][2];
    GLint old_active_texture;
    GLubyte *unpacked_bitmap;
    GLubyte *a8_bitmap;
@@ -401,6 +400,14 @@ intel_texture_bitmap(GLcontext * ctx,
       return GL_FALSE;
    }
 
+   if (!ctx->Extensions.ARB_texture_non_power_of_two &&
+       (!is_power_of_two(width) || !is_power_of_two(height))) {
+      if (INTEL_DEBUG & DEBUG_FALLBACKS)
+	 fprintf(stderr,
+		 "glBitmap() fallback: NPOT texture\n");
+      return GL_FALSE;
+   }
+
    /* Check that we can load in a texture this big. */
    if (width > (1 << (ctx->Const.MaxTextureLevels - 1)) ||
        height > (1 << (ctx->Const.MaxTextureLevels - 1))) {
@@ -485,22 +492,12 @@ intel_texture_bitmap(GLcontext * ctx,
    vertices[3][2] = dst_z;
    vertices[3][3] = 1.0;
 
-   texcoords[0][0] = 0.0;
-   texcoords[0][1] = 0.0;
-   texcoords[1][0] = 1.0;
-   texcoords[1][1] = 0.0;
-   texcoords[2][0] = 1.0;
-   texcoords[2][1] = 1.0;
-   texcoords[3][0] = 0.0;
-   texcoords[3][1] = 1.0;
-
    _mesa_VertexPointer(4, GL_FLOAT, 4 * sizeof(GLfloat), &vertices);
-   _mesa_ClientActiveTextureARB(GL_TEXTURE0);
-   _mesa_TexCoordPointer(2, GL_FLOAT, 2 * sizeof(GLfloat), &texcoords);
    _mesa_Enable(GL_VERTEX_ARRAY);
-   _mesa_Enable(GL_TEXTURE_COORD_ARRAY);
+   intel_meta_set_default_texrect(intel);
    CALL_DrawArrays(ctx->Exec, (GL_TRIANGLE_FAN, 0, 4));
 
+   intel_meta_restore_texcoords(intel);
    intel_meta_restore_transform(intel);
    intel_meta_restore_fragment_program(intel);
    intel_meta_restore_vertex_program(intel);
diff --git a/src/mesa/drivers/dri/intel/intel_pixel_draw.c b/src/mesa/drivers/dri/intel/intel_pixel_draw.c
index e8d5ac8569d..46d27f1a93a 100644
--- a/src/mesa/drivers/dri/intel/intel_pixel_draw.c
+++ b/src/mesa/drivers/dri/intel/intel_pixel_draw.c
@@ -70,7 +70,6 @@ intel_texture_drawpixels(GLcontext * ctx,
    struct intel_context *intel = intel_context(ctx);
    GLuint texname;
    GLfloat vertices[4][4];
-   GLfloat texcoords[4][2];
    GLfloat z;
    GLint old_active_texture;
    GLenum internalFormat;
@@ -97,7 +96,7 @@ intel_texture_drawpixels(GLcontext * ctx,
    /* We don't have a way to generate fragments with stencil values which
     * will set the resulting stencil value.
     */
-   if (format == GL_STENCIL_INDEX)
+   if (format == GL_STENCIL_INDEX || format == GL_DEPTH_STENCIL)
       return GL_FALSE;
 
    /* Check that we can load in a texture this big. */
@@ -120,6 +119,14 @@ intel_texture_drawpixels(GLcontext * ctx,
       return GL_FALSE;
    }
 
+   if (!ctx->Extensions.ARB_texture_non_power_of_two &&
+       (!is_power_of_two(width) || !is_power_of_two(height))) {
+      if (INTEL_DEBUG & DEBUG_FALLBACKS)
+	 fprintf(stderr,
+		 "glDrawPixels() fallback: NPOT texture\n");
+      return GL_FALSE;
+   }
+
    _mesa_PushAttrib(GL_ENABLE_BIT | GL_TEXTURE_BIT |
 		    GL_CURRENT_BIT);
    _mesa_PushClientAttrib(GL_CLIENT_VERTEX_ARRAY_BIT);
@@ -169,22 +176,13 @@ intel_texture_drawpixels(GLcontext * ctx,
    vertices[3][2] = z;
    vertices[3][3] = 1.0;
 
-   texcoords[0][0] = 0.0;
-   texcoords[0][1] = 0.0;
-   texcoords[1][0] = 1.0;
-   texcoords[1][1] = 0.0;
-   texcoords[2][0] = 1.0;
-   texcoords[2][1] = 1.0;
-   texcoords[3][0] = 0.0;
-   texcoords[3][1] = 1.0;
-
    _mesa_VertexPointer(4, GL_FLOAT, 4 * sizeof(GLfloat), &vertices);
-   _mesa_ClientActiveTextureARB(GL_TEXTURE0);
-   _mesa_TexCoordPointer(2, GL_FLOAT, 2 * sizeof(GLfloat), &texcoords);
    _mesa_Enable(GL_VERTEX_ARRAY);
-   _mesa_Enable(GL_TEXTURE_COORD_ARRAY);
+   intel_meta_set_default_texrect(intel);
+
    CALL_DrawArrays(ctx->Exec, (GL_TRIANGLE_FAN, 0, 4));
 
+   intel_meta_restore_texcoords(intel);
    intel_meta_restore_transform(intel);
 
    _mesa_ActiveTextureARB(GL_TEXTURE0_ARB + old_active_texture);
@@ -208,7 +206,6 @@ intel_stencil_drawpixels(GLcontext * ctx,
    struct intel_context *intel = intel_context(ctx);
    GLuint texname, rb_name, fb_name, old_fb_name;
    GLfloat vertices[4][2];
-   GLfloat texcoords[4][2];
    struct intel_renderbuffer *irb;
    struct intel_renderbuffer *depth_irb;
    struct gl_renderbuffer *rb;
@@ -273,6 +270,14 @@ intel_stencil_drawpixels(GLcontext * ctx,
       return GL_FALSE;
    }
 
+   if (!ctx->Extensions.ARB_texture_non_power_of_two &&
+       (!is_power_of_two(width) || !is_power_of_two(height))) {
+      if (INTEL_DEBUG & DEBUG_FALLBACKS)
+	 fprintf(stderr,
+		 "glDrawPixels(GL_STENCIL_INDEX) fallback: NPOT texture\n");
+      return GL_FALSE;
+   }
+
    _mesa_PushAttrib(GL_ENABLE_BIT | GL_TEXTURE_BIT |
 		    GL_CURRENT_BIT | GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT);
    _mesa_PushClientAttrib(GL_CLIENT_VERTEX_ARRAY_BIT);
@@ -343,7 +348,6 @@ intel_stencil_drawpixels(GLcontext * ctx,
    _mesa_free(stencil_pixels);
 
    intel_meta_set_passthrough_transform(intel);
-
    vertices[0][0] = x;
    vertices[0][1] = y;
    vertices[1][0] = x + width * ctx->Pixel.ZoomX;
@@ -353,22 +357,13 @@ intel_stencil_drawpixels(GLcontext * ctx,
    vertices[3][0] = x;
    vertices[3][1] = y + height * ctx->Pixel.ZoomY;
 
-   texcoords[0][0] = 0.0;
-   texcoords[0][1] = 0.0;
-   texcoords[1][0] = 1.0;
-   texcoords[1][1] = 0.0;
-   texcoords[2][0] = 1.0;
-   texcoords[2][1] = 1.0;
-   texcoords[3][0] = 0.0;
-   texcoords[3][1] = 1.0;
-
    _mesa_VertexPointer(2, GL_FLOAT, 2 * sizeof(GLfloat), &vertices);
-   _mesa_ClientActiveTextureARB(GL_TEXTURE0);
-   _mesa_TexCoordPointer(2, GL_FLOAT, 2 * sizeof(GLfloat), &texcoords);
    _mesa_Enable(GL_VERTEX_ARRAY);
-   _mesa_Enable(GL_TEXTURE_COORD_ARRAY);
+   intel_meta_set_default_texrect(intel);
+
    CALL_DrawArrays(ctx->Exec, (GL_TRIANGLE_FAN, 0, 4));
 
+   intel_meta_restore_texcoords(intel);
    intel_meta_restore_transform(intel);
 
    _mesa_ActiveTextureARB(GL_TEXTURE0_ARB + old_active_texture);
diff --git a/src/mesa/drivers/dri/intel/intel_reg.h b/src/mesa/drivers/dri/intel/intel_reg.h
index 57ac8f0cc14..d19f1bae34c 100644
--- a/src/mesa/drivers/dri/intel/intel_reg.h
+++ b/src/mesa/drivers/dri/intel/intel_reg.h
@@ -189,6 +189,19 @@
 
 #define S7_DEPTH_OFFSET_CONST_MASK     ~0
 
+/* p143 */
+#define _3DSTATE_BUF_INFO_CMD	(CMD_3D | (0x1d<<24) | (0x8e<<16) | 1)
+/* Dword 1 */
+#define BUF_3D_ID_COLOR_BACK	(0x3<<24)
+#define BUF_3D_ID_DEPTH 	(0x7<<24)
+#define BUF_3D_USE_FENCE	(1<<23)
+#define BUF_3D_TILED_SURFACE	(1<<22)
+#define BUF_3D_TILE_WALK_X	0
+#define BUF_3D_TILE_WALK_Y	(1<<21)
+#define BUF_3D_PITCH(x)         (((x)/4)<<2)
+/* Dword 2 */
+#define BUF_3D_ADDR(x)		((x) & ~0x3)
+
 /* Primitive dispatch on 830-945 */
 #define _3DPRIMITIVE			(CMD_3D | (0x1f << 24))
 #define PRIM_INDIRECT            (1<<23)
diff --git a/src/mesa/drivers/dri/intel/intel_regions.c b/src/mesa/drivers/dri/intel/intel_regions.c
index 0aa5b8c02c9..49bcb3c1ddd 100644
--- a/src/mesa/drivers/dri/intel/intel_regions.c
+++ b/src/mesa/drivers/dri/intel/intel_regions.c
@@ -52,17 +52,74 @@
 
 #define FILE_DEBUG_FLAG DEBUG_REGION
 
+/* This should be set to the maximum backtrace size desired.
+ * Set it to 0 to disable backtrace debugging.
+ */
+#define DEBUG_BACKTRACE_SIZE 0
+
+#if DEBUG_BACKTRACE_SIZE == 0
+/* Use the standard debug output */
+#define _DBG(...) DBG(__VA_ARGS__)
+#else
+/* Use backtracing debug output */
+#define _DBG(...) {debug_backtrace(); DBG(__VA_ARGS__);}
+
+/* Backtracing debug support */
+#include <execinfo.h>
+
+static void
+debug_backtrace(void)
+{
+   void *trace[DEBUG_BACKTRACE_SIZE];
+   char **strings = NULL;
+   int traceSize;
+   register int i;
+
+   traceSize = backtrace(trace, DEBUG_BACKTRACE_SIZE);
+   strings = backtrace_symbols(trace, traceSize);
+   if (strings == NULL) {
+      DBG("no backtrace:");
+      return;
+   }
+
+   /* Spit out all the strings with a colon separator.  Ignore
+    * the first, since we don't really care about the call
+    * to debug_backtrace() itself.  Skip until the final "/" in
+    * the trace to avoid really long lines.
+    */
+   for (i = 1; i < traceSize; i++) {
+      char *p = strings[i], *slash = strings[i];
+      while (*p) {
+         if (*p++ == '/') {
+            slash = p;
+         }
+      }
+
+      DBG("%s:", slash);
+   }
+
+   /* Free up the memory, and we're done */
+   free(strings);
+}
+
+#endif
+
+
+
 /* XXX: Thread safety?
  */
 GLubyte *
 intel_region_map(struct intel_context *intel, struct intel_region *region)
 {
-   DBG("%s\n", __FUNCTION__);
+   _DBG("%s %p\n", __FUNCTION__, region);
    if (!region->map_refcount++) {
       if (region->pbo)
          intel_region_cow(intel, region);
 
-      dri_bo_map(region->buffer, GL_TRUE);
+      if (intel->intelScreen->kernel_exec_fencing)
+	 drm_intel_gem_bo_map_gtt(region->buffer);
+      else
+	 dri_bo_map(region->buffer, GL_TRUE);
       region->map = region->buffer->virtual;
    }
 
@@ -72,9 +129,12 @@ intel_region_map(struct intel_context *intel, struct intel_region *region)
 void
 intel_region_unmap(struct intel_context *intel, struct intel_region *region)
 {
-   DBG("%s\n", __FUNCTION__);
+   _DBG("%s %p\n", __FUNCTION__, region);
    if (!--region->map_refcount) {
-      dri_bo_unmap(region->buffer);
+      if (intel->intelScreen->kernel_exec_fencing)
+	 drm_intel_gem_bo_unmap_gtt(region->buffer);
+      else
+	 dri_bo_unmap(region->buffer);
       region->map = NULL;
    }
 }
@@ -87,10 +147,10 @@ intel_region_alloc_internal(struct intel_context *intel,
 {
    struct intel_region *region;
 
-   DBG("%s\n", __FUNCTION__);
-
-   if (buffer == NULL)
+   if (buffer == NULL) {
+      _DBG("%s <-- NULL\n", __FUNCTION__);
       return NULL;
+   }
 
    region = calloc(sizeof(*region), 1);
    region->cpp = cpp;
@@ -104,15 +164,18 @@ intel_region_alloc_internal(struct intel_context *intel,
    region->tiling = I915_TILING_NONE;
    region->bit_6_swizzle = I915_BIT_6_SWIZZLE_NONE;
 
+   _DBG("%s <-- %p\n", __FUNCTION__, region);
    return region;
 }
 
 struct intel_region *
 intel_region_alloc(struct intel_context *intel,
+		   uint32_t tiling,
                    GLuint cpp, GLuint width, GLuint height, GLuint pitch,
 		   GLboolean expect_accelerated_upload)
 {
    dri_bo *buffer;
+   struct intel_region *region;
 
    if (expect_accelerated_upload) {
       buffer = drm_intel_bo_alloc_for_render(intel->bufmgr, "region",
@@ -122,7 +185,16 @@ intel_region_alloc(struct intel_context *intel,
 				  pitch * cpp * height, 64);
    }
 
-   return intel_region_alloc_internal(intel, cpp, width, height, pitch, buffer);
+   region = intel_region_alloc_internal(intel, cpp, width, height,
+					pitch, buffer);
+
+   if (tiling != I915_TILING_NONE) {
+      assert(((pitch * cpp) & 511) == 0);
+      drm_intel_bo_set_tiling(buffer, &tiling, pitch * cpp);
+      drm_intel_bo_get_tiling(buffer, &region->tiling, &region->bit_6_swizzle);
+   }
+
+   return region;
 }
 
 struct intel_region *
@@ -158,7 +230,7 @@ void
 intel_region_reference(struct intel_region **dst, struct intel_region *src)
 {
    if (src)
-      DBG("%s %p %d\n", __FUNCTION__, src, src->refcount);
+      _DBG("%s %p %d\n", __FUNCTION__, src, src->refcount);
 
    assert(*dst == NULL);
    if (src) {
@@ -172,10 +244,12 @@ intel_region_release(struct intel_region **region_handle)
 {
    struct intel_region *region = *region_handle;
 
-   if (region == NULL)
+   if (region == NULL) {
+      _DBG("%s NULL\n", __FUNCTION__);
       return;
+   }
 
-   DBG("%s %p %d\n", __FUNCTION__, region, region->refcount - 1);
+   _DBG("%s %p %d\n", __FUNCTION__, region, region->refcount - 1);
 
    ASSERT(region->refcount > 0);
    region->refcount--;
@@ -251,7 +325,7 @@ intel_region_data(struct intel_context *intel,
 {
    GLboolean locked = GL_FALSE;
 
-   DBG("%s\n", __FUNCTION__);
+   _DBG("%s\n", __FUNCTION__);
 
    if (intel == NULL)
       return;
@@ -293,7 +367,7 @@ intel_region_copy(struct intel_context *intel,
                   GLuint src_offset,
                   GLuint srcx, GLuint srcy, GLuint width, GLuint height)
 {
-   DBG("%s\n", __FUNCTION__);
+   _DBG("%s\n", __FUNCTION__);
 
    if (intel == NULL)
       return;
@@ -326,7 +400,7 @@ intel_region_fill(struct intel_context *intel,
                   GLuint dstx, GLuint dsty,
                   GLuint width, GLuint height, GLuint color)
 {
-   DBG("%s\n", __FUNCTION__);
+   _DBG("%s\n", __FUNCTION__);
 
    if (intel == NULL)
       return;   
@@ -356,6 +430,8 @@ intel_region_attach_pbo(struct intel_context *intel,
    if (region->pbo == pbo)
       return;
 
+   _DBG("%s %p %p\n", __FUNCTION__, region, pbo);
+
    /* If there is already a pbo attached, break the cow tie now.
     * Don't call intel_region_release_pbo() as that would
     * unnecessarily allocate a new buffer we would have to immediately
@@ -385,6 +461,7 @@ void
 intel_region_release_pbo(struct intel_context *intel,
                          struct intel_region *region)
 {
+   _DBG("%s %p\n", __FUNCTION__, region);
    assert(region->buffer == region->pbo->buffer);
    region->pbo->region = NULL;
    region->pbo = NULL;
@@ -412,7 +489,7 @@ intel_region_cow(struct intel_context *intel, struct intel_region *region)
 
    assert(region->cpp * region->pitch * region->height == pbo->Base.Size);
 
-   DBG("%s (%d bytes)\n", __FUNCTION__, pbo->Base.Size);
+   _DBG("%s %p (%d bytes)\n", __FUNCTION__, region, pbo->Base.Size);
 
    /* Now blit from the texture buffer to the new buffer: 
     */
@@ -459,6 +536,10 @@ intel_recreate_static(struct intel_context *intel,
    if (region == NULL) {
       region = calloc(sizeof(*region), 1);
       region->refcount = 1;
+      _DBG("%s creating new region %p\n", __FUNCTION__, region);
+   }
+   else {
+      _DBG("%s %p\n", __FUNCTION__, region);
    }
 
    if (intel->ctx.Visual.rgbBits == 24)
diff --git a/src/mesa/drivers/dri/intel/intel_regions.h b/src/mesa/drivers/dri/intel/intel_regions.h
index 45e2bf4e77a..bd3c8e7325b 100644
--- a/src/mesa/drivers/dri/intel/intel_regions.h
+++ b/src/mesa/drivers/dri/intel/intel_regions.h
@@ -73,7 +73,8 @@ struct intel_region
  * copied by calling intel_reference_region().
  */
 struct intel_region *intel_region_alloc(struct intel_context *intel,
-                                        GLuint cpp, GLuint width,
+                                        uint32_t tiling,
+					GLuint cpp, GLuint width,
                                         GLuint height, GLuint pitch,
 					GLboolean expect_accelerated_upload);
 
diff --git a/src/mesa/drivers/dri/intel/intel_screen.c b/src/mesa/drivers/dri/intel/intel_screen.c
index 65e62947ef6..8da96ede644 100644
--- a/src/mesa/drivers/dri/intel/intel_screen.c
+++ b/src/mesa/drivers/dri/intel/intel_screen.c
@@ -49,6 +49,10 @@
 #include "i915_drm.h"
 #include "i830_dri.h"
 
+#define DRI_CONF_TEXTURE_TILING(def) \
+	DRI_CONF_OPT_BEGIN(texture_tiling, bool, def)		\
+		DRI_CONF_DESC(en, "Enable texture tiling")	\
+	DRI_CONF_OPT_END					\
 
 PUBLIC const char __driConfigOptions[] =
    DRI_CONF_BEGIN
@@ -64,6 +68,17 @@ PUBLIC const char __driConfigOptions[] =
 	    DRI_CONF_ENUM(1, "Enable reuse of all sizes of buffer objects")
 	 DRI_CONF_DESC_END
       DRI_CONF_OPT_END
+
+#ifdef I915
+     DRI_CONF_TEXTURE_TILING(false)
+#else
+     DRI_CONF_TEXTURE_TILING(true)
+#endif
+
+      DRI_CONF_OPT_BEGIN(early_z, bool, false)
+	 DRI_CONF_DESC(en, "Enable early Z in classic mode (unstable, 945-only).")
+      DRI_CONF_OPT_END
+
    DRI_CONF_SECTION_END
    DRI_CONF_SECTION_QUALITY
       DRI_CONF_FORCE_S3TC_ENABLE(false)
@@ -76,7 +91,7 @@ PUBLIC const char __driConfigOptions[] =
    DRI_CONF_SECTION_END
 DRI_CONF_END;
 
-const GLuint __driNConfigOptions = 8;
+const GLuint __driNConfigOptions = 10;
 
 #ifdef USE_NEW_INTERFACE
 static PFNGLXCREATECONTEXTMODES create_context_modes = NULL;
@@ -236,7 +251,7 @@ intel_get_param(__DRIscreenPrivate *psp, int param, int *value)
 
    ret = drmCommandWriteRead(psp->fd, DRM_I915_GETPARAM, &gp, sizeof(gp));
    if (ret) {
-      fprintf(stderr, "drm_i915_getparam: %d\n", ret);
+      _mesa_warning(NULL, "drm_i915_getparam: %d", ret);
       return GL_FALSE;
    }
 
diff --git a/src/mesa/drivers/dri/intel/intel_span.c b/src/mesa/drivers/dri/intel/intel_span.c
index c3a873f1abd..34b78ebc1ab 100644
--- a/src/mesa/drivers/dri/intel/intel_span.c
+++ b/src/mesa/drivers/dri/intel/intel_span.c
@@ -29,6 +29,7 @@
 #include "main/macros.h"
 #include "main/mtypes.h"
 #include "main/colormac.h"
+#include "main/texformat.h"
 
 #include "intel_buffers.h"
 #include "intel_fbo.h"
@@ -313,6 +314,22 @@ static uint32_t y_tile_swizzle(struct intel_renderbuffer *irb,
 #define INTEL_TAG(x) x##_RGB565
 #include "intel_spantmp.h"
 
+/* a4r4g4b4 color span and pixel functions */
+#define INTEL_PIXEL_FMT GL_BGRA
+#define INTEL_PIXEL_TYPE GL_UNSIGNED_SHORT_4_4_4_4_REV
+#define INTEL_READ_VALUE(offset) pread_16(irb, offset)
+#define INTEL_WRITE_VALUE(offset, v) pwrite_16(irb, offset, v)
+#define INTEL_TAG(x) x##_ARGB4444
+#include "intel_spantmp.h"
+
+/* a1r5g5b5 color span and pixel functions */
+#define INTEL_PIXEL_FMT GL_BGRA
+#define INTEL_PIXEL_TYPE GL_UNSIGNED_SHORT_1_5_5_5_REV
+#define INTEL_READ_VALUE(offset) pread_16(irb, offset)
+#define INTEL_WRITE_VALUE(offset, v) pwrite_16(irb, offset, v)
+#define INTEL_TAG(x) x##_ARGB1555
+#include "intel_spantmp.h"
+
 /* a8r8g8b8 color span and pixel functions */
 #define INTEL_PIXEL_FMT GL_BGRA
 #define INTEL_PIXEL_TYPE GL_UNSIGNED_INT_8_8_8_8_REV
@@ -561,8 +578,8 @@ intel_set_span_functions(struct intel_context *intel,
    else
       tiling = I915_TILING_NONE;
 
-   if (rb->_ActualFormat == GL_RGB5) {
-      /* 565 RGB */
+   switch (irb->texformat->MesaFormat) {
+   case MESA_FORMAT_RGB565:
       switch (tiling) {
       case I915_TILING_NONE:
       default:
@@ -575,38 +592,67 @@ intel_set_span_functions(struct intel_context *intel,
 	 intel_YTile_InitPointers_RGB565(rb);
 	 break;
       }
-   }
-   else if (rb->_ActualFormat == GL_RGB8) {
-      /* 8888 RGBx */
+      break;
+   case MESA_FORMAT_ARGB4444:
       switch (tiling) {
       case I915_TILING_NONE:
       default:
-	 intelInitPointers_xRGB8888(rb);
+	 intelInitPointers_ARGB4444(rb);
 	 break;
       case I915_TILING_X:
-	 intel_XTile_InitPointers_xRGB8888(rb);
+	 intel_XTile_InitPointers_ARGB4444(rb);
 	 break;
       case I915_TILING_Y:
-	 intel_YTile_InitPointers_xRGB8888(rb);
+	 intel_YTile_InitPointers_ARGB4444(rb);
 	 break;
       }
-   }
-   else if (rb->_ActualFormat == GL_RGBA8) {
-      /* 8888 RGBA */
+      break;
+   case MESA_FORMAT_ARGB1555:
       switch (tiling) {
       case I915_TILING_NONE:
       default:
-	 intelInitPointers_ARGB8888(rb);
+	 intelInitPointers_ARGB1555(rb);
 	 break;
       case I915_TILING_X:
-	 intel_XTile_InitPointers_ARGB8888(rb);
+	 intel_XTile_InitPointers_ARGB1555(rb);
 	 break;
       case I915_TILING_Y:
-	 intel_YTile_InitPointers_ARGB8888(rb);
+	 intel_YTile_InitPointers_ARGB1555(rb);
 	 break;
       }
-   }
-   else if (rb->_ActualFormat == GL_DEPTH_COMPONENT16) {
+      break;
+   case MESA_FORMAT_ARGB8888:
+      if (rb->AlphaBits == 0) { /* XXX: Need xRGB8888 Mesa format */
+	 /* 8888 RGBx */
+	 switch (tiling) {
+	 case I915_TILING_NONE:
+	 default:
+	    intelInitPointers_xRGB8888(rb);
+	    break;
+	 case I915_TILING_X:
+	    intel_XTile_InitPointers_xRGB8888(rb);
+	    break;
+	 case I915_TILING_Y:
+	    intel_YTile_InitPointers_xRGB8888(rb);
+	    break;
+	 }
+      } else {
+	 /* 8888 RGBA */
+	 switch (tiling) {
+	 case I915_TILING_NONE:
+	 default:
+	    intelInitPointers_ARGB8888(rb);
+	    break;
+	 case I915_TILING_X:
+	    intel_XTile_InitPointers_ARGB8888(rb);
+	    break;
+	 case I915_TILING_Y:
+	    intel_YTile_InitPointers_ARGB8888(rb);
+	    break;
+	 }
+      }
+      break;
+   case MESA_FORMAT_Z16:
       switch (tiling) {
       case I915_TILING_NONE:
       default:
@@ -619,51 +665,57 @@ intel_set_span_functions(struct intel_context *intel,
 	 intel_YTile_InitDepthPointers_z16(rb);
 	 break;
       }
-   }
-   else if (rb->_ActualFormat == GL_DEPTH_COMPONENT24) {
-      switch (tiling) {
-      case I915_TILING_NONE:
-      default:
-	 intelInitDepthPointers_z24(rb);
-	 break;
-      case I915_TILING_X:
-	 intel_XTile_InitDepthPointers_z24(rb);
-	 break;
-      case I915_TILING_Y:
-	 intel_YTile_InitDepthPointers_z24(rb);
-	 break;
-      }
-   }
-   else if (rb->_ActualFormat == GL_DEPTH24_STENCIL8_EXT) {
-      switch (tiling) {
-      case I915_TILING_NONE:
-      default:
-	 intelInitDepthPointers_z24_s8(rb);
-	 break;
-      case I915_TILING_X:
-	 intel_XTile_InitDepthPointers_z24_s8(rb);
-	 break;
-      case I915_TILING_Y:
-	 intel_YTile_InitDepthPointers_z24_s8(rb);
-	 break;
-      }
-   }
-   else if (rb->_ActualFormat == GL_STENCIL_INDEX8_EXT) {
-      switch (tiling) {
-      case I915_TILING_NONE:
-      default:
-	 intelInitStencilPointers_z24_s8(rb);
-	 break;
-      case I915_TILING_X:
-	 intel_XTile_InitStencilPointers_z24_s8(rb);
-	 break;
-      case I915_TILING_Y:
-	 intel_YTile_InitStencilPointers_z24_s8(rb);
-	 break;
+      break;
+   case MESA_FORMAT_S8_Z24:
+      /* There are a few different ways SW asks us to access the S8Z24 data:
+       * Z24 depth-only depth reads
+       * S8Z24 depth reads
+       * S8Z24 stencil reads.
+       */
+      if (rb->_ActualFormat == GL_DEPTH_COMPONENT24) {
+	 switch (tiling) {
+	 case I915_TILING_NONE:
+	 default:
+	    intelInitDepthPointers_z24(rb);
+	    break;
+	 case I915_TILING_X:
+	    intel_XTile_InitDepthPointers_z24(rb);
+	    break;
+	 case I915_TILING_Y:
+	    intel_YTile_InitDepthPointers_z24(rb);
+	    break;
+	 }
+      } else if (rb->_ActualFormat == GL_DEPTH24_STENCIL8_EXT) {
+	 switch (tiling) {
+	 case I915_TILING_NONE:
+	 default:
+	    intelInitDepthPointers_z24_s8(rb);
+	    break;
+	 case I915_TILING_X:
+	    intel_XTile_InitDepthPointers_z24_s8(rb);
+	    break;
+	 case I915_TILING_Y:
+	    intel_YTile_InitDepthPointers_z24_s8(rb);
+	    break;
+	 }
+      } else if (rb->_ActualFormat == GL_STENCIL_INDEX8_EXT) {
+	 switch (tiling) {
+	 case I915_TILING_NONE:
+	 default:
+	    intelInitStencilPointers_z24_s8(rb);
+	    break;
+	 case I915_TILING_X:
+	    intel_XTile_InitStencilPointers_z24_s8(rb);
+	    break;
+	 case I915_TILING_Y:
+	    intel_YTile_InitStencilPointers_z24_s8(rb);
+	    break;
+	 }
       }
-   }
-   else {
+      break;
+   default:
       _mesa_problem(NULL,
-                    "Unexpected _ActualFormat in intelSetSpanFunctions");
+                    "Unexpected MesaFormat in intelSetSpanFunctions");
+      break;
    }
 }
diff --git a/src/mesa/drivers/dri/intel/intel_tex.c b/src/mesa/drivers/dri/intel/intel_tex.c
index ae0994b183a..df63f29a42c 100644
--- a/src/mesa/drivers/dri/intel/intel_tex.c
+++ b/src/mesa/drivers/dri/intel/intel_tex.c
@@ -158,81 +158,11 @@ timed_memcpy(void *dest, const void *src, size_t n)
 }
 #endif /* DO_DEBUG */
 
-/**
- * Generate new mipmap data from BASE+1 to BASE+p (the minimally-sized mipmap
- * level).
- *
- * The texture object's miptree must be mapped.
- *
- * It would be really nice if this was just called by Mesa whenever mipmaps
- * needed to be regenerated, rather than us having to remember to do so in
- * each texture image modification path.
- *
- * This function should also include an accelerated path.
- */
-void
-intel_generate_mipmap(GLcontext *ctx, GLenum target,
-                      struct gl_texture_object *texObj)
-{
-   struct intel_context *intel = intel_context(ctx);
-   struct intel_texture_object *intelObj = intel_texture_object(texObj);
-   GLuint nr_faces = (intelObj->base.Target == GL_TEXTURE_CUBE_MAP) ? 6 : 1;
-   int face, i;
-
-   _mesa_generate_mipmap(ctx, target, texObj);
-
-   /* Update the level information in our private data in the new images, since
-    * it didn't get set as part of a normal TexImage path.
-    */
-   for (face = 0; face < nr_faces; face++) {
-      for (i = texObj->BaseLevel + 1; i < texObj->MaxLevel; i++) {
-         struct intel_texture_image *intelImage;
-
-	 intelImage = intel_texture_image(texObj->Image[face][i]);
-	 if (intelImage == NULL)
-	    break;
-
-	 intelImage->level = i;
-	 intelImage->face = face;
-	 /* Unreference the miptree to signal that the new Data is a bare
-	  * pointer from mesa.
-	  */
-	 intel_miptree_release(intel, &intelImage->mt);
-      }
-   }
-}
-
-static void intelGenerateMipmap(GLcontext *ctx, GLenum target, struct gl_texture_object *texObj)
-{
-   struct intel_context *intel = intel_context(ctx);
-   struct intel_texture_object *intelObj = intel_texture_object(texObj);
-
-   intel_tex_map_level_images(intel, intelObj, texObj->BaseLevel);
-   intel_generate_mipmap(ctx, target, texObj);
-   intel_tex_unmap_level_images(intel, intelObj, texObj->BaseLevel);
-}
-
 void
 intelInitTextureFuncs(struct dd_function_table *functions)
 {
    functions->ChooseTextureFormat = intelChooseTextureFormat;
-   functions->TexImage1D = intelTexImage1D;
-   functions->TexImage2D = intelTexImage2D;
-   functions->TexImage3D = intelTexImage3D;
-   functions->TexSubImage1D = intelTexSubImage1D;
-   functions->TexSubImage2D = intelTexSubImage2D;
-   functions->TexSubImage3D = intelTexSubImage3D;
-   functions->CopyTexImage1D = intelCopyTexImage1D;
-   functions->CopyTexImage2D = intelCopyTexImage2D;
-   functions->CopyTexSubImage1D = intelCopyTexSubImage1D;
-   functions->CopyTexSubImage2D = intelCopyTexSubImage2D;
-   functions->GetTexImage = intelGetTexImage;
-   functions->GenerateMipmap = intelGenerateMipmap;
-
-   /* compressed texture functions */
-   functions->CompressedTexImage2D = intelCompressedTexImage2D;
-   functions->CompressedTexSubImage2D = intelCompressedTexSubImage2D;
-   functions->GetCompressedTexImage = intelGetCompressedTexImage;
+   functions->GenerateMipmap = intel_generate_mipmap;
 
    functions->NewTextureObject = intelNewTextureObject;
    functions->NewTextureImage = intelNewTextureImage;
diff --git a/src/mesa/drivers/dri/intel/intel_tex.h b/src/mesa/drivers/dri/intel/intel_tex.h
index f5372d82fb2..471aa2a240b 100644
--- a/src/mesa/drivers/dri/intel/intel_tex.h
+++ b/src/mesa/drivers/dri/intel/intel_tex.h
@@ -35,116 +35,17 @@
 
 void intelInitTextureFuncs(struct dd_function_table *functions);
 
+void intelInitTextureImageFuncs(struct dd_function_table *functions);
+
+void intelInitTextureSubImageFuncs(struct dd_function_table *functions);
+
+void intelInitTextureCopyImageFuncs(struct dd_function_table *functions);
+
 const struct gl_texture_format *intelChooseTextureFormat(GLcontext * ctx,
                                                          GLint internalFormat,
                                                          GLenum format,
                                                          GLenum type);
 
-
-void intelTexImage3D(GLcontext * ctx,
-                     GLenum target, GLint level,
-                     GLint internalFormat,
-                     GLint width, GLint height, GLint depth,
-                     GLint border,
-                     GLenum format, GLenum type, const void *pixels,
-                     const struct gl_pixelstore_attrib *packing,
-                     struct gl_texture_object *texObj,
-                     struct gl_texture_image *texImage);
-
-void intelTexSubImage3D(GLcontext * ctx,
-                        GLenum target,
-                        GLint level,
-                        GLint xoffset, GLint yoffset, GLint zoffset,
-                        GLsizei width, GLsizei height, GLsizei depth,
-                        GLenum format, GLenum type,
-                        const GLvoid * pixels,
-                        const struct gl_pixelstore_attrib *packing,
-                        struct gl_texture_object *texObj,
-                        struct gl_texture_image *texImage);
-
-void intelTexImage2D(GLcontext * ctx,
-                     GLenum target, GLint level,
-                     GLint internalFormat,
-                     GLint width, GLint height, GLint border,
-                     GLenum format, GLenum type, const void *pixels,
-                     const struct gl_pixelstore_attrib *packing,
-                     struct gl_texture_object *texObj,
-                     struct gl_texture_image *texImage);
-
-void intelTexSubImage2D(GLcontext * ctx,
-                        GLenum target,
-                        GLint level,
-                        GLint xoffset, GLint yoffset,
-                        GLsizei width, GLsizei height,
-                        GLenum format, GLenum type,
-                        const GLvoid * pixels,
-                        const struct gl_pixelstore_attrib *packing,
-                        struct gl_texture_object *texObj,
-                        struct gl_texture_image *texImage);
-
-void intelTexImage1D(GLcontext * ctx,
-                     GLenum target, GLint level,
-                     GLint internalFormat,
-                     GLint width, GLint border,
-                     GLenum format, GLenum type, const void *pixels,
-                     const struct gl_pixelstore_attrib *packing,
-                     struct gl_texture_object *texObj,
-                     struct gl_texture_image *texImage);
-
-void intelTexSubImage1D(GLcontext * ctx,
-                        GLenum target,
-                        GLint level,
-                        GLint xoffset,
-                        GLsizei width,
-                        GLenum format, GLenum type,
-                        const GLvoid * pixels,
-                        const struct gl_pixelstore_attrib *packing,
-                        struct gl_texture_object *texObj,
-                        struct gl_texture_image *texImage);
-
-void intelCopyTexImage1D(GLcontext * ctx, GLenum target, GLint level,
-                         GLenum internalFormat,
-                         GLint x, GLint y, GLsizei width, GLint border);
-
-void intelCopyTexImage2D(GLcontext * ctx, GLenum target, GLint level,
-                         GLenum internalFormat,
-                         GLint x, GLint y, GLsizei width, GLsizei height,
-                         GLint border);
-
-void intelCopyTexSubImage1D(GLcontext * ctx, GLenum target, GLint level,
-                            GLint xoffset, GLint x, GLint y, GLsizei width);
-
-void intelCopyTexSubImage2D(GLcontext * ctx, GLenum target, GLint level,
-                            GLint xoffset, GLint yoffset,
-                            GLint x, GLint y, GLsizei width, GLsizei height);
-
-void intelGetTexImage(GLcontext * ctx, GLenum target, GLint level,
-                      GLenum format, GLenum type, GLvoid * pixels,
-                      struct gl_texture_object *texObj,
-                      struct gl_texture_image *texImage);
-
-void intelCompressedTexImage2D( GLcontext *ctx, GLenum target, GLint level,
-				GLint internalFormat,
-				GLint width, GLint height, GLint border,
-				GLsizei imageSize, const GLvoid *data,
-				struct gl_texture_object *texObj,
-				struct gl_texture_image *texImage );
-
-void intelCompressedTexSubImage2D(GLcontext * ctx,
-				  GLenum target,
-				  GLint level,
-				  GLint xoffset, GLint yoffset,
-				  GLsizei width, GLsizei height,
-				  GLenum format, GLsizei imageSize,
-				  const GLvoid * pixels,
-				  struct gl_texture_object *texObj,
-				  struct gl_texture_image *texImage);
-
-void intelGetCompressedTexImage(GLcontext *ctx, GLenum target, GLint level,
-				GLvoid *pixels,
-				struct gl_texture_object *texObj,
-				struct gl_texture_image *texImage);
-
 void intelSetTexOffset(__DRIcontext *pDRICtx, GLint texname,
 		       unsigned long long offset, GLint depth, GLuint pitch);
 void intelSetTexBuffer(__DRIcontext *pDRICtx,
diff --git a/src/mesa/drivers/dri/intel/intel_tex_copy.c b/src/mesa/drivers/dri/intel/intel_tex_copy.c
index 08437aa0e2b..260235b1eb1 100644
--- a/src/mesa/drivers/dri/intel/intel_tex_copy.c
+++ b/src/mesa/drivers/dri/intel/intel_tex_copy.c
@@ -118,8 +118,12 @@ do_copy_texsubimage(struct intel_context *intel,
       dstx += x - orig_x;
       dsty += y - orig_y;
 
-      /* image_offset may be non-page-aligned, but that's illegal for tiling. */
-      assert(intelImage->mt->region->tiling == I915_TILING_NONE);
+      /* Can't blit to tiled buffers with non-tile-aligned offset. */
+      if (intelImage->mt->region->tiling != I915_TILING_NONE &&
+	  (image_offset & 4095) != 0) {
+	 UNLOCK_HARDWARE(intel);
+	 return GL_FALSE;
+      }
 
       if (ctx->ReadBuffer->Name == 0) {
 	 /* reading from a window, adjust x, y */
@@ -158,17 +162,14 @@ do_copy_texsubimage(struct intel_context *intel,
 
    /* GL_SGIS_generate_mipmap */
    if (intelImage->level == texObj->BaseLevel && texObj->GenerateMipmap) {
-      ctx->Driver.GenerateMipmap(ctx, target, texObj);
+      intel_generate_mipmap(ctx, target, texObj);
    }
 
    return GL_TRUE;
 }
 
 
-
-
-
-void
+static void
 intelCopyTexImage1D(GLcontext * ctx, GLenum target, GLint level,
                     GLenum internalFormat,
                     GLint x, GLint y, GLsizei width, GLint border)
@@ -214,7 +215,8 @@ intelCopyTexImage1D(GLcontext * ctx, GLenum target, GLint level,
                            width, border);
 }
 
-void
+
+static void
 intelCopyTexImage2D(GLcontext * ctx, GLenum target, GLint level,
                     GLenum internalFormat,
                     GLint x, GLint y, GLsizei width, GLsizei height,
@@ -231,6 +233,14 @@ intelCopyTexImage2D(GLcontext * ctx, GLenum target, GLint level,
    if (border)
       goto fail;
 
+   /* Setup or redefine the texture object, mipmap tree and texture
+    * image.  Don't populate yet.
+    */
+   ctx->Driver.TexImage2D(ctx, target, level, internalFormat,
+                          width, height, border,
+                          GL_RGBA, CHAN_TYPE, NULL,
+                          &ctx->DefaultPacking, texObj, texImage);
+
    srcx = x;
    srcy = y;
    dstx = 0;
@@ -241,15 +251,6 @@ intelCopyTexImage2D(GLcontext * ctx, GLenum target, GLint level,
 				   &width, &height))
       return;
 
-   /* Setup or redefine the texture object, mipmap tree and texture
-    * image.  Don't populate yet.  
-    */
-   ctx->Driver.TexImage2D(ctx, target, level, internalFormat,
-                          width, height, border,
-                          GL_RGBA, CHAN_TYPE, NULL,
-                          &ctx->DefaultPacking, texObj, texImage);
-
-
    if (!do_copy_texsubimage(intel_context(ctx), target,
                             intel_texture_image(texImage),
                             internalFormat, 0, 0, x, y, width, height))
@@ -263,7 +264,7 @@ intelCopyTexImage2D(GLcontext * ctx, GLenum target, GLint level,
 }
 
 
-void
+static void
 intelCopyTexSubImage1D(GLcontext * ctx, GLenum target, GLint level,
                        GLint xoffset, GLint x, GLint y, GLsizei width)
 {
@@ -288,8 +289,7 @@ intelCopyTexSubImage1D(GLcontext * ctx, GLenum target, GLint level,
 }
 
 
-
-void
+static void
 intelCopyTexSubImage2D(GLcontext * ctx, GLenum target, GLint level,
                        GLint xoffset, GLint yoffset,
                        GLint x, GLint y, GLsizei width, GLsizei height)
@@ -302,7 +302,6 @@ intelCopyTexSubImage2D(GLcontext * ctx, GLenum target, GLint level,
       _mesa_select_tex_image(ctx, texObj, target, level);
    GLenum internalFormat = texImage->InternalFormat;
 
-
    /* Need to check texture is compatible with source format. 
     */
 
@@ -317,3 +316,13 @@ intelCopyTexSubImage2D(GLcontext * ctx, GLenum target, GLint level,
                                  xoffset, yoffset, x, y, width, height);
    }
 }
+
+
+void
+intelInitTextureCopyImageFuncs(struct dd_function_table *functions)
+{
+   functions->CopyTexImage1D = intelCopyTexImage1D;
+   functions->CopyTexImage2D = intelCopyTexImage2D;
+   functions->CopyTexSubImage1D = intelCopyTexSubImage1D;
+   functions->CopyTexSubImage2D = intelCopyTexSubImage2D;
+}
diff --git a/src/mesa/drivers/dri/intel/intel_tex_image.c b/src/mesa/drivers/dri/intel/intel_tex_image.c
index 71561cf85cd..e9a3823078a 100644
--- a/src/mesa/drivers/dri/intel/intel_tex_image.c
+++ b/src/mesa/drivers/dri/intel/intel_tex_image.c
@@ -208,7 +208,7 @@ try_pbo_upload(struct intel_context *intel,
    if (!pbo ||
        intel->ctx._ImageTransferState ||
        unpack->SkipPixels || unpack->SkipRows) {
-      _mesa_printf("%s: failure 1\n", __FUNCTION__);
+      DBG("%s: failure 1\n", __FUNCTION__);
       return GL_FALSE;
    }
 
@@ -248,7 +248,6 @@ try_pbo_upload(struct intel_context *intel,
 }
 
 
-
 static GLboolean
 try_pbo_zcopy(struct intel_context *intel,
               struct intel_texture_image *intelImage,
@@ -264,7 +263,7 @@ try_pbo_zcopy(struct intel_context *intel,
    if (!pbo ||
        intel->ctx._ImageTransferState ||
        unpack->SkipPixels || unpack->SkipRows) {
-      _mesa_printf("%s: failure 1\n", __FUNCTION__);
+      DBG("%s: failure 1\n", __FUNCTION__);
       return GL_FALSE;
    }
 
@@ -283,7 +282,7 @@ try_pbo_zcopy(struct intel_context *intel,
    dst_stride = intelImage->mt->pitch;
 
    if (src_stride != dst_stride || dst_offset != 0 || src_offset != 0) {
-      _mesa_printf("%s: failure 2\n", __FUNCTION__);
+      DBG("%s: failure 2\n", __FUNCTION__);
       return GL_FALSE;
    }
 
@@ -293,10 +292,6 @@ try_pbo_zcopy(struct intel_context *intel,
 }
 
 
-
-
-
-
 static void
 intelTexImage(GLcontext * ctx,
               GLint dims,
@@ -307,7 +302,8 @@ intelTexImage(GLcontext * ctx,
               GLenum format, GLenum type, const void *pixels,
               const struct gl_pixelstore_attrib *unpack,
               struct gl_texture_object *texObj,
-              struct gl_texture_image *texImage, GLsizei imageSize, int compressed)
+              struct gl_texture_image *texImage, GLsizei imageSize,
+              GLboolean compressed)
 {
    struct intel_context *intel = intel_context(ctx);
    struct intel_texture_object *intelObj = intel_texture_object(texObj);
@@ -315,8 +311,7 @@ intelTexImage(GLcontext * ctx,
    GLint postConvWidth = width;
    GLint postConvHeight = height;
    GLint texelBytes, sizeInBytes;
-   GLuint dstRowStride, srcRowStride = texImage->RowStride;
-
+   GLuint dstRowStride = 0, srcRowStride = texImage->RowStride;
 
    DBG("%s target %s level %d %dx%dx%d border %d\n", __FUNCTION__,
        _mesa_lookup_enum_by_nr(target), level, width, height, depth, border);
@@ -464,8 +459,6 @@ intelTexImage(GLcontext * ctx,
       DBG("pbo upload failed\n");
    }
 
-
-
    /* intelCopyTexImage calls this function with pixels == NULL, with
     * the expectation that the mipmap tree will be set up but nothing
     * more will be done.  This is where those calls return:
@@ -483,7 +476,7 @@ intelTexImage(GLcontext * ctx,
    LOCK_HARDWARE(intel);
 
    if (intelImage->mt) {
-      if (pixels)
+      if (pixels != NULL)
          texImage->Data = intel_miptree_image_map(intel,
                                                   intelImage->mt,
                                                   intelImage->face,
@@ -509,8 +502,9 @@ intelTexImage(GLcontext * ctx,
    }
 
    DBG("Upload image %dx%dx%d row_len %d "
-       "pitch %d\n",
-       width, height, depth, width * texelBytes, dstRowStride);
+       "pitch %d pixels %d compressed %d\n",
+       width, height, depth, width * texelBytes, dstRowStride,
+       pixels ? 1 : 0, compressed);
 
    /* Copy data.  Would like to know when it's ok for us to eg. use
     * the blitter to copy.  Or, use the hardware to do the format
@@ -523,7 +517,7 @@ intelTexImage(GLcontext * ctx,
 	       _mesa_copy_rect(texImage->Data, dst->cpp, dst->pitch,
 			       0, 0,
 			       intelImage->mt->level[level].width,
-			       intelImage->mt->level[level].height/4,
+			       (intelImage->mt->level[level].height+3)/4,
 			       pixels,
 			       srcRowStride,
 			       0, 0);
@@ -539,25 +533,26 @@ intelTexImage(GLcontext * ctx,
 						   format, type, pixels, unpack)) {
 	   _mesa_error(ctx, GL_OUT_OF_MEMORY, "glTexImage");
        }
-
-       /* GL_SGIS_generate_mipmap */
-       if (level == texObj->BaseLevel && texObj->GenerateMipmap) {
-	  intel_generate_mipmap(ctx, target, texObj);
-       }
    }
 
    _mesa_unmap_teximage_pbo(ctx, unpack);
 
    if (intelImage->mt) {
-      if (pixels)
+      if (pixels != NULL)
          intel_miptree_image_unmap(intel, intelImage->mt);
       texImage->Data = NULL;
    }
 
    UNLOCK_HARDWARE(intel);
+
+   /* GL_SGIS_generate_mipmap */
+   if (level == texObj->BaseLevel && texObj->GenerateMipmap) {
+      intel_generate_mipmap(ctx, target, texObj);
+   }
 }
 
-void
+
+static void
 intelTexImage3D(GLcontext * ctx,
                 GLenum target, GLint level,
                 GLint internalFormat,
@@ -570,11 +565,11 @@ intelTexImage3D(GLcontext * ctx,
 {
    intelTexImage(ctx, 3, target, level,
                  internalFormat, width, height, depth, border,
-                 format, type, pixels, unpack, texObj, texImage, 0, 0);
+                 format, type, pixels, unpack, texObj, texImage, 0, GL_FALSE);
 }
 
 
-void
+static void
 intelTexImage2D(GLcontext * ctx,
                 GLenum target, GLint level,
                 GLint internalFormat,
@@ -586,10 +581,11 @@ intelTexImage2D(GLcontext * ctx,
 {
    intelTexImage(ctx, 2, target, level,
                  internalFormat, width, height, 1, border,
-                 format, type, pixels, unpack, texObj, texImage, 0, 0);
+                 format, type, pixels, unpack, texObj, texImage, 0, GL_FALSE);
 }
 
-void
+
+static void
 intelTexImage1D(GLcontext * ctx,
                 GLenum target, GLint level,
                 GLint internalFormat,
@@ -601,21 +597,24 @@ intelTexImage1D(GLcontext * ctx,
 {
    intelTexImage(ctx, 1, target, level,
                  internalFormat, width, 1, 1, border,
-                 format, type, pixels, unpack, texObj, texImage, 0, 0);
+                 format, type, pixels, unpack, texObj, texImage, 0, GL_FALSE);
 }
 
-void intelCompressedTexImage2D( GLcontext *ctx, GLenum target, GLint level,
-				GLint internalFormat,
-				GLint width, GLint height, GLint border,
-				GLsizei imageSize, const GLvoid *data,
-				struct gl_texture_object *texObj,
-				struct gl_texture_image *texImage )
+
+static void
+intelCompressedTexImage2D( GLcontext *ctx, GLenum target, GLint level,
+                           GLint internalFormat,
+                           GLint width, GLint height, GLint border,
+                           GLsizei imageSize, const GLvoid *data,
+                           struct gl_texture_object *texObj,
+                           struct gl_texture_image *texImage )
 {
    intelTexImage(ctx, 2, target, level,
 		 internalFormat, width, height, 1, border,
-		 0, 0, data, &ctx->Unpack, texObj, texImage, imageSize, 1);
+		 0, 0, data, &ctx->Unpack, texObj, texImage, imageSize, GL_TRUE);
 }
 
+
 /**
  * Need to map texture image into memory before copying image data,
  * then unmap it.
@@ -624,11 +623,17 @@ static void
 intel_get_tex_image(GLcontext * ctx, GLenum target, GLint level,
 		    GLenum format, GLenum type, GLvoid * pixels,
 		    struct gl_texture_object *texObj,
-		    struct gl_texture_image *texImage, int compressed)
+		    struct gl_texture_image *texImage, GLboolean compressed)
 {
    struct intel_context *intel = intel_context(ctx);
    struct intel_texture_image *intelImage = intel_texture_image(texImage);
 
+   /* If we're reading from a texture that has been rendered to, need to
+    * make sure rendering is complete.
+    * We could probably predicate this on texObj->_RenderToTexture
+    */
+   intelFlush(ctx);
+
    /* Map */
    if (intelImage->mt) {
       /* Image is stored in hardware format in a buffer managed by the
@@ -672,28 +677,29 @@ intel_get_tex_image(GLcontext * ctx, GLenum target, GLint level,
    }
 }
 
-void
+
+static void
 intelGetTexImage(GLcontext * ctx, GLenum target, GLint level,
                  GLenum format, GLenum type, GLvoid * pixels,
                  struct gl_texture_object *texObj,
                  struct gl_texture_image *texImage)
 {
    intel_get_tex_image(ctx, target, level, format, type, pixels,
-		       texObj, texImage, 0);
-
-
+		       texObj, texImage, GL_FALSE);
 }
 
-void
+
+static void
 intelGetCompressedTexImage(GLcontext *ctx, GLenum target, GLint level,
 			   GLvoid *pixels,
 			   struct gl_texture_object *texObj,
 			   struct gl_texture_image *texImage)
 {
    intel_get_tex_image(ctx, target, level, 0, 0, pixels,
-		       texObj, texImage, 1);
+		       texObj, texImage, GL_TRUE);
 }
 
+
 void
 intelSetTexOffset(__DRIcontext *pDRICtx, GLint texname,
 		  unsigned long long offset, GLint depth, GLuint pitch)
@@ -802,3 +808,16 @@ intelSetTexBuffer(__DRIcontext *pDRICtx, GLint target, __DRIdrawable *dPriv)
     */
    intelSetTexBuffer2(pDRICtx, target, GLX_TEXTURE_FORMAT_RGBA_EXT, dPriv);
 }
+
+
+void
+intelInitTextureImageFuncs(struct dd_function_table *functions)
+{
+   functions->TexImage1D = intelTexImage1D;
+   functions->TexImage2D = intelTexImage2D;
+   functions->TexImage3D = intelTexImage3D;
+   functions->GetTexImage = intelGetTexImage;
+
+   functions->CompressedTexImage2D = intelCompressedTexImage2D;
+   functions->GetCompressedTexImage = intelGetCompressedTexImage;
+}
diff --git a/src/mesa/drivers/dri/intel/intel_tex_layout.c b/src/mesa/drivers/dri/intel/intel_tex_layout.c
index e6f9a417790..2c1b722b7f5 100644
--- a/src/mesa/drivers/dri/intel/intel_tex_layout.c
+++ b/src/mesa/drivers/dri/intel/intel_tex_layout.c
@@ -52,7 +52,9 @@ GLuint intel_compressed_alignment(GLenum internalFormat)
     return alignment;
 }
 
-void i945_miptree_layout_2d( struct intel_context *intel, struct intel_mipmap_tree *mt )
+void i945_miptree_layout_2d( struct intel_context *intel,
+			     struct intel_mipmap_tree *mt,
+			     uint32_t tiling )
 {
    GLint align_h = 2, align_w = 4;
    GLuint level;
@@ -92,7 +94,7 @@ void i945_miptree_layout_2d( struct intel_context *intel, struct intel_mipmap_tr
    /* Pitch must be a whole number of dwords, even though we
     * express it in texels.
     */
-   mt->pitch = intel_miptree_pitch_align (intel, mt, mt->pitch);
+   mt->pitch = intel_miptree_pitch_align (intel, mt, tiling, mt->pitch);
    mt->total_height = 0;
 
    for ( level = mt->first_level ; level <= mt->last_level ; level++ ) {
diff --git a/src/mesa/drivers/dri/intel/intel_tex_layout.h b/src/mesa/drivers/dri/intel/intel_tex_layout.h
index dbc90e6f9b7..7bc25b6bcb1 100644
--- a/src/mesa/drivers/dri/intel/intel_tex_layout.h
+++ b/src/mesa/drivers/dri/intel/intel_tex_layout.h
@@ -38,5 +38,7 @@ static GLuint minify( GLuint d )
    return MAX2(1, d>>1);
 }
 
-extern void i945_miptree_layout_2d( struct intel_context *intel, struct intel_mipmap_tree *mt );
+extern void i945_miptree_layout_2d(struct intel_context *intel,
+				   struct intel_mipmap_tree *mt,
+				   uint32_t tiling);
 extern GLuint intel_compressed_alignment(GLenum);
diff --git a/src/mesa/drivers/dri/intel/intel_tex_subimage.c b/src/mesa/drivers/dri/intel/intel_tex_subimage.c
index f86de568976..1f27131dac0 100644
--- a/src/mesa/drivers/dri/intel/intel_tex_subimage.c
+++ b/src/mesa/drivers/dri/intel/intel_tex_subimage.c
@@ -101,11 +101,6 @@ intelTexSubimage(GLcontext * ctx,
       _mesa_error(ctx, GL_OUT_OF_MEMORY, "intelTexSubImage");
    }
 
-   /* GL_SGIS_generate_mipmap */
-   if (level == texObj->BaseLevel && texObj->GenerateMipmap) {
-      intel_generate_mipmap(ctx, target, texObj);
-   }
-
    _mesa_unmap_teximage_pbo(ctx, packing);
 
    if (intelImage->mt) {
@@ -114,13 +109,15 @@ intelTexSubimage(GLcontext * ctx,
    }
 
    UNLOCK_HARDWARE(intel);
-}
-
-
 
+   /* GL_SGIS_generate_mipmap */
+   if (level == texObj->BaseLevel && texObj->GenerateMipmap) {
+      intel_generate_mipmap(ctx, target, texObj);
+   }
+}
 
 
-void
+static void
 intelTexSubImage3D(GLcontext * ctx,
                    GLenum target,
                    GLint level,
@@ -132,18 +129,15 @@ intelTexSubImage3D(GLcontext * ctx,
                    struct gl_texture_object *texObj,
                    struct gl_texture_image *texImage)
 {
-
    intelTexSubimage(ctx, 3,
                     target, level,
                     xoffset, yoffset, zoffset,
                     width, height, depth,
                     format, type, pixels, packing, texObj, texImage);
-
 }
 
 
-
-void
+static void
 intelTexSubImage2D(GLcontext * ctx,
                    GLenum target,
                    GLint level,
@@ -155,17 +149,15 @@ intelTexSubImage2D(GLcontext * ctx,
                    struct gl_texture_object *texObj,
                    struct gl_texture_image *texImage)
 {
-
    intelTexSubimage(ctx, 2,
                     target, level,
                     xoffset, yoffset, 0,
                     width, height, 1,
                     format, type, pixels, packing, texObj, texImage);
-
 }
 
 
-void
+static void
 intelTexSubImage1D(GLcontext * ctx,
                    GLenum target,
                    GLint level,
@@ -182,10 +174,9 @@ intelTexSubImage1D(GLcontext * ctx,
                     xoffset, 0, 0,
                     width, 1, 1,
                     format, type, pixels, packing, texObj, texImage);
-
 }
 
-void
+static void
 intelCompressedTexSubImage2D(GLcontext * ctx,
 			     GLenum target,
 			     GLint level,
@@ -199,3 +190,14 @@ intelCompressedTexSubImage2D(GLcontext * ctx,
    fprintf(stderr, "stubbed CompressedTexSubImage2D: %dx%d@%dx%d\n",
 	   width, height, xoffset, yoffset);
 }
+
+
+
+void
+intelInitTextureSubImageFuncs(struct dd_function_table *functions)
+{
+   functions->TexSubImage1D = intelTexSubImage1D;
+   functions->TexSubImage2D = intelTexSubImage2D;
+   functions->TexSubImage3D = intelTexSubImage3D;
+   functions->CompressedTexSubImage2D = intelCompressedTexSubImage2D;
+}
diff --git a/src/mesa/drivers/dri/intel/intel_tex_validate.c b/src/mesa/drivers/dri/intel/intel_tex_validate.c
index 05a375e1f3b..b5cb7597d16 100644
--- a/src/mesa/drivers/dri/intel/intel_tex_validate.c
+++ b/src/mesa/drivers/dri/intel/intel_tex_validate.c
@@ -241,7 +241,7 @@ intel_tex_map_level_images(struct intel_context *intel,
       struct intel_texture_image *intelImage =
 	 intel_texture_image(intelObj->base.Image[face][level]);
 
-      if (intelImage->mt) {
+      if (intelImage && intelImage->mt) {
 	 intelImage->base.Data =
 	    intel_miptree_image_map(intel,
 				    intelImage->mt,
@@ -268,7 +268,7 @@ intel_tex_unmap_level_images(struct intel_context *intel,
       struct intel_texture_image *intelImage =
 	 intel_texture_image(intelObj->base.Image[face][level]);
 
-      if (intelImage->mt) {
+      if (intelImage && intelImage->mt) {
 	 intel_miptree_image_unmap(intel, intelImage->mt);
 	 intelImage->base.Data = NULL;
       }
diff --git a/src/mesa/drivers/dri/r200/r200_cmdbuf.c b/src/mesa/drivers/dri/r200/r200_cmdbuf.c
index 3a11a448eca..df9dd83344c 100644
--- a/src/mesa/drivers/dri/r200/r200_cmdbuf.c
+++ b/src/mesa/drivers/dri/r200/r200_cmdbuf.c
@@ -105,6 +105,35 @@ void r200SetUpAtomList( r200ContextPtr rmesa )
    insert_at_tail( &rmesa->radeon.hw.atomlist, &rmesa->hw.vpi[1] );
 }
 
+void r200EmitScissor(r200ContextPtr rmesa)
+{
+    BATCH_LOCALS(&rmesa->radeon);
+    if (!rmesa->radeon.radeonScreen->kernel_mm) {
+       return;
+    }
+    if (rmesa->radeon.state.scissor.enabled) {
+        BEGIN_BATCH(8);
+        OUT_BATCH(CP_PACKET0(R200_RE_CNTL, 0));
+        OUT_BATCH(R200_SCISSOR_ENABLE | rmesa->hw.set.cmd[SET_RE_CNTL]);
+        OUT_BATCH(CP_PACKET0(R200_RE_AUX_SCISSOR_CNTL, 0));
+        OUT_BATCH(R200_SCISSOR_ENABLE_0);
+        OUT_BATCH(CP_PACKET0(R200_RE_SCISSOR_TL_0, 0));
+        OUT_BATCH((rmesa->radeon.state.scissor.rect.y1 << 16) |
+                  rmesa->radeon.state.scissor.rect.x1);
+        OUT_BATCH(CP_PACKET0(R200_RE_SCISSOR_BR_0, 0));
+        OUT_BATCH(((rmesa->radeon.state.scissor.rect.y2 - 1) << 16) |
+                  (rmesa->radeon.state.scissor.rect.x2 - 1));
+        END_BATCH();
+    } else {
+        BEGIN_BATCH(4);
+        OUT_BATCH(CP_PACKET0(R200_RE_CNTL, 0));
+        OUT_BATCH(rmesa->hw.set.cmd[SET_RE_CNTL] & ~R200_SCISSOR_ENABLE);
+        OUT_BATCH(CP_PACKET0(R200_RE_AUX_SCISSOR_CNTL, 0));
+        OUT_BATCH(0);
+        END_BATCH();
+    }
+}
+
 /* Fire a section of the retained (indexed_verts) buffer as a regular
  * primtive.  
  */
@@ -121,6 +150,7 @@ void r200EmitVbufPrim( r200ContextPtr rmesa,
    if (R200_DEBUG & (DEBUG_IOCTL|DEBUG_PRIMS))
       fprintf(stderr, "%s cmd_used/4: %d prim %x nr %d\n", __FUNCTION__,
 	      rmesa->store.cmd_used/4, primitive, vertex_nr);
+   r200EmitScissor(rmesa);
  
    BEGIN_BATCH(3);
    OUT_BATCH_PACKET3_CLIP(R200_CP_CMD_3D_DRAW_VBUF_2, 0);
@@ -134,9 +164,11 @@ static void r200FireEB(r200ContextPtr rmesa, int vertex_count, int type)
 	BATCH_LOCALS(&rmesa->radeon);
 
 	if (vertex_count > 0) {
+        r200EmitScissor(rmesa);
 		BEGIN_BATCH(8+2);
-		OUT_BATCH_PACKET3(R200_CP_CMD_3D_DRAW_INDX_2, 0);
+		OUT_BATCH_PACKET3_CLIP(R200_CP_CMD_3D_DRAW_INDX_2, 0);
 		OUT_BATCH(R200_VF_PRIM_WALK_IND |
+			  R200_VF_COLOR_ORDER_RGBA | 
 			  ((vertex_count + 0) << 16) |
 			  type);
 		
@@ -147,12 +179,12 @@ static void r200FireEB(r200ContextPtr rmesa, int vertex_count, int type)
 					rmesa->radeon.tcl.elt_dma_bo,
 					rmesa->radeon.tcl.elt_dma_offset,
 					RADEON_GEM_DOMAIN_GTT, 0, 0);
-			OUT_BATCH(vertex_count/2);
+			OUT_BATCH((vertex_count + 1)/2);
 		} else {
 			OUT_BATCH_PACKET3(R200_CP_CMD_INDX_BUFFER, 2);
 			OUT_BATCH((0x80 << 24) | (0 << 16) | 0x810);
 			OUT_BATCH(rmesa->radeon.tcl.elt_dma_offset);
-			OUT_BATCH(vertex_count/2);
+			OUT_BATCH((vertex_count + 1)/2);
 			radeon_cs_write_reloc(rmesa->radeon.cmdbuf.cs,
 					      rmesa->radeon.tcl.elt_dma_bo,
 					      RADEON_GEM_DOMAIN_GTT, 0, 0);
@@ -172,8 +204,6 @@ void r200FlushElts(GLcontext *ctx)
    assert( rmesa->radeon.dma.flush == r200FlushElts );
    rmesa->radeon.dma.flush = NULL;
 
-   elt_used = (elt_used + 2) & ~2;
-
    nr = elt_used / 2;
 
    radeon_bo_unmap(rmesa->radeon.tcl.elt_dma_bo);
@@ -209,6 +239,11 @@ GLushort *r200AllocEltsOpenEnded( r200ContextPtr rmesa,
    rmesa->radeon.tcl.elt_dma_offset = 0;
    rmesa->tcl.elt_used = min_nr * 2;
 
+   radeon_validate_bo(&rmesa->radeon, rmesa->radeon.tcl.elt_dma_bo,
+                      RADEON_GEM_DOMAIN_GTT, 0);
+   if (radeon_revalidate_bos(rmesa->radeon.glCtx) == GL_FALSE)
+      fprintf(stderr,"failure to revalidate BOs - badness\n");
+
    radeon_bo_map(rmesa->radeon.tcl.elt_dma_bo, 1);
    retval = rmesa->radeon.tcl.elt_dma_bo->ptr + rmesa->radeon.tcl.elt_dma_offset;
    
@@ -238,7 +273,7 @@ void r200EmitVertexAOS( r200ContextPtr rmesa,
 	      __FUNCTION__, vertex_size, offset);
 
 
-   BEGIN_BATCH(5);
+   BEGIN_BATCH(7);
    OUT_BATCH_PACKET3(R200_CP_CMD_3D_LOAD_VBPNTR, 2);
    OUT_BATCH(1);
    OUT_BATCH(vertex_size | (vertex_size << 8));
diff --git a/src/mesa/drivers/dri/r200/r200_context.c b/src/mesa/drivers/dri/r200/r200_context.c
index f80f0d8ac73..8924849d082 100644
--- a/src/mesa/drivers/dri/r200/r200_context.c
+++ b/src/mesa/drivers/dri/r200/r200_context.c
@@ -354,6 +354,10 @@ GLboolean r200CreateContext( const __GLcontextModes *glVisual,
 
    i = driQueryOptioni( &rmesa->radeon.optionCache, "allow_large_textures");
 
+   /* FIXME: When no memory manager is available we should set this 
+    * to some reasonable value based on texture memory pool size */
+   ctx->Const.MaxTextureLevels = 12;
+
    ctx->Const.MaxTextureMaxAnisotropy = 16.0;
 
    /* No wide AA points.
diff --git a/src/mesa/drivers/dri/r200/r200_ioctl.c b/src/mesa/drivers/dri/r200/r200_ioctl.c
index 0262aea880e..0b3398a730e 100644
--- a/src/mesa/drivers/dri/r200/r200_ioctl.c
+++ b/src/mesa/drivers/dri/r200/r200_ioctl.c
@@ -31,7 +31,7 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  * Authors:
  *   Keith Whitwell <[email protected]>
  */
- 
+
 #include <sched.h>
 #include <errno.h>
 
@@ -66,7 +66,7 @@ static void r200UserClear(GLcontext *ctx, GLuint mask)
 static void r200KernelClear(GLcontext *ctx, GLuint flags)
 {
    r200ContextPtr rmesa = R200_CONTEXT(ctx);
-   __DRIdrawablePrivate *dPriv = rmesa->radeon.dri.drawable;
+   __DRIdrawablePrivate *dPriv = radeon_get_drawable(&rmesa->radeon);
    GLint cx, cy, cw, ch, ret;
    GLuint i;
 
@@ -94,7 +94,7 @@ static void r200KernelClear(GLcontext *ctx, GLuint flags)
       if ( rmesa->radeon.sarea->last_clear - clear <= 25 ) {
 	 break;
       }
-      
+
       if (rmesa->radeon.do_usleeps) {
 	 UNLOCK_HARDWARE( &rmesa->radeon );
 	 DO_USLEEP( 1 );
@@ -190,7 +190,7 @@ static void r200KernelClear(GLcontext *ctx, GLuint flags)
 static void r200Clear( GLcontext *ctx, GLbitfield mask )
 {
    r200ContextPtr rmesa = R200_CONTEXT(ctx);
-   __DRIdrawablePrivate *dPriv = rmesa->radeon.dri.drawable;
+   __DRIdrawablePrivate *dPriv = radeon_get_drawable(&rmesa->radeon);
    GLuint flags = 0;
    GLuint color_mask = 0;
    GLuint orig_mask = mask;
@@ -202,7 +202,7 @@ static void r200Clear( GLcontext *ctx, GLbitfield mask )
    {
       LOCK_HARDWARE( &rmesa->radeon );
       UNLOCK_HARDWARE( &rmesa->radeon );
-      if ( dPriv->numClipRects == 0 ) 
+      if ( dPriv->numClipRects == 0 )
 	 return;
    }
 
@@ -236,7 +236,7 @@ static void r200Clear( GLcontext *ctx, GLbitfield mask )
       _swrast_Clear( ctx, mask );
    }
 
-   if ( !flags ) 
+   if ( !flags )
       return;
 
    if (rmesa->using_hyperz) {
@@ -267,7 +267,7 @@ static void r200Clear( GLcontext *ctx, GLbitfield mask )
  * device fd.
  */
 void *r200AllocateMemoryMESA(__DRIscreen *screen, GLsizei size,
-			     GLfloat readfreq, GLfloat writefreq, 
+			     GLfloat readfreq, GLfloat writefreq,
 			     GLfloat priority)
 {
    GET_CURRENT_CONTEXT(ctx);
@@ -277,7 +277,7 @@ void *r200AllocateMemoryMESA(__DRIscreen *screen, GLsizei size,
    int ret;
 
    if (R200_DEBUG & DEBUG_IOCTL)
-      fprintf(stderr, "%s sz %d %f/%f/%f\n", __FUNCTION__, size, readfreq, 
+      fprintf(stderr, "%s sz %d %f/%f/%f\n", __FUNCTION__, size, readfreq,
 	      writefreq, priority);
 
    if (!ctx || !(rmesa = R200_CONTEXT(ctx)) || !rmesa->radeon.radeonScreen->gartTextures.map)
@@ -294,12 +294,12 @@ void *r200AllocateMemoryMESA(__DRIscreen *screen, GLsizei size,
    ret = drmCommandWriteRead( rmesa->radeon.radeonScreen->driScreen->fd,
 			      DRM_RADEON_ALLOC,
 			      &alloc, sizeof(alloc));
-   
+
    if (ret) {
       fprintf(stderr, "%s: DRM_RADEON_ALLOC ret %d\n", __FUNCTION__, ret);
       return NULL;
    }
-   
+
    {
       char *region_start = (char *)rmesa->radeon.radeonScreen->gartTextures.map;
       return (void *)(region_start + region_offset);
@@ -326,7 +326,7 @@ void r200FreeMemoryMESA(__DRIscreen *screen, GLvoid *pointer)
 
    region_offset = (char *)pointer - (char *)rmesa->radeon.radeonScreen->gartTextures.map;
 
-   if (region_offset < 0 || 
+   if (region_offset < 0 ||
        region_offset > rmesa->radeon.radeonScreen->gartTextures.size) {
       fprintf(stderr, "offset %d outside range 0..%d\n", region_offset,
 	      rmesa->radeon.radeonScreen->gartTextures.size);
@@ -335,12 +335,12 @@ void r200FreeMemoryMESA(__DRIscreen *screen, GLvoid *pointer)
 
    memfree.region = RADEON_MEM_REGION_GART;
    memfree.region_offset = region_offset;
-   
+
    ret = drmCommandWrite( rmesa->radeon.radeonScreen->driScreen->fd,
 			  DRM_RADEON_FREE,
 			  &memfree, sizeof(memfree));
-   
-   if (ret) 
+
+   if (ret)
       fprintf(stderr, "%s: DRM_RADEON_FREE ret %d\n", __FUNCTION__, ret);
 }
 
@@ -374,7 +374,7 @@ GLboolean r200IsGartMemory( r200ContextPtr rmesa, const GLvoid *pointer,
 
    if (R200_DEBUG & DEBUG_IOCTL)
       fprintf(stderr, "r200IsGartMemory( %p ) : %d\n", pointer, valid );
-   
+
    return valid;
 }
 
diff --git a/src/mesa/drivers/dri/r200/r200_pixel.c b/src/mesa/drivers/dri/r200/r200_pixel.c
index 354daef07f6..654f2c6ae98 100644
--- a/src/mesa/drivers/dri/r200/r200_pixel.c
+++ b/src/mesa/drivers/dri/r200/r200_pixel.c
@@ -65,8 +65,8 @@ check_color( const GLcontext *ctx, GLenum type, GLenum format,
       return GL_FALSE;
    }
 
-   if ( type == GL_UNSIGNED_INT_8_8_8_8_REV && 
-	cpp == 4 && 
+   if ( type == GL_UNSIGNED_INT_8_8_8_8_REV &&
+	cpp == 4 &&
 	format == GL_BGRA ) {
       if (R200_DEBUG & DEBUG_PIXEL)
 	 fprintf(stderr, "%s: passed 2\n", __FUNCTION__);
@@ -83,7 +83,7 @@ static GLboolean
 check_color_per_fragment_ops( const GLcontext *ctx )
 {
    int result;
-   result = (!(     ctx->Color.AlphaEnabled || 
+   result = (!(     ctx->Color.AlphaEnabled ||
 		    ctx->Depth.Test ||
 		    ctx->Fog.Enabled ||
 		    ctx->Scissor.Enabled ||
@@ -96,7 +96,7 @@ check_color_per_fragment_ops( const GLcontext *ctx )
 		    ctx->Texture._EnabledUnits
            ) &&
 	   ctx->Current.RasterPosValid);
-   
+
    return result;
 }
 
@@ -163,7 +163,7 @@ r200TryReadPixels( GLcontext *ctx,
 
    /* Only accelerate reading to GART buffers.
     */
-   if ( !r200IsGartMemory(rmesa, pixels, 
+   if ( !r200IsGartMemory(rmesa, pixels,
 			 pitch * height * rmesa->radeon.radeonScreen->cpp ) ) {
       if (R200_DEBUG & DEBUG_PIXEL)
 	 fprintf(stderr, "%s: dest not GART\n", __FUNCTION__);
@@ -224,7 +224,7 @@ r200TryReadPixels( GLcontext *ctx,
       drm_clip_rect_t *box = dPriv->pClipRects;
       int i;
 
-      r200EmitWait( rmesa, RADEON_WAIT_3D ); 
+      r200EmitWait( rmesa, RADEON_WAIT_3D );
 
       y = dPriv->h - y - height;
       x += dPriv->x;
@@ -241,7 +241,7 @@ r200TryReadPixels( GLcontext *ctx,
 	 GLint by = box[i].y1;
 	 GLint bw = box[i].x2 - bx;
 	 GLint bh = box[i].y2 - by;
-	 
+
 	 if (bx < x) bw -= x - bx, bx = x;
 	 if (by < y) bh -= y - by, by = y;
 	 if (bx + bw > x + width) bw = x + width - bx;
@@ -277,9 +277,9 @@ r200ReadPixels( GLcontext *ctx,
    if (R200_DEBUG & DEBUG_PIXEL)
       fprintf(stderr, "%s\n", __FUNCTION__);
 
-   if (!r200TryReadPixels( ctx, x, y, width, height, format, type, pack, 
+   if (!r200TryReadPixels( ctx, x, y, width, height, format, type, pack,
 			   pixels))
-      _swrast_ReadPixels( ctx, x, y, width, height, format, type, pack, 
+      _swrast_ReadPixels( ctx, x, y, width, height, format, type, pack,
 			  pixels);
 }
 
@@ -293,7 +293,7 @@ static void do_draw_pix( GLcontext *ctx,
 			 GLuint planemask)
 {
    r200ContextPtr rmesa = R200_CONTEXT(ctx);
-   __DRIdrawablePrivate *dPriv = rmesa->radeon.dri.drawable;
+   __DRIdrawablePrivate *dPriv = radeon_get_drawable(&rmesa->radeon);
    drm_clip_rect_t *box = dPriv->pClipRects;
    struct gl_renderbuffer *rb = ctx->ReadBuffer->_ColorDrawBuffers[0];
    driRenderbuffer *drb = (driRenderbuffer *) rb;
@@ -325,7 +325,7 @@ static void do_draw_pix( GLcontext *ctx,
       rcommonFlushCmdBufLocked( &rmesa->radeon, __FUNCTION__ );
 
    y -= height;			/* cope with pixel zoom */
-   
+
    if (!clip_pixelrect(ctx, ctx->DrawBuffer,
 		       &x, &y, &width, &height,
 		       &size)) {
@@ -409,7 +409,7 @@ r200TryDrawPixels( GLcontext *ctx,
       if (planemask != ~0)
 	 return GL_FALSE;	/* fix me -- should be possible */
 
-      /* Can't do conversions on GART reads/draws. 
+      /* Can't do conversions on GART reads/draws.
        */
       if ( !r200IsGartMemory( rmesa, pixels, size ) ) {
 	 if (R200_DEBUG & DEBUG_PIXEL)
@@ -484,9 +484,9 @@ r200Bitmap( GLcontext *ctx, GLint px, GLint py,
 void r200InitPixelFuncs( GLcontext *ctx )
 {
    if (!getenv("R200_NO_BLITS")) {
-      ctx->Driver.ReadPixels = r200ReadPixels;  
-      ctx->Driver.DrawPixels = r200DrawPixels; 
-      if (getenv("R200_HW_BITMAP")) 
+      ctx->Driver.ReadPixels = r200ReadPixels;
+      ctx->Driver.DrawPixels = r200DrawPixels;
+      if (getenv("R200_HW_BITMAP"))
 	 ctx->Driver.Bitmap = r200Bitmap;
    }
 }
diff --git a/src/mesa/drivers/dri/r200/r200_state.c b/src/mesa/drivers/dri/r200/r200_state.c
index ebf389efe26..f8ebe0df57e 100644
--- a/src/mesa/drivers/dri/r200/r200_state.c
+++ b/src/mesa/drivers/dri/r200/r200_state.c
@@ -79,7 +79,7 @@ static void r200AlphaFunc( GLcontext *ctx, GLenum func, GLfloat ref )
 
    switch ( func ) {
    case GL_NEVER:
-      pp_misc |= R200_ALPHA_TEST_FAIL; 
+      pp_misc |= R200_ALPHA_TEST_FAIL;
       break;
    case GL_LESS:
       pp_misc |= R200_ALPHA_TEST_LESS;
@@ -479,7 +479,7 @@ static void r200Fogfv( GLcontext *ctx, GLenum pname, const GLfloat *param )
 	 }
       }
       break;
-   case GL_FOG_COLOR: 
+   case GL_FOG_COLOR:
       R200_STATECHANGE( rmesa, ctx );
       UNCLAMPED_FLOAT_TO_RGB_CHAN( col, ctx->Fog.Color );
       i = radeonPackColor( 4, col[0], col[1], col[2], 0 );
@@ -507,7 +507,7 @@ static void r200Fogfv( GLcontext *ctx, GLenum pname, const GLfloat *param )
 
       if (out_0 != rmesa->hw.vtx.cmd[VTX_TCL_OUTPUT_VTXFMT_0]) {
 	 R200_STATECHANGE( rmesa, vtx );
-	 rmesa->hw.vtx.cmd[VTX_TCL_OUTPUT_VTXFMT_0] = out_0;	 
+	 rmesa->hw.vtx.cmd[VTX_TCL_OUTPUT_VTXFMT_0] = out_0;
       }
 
       break;
@@ -696,7 +696,7 @@ static void r200LineStipple( GLcontext *ctx, GLint factor, GLushort pattern )
    r200ContextPtr rmesa = R200_CONTEXT(ctx);
 
    R200_STATECHANGE( rmesa, lin );
-   rmesa->hw.lin.cmd[LIN_RE_LINE_PATTERN] = 
+   rmesa->hw.lin.cmd[LIN_RE_LINE_PATTERN] =
       ((((GLuint)factor & 0xff) << 16) | ((GLuint)pattern));
 }
 
@@ -720,10 +720,10 @@ static void r200ColorMask( GLcontext *ctx,
    if (!(r && g && b && a))
       flag |= R200_PLANE_MASK_ENABLE;
 
-   if ( rmesa->hw.ctx.cmd[CTX_RB3D_CNTL] != flag ) { 
-      R200_STATECHANGE( rmesa, ctx ); 
-      rmesa->hw.ctx.cmd[CTX_RB3D_CNTL] = flag; 
-   } 
+   if ( rmesa->hw.ctx.cmd[CTX_RB3D_CNTL] != flag ) {
+      R200_STATECHANGE( rmesa, ctx );
+      rmesa->hw.ctx.cmd[CTX_RB3D_CNTL] = flag;
+   }
 
    if ( rmesa->hw.msk.cmd[MSK_RB3D_PLANEMASK] != mask ) {
       R200_STATECHANGE( rmesa, msk );
@@ -774,7 +774,7 @@ static void r200PolygonStipple( GLcontext *ctx, const GLubyte *mask )
    /* FIXME: Use window x,y offsets into stipple RAM.
     */
    stipple.mask = rmesa->state.stipple.mask;
-   drmCommandWrite( rmesa->radeon.dri.fd, DRM_RADEON_STIPPLE, 
+   drmCommandWrite( rmesa->radeon.dri.fd, DRM_RADEON_STIPPLE,
                     &stipple, sizeof(stipple) );
    UNLOCK_HARDWARE( &rmesa->radeon );
 }
@@ -785,7 +785,7 @@ static void r200PolygonMode( GLcontext *ctx, GLenum face, GLenum mode )
    GLboolean flag = (ctx->_TriangleCaps & DD_TRI_UNFILLED) != 0;
 
    /* Can't generally do unfilled via tcl, but some good special
-    * cases work. 
+    * cases work.
     */
    TCL_FALLBACK( ctx, R200_TCL_FALLBACK_UNFILLED, flag);
    if (rmesa->radeon.TclFallback) {
@@ -827,34 +827,34 @@ static void r200UpdateSpecular( GLcontext *ctx )
 
    if (ctx->Light.Enabled &&
        ctx->Light.Model.ColorControl == GL_SEPARATE_SPECULAR_COLOR) {
-      rmesa->hw.vtx.cmd[VTX_TCL_OUTPUT_VTXFMT_0] |= 
+      rmesa->hw.vtx.cmd[VTX_TCL_OUTPUT_VTXFMT_0] |=
 	 ((R200_VTX_FP_RGBA << R200_VTX_COLOR_0_SHIFT) |
-	  (R200_VTX_FP_RGBA << R200_VTX_COLOR_1_SHIFT));	
+	  (R200_VTX_FP_RGBA << R200_VTX_COLOR_1_SHIFT));
       rmesa->hw.vtx.cmd[VTX_TCL_OUTPUT_COMPSEL] |= R200_OUTPUT_COLOR_0;
       rmesa->hw.vtx.cmd[VTX_TCL_OUTPUT_COMPSEL] |= R200_OUTPUT_COLOR_1;
       rmesa->hw.tcl.cmd[TCL_LIGHT_MODEL_CTL_0] |= R200_LIGHTING_ENABLE;
       p |=  R200_SPECULAR_ENABLE;
-      rmesa->hw.tcl.cmd[TCL_LIGHT_MODEL_CTL_0] &= 
+      rmesa->hw.tcl.cmd[TCL_LIGHT_MODEL_CTL_0] &=
 	 ~R200_DIFFUSE_SPECULAR_COMBINE;
    }
    else if (ctx->Light.Enabled) {
-      rmesa->hw.vtx.cmd[VTX_TCL_OUTPUT_VTXFMT_0] |= 
-	 ((R200_VTX_FP_RGBA << R200_VTX_COLOR_0_SHIFT));	
+      rmesa->hw.vtx.cmd[VTX_TCL_OUTPUT_VTXFMT_0] |=
+	 ((R200_VTX_FP_RGBA << R200_VTX_COLOR_0_SHIFT));
       rmesa->hw.vtx.cmd[VTX_TCL_OUTPUT_COMPSEL] |= R200_OUTPUT_COLOR_0;
       rmesa->hw.tcl.cmd[TCL_LIGHT_MODEL_CTL_0] |= R200_LIGHTING_ENABLE;
    } else if (ctx->Fog.ColorSumEnabled ) {
-      rmesa->hw.vtx.cmd[VTX_TCL_OUTPUT_VTXFMT_0] |= 
+      rmesa->hw.vtx.cmd[VTX_TCL_OUTPUT_VTXFMT_0] |=
 	 ((R200_VTX_FP_RGBA << R200_VTX_COLOR_0_SHIFT) |
-	  (R200_VTX_FP_RGBA << R200_VTX_COLOR_1_SHIFT));	
+	  (R200_VTX_FP_RGBA << R200_VTX_COLOR_1_SHIFT));
       p |=  R200_SPECULAR_ENABLE;
    } else {
-      rmesa->hw.vtx.cmd[VTX_TCL_OUTPUT_VTXFMT_0] |= 
-	 ((R200_VTX_FP_RGBA << R200_VTX_COLOR_0_SHIFT));	
+      rmesa->hw.vtx.cmd[VTX_TCL_OUTPUT_VTXFMT_0] |=
+	 ((R200_VTX_FP_RGBA << R200_VTX_COLOR_0_SHIFT));
    }
 
    if (ctx->Fog.Enabled) {
-      rmesa->hw.vtx.cmd[VTX_TCL_OUTPUT_VTXFMT_0] |= 
-	 ((R200_VTX_FP_RGBA << R200_VTX_COLOR_1_SHIFT));	
+      rmesa->hw.vtx.cmd[VTX_TCL_OUTPUT_VTXFMT_0] |=
+	 ((R200_VTX_FP_RGBA << R200_VTX_COLOR_1_SHIFT));
       rmesa->hw.vtx.cmd[VTX_TCL_OUTPUT_COMPSEL] |= R200_OUTPUT_COLOR_1;
    }
 
@@ -865,7 +865,7 @@ static void r200UpdateSpecular( GLcontext *ctx )
 
    /* Update vertex/render formats
     */
-   if (rmesa->radeon.TclFallback) { 
+   if (rmesa->radeon.TclFallback) {
       r200ChooseRenderState( ctx );
       r200ChooseVertexState( ctx );
    }
@@ -877,7 +877,7 @@ static void r200UpdateSpecular( GLcontext *ctx )
  */
 
 
-/* Update on colormaterial, material emmissive/ambient, 
+/* Update on colormaterial, material emmissive/ambient,
  * lightmodel.globalambient
  */
 static void update_global_ambient( GLcontext *ctx )
@@ -891,23 +891,23 @@ static void update_global_ambient( GLcontext *ctx )
     */
    if ((rmesa->hw.tcl.cmd[TCL_LIGHT_MODEL_CTL_1] &
        ((3 << R200_FRONT_EMISSIVE_SOURCE_SHIFT) |
-	(3 << R200_FRONT_AMBIENT_SOURCE_SHIFT))) == 0) 
+	(3 << R200_FRONT_AMBIENT_SOURCE_SHIFT))) == 0)
    {
-      COPY_3V( &fcmd[GLT_RED], 
+      COPY_3V( &fcmd[GLT_RED],
 	       ctx->Light.Material.Attrib[MAT_ATTRIB_FRONT_EMISSION]);
       ACC_SCALE_3V( &fcmd[GLT_RED],
 		   ctx->Light.Model.Ambient,
 		   ctx->Light.Material.Attrib[MAT_ATTRIB_FRONT_AMBIENT]);
-   } 
+   }
    else
    {
       COPY_3V( &fcmd[GLT_RED], ctx->Light.Model.Ambient );
    }
-   
+
    R200_DB_STATECHANGE(rmesa, &rmesa->hw.glt);
 }
 
-/* Update on change to 
+/* Update on change to
  *    - light[p].colors
  *    - light[p].enabled
  */
@@ -921,10 +921,10 @@ static void update_light_colors( GLcontext *ctx, GLuint p )
       r200ContextPtr rmesa = R200_CONTEXT(ctx);
       float *fcmd = (float *)R200_DB_STATE( lit[p] );
 
-      COPY_4V( &fcmd[LIT_AMBIENT_RED], l->Ambient );	 
+      COPY_4V( &fcmd[LIT_AMBIENT_RED], l->Ambient );
       COPY_4V( &fcmd[LIT_DIFFUSE_RED], l->Diffuse );
       COPY_4V( &fcmd[LIT_SPECULAR_RED], l->Specular );
-      
+
       R200_DB_STATECHANGE( rmesa, &rmesa->hw.lit[p] );
    }
 }
@@ -944,7 +944,7 @@ static void r200ColorMaterial( GLcontext *ctx, GLenum face, GLenum mode )
 
    if (ctx->Light.ColorMaterialEnabled) {
       GLuint mask = ctx->Light.ColorMaterialBitmask;
-   
+
       if (mask & MAT_BIT_FRONT_EMISSION) {
 	 light_model_ctl1 |= (R200_LM1_SOURCE_VERTEX_COLOR_0 <<
 			     R200_FRONT_EMISSIVE_SOURCE_SHIFT);
@@ -960,7 +960,7 @@ static void r200ColorMaterial( GLcontext *ctx, GLenum face, GLenum mode )
       else
          light_model_ctl1 |= (R200_LM1_SOURCE_MATERIAL_0 <<
 			     R200_FRONT_AMBIENT_SOURCE_SHIFT);
-	 
+
       if (mask & MAT_BIT_FRONT_DIFFUSE) {
 	 light_model_ctl1 |= (R200_LM1_SOURCE_VERTEX_COLOR_0 <<
 			     R200_FRONT_DIFFUSE_SOURCE_SHIFT);
@@ -968,7 +968,7 @@ static void r200ColorMaterial( GLcontext *ctx, GLenum face, GLenum mode )
       else
          light_model_ctl1 |= (R200_LM1_SOURCE_MATERIAL_0 <<
 			     R200_FRONT_DIFFUSE_SOURCE_SHIFT);
-   
+
       if (mask & MAT_BIT_FRONT_SPECULAR) {
 	 light_model_ctl1 |= (R200_LM1_SOURCE_VERTEX_COLOR_0 <<
 			     R200_FRONT_SPECULAR_SOURCE_SHIFT);
@@ -977,7 +977,7 @@ static void r200ColorMaterial( GLcontext *ctx, GLenum face, GLenum mode )
          light_model_ctl1 |= (R200_LM1_SOURCE_MATERIAL_0 <<
 			     R200_FRONT_SPECULAR_SOURCE_SHIFT);
       }
-   
+
       if (mask & MAT_BIT_BACK_EMISSION) {
 	 light_model_ctl1 |= (R200_LM1_SOURCE_VERTEX_COLOR_0 <<
 			     R200_BACK_EMISSIVE_SOURCE_SHIFT);
@@ -1027,8 +1027,8 @@ static void r200ColorMaterial( GLcontext *ctx, GLenum face, GLenum mode )
       R200_STATECHANGE( rmesa, tcl );
       rmesa->hw.tcl.cmd[TCL_LIGHT_MODEL_CTL_1] = light_model_ctl1;
    }
-   
-   
+
+
 }
 
 void r200UpdateMaterial( GLcontext *ctx )
@@ -1038,7 +1038,7 @@ void r200UpdateMaterial( GLcontext *ctx )
    GLfloat *fcmd = (GLfloat *)R200_DB_STATE( mtl[0] );
    GLfloat *fcmd2 = (GLfloat *)R200_DB_STATE( mtl[1] );
    GLuint mask = ~0;
-   
+
    /* Might be possible and faster to update everything unconditionally? */
    if (ctx->Light.ColorMaterialEnabled)
       mask &= ~ctx->Light.ColorMaterialBitmask;
@@ -1124,7 +1124,7 @@ void r200UpdateMaterial( GLcontext *ctx )
  *
  * which are calculated in light.c and are correct for the current
  * lighting space (model or eye), hence dependencies on _NEW_MODELVIEW
- * and _MESA_NEW_NEED_EYE_COORDS.  
+ * and _MESA_NEW_NEED_EYE_COORDS.
  */
 static void update_light( GLcontext *ctx )
 {
@@ -1141,8 +1141,8 @@ static void update_light( GLcontext *ctx )
 	 tmp &= ~R200_LIGHT_IN_MODELSPACE;
       else
 	 tmp |= R200_LIGHT_IN_MODELSPACE;
-      
-      if (tmp != rmesa->hw.tcl.cmd[TCL_LIGHT_MODEL_CTL_0]) 
+
+      if (tmp != rmesa->hw.tcl.cmd[TCL_LIGHT_MODEL_CTL_0])
       {
 	 R200_STATECHANGE( rmesa, tcl );
 	 rmesa->hw.tcl.cmd[TCL_LIGHT_MODEL_CTL_0] = tmp;
@@ -1166,10 +1166,10 @@ static void update_light( GLcontext *ctx )
 	 if (ctx->Light.Light[p].Enabled) {
 	    struct gl_light *l = &ctx->Light.Light[p];
 	    GLfloat *fcmd = (GLfloat *)R200_DB_STATE( lit[p] );
-	    
+
 	    if (l->EyePosition[3] == 0.0) {
-	       COPY_3FV( &fcmd[LIT_POSITION_X], l->_VP_inf_norm ); 
-	       COPY_3FV( &fcmd[LIT_DIRECTION_X], l->_h_inf_norm ); 
+	       COPY_3FV( &fcmd[LIT_POSITION_X], l->_VP_inf_norm );
+	       COPY_3FV( &fcmd[LIT_DIRECTION_X], l->_h_inf_norm );
 	       fcmd[LIT_POSITION_W] = 0;
 	       fcmd[LIT_DIRECTION_W] = 0;
 	    } else {
@@ -1193,21 +1193,21 @@ static void r200Lightfv( GLcontext *ctx, GLenum light,
    GLint p = light - GL_LIGHT0;
    struct gl_light *l = &ctx->Light.Light[p];
    GLfloat *fcmd = (GLfloat *)rmesa->hw.lit[p].cmd;
-   
+
 
    switch (pname) {
-   case GL_AMBIENT:		
+   case GL_AMBIENT:
    case GL_DIFFUSE:
    case GL_SPECULAR:
       update_light_colors( ctx, p );
       break;
 
-   case GL_SPOT_DIRECTION: 
-      /* picked up in update_light */	
+   case GL_SPOT_DIRECTION:
+      /* picked up in update_light */
       break;
 
    case GL_POSITION: {
-      /* positions picked up in update_light, but can do flag here */	
+      /* positions picked up in update_light, but can do flag here */
       GLuint flag = (p&1)? R200_LIGHT_1_IS_LOCAL : R200_LIGHT_0_IS_LOCAL;
       GLuint idx = TCL_PER_LIGHT_CTL_0 + p/2;
 
@@ -1323,7 +1323,7 @@ static void r200LightModelfv( GLcontext *ctx, GLenum pname,
    r200ContextPtr rmesa = R200_CONTEXT(ctx);
 
    switch (pname) {
-      case GL_LIGHT_MODEL_AMBIENT: 
+      case GL_LIGHT_MODEL_AMBIENT:
 	 update_global_ambient( ctx );
 	 break;
 
@@ -1582,7 +1582,7 @@ static void r200ClearStencil( GLcontext *ctx, GLint s )
 {
    r200ContextPtr rmesa = R200_CONTEXT(ctx);
 
-   rmesa->radeon.state.stencil.clear = 
+   rmesa->radeon.state.stencil.clear =
       ((GLuint) (ctx->Stencil.Clear & 0xff) |
        (0xff << R200_STENCIL_MASK_SHIFT) |
        ((ctx->Stencil.WriteMask[0] & 0xff) << R200_STENCIL_WRITEMASK_SHIFT));
@@ -1607,7 +1607,7 @@ static void r200ClearStencil( GLcontext *ctx, GLint s )
 void r200UpdateWindow( GLcontext *ctx )
 {
    r200ContextPtr rmesa = R200_CONTEXT(ctx);
-   __DRIdrawablePrivate *dPriv = rmesa->radeon.dri.drawable;
+   __DRIdrawablePrivate *dPriv = radeon_get_drawable(&rmesa->radeon);
    GLfloat xoffset = dPriv ? (GLfloat) dPriv->x : 0;
    GLfloat yoffset = dPriv ? (GLfloat) dPriv->y + dPriv->h : 0;
    const GLfloat *v = ctx->Viewport._WindowMap.m;
@@ -1663,7 +1663,7 @@ static void r200DepthRange( GLcontext *ctx, GLclampd nearval,
 void r200UpdateViewportOffset( GLcontext *ctx )
 {
    r200ContextPtr rmesa = R200_CONTEXT(ctx);
-   __DRIdrawablePrivate *dPriv = rmesa->radeon.dri.drawable;
+   __DRIdrawablePrivate *dPriv = radeon_get_drawable(&rmesa->radeon);
    GLfloat xoffset = (GLfloat)dPriv->x;
    GLfloat yoffset = (GLfloat)dPriv->y + dPriv->h;
    const GLfloat *v = ctx->Viewport._WindowMap.m;
@@ -1693,8 +1693,8 @@ void r200UpdateViewportOffset( GLcontext *ctx )
                 R200_STIPPLE_Y_OFFSET_MASK);
 
          /* add magic offsets, then invert */
-         stx = 31 - ((rmesa->radeon.dri.drawable->x - 1) & R200_STIPPLE_COORD_MASK);
-         sty = 31 - ((rmesa->radeon.dri.drawable->y + rmesa->radeon.dri.drawable->h - 1)
+         stx = 31 - ((dPriv->x - 1) & R200_STIPPLE_COORD_MASK);
+         sty = 31 - ((dPriv->y + dPriv->h - 1)
                      & R200_STIPPLE_COORD_MASK);
 
          m |= ((stx << R200_STIPPLE_X_OFFSET_SHIFT) |
@@ -1808,7 +1808,7 @@ static void r200Enable( GLcontext *ctx, GLenum cap, GLboolean state )
    case GL_CLIP_PLANE2:
    case GL_CLIP_PLANE3:
    case GL_CLIP_PLANE4:
-   case GL_CLIP_PLANE5: 
+   case GL_CLIP_PLANE5:
       p = cap-GL_CLIP_PLANE0;
       R200_STATECHANGE( rmesa, tcl );
       if (state) {
@@ -1860,7 +1860,7 @@ static void r200Enable( GLcontext *ctx, GLenum cap, GLboolean state )
 	 rmesa->hw.tcl.cmd[TCL_UCP_VERT_BLEND_CTL] &= ~R200_TCL_FOG_MASK;
       }
       r200UpdateSpecular( ctx ); /* for PK_SPEC */
-      if (rmesa->radeon.TclFallback) 
+      if (rmesa->radeon.TclFallback)
 	 r200ChooseVertexState( ctx );
       _mesa_allow_light_in_model( ctx, !state );
       break;
@@ -1875,13 +1875,13 @@ static void r200Enable( GLcontext *ctx, GLenum cap, GLboolean state )
    case GL_LIGHT7:
       R200_STATECHANGE(rmesa, tcl);
       p = cap - GL_LIGHT0;
-      if (p&1) 
+      if (p&1)
 	 flag = (R200_LIGHT_1_ENABLE |
-		 R200_LIGHT_1_ENABLE_AMBIENT | 
+		 R200_LIGHT_1_ENABLE_AMBIENT |
 		 R200_LIGHT_1_ENABLE_SPECULAR);
       else
 	 flag = (R200_LIGHT_0_ENABLE |
-		 R200_LIGHT_0_ENABLE_AMBIENT | 
+		 R200_LIGHT_0_ENABLE_AMBIENT |
 		 R200_LIGHT_0_ENABLE_SPECULAR);
 
       if (state)
@@ -1889,7 +1889,7 @@ static void r200Enable( GLcontext *ctx, GLenum cap, GLboolean state )
       else
 	 rmesa->hw.tcl.cmd[p/2 + TCL_PER_LIGHT_CTL_0] &= ~flag;
 
-      /* 
+      /*
        */
       update_light_colors( ctx, p );
       break;
@@ -2043,7 +2043,7 @@ static void r200Enable( GLcontext *ctx, GLenum cap, GLboolean state )
    case GL_TEXTURE_GEN_T:
       /* Picked up in r200UpdateTextureState.
        */
-      rmesa->recheck_texgen[ctx->Texture.CurrentUnit] = GL_TRUE; 
+      rmesa->recheck_texgen[ctx->Texture.CurrentUnit] = GL_TRUE;
       break;
 
    case GL_COLOR_SUM_EXT:
@@ -2160,7 +2160,7 @@ void r200LightingSpaceChange( GLcontext *ctx )
    r200ContextPtr rmesa = R200_CONTEXT(ctx);
    GLboolean tmp;
 
-   if (R200_DEBUG & DEBUG_STATE) 
+   if (R200_DEBUG & DEBUG_STATE)
       fprintf(stderr, "%s %d BEFORE %x\n", __FUNCTION__, ctx->_NeedEyeCoords,
 	      rmesa->hw.tcl.cmd[TCL_LIGHT_MODEL_CTL_0]);
 
@@ -2176,7 +2176,7 @@ void r200LightingSpaceChange( GLcontext *ctx )
       rmesa->hw.tcl.cmd[TCL_LIGHT_MODEL_CTL_0] &= ~R200_RESCALE_NORMALS;
    }
 
-   if (R200_DEBUG & DEBUG_STATE) 
+   if (R200_DEBUG & DEBUG_STATE)
       fprintf(stderr, "%s %d AFTER %x\n", __FUNCTION__, ctx->_NeedEyeCoords,
 	      rmesa->hw.tcl.cmd[TCL_LIGHT_MODEL_CTL_0]);
 }
@@ -2219,7 +2219,7 @@ static void update_texturematrix( GLcontext *ctx )
    GLuint compsel = rmesa->hw.vtx.cmd[VTX_TCL_OUTPUT_COMPSEL];
    int unit;
 
-   if (R200_DEBUG & DEBUG_STATE) 
+   if (R200_DEBUG & DEBUG_STATE)
       fprintf(stderr, "%s before COMPSEL: %x\n", __FUNCTION__,
 	      rmesa->hw.vtx.cmd[VTX_TCL_OUTPUT_COMPSEL]);
 
@@ -2227,7 +2227,7 @@ static void update_texturematrix( GLcontext *ctx )
    rmesa->TexMatCompSel = 0;
 
    for (unit = 0 ; unit < ctx->Const.MaxTextureUnits; unit++) {
-      if (!ctx->Texture.Unit[unit]._ReallyEnabled) 
+      if (!ctx->Texture.Unit[unit]._ReallyEnabled)
 	 continue;
 
       if (ctx->TextureMatrixStack[unit].Top->type != MATRIX_IDENTITY) {
@@ -2237,21 +2237,21 @@ static void update_texturematrix( GLcontext *ctx )
 	 rmesa->TexMatCompSel |= R200_OUTPUT_TEX_0 << unit;
 
 	 if (rmesa->TexGenEnabled & (R200_TEXMAT_0_ENABLE << unit)) {
-	    /* Need to preconcatenate any active texgen 
+	    /* Need to preconcatenate any active texgen
 	     * obj/eyeplane matrices:
 	     */
 	    _math_matrix_mul_matrix( &rmesa->tmpmat,
-				     ctx->TextureMatrixStack[unit].Top, 
+				     ctx->TextureMatrixStack[unit].Top,
 				     &rmesa->TexGenMatrix[unit] );
 	    upload_matrix( rmesa, rmesa->tmpmat.m, R200_MTX_TEX0+unit );
-	 } 
+	 }
 	 else {
-	    upload_matrix( rmesa, ctx->TextureMatrixStack[unit].Top->m, 
+	    upload_matrix( rmesa, ctx->TextureMatrixStack[unit].Top->m,
 			   R200_MTX_TEX0+unit );
 	 }
       }
       else if (rmesa->TexGenEnabled & (R200_TEXMAT_0_ENABLE << unit)) {
-	 upload_matrix( rmesa, rmesa->TexGenMatrix[unit].m, 
+	 upload_matrix( rmesa, rmesa->TexGenMatrix[unit].m,
 			R200_MTX_TEX0+unit );
       }
    }
@@ -2277,7 +2277,7 @@ static GLboolean r200ValidateBuffers(GLcontext *ctx)
    int i;
 
    radeon_validate_reset_bos(&rmesa->radeon);
-   
+
    rrb = radeon_get_colorbuffer(&rmesa->radeon);
    /* color buffer */
    if (rrb && rrb->bo) {
@@ -2295,7 +2295,7 @@ static GLboolean r200ValidateBuffers(GLcontext *ctx)
 
    for (i = 0; i < ctx->Const.MaxTextureImageUnits; ++i) {
       radeonTexObj *t;
-      
+
       if (!ctx->Texture.Unit[i]._ReallyEnabled)
 	 continue;
 
@@ -2319,15 +2319,15 @@ GLboolean r200ValidateState( GLcontext *ctx )
    r200ContextPtr rmesa = R200_CONTEXT(ctx);
    GLuint new_state = rmesa->radeon.NewGLState;
 
-   if (new_state & (_NEW_BUFFERS | _NEW_COLOR | _NEW_PIXEL)) {
+   if (new_state & _NEW_BUFFERS) {
       _mesa_update_framebuffer(ctx);
       /* this updates the DrawBuffer's Width/Height if it's a FBO */
       _mesa_update_draw_buffer_bounds(ctx);
-      
+
       R200_STATECHANGE(rmesa, ctx);
    }
 
-   if (new_state & (_NEW_TEXTURE | _NEW_PROGRAM)) {
+   if (new_state & (_NEW_TEXTURE | _NEW_PROGRAM | _NEW_PROGRAM_CONSTANTS)) {
       r200UpdateTextureState( ctx );
       new_state |= rmesa->radeon.NewGLState; /* may add TEXTURE_MATRIX */
       r200UpdateLocalViewer( ctx );
@@ -2341,7 +2341,7 @@ GLboolean r200ValidateState( GLcontext *ctx )
 
    /* Need an event driven matrix update?
     */
-   if (new_state & (_NEW_MODELVIEW|_NEW_PROJECTION)) 
+   if (new_state & (_NEW_MODELVIEW|_NEW_PROJECTION))
       upload_matrix( rmesa, ctx->_ModelProjectMatrix.m, R200_MTX_MVP );
 
    /* Need these for lighting (shouldn't upload otherwise)
@@ -2365,11 +2365,12 @@ GLboolean r200ValidateState( GLcontext *ctx )
    /* emit all active clip planes if projection matrix changes.
     */
    if (new_state & (_NEW_PROJECTION)) {
-      if (ctx->Transform.ClipPlanesEnabled) 
+      if (ctx->Transform.ClipPlanesEnabled)
 	 r200UpdateClipPlanes( ctx );
    }
 
    if (new_state & (_NEW_PROGRAM|
+                    _NEW_PROGRAM_CONSTANTS |
    /* need to test for pretty much anything due to possible parameter bindings */
 	_NEW_MODELVIEW|_NEW_PROJECTION|_NEW_TRANSFORM|
 	_NEW_LIGHT|_NEW_TEXTURE|_NEW_TEXTURE_MATRIX|
@@ -2436,7 +2437,7 @@ static void r200WrapRunPipeline( GLcontext *ctx )
    }
 
    /* Run the pipeline.
-    */ 
+    */
    _tnl_run_pipeline( ctx );
 
    if (has_material) {
diff --git a/src/mesa/drivers/dri/r200/r200_state_init.c b/src/mesa/drivers/dri/r200/r200_state_init.c
index 75262e46bd8..83920093378 100644
--- a/src/mesa/drivers/dri/r200/r200_state_init.c
+++ b/src/mesa/drivers/dri/r200/r200_state_init.c
@@ -295,12 +295,16 @@ VP_CHECK( tcl_vpp_size, ctx->VertexProgram.Current->Base.NumNativeParameters > 9
     h.i = hdr;								\
     _start = h.veclinear.addr_lo | (h.veclinear.addr_hi << 8);		\
     _sz = h.veclinear.count * 4;					\
+    if (r200->radeon.radeonScreen->kernel_mm && _sz) { \
+    BEGIN_BATCH_NO_AUTOSTATE(dwords); \
     OUT_BATCH(CP_PACKET0(RADEON_SE_TCL_STATE_FLUSH, 0));		\
     OUT_BATCH(0);							\
     OUT_BATCH(CP_PACKET0(R200_SE_TCL_VECTOR_INDX_REG, 0));		\
     OUT_BATCH(_start | (1 << RADEON_VEC_INDX_OCTWORD_STRIDE_SHIFT));	\
     OUT_BATCH(CP_PACKET0_ONE(R200_SE_TCL_VECTOR_DATA_REG, _sz - 1));	\
     OUT_BATCH_TABLE((data), _sz);					\
+    END_BATCH(); \
+    } \
   } while(0)
 
 #define OUT_SCL(hdr, data) do {					\
@@ -367,9 +371,7 @@ static void veclinear_emit(GLcontext *ctx, struct radeon_state_atom *atom)
    uint32_t dwords = atom->cmd_size;
 
    dwords += 4;
-   BEGIN_BATCH_NO_AUTOSTATE(dwords);
    OUT_VECLINEAR(atom->cmd[0], atom->cmd+1);
-   END_BATCH();
 }
 
 static void scl_emit(GLcontext *ctx, struct radeon_state_atom *atom)
@@ -505,6 +507,8 @@ static void ctx_emit_cs(GLcontext *ctx, struct radeon_state_atom *atom)
      dwords += 6;
    if (rrb)
      dwords += 6;
+   if (atom->cmd_size == CTX_STATE_SIZE_NEWDRM)
+     dwords += 4;
 
    /* output the first 7 bytes of context */
    BEGIN_BATCH_NO_AUTOSTATE(dwords);
@@ -555,17 +559,12 @@ static void tex_emit(GLcontext *ctx, struct radeon_state_atom *atom)
    if (t && t->mt && !t->image_override)
      dwords += 2;
    BEGIN_BATCH_NO_AUTOSTATE(dwords);
+   /* is this ok even with drm older than 1.18? */
    OUT_BATCH_TABLE(atom->cmd, 10);
 
    if (t && t->mt && !t->image_override) {
-     if ((ctx->Texture.Unit[i]._ReallyEnabled & TEXTURE_CUBE_BIT)) {
-   	lvl = &t->mt->levels[0];
-	OUT_BATCH_RELOC(lvl->faces[5].offset, t->mt->bo, lvl->faces[5].offset,
-			RADEON_GEM_DOMAIN_GTT|RADEON_GEM_DOMAIN_VRAM, 0, 0);
-     } else {
-        OUT_BATCH_RELOC(t->tile_bits, t->mt->bo, 0,
-		     RADEON_GEM_DOMAIN_GTT|RADEON_GEM_DOMAIN_VRAM, 0, 0);
-     }
+     OUT_BATCH_RELOC(t->tile_bits, t->mt->bo, 0,
+		  RADEON_GEM_DOMAIN_GTT|RADEON_GEM_DOMAIN_VRAM, 0, 0);
    } else if (!t) {
      /* workaround for old CS mechanism */
      OUT_BATCH(r200->radeon.radeonScreen->texOffset[RADEON_LOCAL_TEX_HEAP]);
@@ -593,27 +592,20 @@ static void tex_emit_cs(GLcontext *ctx, struct radeon_state_atom *atom)
 		hastexture = 0;
    }
 
-   dwords += 2;
    if (hastexture)
      dwords += 2;
    else
      dwords -= 2;
    BEGIN_BATCH_NO_AUTOSTATE(dwords);
 
-   OUT_BATCH(CP_PACKET0(R200_PP_TXFILTER_0 + (24 * i), 7));
+   OUT_BATCH(CP_PACKET0(R200_PP_TXFILTER_0 + (32 * i), 7));
    OUT_BATCH_TABLE((atom->cmd + 1), 8);
 
    if (hastexture) {
      OUT_BATCH(CP_PACKET0(R200_PP_TXOFFSET_0 + (24 * i), 0));
      if (t->mt && !t->image_override) {
-        if ((ctx->Texture.Unit[i]._ReallyEnabled & TEXTURE_CUBE_BIT)) {
-            lvl = &t->mt->levels[0];
-	    OUT_BATCH_RELOC(lvl->faces[5].offset, t->mt->bo, lvl->faces[5].offset,
-			RADEON_GEM_DOMAIN_GTT|RADEON_GEM_DOMAIN_VRAM, 0, 0);
-        } else {
-           OUT_BATCH_RELOC(t->tile_bits, t->mt->bo, 0,
-		     RADEON_GEM_DOMAIN_GTT|RADEON_GEM_DOMAIN_VRAM, 0, 0);
-        }
+        OUT_BATCH_RELOC(t->tile_bits, t->mt->bo, 0,
+		  RADEON_GEM_DOMAIN_GTT|RADEON_GEM_DOMAIN_VRAM, 0, 0);
       } else {
 	if (t->bo)
             OUT_BATCH_RELOC(t->tile_bits, t->bo, 0,
@@ -628,21 +620,21 @@ static void cube_emit(GLcontext *ctx, struct radeon_state_atom *atom)
 {
    r200ContextPtr r200 = R200_CONTEXT(ctx);
    BATCH_LOCALS(&r200->radeon);
-   uint32_t dwords = atom->cmd_size;
-   int i = atom->idx;
+   uint32_t dwords = 2;
+   int i = atom->idx, j;
    radeonTexObj *t = r200->state.texture.unit[i].texobj;
-   GLuint size;
+   radeon_mipmap_level *lvl;
 
-   BEGIN_BATCH_NO_AUTOSTATE(dwords + (2 * 5));
-   OUT_BATCH_TABLE(atom->cmd, 3);
+   BEGIN_BATCH_NO_AUTOSTATE(dwords + (4 * 5));
+   OUT_BATCH_TABLE(atom->cmd, 2);
 
    if (t && !t->image_override) {
-     size = t->mt->totalsize / 6;
-     OUT_BATCH_RELOC(0, t->mt->bo, size, RADEON_GEM_DOMAIN_VRAM, 0, 0);
-     OUT_BATCH_RELOC(0, t->mt->bo, size * 2, RADEON_GEM_DOMAIN_VRAM, 0, 0);
-     OUT_BATCH_RELOC(0, t->mt->bo, size * 3, RADEON_GEM_DOMAIN_VRAM, 0, 0);
-     OUT_BATCH_RELOC(0, t->mt->bo, size * 4, RADEON_GEM_DOMAIN_VRAM, 0, 0);
-     OUT_BATCH_RELOC(0, t->mt->bo, size * 5, RADEON_GEM_DOMAIN_VRAM, 0, 0);
+     lvl = &t->mt->levels[0];
+     for (j = 1; j <= 5; j++) {
+       OUT_BATCH(CP_PACKET0(R200_PP_CUBIC_OFFSET_F1_0 + (24*i) + (4 * (j-1)), 0));
+       OUT_BATCH_RELOC(lvl->faces[j].offset, t->mt->bo, lvl->faces[j].offset,
+			RADEON_GEM_DOMAIN_VRAM, 0, 0);
+     }
    }
    END_BATCH();
 }
diff --git a/src/mesa/drivers/dri/r200/r200_swtcl.c b/src/mesa/drivers/dri/r200/r200_swtcl.c
index 712da980775..83e70b586d7 100644
--- a/src/mesa/drivers/dri/r200/r200_swtcl.c
+++ b/src/mesa/drivers/dri/r200/r200_swtcl.c
@@ -56,7 +56,7 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 
 
 /***********************************************************************
- *                         Initialization 
+ *                         Initialization
  ***********************************************************************/
 
 #define EMIT_ATTR( ATTR, STYLE, F0 )					\
@@ -118,7 +118,7 @@ static void r200SetVertexFormat( GLcontext *ctx )
    }
 
    rmesa->swtcl.coloroffset = offset;
-#if MESA_LITTLE_ENDIAN 
+#if MESA_LITTLE_ENDIAN
    EMIT_ATTR( _TNL_ATTRIB_COLOR0, EMIT_4UB_4F_RGBA, (R200_VTX_PK_RGBA << R200_VTX_COLOR_0_SHIFT) );
 #else
    EMIT_ATTR( _TNL_ATTRIB_COLOR0, EMIT_4UB_4F_ABGR, (R200_VTX_PK_RGBA << R200_VTX_COLOR_0_SHIFT) );
@@ -129,7 +129,7 @@ static void r200SetVertexFormat( GLcontext *ctx )
    if (RENDERINPUTS_TEST( index_bitset, _TNL_ATTRIB_COLOR1 ) ||
        RENDERINPUTS_TEST( index_bitset, _TNL_ATTRIB_FOG )) {
 
-#if MESA_LITTLE_ENDIAN 
+#if MESA_LITTLE_ENDIAN
       if (RENDERINPUTS_TEST( index_bitset, _TNL_ATTRIB_COLOR1 )) {
 	 rmesa->swtcl.specoffset = offset;
 	 EMIT_ATTR( _TNL_ATTRIB_COLOR1, EMIT_3UB_3F_RGB, (R200_VTX_PK_RGBA << R200_VTX_COLOR_1_SHIFT) );
@@ -192,7 +192,7 @@ static void r200SetVertexFormat( GLcontext *ctx )
 
       rmesa->radeon.swtcl.vertex_size =
 	  _tnl_install_attrs( ctx,
-			      rmesa->radeon.swtcl.vertex_attrs, 
+			      rmesa->radeon.swtcl.vertex_attrs,
 			      rmesa->radeon.swtcl.vertex_attr_count,
 			      NULL, 0 );
       rmesa->radeon.swtcl.vertex_size /= 4;
@@ -278,7 +278,7 @@ void r200_swtcl_flush(GLcontext *ctx, uint32_t current_offset)
 		      rmesa->radeon.dma.current,
 		      current_offset);
 
-		      
+
    r200EmitVbufPrim( rmesa,
 		     rmesa->radeon.swtcl.hw_primitive,
 		     rmesa->radeon.swtcl.numverts);
@@ -338,7 +338,7 @@ static void r200ResetLineStipple( GLcontext *ctx );
    r200ContextPtr rmesa = R200_CONTEXT(ctx);		\
    const char *r200verts = (char *)rmesa->radeon.swtcl.verts;
 #define VERT(x) (radeonVertex *)(r200verts + ((x) * vertsize * sizeof(int)))
-#define VERTEX radeonVertex 
+#define VERTEX radeonVertex
 #define DO_DEBUG_VERTS (1 && (R200_DEBUG & DEBUG_VERTS))
 
 #undef TAG
@@ -539,7 +539,7 @@ void r200ChooseRenderState( GLcontext *ctx )
    GLuint index = 0;
    GLuint flags = ctx->_TriangleCaps;
 
-   if (!rmesa->radeon.TclFallback || rmesa->radeon.Fallback) 
+   if (!rmesa->radeon.TclFallback || rmesa->radeon.Fallback)
       return;
 
    if (flags & DD_TRI_LIGHT_TWOSIDE) index |= R200_TWOSIDE_BIT;
@@ -597,7 +597,7 @@ static void r200RenderPrimitive( GLcontext *ctx, GLenum prim )
 {
    r200ContextPtr rmesa = R200_CONTEXT(ctx);
    rmesa->radeon.swtcl.render_primitive = prim;
-   if (prim < GL_TRIANGLES || !(ctx->_TriangleCaps & DD_TRI_UNFILLED)) 
+   if (prim < GL_TRIANGLES || !(ctx->_TriangleCaps & DD_TRI_UNFILLED))
       r200RasterPrimitive( ctx, reduced_hw_prim(ctx, prim) );
 }
 
@@ -695,7 +695,7 @@ void r200Fallback( GLcontext *ctx, GLuint bit, GLboolean mode )
 
 /**
  * Cope with depth operations by drawing individual pixels as points.
- * 
+ *
  * \todo
  * The way the vertex state is set in this routine is hokey.  It seems to
  * work, but it's very hackish.  This whole routine is pretty hackish.  If
@@ -710,14 +710,14 @@ r200PointsBitmap( GLcontext *ctx, GLint px, GLint py,
 		  const GLubyte *bitmap )
 {
    r200ContextPtr rmesa = R200_CONTEXT(ctx);
-   const GLfloat *rc = ctx->Current.RasterColor; 
+   const GLfloat *rc = ctx->Current.RasterColor;
    GLint row, col;
    radeonVertex vert;
    GLuint orig_vte;
    GLuint h;
 
 
-   /* Turn off tcl.  
+   /* Turn off tcl.
     */
    TCL_FALLBACK( ctx, R200_TCL_FALLBACK_BITMAP, 1 );
 
@@ -768,7 +768,7 @@ r200PointsBitmap( GLcontext *ctx, GLint px, GLint py,
 					   R200_VPORT_Z_SCALE_ENA |
 					   R200_VPORT_X_OFFSET_ENA |
 					   R200_VPORT_Y_OFFSET_ENA |
-					   R200_VPORT_Z_OFFSET_ENA); 
+					   R200_VPORT_Z_OFFSET_ENA);
 
    /* Turn off other stuff:  Stipple?, texture?, blending?, etc.
     */
@@ -813,14 +813,14 @@ r200PointsBitmap( GLcontext *ctx, GLint px, GLint py,
     */
    LOCK_HARDWARE( &rmesa->radeon );
    UNLOCK_HARDWARE( &rmesa->radeon );
-   h = rmesa->radeon.dri.drawable->h + rmesa->radeon.dri.drawable->y;
-   px += rmesa->radeon.dri.drawable->x;
+   h = radeon_get_drawable(&rmesa->radeon)->h + radeon_get_drawable(&rmesa->radeon)->y;
+   px += radeon_get_drawable(&rmesa->radeon)->x;
 
    /* Clipping handled by existing mechansims in r200_ioctl.c?
     */
    for (row=0; row<height; row++) {
-      const GLubyte *src = (const GLubyte *) 
-	 _mesa_image_address2d(unpack, bitmap, width, height, 
+      const GLubyte *src = (const GLubyte *)
+	 _mesa_image_address2d(unpack, bitmap, width, height,
                                GL_COLOR_INDEX, GL_BITMAP, row, 0 );
 
       if (unpack->LsbFirst) {
@@ -899,9 +899,9 @@ void r200InitSwtcl( GLcontext *ctx )
    tnl->Driver.Render.Interp = _tnl_interp;
 
    /* FIXME: what are these numbers? */
-   _tnl_init_vertices( ctx, ctx->Const.MaxArrayLockSize + 12, 
+   _tnl_init_vertices( ctx, ctx->Const.MaxArrayLockSize + 12,
 		       36 * sizeof(GLfloat) );
-   
+
    rmesa->radeon.swtcl.verts = (GLubyte *)tnl->clipspace.vertex_buf;
    rmesa->radeon.swtcl.RenderIndex = ~0;
    rmesa->radeon.swtcl.render_primitive = GL_TRIANGLES;
diff --git a/src/mesa/drivers/dri/r200/r200_texstate.c b/src/mesa/drivers/dri/r200/r200_texstate.c
index 9797f77ec4f..ed1995e147c 100644
--- a/src/mesa/drivers/dri/r200/r200_texstate.c
+++ b/src/mesa/drivers/dri/r200/r200_texstate.c
@@ -833,7 +833,6 @@ void r200SetTexBuffer2(__DRIcontext *pDRICtx, GLint target, GLint glx_texture_fo
 		radeon_miptree_unreference(rImage->mt);
 		rImage->mt = NULL;
 	}
-	fprintf(stderr,"settexbuf %d %dx%d@%d\n", rb->pitch, rb->width, rb->height, rb->cpp);
 	_mesa_init_teximage_fields(radeon->glCtx, target, texImage,
 				   rb->width, rb->height, 1, 0, rb->cpp);
 	texImage->RowStride = rb->pitch / rb->cpp;
@@ -851,7 +850,10 @@ void r200SetTexBuffer2(__DRIcontext *pDRICtx, GLint target, GLint glx_texture_fo
 	pitch_val = rb->pitch;
 	switch (rb->cpp) {
 	case 4:
-		t->pp_txformat = tx_table_le[MESA_FORMAT_ARGB8888].format;
+		if (glx_texture_format == GLX_TEXTURE_FORMAT_RGB_EXT)
+			t->pp_txformat = tx_table_le[MESA_FORMAT_RGB888].format;
+		else
+			t->pp_txformat = tx_table_le[MESA_FORMAT_ARGB8888].format;
 		t->pp_txfilter |= tx_table_le[MESA_FORMAT_ARGB8888].filter;
 		break;
 	case 3:
@@ -1422,8 +1424,8 @@ void set_re_cntl_d3d( GLcontext *ctx, int unit, GLboolean use_d3d )
  */
 static void setup_hardware_state(r200ContextPtr rmesa, radeonTexObj *t)
 {
-   const struct gl_texture_image *firstImage =
-      t->base.Image[0][t->mt->firstLevel];
+   int firstlevel = t->mt ? t->mt->firstLevel : 0;
+   const struct gl_texture_image *firstImage = t->base.Image[0][firstlevel];
    GLint log2Width, log2Height, log2Depth, texelBytes;
    
    if ( t->bo ) {
diff --git a/src/mesa/drivers/dri/r200/r200_vertprog.c b/src/mesa/drivers/dri/r200/r200_vertprog.c
index aadd1443ad9..620f29b5c6e 100644
--- a/src/mesa/drivers/dri/r200/r200_vertprog.c
+++ b/src/mesa/drivers/dri/r200/r200_vertprog.c
@@ -290,7 +290,7 @@ static unsigned long t_src(struct r200_vertex_program *vp, struct prog_src_regis
 			t_swizzle(GET_SWZ(src->Swizzle, 2)),
 			t_swizzle(GET_SWZ(src->Swizzle, 3)),
 			t_src_class(src->File),
-			src->NegateBase) | (src->RelAddr << 4);
+			src->Negate) | (src->RelAddr << 4);
 }
 
 static unsigned long t_src_scalar(struct r200_vertex_program *vp, struct prog_src_register *src)
@@ -302,7 +302,7 @@ static unsigned long t_src_scalar(struct r200_vertex_program *vp, struct prog_sr
 			t_swizzle(GET_SWZ(src->Swizzle, 0)),
 			t_swizzle(GET_SWZ(src->Swizzle, 0)),
 			t_src_class(src->File),
-			src->NegateBase ? VSF_FLAG_ALL : VSF_FLAG_NONE) | (src->RelAddr << 4);
+			src->Negate ? VSF_FLAG_ALL : VSF_FLAG_NONE) | (src->RelAddr << 4);
 }
 
 static unsigned long t_opcode(enum prog_opcode opcode)
@@ -700,7 +700,7 @@ static GLboolean r200_translate_vertex_program(GLcontext *ctx, struct r200_verte
 		   t_swizzle(GET_SWZ(src[1].Swizzle, 0)),
 		   SWIZZLE_ZERO,
 		   t_src_class(src[0].File),
-		   src[0].NegateBase) | (src[0].RelAddr << 4);
+		   src[0].Negate) | (src[0].RelAddr << 4);
 	    o_inst->src1 = UNUSED_SRC_0;
 	    o_inst->src2 = UNUSED_SRC_0;
 	 }
@@ -712,12 +712,12 @@ static GLboolean r200_translate_vertex_program(GLcontext *ctx, struct r200_verte
 		   t_swizzle(GET_SWZ(src[0].Swizzle, 0)),
 		   SWIZZLE_ZERO, SWIZZLE_ZERO, SWIZZLE_ZERO,
 		   t_src_class(src[0].File),
-		   src[0].NegateBase ? VSF_FLAG_ALL : VSF_FLAG_NONE) | (src[0].RelAddr << 4);
+		   src[0].Negate ? VSF_FLAG_ALL : VSF_FLAG_NONE) | (src[0].RelAddr << 4);
 	    o_inst->src1 = MAKE_VSF_SOURCE(t_src_index(vp, &src[1]),
 		   SWIZZLE_ZERO, SWIZZLE_ZERO,
 		   t_swizzle(GET_SWZ(src[1].Swizzle, 0)), SWIZZLE_ZERO,
 		   t_src_class(src[1].File),
-		   src[1].NegateBase ? VSF_FLAG_ALL : VSF_FLAG_NONE) | (src[1].RelAddr << 4);
+		   src[1].Negate ? VSF_FLAG_ALL : VSF_FLAG_NONE) | (src[1].RelAddr << 4);
 	    o_inst->src2 = UNUSED_SRC_1;
 	    o_inst++;
 
@@ -766,11 +766,11 @@ if ((o_inst - vp->instr) == 31) {
 o_inst->src1 = MAKE_VSF_SOURCE(t_src_index(vp, &src[1]),
 			SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_X,
 			t_src_class(src[1].File),
-			src[1].NegateBase) | (src[1].RelAddr << 4);
+			src[1].Negate) | (src[1].RelAddr << 4);
 o_inst->src2 = MAKE_VSF_SOURCE(t_src_index(vp, &src[1]),
 			SWIZZLE_Y, SWIZZLE_Y, SWIZZLE_Y, SWIZZLE_Y,
 			t_src_class(src[1].File),
-			src[1].NegateBase) | (src[1].RelAddr << 4);
+			src[1].Negate) | (src[1].RelAddr << 4);
 }
 else {
 	 o_inst->src1 = t_src(vp, &src[1]);
@@ -792,7 +792,7 @@ else {
 		t_swizzle(GET_SWZ(src[0].Swizzle, 2)),
 		SWIZZLE_ZERO,
 		t_src_class(src[0].File),
-		src[0].NegateBase) | (src[0].RelAddr << 4);
+		src[0].Negate) | (src[0].RelAddr << 4);
 
 	 o_inst->src1 = MAKE_VSF_SOURCE(t_src_index(vp, &src[1]),
 		t_swizzle(GET_SWZ(src[1].Swizzle, 0)),
@@ -800,7 +800,7 @@ else {
 		t_swizzle(GET_SWZ(src[1].Swizzle, 2)),
 		SWIZZLE_ZERO,
 		t_src_class(src[1].File),
-		src[1].NegateBase) | (src[1].RelAddr << 4);
+		src[1].Negate) | (src[1].RelAddr << 4);
 
 	 o_inst->src2 = UNUSED_SRC_1;
 	 goto next;
@@ -815,7 +815,7 @@ else {
 		t_swizzle(GET_SWZ(src[0].Swizzle, 2)),
 		VSF_IN_COMPONENT_ONE,
 		t_src_class(src[0].File),
-		src[0].NegateBase) | (src[0].RelAddr << 4);
+		src[0].Negate) | (src[0].RelAddr << 4);
 	 o_inst->src1 = t_src(vp, &src[1]);
 	 o_inst->src2 = UNUSED_SRC_1;
 	 goto next;
@@ -831,7 +831,7 @@ else {
 		t_swizzle(GET_SWZ(src[1].Swizzle, 2)),
 		t_swizzle(GET_SWZ(src[1].Swizzle, 3)),
 		t_src_class(src[1].File),
-		(!src[1].NegateBase) ? VSF_FLAG_ALL : VSF_FLAG_NONE) | (src[1].RelAddr << 4);
+		(!src[1].Negate) ? VSF_FLAG_ALL : VSF_FLAG_NONE) | (src[1].RelAddr << 4);
 	 o_inst->src2 = UNUSED_SRC_1;
 	 goto next;
 
@@ -846,7 +846,7 @@ else {
 		t_swizzle(GET_SWZ(src[0].Swizzle, 2)),
 		t_swizzle(GET_SWZ(src[0].Swizzle, 3)),
 		t_src_class(src[0].File),
-		(!src[0].NegateBase) ? VSF_FLAG_ALL : VSF_FLAG_NONE) | (src[0].RelAddr << 4);
+		(!src[0].Negate) ? VSF_FLAG_ALL : VSF_FLAG_NONE) | (src[0].RelAddr << 4);
 	 o_inst->src2 = UNUSED_SRC_1;
 	 goto next;
 
@@ -874,7 +874,7 @@ else {
 		VSF_IN_COMPONENT_W,
 		VSF_IN_CLASS_TMP,
 		/* Not 100% sure about this */
-		(!src[0].NegateBase) ? VSF_FLAG_ALL : VSF_FLAG_NONE/*VSF_FLAG_ALL*/);
+		(!src[0].Negate) ? VSF_FLAG_ALL : VSF_FLAG_NONE/*VSF_FLAG_ALL*/);
 
 	 o_inst->src2 = UNUSED_SRC_0;
 	 u_temp_i--;
@@ -899,7 +899,7 @@ else {
 		t_swizzle(GET_SWZ(src[0].Swizzle, 0)), // x
 		t_swizzle(GET_SWZ(src[0].Swizzle, 3)), // w
 		t_src_class(src[0].File),
-		src[0].NegateBase) | (src[0].RelAddr << 4);
+		src[0].Negate) | (src[0].RelAddr << 4);
 
 	 o_inst->src1 = MAKE_VSF_SOURCE(t_src_index(vp, &src[1]),
 		t_swizzle(GET_SWZ(src[1].Swizzle, 2)), // z
@@ -907,7 +907,7 @@ else {
 		t_swizzle(GET_SWZ(src[1].Swizzle, 1)), // y
 		t_swizzle(GET_SWZ(src[1].Swizzle, 3)), // w
 		t_src_class(src[1].File),
-		src[1].NegateBase) | (src[1].RelAddr << 4);
+		src[1].Negate) | (src[1].RelAddr << 4);
 
 	 o_inst->src2 = UNUSED_SRC_1;
 	 o_inst++;
@@ -922,7 +922,7 @@ else {
 		t_swizzle(GET_SWZ(src[1].Swizzle, 0)), // x
 		t_swizzle(GET_SWZ(src[1].Swizzle, 3)), // w
 		t_src_class(src[1].File),
-		(!src[1].NegateBase) ? VSF_FLAG_ALL : VSF_FLAG_NONE) | (src[1].RelAddr << 4);
+		(!src[1].Negate) ? VSF_FLAG_ALL : VSF_FLAG_NONE) | (src[1].RelAddr << 4);
 
 	 o_inst->src1 = MAKE_VSF_SOURCE(t_src_index(vp, &src[0]),
 		t_swizzle(GET_SWZ(src[0].Swizzle, 2)), // z
@@ -930,7 +930,7 @@ else {
 		t_swizzle(GET_SWZ(src[0].Swizzle, 1)), // y
 		t_swizzle(GET_SWZ(src[0].Swizzle, 3)), // w
 		t_src_class(src[0].File),
-		src[0].NegateBase) | (src[0].RelAddr << 4);
+		src[0].Negate) | (src[0].RelAddr << 4);
 
 	 o_inst->src2 = MAKE_VSF_SOURCE(u_temp_i+1,
 		VSF_IN_COMPONENT_X,
diff --git a/src/mesa/drivers/dri/r300/Makefile b/src/mesa/drivers/dri/r300/Makefile
index 0dff9a12731..bdb09624be4 100644
--- a/src/mesa/drivers/dri/r300/Makefile
+++ b/src/mesa/drivers/dri/r300/Makefile
@@ -37,6 +37,7 @@ RADEON_COMMON_SOURCES = \
 DRIVER_SOURCES = \
 		 radeon_screen.c \
 		 r300_context.c \
+		 r300_draw.c \
 		 r300_ioctl.c \
 		 r300_cmdbuf.c \
 		 r300_state.c \
@@ -48,6 +49,7 @@ DRIVER_SOURCES = \
 		 radeon_program_pair.c \
 		 radeon_nqssadce.c \
 		 r300_vertprog.c \
+		 r300_fragprog_common.c \
 		 r300_fragprog.c \
 		 r300_fragprog_swizzle.c \
 		 r300_fragprog_emit.c \
diff --git a/src/mesa/drivers/dri/r300/r300_cmdbuf.c b/src/mesa/drivers/dri/r300/r300_cmdbuf.c
index c575c9ac496..0261a5b1d8a 100644
--- a/src/mesa/drivers/dri/r300/r300_cmdbuf.c
+++ b/src/mesa/drivers/dri/r300/r300_cmdbuf.c
@@ -55,9 +55,6 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "r300_state.h"
 #include "radeon_reg.h"
 
-#define R300_VAP_PVS_UPLOAD_ADDRESS 0x2200
-#   define RADEON_ONE_REG_WR        (1 << 15)
-
 /** # of dwords reserved for additional instructions that may need to be written
  * during flushing.
  */
@@ -71,7 +68,6 @@ static unsigned packet0_count(r300ContextPtr r300, uint32_t *pkt)
         drm_r300_cmd_header_t *t = (drm_r300_cmd_header_t*)pkt;
         return t->packet0.count;
     }
-    return 0;
 }
 
 #define vpu_count(ptr) (((drm_r300_cmd_header_t*)(ptr))->vpu.count)
@@ -83,7 +79,7 @@ void emit_vpu(GLcontext *ctx, struct radeon_state_atom * atom)
 	BATCH_LOCALS(&r300->radeon);
 	drm_r300_cmd_header_t cmd;
 	uint32_t addr, ndw, i;
-	
+
 	if (!r300->radeon.radeonScreen->kernel_mm) {
 		uint32_t dwords;
 		dwords = (*atom->check) (ctx, atom);
@@ -92,7 +88,7 @@ void emit_vpu(GLcontext *ctx, struct radeon_state_atom * atom)
 		END_BATCH();
 		return;
 	}
-	
+
 	cmd.u = atom->cmd[0];
 	addr = (cmd.vpu.adrhi << 8) | cmd.vpu.adrlo;
 	ndw = cmd.vpu.count * 4;
@@ -111,7 +107,7 @@ void emit_vpu(GLcontext *ctx, struct radeon_state_atom * atom)
 		} else {
 			BEGIN_BATCH_NO_AUTOSTATE(5 + ndw);
 		}
-		OUT_BATCH_REGVAL(R300_VAP_PVS_UPLOAD_ADDRESS, addr);
+		OUT_BATCH_REGVAL(R300_VAP_PVS_VECTOR_INDX_REG, addr);
 		OUT_BATCH(CP_PACKET0(R300_VAP_PVS_UPLOAD_DATA, ndw-1) | RADEON_ONE_REG_WR);
 		for (i = 0; i < ndw; i++) {
 			OUT_BATCH(atom->cmd[i+1]);
@@ -175,7 +171,7 @@ static void emit_tex_offsets(GLcontext *ctx, struct radeon_state_atom * atom)
 
 		for(i = 0; i < numtmus; ++i) {
 		    radeonTexObj *t = r300->hw.textures[i];
-		
+
 		    if (!t)
 			notexture = 1;
 		}
@@ -183,30 +179,75 @@ static void emit_tex_offsets(GLcontext *ctx, struct radeon_state_atom * atom)
 		if (r300->radeon.radeonScreen->kernel_mm && notexture) {
 			return;
 		}
-		BEGIN_BATCH_NO_AUTOSTATE(4 * numtmus);
 		for(i = 0; i < numtmus; ++i) {
 		    radeonTexObj *t = r300->hw.textures[i];
-		    OUT_BATCH_REGSEQ(R300_TX_OFFSET_0 + (i * 4), 1);
 		    if (t && !t->image_override) {
+                BEGIN_BATCH_NO_AUTOSTATE(4);
+                OUT_BATCH_REGSEQ(R300_TX_OFFSET_0 + (i * 4), 1);
 			    OUT_BATCH_RELOC(t->tile_bits, t->mt->bo, 0,
 					    RADEON_GEM_DOMAIN_GTT|RADEON_GEM_DOMAIN_VRAM, 0, 0);
+                END_BATCH();
 		    } else if (!t) {
-			    OUT_BATCH(r300->radeon.radeonScreen->texOffset[0]);
+                /* Texture unit hasn't a texture bound nothings to do */
 		    } else { /* override cases */
 			    if (t->bo) {
+                    BEGIN_BATCH_NO_AUTOSTATE(4);
+                    OUT_BATCH_REGSEQ(R300_TX_OFFSET_0 + (i * 4), 1);
 				    OUT_BATCH_RELOC(t->tile_bits, t->bo, 0,
 						    RADEON_GEM_DOMAIN_GTT|RADEON_GEM_DOMAIN_VRAM, 0, 0);
+                    END_BATCH();
 			    } else if (!r300->radeon.radeonScreen->kernel_mm) {
+                    BEGIN_BATCH_NO_AUTOSTATE(2);
+                    OUT_BATCH_REGSEQ(R300_TX_OFFSET_0 + (i * 4), 1);
 				    OUT_BATCH(t->override_offset);
-			    }
-			    else
-			    	OUT_BATCH(r300->radeon.radeonScreen->texOffset[0]);
+                    END_BATCH();
+			    } else {
+                    /* Texture unit hasn't a texture bound nothings to do */
+                }
 		    }
 		}
-		END_BATCH();
 	}
 }
 
+void r300_emit_scissor(GLcontext *ctx)
+{
+	r300ContextPtr r300 = R300_CONTEXT(ctx);
+	BATCH_LOCALS(&r300->radeon);
+    unsigned x1, y1, x2, y2;
+	struct radeon_renderbuffer *rrb;
+
+    if (!r300->radeon.radeonScreen->driScreen->dri2.enabled) {
+        return;
+    }
+	rrb = radeon_get_colorbuffer(&r300->radeon);
+	if (!rrb || !rrb->bo) {
+		fprintf(stderr, "no rrb\n");
+		return;
+	}
+    if (r300->radeon.state.scissor.enabled) {
+        x1 = r300->radeon.state.scissor.rect.x1;
+        y1 = r300->radeon.state.scissor.rect.y1;
+        x2 = r300->radeon.state.scissor.rect.x2 - 1;
+        y2 = r300->radeon.state.scissor.rect.y2 - 1;
+    } else {
+        x1 = 0;
+        y1 = 0;
+        x2 = rrb->width - 1;
+        y2 = rrb->height - 1;
+    }
+    if (r300->radeon.radeonScreen->chip_family < CHIP_FAMILY_RV515) {
+        x1 += R300_SCISSORS_OFFSET;
+        y1 += R300_SCISSORS_OFFSET;
+        x2 += R300_SCISSORS_OFFSET;
+        y2 += R300_SCISSORS_OFFSET;
+    }
+    BEGIN_BATCH_NO_AUTOSTATE(3);
+    OUT_BATCH_REGSEQ(R300_SC_SCISSORS_TL, 2);
+    OUT_BATCH((x1 << R300_SCISSORS_X_SHIFT)|(y1 << R300_SCISSORS_Y_SHIFT));
+    OUT_BATCH((x2 << R300_SCISSORS_X_SHIFT)|(y2 << R300_SCISSORS_Y_SHIFT));
+    END_BATCH();
+}
+
 static void emit_cb_offset(GLcontext *ctx, struct radeon_state_atom * atom)
 {
 	r300ContextPtr r300 = R300_CONTEXT(ctx);
@@ -215,6 +256,7 @@ static void emit_cb_offset(GLcontext *ctx, struct radeon_state_atom * atom)
 	uint32_t cbpitch;
 	uint32_t offset = r300->radeon.state.color.draw_offset;
 	uint32_t dw = 6;
+    int i;
 
 	rrb = radeon_get_colorbuffer(&r300->radeon);
 	if (!rrb || !rrb->bo) {
@@ -247,16 +289,39 @@ static void emit_cb_offset(GLcontext *ctx, struct radeon_state_atom * atom)
             BEGIN_BATCH_NO_AUTOSTATE(3);
             OUT_BATCH_REGSEQ(R300_SC_SCISSORS_TL, 2);
             OUT_BATCH(0);
-            OUT_BATCH((rrb->width << R300_SCISSORS_X_SHIFT) |
-                    (rrb->height << R300_SCISSORS_Y_SHIFT));
+            OUT_BATCH(((rrb->width - 1) << R300_SCISSORS_X_SHIFT) |
+                    ((rrb->height - 1) << R300_SCISSORS_Y_SHIFT));
+            END_BATCH();
+            BEGIN_BATCH_NO_AUTOSTATE(16);
+            for (i = 0; i < 4; i++) {
+                OUT_BATCH_REGSEQ(R300_SC_CLIPRECT_TL_0 + (i * 8), 2);
+                OUT_BATCH((0 << R300_CLIPRECT_X_SHIFT) | (0 << R300_CLIPRECT_Y_SHIFT));
+                OUT_BATCH(((rrb->width - 1) << R300_CLIPRECT_X_SHIFT) | ((rrb->height - 1) << R300_CLIPRECT_Y_SHIFT));
+            }
+            OUT_BATCH_REGSEQ(R300_SC_CLIP_RULE, 1);
+            OUT_BATCH(0xAAAA);
+            OUT_BATCH_REGSEQ(R300_SC_SCREENDOOR, 1);
+            OUT_BATCH(0xffffff);
             END_BATCH();
         } else {
             BEGIN_BATCH_NO_AUTOSTATE(3);
             OUT_BATCH_REGSEQ(R300_SC_SCISSORS_TL, 2);
             OUT_BATCH((R300_SCISSORS_OFFSET << R300_SCISSORS_X_SHIFT) |
                     (R300_SCISSORS_OFFSET << R300_SCISSORS_Y_SHIFT));
-            OUT_BATCH(((rrb->width + R300_SCISSORS_OFFSET) << R300_SCISSORS_X_SHIFT) |
-                    ((rrb->height + R300_SCISSORS_OFFSET) << R300_SCISSORS_Y_SHIFT));
+            OUT_BATCH(((rrb->width + R300_SCISSORS_OFFSET - 1) << R300_SCISSORS_X_SHIFT) |
+                    ((rrb->height + R300_SCISSORS_OFFSET - 1) << R300_SCISSORS_Y_SHIFT));
+            END_BATCH();
+            BEGIN_BATCH_NO_AUTOSTATE(16);
+            for (i = 0; i < 4; i++) {
+                OUT_BATCH_REGSEQ(R300_SC_CLIPRECT_TL_0 + (i * 8), 2);
+                OUT_BATCH((R300_SCISSORS_OFFSET << R300_CLIPRECT_X_SHIFT) | (R300_SCISSORS_OFFSET << R300_CLIPRECT_Y_SHIFT));
+                OUT_BATCH(((R300_SCISSORS_OFFSET + rrb->width - 1) << R300_CLIPRECT_X_SHIFT) |
+                          ((R300_SCISSORS_OFFSET + rrb->height - 1) << R300_CLIPRECT_Y_SHIFT));
+            }
+            OUT_BATCH_REGSEQ(R300_SC_CLIP_RULE, 1);
+            OUT_BATCH(0xAAAA);
+            OUT_BATCH_REGSEQ(R300_SC_SCREENDOOR, 1);
+            OUT_BATCH(0xffffff);
             END_BATCH();
         }
     }
@@ -280,7 +345,7 @@ static void emit_zb_offset(GLcontext *ctx, struct radeon_state_atom * atom)
 	if (rrb->bo->flags & RADEON_BO_FLAGS_MICRO_TILE){
 		zbpitch |= R300_DEPTHMICROTILE_TILED;
 	}
-	
+
 	BEGIN_BATCH_NO_AUTOSTATE(6);
 	OUT_BATCH_REGSEQ(R300_ZB_DEPTHOFFSET, 1);
 	OUT_BATCH_RELOC(0, rrb->bo, 0, 0, RADEON_GEM_DOMAIN_VRAM, 0);
@@ -288,6 +353,46 @@ static void emit_zb_offset(GLcontext *ctx, struct radeon_state_atom * atom)
 	END_BATCH();
 }
 
+static void emit_gb_misc(GLcontext *ctx, struct radeon_state_atom * atom)
+{
+	r300ContextPtr r300 = R300_CONTEXT(ctx);
+	BATCH_LOCALS(&r300->radeon);
+    if (!r300->radeon.radeonScreen->driScreen->dri2.enabled) {
+        BEGIN_BATCH_NO_AUTOSTATE(4);
+        OUT_BATCH(atom->cmd[0]);
+        OUT_BATCH(atom->cmd[1]);
+        OUT_BATCH(atom->cmd[2]);
+        OUT_BATCH(atom->cmd[3]);
+        END_BATCH();
+    }
+}
+
+static void emit_threshold_misc(GLcontext *ctx, struct radeon_state_atom * atom)
+{
+	r300ContextPtr r300 = R300_CONTEXT(ctx);
+	BATCH_LOCALS(&r300->radeon);
+    if (r300->radeon.radeonScreen->chip_family >= CHIP_FAMILY_RV515) {
+        BEGIN_BATCH_NO_AUTOSTATE(3);
+        OUT_BATCH(atom->cmd[0]);
+        OUT_BATCH(atom->cmd[1]);
+        OUT_BATCH(atom->cmd[2]);
+        END_BATCH();
+    }
+}
+
+static void emit_shade_misc(GLcontext *ctx, struct radeon_state_atom * atom)
+{
+	r300ContextPtr r300 = R300_CONTEXT(ctx);
+	BATCH_LOCALS(&r300->radeon);
+
+    if (!r300->radeon.radeonScreen->driScreen->dri2.enabled) {
+        BEGIN_BATCH_NO_AUTOSTATE(2);
+        OUT_BATCH(atom->cmd[0]);
+        OUT_BATCH(atom->cmd[1]);
+        END_BATCH();
+    }
+}
+
 static void emit_zstencil_format(GLcontext *ctx, struct radeon_state_atom * atom)
 {
 	r300ContextPtr r300 = R300_CONTEXT(ctx);
@@ -372,12 +477,10 @@ int check_r500fp_const(GLcontext *ctx, struct radeon_state_atom *atom)
 void r300InitCmdBuf(r300ContextPtr r300)
 {
 	int mtu;
-	int has_tcl = 1;
+	int has_tcl;
 	int is_r500 = 0;
-	int i;
 
-	if (!(r300->radeon.radeonScreen->chip_flags & RADEON_CHIPSET_TCL))
-		has_tcl = 0;
+	has_tcl = r300->options.hw_tcl_enabled;
 
 	if (r300->radeon.radeonScreen->chip_family >= CHIP_FAMILY_RV515)
 		is_r500 = 1;
@@ -444,7 +547,10 @@ void r300InitCmdBuf(r300ContextPtr r300)
 	ALLOC_STATE(gb_enable, always, 2, 0);
 	r300->hw.gb_enable.cmd[0] = cmdpacket0(r300->radeon.radeonScreen, R300_GB_ENABLE, 1);
 	ALLOC_STATE(gb_misc, always, R300_GB_MISC_CMDSIZE, 0);
-	r300->hw.gb_misc.cmd[0] = cmdpacket0(r300->radeon.radeonScreen, R300_GB_MSPOS0, 5);
+	r300->hw.gb_misc.cmd[0] = cmdpacket0(r300->radeon.radeonScreen, R300_GB_MSPOS0, 3);
+	r300->hw.gb_misc.emit = emit_gb_misc;
+	ALLOC_STATE(gb_misc2, always, R300_GB_MISC2_CMDSIZE, 0);
+    r300->hw.gb_misc2.cmd[0] = cmdpacket0(r300->radeon.radeonScreen, 0x401C, 2);
 	ALLOC_STATE(txe, always, R300_TXE_CMDSIZE, 0);
 	r300->hw.txe.cmd[R300_TXE_CMD_0] = cmdpacket0(r300->radeon.radeonScreen, R300_TX_ENABLE, 1);
 	ALLOC_STATE(ga_point_s0, always, 5, 0);
@@ -459,8 +565,11 @@ void r300InitCmdBuf(r300ContextPtr r300)
 	r300->hw.lcntl.cmd[0] = cmdpacket0(r300->radeon.radeonScreen, R300_GA_LINE_CNTL, 1);
 	ALLOC_STATE(ga_line_stipple, always, 4, 0);
 	r300->hw.ga_line_stipple.cmd[0] = cmdpacket0(r300->radeon.radeonScreen, R300_GA_LINE_STIPPLE_VALUE, 3);
-	ALLOC_STATE(shade, always, 5, 0);
-	r300->hw.shade.cmd[0] = cmdpacket0(r300->radeon.radeonScreen, R300_GA_ENHANCE, 4);
+	ALLOC_STATE(shade, always, 2, 0);
+	r300->hw.shade.cmd[0] = cmdpacket0(r300->radeon.radeonScreen, R300_GA_ENHANCE, 1);
+	r300->hw.shade.emit = emit_shade_misc;
+	ALLOC_STATE(shade2, always, 4, 0);
+	r300->hw.shade2.cmd[0] = cmdpacket0(r300->radeon.radeonScreen, 0x4278, 3);
 	ALLOC_STATE(polygon_mode, always, 4, 0);
 	r300->hw.polygon_mode.cmd[0] = cmdpacket0(r300->radeon.radeonScreen, R300_GA_POLY_MODE, 3);
 	ALLOC_STATE(fogp, always, 3, 0);
@@ -479,19 +588,12 @@ void r300InitCmdBuf(r300ContextPtr r300)
 	ALLOC_STATE(rc, always, R300_RC_CMDSIZE, 0);
 	r300->hw.rc.cmd[R300_RC_CMD_0] = cmdpacket0(r300->radeon.radeonScreen, R300_RS_COUNT, 2);
 	if (is_r500) {
-		ALLOC_STATE(ri, always, R500_RI_CMDSIZE, 0);
+		ALLOC_STATE(ri, variable, R500_RI_CMDSIZE, 0);
 		r300->hw.ri.cmd[R300_RI_CMD_0] = cmdpacket0(r300->radeon.radeonScreen, R500_RS_IP_0, 16);
-		for (i = 0; i < 8; i++) {
-			r300->hw.ri.cmd[R300_RI_CMD_0 + i +1] =
-			  (R500_RS_IP_PTR_K0 << R500_RS_IP_TEX_PTR_S_SHIFT) |
-                          (R500_RS_IP_PTR_K0 << R500_RS_IP_TEX_PTR_T_SHIFT) |
-                          (R500_RS_IP_PTR_K0 << R500_RS_IP_TEX_PTR_R_SHIFT) |
-                          (R500_RS_IP_PTR_K1 << R500_RS_IP_TEX_PTR_Q_SHIFT);
-		}
 		ALLOC_STATE(rr, variable, R300_RR_CMDSIZE, 0);
 		r300->hw.rr.cmd[R300_RR_CMD_0] = cmdpacket0(r300->radeon.radeonScreen, R500_RS_INST_0, 1);
 	} else {
-		ALLOC_STATE(ri, always, R300_RI_CMDSIZE, 0);
+		ALLOC_STATE(ri, variable, R300_RI_CMDSIZE, 0);
 		r300->hw.ri.cmd[R300_RI_CMD_0] = cmdpacket0(r300->radeon.radeonScreen, R300_RS_IP_0, 8);
 		ALLOC_STATE(rr, variable, R300_RR_CMDSIZE, 0);
 		r300->hw.rr.cmd[R300_RR_CMD_0] = cmdpacket0(r300->radeon.radeonScreen, R300_RS_INST_0, 1);
@@ -567,8 +669,9 @@ void r300InitCmdBuf(r300ContextPtr r300)
 	r300->hw.rb3d_dither_ctl.cmd[0] = cmdpacket0(r300->radeon.radeonScreen, R300_RB3D_DITHER_CTL, 9);
 	ALLOC_STATE(rb3d_aaresolve_ctl, always, 2, 0);
 	r300->hw.rb3d_aaresolve_ctl.cmd[0] = cmdpacket0(r300->radeon.radeonScreen, R300_RB3D_AARESOLVE_CTL, 1);
-	ALLOC_STATE(rb3d_discard_src_pixel_lte_threshold, always, 3, 0);
-	r300->hw.rb3d_discard_src_pixel_lte_threshold.cmd[0] = cmdpacket0(r300->radeon.radeonScreen, R500_RB3D_DISCARD_SRC_PIXEL_LTE_THRESHOLD, 2);
+    ALLOC_STATE(rb3d_discard_src_pixel_lte_threshold, always, 3, 0);
+    r300->hw.rb3d_discard_src_pixel_lte_threshold.cmd[0] = cmdpacket0(r300->radeon.radeonScreen, R500_RB3D_DISCARD_SRC_PIXEL_LTE_THRESHOLD, 2);
+	r300->hw.rb3d_discard_src_pixel_lte_threshold.emit = emit_threshold_misc;
 	ALLOC_STATE(zs, always, R300_ZS_CMDSIZE, 0);
 	r300->hw.zs.cmd[R300_ZS_CMD_0] =
 	    cmdpacket0(r300->radeon.radeonScreen, R300_ZB_CNTL, 3);
@@ -582,8 +685,8 @@ void r300InitCmdBuf(r300ContextPtr r300)
 	r300->hw.zb.emit = emit_zb_offset;
 	ALLOC_STATE(zb_depthclearvalue, always, 2, 0);
 	r300->hw.zb_depthclearvalue.cmd[0] = cmdpacket0(r300->radeon.radeonScreen, R300_ZB_DEPTHCLEARVALUE, 1);
-	ALLOC_STATE(unk4F30, always, 3, 0);
-	r300->hw.unk4F30.cmd[0] = cmdpacket0(r300->radeon.radeonScreen, 0x4F30, 2);
+	ALLOC_STATE(zb_zmask, always, 3, 0);
+	r300->hw.zb_zmask.cmd[0] = cmdpacket0(r300->radeon.radeonScreen, R300_ZB_ZMASK_OFFSET, 2);
 	ALLOC_STATE(zb_hiz_offset, always, 2, 0);
 	r300->hw.zb_hiz_offset.cmd[0] = cmdpacket0(r300->radeon.radeonScreen, R300_ZB_HIZ_OFFSET, 1);
 	ALLOC_STATE(zb_hiz_pitch, always, 2, 0);
diff --git a/src/mesa/drivers/dri/r300/r300_cmdbuf.h b/src/mesa/drivers/dri/r300/r300_cmdbuf.h
index 3786813de36..53bcc0eeb49 100644
--- a/src/mesa/drivers/dri/r300/r300_cmdbuf.h
+++ b/src/mesa/drivers/dri/r300/r300_cmdbuf.h
@@ -39,6 +39,7 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "r300_context.h"
 
 extern void r300InitCmdBuf(r300ContextPtr r300);
+void r300_emit_scissor(GLcontext *ctx);
 
 void emit_vpu(GLcontext *ctx, struct radeon_state_atom * atom);
 int check_vpu(GLcontext *ctx, struct radeon_state_atom *atom);
diff --git a/src/mesa/drivers/dri/r300/r300_context.c b/src/mesa/drivers/dri/r300/r300_context.c
index 5f279d6629e..394521a051f 100644
--- a/src/mesa/drivers/dri/r300/r300_context.c
+++ b/src/mesa/drivers/dri/r300/r300_context.c
@@ -64,6 +64,7 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "r300_ioctl.h"
 #include "r300_tex.h"
 #include "r300_emit.h"
+#include "r300_render.h"
 #include "r300_swtcl.h"
 #include "radeon_bocs_wrapper.h"
 
@@ -72,10 +73,6 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "utils.h"
 #include "xmlpool.h"		/* for symbolic values of enum-type options */
 
-/* hw_tcl_on derives from future_hw_tcl_on when its safe to change it. */
-int future_hw_tcl_on = 1;
-int hw_tcl_on = 1;
-
 #define need_GL_VERSION_2_0
 #define need_GL_ARB_point_parameters
 #define need_GL_ARB_vertex_program
@@ -127,6 +124,7 @@ const struct dri_extension card_extensions[] = {
   {"GL_EXT_texture_lod_bias",		NULL},
   {"GL_EXT_texture_mirror_clamp",	NULL},
   {"GL_EXT_texture_rectangle",		NULL},
+  {"GL_EXT_vertex_array_bgra",		NULL},
   {"GL_ATI_separate_stencil",		GL_ATI_separate_stencil_functions},
   {"GL_ATI_texture_env_combine3",	NULL},
   {"GL_ATI_texture_mirror_once",	NULL},
@@ -154,16 +152,8 @@ const struct dri_extension gl_20_extension[] = {
   {"GL_VERSION_2_0",			GL_VERSION_2_0_functions },
 };
 
-
-extern struct tnl_pipeline_stage _r300_render_stage;
-extern const struct tnl_pipeline_stage _r300_tcl_stage;
-
 static const struct tnl_pipeline_stage *r300_pipeline[] = {
 
-	/* Try and go straight to t&l
-	 */
-	&_r300_tcl_stage,
-
 	/* Catch any t&l fallbacks
 	 */
 	&_tnl_vertex_transform_stage,
@@ -172,6 +162,7 @@ static const struct tnl_pipeline_stage *r300_pipeline[] = {
 	&_tnl_fog_coordinate_stage,
 	&_tnl_texgen_stage,
 	&_tnl_texture_transform_stage,
+	&_tnl_point_attenuation_stage,
 	&_tnl_vertex_program_stage,
 
 	/* Try again to go to tcl?
@@ -191,17 +182,6 @@ static const struct tnl_pipeline_stage *r300_pipeline[] = {
 	0,
 };
 
-static void r300RunPipeline(GLcontext * ctx)
-{
-    _mesa_lock_context_textures(ctx);
-
-    if (ctx->NewState)
-        _mesa_update_state_locked(ctx);
-    
-    _tnl_run_pipeline(ctx);
-    _mesa_unlock_context_textures(ctx);
-}
-
 static void r300_get_lock(radeonContextPtr rmesa)
 {
 	drm_radeon_sarea_t *sarea = rmesa->sarea;
@@ -211,7 +191,7 @@ static void r300_get_lock(radeonContextPtr rmesa)
 		if (!rmesa->radeonScreen->kernel_mm)
 			radeon_bo_legacy_texture_age(rmesa->radeonScreen->bom);
 	}
-}		  
+}
 
 static void r300_vtbl_emit_cs_header(struct radeon_cs *cs, radeonContextPtr rmesa)
 {
@@ -246,9 +226,9 @@ static void r300_vtbl_pre_emit_atoms(radeonContextPtr radeon)
 {
 	r300ContextPtr r300 = (r300ContextPtr)radeon;
 	BATCH_LOCALS(radeon);
-	
+
 	r300->vap_flush_needed = GL_TRUE;
-	
+
 	cp_wait(radeon, R300_WAIT_3D | R300_WAIT_3D_CLEAN);
 	BEGIN_BATCH_NO_AUTOSTATE(2);
 	OUT_BATCH_REGVAL(R300_TX_INVALTAGS, R300_TX_FLUSH);
@@ -275,6 +255,112 @@ static void r300_init_vtbl(radeonContextPtr radeon)
 	radeon->vtbl.fallback = r300_fallback;
 }
 
+static void r300InitConstValues(GLcontext *ctx, radeonScreenPtr screen)
+{
+	r300ContextPtr r300 = R300_CONTEXT(ctx);
+
+	ctx->Const.MaxTextureImageUnits =
+	    driQueryOptioni(&r300->radeon.optionCache, "texture_image_units");
+	ctx->Const.MaxTextureCoordUnits =
+	    driQueryOptioni(&r300->radeon.optionCache, "texture_coord_units");
+	ctx->Const.MaxTextureUnits = MIN2(ctx->Const.MaxTextureImageUnits,
+		 ctx->Const.MaxTextureCoordUnits);
+
+	ctx->Const.MaxTextureMaxAnisotropy = 16.0;
+	ctx->Const.MaxTextureLodBias = 16.0;
+
+	if (screen->chip_family >= CHIP_FAMILY_RV515)
+		ctx->Const.MaxTextureLevels = 13;
+	else
+		ctx->Const.MaxTextureLevels = 12;
+
+	ctx->Const.MinPointSize = 1.0;
+	ctx->Const.MinPointSizeAA = 1.0;
+	ctx->Const.MaxPointSize = R300_POINTSIZE_MAX;
+	ctx->Const.MaxPointSizeAA = R300_POINTSIZE_MAX;
+
+	ctx->Const.MinLineWidth = 1.0;
+	ctx->Const.MinLineWidthAA = 1.0;
+	ctx->Const.MaxLineWidth = R300_LINESIZE_MAX;
+	ctx->Const.MaxLineWidthAA = R300_LINESIZE_MAX;
+
+	ctx->Const.MaxDrawBuffers = 1;
+
+	/* currently bogus data */
+	if (r300->options.hw_tcl_enabled) {
+		ctx->Const.VertexProgram.MaxInstructions = VSF_MAX_FRAGMENT_LENGTH / 4;
+		ctx->Const.VertexProgram.MaxNativeInstructions =
+		  VSF_MAX_FRAGMENT_LENGTH / 4;
+		ctx->Const.VertexProgram.MaxNativeAttribs = 16;	/* r420 */
+		ctx->Const.VertexProgram.MaxTemps = 32;
+		ctx->Const.VertexProgram.MaxNativeTemps =
+		  /*VSF_MAX_FRAGMENT_TEMPS */ 32;
+		ctx->Const.VertexProgram.MaxNativeParameters = 256;	/* r420 */
+		ctx->Const.VertexProgram.MaxNativeAddressRegs = 1;
+	}
+
+	if (screen->chip_family >= CHIP_FAMILY_RV515) {
+		ctx->Const.FragmentProgram.MaxNativeTemps = R500_PFS_NUM_TEMP_REGS;
+		ctx->Const.FragmentProgram.MaxNativeAttribs = 11;	/* copy i915... */
+		ctx->Const.FragmentProgram.MaxNativeParameters = R500_PFS_NUM_CONST_REGS;
+		ctx->Const.FragmentProgram.MaxNativeAluInstructions = R500_PFS_MAX_INST;
+		ctx->Const.FragmentProgram.MaxNativeTexInstructions = R500_PFS_MAX_INST;
+		ctx->Const.FragmentProgram.MaxNativeInstructions = R500_PFS_MAX_INST;
+		ctx->Const.FragmentProgram.MaxNativeTexIndirections = R500_PFS_MAX_INST;
+		ctx->Const.FragmentProgram.MaxNativeAddressRegs = 0;
+	} else {
+		ctx->Const.FragmentProgram.MaxNativeTemps = R300_PFS_NUM_TEMP_REGS;
+		ctx->Const.FragmentProgram.MaxNativeAttribs = 11;	/* copy i915... */
+		ctx->Const.FragmentProgram.MaxNativeParameters = R300_PFS_NUM_CONST_REGS;
+		ctx->Const.FragmentProgram.MaxNativeAluInstructions = R300_PFS_MAX_ALU_INST;
+		ctx->Const.FragmentProgram.MaxNativeTexInstructions = R300_PFS_MAX_TEX_INST;
+		ctx->Const.FragmentProgram.MaxNativeInstructions = R300_PFS_MAX_ALU_INST + R300_PFS_MAX_TEX_INST;
+		ctx->Const.FragmentProgram.MaxNativeTexIndirections = R300_PFS_MAX_TEX_INDIRECT;
+		ctx->Const.FragmentProgram.MaxNativeAddressRegs = 0;
+	}
+}
+
+static void r300ParseOptions(r300ContextPtr r300, radeonScreenPtr screen)
+{
+	struct r300_options options = { 0 };
+
+	driParseConfigFiles(&r300->radeon.optionCache, &screen->optionCache,
+			    screen->driScreen->myNum, "r300");
+
+	r300->radeon.initialMaxAnisotropy = driQueryOptionf(&r300->radeon.optionCache, "def_max_anisotropy");
+
+	options.stencil_two_side_disabled = driQueryOptionb(&r300->radeon.optionCache, "disable_stencil_two_side");
+	options.s3tc_force_enabled = driQueryOptionb(&r300->radeon.optionCache, "force_s3tc_enable");
+	options.s3tc_force_disabled = driQueryOptionb(&r300->radeon.optionCache, "disable_s3tc");
+
+	if (!(screen->chip_flags & RADEON_CHIPSET_TCL) || driQueryOptioni(&r300->radeon.optionCache, "tcl_mode") == DRI_CONF_TCL_SW)
+		options.hw_tcl_enabled = 0;
+	else
+		options.hw_tcl_enabled = 1;
+
+	options.conformance_mode = !driQueryOptionb(&r300->radeon.optionCache, "disable_lowimpact_fallback");
+
+	r300->options = options;
+}
+
+static void r300InitGLExtensions(GLcontext *ctx)
+{
+	r300ContextPtr r300 = R300_CONTEXT(ctx);
+
+	driInitExtensions(ctx, card_extensions, GL_TRUE);
+	if (r300->radeon.radeonScreen->kernel_mm)
+		driInitExtensions(ctx, mm_extensions, GL_FALSE);
+
+	if (r300->options.stencil_two_side_disabled)
+		_mesa_disable_extension(ctx, "GL_EXT_stencil_two_side");
+
+	if (ctx->Mesa_DXTn && !r300->options.s3tc_force_enabled) {
+		_mesa_enable_extension(ctx, "GL_EXT_texture_compression_s3tc");
+		_mesa_enable_extension(ctx, "GL_S3_s3tc");
+	} else if (r300->options.s3tc_force_disabled) {
+		_mesa_disable_extension(ctx, "GL_EXT_texture_compression_s3tc");
+	}
+}
 
 /* Create the device specific rendering context.
  */
@@ -287,33 +373,19 @@ GLboolean r300CreateContext(const __GLcontextModes * glVisual,
 	struct dd_function_table functions;
 	r300ContextPtr r300;
 	GLcontext *ctx;
-	int tcl_mode;
 
 	assert(glVisual);
 	assert(driContextPriv);
 	assert(screen);
 
-	/* Allocate the R300 context */
 	r300 = (r300ContextPtr) CALLOC(sizeof(*r300));
 	if (!r300)
 		return GL_FALSE;
 
-	if (!(screen->chip_flags & RADEON_CHIPSET_TCL))
-		hw_tcl_on = future_hw_tcl_on = 0;
+	r300ParseOptions(r300, screen);
 
 	r300_init_vtbl(&r300->radeon);
-	/* Parse configuration files.
-	 * Do this here so that initialMaxAnisotropy is set before we create
-	 * the default textures.
-	 */
-	driParseConfigFiles(&r300->radeon.optionCache, &screen->optionCache,
-			    screen->driScreen->myNum, "r300");
-	r300->radeon.initialMaxAnisotropy = driQueryOptionf(&r300->radeon.optionCache,
-						     "def_max_anisotropy");
 
-	/* Init default driver functions then plug in our R300-specific functions
-	 * (the texture functions are especially important)
-	 */
 	_mesa_init_driver_functions(&functions);
 	r300InitIoctlFuncs(&functions);
 	r300InitStateFuncs(&functions);
@@ -327,46 +399,15 @@ GLboolean r300CreateContext(const __GLcontextModes * glVisual,
 		return GL_FALSE;
 	}
 
-	/* Init r300 context data */
-	/* Set the maximum texture size small enough that we can guarentee that
-	 * all texture units can bind a maximal texture and have them both in
-	 * texturable memory at once.
-	 */
-
 	ctx = r300->radeon.glCtx;
 
-	ctx->Const.MaxTextureImageUnits =
-	    driQueryOptioni(&r300->radeon.optionCache, "texture_image_units");
-	ctx->Const.MaxTextureCoordUnits =
-	    driQueryOptioni(&r300->radeon.optionCache, "texture_coord_units");
-	ctx->Const.MaxTextureUnits =
-	    MIN2(ctx->Const.MaxTextureImageUnits,
-		 ctx->Const.MaxTextureCoordUnits);
-	ctx->Const.MaxTextureMaxAnisotropy = 16.0;
-	ctx->Const.MaxTextureLodBias = 16.0;
-
-	if (screen->chip_family >= CHIP_FAMILY_RV515)
-	    ctx->Const.MaxTextureLevels = 13;
-	else
-	    ctx->Const.MaxTextureLevels = 12;
-
-	ctx->Const.MinPointSize = 1.0;
-	ctx->Const.MinPointSizeAA = 1.0;
-	ctx->Const.MaxPointSize = R300_POINTSIZE_MAX;
-	ctx->Const.MaxPointSizeAA = R300_POINTSIZE_MAX;
+	r300->fallback = 0;
+	if (r300->options.hw_tcl_enabled)
+		ctx->VertexProgram._MaintainTnlProgram = GL_TRUE;
 
-	ctx->Const.MinLineWidth = 1.0;
-	ctx->Const.MinLineWidthAA = 1.0;
-	ctx->Const.MaxLineWidth = R300_LINESIZE_MAX;
-	ctx->Const.MaxLineWidthAA = R300_LINESIZE_MAX;
-
-	/* Needs further modifications */
-#if 0
-	ctx->Const.MaxArrayLockSize =
-	    ( /*512 */ RADEON_BUFFER_SIZE * 16 * 1024) / (4 * 4);
-#endif
+	ctx->FragmentProgram._MaintainTexEnvProgram = GL_TRUE;
 
-	ctx->Const.MaxDrawBuffers = 1;
+	r300InitConstValues(ctx, screen);
 
 	/* Initialize the software rasterizer and helper modules.
 	 */
@@ -375,16 +416,12 @@ GLboolean r300CreateContext(const __GLcontextModes * glVisual,
 	_tnl_CreateContext(ctx);
 	_swsetup_CreateContext(ctx);
 	_swsetup_Wakeup(ctx);
-	_ae_create_context(ctx);
 
 	/* Install the customized pipeline:
 	 */
 	_tnl_destroy_pipeline(ctx);
 	_tnl_install_pipeline(ctx, r300_pipeline);
-
-	/* Try and keep materials and vertices separate:
-	 */
-/* 	_tnl_isolate_materials(ctx, GL_TRUE); */
+	TNL_CONTEXT(ctx)->Driver.RunPipeline = _tnl_run_pipeline;
 
 	/* Configure swrast and TNL to match hardware characteristics:
 	 */
@@ -393,80 +430,25 @@ GLboolean r300CreateContext(const __GLcontextModes * glVisual,
 	_tnl_allow_pixel_fog(ctx, GL_FALSE);
 	_tnl_allow_vertex_fog(ctx, GL_TRUE);
 
-	/* currently bogus data */
-	if (screen->chip_flags & RADEON_CHIPSET_TCL) {
-	        ctx->Const.VertexProgram.MaxInstructions = VSF_MAX_FRAGMENT_LENGTH / 4;
-		ctx->Const.VertexProgram.MaxNativeInstructions =
-		  VSF_MAX_FRAGMENT_LENGTH / 4;
-		ctx->Const.VertexProgram.MaxNativeAttribs = 16;	/* r420 */
-		ctx->Const.VertexProgram.MaxTemps = 32;
-		ctx->Const.VertexProgram.MaxNativeTemps =
-		  /*VSF_MAX_FRAGMENT_TEMPS */ 32;
-		ctx->Const.VertexProgram.MaxNativeParameters = 256;	/* r420 */
-		ctx->Const.VertexProgram.MaxNativeAddressRegs = 1;
-	}
-
-	ctx->Const.FragmentProgram.MaxNativeTemps = PFS_NUM_TEMP_REGS;
-	ctx->Const.FragmentProgram.MaxNativeAttribs = 11;	/* copy i915... */
-	ctx->Const.FragmentProgram.MaxNativeParameters = PFS_NUM_CONST_REGS;
-	ctx->Const.FragmentProgram.MaxNativeAluInstructions = PFS_MAX_ALU_INST;
-	ctx->Const.FragmentProgram.MaxNativeTexInstructions = PFS_MAX_TEX_INST;
-	ctx->Const.FragmentProgram.MaxNativeInstructions =
-	    PFS_MAX_ALU_INST + PFS_MAX_TEX_INST;
-	ctx->Const.FragmentProgram.MaxNativeTexIndirections =
-	    PFS_MAX_TEX_INDIRECT;
-	ctx->Const.FragmentProgram.MaxNativeAddressRegs = 0;	/* and these are?? */
-	ctx->VertexProgram._MaintainTnlProgram = GL_TRUE;
-	ctx->FragmentProgram._MaintainTexEnvProgram = GL_TRUE;
-
-	driInitExtensions(ctx, card_extensions, GL_TRUE);
-	if (r300->radeon.radeonScreen->kernel_mm)
-	  driInitExtensions(ctx, mm_extensions, GL_FALSE);
-
-	if (driQueryOptionb
-	    (&r300->radeon.optionCache, "disable_stencil_two_side"))
-		_mesa_disable_extension(ctx, "GL_EXT_stencil_two_side");
-
-	if (r300->radeon.glCtx->Mesa_DXTn
-	    && !driQueryOptionb(&r300->radeon.optionCache, "disable_s3tc")) {
-		_mesa_enable_extension(ctx, "GL_EXT_texture_compression_s3tc");
-		_mesa_enable_extension(ctx, "GL_S3_s3tc");
-	} else
-	    if (driQueryOptionb(&r300->radeon.optionCache, "force_s3tc_enable"))
-	{
-		_mesa_enable_extension(ctx, "GL_EXT_texture_compression_s3tc");
+	if (r300->options.hw_tcl_enabled) {
+		r300InitDraw(ctx);
+	} else {
+		r300InitSwtcl(ctx);
 	}
 
-	r300->disable_lowimpact_fallback =
-	    driQueryOptionb(&r300->radeon.optionCache,
-			    "disable_lowimpact_fallback");
 	radeon_fbo_init(&r300->radeon);
-   	radeonInitSpanFuncs( ctx );
+	radeonInitSpanFuncs( ctx );
 	r300InitCmdBuf(r300);
 	r300InitState(r300);
-	if (!(screen->chip_flags & RADEON_CHIPSET_TCL))
-	        r300InitSwtcl(ctx);
-
-	TNL_CONTEXT(ctx)->Driver.RunPipeline = r300RunPipeline;
+	r300InitShaderFunctions(r300);
 
-	tcl_mode = driQueryOptioni(&r300->radeon.optionCache, "tcl_mode");
-	if (driQueryOptionb(&r300->radeon.optionCache, "no_rast")) {
-		fprintf(stderr, "disabling 3D acceleration\n");
-#if R200_MERGED
-		FALLBACK(&r300->radeon, RADEON_FALLBACK_DISABLE, 1);
-#endif
-	}
-	if (tcl_mode == DRI_CONF_TCL_SW ||
-	    !(r300->radeon.radeonScreen->chip_flags & RADEON_CHIPSET_TCL)) {
-		if (r300->radeon.radeonScreen->chip_flags & RADEON_CHIPSET_TCL) {
-			r300->radeon.radeonScreen->chip_flags &=
-			    ~RADEON_CHIPSET_TCL;
-			fprintf(stderr, "Disabling HW TCL support\n");
-		}
-		TCL_FALLBACK(r300->radeon.glCtx,
-			     RADEON_TCL_FALLBACK_TCL_DISABLE, 1);
+	if (screen->chip_family == CHIP_FAMILY_RS600 ||	screen->chip_family == CHIP_FAMILY_RS690 ||
+		screen->chip_family == CHIP_FAMILY_RS740) {
+		r300->radeon.texture_row_align = 64;
 	}
 
+	r300InitGLExtensions(ctx);
+
 	return GL_TRUE;
 }
 
diff --git a/src/mesa/drivers/dri/r300/r300_context.h b/src/mesa/drivers/dri/r300/r300_context.h
index 602f86ba669..026c33c67cb 100644
--- a/src/mesa/drivers/dri/r300/r300_context.h
+++ b/src/mesa/drivers/dri/r300/r300_context.h
@@ -37,24 +37,19 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #ifndef __R300_CONTEXT_H__
 #define __R300_CONTEXT_H__
 
-#include "tnl/t_vertex.h"
 #include "drm.h"
 #include "radeon_drm.h"
 #include "dri_util.h"
-#include "texmem.h"
 #include "radeon_common.h"
 
-#include "main/macros.h"
 #include "main/mtypes.h"
-#include "main/colormac.h"
+#include "shader/prog_instruction.h"
 
 struct r300_context;
 typedef struct r300_context r300ContextRec;
 typedef struct r300_context *r300ContextPtr;
 
 
-#include "main/mm.h"
-
 /* From http://gcc. gnu.org/onlinedocs/gcc-3.2.3/gcc/Variadic-Macros.html .
    I suppose we could inline this and use macro to fetch out __LINE__ and stuff in case we run into trouble
    with other compilers ... GLUE!
@@ -72,8 +67,6 @@ typedef struct r300_context *r300ContextPtr;
 	}
 
 #include "r300_vertprog.h"
-#include "r500_fragprog.h"
-
 
 
 /* The blit width for texture uploads
@@ -81,9 +74,6 @@ typedef struct r300_context *r300ContextPtr;
 #define R300_BLIT_WIDTH_BYTES 1024
 #define R300_MAX_TEXTURE_UNITS 8
 
-struct r300_texture_state {
-	int tc_count;		/* number of incoming texture coordinates from VAP */
-};
 
 
 #define R300_VPT_CMD_0		0
@@ -126,9 +116,11 @@ struct r300_texture_state {
 #define R300_GB_MISC_MSPOS_0		1
 #define R300_GB_MISC_MSPOS_1		2
 #define R300_GB_MISC_TILE_CONFIG	3
-#define R300_GB_MISC_SELECT		4
-#define R300_GB_MISC_AA_CONFIG		5
-#define R300_GB_MISC_CMDSIZE		6
+#define R300_GB_MISC_CMDSIZE		4
+#define R300_GB_MISC2_CMD_0		    0
+#define R300_GB_MISC2_SELECT		1
+#define R300_GB_MISC2_AA_CONFIG		2
+#define R300_GB_MISC2_CMDSIZE		3
 
 #define R300_TXE_CMD_0		0
 #define R300_TXE_ENABLE		1
@@ -303,7 +295,7 @@ struct r300_texture_state {
 struct r300_hw_state {
 	struct radeon_state_atom vpt;	/* viewport (1D98) */
 	struct radeon_state_atom vap_cntl;
-        struct radeon_state_atom vap_index_offset; /* 0x208c r5xx only */
+	struct radeon_state_atom vap_index_offset; /* 0x208c r5xx only */
 	struct radeon_state_atom vof;	/* VAP output format register 0x2090 */
 	struct radeon_state_atom vte;	/* (20B0) */
 	struct radeon_state_atom vap_vf_max_vtx_indx;	/* Maximum Vertex Indx Clamp (2134) */
@@ -317,6 +309,7 @@ struct r300_hw_state {
 	struct radeon_state_atom pvs;	/* pvs_cntl (22D0) */
 	struct radeon_state_atom gb_enable;	/* (4008) */
 	struct radeon_state_atom gb_misc;	/* Multisampling position shifts ? (4010) */
+	struct radeon_state_atom gb_misc2;	/* Multisampling position shifts ? (4010) */
 	struct radeon_state_atom ga_point_s0;	/* S Texture Coordinate of Vertex 0 for Point texture stuffing (LLC) (4200) */
 	struct radeon_state_atom ga_triangle_stipple;	/* (4214) */
 	struct radeon_state_atom ps;	/* pointsize (421C) */
@@ -324,6 +317,7 @@ struct r300_hw_state {
 	struct radeon_state_atom lcntl;	/* line control */
 	struct radeon_state_atom ga_line_stipple;	/* (4260) */
 	struct radeon_state_atom shade;
+	struct radeon_state_atom shade2;
 	struct radeon_state_atom polygon_mode;
 	struct radeon_state_atom fogp;	/* fog parameters (4294) */
 	struct radeon_state_atom ga_soft_reset;	/* (429C) */
@@ -361,7 +355,7 @@ struct r300_hw_state {
 	struct radeon_state_atom zstencil_format;
 	struct radeon_state_atom zb;	/* z buffer (4F20) */
 	struct radeon_state_atom zb_depthclearvalue;	/* (4F28) */
-	struct radeon_state_atom unk4F30;	/* (4F30) */
+	struct radeon_state_atom zb_zmask;	/* (4F30) */
 	struct radeon_state_atom zb_hiz_offset;	/* (4F44) */
 	struct radeon_state_atom zb_hiz_pitch;	/* (4F54) */
 
@@ -405,73 +399,55 @@ struct r300_hw_state {
 #define STATE_R300_WINDOW_DIMENSION (STATE_INTERNAL_DRIVER+0)
 #define STATE_R300_TEXRECT_FACTOR (STATE_INTERNAL_DRIVER+1)
 
-struct r300_vertex_shader_fragment {
-	int length;
-	union {
-		GLuint d[VSF_MAX_FRAGMENT_LENGTH];
-		float f[VSF_MAX_FRAGMENT_LENGTH];
-		GLuint i[VSF_MAX_FRAGMENT_LENGTH];
-	} body;
-};
-
-struct r300_vertex_shader_state {
-	struct r300_vertex_shader_fragment program;
-};
-
-extern int hw_tcl_on;
-
 #define COLOR_IS_RGBA
 #define TAG(x) r300##x
 #include "tnl_dd/t_dd_vertex.h"
 #undef TAG
 
-//#define CURRENT_VERTEX_SHADER(ctx) (ctx->VertexProgram._Current)
-#define CURRENT_VERTEX_SHADER(ctx) (R300_CONTEXT(ctx)->selected_vp)
-
-/* Should but doesnt work */
-//#define CURRENT_VERTEX_SHADER(ctx) (R300_CONTEXT(ctx)->curr_vp)
-
-/* r300_vertex_shader_state and r300_vertex_program should probably be merged together someday.
- * Keeping them them seperate for now should ensure fixed pipeline keeps functioning properly.
- */
-
-struct r300_vertex_program_key {
-	GLuint InputsRead;
-	GLuint OutputsWritten;
-	GLuint OutputsAdded;
-};
-
 struct r300_vertex_program {
 	struct r300_vertex_program *next;
-	struct r300_vertex_program_key key;
-	int translated;
 
-	struct r300_vertex_shader_fragment program;
+	struct r300_vertex_program_key {
+		GLuint InputsRead;
+		GLuint OutputsWritten;
+		GLuint OutputsAdded;
+	} key;
+	
+	struct r300_vertex_shader_hw_code {
+		int length;
+		union {
+			GLuint d[VSF_MAX_FRAGMENT_LENGTH];
+			float f[VSF_MAX_FRAGMENT_LENGTH];
+		} body;
+	} hw_code;
+
+	GLboolean translated;
+	GLboolean error;
 
 	int pos_end;
 	int num_temporaries;	/* Number of temp vars used by program */
 	int wpos_idx;
 	int inputs[VERT_ATTRIB_MAX];
 	int outputs[VERT_RESULT_MAX];
-	int native;
-	int ref_count;
-	int use_ref_count;
 };
 
 struct r300_vertex_program_cont {
 	struct gl_vertex_program mesa_program;	/* Must be first */
-	struct r300_vertex_shader_fragment params;
 	struct r300_vertex_program *progs;
 };
 
-#define PFS_MAX_ALU_INST	64
-#define PFS_MAX_TEX_INST	64
-#define PFS_MAX_TEX_INDIRECT 4
-#define PFS_NUM_TEMP_REGS	32
-#define PFS_NUM_CONST_REGS	16
+#define R300_PFS_MAX_ALU_INST	64
+#define R300_PFS_MAX_TEX_INST	32
+#define R300_PFS_MAX_TEX_INDIRECT 4
+#define R300_PFS_NUM_TEMP_REGS	32
+#define R300_PFS_NUM_CONST_REGS	32
 
-struct r300_pfs_compile_state;
+#define R500_PFS_MAX_INST 512
+#define R500_PFS_NUM_TEMP_REGS 128
+#define R500_PFS_NUM_CONST_REGS 256
 
+struct r300_pfs_compile_state;
+struct r500_pfs_compile_state;
 
 /**
  * Stores state that influences the compilation of a fragment program.
@@ -514,7 +490,7 @@ struct r300_fragment_program_node {
 struct r300_fragment_program_code {
 	struct {
 		int length; /**< total # of texture instructions used */
-		GLuint inst[PFS_MAX_TEX_INST];
+		GLuint inst[R300_PFS_MAX_TEX_INST];
 	} tex;
 
 	struct {
@@ -524,7 +500,7 @@ struct r300_fragment_program_code {
 			GLuint inst1;
 			GLuint inst2;
 			GLuint inst3;
-		} inst[PFS_MAX_ALU_INST];
+		} inst[R300_PFS_MAX_ALU_INST];
 	} alu;
 
 	struct r300_fragment_program_node node[4];
@@ -535,53 +511,12 @@ struct r300_fragment_program_code {
 	 * Remember which program register a given hardware constant
 	 * belongs to.
 	 */
-	struct prog_src_register constant[PFS_NUM_CONST_REGS];
+	struct prog_src_register constant[R300_PFS_NUM_CONST_REGS];
 	int const_nr;
 
 	int max_temp_idx;
 };
 
-/**
- * Store everything about a fragment program that is needed
- * to render with that program.
- */
-struct r300_fragment_program {
-	struct gl_fragment_program mesa_program;
-
-	GLboolean translated;
-	GLboolean error;
-
-	struct r300_fragment_program_external_state state;
-	struct r300_fragment_program_code code;
-
-	GLboolean WritesDepth;
-	GLuint optimization;
-};
-
-struct r500_pfs_compile_state;
-
-struct r500_fragment_program_external_state {
-	struct {
-		/**
-		 * If the sampler is used as a shadow sampler,
-		 * this field is:
-		 *  0 - GL_LUMINANCE
-		 *  1 - GL_INTENSITY
-		 *  2 - GL_ALPHA
-		 * depending on the depth texture mode.
-		 */
-		GLuint depth_texture_mode : 2;
-
-		/**
-		 * If the sampler is used as a shadow sampler,
-		 * this field is (texture_compare_func - GL_NEVER).
-		 * [e.g. if compare function is GL_LEQUAL, this field is 3]
-		 *
-		 * Otherwise, this field is 0.
-		 */
-		GLuint texture_compare_func : 3;
-	} unit[16];
-};
 
 struct r500_fragment_program_code {
 	struct {
@@ -591,7 +526,7 @@ struct r500_fragment_program_code {
 		GLuint inst3;
 		GLuint inst4;
 		GLuint inst5;
-	} inst[512];
+	} inst[R500_PFS_MAX_INST];
 
 	int inst_offset;
 	int inst_end;
@@ -600,47 +535,41 @@ struct r500_fragment_program_code {
 	 * Remember which program register a given hardware constant
 	 * belongs to.
 	 */
-	struct prog_src_register constant[PFS_NUM_CONST_REGS];
+	struct prog_src_register constant[R500_PFS_NUM_CONST_REGS];
 	int const_nr;
 
 	int max_temp_idx;
 };
 
-struct r500_fragment_program {
-	struct gl_fragment_program mesa_program;
+/**
+* Store everything about a fragment program that is needed
+* to render with that program.
+*/
+struct r300_fragment_program {
+	struct gl_fragment_program Base;
 
-	GLcontext *ctx;
 	GLboolean translated;
 	GLboolean error;
 
-	struct r500_fragment_program_external_state state;
-	struct r500_fragment_program_code code;
+	struct r300_fragment_program_external_state state;
+	union rX00_fragment_program_code {
+		struct r300_fragment_program_code r300;
+		struct r500_fragment_program_code r500;
+	} code;
 
 	GLboolean writes_depth;
-
 	GLuint optimization;
 };
 
-#define R300_MAX_AOS_ARRAYS		16
-
-#define REG_COORDS	0
-#define REG_COLOR0	1
-#define REG_TEX0	2
-
-struct r300_state {
-	struct r300_texture_state texture;
-	int sw_tcl_inputs[VERT_ATTRIB_MAX];
-	struct r300_vertex_shader_state vertex_shader;
-
-
-	DECLARE_RENDERINPUTS(render_inputs_bitset);	/* actual render inputs that R300 was configured for.
-							   They are the same as tnl->render_inputs for fixed pipeline */
-
+struct r300_fragment_program_compiler {
+	r300ContextPtr r300;
+	struct r300_fragment_program *fp;
+	union rX00_fragment_program_code *code;
+	struct gl_program *program;
 };
 
-#define R300_FALLBACK_NONE 0
-#define R300_FALLBACK_TCL 1
-#define R300_FALLBACK_RAST 2
+#define R300_MAX_AOS_ARRAYS		16
+
 
 /* r300_swtcl.c
  */
@@ -654,18 +583,44 @@ struct r300_swtcl_info {
     * Offset of the 3UB specular color data within a hardware (swtcl) vertex.
     */
    GLuint specoffset;
+};
 
-   struct vertex_attribute{
-       GLuint attr;
-       GLubyte format;
-       GLubyte dst_loc;
-       GLuint swizzle;
-       GLubyte write_mask;
-   } vert_attrs[VERT_ATTRIB_MAX];
+struct r300_vtable {
+	void (* SetupRSUnit)(GLcontext *ctx);
+	void (* SetupFragmentShaderTextures)(GLcontext *ctx, int *tmu_mappings);
+	GLboolean (* BuildFragmentProgramHwCode)(struct r300_fragment_program_compiler *compiler);
+	void (* FragmentProgramDump)(union rX00_fragment_program_code *code);
+	void (* SetupPixelShader)(GLcontext *ctx);
+};
 
-   GLubyte vertex_attr_count;
+struct r300_vertex_buffer {
+	struct vertex_attribute {
+		/* generic */
+		GLubyte element;
+		GLvoid *data;
+		GLboolean free_needed;
+		GLuint stride;
+		GLuint dwords;
+		GLubyte size; /* number of components */
+
+		/* hw specific */
+		uint32_t data_type:4;
+		uint32_t dst_loc:5;
+		uint32_t _signed:1;
+		uint32_t normalize:1;
+		uint32_t swizzle:12;
+		uint32_t write_mask:4;
+	} attribs[VERT_ATTRIB_MAX];
+
+	GLubyte num_attribs;
 };
 
+struct r300_index_buffer {
+	GLvoid *ptr;
+	GLboolean is_32bit;
+	GLboolean free_needed;
+	GLuint count;
+};
 
 /**
  * \brief R300 context structure.
@@ -673,10 +628,10 @@ struct r300_swtcl_info {
 struct r300_context {
 	struct radeon_context radeon;	/* parent class, must be first */
 
+	struct r300_vtable vtbl;
+
 	struct r300_hw_state hw;
 
-	struct r300_state state;
-	struct gl_vertex_program *curr_vp;
 	struct r300_vertex_program *selected_vp;
 
 	/* Vertex buffers
@@ -684,10 +639,22 @@ struct r300_context {
 	GLvector4f dummy_attrib[_TNL_ATTRIB_MAX];
 	GLvector4f *temp_attrib[_TNL_ATTRIB_MAX];
 
-	GLboolean disable_lowimpact_fallback;
-
+	struct r300_options {
+		uint32_t conformance_mode:1;
+		uint32_t hw_tcl_enabled:1;
+		uint32_t s3tc_force_enabled:1;
+		uint32_t s3tc_force_disabled:1;
+		uint32_t stencil_two_side_disabled:1;
+	} options;
+	
 	struct r300_swtcl_info swtcl;
+	struct r300_vertex_buffer vbuf;
+	struct r300_index_buffer ind_buf;
 	GLboolean vap_flush_needed;
+
+	uint32_t fallback;
+
+	DECLARE_RENDERINPUTS(render_inputs_bitset);
 };
 
 #define R300_CONTEXT(ctx)		((r300ContextPtr)(ctx->DriverCtx))
@@ -703,10 +670,9 @@ extern int r300VertexProgUpdateParams(GLcontext * ctx,
 				      struct r300_vertex_program_cont *vp,
 				      float *dst);
 
-#define RADEON_D_CAPTURE 0
-#define RADEON_D_PLAYBACK 1
-#define RADEON_D_PLAYBACK_RAW 2
-#define RADEON_D_T 3
+extern void r300InitShaderFunctions(r300ContextPtr r300);
+
+extern void r300InitDraw(GLcontext *ctx);
 
 #define r300PackFloat32 radeonPackFloat32
 #define r300PackFloat24 radeonPackFloat24
diff --git a/src/mesa/drivers/dri/r300/r300_draw.c b/src/mesa/drivers/dri/r300/r300_draw.c
new file mode 100644
index 00000000000..cc5650fb7c8
--- /dev/null
+++ b/src/mesa/drivers/dri/r300/r300_draw.c
@@ -0,0 +1,475 @@
+/**************************************************************************
+ *
+ * Copyright 2009 Maciej Cencora
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHOR(S) AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+#include <stdlib.h>
+
+#include "main/glheader.h"
+#include "main/context.h"
+#include "main/state.h"
+#include "main/api_validate.h"
+#include "main/enums.h"
+
+#include "r300_reg.h"
+#include "r300_context.h"
+#include "r300_emit.h"
+#include "r300_render.h"
+#include "r300_state.h"
+#include "r300_tex.h"
+
+#include "tnl/tnl.h"
+#include "tnl/t_vp_build.h"
+#include "vbo/vbo_context.h"
+#include "swrast/swrast.h"
+#include "swrast_setup/swrast_setup.h"
+
+static void r300FixupIndexBuffer(GLcontext *ctx, const struct _mesa_index_buffer *mesa_ind_buf, struct gl_buffer_object **bo, GLuint *nr_bo)
+{
+	r300ContextPtr r300 = R300_CONTEXT(ctx);
+	struct r300_index_buffer *ind_buf = &r300->ind_buf;
+	GLvoid *src_ptr;
+
+	if (!mesa_ind_buf) {
+		ind_buf->ptr = NULL;
+		return;
+	}
+
+	ind_buf->count = mesa_ind_buf->count;
+	if (mesa_ind_buf->obj->Name && !mesa_ind_buf->obj->Pointer) {
+		bo[*nr_bo] = mesa_ind_buf->obj;
+		(*nr_bo)++;
+		ctx->Driver.MapBuffer(ctx, GL_ELEMENT_ARRAY_BUFFER, GL_READ_ONLY_ARB, mesa_ind_buf->obj);
+		assert(mesa_ind_buf->obj->Pointer != NULL);
+	}
+	src_ptr = ADD_POINTERS(mesa_ind_buf->obj->Pointer, mesa_ind_buf->ptr);
+
+	if (mesa_ind_buf->type == GL_UNSIGNED_BYTE) {
+		GLubyte *in = (GLubyte *)src_ptr;
+		GLuint *out = _mesa_malloc(sizeof(GLushort) * ((mesa_ind_buf->count + 1) & ~1));
+		int i;
+
+		ind_buf->ptr = out;
+
+		for (i = 0; i + 1 < mesa_ind_buf->count; i += 2) {
+			*out++ = in[i] | in[i + 1] << 16;
+		}
+
+		if (i < mesa_ind_buf->count) {
+			*out++ = in[i];
+		}
+
+		ind_buf->free_needed = GL_TRUE;
+		ind_buf->is_32bit = GL_FALSE;
+	} else if (mesa_ind_buf->type == GL_UNSIGNED_SHORT) {
+#if MESA_BIG_ENDIAN
+		GLushort *in = (GLushort *)src_ptr;
+		GLuint *out = _mesa_malloc(sizeof(GLushort) *
+					   ((mesa_ind_buf->count + 1) & ~1));
+		int i;
+
+		ind_buf->ptr = out;
+
+		for (i = 0; i + 1 < mesa_ind_buf->count; i += 2) {
+			*out++ = in[i] | in[i + 1] << 16;
+		}
+
+		if (i < mesa_ind_buf->count) {
+			*out++ = in[i];
+		}
+
+		ind_buf->free_needed = GL_TRUE;
+#else
+		ind_buf->ptr = src_ptr;
+		ind_buf->free_needed = GL_FALSE;
+#endif
+		ind_buf->is_32bit = GL_FALSE;
+	} else {
+		ind_buf->ptr = src_ptr;
+		ind_buf->free_needed = GL_FALSE;
+		ind_buf->is_32bit = GL_TRUE;
+	}
+}
+
+static int getTypeSize(GLenum type)
+{
+	switch (type) {
+		case GL_DOUBLE:
+			return sizeof(GLdouble);
+		case GL_FLOAT:
+			return sizeof(GLfloat);
+		case GL_INT:
+			return sizeof(GLint);
+		case GL_UNSIGNED_INT:
+			return sizeof(GLuint);
+		case GL_SHORT:
+			return sizeof(GLshort);
+		case GL_UNSIGNED_SHORT:
+			return sizeof(GLushort);
+		case GL_BYTE:
+			return sizeof(GLbyte);
+		case GL_UNSIGNED_BYTE:
+			return sizeof(GLubyte);
+		default:
+			assert(0);
+			return 0;
+	}
+}
+
+#define CONVERT( TYPE, MACRO ) do {		\
+	GLuint i, j, sz;				\
+	sz = input->Size;				\
+	if (input->Normalized) {			\
+		for (i = 0; i < count; i++) {		\
+			const TYPE *in = (TYPE *)src_ptr;		\
+			for (j = 0; j < sz; j++) {		\
+				*dst_ptr++ = MACRO(*in);		\
+				in++;				\
+			}					\
+			src_ptr += stride;			\
+		}						\
+	} else {					\
+		for (i = 0; i < count; i++) {		\
+			const TYPE *in = (TYPE *)src_ptr;		\
+			for (j = 0; j < sz; j++) {		\
+				*dst_ptr++ = (GLfloat)(*in);		\
+				in++;				\
+			}					\
+			src_ptr += stride;			\
+		}						\
+	}						\
+} while (0)
+
+static void r300TranslateAttrib(GLcontext *ctx, GLuint attr, int count, const struct gl_client_array *input, struct gl_buffer_object **bo, GLuint *nr_bo)
+{
+	r300ContextPtr r300 = R300_CONTEXT(ctx);
+	struct r300_vertex_buffer *vbuf = &r300->vbuf;
+	struct vertex_attribute r300_attr;
+	const void *src_ptr;
+	GLenum type;
+	GLuint stride;
+
+	if (input->BufferObj->Name) {
+		if (!input->BufferObj->Pointer) {
+			bo[*nr_bo] = input->BufferObj;
+			(*nr_bo)++;
+			ctx->Driver.MapBuffer(ctx, GL_ARRAY_BUFFER, GL_READ_ONLY_ARB, input->BufferObj);
+			assert(input->BufferObj->Pointer != NULL);
+		}
+
+		src_ptr = ADD_POINTERS(input->BufferObj->Pointer, input->Ptr);
+	} else
+		src_ptr = input->Ptr;
+
+	stride = (input->StrideB == 0) ? getTypeSize(input->Type) * input->Size : input->StrideB;
+
+	if (input->Type == GL_DOUBLE || input->Type == GL_UNSIGNED_INT || input->Type == GL_INT ||
+#if MESA_BIG_ENDIAN
+	    getTypeSize(input->Type) != 4 ||
+#endif
+	    stride < 4) {
+		if (RADEON_DEBUG & DEBUG_FALLBACKS) {
+			fprintf(stderr, "%s: Converting vertex attributes, attribute data format %x,", __FUNCTION__, input->Type);
+			fprintf(stderr, "stride %d, components %d\n", stride, input->Size);
+		}
+
+		GLfloat *dst_ptr, *tmp;
+		tmp = dst_ptr = _mesa_malloc(sizeof(GLfloat) * input->Size * count);
+
+		switch (input->Type) {
+			case GL_DOUBLE:
+				CONVERT(GLdouble, (GLfloat));
+				break;
+			case GL_UNSIGNED_INT:
+				CONVERT(GLuint, UINT_TO_FLOAT);
+				break;
+			case GL_INT:
+				CONVERT(GLint, INT_TO_FLOAT);
+				break;
+			case GL_UNSIGNED_SHORT:
+				CONVERT(GLushort, USHORT_TO_FLOAT);
+				break;
+			case GL_SHORT:
+				CONVERT(GLshort, SHORT_TO_FLOAT);
+				break;
+			case GL_UNSIGNED_BYTE:
+				assert(input->Format != GL_BGRA);
+				CONVERT(GLubyte, UBYTE_TO_FLOAT);
+				break;
+			case GL_BYTE:
+				CONVERT(GLbyte, BYTE_TO_FLOAT);
+				break;
+			default:
+				assert(0);
+				break;
+		}
+
+		type = GL_FLOAT;
+		r300_attr.free_needed = GL_TRUE;
+		r300_attr.data = tmp;
+		r300_attr.stride = sizeof(GLfloat) * input->Size;
+		r300_attr.dwords = input->Size;
+	} else {
+		type = input->Type;
+		r300_attr.free_needed = GL_FALSE;
+		r300_attr.data = (GLvoid *)src_ptr;
+		r300_attr.stride = stride;
+		r300_attr.dwords = (getTypeSize(type) * input->Size  + 3)/ 4;
+	}
+
+	r300_attr.size = input->Size;
+	r300_attr.element = attr;
+	r300_attr.dst_loc = vbuf->num_attribs;
+
+	switch (type) {
+		case GL_FLOAT:
+			switch (input->Size) {
+				case 1: r300_attr.data_type = R300_DATA_TYPE_FLOAT_1; break;
+				case 2: r300_attr.data_type = R300_DATA_TYPE_FLOAT_2; break;
+				case 3: r300_attr.data_type = R300_DATA_TYPE_FLOAT_3; break;
+				case 4: r300_attr.data_type = R300_DATA_TYPE_FLOAT_4; break;
+			}
+			r300_attr._signed = 0;
+			r300_attr.normalize = 0;
+			break;
+		case GL_SHORT:
+			r300_attr._signed = 1;
+			r300_attr.normalize = input->Normalized;
+			switch (input->Size) {
+				case 1:
+				case 2:
+					r300_attr.data_type = R300_DATA_TYPE_SHORT_2;
+					break;
+				case 3:
+				case 4:
+					r300_attr.data_type = R300_DATA_TYPE_SHORT_4;
+					break;
+			}
+			break;
+		case GL_BYTE:
+			r300_attr._signed = 1;
+			r300_attr.normalize = input->Normalized;
+			r300_attr.data_type = R300_DATA_TYPE_BYTE;
+			break;
+		case GL_UNSIGNED_SHORT:
+			r300_attr._signed = 0;
+			r300_attr.normalize = input->Normalized;
+			switch (input->Size) {
+				case 1:
+				case 2:
+					r300_attr.data_type = R300_DATA_TYPE_SHORT_2;
+					break;
+				case 3:
+				case 4:
+					r300_attr.data_type = R300_DATA_TYPE_SHORT_4;
+					break;
+			}
+			break;
+		case GL_UNSIGNED_BYTE:
+			r300_attr._signed = 0;
+			r300_attr.normalize = input->Normalized;
+			if (input->Format == GL_BGRA)
+				r300_attr.data_type = R300_DATA_TYPE_D3DCOLOR;
+			else
+				r300_attr.data_type = R300_DATA_TYPE_BYTE;
+			break;
+
+		default:
+		case GL_DOUBLE:
+		case GL_INT:
+		case GL_UNSIGNED_INT:
+			assert(0);
+			break;
+	}
+
+	switch (input->Size) {
+		case 4:
+			r300_attr.swizzle = SWIZZLE_XYZW;
+			break;
+		case 3:
+			r300_attr.swizzle = MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_ONE);
+			break;
+		case 2:
+			r300_attr.swizzle = MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y, SWIZZLE_ZERO, SWIZZLE_ONE);
+			break;
+		case 1:
+			r300_attr.swizzle = MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_ZERO, SWIZZLE_ZERO, SWIZZLE_ONE);
+			break;
+	}
+
+	r300_attr.write_mask = MASK_XYZW;
+
+	vbuf->attribs[vbuf->num_attribs] = r300_attr;
+	++vbuf->num_attribs;
+}
+
+static void r300SetVertexFormat(GLcontext *ctx, const struct gl_client_array *arrays[], int count, struct gl_buffer_object **bo, GLuint *nr_bo)
+{
+	r300ContextPtr r300 = R300_CONTEXT(ctx);
+	struct r300_vertex_buffer *vbuf = &r300->vbuf;
+
+	{
+		int i, tmp;
+
+		tmp = r300->selected_vp->key.InputsRead;
+		i = 0;
+		vbuf->num_attribs = 0;
+		while (tmp) {
+			/* find first enabled bit */
+			while (!(tmp & 1)) {
+				tmp >>= 1;
+				++i;
+			}
+
+			r300TranslateAttrib(ctx, i, count, arrays[i], bo, nr_bo);
+
+			tmp >>= 1;
+			++i;
+		}
+	}
+
+	r300SwitchFallback(ctx, R300_FALLBACK_AOS_LIMIT, vbuf->num_attribs > R300_MAX_AOS_ARRAYS);
+	if (r300->fallback)
+		return;
+
+	{
+		int i;
+
+		for (i = 0; i < vbuf->num_attribs; i++) {
+			rcommon_emit_vector(ctx, &r300->radeon.tcl.aos[i],
+						vbuf->attribs[i].data, vbuf->attribs[i].dwords,
+						vbuf->attribs[i].stride, count);
+		}
+
+		r300->radeon.tcl.aos_count = vbuf->num_attribs;
+	}
+}
+
+static void r300FreeData(GLcontext *ctx, struct gl_buffer_object **bo, GLuint nr_bo)
+{
+	{
+		struct r300_vertex_buffer *vbuf = &R300_CONTEXT(ctx)->vbuf;
+		int i;
+
+		for (i = 0; i < vbuf->num_attribs; i++) {
+			if (vbuf->attribs[i].free_needed)
+				_mesa_free(vbuf->attribs[i].data);
+		}
+	}
+
+	{
+		struct r300_index_buffer *ind_buf = &R300_CONTEXT(ctx)->ind_buf;
+		if (ind_buf->free_needed)
+			_mesa_free(ind_buf->ptr);
+	}
+
+	{
+		int i;
+
+		for (i = 0; i < nr_bo; ++i) {
+			ctx->Driver.UnmapBuffer(ctx, 0, bo[i]);
+		}
+	}
+}
+
+static GLboolean r300TryDrawPrims(GLcontext *ctx,
+					 const struct gl_client_array *arrays[],
+					 const struct _mesa_prim *prim,
+					 GLuint nr_prims,
+					 const struct _mesa_index_buffer *ib,
+					 GLuint min_index,
+					 GLuint max_index )
+{
+	struct r300_context *r300 = R300_CONTEXT(ctx);
+	struct gl_buffer_object *bo[VERT_ATTRIB_MAX+1];
+	GLuint i, nr_bo = 0;
+
+	if (ctx->NewState)
+		_mesa_update_state( ctx );
+
+	if (r300->options.hw_tcl_enabled)
+		_tnl_UpdateFixedFunctionProgram(ctx);
+
+	r300UpdateShaders(r300);
+
+	r300SwitchFallback(ctx, R300_FALLBACK_INVALID_BUFFERS, !r300ValidateBuffers(ctx));
+
+	r300FixupIndexBuffer(ctx, ib, bo, &nr_bo);
+
+	r300SetVertexFormat(ctx, arrays, max_index + 1, bo, &nr_bo);
+
+	if (r300->fallback)
+		return GL_FALSE;
+
+	r300SetupVAP(ctx, r300->selected_vp->key.InputsRead, r300->selected_vp->key.OutputsWritten);
+
+	r300UpdateShaderStates(r300);
+
+	r300EmitCacheFlush(r300);
+	radeonEmitState(&r300->radeon);
+
+	for (i = 0; i < nr_prims; ++i) {
+		r300RunRenderPrimitive(ctx, prim[i].start, prim[i].start + prim[i].count, prim[i].mode);
+	}
+
+	r300EmitCacheFlush(r300);
+
+	radeonReleaseArrays(ctx, ~0);
+
+	r300FreeData(ctx, bo, nr_bo);
+
+	return GL_TRUE;
+}
+
+/* TODO: rebase if number of indices in any of primitives is > 8192 for 32bit indices or 16384 for 16bit indices */
+
+static void r300DrawPrims(GLcontext *ctx,
+			 const struct gl_client_array *arrays[],
+			 const struct _mesa_prim *prim,
+			 GLuint nr_prims,
+			 const struct _mesa_index_buffer *ib,
+			 GLuint min_index,
+			 GLuint max_index)
+{
+	GLboolean retval;
+
+	if (min_index) {
+		vbo_rebase_prims( ctx, arrays, prim, nr_prims, ib, min_index, max_index, r300DrawPrims );
+		return;
+	}
+
+	/* Make an attempt at drawing */
+	retval = r300TryDrawPrims(ctx, arrays, prim, nr_prims, ib, min_index, max_index);
+
+	/* If failed run tnl pipeline - it should take care of fallbacks */
+	if (!retval)
+		_tnl_draw_prims(ctx, arrays, prim, nr_prims, ib, min_index, max_index);
+}
+
+void r300InitDraw(GLcontext *ctx)
+{
+	struct vbo_context *vbo = vbo_context(ctx);
+
+	vbo->draw_prims = r300DrawPrims;
+}
diff --git a/src/mesa/drivers/dri/r300/r300_emit.c b/src/mesa/drivers/dri/r300/r300_emit.c
index 4fd6ba9b91a..c3817721dc4 100644
--- a/src/mesa/drivers/dri/r300/r300_emit.c
+++ b/src/mesa/drivers/dri/r300/r300_emit.c
@@ -31,6 +31,7 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  * \file
  *
  * \author Keith Whitwell <[email protected]>
+ * \author Maciej Cencora <[email protected]>
  */
 
 #include "main/glheader.h"
@@ -49,74 +50,8 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "r300_state.h"
 #include "r300_emit.h"
 #include "r300_ioctl.h"
-
-
-#if SWIZZLE_X != R300_INPUT_ROUTE_SELECT_X || \
-    SWIZZLE_Y != R300_INPUT_ROUTE_SELECT_Y || \
-    SWIZZLE_Z != R300_INPUT_ROUTE_SELECT_Z || \
-    SWIZZLE_W != R300_INPUT_ROUTE_SELECT_W || \
-    SWIZZLE_ZERO != R300_INPUT_ROUTE_SELECT_ZERO || \
-    SWIZZLE_ONE != R300_INPUT_ROUTE_SELECT_ONE
-#error Cannot change these!
-#endif
-
-#define DEBUG_ALL DEBUG_VERTS
-
-#define DW_SIZE(x) ((inputs[tab[(x)]] << R300_DST_VEC_LOC_SHIFT) |	\
-		    (attribptr[tab[(x)]]->size - 1) << R300_DATA_TYPE_0_SHIFT)
-
-GLuint r300VAPInputRoute0(uint32_t * dst, GLvector4f ** attribptr,
-				 int *inputs, GLint * tab, GLuint nr)
-{
-	GLuint i, dw;
-
-	/* type, inputs, stop bit, size */
-	for (i = 0; i < nr; i += 2) {
-		/* make sure input is valid, would lockup the gpu */
-		assert(inputs[tab[i]] != -1);
-		dw = (R300_SIGNED | DW_SIZE(i));
-		if (i + 1 == nr) {
-			dw |= R300_LAST_VEC << R300_DATA_TYPE_0_SHIFT;
-		} else {
-			assert(inputs[tab[i + 1]] != -1);
-			dw |= (R300_SIGNED |
-			       DW_SIZE(i + 1)) << R300_DATA_TYPE_1_SHIFT;
-			if (i + 2 == nr) {
-				dw |= R300_LAST_VEC << R300_DATA_TYPE_1_SHIFT;
-			}
-		}
-		dst[i >> 1] = dw;
-	}
-
-	return (nr + 1) >> 1;
-}
-
-static GLuint r300VAPInputRoute1Swizzle(int swizzle[4])
-{
-	return (swizzle[0] << R300_SWIZZLE_SELECT_X_SHIFT) |
-	    (swizzle[1] << R300_SWIZZLE_SELECT_Y_SHIFT) |
-	    (swizzle[2] << R300_SWIZZLE_SELECT_Z_SHIFT) |
-	    (swizzle[3] << R300_SWIZZLE_SELECT_W_SHIFT);
-}
-
-GLuint r300VAPInputRoute1(uint32_t * dst, int swizzle[][4], GLuint nr)
-{
-	GLuint i, dw;
-
-	for (i = 0; i < nr; i += 2) {
-		dw = (r300VAPInputRoute1Swizzle(swizzle[i]) |
-		      ((R300_WRITE_ENA_X | R300_WRITE_ENA_Y |
-			R300_WRITE_ENA_Z | R300_WRITE_ENA_W) << R300_WRITE_ENA_SHIFT)) << R300_SWIZZLE0_SHIFT;
-		if (i + 1 < nr) {
-			dw |= (r300VAPInputRoute1Swizzle(swizzle[i + 1]) |
-			       ((R300_WRITE_ENA_X | R300_WRITE_ENA_Y |
-				 R300_WRITE_ENA_Z | R300_WRITE_ENA_W) << R300_WRITE_ENA_SHIFT)) << R300_SWIZZLE1_SHIFT;
-		}
-		dst[i >> 1] = dw;
-	}
-
-	return (nr + 1) >> 1;
-}
+#include "r300_render.h"
+#include "r300_swtcl.h"
 
 GLuint r300VAPInputCntl0(GLcontext * ctx, GLuint InputsRead)
 {
@@ -127,7 +62,6 @@ GLuint r300VAPInputCntl0(GLcontext * ctx, GLuint InputsRead)
 
 GLuint r300VAPInputCntl1(GLcontext * ctx, GLuint InputsRead)
 {
-	r300ContextPtr rmesa = R300_CONTEXT(ctx);
 	GLuint i, vic_1 = 0;
 
 	if (InputsRead & (1 << VERT_ATTRIB_POS))
@@ -139,213 +73,99 @@ GLuint r300VAPInputCntl1(GLcontext * ctx, GLuint InputsRead)
 	if (InputsRead & (1 << VERT_ATTRIB_COLOR0))
 		vic_1 |= R300_INPUT_CNTL_COLOR;
 
-	rmesa->state.texture.tc_count = 0;
 	for (i = 0; i < ctx->Const.MaxTextureUnits; i++)
 		if (InputsRead & (1 << (VERT_ATTRIB_TEX0 + i))) {
-			rmesa->state.texture.tc_count++;
 			vic_1 |= R300_INPUT_CNTL_TC0 << i;
 		}
 
 	return vic_1;
 }
 
-GLuint r300VAPOutputCntl0(GLcontext * ctx, GLuint OutputsWritten)
+GLuint r300VAPOutputCntl0(GLcontext * ctx, GLuint vp_writes, GLuint fp_reads)
 {
 	GLuint ret = 0;
 
-	if (OutputsWritten & (1 << VERT_RESULT_HPOS))
+	if (vp_writes & (1 << VERT_RESULT_HPOS))
 		ret |= R300_VAP_OUTPUT_VTX_FMT_0__POS_PRESENT;
 
-	if (OutputsWritten & (1 << VERT_RESULT_COL0))
+	if (vp_writes & (1 << VERT_RESULT_COL0) && fp_reads & FRAG_BIT_COL0)
 		ret |= R300_VAP_OUTPUT_VTX_FMT_0__COLOR_0_PRESENT;
 
-	if (OutputsWritten & (1 << VERT_RESULT_COL1))
+	if (vp_writes & (1 << VERT_RESULT_COL1) && fp_reads & FRAG_BIT_COL1)
 		ret |= R300_VAP_OUTPUT_VTX_FMT_0__COLOR_1_PRESENT;
 
-	if (OutputsWritten & (1 << VERT_RESULT_BFC0)
-	    || OutputsWritten & (1 << VERT_RESULT_BFC1))
-		ret |=
-		    R300_VAP_OUTPUT_VTX_FMT_0__COLOR_1_PRESENT |
-		    R300_VAP_OUTPUT_VTX_FMT_0__COLOR_2_PRESENT |
-		    R300_VAP_OUTPUT_VTX_FMT_0__COLOR_3_PRESENT;
+	/* Two sided lighting works only if all 4 colors are written */
+	if (vp_writes & (1 << VERT_RESULT_BFC0) || vp_writes & (1 << VERT_RESULT_BFC1))
+		ret |= R300_VAP_OUTPUT_VTX_FMT_0__COLOR_0_PRESENT | R300_VAP_OUTPUT_VTX_FMT_0__COLOR_1_PRESENT |
+			   R300_VAP_OUTPUT_VTX_FMT_0__COLOR_2_PRESENT | R300_VAP_OUTPUT_VTX_FMT_0__COLOR_3_PRESENT;
 
-	if (OutputsWritten & (1 << VERT_RESULT_PSIZ))
+	if (vp_writes & (1 << VERT_RESULT_PSIZ))
 		ret |= R300_VAP_OUTPUT_VTX_FMT_0__PT_SIZE_PRESENT;
 
 	return ret;
 }
 
-GLuint r300VAPOutputCntl1(GLcontext * ctx, GLuint OutputsWritten)
+GLuint r300VAPOutputCntl1(GLcontext * ctx, GLuint vp_writes, GLuint fp_reads)
 {
 	GLuint i, ret = 0, first_free_texcoord = 0;
 
 	for (i = 0; i < ctx->Const.MaxTextureUnits; i++) {
-		if (OutputsWritten & (1 << (VERT_RESULT_TEX0 + i))) {
-			ret |= (4 << (3 * i));
+		if (vp_writes & (1 << (VERT_RESULT_TEX0 + i)) && fp_reads & FRAG_BIT_TEX(i)) {
+			ret |= (4 << (3 * first_free_texcoord));
 			++first_free_texcoord;
 		}
 	}
 
-	if (OutputsWritten & (1 << VERT_RESULT_FOGC)) {
-		if (first_free_texcoord > 8) {
-			fprintf(stderr, "\tout of free texcoords to write fog coord\n");
-			_mesa_exit(-1);
-		}
-		ret |= 1 << (3 * first_free_texcoord);
+	if (fp_reads & FRAG_BIT_WPOS) {
+		ret |= (4 << (3 * first_free_texcoord));
+		++first_free_texcoord;
+	}
+
+	if (vp_writes & (1 << VERT_RESULT_FOGC) && fp_reads & FRAG_BIT_FOGC) {
+		ret |= 4 << (3 * first_free_texcoord);
+	}
+
+	if (first_free_texcoord > 8) {
+		fprintf(stderr, "\tout of free texcoords\n");
+		_mesa_exit(-1);
 	}
 
 	return ret;
 }
 
-/* Emit vertex data to GART memory
- * Route inputs to the vertex processor
- * This function should never return R300_FALLBACK_TCL when using software tcl.
- */
-int r300EmitArrays(GLcontext * ctx)
+GLboolean r300EmitArrays(GLcontext * ctx)
 {
-	r300ContextPtr rmesa = R300_CONTEXT(ctx);
-	TNLcontext *tnl = TNL_CONTEXT(ctx);
-	struct vertex_buffer *vb = &tnl->vb;
-	GLuint nr;
-	GLuint count = vb->Count;
-	GLuint i;
-	GLuint InputsRead = 0, OutputsWritten = 0;
-	int *inputs = NULL;
-	int vir_inputs[VERT_ATTRIB_MAX];
-	GLint tab[VERT_ATTRIB_MAX];
-	int swizzle[VERT_ATTRIB_MAX][4];
-	struct r300_vertex_program *prog =
-	    (struct r300_vertex_program *)CURRENT_VERTEX_SHADER(ctx);
-
-	if (hw_tcl_on) {
-		inputs = prog->inputs;
-		InputsRead = prog->key.InputsRead;
-		OutputsWritten = prog->key.OutputsWritten;
-	} else {
-		inputs = rmesa->state.sw_tcl_inputs;
-
-		DECLARE_RENDERINPUTS(render_inputs_bitset);
-		RENDERINPUTS_COPY(render_inputs_bitset, tnl->render_inputs_bitset);
-
-		vb->AttribPtr[VERT_ATTRIB_POS] = vb->ClipPtr;
-
-		assert(RENDERINPUTS_TEST(render_inputs_bitset, _TNL_ATTRIB_POS));
-		assert(RENDERINPUTS_TEST(render_inputs_bitset, _TNL_ATTRIB_NORMAL) == 0);
-
-		if (RENDERINPUTS_TEST(render_inputs_bitset, _TNL_ATTRIB_POS)) {
-			InputsRead |= 1 << VERT_ATTRIB_POS;
-			OutputsWritten |= 1 << VERT_RESULT_HPOS;
-		}
+	r300ContextPtr r300 = R300_CONTEXT(ctx);
+	struct r300_vertex_buffer *vbuf = &r300->vbuf;
+	GLuint InputsRead, OutputsWritten;
 
-		if (RENDERINPUTS_TEST(render_inputs_bitset, _TNL_ATTRIB_COLOR0)) {
-			InputsRead |= 1 << VERT_ATTRIB_COLOR0;
-			OutputsWritten |= 1 << VERT_RESULT_COL0;
-		}
+	r300ChooseSwtclVertexFormat(ctx, &InputsRead, &OutputsWritten);
 
-		if (RENDERINPUTS_TEST(render_inputs_bitset, _TNL_ATTRIB_COLOR1)) {
-			InputsRead |= 1 << VERT_ATTRIB_COLOR1;
-			OutputsWritten |= 1 << VERT_RESULT_COL1;
-		}
+	r300SwitchFallback(ctx, R300_FALLBACK_AOS_LIMIT, vbuf->num_attribs > R300_MAX_AOS_ARRAYS);
+	if (r300->fallback & R300_RASTER_FALLBACK_MASK)
+		return GL_FALSE;
 
-		for (i = 0; i < ctx->Const.MaxTextureUnits; i++) {
-			if (RENDERINPUTS_TEST(render_inputs_bitset, _TNL_ATTRIB_TEX(i))) {
-				InputsRead |= 1 << (VERT_ATTRIB_TEX0 + i);
-				OutputsWritten |= 1 << (VERT_RESULT_TEX0 + i);
-			}
-		}
+	{
+		struct vertex_buffer *mesa_vb = &TNL_CONTEXT(ctx)->vb;
+		GLuint attr, i;
 
-		for (i = 0, nr = 0; i < VERT_ATTRIB_MAX; i++) {
-			if (InputsRead & (1 << i)) {
-				inputs[i] = nr++;
-			} else {
-				inputs[i] = -1;
-			}
+		for (i = 0; i < vbuf->num_attribs; i++) {
+			attr = vbuf->attribs[i].element;
+			rcommon_emit_vector(ctx, &r300->radeon.tcl.aos[i], mesa_vb->AttribPtr[attr]->data,
+					mesa_vb->AttribPtr[attr]->size, mesa_vb->AttribPtr[attr]->stride, mesa_vb->Count);
 		}
 
-		/* Fixed, apply to vir0 only */
-		memcpy(vir_inputs, inputs, VERT_ATTRIB_MAX * sizeof(int));
-		inputs = vir_inputs;
-		if (InputsRead & VERT_ATTRIB_POS)
-			inputs[VERT_ATTRIB_POS] = 0;
-		if (InputsRead & (1 << VERT_ATTRIB_COLOR0))
-			inputs[VERT_ATTRIB_COLOR0] = 2;
-		if (InputsRead & (1 << VERT_ATTRIB_COLOR1))
-			inputs[VERT_ATTRIB_COLOR1] = 3;
-		for (i = VERT_ATTRIB_TEX0; i <= VERT_ATTRIB_TEX7; i++)
-			if (InputsRead & (1 << i))
-				inputs[i] = 6 + (i - VERT_ATTRIB_TEX0);
-
-		RENDERINPUTS_COPY(rmesa->state.render_inputs_bitset, render_inputs_bitset);
-	}
+		r300->radeon.tcl.aos_count = vbuf->num_attribs;
 
-	assert(InputsRead);
-	assert(OutputsWritten);
-
-	for (i = 0, nr = 0; i < VERT_ATTRIB_MAX; i++) {
-		if (InputsRead & (1 << i)) {
-			tab[nr++] = i;
-		}
+		/* Fill index buffer info */
+		r300->ind_buf.ptr = mesa_vb->Elts;
+		r300->ind_buf.is_32bit = GL_TRUE;
+		r300->ind_buf.free_needed = GL_FALSE;
 	}
 
-	if (nr > R300_MAX_AOS_ARRAYS) {
-		return R300_FALLBACK_TCL;
-	}
-
-	for (i = 0; i < nr; i++) {
-		int ci;
-
-		swizzle[i][0] = SWIZZLE_ZERO;
-		swizzle[i][1] = SWIZZLE_ZERO;
-		swizzle[i][2] = SWIZZLE_ZERO;
-		swizzle[i][3] = SWIZZLE_ONE;
+	r300SetupVAP(ctx, InputsRead, OutputsWritten);
 
-		for (ci = 0; ci < vb->AttribPtr[tab[i]]->size; ci++) {
-			swizzle[i][ci] = ci;
-		}
-		rcommon_emit_vector(ctx, &rmesa->radeon.tcl.aos[i],
-				    vb->AttribPtr[tab[i]]->data,
-				    vb->AttribPtr[tab[i]]->size,
-				    vb->AttribPtr[tab[i]]->stride, count);
-	}
-
-	/* Setup INPUT_ROUTE. */
-	if (rmesa->radeon.radeonScreen->kernel_mm) {
-		R300_STATECHANGE(rmesa, vir[0]);
-		rmesa->hw.vir[0].cmd[0] &= 0xC000FFFF;
-		rmesa->hw.vir[1].cmd[0] &= 0xC000FFFF;
-		rmesa->hw.vir[0].cmd[0] |=
-			(r300VAPInputRoute0(&rmesa->hw.vir[0].cmd[R300_VIR_CNTL_0],
-					    vb->AttribPtr, inputs, tab, nr) & 0x3FFF) << 16;
-		R300_STATECHANGE(rmesa, vir[1]);
-		rmesa->hw.vir[1].cmd[0] |=
-			(r300VAPInputRoute1(&rmesa->hw.vir[1].cmd[R300_VIR_CNTL_0], swizzle,
-					    nr) & 0x3FFF) << 16;
-	} else {
-		R300_STATECHANGE(rmesa, vir[0]);
-		((drm_r300_cmd_header_t *) rmesa->hw.vir[0].cmd)->packet0.count =
-			r300VAPInputRoute0(&rmesa->hw.vir[0].cmd[R300_VIR_CNTL_0],
-					   vb->AttribPtr, inputs, tab, nr);
-		R300_STATECHANGE(rmesa, vir[1]);
-		((drm_r300_cmd_header_t *) rmesa->hw.vir[1].cmd)->packet0.count =
-			r300VAPInputRoute1(&rmesa->hw.vir[1].cmd[R300_VIR_CNTL_0], swizzle,
-					   nr);
-	}
-	
-	/* Setup INPUT_CNTL. */
-	R300_STATECHANGE(rmesa, vic);
-	rmesa->hw.vic.cmd[R300_VIC_CNTL_0] = r300VAPInputCntl0(ctx, InputsRead);
-	rmesa->hw.vic.cmd[R300_VIC_CNTL_1] = r300VAPInputCntl1(ctx, InputsRead);
-
-	/* Setup OUTPUT_VTX_FMT. */
-	R300_STATECHANGE(rmesa, vof);
-	rmesa->hw.vof.cmd[R300_VOF_CNTL_0] =
-	    r300VAPOutputCntl0(ctx, OutputsWritten);
-	rmesa->hw.vof.cmd[R300_VOF_CNTL_1] =
-	    r300VAPOutputCntl1(ctx, OutputsWritten);
-
-	rmesa->radeon.tcl.aos_count = nr;
-
-	return R300_FALLBACK_NONE;
+	return GL_TRUE;
 }
 
 void r300EmitCacheFlush(r300ContextPtr rmesa)
diff --git a/src/mesa/drivers/dri/r300/r300_emit.h b/src/mesa/drivers/dri/r300/r300_emit.h
index 80c22d5e9ab..2fb8b82d3a1 100644
--- a/src/mesa/drivers/dri/r300/r300_emit.h
+++ b/src/mesa/drivers/dri/r300/r300_emit.h
@@ -216,19 +216,16 @@ void static INLINE cp_wait(radeonContextPtr radeon, unsigned char flags)
 	}
 }
 
-extern int r300EmitArrays(GLcontext * ctx);
+extern GLboolean r300EmitArrays(GLcontext * ctx);
 
 extern int r300PrimitiveType(r300ContextPtr rmesa, int prim);
 extern int r300NumVerts(r300ContextPtr rmesa, int num_verts, int prim);
 
 extern void r300EmitCacheFlush(r300ContextPtr rmesa);
 
-extern GLuint r300VAPInputRoute0(uint32_t * dst, GLvector4f ** attribptr,
-				 int *inputs, GLint * tab, GLuint nr);
-extern GLuint r300VAPInputRoute1(uint32_t * dst, int swizzle[][4], GLuint nr);
 extern GLuint r300VAPInputCntl0(GLcontext * ctx, GLuint InputsRead);
 extern GLuint r300VAPInputCntl1(GLcontext * ctx, GLuint InputsRead);
-extern GLuint r300VAPOutputCntl0(GLcontext * ctx, GLuint OutputsWritten);
-extern GLuint r300VAPOutputCntl1(GLcontext * ctx, GLuint OutputsWritten);
+extern GLuint r300VAPOutputCntl0(GLcontext * ctx, GLuint vp_writes, GLuint fp_reads);
+extern GLuint r300VAPOutputCntl1(GLcontext * ctx, GLuint vp_writes, GLuint fp_reads);
 
 #endif
diff --git a/src/mesa/drivers/dri/r300/r300_fragprog.c b/src/mesa/drivers/dri/r300/r300_fragprog.c
index 32182bb6674..55c1cfe6317 100644
--- a/src/mesa/drivers/dri/r300/r300_fragprog.c
+++ b/src/mesa/drivers/dri/r300/r300_fragprog.c
@@ -25,32 +25,12 @@
  *
  */
 
-/**
- * \file
- *
- * Fragment program compiler. Perform transformations on the intermediate
- * representation until the program is in a form where we can translate
- * it more or less directly into machine-readable form.
- *
- * \author Ben Skeggs <[email protected]>
- * \author Jerome Glisse <[email protected]>
- */
+#include "r300_fragprog.h"
 
-#include "main/glheader.h"
-#include "main/macros.h"
-#include "main/enums.h"
-#include "shader/prog_instruction.h"
 #include "shader/prog_parameter.h"
-#include "shader/prog_print.h"
 
 #include "r300_context.h"
-#include "r300_fragprog.h"
 #include "r300_fragprog_swizzle.h"
-#include "r300_state.h"
-
-#include "radeon_nqssadce.h"
-#include "radeon_program_alu.h"
-
 
 static void reset_srcreg(struct prog_src_register* reg)
 {
@@ -81,7 +61,7 @@ static struct prog_src_register shadow_ambient(struct gl_program *program, int t
  * \todo If/when r5xx uses the radeon_program architecture, this can probably
  * be reused.
  */
-static GLboolean transform_TEX(
+GLboolean r300_transform_TEX(
 	struct radeon_transform_context *t,
 	struct prog_instruction* orig_inst, void* data)
 {
@@ -160,6 +140,8 @@ static GLboolean transform_TEX(
 			inst.DstReg.Index = tempreg;
 			inst.DstReg.WriteMask = WRITEMASK_XYZW;
 			destredirect = GL_TRUE;
+		} else if (inst.SaturateMode) {
+			destredirect = GL_TRUE;
 		}
 	}
 
@@ -175,7 +157,7 @@ static GLboolean transform_TEX(
 		inst.SrcReg[0].File = PROGRAM_TEMPORARY;
 		inst.SrcReg[0].Index = tmpreg;
 	}
-	
+
 	tgt = radeonAppendInstructions(t->Program, 1);
 	_mesa_copy_instructions(tgt, &inst, 1);
 
@@ -214,9 +196,9 @@ static GLboolean transform_TEX(
 		 *   r  < tex  <=>      -tex+r < 0
 		 *   r >= tex  <=> not (-tex+r < 0 */
 		if (comparefunc == GL_LESS || comparefunc == GL_GEQUAL)
-			tgt[1].SrcReg[2].NegateBase = tgt[0].SrcReg[2].NegateBase ^ NEGATE_XYZW;
+			tgt[1].SrcReg[2].Negate = tgt[0].SrcReg[2].Negate ^ NEGATE_XYZW;
 		else
-			tgt[1].SrcReg[0].NegateBase = tgt[0].SrcReg[0].NegateBase ^ NEGATE_XYZW;
+			tgt[1].SrcReg[0].Negate = tgt[0].SrcReg[0].Negate ^ NEGATE_XYZW;
 
 		tgt[2].Opcode = OPCODE_CMP;
 		tgt[2].DstReg = orig_inst->DstReg;
@@ -239,6 +221,7 @@ static GLboolean transform_TEX(
 
 		tgt->Opcode = OPCODE_MOV;
 		tgt->DstReg = orig_inst->DstReg;
+		tgt->SaturateMode = inst.SaturateMode;
 		tgt->SrcReg[0].File = PROGRAM_TEMPORARY;
 		tgt->SrcReg[0].Index = inst.DstReg.Index;
 	}
@@ -246,241 +229,10 @@ static GLboolean transform_TEX(
 	return GL_TRUE;
 }
 
-
-static void update_params(r300ContextPtr r300, struct r300_fragment_program *fp)
-{
-	struct gl_fragment_program *mp = &fp->mesa_program;
-
-	/* Ask Mesa nicely to fill in ParameterValues for us */
-	if (mp->Base.Parameters)
-		_mesa_load_state_parameters(r300->radeon.glCtx, mp->Base.Parameters);
-}
-
-
-/**
- * Transform the program to support fragment.position.
- *
- * Introduce a small fragment at the start of the program that will be
- * the only code that directly reads the FRAG_ATTRIB_WPOS input.
- * All other code pieces that reference that input will be rewritten
- * to read from a newly allocated temporary.
- *
- * \todo if/when r5xx supports the radeon_program architecture, this is a
- * likely candidate for code sharing.
- */
-static void insert_WPOS_trailer(struct r300_fragment_program_compiler *compiler)
-{
-	GLuint InputsRead = compiler->fp->mesa_program.Base.InputsRead;
-
-	if (!(InputsRead & FRAG_BIT_WPOS))
-		return;
-
-	static gl_state_index tokens[STATE_LENGTH] = {
-		STATE_INTERNAL, STATE_R300_WINDOW_DIMENSION, 0, 0, 0
-	};
-	struct prog_instruction *fpi;
-	GLuint window_index;
-	int i = 0;
-	GLuint tempregi = _mesa_find_free_register(compiler->program, PROGRAM_TEMPORARY);
-
-	_mesa_insert_instructions(compiler->program, 0, 3);
-	fpi = compiler->program->Instructions;
-
-	/* perspective divide */
-	fpi[i].Opcode = OPCODE_RCP;
-
-	fpi[i].DstReg.File = PROGRAM_TEMPORARY;
-	fpi[i].DstReg.Index = tempregi;
-	fpi[i].DstReg.WriteMask = WRITEMASK_W;
-	fpi[i].DstReg.CondMask = COND_TR;
-
-	fpi[i].SrcReg[0].File = PROGRAM_INPUT;
-	fpi[i].SrcReg[0].Index = FRAG_ATTRIB_WPOS;
-	fpi[i].SrcReg[0].Swizzle = SWIZZLE_WWWW;
-	i++;
-
-	fpi[i].Opcode = OPCODE_MUL;
-
-	fpi[i].DstReg.File = PROGRAM_TEMPORARY;
-	fpi[i].DstReg.Index = tempregi;
-	fpi[i].DstReg.WriteMask = WRITEMASK_XYZ;
-	fpi[i].DstReg.CondMask = COND_TR;
-
-	fpi[i].SrcReg[0].File = PROGRAM_INPUT;
-	fpi[i].SrcReg[0].Index = FRAG_ATTRIB_WPOS;
-	fpi[i].SrcReg[0].Swizzle = SWIZZLE_XYZW;
-
-	fpi[i].SrcReg[1].File = PROGRAM_TEMPORARY;
-	fpi[i].SrcReg[1].Index = tempregi;
-	fpi[i].SrcReg[1].Swizzle = SWIZZLE_WWWW;
-	i++;
-
-	/* viewport transformation */
-	window_index = _mesa_add_state_reference(compiler->program->Parameters, tokens);
-
-	fpi[i].Opcode = OPCODE_MAD;
-
-	fpi[i].DstReg.File = PROGRAM_TEMPORARY;
-	fpi[i].DstReg.Index = tempregi;
-	fpi[i].DstReg.WriteMask = WRITEMASK_XYZ;
-	fpi[i].DstReg.CondMask = COND_TR;
-
-	fpi[i].SrcReg[0].File = PROGRAM_TEMPORARY;
-	fpi[i].SrcReg[0].Index = tempregi;
-	fpi[i].SrcReg[0].Swizzle =
-	    MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_ZERO);
-
-	fpi[i].SrcReg[1].File = PROGRAM_STATE_VAR;
-	fpi[i].SrcReg[1].Index = window_index;
-	fpi[i].SrcReg[1].Swizzle =
-	    MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_ZERO);
-
-	fpi[i].SrcReg[2].File = PROGRAM_STATE_VAR;
-	fpi[i].SrcReg[2].Index = window_index;
-	fpi[i].SrcReg[2].Swizzle =
-	    MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_ZERO);
-	i++;
-
-	for (; i < compiler->program->NumInstructions; ++i) {
-		int reg;
-		for (reg = 0; reg < 3; reg++) {
-			if (fpi[i].SrcReg[reg].File == PROGRAM_INPUT &&
-			    fpi[i].SrcReg[reg].Index == FRAG_ATTRIB_WPOS) {
-				fpi[i].SrcReg[reg].File = PROGRAM_TEMPORARY;
-				fpi[i].SrcReg[reg].Index = tempregi;
-			}
-		}
-	}
-}
-
-
-static void nqssadce_init(struct nqssadce_state* s)
-{
-	s->Outputs[FRAG_RESULT_COLOR].Sourced = WRITEMASK_XYZW;
-	s->Outputs[FRAG_RESULT_DEPTH].Sourced = WRITEMASK_W;
-}
-
-
-static GLuint build_dtm(GLuint depthmode)
-{
-	switch(depthmode) {
-	default:
-	case GL_LUMINANCE: return 0;
-	case GL_INTENSITY: return 1;
-	case GL_ALPHA: return 2;
-	}
-}
-
-static GLuint build_func(GLuint comparefunc)
-{
-	return comparefunc - GL_NEVER;
-}
-
-
-/**
- * Collect all external state that is relevant for compiling the given
- * fragment program.
- */
-static void build_state(
-	r300ContextPtr r300,
-	struct r300_fragment_program *fp,
-	struct r300_fragment_program_external_state *state)
-{
-	int unit;
-
-	_mesa_bzero(state, sizeof(*state));
-
-	for(unit = 0; unit < 16; ++unit) {
-		if (fp->mesa_program.Base.ShadowSamplers & (1 << unit)) {
-			struct gl_texture_object* tex = r300->radeon.glCtx->Texture.Unit[unit]._Current;
-
-			state->unit[unit].depth_texture_mode = build_dtm(tex->DepthMode);
-			state->unit[unit].texture_compare_func = build_func(tex->CompareFunc);
-		}
-	}
-}
-
-
-void r300TranslateFragmentShader(r300ContextPtr r300,
-				 struct r300_fragment_program *fp)
-{
-	struct r300_fragment_program_external_state state;
-
-	build_state(r300, fp, &state);
-	if (_mesa_memcmp(&fp->state, &state, sizeof(state))) {
-		/* TODO: cache compiled programs */
-		fp->translated = GL_FALSE;
-		_mesa_memcpy(&fp->state, &state, sizeof(state));
-	}
-
-	if (!fp->translated) {
-		struct r300_fragment_program_compiler compiler;
-
-		compiler.r300 = r300;
-		compiler.fp = fp;
-		compiler.code = &fp->code;
-		compiler.program = _mesa_clone_program(r300->radeon.glCtx, &fp->mesa_program.Base);
-
-		if (RADEON_DEBUG & DEBUG_PIXEL) {
-			_mesa_printf("Fragment Program: Initial program:\n");
-			_mesa_print_program(compiler.program);
-		}
-
-		insert_WPOS_trailer(&compiler);
-
-		struct radeon_program_transformation transformations[] = {
-			{ &transform_TEX, &compiler },
-			{ &radeonTransformALU, 0 },
-			{ &radeonTransformTrigSimple, 0 }
-		};
-		radeonLocalTransform(
-			r300->radeon.glCtx,
-			compiler.program,
-			3, transformations);
-
-		if (RADEON_DEBUG & DEBUG_PIXEL) {
-			_mesa_printf("Fragment Program: After native rewrite:\n");
-			_mesa_print_program(compiler.program);
-		}
-
-		struct radeon_nqssadce_descr nqssadce = {
-			.Init = &nqssadce_init,
-			.IsNativeSwizzle = &r300FPIsNativeSwizzle,
-			.BuildSwizzle = &r300FPBuildSwizzle,
-			.RewriteDepthOut = GL_TRUE
-		};
-		radeonNqssaDce(r300->radeon.glCtx, compiler.program, &nqssadce);
-
-		if (RADEON_DEBUG & DEBUG_PIXEL) {
-			_mesa_printf("Compiler: after NqSSA-DCE:\n");
-			_mesa_print_program(compiler.program);
-		}
-
-		if (!r300FragmentProgramEmit(&compiler))
-			fp->error = GL_TRUE;
-
-		/* Subtle: Rescue any parameters that have been added during transformations */
-		_mesa_free_parameter_list(fp->mesa_program.Base.Parameters);
-		fp->mesa_program.Base.Parameters = compiler.program->Parameters;
-		compiler.program->Parameters = 0;
-
-		_mesa_reference_program(r300->radeon.glCtx, &compiler.program, NULL);
-
-		if (!fp->error)
-			fp->translated = GL_TRUE;
-		if (fp->error || (RADEON_DEBUG & DEBUG_PIXEL))
-			r300FragmentProgramDump(fp, &fp->code);
-		r300UpdateStateParameters(r300->radeon.glCtx, _NEW_PROGRAM);
-	}
-
-	update_params(r300, fp);
-}
-
 /* just some random things... */
-void r300FragmentProgramDump(
-	struct r300_fragment_program *fp,
-	struct r300_fragment_program_code *code)
+void r300FragmentProgramDump(union rX00_fragment_program_code *c)
 {
+	struct r300_fragment_program_code *code = &c->r300;
 	int n, i, j;
 	static int pc = 0;
 
diff --git a/src/mesa/drivers/dri/r300/r300_fragprog.h b/src/mesa/drivers/dri/r300/r300_fragprog.h
index 94fb554fb37..5ce6f33cee7 100644
--- a/src/mesa/drivers/dri/r300/r300_fragprog.h
+++ b/src/mesa/drivers/dri/r300/r300_fragprog.h
@@ -33,9 +33,6 @@
 #ifndef __R300_FRAGPROG_H_
 #define __R300_FRAGPROG_H_
 
-#include "main/glheader.h"
-#include "main/macros.h"
-#include "main/enums.h"
 #include "shader/program.h"
 #include "shader/prog_instruction.h"
 
@@ -105,28 +102,10 @@
 
 #endif
 
-struct r300_fragment_program;
+extern GLboolean r300BuildFragmentProgramHwCode(struct r300_fragment_program_compiler *compiler);
 
-extern void r300TranslateFragmentShader(r300ContextPtr r300,
-					struct r300_fragment_program *fp);
+extern void r300FragmentProgramDump(union rX00_fragment_program_code *c);
 
-
-/**
- * Used internally by the r300 fragment program code to store compile-time
- * only data.
- */
-struct r300_fragment_program_compiler {
-	r300ContextPtr r300;
-	struct r300_fragment_program *fp;
-	struct r300_fragment_program_code *code;
-	struct gl_program *program;
-};
-
-extern GLboolean r300FragmentProgramEmit(struct r300_fragment_program_compiler *compiler);
-
-
-extern void r300FragmentProgramDump(
-	struct r300_fragment_program *fp,
-	struct r300_fragment_program_code *code);
+extern GLboolean r300_transform_TEX(struct radeon_transform_context *t, struct prog_instruction* orig_inst, void* data);
 
 #endif
diff --git a/src/mesa/drivers/dri/r300/r300_fragprog_common.c b/src/mesa/drivers/dri/r300/r300_fragprog_common.c
new file mode 100644
index 00000000000..abc8757ba1f
--- /dev/null
+++ b/src/mesa/drivers/dri/r300/r300_fragprog_common.c
@@ -0,0 +1,291 @@
+/*
+ * Copyright (C) 2009 Maciej Cencora <[email protected]>
+ *
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial
+ * portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+
+/**
+ * \file
+ *
+ * Fragment program compiler. Perform transformations on the intermediate
+ * representation until the program is in a form where we can translate
+ * it more or less directly into machine-readable form.
+ *
+ * \author Ben Skeggs <[email protected]>
+ * \author Jerome Glisse <[email protected]>
+ */
+
+#include "r300_fragprog_common.h"
+
+#include "shader/program.h"
+#include "shader/prog_parameter.h"
+#include "shader/prog_print.h"
+
+#include "r300_state.h"
+#include "r300_fragprog.h"
+#include "r300_fragprog_swizzle.h"
+#include "r500_fragprog.h"
+
+#include "radeon_program.h"
+#include "radeon_program_alu.h"
+
+static void update_params(GLcontext *ctx, struct gl_fragment_program *fp)
+{
+	/* Ask Mesa nicely to fill in ParameterValues for us */
+	if (fp->Base.Parameters)
+		_mesa_load_state_parameters(ctx, fp->Base.Parameters);
+}
+
+static void nqssadce_init(struct nqssadce_state* s)
+{
+	s->Outputs[FRAG_RESULT_COLOR].Sourced = WRITEMASK_XYZW;
+	s->Outputs[FRAG_RESULT_DEPTH].Sourced = WRITEMASK_W;
+}
+
+/**
+ * Transform the program to support fragment.position.
+ *
+ * Introduce a small fragment at the start of the program that will be
+ * the only code that directly reads the FRAG_ATTRIB_WPOS input.
+ * All other code pieces that reference that input will be rewritten
+ * to read from a newly allocated temporary.
+ *
+ */
+static void insert_WPOS_trailer(struct r300_fragment_program_compiler *compiler)
+{
+	GLuint InputsRead = compiler->fp->Base.Base.InputsRead;
+
+	if (!(InputsRead & FRAG_BIT_WPOS))
+		return;
+
+	static gl_state_index tokens[STATE_LENGTH] = {
+		STATE_INTERNAL, STATE_R300_WINDOW_DIMENSION, 0, 0, 0
+	};
+	struct prog_instruction *fpi;
+	GLuint window_index;
+	int i = 0;
+	GLuint tempregi = _mesa_find_free_register(compiler->program, PROGRAM_TEMPORARY);
+
+	_mesa_insert_instructions(compiler->program, 0, 3);
+	fpi = compiler->program->Instructions;
+
+	/* perspective divide */
+	fpi[i].Opcode = OPCODE_RCP;
+
+	fpi[i].DstReg.File = PROGRAM_TEMPORARY;
+	fpi[i].DstReg.Index = tempregi;
+	fpi[i].DstReg.WriteMask = WRITEMASK_W;
+	fpi[i].DstReg.CondMask = COND_TR;
+
+	fpi[i].SrcReg[0].File = PROGRAM_INPUT;
+	fpi[i].SrcReg[0].Index = FRAG_ATTRIB_WPOS;
+	fpi[i].SrcReg[0].Swizzle = SWIZZLE_WWWW;
+	i++;
+
+	fpi[i].Opcode = OPCODE_MUL;
+
+	fpi[i].DstReg.File = PROGRAM_TEMPORARY;
+	fpi[i].DstReg.Index = tempregi;
+	fpi[i].DstReg.WriteMask = WRITEMASK_XYZ;
+	fpi[i].DstReg.CondMask = COND_TR;
+
+	fpi[i].SrcReg[0].File = PROGRAM_INPUT;
+	fpi[i].SrcReg[0].Index = FRAG_ATTRIB_WPOS;
+	fpi[i].SrcReg[0].Swizzle = SWIZZLE_XYZW;
+
+	fpi[i].SrcReg[1].File = PROGRAM_TEMPORARY;
+	fpi[i].SrcReg[1].Index = tempregi;
+	fpi[i].SrcReg[1].Swizzle = SWIZZLE_WWWW;
+	i++;
+
+	/* viewport transformation */
+	window_index = _mesa_add_state_reference(compiler->program->Parameters, tokens);
+
+	fpi[i].Opcode = OPCODE_MAD;
+
+	fpi[i].DstReg.File = PROGRAM_TEMPORARY;
+	fpi[i].DstReg.Index = tempregi;
+	fpi[i].DstReg.WriteMask = WRITEMASK_XYZ;
+	fpi[i].DstReg.CondMask = COND_TR;
+
+	fpi[i].SrcReg[0].File = PROGRAM_TEMPORARY;
+	fpi[i].SrcReg[0].Index = tempregi;
+	fpi[i].SrcReg[0].Swizzle = MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_ZERO);
+
+	fpi[i].SrcReg[1].File = PROGRAM_STATE_VAR;
+	fpi[i].SrcReg[1].Index = window_index;
+	fpi[i].SrcReg[1].Swizzle = MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_ZERO);
+
+	fpi[i].SrcReg[2].File = PROGRAM_STATE_VAR;
+	fpi[i].SrcReg[2].Index = window_index;
+	fpi[i].SrcReg[2].Swizzle = MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_ZERO);
+	i++;
+
+	for (; i < compiler->program->NumInstructions; ++i) {
+		int reg;
+		for (reg = 0; reg < 3; reg++) {
+			if (fpi[i].SrcReg[reg].File == PROGRAM_INPUT &&
+			    fpi[i].SrcReg[reg].Index == FRAG_ATTRIB_WPOS) {
+				fpi[i].SrcReg[reg].File = PROGRAM_TEMPORARY;
+				fpi[i].SrcReg[reg].Index = tempregi;
+			}
+		}
+	}
+}
+
+static GLuint build_dtm(GLuint depthmode)
+{
+	switch(depthmode) {
+	default:
+	case GL_LUMINANCE: return 0;
+	case GL_INTENSITY: return 1;
+	case GL_ALPHA: return 2;
+	}
+}
+
+static GLuint build_func(GLuint comparefunc)
+{
+	return comparefunc - GL_NEVER;
+}
+
+/**
+ * Collect all external state that is relevant for compiling the given
+ * fragment program.
+ */
+static void build_state(
+	r300ContextPtr r300,
+	struct r300_fragment_program *fp,
+	struct r300_fragment_program_external_state *state)
+{
+	int unit;
+
+	_mesa_bzero(state, sizeof(*state));
+
+	for(unit = 0; unit < 16; ++unit) {
+		if (fp->Base.Base.ShadowSamplers & (1 << unit)) {
+			struct gl_texture_object* tex = r300->radeon.glCtx->Texture.Unit[unit]._Current;
+
+			state->unit[unit].depth_texture_mode = build_dtm(tex->DepthMode);
+			state->unit[unit].texture_compare_func = build_func(tex->CompareFunc);
+		}
+	}
+}
+
+void r300TranslateFragmentShader(GLcontext *ctx, struct gl_fragment_program *fp)
+{
+	r300ContextPtr r300 = R300_CONTEXT(ctx);
+	struct r300_fragment_program *r300_fp = (struct r300_fragment_program *)fp;
+	struct r300_fragment_program_external_state state;
+
+	build_state(r300, r300_fp, &state);
+	if (_mesa_memcmp(&r300_fp->state, &state, sizeof(state))) {
+		/* TODO: cache compiled programs */
+		r300_fp->translated = GL_FALSE;
+		_mesa_memcpy(&r300_fp->state, &state, sizeof(state));
+	}
+
+	if (!r300_fp->translated) {
+		struct r300_fragment_program_compiler compiler;
+
+		compiler.r300 = r300;
+		compiler.fp = r300_fp;
+		compiler.code = &r300_fp->code;
+		compiler.program = _mesa_clone_program(ctx, &fp->Base);
+
+		if (RADEON_DEBUG & DEBUG_PIXEL) {
+			fflush(stdout);
+			_mesa_printf("Fragment Program: Initial program:\n");
+			_mesa_print_program(compiler.program);
+			fflush(stdout);
+		}
+
+		insert_WPOS_trailer(&compiler);
+
+		if (r300->radeon.radeonScreen->chip_family >= CHIP_FAMILY_RV515) {
+			struct radeon_program_transformation transformations[] = {
+				{ &r500_transform_TEX, &compiler },
+				{ &radeonTransformALU, 0 },
+				{ &radeonTransformDeriv, 0 },
+				{ &radeonTransformTrigScale, 0 }
+			};
+			radeonLocalTransform(ctx, compiler.program, 4, transformations);
+		} else {
+			struct radeon_program_transformation transformations[] = {
+				{ &r300_transform_TEX, &compiler },
+				{ &radeonTransformALU, 0 },
+				{ &radeonTransformTrigSimple, 0 }
+			};
+			radeonLocalTransform(ctx, compiler.program, 3, transformations);
+		}
+
+		if (RADEON_DEBUG & DEBUG_PIXEL) {
+			_mesa_printf("Fragment Program: After native rewrite:\n");
+			_mesa_print_program(compiler.program);
+			fflush(stdout);
+		}
+
+		if (r300->radeon.radeonScreen->chip_family >= CHIP_FAMILY_RV515) {
+			struct radeon_nqssadce_descr nqssadce = {
+				.Init = &nqssadce_init,
+				.IsNativeSwizzle = &r500FPIsNativeSwizzle,
+				.BuildSwizzle = &r500FPBuildSwizzle,
+				.RewriteDepthOut = GL_TRUE
+			};
+			radeonNqssaDce(ctx, compiler.program, &nqssadce);
+		} else {
+			struct radeon_nqssadce_descr nqssadce = {
+				.Init = &nqssadce_init,
+				.IsNativeSwizzle = &r300FPIsNativeSwizzle,
+				.BuildSwizzle = &r300FPBuildSwizzle,
+				.RewriteDepthOut = GL_TRUE
+			};
+			radeonNqssaDce(ctx, compiler.program, &nqssadce);
+		}
+
+		if (RADEON_DEBUG & DEBUG_PIXEL) {
+			_mesa_printf("Compiler: after NqSSA-DCE:\n");
+			_mesa_print_program(compiler.program);
+			fflush(stdout);
+		}
+
+		if (!r300->vtbl.BuildFragmentProgramHwCode(&compiler))
+			r300_fp->error = GL_TRUE;
+
+		/* Subtle: Rescue any parameters that have been added during transformations */
+		_mesa_free_parameter_list(fp->Base.Parameters);
+		fp->Base.Parameters = compiler.program->Parameters;
+		compiler.program->Parameters = 0;
+
+		_mesa_reference_program(ctx, &compiler.program, NULL);
+
+		r300_fp->translated = GL_TRUE;
+
+		r300UpdateStateParameters(ctx, _NEW_PROGRAM | _NEW_PROGRAM_CONSTANTS);
+
+		if (r300_fp->error || (RADEON_DEBUG & DEBUG_PIXEL))
+			r300->vtbl.FragmentProgramDump(&r300_fp->code);
+	}
+
+	update_params(ctx, fp);
+}
diff --git a/src/mesa/drivers/dri/r300/r300_fragprog_common.h b/src/mesa/drivers/dri/r300/r300_fragprog_common.h
new file mode 100644
index 00000000000..85ea86fecb1
--- /dev/null
+++ b/src/mesa/drivers/dri/r300/r300_fragprog_common.h
@@ -0,0 +1,35 @@
+/*
+ * Copyright (C) 2009 Maciej Cencora <[email protected]>
+ *
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial
+ * portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+
+#ifndef __R300_FRAGPROG_COMMON_H_
+#define __R300_FRAGPROG_COMMON_H_
+
+#include "main/mtypes.h"
+
+extern void r300TranslateFragmentShader(GLcontext *ctx, struct gl_fragment_program *fp);
+
+#endif
diff --git a/src/mesa/drivers/dri/r300/r300_fragprog_emit.c b/src/mesa/drivers/dri/r300/r300_fragprog_emit.c
index 9f0b7e35349..b75656e7ee1 100644
--- a/src/mesa/drivers/dri/r300/r300_fragprog_emit.c
+++ b/src/mesa/drivers/dri/r300/r300_fragprog_emit.c
@@ -47,7 +47,7 @@
 
 #define PROG_CODE \
 	struct r300_fragment_program_compiler *c = (struct r300_fragment_program_compiler*)data; \
-	struct r300_fragment_program_code *code = c->code
+	struct r300_fragment_program_code *code = &c->code->r300
 
 #define error(fmt, args...) do {			\
 		fprintf(stderr, "%s::%s(): " fmt "\n",	\
@@ -66,7 +66,7 @@ static GLboolean emit_const(void* data, GLuint file, GLuint index, GLuint *hwind
 	}
 
 	if (*hwindex >= code->const_nr) {
-		if (*hwindex >= PFS_NUM_CONST_REGS) {
+		if (*hwindex >= R300_PFS_NUM_CONST_REGS) {
 			error("Out of hw constants!\n");
 			return GL_FALSE;
 		}
@@ -138,7 +138,7 @@ static GLboolean emit_alu(void* data, struct radeon_pair_instruction* inst)
 {
 	PROG_CODE;
 
-	if (code->alu.length >= PFS_MAX_ALU_INST) {
+	if (code->alu.length >= R300_PFS_MAX_ALU_INST) {
 		error("Too many ALU instructions");
 		return GL_FALSE;
 	}
@@ -201,7 +201,7 @@ static GLboolean emit_alu(void* data, struct radeon_pair_instruction* inst)
 	if (inst->Alpha.DepthWriteMask) {
 		code->alu.inst[ip].inst3 |= R300_ALU_DSTA_DEPTH;
 		code->node[code->cur_node].flags |= R300_W_OUT;
-		c->fp->WritesDepth = GL_TRUE;
+		c->fp->writes_depth = GL_TRUE;
 	}
 
 	return GL_TRUE;
@@ -213,7 +213,7 @@ static GLboolean emit_alu(void* data, struct radeon_pair_instruction* inst)
  */
 static GLboolean finish_node(struct r300_fragment_program_compiler *c)
 {
-	struct r300_fragment_program_code *code = c->code;
+	struct r300_fragment_program_code *code = &c->code->r300;
 	struct r300_fragment_program_node *node = &code->node[code->cur_node];
 
 	if (node->alu_end < 0) {
@@ -275,7 +275,7 @@ static GLboolean emit_tex(void* data, struct prog_instruction* inst)
 {
 	PROG_CODE;
 
-	if (code->tex.length >= PFS_MAX_TEX_INST) {
+	if (code->tex.length >= R300_PFS_MAX_TEX_INST) {
 		error("Too many TEX instructions");
 		return GL_FALSE;
 	}
@@ -318,16 +318,16 @@ static const struct radeon_pair_handler pair_handler = {
 	.EmitPaired = &emit_alu,
 	.EmitTex = &emit_tex,
 	.BeginTexBlock = &begin_tex,
-	.MaxHwTemps = PFS_NUM_TEMP_REGS
+	.MaxHwTemps = R300_PFS_NUM_TEMP_REGS
 };
 
 /**
  * Final compilation step: Turn the intermediate radeon_program into
  * machine-readable instructions.
  */
-GLboolean r300FragmentProgramEmit(struct r300_fragment_program_compiler *compiler)
+GLboolean r300BuildFragmentProgramHwCode(struct r300_fragment_program_compiler *compiler)
 {
-	struct r300_fragment_program_code *code = compiler->code;
+	struct r300_fragment_program_code *code = &compiler->code->r300;
 
 	_mesa_bzero(code, sizeof(struct r300_fragment_program_code));
 	code->node[0].alu_end = -1;
diff --git a/src/mesa/drivers/dri/r300/r300_fragprog_swizzle.c b/src/mesa/drivers/dri/r300/r300_fragprog_swizzle.c
index a86d2bd4712..fc9d855bce6 100644
--- a/src/mesa/drivers/dri/r300/r300_fragprog_swizzle.c
+++ b/src/mesa/drivers/dri/r300/r300_fragprog_swizzle.c
@@ -92,7 +92,7 @@ static const struct swizzle_data* lookup_native_swizzle(GLuint swizzle)
 GLboolean r300FPIsNativeSwizzle(GLuint opcode, struct prog_src_register reg)
 {
 	if (reg.Abs)
-		reg.NegateBase = 0;
+		reg.Negate = NEGATE_NONE;
 
 	if (opcode == OPCODE_KIL ||
 	    opcode == OPCODE_TEX ||
@@ -100,7 +100,7 @@ GLboolean r300FPIsNativeSwizzle(GLuint opcode, struct prog_src_register reg)
 	    opcode == OPCODE_TXP) {
 		int j;
 
-		if (reg.Abs || reg.NegateBase != (15*reg.NegateAbs))
+		if (reg.Abs || reg.Negate)
 			return GL_FALSE;
 
 		for(j = 0; j < 4; ++j) {
@@ -121,7 +121,7 @@ GLboolean r300FPIsNativeSwizzle(GLuint opcode, struct prog_src_register reg)
 		if (GET_SWZ(reg.Swizzle, j) != SWIZZLE_NIL)
 			relevant |= 1 << j;
 
-	if ((reg.NegateBase & relevant) && (reg.NegateBase & relevant) != relevant)
+	if ((reg.Negate & relevant) && ((reg.Negate & relevant) != relevant))
 		return GL_FALSE;
 
 	if (!lookup_native_swizzle(reg.Swizzle))
@@ -137,13 +137,12 @@ GLboolean r300FPIsNativeSwizzle(GLuint opcode, struct prog_src_register reg)
 void r300FPBuildSwizzle(struct nqssadce_state *s, struct prog_dst_register dst, struct prog_src_register src)
 {
 	if (src.Abs)
-		src.NegateBase = 0;
+		src.Negate = NEGATE_NONE;
 
 	while(dst.WriteMask) {
 		const struct swizzle_data *best_swizzle = 0;
 		GLuint best_matchcount = 0;
 		GLuint best_matchmask = 0;
-		GLboolean rgbnegate;
 		int i, comp;
 
 		for(i = 0; i < num_native_swizzles; ++i) {
@@ -157,6 +156,11 @@ void r300FPBuildSwizzle(struct nqssadce_state *s, struct prog_dst_register dst,
 				if (swz == SWIZZLE_NIL)
 					continue;
 				if (swz == GET_SWZ(sd->hash, comp)) {
+					/* check if the negate bit of current component
+					 * is the same for already matched components */
+					if (matchmask && (!!(src.Negate & matchmask) != !!(src.Negate & (1 << comp))))
+						continue;
+
 					matchcount++;
 					matchmask |= 1 << comp;
 				}
@@ -170,13 +174,6 @@ void r300FPBuildSwizzle(struct nqssadce_state *s, struct prog_dst_register dst,
 			}
 		}
 
-		if ((src.NegateBase & best_matchmask) != 0) {
-			best_matchmask &= src.NegateBase;
-			rgbnegate = !src.NegateAbs;
-		} else {
-			rgbnegate = src.NegateAbs;
-		}
-
 		struct prog_instruction *inst;
 
 		_mesa_insert_instructions(s->Program, s->IP, 1);
@@ -185,6 +182,7 @@ void r300FPBuildSwizzle(struct nqssadce_state *s, struct prog_dst_register dst,
 		inst->DstReg = dst;
 		inst->DstReg.WriteMask &= (best_matchmask | WRITEMASK_W);
 		inst->SrcReg[0] = src;
+		inst->SrcReg[0].Negate = (best_matchmask & src.Negate) ? NEGATE_XYZW : NEGATE_NONE;
 		/* Note: We rely on NqSSA/DCE to set unused swizzle components to NIL */
 
 		dst.WriteMask &= ~inst->DstReg.WriteMask;
diff --git a/src/mesa/drivers/dri/r300/r300_ioctl.c b/src/mesa/drivers/dri/r300/r300_ioctl.c
index a7f5121da74..104079b4dbe 100644
--- a/src/mesa/drivers/dri/r300/r300_ioctl.c
+++ b/src/mesa/drivers/dri/r300/r300_ioctl.c
@@ -79,7 +79,7 @@ static void r300ClearBuffer(r300ContextPtr r300, int flags,
 {
 	BATCH_LOCALS(&r300->radeon);
 	GLcontext *ctx = r300->radeon.glCtx;
-	__DRIdrawablePrivate *dPriv = r300->radeon.dri.drawable;
+	__DRIdrawablePrivate *dPriv = radeon_get_drawable(&r300->radeon);
 	GLuint cbpitch = 0;
 	r300ContextPtr rmesa = r300;
 
@@ -200,7 +200,7 @@ static void r300ClearBuffer(r300ContextPtr r300, int flags,
 		OUT_BATCH_FLOAT32(ctx->Color.ClearColor[2]);
 		OUT_BATCH_FLOAT32(ctx->Color.ClearColor[3]);
 	}
-	
+
 	r300EmitCacheFlush(rmesa);
 	cp_wait(&r300->radeon, R300_WAIT_3D | R300_WAIT_3D_CLEAN);
 
@@ -213,14 +213,13 @@ static void r300EmitClearState(GLcontext * ctx)
 {
 	r300ContextPtr r300 = R300_CONTEXT(ctx);
 	BATCH_LOCALS(&r300->radeon);
-	__DRIdrawablePrivate *dPriv = r300->radeon.dri.drawable;
+	__DRIdrawablePrivate *dPriv = radeon_get_drawable(&r300->radeon);
 	int i;
-	int has_tcl = 1;
+	int has_tcl;
 	int is_r500 = 0;
 	GLuint vap_cntl;
 
-	if (!(r300->radeon.radeonScreen->chip_flags & RADEON_CHIPSET_TCL))
-		has_tcl = 0;
+	has_tcl = r300->options.hw_tcl_enabled;
 
 	if (r300->radeon.radeonScreen->chip_family >= CHIP_FAMILY_RV515)
 		is_r500 = 1;
@@ -448,7 +447,7 @@ static void r300EmitClearState(GLcontext * ctx)
 			R500_ALU_RGBA_G_SWIZ_0 |
 			R500_ALU_RGBA_B_SWIZ_0 |
 			R500_ALU_RGBA_A_SWIZ_0;
-		
+
 		r500fp.cmd[7] = 0;
 		emit_r500fp(ctx, &r500fp);
 	}
@@ -542,9 +541,9 @@ static void r300EmitClearState(GLcontext * ctx)
 }
 
 static void r300KernelClear(GLcontext *ctx, GLuint flags)
-{	  	
+{
 	r300ContextPtr r300 = R300_CONTEXT(ctx);
-	__DRIdrawablePrivate *dPriv = r300->radeon.dri.drawable;
+	__DRIdrawablePrivate *dPriv = radeon_get_drawable(&r300->radeon);
 	struct radeon_framebuffer *rfb = dPriv->driverPrivate;
 	struct radeon_renderbuffer *rrb;
 	struct radeon_renderbuffer *rrbd;
@@ -566,7 +565,7 @@ static void r300KernelClear(GLcontext *ctx, GLuint flags)
 		r300ClearBuffer(r300, CLEARBUFFER_COLOR, rrb, NULL);
 		bits = 0;
 	}
-		
+
 	if (flags & BUFFER_BIT_FRONT_LEFT) {
 		rrb = radeon_get_renderbuffer(&rfb->base, BUFFER_FRONT_LEFT);
 		r300ClearBuffer(r300, bits | CLEARBUFFER_COLOR, rrb, rrbd);
@@ -591,7 +590,7 @@ static void r300KernelClear(GLcontext *ctx, GLuint flags)
 static void r300Clear(GLcontext * ctx, GLbitfield mask)
 {
 	r300ContextPtr r300 = R300_CONTEXT(ctx);
-	__DRIdrawablePrivate *dPriv = r300->radeon.dri.drawable;
+	__DRIdrawablePrivate *dPriv = radeon_get_drawable(&r300->radeon);
 	const GLuint colorMask = *((GLuint *) & ctx->Color.ColorMask);
 	GLbitfield swrast_mask = 0, tri_mask = 0;
 	int i;
diff --git a/src/mesa/drivers/dri/r300/r300_reg.h b/src/mesa/drivers/dri/r300/r300_reg.h
index ed552d09bbc..c22616b95f6 100644
--- a/src/mesa/drivers/dri/r300/r300_reg.h
+++ b/src/mesa/drivers/dri/r300/r300_reg.h
@@ -2432,6 +2432,12 @@ USE OR OTHER DEALINGS IN THE SOFTWARE.
 /* Z Buffer Clear Value */
 #define R300_ZB_DEPTHCLEARVALUE                  0x4f28
 
+#define R300_ZB_ZMASK_OFFSET                     0x4f30
+#define R300_ZB_ZMASK_PITCH                      0x4f34
+#define R300_ZB_ZMASK_WRINDEX                    0x4f38
+#define R300_ZB_ZMASK_DWORD                      0x4f3c
+#define R300_ZB_ZMASK_RDINDEX                    0x4f40
+
 /* Hierarchical Z Memory Offset */
 #define R300_ZB_HIZ_OFFSET                       0x4f44
 
@@ -3172,6 +3178,9 @@ enum {
 #   define R300_W_SRC_RAS				(1 << 2)
 
 
+/* Packet0 field ordering to write all values to the same reg */
+#define RADEON_ONE_REG_WR        (1 << 15)
+
 /* Draw a primitive from vertex data in arrays loaded via 3D_LOAD_VBPNTR.
  * Two parameter dwords:
  * 0. VAP_VTX_FMT: The first parameter is not written to hardware
diff --git a/src/mesa/drivers/dri/r300/r300_render.c b/src/mesa/drivers/dri/r300/r300_render.c
index 924305dd128..1356305a212 100644
--- a/src/mesa/drivers/dri/r300/r300_render.c
+++ b/src/mesa/drivers/dri/r300/r300_render.c
@@ -50,6 +50,8 @@ USE OR OTHER DEALINGS IN THE SOFTWARE.
  * no bugs...
  */
 
+#include "r300_render.h"
+
 #include "main/glheader.h"
 #include "main/state.h"
 #include "main/imports.h"
@@ -72,8 +74,8 @@ USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "r300_reg.h"
 #include "r300_tex.h"
 #include "r300_emit.h"
-#include "r300_fragprog.h"
-extern int future_hw_tcl_on;
+#include "r300_fragprog_common.h"
+#include "r300_swtcl.h"
 
 /**
  * \brief Convert a OpenGL primitive type into a R300 primitive type.
@@ -170,16 +172,19 @@ int r300NumVerts(r300ContextPtr rmesa, int num_verts, int prim)
 	return num_verts - verts_off;
 }
 
-static void r300EmitElts(GLcontext * ctx, void *elts, unsigned long n_elts)
+static void r300EmitElts(GLcontext * ctx, unsigned long n_elts)
 {
 	r300ContextPtr rmesa = R300_CONTEXT(ctx);
 	void *out;
+	GLuint size;
+
+	size = ((rmesa->ind_buf.is_32bit ? 4 : 2) * n_elts + 3) & ~3;
 
 	radeonAllocDmaRegion(&rmesa->radeon, &rmesa->radeon.tcl.elt_dma_bo,
-			     &rmesa->radeon.tcl.elt_dma_offset, n_elts * 4, 4);
+			     &rmesa->radeon.tcl.elt_dma_offset, size, 4);
 	radeon_bo_map(rmesa->radeon.tcl.elt_dma_bo, 1);
 	out = rmesa->radeon.tcl.elt_dma_bo->ptr + rmesa->radeon.tcl.elt_dma_offset;
-	memcpy(out, elts, n_elts * 4);
+	memcpy(out, rmesa->ind_buf.ptr, size);
 	radeon_bo_unmap(rmesa->radeon.tcl.elt_dma_bo);
 }
 
@@ -187,14 +192,23 @@ static void r300FireEB(r300ContextPtr rmesa, int vertex_count, int type)
 {
 	BATCH_LOCALS(&rmesa->radeon);
 
+    r300_emit_scissor(rmesa->radeon.glCtx);
 	if (vertex_count > 0) {
+		int size;
+
 		BEGIN_BATCH(10);
 		OUT_BATCH_PACKET3(R300_PACKET3_3D_DRAW_INDX_2, 0);
-		OUT_BATCH(R300_VAP_VF_CNTL__PRIM_WALK_INDICES |
-			  ((vertex_count + 0) << 16) |
-			  type |
+		if (rmesa->ind_buf.is_32bit) {
+			size = vertex_count;
+			OUT_BATCH(R300_VAP_VF_CNTL__PRIM_WALK_INDICES |
+			  ((vertex_count + 0) << 16) | type |
 			  R300_VAP_VF_CNTL__INDEX_SIZE_32bit);
-		
+		} else {
+			size = (vertex_count + 1) >> 1;
+			OUT_BATCH(R300_VAP_VF_CNTL__PRIM_WALK_INDICES |
+			   ((vertex_count + 0) << 16) | type);
+		}
+
 		if (!rmesa->radeon.radeonScreen->kernel_mm) {
 			OUT_BATCH_PACKET3(R300_PACKET3_INDX_BUFFER, 2);
 			OUT_BATCH(R300_INDX_BUFFER_ONE_REG_WR | (0 << R300_INDX_BUFFER_SKIP_SHIFT) |
@@ -203,13 +217,13 @@ static void r300FireEB(r300ContextPtr rmesa, int vertex_count, int type)
 					rmesa->radeon.tcl.elt_dma_bo,
 					rmesa->radeon.tcl.elt_dma_offset,
 					RADEON_GEM_DOMAIN_GTT, 0, 0);
-			OUT_BATCH(vertex_count);
+			OUT_BATCH(size);
 		} else {
 			OUT_BATCH_PACKET3(R300_PACKET3_INDX_BUFFER, 2);
 			OUT_BATCH(R300_INDX_BUFFER_ONE_REG_WR | (0 << R300_INDX_BUFFER_SKIP_SHIFT) |
 	    			 (R300_VAP_PORT_IDX0 >> 2));
 			OUT_BATCH(rmesa->radeon.tcl.elt_dma_offset);
-			OUT_BATCH(vertex_count);
+			OUT_BATCH(size);
 			radeon_cs_write_reloc(rmesa->radeon.cmdbuf.cs,
 					      rmesa->radeon.tcl.elt_dma_bo,
 					      RADEON_GEM_DOMAIN_GTT, 0, 0);
@@ -224,12 +238,12 @@ static void r300EmitAOS(r300ContextPtr rmesa, GLuint nr, GLuint offset)
 	uint32_t voffset;
 	int sz = 1 + (nr >> 1) * 3 + (nr & 1) * 2;
 	int i;
-	
+
 	if (RADEON_DEBUG & DEBUG_VERTS)
 		fprintf(stderr, "%s: nr=%d, ofs=0x%08x\n", __FUNCTION__, nr,
 			offset);
 
-    
+
 	if (!rmesa->radeon.radeonScreen->kernel_mm) {
 		BEGIN_BATCH(sz+2+(nr * 2));
 		OUT_BATCH_PACKET3(R300_PACKET3_3D_LOAD_VBPNTR, sz - 1);
@@ -240,7 +254,7 @@ static void r300EmitAOS(r300ContextPtr rmesa, GLuint nr, GLuint offset)
 				  (rmesa->radeon.tcl.aos[i].stride << 8) |
 				  (rmesa->radeon.tcl.aos[i + 1].components << 16) |
 				  (rmesa->radeon.tcl.aos[i + 1].stride << 24));
-			
+
 			voffset =  rmesa->radeon.tcl.aos[i + 0].offset +
 				offset * 4 * rmesa->radeon.tcl.aos[i + 0].stride;
 			OUT_BATCH_RELOC(voffset,
@@ -256,7 +270,7 @@ static void r300EmitAOS(r300ContextPtr rmesa, GLuint nr, GLuint offset)
 					RADEON_GEM_DOMAIN_GTT,
 					0, 0);
 		}
-		
+
 		if (nr & 1) {
 			OUT_BATCH((rmesa->radeon.tcl.aos[nr - 1].components << 0) |
 				  (rmesa->radeon.tcl.aos[nr - 1].stride << 8));
@@ -280,7 +294,7 @@ static void r300EmitAOS(r300ContextPtr rmesa, GLuint nr, GLuint offset)
 				  (rmesa->radeon.tcl.aos[i].stride << 8) |
 				  (rmesa->radeon.tcl.aos[i + 1].components << 16) |
 				  (rmesa->radeon.tcl.aos[i + 1].stride << 24));
-			
+
 			voffset =  rmesa->radeon.tcl.aos[i + 0].offset +
 				offset * 4 * rmesa->radeon.tcl.aos[i + 0].stride;
 			OUT_BATCH(voffset);
@@ -288,7 +302,7 @@ static void r300EmitAOS(r300ContextPtr rmesa, GLuint nr, GLuint offset)
 				offset * 4 * rmesa->radeon.tcl.aos[i + 1].stride;
 			OUT_BATCH(voffset);
 		}
-		
+
 		if (nr & 1) {
 			OUT_BATCH((rmesa->radeon.tcl.aos[nr - 1].components << 0) |
 			  (rmesa->radeon.tcl.aos[nr - 1].stride << 8));
@@ -327,18 +341,18 @@ static void r300FireAOS(r300ContextPtr rmesa, int vertex_count, int type)
 {
 	BATCH_LOCALS(&rmesa->radeon);
 
+    r300_emit_scissor(rmesa->radeon.glCtx);
 	BEGIN_BATCH(3);
 	OUT_BATCH_PACKET3(R300_PACKET3_3D_DRAW_VBUF_2, 0);
 	OUT_BATCH(R300_VAP_VF_CNTL__PRIM_WALK_VERTEX_LIST | (vertex_count << 16) | type);
 	END_BATCH();
 }
 
-static void r300RunRenderPrimitive(r300ContextPtr rmesa, GLcontext * ctx,
-				   int start, int end, int prim)
+void r300RunRenderPrimitive(GLcontext * ctx, int start, int end, int prim)
 {
+	r300ContextPtr rmesa = R300_CONTEXT(ctx);
+	BATCH_LOCALS(&rmesa->radeon);
 	int type, num_verts;
-	TNLcontext *tnl = TNL_CONTEXT(ctx);
-	struct vertex_buffer *vb = &tnl->vb;
 
 	type = r300PrimitiveType(rmesa, prim);
 	num_verts = r300NumVerts(rmesa, end - start, prim);
@@ -350,9 +364,9 @@ static void r300RunRenderPrimitive(r300ContextPtr rmesa, GLcontext * ctx,
 	 * This is supposed to ensure that we can get all rendering
 	 * commands into a single command buffer.
 	 */
-	rcommonEnsureCmdBufSpace(&rmesa->radeon, 64, __FUNCTION__);
+	rcommonEnsureCmdBufSpace(&rmesa->radeon, 128, __FUNCTION__);
 
-	if (vb->Elts) {
+	if (rmesa->ind_buf.ptr) {
 		if (num_verts > 65535) {
 			/* not implemented yet */
 			WARN_ONCE("Too many elts\n");
@@ -369,8 +383,14 @@ static void r300RunRenderPrimitive(r300ContextPtr rmesa, GLcontext * ctx,
 		 * allocating the index array might actually evict the vertex
 		 * arrays. *sigh*
 		 */
-		r300EmitElts(ctx, vb->Elts, num_verts);
+		r300EmitElts(ctx, num_verts);
 		r300EmitAOS(rmesa, rmesa->radeon.tcl.aos_count, start);
+		if (rmesa->radeon.radeonScreen->kernel_mm) {
+			BEGIN_BATCH_NO_AUTOSTATE(2);
+			OUT_BATCH_REGSEQ(R300_VAP_VF_MAX_VTX_INDX, 1);
+			OUT_BATCH(rmesa->radeon.tcl.aos[0].count);
+			END_BATCH();
+		}
 		r300FireEB(rmesa, num_verts, type);
 	} else {
 		r300EmitAOS(rmesa, rmesa->radeon.tcl.aos_count, start);
@@ -379,8 +399,7 @@ static void r300RunRenderPrimitive(r300ContextPtr rmesa, GLcontext * ctx,
 	COMMIT_BATCH();
 }
 
-static GLboolean r300RunRender(GLcontext * ctx,
-			       struct tnl_pipeline_stage *stage)
+static void r300RunRender(GLcontext * ctx, struct tnl_pipeline_stage *stage)
 {
 	r300ContextPtr rmesa = R300_CONTEXT(ctx);
 	int i;
@@ -391,8 +410,7 @@ static GLboolean r300RunRender(GLcontext * ctx,
 		fprintf(stderr, "%s\n", __FUNCTION__);
 
 	r300UpdateShaders(rmesa);
-	if (r300EmitArrays(ctx))
-		return GL_TRUE;
+	r300EmitArrays(ctx);
 
 	r300UpdateShaderStates(rmesa);
 
@@ -403,77 +421,104 @@ static GLboolean r300RunRender(GLcontext * ctx,
 		GLuint prim = _tnl_translate_prim(&vb->Primitive[i]);
 		GLuint start = vb->Primitive[i].start;
 		GLuint end = vb->Primitive[i].start + vb->Primitive[i].count;
-		r300RunRenderPrimitive(rmesa, ctx, start, end, prim);
+		r300RunRenderPrimitive(ctx, start, end, prim);
 	}
 
 	r300EmitCacheFlush(rmesa);
 
 	radeonReleaseArrays(ctx, ~0);
+}
 
-	return GL_FALSE;
+
+static const char *getFallbackString(uint32_t bit)
+{
+	switch (bit) {
+		case R300_FALLBACK_VERTEX_PROGRAM :
+			return "vertex program";
+		case R300_FALLBACK_LINE_SMOOTH:
+			return "smooth lines";
+		case R300_FALLBACK_POINT_SMOOTH:
+			return "smooth points";
+		case R300_FALLBACK_POLYGON_SMOOTH:
+			return "smooth polygons";
+		case R300_FALLBACK_LINE_STIPPLE:
+			return "line stipple";
+		case R300_FALLBACK_POLYGON_STIPPLE:
+			return "polygon stipple";
+		case R300_FALLBACK_STENCIL_TWOSIDE:
+			return "two-sided stencil";
+		case R300_FALLBACK_RENDER_MODE:
+			return "render mode != GL_RENDER";
+		case R300_FALLBACK_FRAGMENT_PROGRAM:
+			return "fragment program";
+		case R300_FALLBACK_AOS_LIMIT:
+			return "aos limit";
+		case R300_FALLBACK_INVALID_BUFFERS:
+			return "invalid buffers";
+		default:
+			return "unknown";
+	}
 }
 
-#define FALLBACK_IF(expr)						\
-	do {								\
-		if (expr) {						\
-			if (1 || RADEON_DEBUG & DEBUG_FALLBACKS)	\
-				WARN_ONCE("Software fallback:%s\n",	\
-					  #expr);			\
-			return R300_FALLBACK_RAST;			\
-		}							\
-	} while(0)
-
-static int r300Fallback(GLcontext * ctx)
+void r300SwitchFallback(GLcontext *ctx, uint32_t bit, GLboolean mode)
 {
-	r300ContextPtr r300 = R300_CONTEXT(ctx);
-	const unsigned back = ctx->Stencil._BackFace;
+	TNLcontext *tnl = TNL_CONTEXT(ctx);
+	r300ContextPtr rmesa = R300_CONTEXT(ctx);
+	uint32_t old_fallback = rmesa->fallback;
+	static uint32_t fallback_warn = 0;
 	
-	FALLBACK_IF(r300->radeon.Fallback);
-	/* Do we need to use new-style shaders?
-	 * Also is there a better way to do this? */
-	if (r300->radeon.radeonScreen->chip_family >= CHIP_FAMILY_RV515) {
-		struct r500_fragment_program *fp = (struct r500_fragment_program *)
-	    (char *)ctx->FragmentProgram._Current;
-		if (fp) {
-			if (!fp->translated) {
-				r500TranslateFragmentShader(r300, fp);
-				FALLBACK_IF(!fp->translated);
+	if (mode) {
+		if ((fallback_warn & bit) == 0) {
+			_mesa_fprintf(stderr, "WARNING! Falling back to software for %s\n", getFallbackString(bit));
+			fallback_warn |= bit;
+		}
+		rmesa->fallback |= bit;
+
+		/* update only if we change from no tcl fallbacks to some tcl fallbacks */
+		if (rmesa->options.hw_tcl_enabled) {
+			if (((old_fallback & R300_TCL_FALLBACK_MASK) == 0) &&
+				((bit & R300_TCL_FALLBACK_MASK) > 0)) {
+				R300_STATECHANGE(rmesa, vap_cntl_status);
+				rmesa->hw.vap_cntl_status.cmd[1] |= R300_VAP_TCL_BYPASS;
 			}
 		}
+
+		/* update only if we change from no raster fallbacks to some raster fallbacks */
+		if (((old_fallback & R300_RASTER_FALLBACK_MASK) == 0) &&
+			((bit & R300_RASTER_FALLBACK_MASK) > 0)) {
+			
+			radeon_firevertices(&rmesa->radeon);
+			rmesa->radeon.swtcl.RenderIndex = ~0;
+			_swsetup_Wakeup( ctx );
+		}
 	} else {
-		struct r300_fragment_program *fp = (struct r300_fragment_program *)
-	    (char *)ctx->FragmentProgram._Current;
-		if (fp) {
-			if (!fp->translated) {
-				r300TranslateFragmentShader(r300, fp);
-				FALLBACK_IF(!fp->translated);
+		rmesa->fallback &= ~bit;
+
+		/* update only if we have disabled all tcl fallbacks */
+		if (rmesa->options.hw_tcl_enabled) {
+			if ((old_fallback & R300_RASTER_FALLBACK_MASK) == bit) {
+				R300_STATECHANGE(rmesa, vap_cntl_status);
+				rmesa->hw.vap_cntl_status.cmd[1] &= ~R300_VAP_TCL_BYPASS;
 			}
 		}
-	}
 
-	FALLBACK_IF(ctx->RenderMode != GL_RENDER);
-
-	/* If GL_EXT_stencil_two_side is disabled, this fallback check can
-	 * be removed.
-	 */
-	FALLBACK_IF(ctx->Stencil.Ref[0] != ctx->Stencil.Ref[back]
-		    || ctx->Stencil.ValueMask[0] !=
-		    ctx->Stencil.ValueMask[back]
-		    || ctx->Stencil.WriteMask[0] !=
-		    ctx->Stencil.WriteMask[back]);
-
-	if (ctx->Extensions.NV_point_sprite || ctx->Extensions.ARB_point_sprite)
-		FALLBACK_IF(ctx->Point.PointSprite);
-
-	if (!r300->disable_lowimpact_fallback) {
-		FALLBACK_IF(ctx->Polygon.StippleFlag);
-		FALLBACK_IF(ctx->Multisample._Enabled);
-		FALLBACK_IF(ctx->Line.StippleFlag);
-		FALLBACK_IF(ctx->Line.SmoothFlag);
-		FALLBACK_IF(ctx->Point.SmoothFlag);
+		/* update only if we have disabled all raster fallbacks */
+		if ((old_fallback & R300_RASTER_FALLBACK_MASK) == bit) {
+			_swrast_flush( ctx );
+			
+			tnl->Driver.Render.Start = r300RenderStart;
+			tnl->Driver.Render.Finish = r300RenderFinish;
+			tnl->Driver.Render.PrimitiveNotify = r300RenderPrimitive;
+			tnl->Driver.Render.ResetLineStipple = r300ResetLineStipple;
+			tnl->Driver.Render.BuildVertices = _tnl_build_vertices;
+			tnl->Driver.Render.CopyPV = _tnl_copy_pv;
+			tnl->Driver.Render.Interp = _tnl_interp;
+			
+			_tnl_invalidate_vertex_state( ctx, ~0 );
+			_tnl_invalidate_vertices( ctx, ~0 );
+		}
 	}
-
-	return R300_FALLBACK_NONE;
+	
 }
 
 static GLboolean r300RunNonTCLRender(GLcontext * ctx,
@@ -484,49 +529,15 @@ static GLboolean r300RunNonTCLRender(GLcontext * ctx,
 	if (RADEON_DEBUG & DEBUG_PRIMS)
 		fprintf(stderr, "%s\n", __FUNCTION__);
 
-	if (r300Fallback(ctx) >= R300_FALLBACK_RAST)
+	if (rmesa->fallback & R300_RASTER_FALLBACK_MASK)
 		return GL_TRUE;
 
-	if (!(rmesa->radeon.radeonScreen->chip_flags & RADEON_CHIPSET_TCL))
- 	        return GL_TRUE;
-
-	if (!r300ValidateBuffers(ctx))
-	    return GL_TRUE;
-	
-	return r300RunRender(ctx, stage);
-}
-
-static GLboolean r300RunTCLRender(GLcontext * ctx,
-				  struct tnl_pipeline_stage *stage)
-{
-	r300ContextPtr rmesa = R300_CONTEXT(ctx);
-	struct r300_vertex_program *vp;
-
-	hw_tcl_on = future_hw_tcl_on;
-
-	if (RADEON_DEBUG & DEBUG_PRIMS)
-		fprintf(stderr, "%s\n", __FUNCTION__);
-
-	if (hw_tcl_on == GL_FALSE)
-		return GL_TRUE;
-
-	if (r300Fallback(ctx) >= R300_FALLBACK_TCL) {
-		hw_tcl_on = GL_FALSE;
+	if (rmesa->options.hw_tcl_enabled == GL_FALSE)
 		return GL_TRUE;
-	}
 
-	if (!r300ValidateBuffers(ctx))
-	    return GL_TRUE;
-	
-	r300UpdateShaders(rmesa);
-
-	vp = (struct r300_vertex_program *)CURRENT_VERTEX_SHADER(ctx);
-	if (vp->native == GL_FALSE) {
-		hw_tcl_on = GL_FALSE;
-		return GL_TRUE;
-	}
+	r300RunRender(ctx, stage);
 
-	return r300RunRender(ctx, stage);
+	return GL_FALSE;
 }
 
 const struct tnl_pipeline_stage _r300_render_stage = {
@@ -537,12 +548,3 @@ const struct tnl_pipeline_stage _r300_render_stage = {
 	NULL,
 	r300RunNonTCLRender
 };
-
-const struct tnl_pipeline_stage _r300_tcl_stage = {
-	"r300 Hardware Transform, Clipping and Lighting",
-	NULL,
-	NULL,
-	NULL,
-	NULL,
-	r300RunTCLRender
-};
diff --git a/src/mesa/drivers/dri/r300/r300_render.h b/src/mesa/drivers/dri/r300/r300_render.h
new file mode 100644
index 00000000000..ec785474a67
--- /dev/null
+++ b/src/mesa/drivers/dri/r300/r300_render.h
@@ -0,0 +1,69 @@
+/*
+ * Copyright 2009 Maciej Cencora <[email protected]>
+ *
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial
+ * portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+
+#ifndef __R300_RENDER_H__
+#define __R300_RENDER_H__
+
+#include "main/mtypes.h"
+
+#define R300_FALLBACK_VERTEX_PROGRAM    (1 << 0)
+#define R300_TCL_FALLBACK_MASK           0x0000ffff
+
+#define R300_FALLBACK_LINE_SMOOTH       (1 << 16)
+#define R300_FALLBACK_POINT_SMOOTH      (1 << 17)
+#define R300_FALLBACK_POLYGON_SMOOTH    (1 << 18)
+#define R300_FALLBACK_LINE_STIPPLE      (1 << 19)
+#define R300_FALLBACK_POLYGON_STIPPLE   (1 << 20)
+#define R300_FALLBACK_STENCIL_TWOSIDE   (1 << 21)
+#define R300_FALLBACK_RENDER_MODE       (1 << 22)
+#define R300_FALLBACK_FRAGMENT_PROGRAM  (1 << 23)
+#define R300_FALLBACK_AOS_LIMIT         (1 << 30)
+#define R300_FALLBACK_INVALID_BUFFERS   (1 << 31)
+#define R300_RASTER_FALLBACK_MASK        0xffff0000
+
+#define MASK_XYZW (R300_WRITE_ENA_X | R300_WRITE_ENA_Y | R300_WRITE_ENA_Z | R300_WRITE_ENA_W)
+#define MASK_X R300_WRITE_ENA_X
+#define MASK_Y R300_WRITE_ENA_Y
+#define MASK_Z R300_WRITE_ENA_Z
+#define MASK_W R300_WRITE_ENA_W
+
+#if SWIZZLE_X != R300_INPUT_ROUTE_SELECT_X || \
+    SWIZZLE_Y != R300_INPUT_ROUTE_SELECT_Y || \
+    SWIZZLE_Z != R300_INPUT_ROUTE_SELECT_Z || \
+    SWIZZLE_W != R300_INPUT_ROUTE_SELECT_W || \
+    SWIZZLE_ZERO != R300_INPUT_ROUTE_SELECT_ZERO || \
+    SWIZZLE_ONE != R300_INPUT_ROUTE_SELECT_ONE
+#error Cannot change these!
+#endif
+
+extern const struct tnl_pipeline_stage _r300_render_stage;
+
+extern void r300SwitchFallback(GLcontext *ctx, uint32_t bit, GLboolean mode);
+
+extern void r300RunRenderPrimitive(GLcontext * ctx, int start, int end, int prim);
+
+#endif
diff --git a/src/mesa/drivers/dri/r300/r300_shader.c b/src/mesa/drivers/dri/r300/r300_shader.c
index f30fd986e0f..0133b837966 100644
--- a/src/mesa/drivers/dri/r300/r300_shader.c
+++ b/src/mesa/drivers/dri/r300/r300_shader.c
@@ -1,18 +1,42 @@
+/*
+ * Copyright 2009 Maciej Cencora <[email protected]>
+ *
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial
+ * portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
 
 #include "main/glheader.h"
 
 #include "shader/program.h"
 #include "tnl/tnl.h"
 #include "r300_context.h"
-#include "r300_fragprog.h"
+#include "r300_fragprog_common.h"
 
 static struct gl_program *r300NewProgram(GLcontext * ctx, GLenum target,
 					 GLuint id)
 {
-	r300ContextPtr rmesa = R300_CONTEXT(ctx);
 	struct r300_vertex_program_cont *vp;
-	struct r300_fragment_program *r300_fp;
-	struct r500_fragment_program *r500_fp;
+	struct r300_fragment_program *fp;
 
 	switch (target) {
 	case GL_VERTEX_STATE_PROGRAM_NV:
@@ -20,28 +44,12 @@ static struct gl_program *r300NewProgram(GLcontext * ctx, GLenum target,
 		vp = CALLOC_STRUCT(r300_vertex_program_cont);
 		return _mesa_init_vertex_program(ctx, &vp->mesa_program,
 						 target, id);
-	case GL_FRAGMENT_PROGRAM_ARB:
-		if (rmesa->radeon.radeonScreen->chip_family >= CHIP_FAMILY_RV515) {
-			r500_fp = CALLOC_STRUCT(r500_fragment_program);
-			r500_fp->ctx = ctx;
-			return _mesa_init_fragment_program(ctx, &r500_fp->mesa_program,
-							   target, id);
-		} else {
-			r300_fp = CALLOC_STRUCT(r300_fragment_program);
-			return _mesa_init_fragment_program(ctx, &r300_fp->mesa_program,
-							   target, id);
-		}
 
 	case GL_FRAGMENT_PROGRAM_NV:
-		if (rmesa->radeon.radeonScreen->chip_family >= CHIP_FAMILY_RV515) {
-			r500_fp = CALLOC_STRUCT(r500_fragment_program);
-			return _mesa_init_fragment_program(ctx, &r500_fp->mesa_program,
-							   target, id);
-		} else {
-			r300_fp = CALLOC_STRUCT(r300_fragment_program);
-			return _mesa_init_fragment_program(ctx, &r300_fp->mesa_program,
-							   target, id);
-		}
+	case GL_FRAGMENT_PROGRAM_ARB:
+		fp = CALLOC_STRUCT(r300_fragment_program);
+		return _mesa_init_fragment_program(ctx, &fp->Base, target, id);
+
 	default:
 		_mesa_problem(ctx, "Bad target in r300NewProgram");
 	}
@@ -57,20 +65,15 @@ static void r300DeleteProgram(GLcontext * ctx, struct gl_program *prog)
 static void
 r300ProgramStringNotify(GLcontext * ctx, GLenum target, struct gl_program *prog)
 {
-	r300ContextPtr rmesa = R300_CONTEXT(ctx);
 	struct r300_vertex_program_cont *vp = (void *)prog;
 	struct r300_fragment_program *r300_fp = (struct r300_fragment_program *)prog;
-	struct r500_fragment_program *r500_fp = (struct r500_fragment_program *)prog;
 
 	switch (target) {
 	case GL_VERTEX_PROGRAM_ARB:
 		vp->progs = NULL;
 		break;
 	case GL_FRAGMENT_PROGRAM_ARB:
-		if (rmesa->radeon.radeonScreen->chip_family >= CHIP_FAMILY_RV515)
-			r500_fp->translated = GL_FALSE;
-		else
-			r300_fp->translated = GL_FALSE;
+		r300_fp->translated = GL_FALSE;
 		break;
 	}
 
@@ -81,7 +84,14 @@ r300ProgramStringNotify(GLcontext * ctx, GLenum target, struct gl_program *prog)
 static GLboolean
 r300IsProgramNative(GLcontext * ctx, GLenum target, struct gl_program *prog)
 {
-	return GL_TRUE;
+	if (target == GL_FRAGMENT_PROGRAM_ARB) {
+		struct r300_fragment_program *fp = (struct r300_fragment_program *)prog;
+		if (!fp->translated)
+			r300TranslateFragmentShader(ctx, &fp->Base);
+
+		return !fp->error;
+	} else
+		return GL_TRUE;
 }
 
 void r300InitShaderFuncs(struct dd_function_table *functions)
diff --git a/src/mesa/drivers/dri/r300/r300_state.c b/src/mesa/drivers/dri/r300/r300_state.c
index 46c3df70991..c0eda977db8 100644
--- a/src/mesa/drivers/dri/r300/r300_state.c
+++ b/src/mesa/drivers/dri/r300/r300_state.c
@@ -53,20 +53,22 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "shader/prog_statevars.h"
 #include "vbo/vbo.h"
 #include "tnl/tnl.h"
+#include "tnl/t_vp_build.h"
 
 #include "r300_context.h"
 #include "r300_ioctl.h"
 #include "r300_state.h"
 #include "r300_reg.h"
 #include "r300_emit.h"
-#include "r300_fragprog.h"
 #include "r300_tex.h"
+#include "r300_fragprog_common.h"
+#include "r300_fragprog.h"
+#include "r500_fragprog.h"
+#include "r300_render.h"
+#include "r300_vertprog.h"
 
 #include "drirenderbuffer.h"
 
-extern int future_hw_tcl_on;
-extern void _tnl_UpdateFixedFunctionProgram(GLcontext * ctx);
-
 static void r300BlendColor(GLcontext * ctx, const GLfloat cf[4])
 {
 	r300ContextPtr rmesa = R300_CONTEXT(ctx);
@@ -365,7 +367,7 @@ static void r300ClipPlane( GLcontext *ctx, GLenum plane, const GLfloat *eq )
 	GLint *ip;
 
 	/* no VAP UCP on non-TCL chipsets */
-	if (!(rmesa->radeon.radeonScreen->chip_flags & RADEON_CHIPSET_TCL))
+	if (!rmesa->options.hw_tcl_enabled)
 			return;
 
 	p = (GLint) plane - (GLint) GL_CLIP_PLANE0;
@@ -384,7 +386,7 @@ static void r300SetClipPlaneState(GLcontext * ctx, GLenum cap, GLboolean state)
 	GLuint p;
 
 	/* no VAP UCP on non-TCL chipsets */
-	if (!(r300->radeon.radeonScreen->chip_flags & RADEON_CHIPSET_TCL))
+	if (!r300->options.hw_tcl_enabled)
 		return;
 
 	p = cap - GL_CLIP_PLANE0;
@@ -450,28 +452,22 @@ static void r300SetPolygonOffsetState(GLcontext * ctx, GLboolean state)
 
 static GLboolean current_fragment_program_writes_depth(GLcontext* ctx)
 {
-	r300ContextPtr r300 = R300_CONTEXT(ctx);
+	struct r300_fragment_program *fp = (struct r300_fragment_program *) ctx->FragmentProgram._Current;
 
-	if (r300->radeon.radeonScreen->chip_family < CHIP_FAMILY_RV515) {
-		struct r300_fragment_program *fp = (struct r300_fragment_program *)
-			(char *)ctx->FragmentProgram._Current;
-		return (fp && fp->WritesDepth);
-	} else {
-		struct r500_fragment_program* fp =
-			(struct r500_fragment_program*)(char*)
-			ctx->FragmentProgram._Current;
-		return (fp && fp->writes_depth);
-	}
+	return (fp && fp->writes_depth);
 }
 
 static void r300SetEarlyZState(GLcontext * ctx)
 {
 	r300ContextPtr r300 = R300_CONTEXT(ctx);
 	GLuint topZ = R300_ZTOP_ENABLE;
+	GLuint w_fmt, fgdepthsrc;
 
 	if (ctx->Color.AlphaEnabled && ctx->Color.AlphaFunc != GL_ALWAYS)
 		topZ = R300_ZTOP_DISABLE;
-	if (current_fragment_program_writes_depth(ctx))
+	else if (current_fragment_program_writes_depth(ctx))
+		topZ = R300_ZTOP_DISABLE;
+	else if (ctx->FragmentProgram._Current && ctx->FragmentProgram._Current->UsesKill)
 		topZ = R300_ZTOP_DISABLE;
 
 	if (topZ != r300->hw.zstencil_format.cmd[2]) {
@@ -482,6 +478,26 @@ static void r300SetEarlyZState(GLcontext * ctx)
 		R300_STATECHANGE(r300, zstencil_format);
 		r300->hw.zstencil_format.cmd[2] = topZ;
 	}
+
+	/* w_fmt value is set to get best performance
+	* see p.130 R5xx 3D acceleration guide v1.3 */
+	if (current_fragment_program_writes_depth(ctx)) {
+		fgdepthsrc = R300_FG_DEPTH_SRC_SHADER;
+		w_fmt = R300_W_FMT_W24 | R300_W_SRC_US;
+	} else {
+		fgdepthsrc = R300_FG_DEPTH_SRC_SCAN;
+		w_fmt = R300_W_FMT_W0 | R300_W_SRC_US;
+	}
+
+	if (w_fmt != r300->hw.us_out_fmt.cmd[5]) {
+		R300_STATECHANGE(r300, us_out_fmt);
+		r300->hw.us_out_fmt.cmd[5] = w_fmt;
+	}
+
+	if (fgdepthsrc != r300->hw.fg_depth_src.cmd[1]) {
+		R300_STATECHANGE(r300, fg_depth_src);
+		r300->hw.fg_depth_src.cmd[1] = fgdepthsrc;
+	}
 }
 
 static void r300SetAlphaState(GLcontext * ctx)
@@ -532,8 +548,6 @@ static void r300SetAlphaState(GLcontext * ctx)
 	R300_STATECHANGE(r300, at);
 	r300->hw.at.cmd[R300_AT_ALPHA_TEST] = pp_misc;
 	r300->hw.at.cmd[R300_AT_UNKNOWN] = 0;
-
-	r300SetEarlyZState(ctx);
 }
 
 static void r300AlphaFunc(GLcontext * ctx, GLenum func, GLfloat ref)
@@ -581,14 +595,28 @@ static void r300SetDepthState(GLcontext * ctx)
 		r300->hw.zs.cmd[R300_ZS_CNTL_1] |=
 		    translate_func(ctx->Depth.Func) << R300_Z_FUNC_SHIFT;
 	}
+}
 
-	r300SetEarlyZState(ctx);
+static void r300CatchStencilFallback(GLcontext *ctx)
+{
+	const unsigned back = ctx->Stencil._BackFace;
+
+	if (ctx->Stencil._Enabled && (ctx->Stencil.Ref[0] != ctx->Stencil.Ref[back]
+		|| ctx->Stencil.ValueMask[0] != ctx->Stencil.ValueMask[back]
+		|| ctx->Stencil.WriteMask[0] != ctx->Stencil.WriteMask[back])) {
+		r300SwitchFallback(ctx, R300_FALLBACK_STENCIL_TWOSIDE, GL_TRUE);
+	} else {
+		r300SwitchFallback(ctx, R300_FALLBACK_STENCIL_TWOSIDE, GL_FALSE);
+	}
 }
 
 static void r300SetStencilState(GLcontext * ctx, GLboolean state)
 {
 	r300ContextPtr r300 = R300_CONTEXT(ctx);
 	GLboolean hw_stencil = GL_FALSE;
+
+	r300CatchStencilFallback(ctx);
+
 	if (ctx->DrawBuffer) {
 		struct radeon_renderbuffer *rrbStencil
 			= radeon_get_renderbuffer(ctx->DrawBuffer, BUFFER_STENCIL);
@@ -604,10 +632,6 @@ static void r300SetStencilState(GLcontext * ctx, GLboolean state)
 			r300->hw.zs.cmd[R300_ZS_CNTL_0] &=
 			    ~R300_STENCIL_ENABLE;
 		}
-	} else {
-#if R200_MERGED
-		FALLBACK(&r300->radeon, RADEON_FALLBACK_STENCIL, state);
-#endif
 	}
 }
 
@@ -740,7 +764,12 @@ static void r300ColorMask(GLcontext * ctx,
 static void r300PointSize(GLcontext * ctx, GLfloat size)
 {
 	r300ContextPtr r300 = R300_CONTEXT(ctx);
-        /* same size limits for AA, non-AA points */
+
+	/* We need to clamp to user defined range here, because
+	 * the HW clamping happens only for per vertex point size. */
+	size = CLAMP(size, ctx->Point.MinSize, ctx->Point.MaxSize);
+
+	/* same size limits for AA, non-AA points */
 	size = CLAMP(size, ctx->Const.MinPointSize, ctx->Const.MaxPointSize);
 
 	R300_STATECHANGE(r300, ps);
@@ -833,29 +862,33 @@ static void r300ShadeModel(GLcontext * ctx, GLenum mode)
 
 	R300_STATECHANGE(rmesa, shade);
 	rmesa->hw.shade.cmd[1] = 0x00000002;
+	R300_STATECHANGE(rmesa, shade2);
 	switch (mode) {
 	case GL_FLAT:
-		rmesa->hw.shade.cmd[2] = R300_RE_SHADE_MODEL_FLAT;
+		rmesa->hw.shade2.cmd[1] = R300_RE_SHADE_MODEL_FLAT;
 		break;
 	case GL_SMOOTH:
-		rmesa->hw.shade.cmd[2] = R300_RE_SHADE_MODEL_SMOOTH;
+		rmesa->hw.shade2.cmd[1] = R300_RE_SHADE_MODEL_SMOOTH;
 		break;
 	default:
 		return;
 	}
-	rmesa->hw.shade.cmd[3] = 0x00000000;
-	rmesa->hw.shade.cmd[4] = 0x00000000;
+	rmesa->hw.shade2.cmd[2] = 0x00000000;
+	rmesa->hw.shade2.cmd[3] = 0x00000000;
 }
 
 static void r300StencilFuncSeparate(GLcontext * ctx, GLenum face,
 				    GLenum func, GLint ref, GLuint mask)
 {
 	r300ContextPtr rmesa = R300_CONTEXT(ctx);
-	GLuint refmask =
-	    ((ctx->Stencil.Ref[0] & 0xff) << R300_STENCILREF_SHIFT)
-	     | ((ctx->Stencil.ValueMask[0] & 0xff) << R300_STENCILMASK_SHIFT);
-	const unsigned back = ctx->Stencil._BackFace;
+	GLuint refmask;
 	GLuint flag;
+	const unsigned back = ctx->Stencil._BackFace;
+
+	r300CatchStencilFallback(ctx);
+
+	refmask = ((ctx->Stencil.Ref[0] & 0xff) << R300_STENCILREF_SHIFT)
+	     | ((ctx->Stencil.ValueMask[0] & 0xff) << R300_STENCILMASK_SHIFT);
 
 	R300_STATECHANGE(rmesa, zs);
 	rmesa->hw.zs.cmd[R300_ZS_CNTL_0] |= R300_STENCIL_FRONT_BACK;
@@ -883,6 +916,8 @@ static void r300StencilMaskSeparate(GLcontext * ctx, GLenum face, GLuint mask)
 {
 	r300ContextPtr rmesa = R300_CONTEXT(ctx);
 
+	r300CatchStencilFallback(ctx);
+
 	R300_STATECHANGE(rmesa, zs);
 	rmesa->hw.zs.cmd[R300_ZS_CNTL_2] &=
 	    ~(R300_STENCILREF_MASK <<
@@ -899,6 +934,8 @@ static void r300StencilOpSeparate(GLcontext * ctx, GLenum face,
 	r300ContextPtr rmesa = R300_CONTEXT(ctx);
 	const unsigned back = ctx->Stencil._BackFace;
 
+	r300CatchStencilFallback(ctx);
+
 	R300_STATECHANGE(rmesa, zs);
 	/* It is easier to mask what's left.. */
 	rmesa->hw.zs.cmd[R300_ZS_CNTL_1] &=
@@ -927,16 +964,10 @@ static void r300StencilOpSeparate(GLcontext * ctx, GLenum face,
  * Window position and viewport transformation
  */
 
-/*
- * To correctly position primitives:
- */
-#define SUBPIXEL_X 0.125
-#define SUBPIXEL_Y 0.125
-
 static void r300UpdateWindow(GLcontext * ctx)
 {
 	r300ContextPtr rmesa = R300_CONTEXT(ctx);
-	__DRIdrawablePrivate *dPriv = rmesa->radeon.dri.drawable;
+	__DRIdrawablePrivate *dPriv = radeon_get_drawable(&rmesa->radeon);
 	GLfloat xoffset = dPriv ? (GLfloat) dPriv->x : 0;
 	GLfloat yoffset = dPriv ? (GLfloat) dPriv->y + dPriv->h : 0;
 	const GLfloat *v = ctx->Viewport._WindowMap.m;
@@ -953,9 +984,9 @@ static void r300UpdateWindow(GLcontext * ctx)
 	}
 
 	GLfloat sx = v[MAT_SX];
-	GLfloat tx = v[MAT_TX] + xoffset + SUBPIXEL_X;
+	GLfloat tx = v[MAT_TX] + xoffset;
 	GLfloat sy = v[MAT_SY] * y_scale;
-	GLfloat ty = (v[MAT_TY] * y_scale) + y_bias + SUBPIXEL_Y;
+	GLfloat ty = (v[MAT_TY] * y_scale) + y_bias;
 	GLfloat sz = v[MAT_SZ] * depthScale;
 	GLfloat tz = v[MAT_TZ] * depthScale;
 
@@ -989,13 +1020,13 @@ static void r300DepthRange(GLcontext * ctx, GLclampd nearval, GLclampd farval)
 void r300UpdateViewportOffset(GLcontext * ctx)
 {
 	r300ContextPtr rmesa = R300_CONTEXT(ctx);
-	__DRIdrawablePrivate *dPriv = ((radeonContextPtr) rmesa)->dri.drawable;
+	__DRIdrawablePrivate *dPriv = radeon_get_drawable(&rmesa->radeon);
 	GLfloat xoffset = (GLfloat) dPriv->x;
 	GLfloat yoffset = (GLfloat) dPriv->y + dPriv->h;
 	const GLfloat *v = ctx->Viewport._WindowMap.m;
 
-	GLfloat tx = v[MAT_TX] + xoffset + SUBPIXEL_X;
-	GLfloat ty = (-v[MAT_TY]) + yoffset + SUBPIXEL_Y;
+	GLfloat tx = v[MAT_TX] + xoffset;
+	GLfloat ty = (-v[MAT_TY]) + yoffset;
 
 	if (rmesa->hw.vpt.cmd[R300_VPT_XOFFSET] != r300PackFloat32(tx) ||
 	    rmesa->hw.vpt.cmd[R300_VPT_YOFFSET] != r300PackFloat32(ty)) {
@@ -1021,12 +1052,14 @@ r300FetchStateParameter(GLcontext * ctx,
 	switch (state[0]) {
 	case STATE_INTERNAL:
 		switch (state[1]) {
-		case STATE_R300_WINDOW_DIMENSION:
-			value[0] = r300->radeon.dri.drawable->w * 0.5f;	/* width*0.5 */
-			value[1] = r300->radeon.dri.drawable->h * 0.5f;	/* height*0.5 */
-			value[2] = 0.5F;	/* for moving range [-1 1] -> [0 1] */
-			value[3] = 1.0F;	/* not used */
-			break;
+		case STATE_R300_WINDOW_DIMENSION: {
+				__DRIdrawablePrivate * drawable = radeon_get_drawable(&r300->radeon);
+				value[0] = drawable->w * 0.5f;	/* width*0.5 */
+				value[1] = drawable->h * 0.5f;	/* height*0.5 */
+				value[2] = 0.5F;	/* for moving range [-1 1] -> [0 1] */
+				value[3] = 1.0F;	/* not used */
+				break;
+			}
 
 		case STATE_R300_TEXRECT_FACTOR:{
 				struct gl_texture_object *t =
@@ -1066,14 +1099,14 @@ void r300UpdateStateParameters(GLcontext * ctx, GLuint new_state)
 	struct gl_program_parameter_list *paramList;
 	GLuint i;
 
-	if (!(new_state & (_NEW_BUFFERS | _NEW_PROGRAM)))
+	if (!(new_state & (_NEW_BUFFERS | _NEW_PROGRAM | _NEW_PROGRAM_CONSTANTS)))
 		return;
 
 	fp = (struct r300_fragment_program *)ctx->FragmentProgram._Current;
 	if (!fp)
 		return;
 
-	paramList = fp->mesa_program.Base.Parameters;
+	paramList = fp->Base.Base.Parameters;
 
 	if (!paramList)
 		return;
@@ -1192,9 +1225,8 @@ static void r300SetupFragmentShaderTextures(GLcontext *ctx, int *tmu_mappings)
 {
 	r300ContextPtr r300 = R300_CONTEXT(ctx);
 	int i;
-	struct r300_fragment_program *fp = (struct r300_fragment_program *)
-	    (char *)ctx->FragmentProgram._Current;
-	struct r300_fragment_program_code *code = &fp->code;
+	struct r300_fragment_program *fp = (struct r300_fragment_program *) ctx->FragmentProgram._Current;
+	struct r300_fragment_program_code *code = &fp->code.r300;
 
 	R300_STATECHANGE(r300, fpt);
 
@@ -1235,9 +1267,8 @@ static void r300SetupFragmentShaderTextures(GLcontext *ctx, int *tmu_mappings)
 static void r500SetupFragmentShaderTextures(GLcontext *ctx, int *tmu_mappings)
 {
 	int i;
-	struct r500_fragment_program *fp = (struct r500_fragment_program *)
-	    (char *)ctx->FragmentProgram._Current;
-	struct r500_fragment_program_code *code = &fp->code;
+	struct r300_fragment_program *fp = (struct r300_fragment_program *) ctx->FragmentProgram._Current;
+	struct r500_fragment_program_code *code = &fp->code.r500;
 
 	/* find all the texture instructions and relocate the texture units */
 	for (i = 0; i < code->inst_end + 1; i++) {
@@ -1388,11 +1419,8 @@ static void r300SetupTextures(GLcontext * ctx)
 	r300->hw.tex.border_color.cmd[R300_TEX_CMD_0] =
 	    cmdpacket0(r300->radeon.radeonScreen, R300_TX_BORDER_COLOR_0, last_hw_tmu + 1);
 
-	if (!fp)		/* should only happenen once, just after context is created */
-		return;
-
 	if (r300->radeon.radeonScreen->chip_family < CHIP_FAMILY_RV515) {
-		if (fp->mesa_program.UsesKill && last_hw_tmu < 0) {
+		if (fp->Base.UsesKill && last_hw_tmu < 0) {
 			// The KILL operation requires the first texture unit
 			// to be enabled.
 			r300->hw.txe.cmd[R300_TXE_ENABLE] |= 1;
@@ -1400,9 +1428,8 @@ static void r300SetupTextures(GLcontext * ctx)
 			r300->hw.tex.filter.cmd[R300_TEX_CMD_0] =
 				cmdpacket0(r300->radeon.radeonScreen, R300_TX_FILTER0_0, 1);
 		}
-		r300SetupFragmentShaderTextures(ctx, tmu_mappings);
-	} else
-		r500SetupFragmentShaderTextures(ctx, tmu_mappings);
+	}
+	r300->vtbl.SetupFragmentShaderTextures(ctx, tmu_mappings);
 
 	if (RADEON_DEBUG & DEBUG_STATE)
 		fprintf(stderr, "TX_ENABLE: %08x  last_hw_tmu=%d\n",
@@ -1421,26 +1448,21 @@ union r300_outputs_written {
 static void r300SetupRSUnit(GLcontext * ctx)
 {
 	r300ContextPtr r300 = R300_CONTEXT(ctx);
-        TNLcontext *tnl = TNL_CONTEXT(ctx);
-	struct vertex_buffer *VB = &tnl->vb;
 	union r300_outputs_written OutputsWritten;
 	GLuint InputsRead;
 	int fp_reg, high_rr;
 	int col_ip, tex_ip;
 	int rs_tex_count = 0;
-	int i, count, col_fmt;
+	int i, col_fmt, hw_tcl_on;
+
+	hw_tcl_on = r300->options.hw_tcl_enabled;
 
 	if (hw_tcl_on)
-		OutputsWritten.vp_outputs = CURRENT_VERTEX_SHADER(ctx)->key.OutputsWritten;
+		OutputsWritten.vp_outputs = r300->selected_vp->key.OutputsWritten;
 	else
-		RENDERINPUTS_COPY(OutputsWritten.index_bitset, r300->state.render_inputs_bitset);
+		RENDERINPUTS_COPY(OutputsWritten.index_bitset, r300->render_inputs_bitset);
 
-	if (ctx->FragmentProgram._Current)
-		InputsRead = ctx->FragmentProgram._Current->Base.InputsRead;
-	else {
-		fprintf(stderr, "No ctx->FragmentProgram._Current!!\n");
-		return;		/* This should only ever happen once.. */
-	}
+	InputsRead = ctx->FragmentProgram._Current->Base.InputsRead;
 
 	R300_STATECHANGE(r300, ri);
 	R300_STATECHANGE(r300, rc);
@@ -1459,15 +1481,7 @@ static void r300SetupRSUnit(GLcontext * ctx)
 
 	if (InputsRead & FRAG_BIT_COL0) {
 		if (R300_OUTPUTS_WRITTEN_TEST(OutputsWritten, VERT_RESULT_COL0, _TNL_ATTRIB_COLOR0)) {
-			count = VB->AttribPtr[_TNL_ATTRIB_COLOR0]->size;
-			if (count == 4)
-			    col_fmt = R300_RS_COL_FMT_RGBA;
-			else if (count == 3)
-			    col_fmt = R300_RS_COL_FMT_RGB1;
-			else
-			    col_fmt = R300_RS_COL_FMT_0001;
-
-			r300->hw.ri.cmd[R300_RI_INTERP_0 + col_ip] = R300_RS_COL_PTR(col_ip) | R300_RS_COL_FMT(col_fmt);
+			r300->hw.ri.cmd[R300_RI_INTERP_0 + col_ip] = R300_RS_COL_PTR(col_ip) | R300_RS_COL_FMT(R300_RS_COL_FMT_RGBA);
 			r300->hw.rr.cmd[R300_RR_INST_0 + col_ip] = R300_RS_INST_COL_ID(col_ip) | R300_RS_INST_COL_CN_WRITE | R300_RS_INST_COL_ADDR(fp_reg);
 			InputsRead &= ~FRAG_BIT_COL0;
 			++col_ip;
@@ -1479,15 +1493,7 @@ static void r300SetupRSUnit(GLcontext * ctx)
 
 	if (InputsRead & FRAG_BIT_COL1) {
 		if (R300_OUTPUTS_WRITTEN_TEST(OutputsWritten, VERT_RESULT_COL1, _TNL_ATTRIB_COLOR1)) {
-			count = VB->AttribPtr[_TNL_ATTRIB_COLOR1]->size;
-			if (count == 4)
-			    col_fmt = R300_RS_COL_FMT_RGBA;
-			else if (count == 3)
-			    col_fmt = R300_RS_COL_FMT_RGB1;
-			else
-			    col_fmt = R300_RS_COL_FMT_0001;
-
-			r300->hw.ri.cmd[R300_RI_INTERP_0 + col_ip] = R300_RS_COL_PTR(col_ip) | R300_RS_COL_FMT(col_fmt);
+			r300->hw.ri.cmd[R300_RI_INTERP_0 + col_ip] = R300_RS_COL_PTR(col_ip) | R300_RS_COL_FMT(R300_RS_COL_FMT_RGBA);
 			r300->hw.rr.cmd[R300_RR_INST_0 + col_ip] = R300_RS_INST_COL_ID(col_ip) | R300_RS_INST_COL_CN_WRITE | R300_RS_INST_COL_ADDR(fp_reg);
 			InputsRead &= ~FRAG_BIT_COL1;
 			++col_ip;
@@ -1497,6 +1503,7 @@ static void r300SetupRSUnit(GLcontext * ctx)
 		}
 	}
 
+	/* We always route 4 texcoord components */
 	for (i = 0; i < ctx->Const.MaxTextureUnits; i++) {
 		if (! ( InputsRead & FRAG_BIT_TEX(i) ) )
 		    continue;
@@ -1506,26 +1513,10 @@ static void r300SetupRSUnit(GLcontext * ctx)
 		    continue;
 		}
 
-		int swiz;
-
-		/* with TCL we always seem to route 4 components */
-		if (hw_tcl_on)
-			count = 4;
-		else
-			count = VB->AttribPtr[_TNL_ATTRIB_TEX(i)]->size;
-
-		switch(count) {
-		case 4: swiz = R300_RS_SEL_S(0) | R300_RS_SEL_T(1) | R300_RS_SEL_R(2) | R300_RS_SEL_Q(3); break;
-		case 3: swiz = R300_RS_SEL_S(0) | R300_RS_SEL_T(1) | R300_RS_SEL_R(2) | R300_RS_SEL_Q(R300_RS_SEL_K1); break;
-		default:
-		case 1:
-		case 2: swiz = R300_RS_SEL_S(0) | R300_RS_SEL_T(1) | R300_RS_SEL_R(R300_RS_SEL_K0) | R300_RS_SEL_Q(R300_RS_SEL_K1); break;
-		};
-
-		r300->hw.ri.cmd[R300_RI_INTERP_0 + tex_ip] |= swiz | R300_RS_TEX_PTR(rs_tex_count);
+		r300->hw.ri.cmd[R300_RI_INTERP_0 + tex_ip] |= R300_RS_SEL_S(0) | R300_RS_SEL_T(1) | R300_RS_SEL_R(2) | R300_RS_SEL_Q(3) | R300_RS_TEX_PTR(rs_tex_count);
 		r300->hw.rr.cmd[R300_RR_INST_0 + tex_ip] |= R300_RS_INST_TEX_ID(tex_ip) | R300_RS_INST_TEX_CN_WRITE | R300_RS_INST_TEX_ADDR(fp_reg);
 		InputsRead &= ~(FRAG_BIT_TEX0 << i);
-		rs_tex_count += count;
+		rs_tex_count += 4;
 		++tex_ip;
 		++fp_reg;
 	}
@@ -1545,7 +1536,7 @@ static void r300SetupRSUnit(GLcontext * ctx)
 			r300->hw.ri.cmd[R300_RI_INTERP_0 + tex_ip] |= R300_RS_SEL_Q(R300_RS_SEL_K1) | R300_RS_TEX_PTR(rs_tex_count);
 			r300->hw.rr.cmd[R300_RR_INST_0 + tex_ip] |= R300_RS_INST_TEX_ID(tex_ip) | R300_RS_INST_TEX_CN_WRITE | R300_RS_INST_TEX_ADDR(fp_reg);
 			InputsRead &= ~FRAG_BIT_FOGC;
-			rs_tex_count += 1;
+			rs_tex_count += 4;
 			++tex_ip;
 			++fp_reg;
 		} else {
@@ -1555,7 +1546,8 @@ static void r300SetupRSUnit(GLcontext * ctx)
 
 	/* Setup default color if no color or tex was set */
 	if (rs_tex_count == 0 && col_ip == 0) {
-		r300->hw.rr.cmd[R300_RR_INST_0] = R300_RS_INST_COL_ID(0) | R300_RS_INST_COL_CN_WRITE | R300_RS_INST_COL_ADDR(0) | R300_RS_COL_FMT(R300_RS_COL_FMT_0001);
+		r300->hw.rr.cmd[R300_RR_INST_0] = R300_RS_INST_COL_ID(0) | R300_RS_INST_COL_ADDR(0);
+		r300->hw.ri.cmd[R300_RI_INTERP_0] = R300_RS_COL_PTR(0) | R300_RS_COL_FMT(R300_RS_COL_FMT_0001);
 		++col_ip;
 	}
 
@@ -1564,6 +1556,7 @@ static void r300SetupRSUnit(GLcontext * ctx)
 	r300->hw.rc.cmd[2] |= high_rr - 1;
 
 	r300->hw.rr.cmd[R300_RR_CMD_0] = cmdpacket0(r300->radeon.radeonScreen, R300_RS_INST_0, high_rr);
+	r300->hw.ri.cmd[R300_RI_CMD_0] = cmdpacket0(r300->radeon.radeonScreen, R300_RS_IP_0, high_rr);
 
 	if (InputsRead)
 		WARN_ONCE("Don't know how to satisfy InputsRead=0x%08x\n", InputsRead);
@@ -1572,26 +1565,21 @@ static void r300SetupRSUnit(GLcontext * ctx)
 static void r500SetupRSUnit(GLcontext * ctx)
 {
 	r300ContextPtr r300 = R300_CONTEXT(ctx);
-        TNLcontext *tnl = TNL_CONTEXT(ctx);
-	struct vertex_buffer *VB = &tnl->vb;
 	union r300_outputs_written OutputsWritten;
 	GLuint InputsRead;
 	int fp_reg, high_rr;
 	int col_ip, tex_ip;
 	int rs_tex_count = 0;
-	int i, count, col_fmt;
+	int i, col_fmt, hw_tcl_on;
+
+	hw_tcl_on = r300->options.hw_tcl_enabled;
 
 	if (hw_tcl_on)
-		OutputsWritten.vp_outputs = CURRENT_VERTEX_SHADER(ctx)->key.OutputsWritten;
+		OutputsWritten.vp_outputs = r300->selected_vp->key.OutputsWritten;
 	else
-		RENDERINPUTS_COPY(OutputsWritten.index_bitset, r300->state.render_inputs_bitset);
+		RENDERINPUTS_COPY(OutputsWritten.index_bitset, r300->render_inputs_bitset);
 
-	if (ctx->FragmentProgram._Current)
-		InputsRead = ctx->FragmentProgram._Current->Base.InputsRead;
-	else {
-		fprintf(stderr, "No ctx->FragmentProgram._Current!!\n");
-		return;		/* This should only ever happen once.. */
-	}
+	InputsRead = ctx->FragmentProgram._Current->Base.InputsRead;
 
 	R300_STATECHANGE(r300, ri);
 	R300_STATECHANGE(r300, rc);
@@ -1610,15 +1598,7 @@ static void r500SetupRSUnit(GLcontext * ctx)
 
 	if (InputsRead & FRAG_BIT_COL0) {
 		if (R300_OUTPUTS_WRITTEN_TEST(OutputsWritten, VERT_RESULT_COL0, _TNL_ATTRIB_COLOR0)) {
-			count = VB->AttribPtr[_TNL_ATTRIB_COLOR0]->size;
-			if (count == 4)
-			    col_fmt = R300_RS_COL_FMT_RGBA;
-			else if (count == 3)
-			    col_fmt = R300_RS_COL_FMT_RGB1;
-			else
-			    col_fmt = R300_RS_COL_FMT_0001;
-
-			r300->hw.ri.cmd[R300_RI_INTERP_0 + col_ip] = R500_RS_COL_PTR(col_ip) | R500_RS_COL_FMT(col_fmt);
+			r300->hw.ri.cmd[R300_RI_INTERP_0 + col_ip] = R500_RS_COL_PTR(col_ip) | R500_RS_COL_FMT(R300_RS_COL_FMT_RGBA);
 			r300->hw.rr.cmd[R300_RR_INST_0 + col_ip] = R500_RS_INST_COL_ID(col_ip) | R500_RS_INST_COL_CN_WRITE | R500_RS_INST_COL_ADDR(fp_reg);
 			InputsRead &= ~FRAG_BIT_COL0;
 			++col_ip;
@@ -1630,15 +1610,7 @@ static void r500SetupRSUnit(GLcontext * ctx)
 
 	if (InputsRead & FRAG_BIT_COL1) {
 		if (R300_OUTPUTS_WRITTEN_TEST(OutputsWritten, VERT_RESULT_COL1, _TNL_ATTRIB_COLOR1)) {
-			count = VB->AttribPtr[_TNL_ATTRIB_COLOR1]->size;
-			if (count == 4)
-			    col_fmt = R300_RS_COL_FMT_RGBA;
-			else if (count == 3)
-			    col_fmt = R300_RS_COL_FMT_RGB1;
-			else
-			    col_fmt = R300_RS_COL_FMT_0001;
-
-			r300->hw.ri.cmd[R300_RI_INTERP_0 + col_ip] = R500_RS_COL_PTR(col_ip) | R500_RS_COL_FMT(col_fmt);
+			r300->hw.ri.cmd[R300_RI_INTERP_0 + col_ip] = R500_RS_COL_PTR(col_ip) | R500_RS_COL_FMT(R300_RS_COL_FMT_RGBA);
 			r300->hw.rr.cmd[R300_RR_INST_0 + col_ip] = R500_RS_INST_COL_ID(col_ip) | R500_RS_INST_COL_CN_WRITE | R500_RS_INST_COL_ADDR(fp_reg);
 			InputsRead &= ~FRAG_BIT_COL1;
 			++col_ip;
@@ -1648,7 +1620,7 @@ static void r500SetupRSUnit(GLcontext * ctx)
 		}
 	}
 
-
+	/* We always route 4 texcoord components */
 	for (i = 0; i < ctx->Const.MaxTextureUnits; i++) {
 		if (! ( InputsRead & FRAG_BIT_TEX(i) ) )
 		    continue;
@@ -1658,55 +1630,37 @@ static void r500SetupRSUnit(GLcontext * ctx)
 		    continue;
 		}
 
-		int swiz = 0;
-
-		/* with TCL we always seem to route 4 components */
-		if (hw_tcl_on)
-		  count = 4;
-		else
-		  count = VB->AttribPtr[_TNL_ATTRIB_TEX(i)]->size;
-
-		if (count == 4) {
-			swiz |= (rs_tex_count + 0) << R500_RS_IP_TEX_PTR_S_SHIFT;
-			swiz |= (rs_tex_count + 1) << R500_RS_IP_TEX_PTR_T_SHIFT;
-			swiz |= (rs_tex_count + 2) << R500_RS_IP_TEX_PTR_R_SHIFT;
-			swiz |= (rs_tex_count + 3) << R500_RS_IP_TEX_PTR_Q_SHIFT;
-		} else if (count == 3) {
-			swiz |= (rs_tex_count + 0) << R500_RS_IP_TEX_PTR_S_SHIFT;
-			swiz |= (rs_tex_count + 1) << R500_RS_IP_TEX_PTR_T_SHIFT;
-			swiz |= (rs_tex_count + 2) << R500_RS_IP_TEX_PTR_R_SHIFT;
-			swiz |= R500_RS_IP_PTR_K1 << R500_RS_IP_TEX_PTR_Q_SHIFT;
-		} else if (count == 2) {
-			swiz |= (rs_tex_count + 0) << R500_RS_IP_TEX_PTR_S_SHIFT;
-			swiz |= (rs_tex_count + 1) << R500_RS_IP_TEX_PTR_T_SHIFT;
-			swiz |= R500_RS_IP_PTR_K0 << R500_RS_IP_TEX_PTR_R_SHIFT;
-			swiz |= R500_RS_IP_PTR_K1 << R500_RS_IP_TEX_PTR_Q_SHIFT;
-		} else if (count == 1) {
-			swiz |= (rs_tex_count + 0) << R500_RS_IP_TEX_PTR_S_SHIFT;
-			swiz |= R500_RS_IP_PTR_K0 << R500_RS_IP_TEX_PTR_T_SHIFT;
-			swiz |= R500_RS_IP_PTR_K0 << R500_RS_IP_TEX_PTR_R_SHIFT;
-			swiz |= R500_RS_IP_PTR_K1 << R500_RS_IP_TEX_PTR_Q_SHIFT;
-		} else {
-			swiz |= R500_RS_IP_PTR_K0 << R500_RS_IP_TEX_PTR_S_SHIFT;
-			swiz |= R500_RS_IP_PTR_K0 << R500_RS_IP_TEX_PTR_T_SHIFT;
-			swiz |= R500_RS_IP_PTR_K0 << R500_RS_IP_TEX_PTR_R_SHIFT;
-			swiz |= R500_RS_IP_PTR_K1 << R500_RS_IP_TEX_PTR_Q_SHIFT;
-		}
+		r300->hw.ri.cmd[R300_RI_INTERP_0 + tex_ip] |= ((rs_tex_count + 0) << R500_RS_IP_TEX_PTR_S_SHIFT) |
+			((rs_tex_count + 1) << R500_RS_IP_TEX_PTR_T_SHIFT) |
+			((rs_tex_count + 2) << R500_RS_IP_TEX_PTR_R_SHIFT) |
+			((rs_tex_count + 3) << R500_RS_IP_TEX_PTR_Q_SHIFT);
 
-		r300->hw.ri.cmd[R300_RI_INTERP_0 + tex_ip] |= swiz;
 		r300->hw.rr.cmd[R300_RR_INST_0 + tex_ip] |= R500_RS_INST_TEX_ID(tex_ip) | R500_RS_INST_TEX_CN_WRITE | R500_RS_INST_TEX_ADDR(fp_reg);
 		InputsRead &= ~(FRAG_BIT_TEX0 << i);
-		rs_tex_count += count;
+		rs_tex_count += 4;
+		++tex_ip;
+		++fp_reg;
+	}
+
+	if (InputsRead & FRAG_BIT_WPOS) {
+		r300->hw.ri.cmd[R300_RI_INTERP_0 + tex_ip] |= ((rs_tex_count + 0) << R500_RS_IP_TEX_PTR_S_SHIFT) |
+			((rs_tex_count + 1) << R500_RS_IP_TEX_PTR_T_SHIFT) |
+			((rs_tex_count + 2) << R500_RS_IP_TEX_PTR_R_SHIFT) |
+			((rs_tex_count + 3) << R500_RS_IP_TEX_PTR_Q_SHIFT);
+
+		r300->hw.rr.cmd[R300_RR_INST_0 + tex_ip] |= R500_RS_INST_TEX_ID(tex_ip) | R500_RS_INST_TEX_CN_WRITE | R500_RS_INST_TEX_ADDR(fp_reg);
+		InputsRead &= ~FRAG_BIT_WPOS;
+		rs_tex_count += 4;
 		++tex_ip;
 		++fp_reg;
 	}
 
 	if (InputsRead & FRAG_BIT_FOGC) {
 		if (R300_OUTPUTS_WRITTEN_TEST(OutputsWritten, VERT_RESULT_FOGC, _TNL_ATTRIB_FOG)) {
-			r300->hw.ri.cmd[R300_RI_INTERP_0 + tex_ip] |= ((rs_tex_count + 0) << R500_RS_IP_TEX_PTR_S_SHIFT) |
-				((rs_tex_count + 1) << R500_RS_IP_TEX_PTR_T_SHIFT) |
-				((rs_tex_count + 2) << R500_RS_IP_TEX_PTR_R_SHIFT) |
-				((rs_tex_count + 3) << R500_RS_IP_TEX_PTR_Q_SHIFT);
+			r300->hw.ri.cmd[R300_RI_INTERP_0 + tex_ip] |= (rs_tex_count << R500_RS_IP_TEX_PTR_S_SHIFT) |
+				(R500_RS_IP_PTR_K0 << R500_RS_IP_TEX_PTR_T_SHIFT) |
+				(R500_RS_IP_PTR_K0 << R500_RS_IP_TEX_PTR_R_SHIFT) |
+				(R500_RS_IP_PTR_K1 << R500_RS_IP_TEX_PTR_Q_SHIFT);
 
 			r300->hw.rr.cmd[R300_RR_INST_0 + tex_ip] |= R500_RS_INST_TEX_ID(tex_ip) | R500_RS_INST_TEX_CN_WRITE | R500_RS_INST_TEX_ADDR(fp_reg);
 			InputsRead &= ~FRAG_BIT_FOGC;
@@ -1718,87 +1672,27 @@ static void r500SetupRSUnit(GLcontext * ctx)
 		}
 	}
 
-	if (InputsRead & FRAG_BIT_WPOS) {
-		r300->hw.ri.cmd[R300_RI_INTERP_0 + tex_ip] |= ((rs_tex_count + 0) << R500_RS_IP_TEX_PTR_S_SHIFT) |
-				((rs_tex_count + 1) << R500_RS_IP_TEX_PTR_T_SHIFT) |
-				((rs_tex_count + 2) << R500_RS_IP_TEX_PTR_R_SHIFT) |
-				((rs_tex_count + 3) << R500_RS_IP_TEX_PTR_Q_SHIFT);
-
-		r300->hw.rr.cmd[R300_RR_INST_0 + tex_ip] |= R500_RS_INST_TEX_ID(tex_ip) | R500_RS_INST_TEX_CN_WRITE | R500_RS_INST_TEX_ADDR(fp_reg);
-		InputsRead &= ~FRAG_BIT_WPOS;
-		rs_tex_count += 4;
-		++tex_ip;
-		++fp_reg;
-	}
-
 	/* Setup default color if no color or tex was set */
 	if (rs_tex_count == 0 && col_ip == 0) {
-		r300->hw.rr.cmd[R300_RR_INST_0] |= R500_RS_INST_COL_ID(0) | R500_RS_INST_COL_CN_WRITE | R500_RS_INST_COL_ADDR(0) | R500_RS_COL_FMT(R300_RS_COL_FMT_0001);
+		r300->hw.rr.cmd[R300_RR_INST_0] = R500_RS_INST_COL_ID(0) | R500_RS_INST_COL_ADDR(0);
+		r300->hw.ri.cmd[R300_RI_INTERP_0] = R500_RS_COL_PTR(0) | R500_RS_COL_FMT(R300_RS_COL_FMT_0001);
 		++col_ip;
 	}
 
 	high_rr = (col_ip > tex_ip) ? col_ip : tex_ip;
-	r300->hw.rc.cmd[1] |= (rs_tex_count << R300_IT_COUNT_SHIFT)  | (col_ip << R300_IC_COUNT_SHIFT) | R300_HIRES_EN;
-	r300->hw.rc.cmd[2] |= 0xC0 | (high_rr - 1);
+	r300->hw.rc.cmd[1] = (rs_tex_count << R300_IT_COUNT_SHIFT) | (col_ip << R300_IC_COUNT_SHIFT) | R300_HIRES_EN;
+	r300->hw.rc.cmd[2] = 0xC0 | (high_rr - 1);
 
 	r300->hw.rr.cmd[R300_RR_CMD_0] = cmdpacket0(r300->radeon.radeonScreen, R500_RS_INST_0, high_rr);
+	r300->hw.ri.cmd[R300_RI_CMD_0] = cmdpacket0(r300->radeon.radeonScreen, R500_RS_IP_0, high_rr);
 
 	if (InputsRead)
 		WARN_ONCE("Don't know how to satisfy InputsRead=0x%08x\n", InputsRead);
 }
 
-
-
-
-#define bump_vpu_count(ptr, new_count)   do{\
-	drm_r300_cmd_header_t* _p=((drm_r300_cmd_header_t*)(ptr));\
-	int _nc=(new_count)/4; \
-	assert(_nc < 256); \
-	if(_nc>_p->vpu.count)_p->vpu.count=_nc;\
-	}while(0)
-
-static INLINE void r300SetupVertexProgramFragment(r300ContextPtr r300, int dest, struct r300_vertex_shader_fragment *vsf)
-{
-	int i;
-
-	if (vsf->length == 0)
-		return;
-
-	if (vsf->length & 0x3) {
-		fprintf(stderr, "VERTEX_SHADER_FRAGMENT must have length divisible by 4\n");
-		_mesa_exit(-1);
-	}
-
-	switch ((dest >> 8) & 0xf) {
-	case 0:
-		R300_STATECHANGE(r300, vpi);
-		for (i = 0; i < vsf->length; i++)
-			r300->hw.vpi.cmd[R300_VPI_INSTR_0 + i + 4 * (dest & 0xff)] = (vsf->body.d[i]);
-		bump_vpu_count(r300->hw.vpi.cmd, vsf->length + 4 * (dest & 0xff));
-		break;
-
-	case 2:
-		R300_STATECHANGE(r300, vpp);
-		for (i = 0; i < vsf->length; i++)
-			r300->hw.vpp.cmd[R300_VPP_PARAM_0 + i + 4 * (dest & 0xff)] = (vsf->body.d[i]);
-		bump_vpu_count(r300->hw.vpp.cmd, vsf->length + 4 * (dest & 0xff));
-		break;
-	case 4:
-		R300_STATECHANGE(r300, vps);
-		for (i = 0; i < vsf->length; i++)
-			r300->hw.vps.cmd[1 + i + 4 * (dest & 0xff)] = (vsf->body.d[i]);
-		bump_vpu_count(r300->hw.vps.cmd, vsf->length + 4 * (dest & 0xff));
-		break;
-	default:
-		fprintf(stderr, "%s:%s don't know how to handle dest %04x\n", __FILE__, __FUNCTION__, dest);
-		_mesa_exit(-1);
-	}
-}
-
 #define MIN3(a, b, c)	((a) < (b) ? MIN2(a, c) : MIN2(b, c))
 
-
-static void r300VapCntl(r300ContextPtr rmesa, GLuint input_count,
+void r300VapCntl(r300ContextPtr rmesa, GLuint input_count,
 			GLuint output_count, GLuint temp_count)
 {
     int vtx_mem_size;
@@ -1822,7 +1716,7 @@ static void r300VapCntl(r300ContextPtr rmesa, GLuint input_count,
     pvs_num_cntrls = MIN2(6, vtx_mem_size/temp_count);
 
     R300_STATECHANGE(rmesa, vap_cntl);
-    if (rmesa->radeon.radeonScreen->chip_flags & RADEON_CHIPSET_TCL) {
+    if (rmesa->options.hw_tcl_enabled) {
 	rmesa->hw.vap_cntl.cmd[R300_VAP_CNTL_INSTR] =
 	    (pvs_num_slots << R300_PVS_NUM_SLOTS_SHIFT) |
 	    (pvs_num_cntrls << R300_PVS_NUM_CNTLRS_SHIFT) |
@@ -1852,115 +1746,6 @@ static void r300VapCntl(r300ContextPtr rmesa, GLuint input_count,
 
 }
 
-static void r300SetupDefaultVertexProgram(r300ContextPtr rmesa)
-{
-	struct r300_vertex_shader_state *prog = &(rmesa->state.vertex_shader);
-	GLuint o_reg = 0;
-	GLuint i_reg = 0;
-	int i;
-	int inst_count = 0;
-	int param_count = 0;
-	int program_end = 0;
-
-	for (i = VERT_ATTRIB_POS; i < VERT_ATTRIB_MAX; i++) {
-		if (rmesa->state.sw_tcl_inputs[i] != -1) {
-			prog->program.body.i[program_end + 0] = PVS_OP_DST_OPERAND(VE_MULTIPLY, GL_FALSE, GL_FALSE, o_reg++, VSF_FLAG_ALL, PVS_DST_REG_OUT);
-			prog->program.body.i[program_end + 1] = PVS_SRC_OPERAND(rmesa->state.sw_tcl_inputs[i], PVS_SRC_SELECT_X, PVS_SRC_SELECT_Y, PVS_SRC_SELECT_Z, PVS_SRC_SELECT_W, PVS_SRC_REG_INPUT, VSF_FLAG_NONE);
-			prog->program.body.i[program_end + 2] = PVS_SRC_OPERAND(rmesa->state.sw_tcl_inputs[i], PVS_SRC_SELECT_FORCE_1, PVS_SRC_SELECT_FORCE_1, PVS_SRC_SELECT_FORCE_1, PVS_SRC_SELECT_FORCE_1, PVS_SRC_REG_INPUT, VSF_FLAG_NONE);
-			prog->program.body.i[program_end + 3] = PVS_SRC_OPERAND(rmesa->state.sw_tcl_inputs[i], PVS_SRC_SELECT_FORCE_1, PVS_SRC_SELECT_FORCE_1, PVS_SRC_SELECT_FORCE_1, PVS_SRC_SELECT_FORCE_1, PVS_SRC_REG_INPUT, VSF_FLAG_NONE);
-			program_end += 4;
-			i_reg++;
-		}
-	}
-
-	prog->program.length = program_end;
-
-	r300SetupVertexProgramFragment(rmesa, R300_PVS_CODE_START,
-				       &(prog->program));
-	inst_count = (prog->program.length / 4) - 1;
-
-	r300VapCntl(rmesa, i_reg, o_reg, 0);
-
-	R300_STATECHANGE(rmesa, pvs);
-	rmesa->hw.pvs.cmd[R300_PVS_CNTL_1] =
-	    (0 << R300_PVS_FIRST_INST_SHIFT) |
-	    (inst_count << R300_PVS_XYZW_VALID_INST_SHIFT) |
-	    (inst_count << R300_PVS_LAST_INST_SHIFT);
-	rmesa->hw.pvs.cmd[R300_PVS_CNTL_2] =
-	    (0 << R300_PVS_CONST_BASE_OFFSET_SHIFT) |
-	    (param_count << R300_PVS_MAX_CONST_ADDR_SHIFT);
-	rmesa->hw.pvs.cmd[R300_PVS_CNTL_3] =
-	    (inst_count << R300_PVS_LAST_VTX_SRC_INST_SHIFT);
-}
-
-static int bit_count (int x)
-{
-    x = ((x & 0xaaaaaaaaU) >> 1) + (x & 0x55555555U);
-    x = ((x & 0xccccccccU) >> 2) + (x & 0x33333333U);
-    x = (x >> 16) + (x & 0xffff);
-    x = ((x & 0xf0f0) >> 4) + (x & 0x0f0f);
-    return (x >> 8) + (x & 0x00ff);
-}
-
-static void r300SetupRealVertexProgram(r300ContextPtr rmesa)
-{
-	GLcontext *ctx = rmesa->radeon.glCtx;
-	struct r300_vertex_program *prog = (struct r300_vertex_program *)CURRENT_VERTEX_SHADER(ctx);
-	int inst_count = 0;
-	int param_count = 0;
-
-	/* FIXME: r300SetupVertexProgramFragment */
-	R300_STATECHANGE(rmesa, vpp);
-	param_count =
-	    r300VertexProgUpdateParams(ctx,
-				       (struct r300_vertex_program_cont *)
-				       ctx->VertexProgram._Current,
-				       (float *)&rmesa->hw.vpp.
-				       cmd[R300_VPP_PARAM_0]);
-	bump_vpu_count(rmesa->hw.vpp.cmd, param_count);
-	param_count /= 4;
-
-	r300SetupVertexProgramFragment(rmesa, R300_PVS_CODE_START, &(prog->program));
-	inst_count = (prog->program.length / 4) - 1;
-
-	r300VapCntl(rmesa, bit_count(prog->key.InputsRead),
-		    bit_count(prog->key.OutputsWritten), prog->num_temporaries);
-
-	R300_STATECHANGE(rmesa, pvs);
-	rmesa->hw.pvs.cmd[R300_PVS_CNTL_1] =
-	  (0 << R300_PVS_FIRST_INST_SHIFT) |
-	  (inst_count << R300_PVS_XYZW_VALID_INST_SHIFT) |
-	  (inst_count << R300_PVS_LAST_INST_SHIFT);
-	rmesa->hw.pvs.cmd[R300_PVS_CNTL_2] =
-	  (0 << R300_PVS_CONST_BASE_OFFSET_SHIFT) |
-	  (param_count << R300_PVS_MAX_CONST_ADDR_SHIFT);
-	rmesa->hw.pvs.cmd[R300_PVS_CNTL_3] =
-	  (inst_count << R300_PVS_LAST_VTX_SRC_INST_SHIFT);
-}
-
-
-static void r300SetupVertexProgram(r300ContextPtr rmesa)
-{
-	GLcontext *ctx = rmesa->radeon.glCtx;
-
-	/* Reset state, in case we don't use something */
-	((drm_r300_cmd_header_t *) rmesa->hw.vpp.cmd)->vpu.count = 0;
-	((drm_r300_cmd_header_t *) rmesa->hw.vpi.cmd)->vpu.count = 0;
-	((drm_r300_cmd_header_t *) rmesa->hw.vps.cmd)->vpu.count = 0;
-
-	/* Not sure why this doesnt work...
-	   0x400 area might have something to do with pixel shaders as it appears right after pfs programming.
-	   0x406 is set to { 0.0, 0.0, 1.0, 0.0 } most of the time but should change with smooth points and in other rare cases. */
-	//setup_vertex_shader_fragment(rmesa, 0x406, &unk4);
-	if (hw_tcl_on && ((struct r300_vertex_program *)CURRENT_VERTEX_SHADER(ctx))->translated) {
-		r300SetupRealVertexProgram(rmesa);
-	} else {
-		/* FIXME: This needs to be replaced by vertex shader generation code. */
-		r300SetupDefaultVertexProgram(rmesa);
-	}
-
-}
-
 /**
  * Enable/Disable states.
  *
@@ -1975,14 +1760,6 @@ static void r300Enable(GLcontext * ctx, GLenum cap, GLboolean state)
 			state ? "GL_TRUE" : "GL_FALSE");
 
 	switch (cap) {
-	case GL_TEXTURE_1D:
-	case GL_TEXTURE_2D:
-	case GL_TEXTURE_3D:
-		/* empty */
-		break;
-	case GL_FOG:
-		/* empty */
-		break;
 	case GL_ALPHA_TEST:
 		r300SetAlphaState(ctx);
 		break;
@@ -2000,14 +1777,31 @@ static void r300Enable(GLcontext * ctx, GLenum cap, GLboolean state)
 	case GL_CLIP_PLANE5:
 		r300SetClipPlaneState(ctx, cap, state);
 		break;
+	case GL_CULL_FACE:
+		r300UpdateCulling(ctx);
+		break;
 	case GL_DEPTH_TEST:
 		r300SetDepthState(ctx);
 		break;
-	case GL_STENCIL_TEST:
-		r300SetStencilState(ctx, state);
+	case GL_LINE_SMOOTH:
+		if (rmesa->options.conformance_mode)
+			r300SwitchFallback(ctx, R300_FALLBACK_LINE_SMOOTH, ctx->Line.SmoothFlag);
 		break;
-	case GL_CULL_FACE:
-		r300UpdateCulling(ctx);
+	case GL_LINE_STIPPLE:
+		if (rmesa->options.conformance_mode)
+			r300SwitchFallback(ctx, R300_FALLBACK_LINE_STIPPLE, ctx->Line.StippleFlag);
+		break;
+	case GL_POINT_SMOOTH:
+		if (rmesa->options.conformance_mode)
+			r300SwitchFallback(ctx, R300_FALLBACK_POINT_SMOOTH, ctx->Point.SmoothFlag);
+		break;
+	case GL_POLYGON_SMOOTH:
+		if (rmesa->options.conformance_mode)
+			r300SwitchFallback(ctx, R300_FALLBACK_POLYGON_SMOOTH, ctx->Polygon.SmoothFlag);
+		break;
+	case GL_POLYGON_STIPPLE:
+		if (rmesa->options.conformance_mode)
+			r300SwitchFallback(ctx, R300_FALLBACK_POLYGON_STIPPLE, ctx->Polygon.StippleFlag);
 		break;
 	case GL_POLYGON_OFFSET_POINT:
 	case GL_POLYGON_OFFSET_LINE:
@@ -2019,6 +1813,9 @@ static void r300Enable(GLcontext * ctx, GLenum cap, GLboolean state)
 		rmesa->radeon.state.scissor.enabled = state;
 		radeonUpdateScissor( ctx );
 		break;
+	case GL_STENCIL_TEST:
+		r300SetStencilState(ctx, state);
+		break;
 	default:
 		break;
 	}
@@ -2030,10 +1827,9 @@ static void r300Enable(GLcontext * ctx, GLenum cap, GLboolean state)
 static void r300ResetHwState(r300ContextPtr r300)
 {
 	GLcontext *ctx = r300->radeon.glCtx;
-	int has_tcl = 1;
+	int has_tcl;
 
-	if (!(r300->radeon.radeonScreen->chip_flags & RADEON_CHIPSET_TCL))
-		has_tcl = 0;
+	has_tcl = r300->options.hw_tcl_enabled;
 
 	if (RADEON_DEBUG & DEBUG_STATE)
 		fprintf(stderr, "%s\n", __FUNCTION__);
@@ -2138,8 +1934,8 @@ static void r300ResetHwState(r300ContextPtr r300)
 	}
 
 	/* XXX: Enable anti-aliasing? */
-	r300->hw.gb_misc.cmd[R300_GB_MISC_AA_CONFIG] = GB_AA_CONFIG_AA_DISABLE;
-	r300->hw.gb_misc.cmd[R300_GB_MISC_SELECT] = 0;
+	r300->hw.gb_misc2.cmd[R300_GB_MISC2_AA_CONFIG] = GB_AA_CONFIG_AA_DISABLE;
+	r300->hw.gb_misc2.cmd[R300_GB_MISC2_SELECT] = 0;
 
 	r300->hw.ga_point_s0.cmd[1] = r300PackFloat32(0.0);
 	r300->hw.ga_point_s0.cmd[2] = r300PackFloat32(0.0);
@@ -2210,8 +2006,8 @@ static void r300ResetHwState(r300ContextPtr r300)
 
 	r300->hw.rb3d_aaresolve_ctl.cmd[1] = 0;
 
-	r300->hw.rb3d_discard_src_pixel_lte_threshold.cmd[1] = 0x00000000;
-	r300->hw.rb3d_discard_src_pixel_lte_threshold.cmd[2] = 0xffffffff;
+    r300->hw.rb3d_discard_src_pixel_lte_threshold.cmd[1] = 0x00000000;
+    r300->hw.rb3d_discard_src_pixel_lte_threshold.cmd[2] = 0xffffffff;
 
 	r300->hw.zb_depthclearvalue.cmd[1] = 0;
 
@@ -2220,8 +2016,8 @@ static void r300ResetHwState(r300ContextPtr r300)
 	r300->hw.zstencil_format.cmd[4] = 0x00000000;
 	r300SetEarlyZState(ctx);
 
-	r300->hw.unk4F30.cmd[1] = 0;
-	r300->hw.unk4F30.cmd[2] = 0;
+	r300->hw.zb_zmask.cmd[1] = 0;
+	r300->hw.zb_zmask.cmd[2] = 0;
 
 	r300->hw.zb_hiz_offset.cmd[1] = 0;
 
@@ -2241,14 +2037,20 @@ static void r300ResetHwState(r300ContextPtr r300)
 void r300UpdateShaders(r300ContextPtr rmesa)
 {
 	GLcontext *ctx;
-	struct r300_vertex_program *vp;
+	struct r300_fragment_program *fp;
 	int i;
 
 	ctx = rmesa->radeon.glCtx;
+	fp = (struct r300_fragment_program *) ctx->FragmentProgram._Current;
 
-	if (rmesa->radeon.NewGLState && hw_tcl_on) {
-		rmesa->radeon.NewGLState = 0;
+	/* should only happenen once, just after context is created */
+	/* TODO: shouldn't we fallback to sw here? */
+	if (!fp) {
+		_mesa_fprintf(stderr, "No ctx->FragmentProgram._Current!!\n");
+		return;
+	}
 
+	if (rmesa->radeon.NewGLState && rmesa->options.hw_tcl_enabled) {
 		for (i = _TNL_FIRST_MAT; i <= _TNL_LAST_MAT; i++) {
 			rmesa->temp_attrib[i] =
 			    TNL_CONTEXT(ctx)->vb.AttribPtr[i];
@@ -2264,20 +2066,16 @@ void r300UpdateShaders(r300ContextPtr rmesa)
 		}
 
 		r300SelectVertexShader(rmesa);
-		vp = (struct r300_vertex_program *)
-		    CURRENT_VERTEX_SHADER(ctx);
-		/*if (vp->translated == GL_FALSE)
-		   r300TranslateVertexShader(vp); */
-		if (vp->translated == GL_FALSE) {
-			fprintf(stderr, "Failing back to sw-tcl\n");
-			hw_tcl_on = future_hw_tcl_on = 0;
-			r300ResetHwState(rmesa);
-
-			r300UpdateStateParameters(ctx, _NEW_PROGRAM);
-			return;
-		}
+		r300SwitchFallback(ctx, R300_FALLBACK_VERTEX_PROGRAM, rmesa->selected_vp->error);
 	}
-	r300UpdateStateParameters(ctx, _NEW_PROGRAM);
+
+	if (!fp->translated || rmesa->radeon.NewGLState)
+		r300TranslateFragmentShader(ctx, ctx->FragmentProgram._Current);
+
+	r300SwitchFallback(ctx, R300_FALLBACK_FRAGMENT_PROGRAM, fp->error);
+
+	r300UpdateStateParameters(ctx, _NEW_PROGRAM | _NEW_PROGRAM_CONSTANTS);
+	rmesa->radeon.NewGLState = 0;
 }
 
 static const GLfloat *get_fragmentprogram_constant(GLcontext *ctx,
@@ -2301,26 +2099,14 @@ static const GLfloat *get_fragmentprogram_constant(GLcontext *ctx,
 }
 
 
-static void r300SetupPixelShader(r300ContextPtr rmesa)
+static void r300SetupPixelShader(GLcontext *ctx)
 {
-	GLcontext *ctx = rmesa->radeon.glCtx;
-	struct r300_fragment_program *fp = (struct r300_fragment_program *)
-	    (char *)ctx->FragmentProgram._Current;
+	r300ContextPtr rmesa = R300_CONTEXT(ctx);
+	struct r300_fragment_program *fp = (struct r300_fragment_program *) ctx->FragmentProgram._Current;
 	struct r300_fragment_program_code *code;
 	int i, k;
 
-	if (!fp)		/* should only happenen once, just after context is created */
-		return;
-
-	r300TranslateFragmentShader(rmesa, fp);
-	if (!fp->translated) {
-		fprintf(stderr, "%s: No valid fragment shader, exiting\n",
-			__FUNCTION__);
-		return;
-	}
-	code = &fp->code;
-
-	r300SetupTextures(ctx);
+	code = &fp->code.r300;
 
 	R300_STATECHANGE(rmesa, fpi[0]);
 	R300_STATECHANGE(rmesa, fpi[1]);
@@ -2363,7 +2149,7 @@ static void r300SetupPixelShader(r300ContextPtr rmesa)
 	rmesa->hw.fpp.cmd[R300_FPP_CMD_0] = cmdpacket0(rmesa->radeon.radeonScreen, R300_PFS_PARAM_0_X, code->const_nr * 4);
 	for (i = 0; i < code->const_nr; i++) {
 		const GLfloat *constant = get_fragmentprogram_constant(ctx,
-			&fp->mesa_program.Base, code->constant[i]);
+			&fp->Base.Base, code->constant[i]);
 		rmesa->hw.fpp.cmd[R300_FPP_PARAM_0 + 4 * i + 0] = r300PackFloat24(constant[0]);
 		rmesa->hw.fpp.cmd[R300_FPP_PARAM_0 + 4 * i + 1] = r300PackFloat24(constant[1]);
 		rmesa->hw.fpp.cmd[R300_FPP_PARAM_0 + 4 * i + 2] = r300PackFloat24(constant[2]);
@@ -2385,29 +2171,17 @@ static void r300SetupPixelShader(r300ContextPtr rmesa)
 	if(_nc>_p->r500fp.count)_p->r500fp.count=_nc;\
 } while(0)
 
-static void r500SetupPixelShader(r300ContextPtr rmesa)
+static void r500SetupPixelShader(GLcontext *ctx)
 {
-	GLcontext *ctx = rmesa->radeon.glCtx;
-	struct r500_fragment_program *fp = (struct r500_fragment_program *)
-	    (char *)ctx->FragmentProgram._Current;
+	r300ContextPtr rmesa = R300_CONTEXT(ctx);
+	struct r300_fragment_program *fp = (struct r300_fragment_program *) ctx->FragmentProgram._Current;
 	int i;
 	struct r500_fragment_program_code *code;
 
-	if (!fp)		/* should only happenen once, just after context is created */
-		return;
-
 	((drm_r300_cmd_header_t *) rmesa->hw.r500fp.cmd)->r500fp.count = 0;
 	((drm_r300_cmd_header_t *) rmesa->hw.r500fp_const.cmd)->r500fp.count = 0;
 
-	r500TranslateFragmentShader(rmesa, fp);
-	if (!fp->translated) {
-		fprintf(stderr, "%s: No valid fragment shader, exiting\n",
-			__FUNCTION__);
-		return;
-	}
-	code = &fp->code;
-
-	r300SetupTextures(ctx);
+	code = &fp->code.r500;
 
 	R300_STATECHANGE(rmesa, fp);
 	rmesa->hw.fp.cmd[R500_FP_PIXSIZE] = code->max_temp_idx;
@@ -2437,57 +2211,96 @@ static void r500SetupPixelShader(r300ContextPtr rmesa)
 	R300_STATECHANGE(rmesa, r500fp_const);
 	for (i = 0; i < code->const_nr; i++) {
 		const GLfloat *constant = get_fragmentprogram_constant(ctx,
-			&fp->mesa_program.Base, code->constant[i]);
+			&fp->Base.Base, code->constant[i]);
 		rmesa->hw.r500fp_const.cmd[R300_FPP_PARAM_0 + 4 * i + 0] = r300PackFloat32(constant[0]);
 		rmesa->hw.r500fp_const.cmd[R300_FPP_PARAM_0 + 4 * i + 1] = r300PackFloat32(constant[1]);
 		rmesa->hw.r500fp_const.cmd[R300_FPP_PARAM_0 + 4 * i + 2] = r300PackFloat32(constant[2]);
 		rmesa->hw.r500fp_const.cmd[R300_FPP_PARAM_0 + 4 * i + 3] = r300PackFloat32(constant[3]);
 	}
 	bump_r500fp_const_count(rmesa->hw.r500fp_const.cmd, code->const_nr * 4);
+}
+
+void r300SetupVAP(GLcontext *ctx, GLuint InputsRead, GLuint OutputsWritten)
+{
+	r300ContextPtr rmesa = R300_CONTEXT( ctx );
+	struct vertex_attribute *attrs = rmesa->vbuf.attribs;
+	int i, j, reg_count;
+	uint32_t *vir0 = &rmesa->hw.vir[0].cmd[1];
+	uint32_t *vir1 = &rmesa->hw.vir[1].cmd[1];
+
+	for (i = 0; i < R300_VIR_CMDSIZE-1; ++i)
+		vir0[i] = vir1[i] = 0;
+
+	for (i = 0, j = 0; i < rmesa->vbuf.num_attribs; ++i) {
+		int tmp;
+
+		tmp = attrs[i].data_type | (attrs[i].dst_loc << R300_DST_VEC_LOC_SHIFT);
+		if (attrs[i]._signed)
+			tmp |= R300_SIGNED;
+		if (attrs[i].normalize)
+			tmp |= R300_NORMALIZE;
+
+		if (i % 2 == 0) {
+			vir0[j] = tmp << R300_DATA_TYPE_0_SHIFT;
+			vir1[j] = attrs[i].swizzle | (attrs[i].write_mask << R300_WRITE_ENA_SHIFT);
+		} else {
+			vir0[j] |= tmp << R300_DATA_TYPE_1_SHIFT;
+			vir1[j] |= (attrs[i].swizzle | (attrs[i].write_mask << R300_WRITE_ENA_SHIFT)) << R300_SWIZZLE1_SHIFT;
+			++j;
+		}
+	}
 
+	reg_count = (rmesa->vbuf.num_attribs + 1) >> 1;
+	if (rmesa->vbuf.num_attribs % 2 != 0) {
+		vir0[reg_count-1] |= R300_LAST_VEC << R300_DATA_TYPE_0_SHIFT;
+	} else {
+		vir0[reg_count-1] |= R300_LAST_VEC << R300_DATA_TYPE_1_SHIFT;
+	}
+
+	R300_STATECHANGE(rmesa, vir[0]);
+	R300_STATECHANGE(rmesa, vir[1]);
+	R300_STATECHANGE(rmesa, vof);
+	R300_STATECHANGE(rmesa, vic);
+
+	if (rmesa->radeon.radeonScreen->kernel_mm) {
+		rmesa->hw.vir[0].cmd[0] &= 0xC000FFFF;
+		rmesa->hw.vir[1].cmd[0] &= 0xC000FFFF;
+		rmesa->hw.vir[0].cmd[0] |= (reg_count & 0x3FFF) << 16;
+		rmesa->hw.vir[1].cmd[0] |= (reg_count & 0x3FFF) << 16;
+	} else {
+		((drm_r300_cmd_header_t *) rmesa->hw.vir[0].cmd)->packet0.count = reg_count;
+		((drm_r300_cmd_header_t *) rmesa->hw.vir[1].cmd)->packet0.count = reg_count;
+	}
+
+	rmesa->hw.vic.cmd[R300_VIC_CNTL_0] = r300VAPInputCntl0(ctx, InputsRead);
+	rmesa->hw.vic.cmd[R300_VIC_CNTL_1] = r300VAPInputCntl1(ctx, InputsRead);
+	rmesa->hw.vof.cmd[R300_VOF_CNTL_0] = r300VAPOutputCntl0(ctx, OutputsWritten, ctx->FragmentProgram._Current->Base.InputsRead);
+	rmesa->hw.vof.cmd[R300_VOF_CNTL_1] = r300VAPOutputCntl1(ctx, OutputsWritten, ctx->FragmentProgram._Current->Base.InputsRead);
 }
 
 void r300UpdateShaderStates(r300ContextPtr rmesa)
 {
 	GLcontext *ctx;
 	ctx = rmesa->radeon.glCtx;
+	struct r300_fragment_program *r300_fp;
 
-	r300SetEarlyZState(ctx);
+	r300_fp = (struct r300_fragment_program *) ctx->FragmentProgram._Current;
 
-	/* w_fmt value is set to get best performance
-	 * see p.130 R5xx 3D acceleration guide v1.3 */
-	GLuint w_fmt, fgdepthsrc;
-	if (current_fragment_program_writes_depth(ctx)) {
-		fgdepthsrc = R300_FG_DEPTH_SRC_SHADER;
-		w_fmt = R300_W_FMT_W24 | R300_W_SRC_US;
-	} else {
-		fgdepthsrc = R300_FG_DEPTH_SRC_SCAN;
-		w_fmt = R300_W_FMT_W0 | R300_W_SRC_US;
-	}
+	/* should only happenen once, just after context is created */
+	if (!r300_fp)
+		return;
 
-	if (w_fmt != rmesa->hw.us_out_fmt.cmd[5]) {
-		R300_STATECHANGE(rmesa, us_out_fmt);
-		rmesa->hw.us_out_fmt.cmd[5] = w_fmt;
-	}
+	r300SetEarlyZState(ctx);
 
-	if (fgdepthsrc != rmesa->hw.fg_depth_src.cmd[1]) {
-		R300_STATECHANGE(rmesa, fg_depth_src);
-		rmesa->hw.fg_depth_src.cmd[1] = fgdepthsrc;
-	}
+	r300SetupTextures(ctx);
 
-	if (rmesa->radeon.radeonScreen->chip_family >= CHIP_FAMILY_RV515)
-		r500SetupPixelShader(rmesa);
-	else
-		r300SetupPixelShader(rmesa);
+	rmesa->vtbl.SetupPixelShader(ctx);
 
-	if (rmesa->radeon.radeonScreen->chip_family >= CHIP_FAMILY_RV515)
-		r500SetupRSUnit(ctx);
-	else
-		r300SetupRSUnit(ctx);
+	rmesa->vtbl.SetupRSUnit(ctx);
 
-	if ((rmesa->radeon.radeonScreen->chip_flags & RADEON_CHIPSET_TCL))
+	if (rmesa->options.hw_tcl_enabled) {
 		r300SetupVertexProgram(rmesa);
-
+	}
 }
 
 /**
@@ -2501,9 +2314,8 @@ static void r300InvalidateState(GLcontext * ctx, GLuint new_state)
 	_swsetup_InvalidateState(ctx, new_state);
 	_vbo_InvalidateState(ctx, new_state);
 	_tnl_InvalidateState(ctx, new_state);
-	_ae_invalidate_state(ctx, new_state);
 
-	if (new_state & (_NEW_BUFFERS | _NEW_COLOR | _NEW_PIXEL)) {
+	if (new_state & _NEW_BUFFERS) {
 		_mesa_update_framebuffer(ctx);
 		/* this updates the DrawBuffer's Width/Height if it's a FBO */
 		_mesa_update_draw_buffer_bounds(ctx);
@@ -2523,34 +2335,12 @@ static void r300InvalidateState(GLcontext * ctx, GLuint new_state)
  */
 void r300InitState(r300ContextPtr r300)
 {
-	memset(&(r300->state.texture), 0, sizeof(r300->state.texture));
-
 	r300ResetHwState(r300);
 }
 
 static void r300RenderMode(GLcontext * ctx, GLenum mode)
 {
-	r300ContextPtr rmesa = R300_CONTEXT(ctx);
-	(void)rmesa;
-	(void)mode;
-}
-
-void r300UpdateClipPlanes( GLcontext *ctx )
-{
-	r300ContextPtr rmesa = R300_CONTEXT(ctx);
-	GLuint p;
-
-	for (p = 0; p < ctx->Const.MaxClipPlanes; p++) {
-		if (ctx->Transform.ClipPlanesEnabled & (1 << p)) {
-			GLint *ip = (GLint *)ctx->Transform._ClipUserPlane[p];
-
-			R300_STATECHANGE( rmesa, vpucp[p] );
-			rmesa->hw.vpucp[p].cmd[R300_VPUCP_X] = ip[0];
-			rmesa->hw.vpucp[p].cmd[R300_VPUCP_Y] = ip[1];
-			rmesa->hw.vpucp[p].cmd[R300_VPUCP_Z] = ip[2];
-			rmesa->hw.vpucp[p].cmd[R300_VPUCP_W] = ip[3];
-		}
-	}
+	r300SwitchFallback(ctx, R300_FALLBACK_RENDER_MODE, ctx->RenderMode != GL_RENDER);
 }
 
 /**
@@ -2598,3 +2388,20 @@ void r300InitStateFuncs(struct dd_function_table *functions)
 	functions->DrawBuffer		= radeonDrawBuffer;
 	functions->ReadBuffer		= radeonReadBuffer;
 }
+
+void r300InitShaderFunctions(r300ContextPtr r300)
+{
+	if (r300->radeon.radeonScreen->chip_family >= CHIP_FAMILY_RV515) {
+		r300->vtbl.SetupRSUnit = r500SetupRSUnit;
+		r300->vtbl.SetupPixelShader = r500SetupPixelShader;
+		r300->vtbl.SetupFragmentShaderTextures = r500SetupFragmentShaderTextures;
+		r300->vtbl.BuildFragmentProgramHwCode = r500BuildFragmentProgramHwCode;
+		r300->vtbl.FragmentProgramDump = r500FragmentProgramDump;
+	} else {
+		r300->vtbl.SetupRSUnit = r300SetupRSUnit;
+		r300->vtbl.SetupPixelShader = r300SetupPixelShader;
+		r300->vtbl.SetupFragmentShaderTextures = r300SetupFragmentShaderTextures;
+		r300->vtbl.BuildFragmentProgramHwCode = r300BuildFragmentProgramHwCode;
+		r300->vtbl.FragmentProgramDump = r300FragmentProgramDump;
+	}
+}
diff --git a/src/mesa/drivers/dri/r300/r300_state.h b/src/mesa/drivers/dri/r300/r300_state.h
index 247a20ee516..23282894202 100644
--- a/src/mesa/drivers/dri/r300/r300_state.h
+++ b/src/mesa/drivers/dri/r300/r300_state.h
@@ -50,16 +50,14 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 		r300->radeon.hw.is_dirty = GL_TRUE;		\
 	} while(0)
 
-// r300_state.c
-extern int future_hw_tcl_on;
-void _tnl_UpdateFixedFunctionProgram (GLcontext * ctx);
 void r300UpdateViewportOffset (GLcontext * ctx);
 void r300UpdateDrawBuffer (GLcontext * ctx);
 void r300UpdateStateParameters (GLcontext * ctx, GLuint new_state);
 void r300UpdateShaders (r300ContextPtr rmesa);
 void r300UpdateShaderStates (r300ContextPtr rmesa);
 void r300InitState (r300ContextPtr r300);
-void r300UpdateClipPlanes (GLcontext * ctx);
 void r300InitStateFuncs (struct dd_function_table *functions);
+void r300VapCntl(r300ContextPtr rmesa, GLuint input_count, GLuint output_count, GLuint temp_count);
+void r300SetupVAP(GLcontext *ctx, GLuint InputsRead, GLuint OutputsWritten);
 
 #endif				/* __R300_STATE_H__ */
diff --git a/src/mesa/drivers/dri/r300/r300_swtcl.c b/src/mesa/drivers/dri/r300/r300_swtcl.c
index 03c1521de7e..ce4179208eb 100644
--- a/src/mesa/drivers/dri/r300/r300_swtcl.c
+++ b/src/mesa/drivers/dri/r300/r300_swtcl.c
@@ -34,9 +34,11 @@ USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "tnl/tnl.h"
 #include "tnl/t_pipeline.h"
 
+#include "r300_state.h"
 #include "r300_swtcl.h"
 #include "r300_emit.h"
 #include "r300_tex.h"
+#include "r300_render.h"
 
 #define EMIT_ATTR( ATTR, STYLE )					\
 do {									\
@@ -53,199 +55,130 @@ do {									\
    rmesa->radeon.swtcl.vertex_attr_count++;					\
 } while (0)
 
-#define ADD_ATTR(_attr, _format, _dst_loc, _swizzle, _write_mask) \
+#define ADD_ATTR(_attr, _format, _dst_loc, _swizzle, _write_mask, _normalize) \
 do { \
-	attrs[num_attrs].attr = (_attr); \
-	attrs[num_attrs].format = (_format); \
+	attrs[num_attrs].element = (_attr); \
+	attrs[num_attrs].data_type = (_format); \
 	attrs[num_attrs].dst_loc = (_dst_loc); \
 	attrs[num_attrs].swizzle = (_swizzle); \
 	attrs[num_attrs].write_mask = (_write_mask); \
+	attrs[num_attrs]._signed = 0; \
+	attrs[num_attrs].normalize = (_normalize); \
 	++num_attrs; \
 } while (0)
 
-static void r300SwtclVAPSetup(GLcontext *ctx, GLuint InputsRead, GLuint OutputsWritten, GLuint vap_out_fmt_1)
+void r300ChooseSwtclVertexFormat(GLcontext *ctx, GLuint *_InputsRead,  GLuint *_OutputsWritten)
 {
 	r300ContextPtr rmesa = R300_CONTEXT( ctx );
 	TNLcontext *tnl = TNL_CONTEXT(ctx);
 	struct vertex_buffer *VB = &tnl->vb;
-	struct vertex_attribute *attrs = rmesa->swtcl.vert_attrs;
-	int vte = 0;
-	int i, j, reg_count;
-	uint32_t *vir0 = &rmesa->hw.vir[0].cmd[1];
-	uint32_t *vir1 = &rmesa->hw.vir[1].cmd[1];
-
-	for (i = 0; i < R300_VIR_CMDSIZE-1; ++i)
-		vir0[i] = vir1[i] = 0;
-
-	for (i = 0, j = 0; i < rmesa->radeon.swtcl.vertex_attr_count; ++i) {
-		int tmp, data_format;
-		switch (attrs[i].format) {
-			case EMIT_1F:
-				data_format = R300_DATA_TYPE_FLOAT_1;
-				break;
-			case EMIT_2F:
-				data_format = R300_DATA_TYPE_FLOAT_2;
-				break;
-			case EMIT_3F:
-				data_format = R300_DATA_TYPE_FLOAT_3;
-				break;
-			case EMIT_4F:
-				data_format = R300_DATA_TYPE_FLOAT_4;
-				break;
-			case EMIT_4UB_4F_RGBA:
-			case EMIT_4UB_4F_ABGR:
-				data_format = R300_DATA_TYPE_BYTE | R300_NORMALIZE;
-				break;
-			default:
-				fprintf(stderr, "%s: Invalid data format type", __FUNCTION__);
-				_mesa_exit(-1);
-				break;
-		}
-
-		tmp = data_format | (attrs[i].dst_loc << R300_DST_VEC_LOC_SHIFT);
-		if (i % 2 == 0) {
-			vir0[j] = tmp << R300_DATA_TYPE_0_SHIFT;
-			vir1[j] = attrs[i].swizzle | (attrs[i].write_mask << R300_WRITE_ENA_SHIFT);
-		} else {
-			vir0[j] |= tmp << R300_DATA_TYPE_1_SHIFT;
-			vir1[j] |= (attrs[i].swizzle | (attrs[i].write_mask << R300_WRITE_ENA_SHIFT)) << R300_SWIZZLE1_SHIFT;
-			++j;
-		}
-	}
-
-	reg_count = (rmesa->radeon.swtcl.vertex_attr_count + 1) >> 1;
-	if (rmesa->radeon.swtcl.vertex_attr_count % 2 != 0) {
-		vir0[reg_count-1] |= R300_LAST_VEC << R300_DATA_TYPE_0_SHIFT;
-	} else {
-		vir0[reg_count-1] |= R300_LAST_VEC << R300_DATA_TYPE_1_SHIFT;
-	}
-
-	R300_STATECHANGE(rmesa, vir[0]);
-	R300_STATECHANGE(rmesa, vir[1]);
-	R300_STATECHANGE(rmesa, vof);
-	R300_STATECHANGE(rmesa, vte);
-	R300_STATECHANGE(rmesa, vic);
-
-	if (rmesa->radeon.radeonScreen->kernel_mm) {
-		rmesa->hw.vir[0].cmd[0] &= 0xC000FFFF;
-		rmesa->hw.vir[1].cmd[0] &= 0xC000FFFF;
-		rmesa->hw.vir[0].cmd[0] |= (reg_count & 0x3FFF) << 16;
-		rmesa->hw.vir[1].cmd[0] |= (reg_count & 0x3FFF) << 16;
-	} else {
-		((drm_r300_cmd_header_t *) rmesa->hw.vir[0].cmd)->packet0.count = reg_count;
-		((drm_r300_cmd_header_t *) rmesa->hw.vir[1].cmd)->packet0.count = reg_count;
-	}
-
-	rmesa->hw.vic.cmd[R300_VIC_CNTL_0] = r300VAPInputCntl0(ctx, InputsRead);
-	rmesa->hw.vic.cmd[R300_VIC_CNTL_1] = r300VAPInputCntl1(ctx, InputsRead);
-	rmesa->hw.vof.cmd[R300_VOF_CNTL_0] = r300VAPOutputCntl0(ctx, OutputsWritten);
-	/**
-	  * Can't use r300VAPOutputCntl1 function because it assumes
-	  * that all texture coords have 4 components and that's the case
-	  * for HW TCL path, but not for SW TCL.
-	  */
-	rmesa->hw.vof.cmd[R300_VOF_CNTL_1] = vap_out_fmt_1;
-
-	vte = rmesa->hw.vte.cmd[1];
-	vte &= ~(R300_VTX_XY_FMT | R300_VTX_Z_FMT | R300_VTX_W0_FMT);
-	/* Important:
-	 */
-	if ( VB->NdcPtr != NULL ) {
-		VB->AttribPtr[VERT_ATTRIB_POS] = VB->NdcPtr;
-		vte |= R300_VTX_XY_FMT | R300_VTX_Z_FMT;
-	}
-	else {
-		VB->AttribPtr[VERT_ATTRIB_POS] = VB->ClipPtr;
-		vte |= R300_VTX_W0_FMT;
-	}
-
-	assert( VB->AttribPtr[VERT_ATTRIB_POS] != NULL );
-
-	rmesa->hw.vte.cmd[1] = vte;
-	rmesa->hw.vte.cmd[2] = rmesa->radeon.swtcl.vertex_size;
-}
-
-
-static void r300SetVertexFormat( GLcontext *ctx )
-{
-	r300ContextPtr rmesa = R300_CONTEXT( ctx );
-	TNLcontext *tnl = TNL_CONTEXT(ctx);
-	struct vertex_buffer *VB = &tnl->vb;
-	int first_free_tex = 0, vap_out_fmt_1 = 0;
+	int first_free_tex = 0;
 	GLuint InputsRead = 0;
 	GLuint OutputsWritten = 0;
 	int num_attrs = 0;
-	struct vertex_attribute *attrs = rmesa->swtcl.vert_attrs;
+	GLuint fp_reads = ctx->FragmentProgram._Current->Base.InputsRead;
+	struct vertex_attribute *attrs = rmesa->vbuf.attribs;
 
 	rmesa->swtcl.coloroffset = rmesa->swtcl.specoffset = 0;
 	rmesa->radeon.swtcl.vertex_attr_count = 0;
 
-	if (RENDERINPUTS_TEST(tnl->render_inputs_bitset, _TNL_ATTRIB_POS)) {
-		InputsRead |= 1 << VERT_ATTRIB_POS;
-		OutputsWritten |= 1 << VERT_RESULT_HPOS;
-		EMIT_ATTR( _TNL_ATTRIB_POS, EMIT_4F );
-		ADD_ATTR(VERT_ATTRIB_POS, EMIT_4F, SWTCL_OVM_POS, SWIZZLE_XYZW, MASK_XYZW);
-		rmesa->swtcl.coloroffset = 4;
-	}
+	/* We always want non Ndc coords format */
+	VB->AttribPtr[VERT_ATTRIB_POS] = VB->ClipPtr;
+
+	/* Always write position vector */
+	InputsRead |= 1 << VERT_ATTRIB_POS;
+	OutputsWritten |= 1 << VERT_RESULT_HPOS;
+	EMIT_ATTR( _TNL_ATTRIB_POS, EMIT_4F );
+	ADD_ATTR(VERT_ATTRIB_POS, R300_DATA_TYPE_FLOAT_4, SWTCL_OVM_POS, SWIZZLE_XYZW, MASK_XYZW, 0);
+	rmesa->swtcl.coloroffset = 4;
 
-	if (RENDERINPUTS_TEST(tnl->render_inputs_bitset, _TNL_ATTRIB_COLOR0)) {
+	if (fp_reads & FRAG_BIT_COL0) {
 		InputsRead |= 1 << VERT_ATTRIB_COLOR0;
 		OutputsWritten |= 1 << VERT_RESULT_COL0;
 #if MESA_LITTLE_ENDIAN
 		EMIT_ATTR( _TNL_ATTRIB_COLOR0, EMIT_4UB_4F_RGBA );
-		ADD_ATTR(VERT_ATTRIB_COLOR0, EMIT_4UB_4F_RGBA, SWTCL_OVM_COLOR0, SWIZZLE_XYZW, MASK_XYZW);
+		ADD_ATTR(VERT_ATTRIB_COLOR0, R300_DATA_TYPE_BYTE, SWTCL_OVM_COLOR0, SWIZZLE_XYZW, MASK_XYZW, 1);
 #else
 		EMIT_ATTR( _TNL_ATTRIB_COLOR0, EMIT_4UB_4F_ABGR );
-		ADD_ATTR(VERT_ATTRIB_COLOR0, EMIT_4UB_4F_ABGR, SWTCL_OVM_COLOR0, SWIZZLE_XYZW, MASK_XYZW);
+		ADD_ATTR(VERT_ATTRIB_COLOR0, R300_DATA_TYPE_BYTE, SWTCL_OVM_COLOR0, SWIZZLE_XYZW, MASK_XYZW, 1);
 #endif
 	}
 
-	if (RENDERINPUTS_TEST(tnl->render_inputs_bitset, _TNL_ATTRIB_COLOR1 )) {
+	if (fp_reads & FRAG_BIT_COL1) {
 		GLuint swiz = MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_ONE);
 		InputsRead |= 1 << VERT_ATTRIB_COLOR1;
 		OutputsWritten |= 1 << VERT_RESULT_COL1;
 #if MESA_LITTLE_ENDIAN
 		EMIT_ATTR( _TNL_ATTRIB_COLOR1, EMIT_4UB_4F_RGBA );
-		ADD_ATTR(VERT_ATTRIB_COLOR1, EMIT_4UB_4F_RGBA, SWTCL_OVM_COLOR1, swiz, MASK_XYZW);
+		ADD_ATTR(VERT_ATTRIB_COLOR1, R300_DATA_TYPE_BYTE, SWTCL_OVM_COLOR1, swiz, MASK_XYZW, 1);
 #else
 		EMIT_ATTR( _TNL_ATTRIB_COLOR1, EMIT_4UB_4F_ABGR );
-		ADD_ATTR(VERT_ATTRIB_COLOR1, EMIT_4UB_4F_ABGR, SWTCL_OVM_COLOR1, swiz, MASK_XYZW);
+		ADD_ATTR(VERT_ATTRIB_COLOR1, R300_DATA_TYPE_BYTE, SWTCL_OVM_COLOR1, swiz, MASK_XYZW, 1);
 #endif
 		rmesa->swtcl.specoffset = rmesa->swtcl.coloroffset + 1;
 	}
 
+	if (ctx->Light.Enabled && ctx->Light.Model.TwoSide) {
+		VB->AttribPtr[VERT_ATTRIB_GENERIC0] = VB->ColorPtr[1];
+		OutputsWritten |= 1 << VERT_RESULT_BFC0;
+#if MESA_LITTLE_ENDIAN
+		EMIT_ATTR( _TNL_ATTRIB_GENERIC0, EMIT_4UB_4F_RGBA );
+		ADD_ATTR(VERT_ATTRIB_GENERIC0, R300_DATA_TYPE_BYTE, SWTCL_OVM_COLOR2, SWIZZLE_XYZW, MASK_XYZW, 1);
+#else
+		EMIT_ATTR( _TNL_ATTRIB_GENERIC0, EMIT_4UB_4F_ABGR );
+		ADD_ATTR(VERT_ATTRIB_GENERIC0, R300_DATA_TYPE_BYTE, SWTCL_OVM_COLOR2, SWIZZLE_XYZW, MASK_XYZW, 1);
+#endif
+		if (fp_reads & FRAG_BIT_COL1) {
+			VB->AttribPtr[VERT_ATTRIB_GENERIC1] = VB->SecondaryColorPtr[1];
+			GLuint swiz = MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_ONE);
+			OutputsWritten |= 1 << VERT_RESULT_BFC1;
+#if MESA_LITTLE_ENDIAN
+			EMIT_ATTR( _TNL_ATTRIB_GENERIC1, EMIT_4UB_4F_RGBA );
+			ADD_ATTR(VERT_ATTRIB_GENERIC1, R300_DATA_TYPE_BYTE, SWTCL_OVM_COLOR3, swiz, MASK_XYZW, 1);
+#else
+			EMIT_ATTR( _TNL_ATTRIB_GENERIC1, EMIT_4UB_4F_ABGR );
+			ADD_ATTR(VERT_ATTRIB_GENERIC1, R300_DATA_TYPE_BYTE, SWTCL_OVM_COLOR3, swiz, MASK_XYZW, 1);
+#endif
+		}
+	}
+
 	if (RENDERINPUTS_TEST(tnl->render_inputs_bitset, _TNL_ATTRIB_POINTSIZE )) {
 		GLuint swiz = MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_ZERO, SWIZZLE_ZERO, SWIZZLE_ZERO);
 		InputsRead |= 1 << VERT_ATTRIB_POINT_SIZE;
 		OutputsWritten |= 1 << VERT_RESULT_PSIZ;
 		EMIT_ATTR( _TNL_ATTRIB_POINTSIZE, EMIT_1F );
-		ADD_ATTR(VERT_ATTRIB_POINT_SIZE, EMIT_1F, SWTCL_OVM_POINT_SIZE, swiz, MASK_X);
+		ADD_ATTR(VERT_ATTRIB_POINT_SIZE, R300_DATA_TYPE_FLOAT_1, SWTCL_OVM_POINT_SIZE, swiz, MASK_X, 0);
 	}
 
-	if (RENDERINPUTS_TEST_RANGE(tnl->render_inputs_bitset, _TNL_FIRST_TEX, _TNL_LAST_TEX )) {
-		int i, size;
-		GLuint swiz, mask, format;
+	/**
+	 *  Sending only one texcoord component may lead to lock up,
+	 *  so for all textures always output 4 texcoord components to RS.
+	 */
+	{
+		int i;
+		GLuint swiz, format, hw_format;
 		for (i = 0; i < ctx->Const.MaxTextureUnits; i++) {
-			if (RENDERINPUTS_TEST(tnl->render_inputs_bitset, _TNL_ATTRIB_TEX(i) )) {
+			if (fp_reads & FRAG_BIT_TEX(i)) {
 				switch (VB->TexCoordPtr[i]->size) {
 					case 1:
+						format = EMIT_1F;
+						hw_format = R300_DATA_TYPE_FLOAT_1;
+						swiz = MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_ZERO, SWIZZLE_ZERO, SWIZZLE_ONE);
+						break;
 					case 2:
 						format = EMIT_2F;
-						swiz = MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y, SWIZZLE_ZERO, SWIZZLE_ZERO);
-						mask = MASK_X | MASK_Y;
-						size = 2;
+						hw_format = R300_DATA_TYPE_FLOAT_2;
+						swiz = MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y, SWIZZLE_ZERO, SWIZZLE_ONE);
 						break;
 					case 3:
 						format = EMIT_3F;
-						swiz = MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_ZERO);
-						mask = MASK_X | MASK_Y | MASK_Z;
-						size = 3;
+						hw_format = R300_DATA_TYPE_FLOAT_3;
+						swiz = MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_ONE);
 						break;
 					case 4:
 						format = EMIT_4F;
+						hw_format = R300_DATA_TYPE_FLOAT_4;
 						swiz = SWIZZLE_XYZW;
-						mask = MASK_XYZW;
-						size = 4;
 						break;
 					default:
 						continue;
@@ -253,15 +186,14 @@ static void r300SetVertexFormat( GLcontext *ctx )
 				InputsRead |= 1 << (VERT_ATTRIB_TEX0 + i);
 				OutputsWritten |= 1 << (VERT_RESULT_TEX0 + i);
 				EMIT_ATTR(_TNL_ATTRIB_TEX(i), format);
-				ADD_ATTR(VERT_ATTRIB_TEX0 + i, format, SWTCL_OVM_TEX(i), swiz, mask);
-				vap_out_fmt_1 |= size << (i * 3);
+				ADD_ATTR(VERT_ATTRIB_TEX0 + i, hw_format, SWTCL_OVM_TEX(first_free_tex), swiz, MASK_XYZW, 0);
 				++first_free_tex;
 			}
 		}
 	}
 
 	/* RS can't put fragment position on the pixel stack, so stuff it in texcoord if needed */
-	if (RENDERINPUTS_TEST(tnl->render_inputs_bitset, _TNL_ATTRIB_POS) && (ctx->FragmentProgram._Current->Base.InputsRead & FRAG_BIT_WPOS)) {
+	if (fp_reads & FRAG_BIT_WPOS) {
 		if (first_free_tex >= ctx->Const.MaxTextureUnits) {
 			fprintf(stderr, "\tout of free texcoords to write w pos\n");
 			_mesa_exit(-1);
@@ -270,12 +202,11 @@ static void r300SetVertexFormat( GLcontext *ctx )
 		InputsRead |= 1 << (VERT_ATTRIB_TEX0 + first_free_tex);
 		OutputsWritten |= 1 << (VERT_RESULT_TEX0 + first_free_tex);
 		EMIT_ATTR( _TNL_ATTRIB_POS, EMIT_4F );
-		ADD_ATTR(VERT_ATTRIB_POS, EMIT_4F, SWTCL_OVM_TEX(first_free_tex), SWIZZLE_XYZW, MASK_XYZW);
-		vap_out_fmt_1 |= 4 << (first_free_tex * 3);
+		ADD_ATTR(VERT_ATTRIB_POS, R300_DATA_TYPE_FLOAT_4, SWTCL_OVM_TEX(first_free_tex), SWIZZLE_XYZW, MASK_XYZW, 0);
 		++first_free_tex;
 	}
 
-	if (RENDERINPUTS_TEST(tnl->render_inputs_bitset, _TNL_ATTRIB_FOG)) {
+	if (fp_reads & FRAG_BIT_FOGC) {
 		if (first_free_tex >= ctx->Const.MaxTextureUnits) {
 			fprintf(stderr, "\tout of free texcoords to write fog coordinate\n");
 			_mesa_exit(-1);
@@ -285,12 +216,24 @@ static void r300SetVertexFormat( GLcontext *ctx )
 		OutputsWritten |= 1 << VERT_RESULT_FOGC;
 		GLuint swiz = MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_ZERO, SWIZZLE_ZERO, SWIZZLE_ZERO);
 		EMIT_ATTR( _TNL_ATTRIB_FOG, EMIT_1F );
-		ADD_ATTR(VERT_ATTRIB_FOG, EMIT_1F, SWTCL_OVM_TEX(first_free_tex), swiz, MASK_X);
-		vap_out_fmt_1 |=  1 << (first_free_tex * 3);
+		ADD_ATTR(VERT_ATTRIB_FOG, R300_DATA_TYPE_FLOAT_1, SWTCL_OVM_TEX(first_free_tex), swiz, MASK_XYZW, 0);
 	}
 
 	R300_NEWPRIM(rmesa);
-	r300SwtclVAPSetup(ctx, InputsRead, OutputsWritten, vap_out_fmt_1);
+	rmesa->vbuf.num_attribs = num_attrs;
+	*_InputsRead = InputsRead;
+	*_OutputsWritten = OutputsWritten;
+
+	RENDERINPUTS_COPY(rmesa->render_inputs_bitset, tnl->render_inputs_bitset);
+}
+
+static void r300PrepareVertices(GLcontext *ctx)
+{
+	r300ContextPtr rmesa = R300_CONTEXT(ctx);
+	GLuint InputsRead, OutputsWritten;
+
+	r300ChooseSwtclVertexFormat(ctx, &InputsRead, &OutputsWritten);
+	r300SetupVAP(ctx, InputsRead, OutputsWritten);
 
 	rmesa->radeon.swtcl.vertex_size =
 		_tnl_install_attrs( ctx,
@@ -299,8 +242,6 @@ static void r300SetVertexFormat( GLcontext *ctx )
 				    NULL, 0 );
 
 	rmesa->radeon.swtcl.vertex_size /= 4;
-
-	RENDERINPUTS_COPY(rmesa->state.render_inputs_bitset, tnl->render_inputs_bitset);
 }
 
 
@@ -318,7 +259,6 @@ static GLuint reduced_prim[] = {
 };
 
 static void r300RasterPrimitive( GLcontext *ctx, GLuint prim );
-static void r300RenderPrimitive( GLcontext *ctx, GLenum prim );
 
 /***********************************************************************
  *                    Emit primitives as inline vertices               *
@@ -366,9 +306,8 @@ static void r300RenderPrimitive( GLcontext *ctx, GLenum prim );
  *              Build render functions from dd templates               *
  ***********************************************************************/
 
-#define R300_TWOSIDE_BIT	0x01
-#define R300_UNFILLED_BIT	0x02
-#define R300_MAX_TRIFUNC	0x04
+#define R300_UNFILLED_BIT	0x01
+#define R300_MAX_TRIFUNC	0x02
 
 static struct {
    tnl_points_func	        points;
@@ -379,9 +318,9 @@ static struct {
 
 #define DO_FALLBACK  0
 #define DO_UNFILLED (IND & R300_UNFILLED_BIT)
-#define DO_TWOSIDE  (IND & R300_TWOSIDE_BIT)
+#define DO_TWOSIDE   0
 #define DO_FLAT      0
-#define DO_OFFSET     0
+#define DO_OFFSET    0
 #define DO_TRI       1
 #define DO_QUAD      1
 #define DO_LINE      1
@@ -469,26 +408,15 @@ do { \
 #define TAG(x) x
 #include "tnl_dd/t_dd_tritmp.h"
 
-#define IND (R300_TWOSIDE_BIT)
-#define TAG(x) x##_twoside
-#include "tnl_dd/t_dd_tritmp.h"
-
 #define IND (R300_UNFILLED_BIT)
 #define TAG(x) x##_unfilled
 #include "tnl_dd/t_dd_tritmp.h"
 
-#define IND (R300_TWOSIDE_BIT|R300_UNFILLED_BIT)
-#define TAG(x) x##_twoside_unfilled
-#include "tnl_dd/t_dd_tritmp.h"
-
-
 
 static void init_rast_tab( void )
 {
    init();
-   init_twoside();
    init_unfilled();
-   init_twoside_unfilled();
 }
 
 /**********************************************************************/
@@ -540,7 +468,6 @@ static void r300ChooseRenderState( GLcontext *ctx )
 	GLuint index = 0;
 	GLuint flags = ctx->_TriangleCaps;
 
-	if (flags & DD_TRI_LIGHT_TWOSIDE) index |= R300_TWOSIDE_BIT;
 	if (flags & DD_TRI_UNFILLED)      index |= R300_UNFILLED_BIT;
 
 	if (index != rmesa->radeon.swtcl.RenderIndex) {
@@ -565,12 +492,12 @@ static void r300ChooseRenderState( GLcontext *ctx )
 }
 
 
-static void r300RenderStart(GLcontext *ctx)
+void r300RenderStart(GLcontext *ctx)
 {
 	r300ContextPtr rmesa = R300_CONTEXT( ctx );
 
 	r300ChooseRenderState(ctx);
-	r300SetVertexFormat(ctx);
+	r300PrepareVertices(ctx);
 
 	r300ValidateBuffers(ctx);
 
@@ -585,7 +512,7 @@ static void r300RenderStart(GLcontext *ctx)
 	}
 }
 
-static void r300RenderFinish(GLcontext *ctx)
+void r300RenderFinish(GLcontext *ctx)
 {
 }
 
@@ -599,7 +526,7 @@ static void r300RasterPrimitive( GLcontext *ctx, GLuint hwprim )
 	}
 }
 
-static void r300RenderPrimitive(GLcontext *ctx, GLenum prim)
+void r300RenderPrimitive(GLcontext *ctx, GLenum prim)
 {
 
 	r300ContextPtr rmesa = R300_CONTEXT(ctx);
@@ -611,7 +538,7 @@ static void r300RenderPrimitive(GLcontext *ctx, GLenum prim)
 	r300RasterPrimitive( ctx, reduced_prim[prim] );
 }
 
-static void r300ResetLineStipple(GLcontext *ctx)
+void r300ResetLineStipple(GLcontext *ctx)
 {
 }
 
@@ -647,7 +574,6 @@ void r300InitSwtcl(GLcontext *ctx)
 	_tnl_invalidate_vertices( ctx, ~0 );
 
 	_tnl_need_projected_coords( ctx, GL_FALSE );
-	r300ChooseRenderState(ctx);
 }
 
 void r300DestroySwtcl(GLcontext *ctx)
@@ -692,6 +618,7 @@ void r300_swtcl_flush(GLcontext *ctx, uint32_t current_offset)
 			   rmesa->radeon.hw.max_state_size + (12*sizeof(int)),
 			   __FUNCTION__);
 	radeonEmitState(&rmesa->radeon);
+    r300_emit_scissor(ctx);
 	r300EmitVertexAOS(rmesa,
 			rmesa->radeon.swtcl.vertex_size,
 			rmesa->radeon.dma.current,
diff --git a/src/mesa/drivers/dri/r300/r300_swtcl.h b/src/mesa/drivers/dri/r300/r300_swtcl.h
index ebc99c9e8a7..c271d265468 100644
--- a/src/mesa/drivers/dri/r300/r300_swtcl.h
+++ b/src/mesa/drivers/dri/r300/r300_swtcl.h
@@ -39,24 +39,27 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "swrast/swrast.h"
 #include "r300_context.h"
 
-#define MASK_XYZW (R300_WRITE_ENA_X | R300_WRITE_ENA_Y | R300_WRITE_ENA_Z | R300_WRITE_ENA_W)
-#define MASK_X R300_WRITE_ENA_X
-#define MASK_Y R300_WRITE_ENA_Y
-#define MASK_Z R300_WRITE_ENA_Z
-#define MASK_W R300_WRITE_ENA_W
-
 /*
  * Here are definitions of OVM locations of vertex attributes for non TCL hw
  */
 #define SWTCL_OVM_POS 0
 #define SWTCL_OVM_COLOR0 2
 #define SWTCL_OVM_COLOR1 3
+#define SWTCL_OVM_COLOR2 4
+#define SWTCL_OVM_COLOR3 5
 #define SWTCL_OVM_TEX(n) ((n) + 6)
 #define SWTCL_OVM_POINT_SIZE 15
 
+extern void r300ChooseSwtclVertexFormat(GLcontext *ctx, GLuint *InputsRead,  GLuint *OutputsWritten);
 
 extern void r300InitSwtcl( GLcontext *ctx );
 extern void r300DestroySwtcl( GLcontext *ctx );
 
+extern void r300RenderStart(GLcontext *ctx);
+extern void r300RenderFinish(GLcontext *ctx);
+extern void r300RenderPrimitive(GLcontext *ctx, GLenum prim);
+extern void r300ResetLineStipple(GLcontext *ctx);
+
 extern void r300_swtcl_flush(GLcontext *ctx, uint32_t current_offset);
+
 #endif
diff --git a/src/mesa/drivers/dri/r300/r300_texstate.c b/src/mesa/drivers/dri/r300/r300_texstate.c
index cf4cad73d29..6d6a90aa88a 100644
--- a/src/mesa/drivers/dri/r300/r300_texstate.c
+++ b/src/mesa/drivers/dri/r300/r300_texstate.c
@@ -117,6 +117,7 @@ static const struct tx_table {
 	_ASSIGN(INTENSITY_FLOAT16, R300_EASY_TX_FORMAT(X, X, X, X, FL_I16)),
 	_ASSIGN(Z16, R300_EASY_TX_FORMAT(X, X, X, X, X16)),
 	_ASSIGN(Z24_S8, R300_EASY_TX_FORMAT(X, X, X, X, X24_Y8)),
+	_ASSIGN(S8_Z24, R300_EASY_TX_FORMAT(Y, Y, Y, Y, X24_Y8)),
 	_ASSIGN(Z32, R300_EASY_TX_FORMAT(X, X, X, X, X32)),
 	/* *INDENT-ON* */
 };
@@ -431,7 +432,6 @@ void r300SetTexBuffer2(__DRIcontext *pDRICtx, GLint target, GLint glx_texture_fo
 		radeon_miptree_unreference(rImage->mt);
 		rImage->mt = NULL;
 	}
-	fprintf(stderr,"settexbuf %dx%d@%d %d targ %x format %x\n", rb->width, rb->height, rb->cpp, rb->pitch, target, format);
 	_mesa_init_teximage_fields(radeon->glCtx, target, texImage,
 				   rb->width, rb->height, 1, 0, rb->cpp);
 	texImage->RowStride = rb->pitch / rb->cpp;
@@ -449,7 +449,10 @@ void r300SetTexBuffer2(__DRIcontext *pDRICtx, GLint target, GLint glx_texture_fo
 	pitch_val = rb->pitch;
 	switch (rb->cpp) {
 	case 4:
-		t->pp_txformat = R300_EASY_TX_FORMAT(X, Y, Z, W, W8Z8Y8X8);
+		if (glx_texture_format == GLX_TEXTURE_FORMAT_RGB_EXT)
+			t->pp_txformat = R300_EASY_TX_FORMAT(X, Y, Z, ONE, W8Z8Y8X8);
+		else
+			t->pp_txformat = R300_EASY_TX_FORMAT(X, Y, Z, W, W8Z8Y8X8);
 		t->pp_txfilter |= tx_table[2].filter;
 		pitch_val /= 4;
 		break;
diff --git a/src/mesa/drivers/dri/r300/r300_vertprog.c b/src/mesa/drivers/dri/r300/r300_vertprog.c
index 50806575ced..c41a8fdd621 100644
--- a/src/mesa/drivers/dri/r300/r300_vertprog.c
+++ b/src/mesa/drivers/dri/r300/r300_vertprog.c
@@ -34,10 +34,12 @@ USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "shader/program.h"
 #include "shader/prog_instruction.h"
 #include "shader/prog_parameter.h"
+#include "shader/prog_print.h"
 #include "shader/prog_statevars.h"
 #include "tnl/tnl.h"
 
 #include "r300_context.h"
+#include "r300_state.h"
 
 /* TODO: Get rid of t_src_class call */
 #define CMP_SRCS(a, b) ((a.RelAddr != b.RelAddr) || (a.Index != b.Index && \
@@ -64,7 +66,7 @@ USE OR OTHER DEALINGS IN THE SOFTWARE.
 		int u_temp_used = (VSF_MAX_FRAGMENT_TEMPS - 1) - u_temp_i; \
 		if((vp->num_temporaries + u_temp_used) > VSF_MAX_FRAGMENT_TEMPS) { \
 			WARN_ONCE("Ran out of temps, num temps %d, us %d\n", vp->num_temporaries, u_temp_used); \
-			vp->native = GL_FALSE; \
+			vp->error = GL_TRUE; \
 		} \
 		u_temp_i=VSF_MAX_FRAGMENT_TEMPS-1; \
 	} while (0)
@@ -214,21 +216,8 @@ static void vp_dump_inputs(struct r300_vertex_program *vp, char *caller)
 static unsigned long t_src_index(struct r300_vertex_program *vp,
 				 struct prog_src_register *src)
 {
-	int i;
-	int max_reg = -1;
-
 	if (src->File == PROGRAM_INPUT) {
-		if (vp->inputs[src->Index] != -1)
-			return vp->inputs[src->Index];
-
-		for (i = 0; i < VERT_ATTRIB_MAX; i++)
-			if (vp->inputs[i] > max_reg)
-				max_reg = vp->inputs[i];
-
-		vp->inputs[src->Index] = max_reg + 1;
-
-		//vp_dump_inputs(vp, __FUNCTION__);
-
+		assert(vp->inputs[src->Index] != -1);
 		return vp->inputs[src->Index];
 	} else {
 		if (src->Index < 0) {
@@ -245,7 +234,7 @@ static unsigned long t_src_index(struct r300_vertex_program *vp,
 static unsigned long t_src(struct r300_vertex_program *vp,
 			   struct prog_src_register *src)
 {
-	/* src->NegateBase uses the NEGATE_ flags from program_instruction.h,
+	/* src->Negate uses the NEGATE_ flags from program_instruction.h,
 	 * which equal our VSF_FLAGS_ values, so it's safe to just pass it here.
 	 */
 	return PVS_SRC_OPERAND(t_src_index(vp, src),
@@ -254,13 +243,13 @@ static unsigned long t_src(struct r300_vertex_program *vp,
 			       t_swizzle(GET_SWZ(src->Swizzle, 2)),
 			       t_swizzle(GET_SWZ(src->Swizzle, 3)),
 			       t_src_class(src->File),
-			       src->NegateBase) | (src->RelAddr << 4);
+			       src->Negate) | (src->RelAddr << 4);
 }
 
 static unsigned long t_src_scalar(struct r300_vertex_program *vp,
 				  struct prog_src_register *src)
 {
-	/* src->NegateBase uses the NEGATE_ flags from program_instruction.h,
+	/* src->Negate uses the NEGATE_ flags from program_instruction.h,
 	 * which equal our VSF_FLAGS_ values, so it's safe to just pass it here.
 	 */
 	return PVS_SRC_OPERAND(t_src_index(vp, src),
@@ -269,8 +258,7 @@ static unsigned long t_src_scalar(struct r300_vertex_program *vp,
 			       t_swizzle(GET_SWZ(src->Swizzle, 0)),
 			       t_swizzle(GET_SWZ(src->Swizzle, 0)),
 			       t_src_class(src->File),
-			       src->
-			       NegateBase ? VSF_FLAG_ALL : VSF_FLAG_NONE) |
+			       src->Negate ? VSF_FLAG_ALL : VSF_FLAG_NONE) |
 	    (src->RelAddr << 4);
 }
 
@@ -307,7 +295,7 @@ static GLuint *r300TranslateOpcodeABS(struct r300_vertex_program *vp,
 				  t_swizzle(GET_SWZ(src[0].Swizzle, 3)),
 				  t_src_class(src[0].File),
 				  (!src[0].
-				   NegateBase) ? VSF_FLAG_ALL : VSF_FLAG_NONE) |
+				   Negate) ? VSF_FLAG_ALL : VSF_FLAG_NONE) |
 	    (src[0].RelAddr << 4);
 	inst[3] = 0;
 
@@ -369,8 +357,7 @@ static GLuint *r300TranslateOpcodeDP3(struct r300_vertex_program *vp,
 				  t_swizzle(GET_SWZ(src[0].Swizzle, 2)),
 				  SWIZZLE_ZERO,
 				  t_src_class(src[0].File),
-				  src[0].
-				  NegateBase ? VSF_FLAG_XYZ : VSF_FLAG_NONE) |
+				  src[0].Negate ? VSF_FLAG_XYZ : VSF_FLAG_NONE) |
 	    (src[0].RelAddr << 4);
 	inst[2] =
 	    PVS_SRC_OPERAND(t_src_index(vp, &src[1]),
@@ -378,8 +365,7 @@ static GLuint *r300TranslateOpcodeDP3(struct r300_vertex_program *vp,
 			    t_swizzle(GET_SWZ(src[1].Swizzle, 1)),
 			    t_swizzle(GET_SWZ(src[1].Swizzle, 2)), SWIZZLE_ZERO,
 			    t_src_class(src[1].File),
-			    src[1].
-			    NegateBase ? VSF_FLAG_XYZ : VSF_FLAG_NONE) |
+			    src[1].Negate ? VSF_FLAG_XYZ : VSF_FLAG_NONE) |
 	    (src[1].RelAddr << 4);
 	inst[3] = __CONST(1, SWIZZLE_ZERO);
 
@@ -422,8 +408,7 @@ static GLuint *r300TranslateOpcodeDPH(struct r300_vertex_program *vp,
 				  t_swizzle(GET_SWZ(src[0].Swizzle, 2)),
 				  PVS_SRC_SELECT_FORCE_1,
 				  t_src_class(src[0].File),
-				  src[0].
-				  NegateBase ? VSF_FLAG_XYZ : VSF_FLAG_NONE) |
+				  src[0].Negate ? VSF_FLAG_XYZ : VSF_FLAG_NONE) |
 	    (src[0].RelAddr << 4);
 	inst[2] = t_src(vp, &src[1]);
 	inst[3] = __CONST(1, SWIZZLE_ZERO);
@@ -519,7 +504,7 @@ static GLuint *r300TranslateOpcodeFLR(struct r300_vertex_program *vp,
 				  PVS_SRC_SELECT_W, PVS_SRC_REG_TEMPORARY,
 				  /* Not 100% sure about this */
 				  (!src[0].
-				   NegateBase) ? VSF_FLAG_ALL : VSF_FLAG_NONE
+				   Negate) ? VSF_FLAG_ALL : VSF_FLAG_NONE
 				  /*VSF_FLAG_ALL */ );
 	inst[3] = __CONST(0, SWIZZLE_ZERO);
 	(*u_temp_i)--;
@@ -564,8 +549,7 @@ static GLuint *r300TranslateOpcodeLG2(struct r300_vertex_program *vp,
 				  t_swizzle(GET_SWZ(src[0].Swizzle, 0)),
 				  t_swizzle(GET_SWZ(src[0].Swizzle, 0)),
 				  t_src_class(src[0].File),
-				  src[0].
-				  NegateBase ? VSF_FLAG_ALL : VSF_FLAG_NONE) |
+				  src[0].Negate ? VSF_FLAG_ALL : VSF_FLAG_NONE) |
 	    (src[0].RelAddr << 4);
 	inst[2] = __CONST(0, SWIZZLE_ZERO);
 	inst[3] = __CONST(0, SWIZZLE_ZERO);
@@ -592,24 +576,21 @@ static GLuint *r300TranslateOpcodeLIT(struct r300_vertex_program *vp,
 				  PVS_SRC_SELECT_FORCE_0,	// Z
 				  t_swizzle(GET_SWZ(src[0].Swizzle, 1)),	// Y
 				  t_src_class(src[0].File),
-				  src[0].
-				  NegateBase ? VSF_FLAG_ALL : VSF_FLAG_NONE) |
+				  src[0].Negate ? VSF_FLAG_ALL : VSF_FLAG_NONE) |
 	    (src[0].RelAddr << 4);
 	inst[2] = PVS_SRC_OPERAND(t_src_index(vp, &src[0]), t_swizzle(GET_SWZ(src[0].Swizzle, 1)),	// Y
 				  t_swizzle(GET_SWZ(src[0].Swizzle, 3)),	// W
 				  PVS_SRC_SELECT_FORCE_0,	// Z
 				  t_swizzle(GET_SWZ(src[0].Swizzle, 0)),	// X
 				  t_src_class(src[0].File),
-				  src[0].
-				  NegateBase ? VSF_FLAG_ALL : VSF_FLAG_NONE) |
+				  src[0].Negate ? VSF_FLAG_ALL : VSF_FLAG_NONE) |
 	    (src[0].RelAddr << 4);
 	inst[3] = PVS_SRC_OPERAND(t_src_index(vp, &src[0]), t_swizzle(GET_SWZ(src[0].Swizzle, 1)),	// Y
 				  t_swizzle(GET_SWZ(src[0].Swizzle, 0)),	// X
 				  PVS_SRC_SELECT_FORCE_0,	// Z
 				  t_swizzle(GET_SWZ(src[0].Swizzle, 3)),	// W
 				  t_src_class(src[0].File),
-				  src[0].
-				  NegateBase ? VSF_FLAG_ALL : VSF_FLAG_NONE) |
+				  src[0].Negate ? VSF_FLAG_ALL : VSF_FLAG_NONE) |
 	    (src[0].RelAddr << 4);
 
 	return inst;
@@ -837,7 +818,7 @@ static GLuint *r300TranslateOpcodeSUB(struct r300_vertex_program *vp,
 				  t_swizzle(GET_SWZ(src[1].Swizzle, 3)),
 				  t_src_class(src[1].File),
 				  (!src[1].
-				   NegateBase) ? VSF_FLAG_ALL : VSF_FLAG_NONE) |
+				   Negate) ? VSF_FLAG_ALL : VSF_FLAG_NONE) |
 	    (src[1].RelAddr << 4);
 	inst[3] = 0;
 #else
@@ -857,7 +838,7 @@ static GLuint *r300TranslateOpcodeSUB(struct r300_vertex_program *vp,
 				  t_swizzle(GET_SWZ(src[1].Swizzle, 3)),
 				  t_src_class(src[1].File),
 				  (!src[1].
-				   NegateBase) ? VSF_FLAG_ALL : VSF_FLAG_NONE) |
+				   Negate) ? VSF_FLAG_ALL : VSF_FLAG_NONE) |
 	    (src[1].RelAddr << 4);
 #endif
 
@@ -905,16 +886,14 @@ static GLuint *r300TranslateOpcodeXPD(struct r300_vertex_program *vp,
 				  t_swizzle(GET_SWZ(src[0].Swizzle, 0)),	// X
 				  t_swizzle(GET_SWZ(src[0].Swizzle, 3)),	// W
 				  t_src_class(src[0].File),
-				  src[0].
-				  NegateBase ? VSF_FLAG_ALL : VSF_FLAG_NONE) |
+				  src[0].Negate ? VSF_FLAG_ALL : VSF_FLAG_NONE) |
 	    (src[0].RelAddr << 4);
 	inst[2] = PVS_SRC_OPERAND(t_src_index(vp, &src[1]), t_swizzle(GET_SWZ(src[1].Swizzle, 2)),	// Z
 				  t_swizzle(GET_SWZ(src[1].Swizzle, 0)),	// X
 				  t_swizzle(GET_SWZ(src[1].Swizzle, 1)),	// Y
 				  t_swizzle(GET_SWZ(src[1].Swizzle, 3)),	// W
 				  t_src_class(src[1].File),
-				  src[1].
-				  NegateBase ? VSF_FLAG_ALL : VSF_FLAG_NONE) |
+				  src[1].Negate ? VSF_FLAG_ALL : VSF_FLAG_NONE) |
 	    (src[1].RelAddr << 4);
 	inst[3] = __CONST(1, SWIZZLE_ZERO);
 	inst += 4;
@@ -931,15 +910,14 @@ static GLuint *r300TranslateOpcodeXPD(struct r300_vertex_program *vp,
 				  t_swizzle(GET_SWZ(src[1].Swizzle, 3)),	// W
 				  t_src_class(src[1].File),
 				  (!src[1].
-				   NegateBase) ? VSF_FLAG_ALL : VSF_FLAG_NONE) |
+				   Negate) ? VSF_FLAG_ALL : VSF_FLAG_NONE) |
 	    (src[1].RelAddr << 4);
 	inst[2] = PVS_SRC_OPERAND(t_src_index(vp, &src[0]), t_swizzle(GET_SWZ(src[0].Swizzle, 2)),	// Z
 				  t_swizzle(GET_SWZ(src[0].Swizzle, 0)),	// X
 				  t_swizzle(GET_SWZ(src[0].Swizzle, 1)),	// Y
 				  t_swizzle(GET_SWZ(src[0].Swizzle, 3)),	// W
 				  t_src_class(src[0].File),
-				  src[0].
-				  NegateBase ? VSF_FLAG_ALL : VSF_FLAG_NONE) |
+				  src[0].Negate ? VSF_FLAG_ALL : VSF_FLAG_NONE) |
 	    (src[0].RelAddr << 4);
 	inst[3] =
 	    PVS_SRC_OPERAND(*u_temp_i, PVS_SRC_SELECT_X, PVS_SRC_SELECT_Y,
@@ -954,11 +932,17 @@ static GLuint *r300TranslateOpcodeXPD(struct r300_vertex_program *vp,
 static void t_inputs_outputs(struct r300_vertex_program *vp)
 {
 	int i;
-	int cur_reg = 0;
+	int cur_reg;
 
-	for (i = 0; i < VERT_ATTRIB_MAX; i++)
-		vp->inputs[i] = -1;
+	cur_reg = -1;
+	for (i = 0; i < VERT_ATTRIB_MAX; i++) {
+		if (vp->key.InputsRead & (1 << i))
+			vp->inputs[i] = ++cur_reg;
+		else
+			vp->inputs[i] = -1;
+	}
 
+	cur_reg = 0;
 	for (i = 0; i < VERT_RESULT_MAX; i++)
 		vp->outputs[i] = -1;
 
@@ -972,26 +956,36 @@ static void t_inputs_outputs(struct r300_vertex_program *vp)
 		vp->outputs[VERT_RESULT_PSIZ] = cur_reg++;
 	}
 
+	/* If we're writing back facing colors we need to send
+	 * four colors to make front/back face colors selection work.
+	 * If the vertex program doesn't write all 4 colors, lets
+	 * pretend it does by skipping output index reg so the colors
+	 * get written into appropriate output vectors.
+	 */
 	if (vp->key.OutputsWritten & (1 << VERT_RESULT_COL0)) {
 		vp->outputs[VERT_RESULT_COL0] = cur_reg++;
+	} else if (vp->key.OutputsWritten & (1 << VERT_RESULT_BFC0) ||
+		vp->key.OutputsWritten & (1 << VERT_RESULT_BFC1)) {
+		cur_reg++;
 	}
 
 	if (vp->key.OutputsWritten & (1 << VERT_RESULT_COL1)) {
-		vp->outputs[VERT_RESULT_COL1] =
-		    vp->outputs[VERT_RESULT_COL0] + 1;
-		cur_reg = vp->outputs[VERT_RESULT_COL1] + 1;
+		vp->outputs[VERT_RESULT_COL1] = cur_reg++;
+	} else if (vp->key.OutputsWritten & (1 << VERT_RESULT_BFC0) ||
+		vp->key.OutputsWritten & (1 << VERT_RESULT_BFC1)) {
+		cur_reg++;
 	}
 
 	if (vp->key.OutputsWritten & (1 << VERT_RESULT_BFC0)) {
-		vp->outputs[VERT_RESULT_BFC0] =
-		    vp->outputs[VERT_RESULT_COL0] + 2;
-		cur_reg = vp->outputs[VERT_RESULT_BFC0] + 2;
+		vp->outputs[VERT_RESULT_BFC0] = cur_reg++;
+	} else if (vp->key.OutputsWritten & (1 << VERT_RESULT_BFC1)) {
+		cur_reg++;
 	}
 
 	if (vp->key.OutputsWritten & (1 << VERT_RESULT_BFC1)) {
-		vp->outputs[VERT_RESULT_BFC1] =
-		    vp->outputs[VERT_RESULT_COL0] + 3;
-		cur_reg = vp->outputs[VERT_RESULT_BFC1] + 1;
+		vp->outputs[VERT_RESULT_BFC1] = cur_reg++;
+	} else if (vp->key.OutputsWritten & (1 << VERT_RESULT_BFC0)) {
+		cur_reg++;
 	}
 
 	for (i = VERT_RESULT_TEX0; i <= VERT_RESULT_TEX7; i++) {
@@ -1018,14 +1012,13 @@ static void r300TranslateVertexShader(struct r300_vertex_program *vp,
 	struct prog_src_register src[3];
 
 	vp->pos_end = 0;	/* Not supported yet */
-	vp->program.length = 0;
-	/*vp->num_temporaries=mesa_vp->Base.NumTemporaries; */
+	vp->hw_code.length = 0;
 	vp->translated = GL_TRUE;
-	vp->native = GL_TRUE;
+	vp->error = GL_FALSE;
 
 	t_inputs_outputs(vp);
 
-	for (inst = vp->program.body.i; vpi->Opcode != OPCODE_END;
+	for (inst = vp->hw_code.body.d; vpi->Opcode != OPCODE_END;
 	     vpi++, inst += 4) {
 
 		FREE_TEMPS();
@@ -1187,38 +1180,15 @@ static void r300TranslateVertexShader(struct r300_vertex_program *vp,
 						      &u_temp_i);
 			break;
 		default:
-			assert(0);
+			vp->error = GL_TRUE;
 			break;
 		}
 	}
 
-	/* Some outputs may be artificially added, to match the inputs
-	   of the fragment program. Blank the outputs here. */
-	for (i = 0; i < VERT_RESULT_MAX; i++) {
-		if (vp->key.OutputsAdded & (1 << i)) {
-			inst[0] = PVS_OP_DST_OPERAND(VE_ADD,
-						     GL_FALSE,
-						     GL_FALSE,
-						     vp->outputs[i],
-						     VSF_FLAG_ALL,
-						     PVS_DST_REG_OUT);
-			inst[1] = __CONST(0, SWIZZLE_ZERO);
-			inst[2] = __CONST(0, SWIZZLE_ZERO);
-			inst[3] = __CONST(0, SWIZZLE_ZERO);
-			inst += 4;
-		}
+	vp->hw_code.length = (inst - vp->hw_code.body.d);
+	if (vp->hw_code.length >= VSF_MAX_FRAGMENT_LENGTH) {
+		vp->error = GL_TRUE;
 	}
-
-	vp->program.length = (inst - vp->program.body.i);
-	if (vp->program.length >= VSF_MAX_FRAGMENT_LENGTH) {
-		vp->program.length = 0;
-		vp->native = GL_FALSE;
-	}
-#if 0
-	fprintf(stderr, "hw program:\n");
-	for (i = 0; i < vp->program.length; i++)
-		fprintf(stderr, "%08x\n", vp->program.body.d[i]);
-#endif
 }
 
 /* DP4 version seems to trigger some hw peculiarity */
@@ -1397,6 +1367,49 @@ static struct r300_vertex_program *build_program(struct r300_vertex_program_key
 		pos_as_texcoord(vp, &mesa_vp->Base);
 	}
 
+	if (RADEON_DEBUG & DEBUG_VERTS) {
+		fprintf(stderr, "Vertex program after native rewrite:\n");
+		_mesa_print_program(&mesa_vp->Base);
+		fflush(stdout);
+	}
+
+	/* Some outputs may be artificially added, to match the inputs of the fragment program.
+	 * Issue 16 of vertex program spec says that all vertex attributes that are unwritten by
+	 * vertex program are undefined, so just use MOV [vertex_result], CONST[0]
+	 */
+	{
+		int i, count = 0;
+		for (i = 0; i < VERT_RESULT_MAX; ++i) {
+			if (vp->key.OutputsAdded & (1 << i)) {
+				++count;
+			}
+		}
+
+		if (count > 0) {
+			struct prog_instruction *inst;
+
+			_mesa_insert_instructions(&mesa_vp->Base, mesa_vp->Base.NumInstructions - 1, count);
+			inst = &mesa_vp->Base.Instructions[mesa_vp->Base.NumInstructions - 1 - count];
+
+			for (i = 0; i < VERT_RESULT_MAX; ++i) {
+				if (vp->key.OutputsAdded & (1 << i)) {
+					inst->Opcode = OPCODE_MOV;
+
+					inst->DstReg.File = PROGRAM_OUTPUT;
+					inst->DstReg.Index = i;
+					inst->DstReg.WriteMask = WRITEMASK_XYZW;
+					inst->DstReg.CondMask = COND_TR;
+
+					inst->SrcReg[0].File = PROGRAM_CONSTANT;
+					inst->SrcReg[0].Index = 0;
+					inst->SrcReg[0].Swizzle = SWIZZLE_XYZW;
+
+					++inst;
+				}
+			}
+		}
+	}
+
 	assert(mesa_vp->Base.NumInstructions);
 	vp->num_temporaries = mesa_vp->Base.NumTemporaries;
 	r300TranslateVertexShader(vp, mesa_vp->Base.Instructions);
@@ -1443,7 +1456,12 @@ void r300SelectVertexShader(r300ContextPtr r300)
 		wpos_idx = i;
 	}
 
-	add_outputs(&wanted_key, VERT_RESULT_HPOS);
+	if (vpc->mesa_program.IsPositionInvariant) {
+		wanted_key.InputsRead |= (1 << VERT_ATTRIB_POS);
+		wanted_key.OutputsWritten |= (1 << VERT_RESULT_HPOS);
+	} else {
+		add_outputs(&wanted_key, VERT_RESULT_HPOS);
+	}
 
 	if (InputsRead & FRAG_BIT_COL0) {
 		add_outputs(&wanted_key, VERT_RESULT_COL0);
@@ -1453,27 +1471,103 @@ void r300SelectVertexShader(r300ContextPtr r300)
 		add_outputs(&wanted_key, VERT_RESULT_COL1);
 	}
 
+	if (InputsRead & FRAG_BIT_FOGC) {
+		add_outputs(&wanted_key, VERT_RESULT_FOGC);
+	}
+
 	for (i = 0; i < ctx->Const.MaxTextureUnits; i++) {
 		if (InputsRead & (FRAG_BIT_TEX0 << i)) {
 			add_outputs(&wanted_key, VERT_RESULT_TEX0 + i);
 		}
 	}
 
-	if (vpc->mesa_program.IsPositionInvariant) {
-		/* we wan't position don't we ? */
-		wanted_key.InputsRead |= (1 << VERT_ATTRIB_POS);
-	}
-
 	for (vp = vpc->progs; vp; vp = vp->next)
 		if (_mesa_memcmp(&vp->key, &wanted_key, sizeof(wanted_key))
 		    == 0) {
 			r300->selected_vp = vp;
 			return;
 		}
-	//_mesa_print_program(&vpc->mesa_program.Base);
+
+	if (RADEON_DEBUG & DEBUG_VERTS) {
+		fprintf(stderr, "Initial vertex program:\n");
+		_mesa_print_program(&vpc->mesa_program.Base);
+		fflush(stdout);
+	}
 
 	vp = build_program(&wanted_key, &vpc->mesa_program, wpos_idx);
 	vp->next = vpc->progs;
 	vpc->progs = vp;
 	r300->selected_vp = vp;
 }
+
+#define bump_vpu_count(ptr, new_count)   do { \
+		drm_r300_cmd_header_t* _p=((drm_r300_cmd_header_t*)(ptr)); \
+		int _nc=(new_count)/4; \
+		assert(_nc < 256); \
+		if(_nc>_p->vpu.count)_p->vpu.count=_nc; \
+	} while(0)
+
+static void r300EmitVertexProgram(r300ContextPtr r300, int dest, struct r300_vertex_shader_hw_code *code)
+{
+	int i;
+
+	assert((code->length > 0) && (code->length % 4 == 0));
+
+	switch ((dest >> 8) & 0xf) {
+		case 0:
+			R300_STATECHANGE(r300, vpi);
+			for (i = 0; i < code->length; i++)
+				r300->hw.vpi.cmd[R300_VPI_INSTR_0 + i + 4 * (dest & 0xff)] = (code->body.d[i]);
+			bump_vpu_count(r300->hw.vpi.cmd, code->length + 4 * (dest & 0xff));
+			break;
+		case 2:
+			R300_STATECHANGE(r300, vpp);
+			for (i = 0; i < code->length; i++)
+				r300->hw.vpp.cmd[R300_VPP_PARAM_0 + i + 4 * (dest & 0xff)] = (code->body.d[i]);
+			bump_vpu_count(r300->hw.vpp.cmd, code->length + 4 * (dest & 0xff));
+			break;
+		case 4:
+			R300_STATECHANGE(r300, vps);
+			for (i = 0; i < code->length; i++)
+				r300->hw.vps.cmd[1 + i + 4 * (dest & 0xff)] = (code->body.d[i]);
+			bump_vpu_count(r300->hw.vps.cmd, code->length + 4 * (dest & 0xff));
+			break;
+		default:
+			fprintf(stderr, "%s:%s don't know how to handle dest %04x\n", __FILE__, __FUNCTION__, dest);
+			_mesa_exit(-1);
+	}
+}
+
+void r300SetupVertexProgram(r300ContextPtr rmesa)
+{
+	GLcontext *ctx = rmesa->radeon.glCtx;
+	struct r300_vertex_program *prog = rmesa->selected_vp;
+	int inst_count = 0;
+	int param_count = 0;
+	
+	/* Reset state, in case we don't use something */
+	((drm_r300_cmd_header_t *) rmesa->hw.vpp.cmd)->vpu.count = 0;
+	((drm_r300_cmd_header_t *) rmesa->hw.vpi.cmd)->vpu.count = 0;
+	((drm_r300_cmd_header_t *) rmesa->hw.vps.cmd)->vpu.count = 0;
+	
+	R300_STATECHANGE(rmesa, vpp);
+	param_count = r300VertexProgUpdateParams(ctx,
+								(struct r300_vertex_program_cont *)
+								ctx->VertexProgram._Current,
+								(float *)&rmesa->hw.vpp.cmd[R300_VPP_PARAM_0]);
+	bump_vpu_count(rmesa->hw.vpp.cmd, param_count);
+	param_count /= 4;
+
+	r300EmitVertexProgram(rmesa, R300_PVS_CODE_START, &(prog->hw_code));
+	inst_count = (prog->hw_code.length / 4) - 1;
+
+	r300VapCntl(rmesa, _mesa_bitcount(prog->key.InputsRead),
+				 _mesa_bitcount(prog->key.OutputsWritten), prog->num_temporaries);
+
+	R300_STATECHANGE(rmesa, pvs);
+	rmesa->hw.pvs.cmd[R300_PVS_CNTL_1] = (0 << R300_PVS_FIRST_INST_SHIFT) | (inst_count << R300_PVS_XYZW_VALID_INST_SHIFT) |
+				(inst_count << R300_PVS_LAST_INST_SHIFT);
+
+	rmesa->hw.pvs.cmd[R300_PVS_CNTL_2] = (0 << R300_PVS_CONST_BASE_OFFSET_SHIFT) | (param_count << R300_PVS_MAX_CONST_ADDR_SHIFT);
+	rmesa->hw.pvs.cmd[R300_PVS_CNTL_3] = (inst_count << R300_PVS_LAST_VTX_SRC_INST_SHIFT);
+}
diff --git a/src/mesa/drivers/dri/r300/r300_vertprog.h b/src/mesa/drivers/dri/r300/r300_vertprog.h
index 2f35f02bc84..b552e3fb1bd 100644
--- a/src/mesa/drivers/dri/r300/r300_vertprog.h
+++ b/src/mesa/drivers/dri/r300/r300_vertprog.h
@@ -32,4 +32,6 @@
 
 #endif
 
+void r300SetupVertexProgram(r300ContextPtr rmesa);
+
 #endif
diff --git a/src/mesa/drivers/dri/r300/r500_fragprog.c b/src/mesa/drivers/dri/r300/r500_fragprog.c
index 07a2a7b17ce..4d58cf21622 100644
--- a/src/mesa/drivers/dri/r300/r500_fragprog.c
+++ b/src/mesa/drivers/dri/r300/r500_fragprog.c
@@ -27,10 +27,6 @@
 
 #include "r500_fragprog.h"
 
-#include "radeon_nqssadce.h"
-#include "radeon_program_alu.h"
-
-
 static void reset_srcreg(struct prog_src_register* reg)
 {
 	_mesa_bzero(reg, sizeof(*reg));
@@ -58,12 +54,12 @@ static struct prog_src_register shadow_ambient(struct gl_program *program, int t
  *  - introduce a temporary register when write masks are needed
  *
  */
-static GLboolean transform_TEX(
+GLboolean r500_transform_TEX(
 	struct radeon_transform_context *t,
 	struct prog_instruction* orig_inst, void* data)
 {
-	struct r500_fragment_program_compiler *compiler =
-		(struct r500_fragment_program_compiler*)data;
+	struct r300_fragment_program_compiler *compiler =
+		(struct r300_fragment_program_compiler*)data;
 	struct prog_instruction inst = *orig_inst;
 	struct prog_instruction* tgt;
 	GLboolean destredirect = GL_FALSE;
@@ -156,9 +152,9 @@ static GLboolean transform_TEX(
 		 *   r  < tex  <=>      -tex+r < 0
 		 *   r >= tex  <=> not (-tex+r < 0 */
 		if (comparefunc == GL_LESS || comparefunc == GL_GEQUAL)
-			tgt[1].SrcReg[2].NegateBase = tgt[0].SrcReg[2].NegateBase ^ NEGATE_XYZW;
+			tgt[1].SrcReg[2].Negate = tgt[0].SrcReg[2].Negate ^ NEGATE_XYZW;
 		else
-			tgt[1].SrcReg[0].NegateBase = tgt[0].SrcReg[0].NegateBase ^ NEGATE_XYZW;
+			tgt[1].SrcReg[0].Negate = tgt[0].SrcReg[0].Negate ^ NEGATE_XYZW;
 
 		tgt[2].Opcode = OPCODE_CMP;
 		tgt[2].DstReg = orig_inst->DstReg;
@@ -188,121 +184,7 @@ static GLboolean transform_TEX(
 	return GL_TRUE;
 }
 
-
-static void update_params(r300ContextPtr r300, struct r500_fragment_program *fp)
-{
-	struct gl_fragment_program *mp = &fp->mesa_program;
-
-	/* Ask Mesa nicely to fill in ParameterValues for us */
-	if (mp->Base.Parameters)
-		_mesa_load_state_parameters(r300->radeon.glCtx, mp->Base.Parameters);
-}
-
-
-/**
- * Transform the program to support fragment.position.
- *
- * Introduce a small fragment at the start of the program that will be
- * the only code that directly reads the FRAG_ATTRIB_WPOS input.
- * All other code pieces that reference that input will be rewritten
- * to read from a newly allocated temporary.
- *
- * \todo if/when r5xx supports the radeon_program architecture, this is a
- * likely candidate for code sharing.
- */
-static void insert_WPOS_trailer(struct r500_fragment_program_compiler *compiler)
-{
-	GLuint InputsRead = compiler->fp->mesa_program.Base.InputsRead;
-
-	if (!(InputsRead & FRAG_BIT_WPOS))
-		return;
-
-	static gl_state_index tokens[STATE_LENGTH] = {
-		STATE_INTERNAL, STATE_R300_WINDOW_DIMENSION, 0, 0, 0
-	};
-	struct prog_instruction *fpi;
-	GLuint window_index;
-	int i = 0;
-	GLuint tempregi = _mesa_find_free_register(compiler->program, PROGRAM_TEMPORARY);
-
-	_mesa_insert_instructions(compiler->program, 0, 3);
-	fpi = compiler->program->Instructions;
-
-	/* perspective divide */
-	fpi[i].Opcode = OPCODE_RCP;
-
-	fpi[i].DstReg.File = PROGRAM_TEMPORARY;
-	fpi[i].DstReg.Index = tempregi;
-	fpi[i].DstReg.WriteMask = WRITEMASK_W;
-	fpi[i].DstReg.CondMask = COND_TR;
-
-	fpi[i].SrcReg[0].File = PROGRAM_INPUT;
-	fpi[i].SrcReg[0].Index = FRAG_ATTRIB_WPOS;
-	fpi[i].SrcReg[0].Swizzle = SWIZZLE_WWWW;
-	i++;
-
-	fpi[i].Opcode = OPCODE_MUL;
-
-	fpi[i].DstReg.File = PROGRAM_TEMPORARY;
-	fpi[i].DstReg.Index = tempregi;
-	fpi[i].DstReg.WriteMask = WRITEMASK_XYZ;
-	fpi[i].DstReg.CondMask = COND_TR;
-
-	fpi[i].SrcReg[0].File = PROGRAM_INPUT;
-	fpi[i].SrcReg[0].Index = FRAG_ATTRIB_WPOS;
-	fpi[i].SrcReg[0].Swizzle = SWIZZLE_XYZW;
-
-	fpi[i].SrcReg[1].File = PROGRAM_TEMPORARY;
-	fpi[i].SrcReg[1].Index = tempregi;
-	fpi[i].SrcReg[1].Swizzle = SWIZZLE_WWWW;
-	i++;
-
-	/* viewport transformation */
-	window_index = _mesa_add_state_reference(compiler->program->Parameters, tokens);
-
-	fpi[i].Opcode = OPCODE_MAD;
-
-	fpi[i].DstReg.File = PROGRAM_TEMPORARY;
-	fpi[i].DstReg.Index = tempregi;
-	fpi[i].DstReg.WriteMask = WRITEMASK_XYZ;
-	fpi[i].DstReg.CondMask = COND_TR;
-
-	fpi[i].SrcReg[0].File = PROGRAM_TEMPORARY;
-	fpi[i].SrcReg[0].Index = tempregi;
-	fpi[i].SrcReg[0].Swizzle =
-	    MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_ZERO);
-
-	fpi[i].SrcReg[1].File = PROGRAM_STATE_VAR;
-	fpi[i].SrcReg[1].Index = window_index;
-	fpi[i].SrcReg[1].Swizzle =
-	    MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_ZERO);
-
-	fpi[i].SrcReg[2].File = PROGRAM_STATE_VAR;
-	fpi[i].SrcReg[2].Index = window_index;
-	fpi[i].SrcReg[2].Swizzle =
-	    MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_ZERO);
-	i++;
-
-	for (; i < compiler->program->NumInstructions; ++i) {
-		int reg;
-		for (reg = 0; reg < 3; reg++) {
-			if (fpi[i].SrcReg[reg].File == PROGRAM_INPUT &&
-			    fpi[i].SrcReg[reg].Index == FRAG_ATTRIB_WPOS) {
-				fpi[i].SrcReg[reg].File = PROGRAM_TEMPORARY;
-				fpi[i].SrcReg[reg].Index = tempregi;
-			}
-		}
-	}
-}
-
-
-static void nqssadce_init(struct nqssadce_state* s)
-{
-	s->Outputs[FRAG_RESULT_COLOR].Sourced = WRITEMASK_XYZW;
-	s->Outputs[FRAG_RESULT_DEPTH].Sourced = WRITEMASK_W;
-}
-
-static GLboolean is_native_swizzle(GLuint opcode, struct prog_src_register reg)
+GLboolean r500FPIsNativeSwizzle(GLuint opcode, struct prog_src_register reg)
 {
 	GLuint relevant;
 	int i;
@@ -314,33 +196,30 @@ static GLboolean is_native_swizzle(GLuint opcode, struct prog_src_register reg)
 		if (reg.Abs)
 			return GL_FALSE;
 
-		if (reg.NegateAbs)
-			reg.NegateBase ^= 15;
+		if (opcode == OPCODE_KIL && (reg.Swizzle != SWIZZLE_NOOP || reg.Negate != NEGATE_NONE))
+			return GL_FALSE;
 
-		if (opcode == OPCODE_KIL) {
-			if (reg.Swizzle != SWIZZLE_NOOP)
-				return GL_FALSE;
-		} else {
-			for(i = 0; i < 4; ++i) {
-				GLuint swz = GET_SWZ(reg.Swizzle, i);
-				if (swz == SWIZZLE_NIL) {
-					reg.NegateBase &= ~(1 << i);
-					continue;
-				}
-				if (swz >= 4)
-					return GL_FALSE;
+		if (reg.Negate)
+			reg.Negate ^= NEGATE_XYZW;
+
+		for(i = 0; i < 4; ++i) {
+			GLuint swz = GET_SWZ(reg.Swizzle, i);
+			if (swz == SWIZZLE_NIL) {
+				reg.Negate &= ~(1 << i);
+				continue;
 			}
+			if (swz >= 4)
+				return GL_FALSE;
 		}
 
-		if (reg.NegateBase)
+		if (reg.Negate)
 			return GL_FALSE;
 
 		return GL_TRUE;
 	} else if (opcode == OPCODE_DDX || opcode == OPCODE_DDY) {
 		/* DDX/MDH and DDY/MDV explicitly ignore incoming swizzles;
 		 * if it doesn't fit perfectly into a .xyzw case... */
-		if (reg.Swizzle == SWIZZLE_NOOP && !reg.Abs
-				&& !reg.NegateBase && !reg.NegateAbs)
+		if (reg.Swizzle == SWIZZLE_NOOP && !reg.Abs && !reg.Negate)
 			return GL_TRUE;
 
 		return GL_FALSE;
@@ -355,7 +234,7 @@ static GLboolean is_native_swizzle(GLuint opcode, struct prog_src_register reg)
 			if (swz != SWIZZLE_NIL && swz != SWIZZLE_ZERO)
 				relevant |= 1 << i;
 		}
-		if ((reg.NegateBase & relevant) && ((reg.NegateBase & relevant) != relevant))
+		if ((reg.Negate & relevant) && ((reg.Negate & relevant) != relevant))
 			return GL_FALSE;
 
 		return GL_TRUE;
@@ -368,8 +247,7 @@ static GLboolean is_native_swizzle(GLuint opcode, struct prog_src_register reg)
  * The only thing we *cannot* do in an ALU instruction is per-component
  * negation. Therefore, we split the MOV into two instructions when necessary.
  */
-static void nqssadce_build_swizzle(struct nqssadce_state *s,
-	struct prog_dst_register dst, struct prog_src_register src)
+void r500FPBuildSwizzle(struct nqssadce_state *s, struct prog_dst_register dst, struct prog_src_register src)
 {
 	struct prog_instruction *inst;
 	GLuint negatebase[2] = { 0, 0 };
@@ -379,7 +257,7 @@ static void nqssadce_build_swizzle(struct nqssadce_state *s,
 		GLuint swz = GET_SWZ(src.Swizzle, i);
 		if (swz == SWIZZLE_NIL)
 			continue;
-		negatebase[GET_BIT(src.NegateBase, i)] |= 1 << i;
+		negatebase[GET_BIT(src.Negate, i)] |= 1 << i;
 	}
 
 	_mesa_insert_instructions(s->Program, s->IP, (negatebase[0] ? 1 : 0) + (negatebase[1] ? 1 : 0));
@@ -393,129 +271,12 @@ static void nqssadce_build_swizzle(struct nqssadce_state *s,
 		inst->DstReg = dst;
 		inst->DstReg.WriteMask = negatebase[i];
 		inst->SrcReg[0] = src;
+		inst->SrcReg[0].Negate = (i == 0) ? NEGATE_NONE : NEGATE_XYZW;
 		inst++;
 		s->IP++;
 	}
 }
 
-static GLuint build_dtm(GLuint depthmode)
-{
-	switch(depthmode) {
-	default:
-	case GL_LUMINANCE: return 0;
-	case GL_INTENSITY: return 1;
-	case GL_ALPHA: return 2;
-	}
-}
-
-static GLuint build_func(GLuint comparefunc)
-{
-	return comparefunc - GL_NEVER;
-}
-
-
-/**
- * Collect all external state that is relevant for compiling the given
- * fragment program.
- */
-static void build_state(
-	r300ContextPtr r300,
-	struct r500_fragment_program *fp,
-	struct r500_fragment_program_external_state *state)
-{
-	int unit;
-
-	_mesa_bzero(state, sizeof(*state));
-
-	for(unit = 0; unit < 16; ++unit) {
-		if (fp->mesa_program.Base.ShadowSamplers & (1 << unit)) {
-			struct gl_texture_object* tex = r300->radeon.glCtx->Texture.Unit[unit]._Current;
-
-			state->unit[unit].depth_texture_mode = build_dtm(tex->DepthMode);
-			state->unit[unit].texture_compare_func = build_func(tex->CompareFunc);
-		}
-	}
-}
-
-static void dump_program(struct r500_fragment_program_code *code);
-
-void r500TranslateFragmentShader(r300ContextPtr r300,
-				 struct r500_fragment_program *fp)
-{
-	struct r500_fragment_program_external_state state;
-
-	build_state(r300, fp, &state);
-	if (_mesa_memcmp(&fp->state, &state, sizeof(state))) {
-		/* TODO: cache compiled programs */
-		fp->translated = GL_FALSE;
-		_mesa_memcpy(&fp->state, &state, sizeof(state));
-	}
-
-	if (!fp->translated) {
-		struct r500_fragment_program_compiler compiler;
-
-		compiler.r300 = r300;
-		compiler.fp = fp;
-		compiler.code = &fp->code;
-		compiler.program = _mesa_clone_program(r300->radeon.glCtx, &fp->mesa_program.Base);
-
-		if (RADEON_DEBUG & DEBUG_PIXEL) {
-			_mesa_printf("Compiler: Initial program:\n");
-			_mesa_print_program(compiler.program);
-		}
-
-		insert_WPOS_trailer(&compiler);
-
-		struct radeon_program_transformation transformations[] = {
-			{ &transform_TEX, &compiler },
-			{ &radeonTransformALU, 0 },
-			{ &radeonTransformDeriv, 0 },
-			{ &radeonTransformTrigScale, 0 }
-		};
-		radeonLocalTransform(r300->radeon.glCtx, compiler.program,
-			4, transformations);
-
-		if (RADEON_DEBUG & DEBUG_PIXEL) {
-			_mesa_printf("Compiler: after native rewrite:\n");
-			_mesa_print_program(compiler.program);
-		}
-
-		struct radeon_nqssadce_descr nqssadce = {
-			.Init = &nqssadce_init,
-			.IsNativeSwizzle = &is_native_swizzle,
-			.BuildSwizzle = &nqssadce_build_swizzle,
-			.RewriteDepthOut = GL_TRUE
-		};
-		radeonNqssaDce(r300->radeon.glCtx, compiler.program, &nqssadce);
-
-		if (RADEON_DEBUG & DEBUG_PIXEL) {
-			_mesa_printf("Compiler: after NqSSA-DCE:\n");
-			_mesa_print_program(compiler.program);
-		}
-
-		fp->translated = r500FragmentProgramEmit(&compiler);
-
-		/* Subtle: Rescue any parameters that have been added during transformations */
-		_mesa_free_parameter_list(fp->mesa_program.Base.Parameters);
-		fp->mesa_program.Base.Parameters = compiler.program->Parameters;
-		compiler.program->Parameters = 0;
-
-		_mesa_reference_program(r300->radeon.glCtx, &compiler.program, 0);
-
-		r300UpdateStateParameters(r300->radeon.glCtx, _NEW_PROGRAM);
-
-		if (RADEON_DEBUG & DEBUG_PIXEL) {
-			if (fp->translated) {
-				_mesa_printf("Machine-readable code:\n");
-				dump_program(&fp->code);
-			}
-		}
-
-	}
-
-	update_params(r300, fp);
-
-}
 
 static char *toswiz(int swiz_val) {
   switch(swiz_val) {
@@ -614,9 +375,9 @@ static char *to_texop(int val)
   return NULL;
 }
 
-static void dump_program(struct r500_fragment_program_code *code)
+void r500FragmentProgramDump(union rX00_fragment_program_code *c)
 {
-
+  struct r500_fragment_program_code *code = &c->r500;
   fprintf(stderr, "R500 Fragment Program:\n--------\n");
 
   int n;
diff --git a/src/mesa/drivers/dri/r300/r500_fragprog.h b/src/mesa/drivers/dri/r300/r500_fragprog.h
index 1e45538f807..1179bf66073 100644
--- a/src/mesa/drivers/dri/r300/r500_fragprog.h
+++ b/src/mesa/drivers/dri/r300/r500_fragprog.h
@@ -33,30 +33,20 @@
 #ifndef __R500_FRAGPROG_H_
 #define __R500_FRAGPROG_H_
 
-#include "main/glheader.h"
-#include "main/macros.h"
-#include "main/enums.h"
 #include "shader/prog_parameter.h"
-#include "shader/prog_print.h"
-#include "shader/program.h"
 #include "shader/prog_instruction.h"
 
 #include "r300_context.h"
-#include "r300_state.h"
-#include "radeon_program.h"
+#include "radeon_nqssadce.h"
 
-struct r500_fragment_program;
+extern GLboolean r500BuildFragmentProgramHwCode(struct r300_fragment_program_compiler *compiler);
 
-extern void r500TranslateFragmentShader(r300ContextPtr r300,
-					struct r500_fragment_program *fp);
+extern void r500FragmentProgramDump(union rX00_fragment_program_code *c);
 
-struct r500_fragment_program_compiler {
-	r300ContextPtr r300;
-	struct r500_fragment_program *fp;
-	struct r500_fragment_program_code *code;
-	struct gl_program *program;
-};
+extern GLboolean r500FPIsNativeSwizzle(GLuint opcode, struct prog_src_register reg);
 
-extern GLboolean r500FragmentProgramEmit(struct r500_fragment_program_compiler *compiler);
+extern void r500FPBuildSwizzle(struct nqssadce_state *s, struct prog_dst_register dst, struct prog_src_register src);
+
+extern GLboolean r500_transform_TEX(struct radeon_transform_context *t, struct prog_instruction* orig_inst, void* data);
 
 #endif
diff --git a/src/mesa/drivers/dri/r300/r500_fragprog_emit.c b/src/mesa/drivers/dri/r300/r500_fragprog_emit.c
index 4631235f0d3..30f4514897e 100644
--- a/src/mesa/drivers/dri/r300/r500_fragprog_emit.c
+++ b/src/mesa/drivers/dri/r300/r500_fragprog_emit.c
@@ -49,8 +49,8 @@
 
 
 #define PROG_CODE \
-	struct r500_fragment_program_compiler *c = (struct r500_fragment_program_compiler*)data; \
-	struct r500_fragment_program_code *code = c->code
+	struct r300_fragment_program_compiler *c = (struct r300_fragment_program_compiler*)data; \
+	struct r500_fragment_program_code *code = &c->code->r500
 
 #define error(fmt, args...) do {			\
 		fprintf(stderr, "%s::%s(): " fmt "\n",	\
@@ -72,7 +72,7 @@ static GLboolean emit_const(void *data, GLuint file, GLuint idx, GLuint *hwindex
 	}
 
 	if (*hwindex >= code->const_nr) {
-		if (*hwindex >= PFS_NUM_CONST_REGS) {
+		if (*hwindex >= R500_PFS_NUM_CONST_REGS) {
 			error("Out of hw constants!\n");
 			return GL_FALSE;
 		}
@@ -299,9 +299,9 @@ static const struct radeon_pair_handler pair_handler = {
 	.MaxHwTemps = 128
 };
 
-GLboolean r500FragmentProgramEmit(struct r500_fragment_program_compiler *compiler)
+GLboolean r500BuildFragmentProgramHwCode(struct r300_fragment_program_compiler *compiler)
 {
-	struct r500_fragment_program_code *code = compiler->code;
+	struct r500_fragment_program_code *code = &compiler->code->r500;
 
 	_mesa_bzero(code, sizeof(*code));
 	code->max_temp_idx = 1;
diff --git a/src/mesa/drivers/dri/r300/radeon_nqssadce.c b/src/mesa/drivers/dri/r300/radeon_nqssadce.c
index a083c3d2436..4a2e1cba402 100644
--- a/src/mesa/drivers/dri/r300/radeon_nqssadce.c
+++ b/src/mesa/drivers/dri/r300/radeon_nqssadce.c
@@ -61,12 +61,12 @@ static struct prog_src_register lmul_swizzle(GLuint swizzle, struct prog_src_reg
 	struct prog_src_register tmp = srcreg;
 	int i;
 	tmp.Swizzle = 0;
-	tmp.NegateBase = 0;
+	tmp.Negate = NEGATE_NONE;
 	for(i = 0; i < 4; ++i) {
 		GLuint swz = GET_SWZ(swizzle, i);
 		if (swz < 4) {
 			tmp.Swizzle |= GET_SWZ(srcreg.Swizzle, swz) << (i*3);
-			tmp.NegateBase |= GET_BIT(srcreg.NegateBase, swz) << i;
+			tmp.Negate |= GET_BIT(srcreg.Negate, swz) << i;
 		} else {
 			tmp.Swizzle |= swz << (i*3);
 		}
@@ -103,9 +103,8 @@ static struct prog_instruction* track_used_srcreg(struct nqssadce_state* s,
 		inst->SrcReg[src].File = PROGRAM_TEMPORARY;
 		inst->SrcReg[src].Index = dstreg.Index;
 		inst->SrcReg[src].Swizzle = 0;
-		inst->SrcReg[src].NegateBase = 0;
+		inst->SrcReg[src].Negate = NEGATE_NONE;
 		inst->SrcReg[src].Abs = 0;
-		inst->SrcReg[src].NegateAbs = 0;
 		for(i = 0; i < 4; ++i) {
 			if (GET_BIT(sourced, i))
 				inst->SrcReg[src].Swizzle |= i << (3*i);
diff --git a/src/mesa/drivers/dri/r300/radeon_program_alu.c b/src/mesa/drivers/dri/r300/radeon_program_alu.c
index 1ef71e74dcf..8283723bad7 100644
--- a/src/mesa/drivers/dri/r300/radeon_program_alu.c
+++ b/src/mesa/drivers/dri/r300/radeon_program_alu.c
@@ -81,18 +81,6 @@ static struct prog_instruction *emit3(struct gl_program* p,
 	return fpi;
 }
 
-static void set_swizzle(struct prog_src_register *SrcReg, int coordinate, int swz)
-{
-	SrcReg->Swizzle &= ~(7 << (3*coordinate));
-	SrcReg->Swizzle |= swz << (3*coordinate);
-}
-
-static void set_negate_base(struct prog_src_register *SrcReg, int coordinate, int negate)
-{
-	SrcReg->NegateBase &= ~(1 << coordinate);
-	SrcReg->NegateBase |= (negate << coordinate);
-}
-
 static struct prog_dst_register dstreg(int file, int index)
 {
 	struct prog_dst_register dst;
@@ -156,15 +144,14 @@ static struct prog_src_register absolute(struct prog_src_register reg)
 {
 	struct prog_src_register newreg = reg;
 	newreg.Abs = 1;
-	newreg.NegateBase = 0;
-	newreg.NegateAbs = 0;
+	newreg.Negate = NEGATE_NONE;
 	return newreg;
 }
 
 static struct prog_src_register negate(struct prog_src_register reg)
 {
 	struct prog_src_register newreg = reg;
-	newreg.NegateAbs = !newreg.NegateAbs;
+	newreg.Negate = newreg.Negate ^ NEGATE_XYZW;
 	return newreg;
 }
 
@@ -189,8 +176,7 @@ static void transform_ABS(struct radeon_transform_context* t,
 {
 	struct prog_src_register src = inst->SrcReg[0];
 	src.Abs = 1;
-	src.NegateBase = 0;
-	src.NegateAbs = 0;
+	src.Negate = NEGATE_NONE;
 	emit1(t->Program, OPCODE_MOV, inst->SaturateMode, inst->DstReg, src);
 }
 
@@ -198,18 +184,9 @@ static void transform_DPH(struct radeon_transform_context* t,
 	struct prog_instruction* inst)
 {
 	struct prog_src_register src0 = inst->SrcReg[0];
-	if (src0.NegateAbs) {
-		if (src0.Abs) {
-			int tempreg = radeonFindFreeTemporary(t);
-			emit1(t->Program, OPCODE_MOV, 0, dstreg(PROGRAM_TEMPORARY, tempreg), src0);
-			src0 = srcreg(src0.File, src0.Index);
-		} else {
-			src0.NegateAbs = 0;
-			src0.NegateBase ^= NEGATE_XYZW;
-		}
-	}
-	set_swizzle(&src0, 3, SWIZZLE_ONE);
-	set_negate_base(&src0, 3, 0);
+	src0.Negate &= ~NEGATE_W;
+	src0.Swizzle &= ~(7 << (3 * 3));
+	src0.Swizzle |= SWIZZLE_ONE << (3 * 3);
 	emit2(t->Program, OPCODE_DP4, inst->SaturateMode, inst->DstReg, src0, inst->SrcReg[1]);
 }
 
@@ -649,7 +626,7 @@ GLboolean radeonTransformDeriv(struct radeon_transform_context* t,
 
 	B.Swizzle = MAKE_SWIZZLE4(SWIZZLE_ONE, SWIZZLE_ONE,
 						SWIZZLE_ONE, SWIZZLE_ONE);
-	B.NegateBase = NEGATE_XYZW;
+	B.Negate = NEGATE_XYZW;
 
 	emit2(t->Program, inst->Opcode, inst->SaturateMode, inst->DstReg,
 		inst->SrcReg[0], B);
diff --git a/src/mesa/drivers/dri/r300/radeon_program_pair.c b/src/mesa/drivers/dri/r300/radeon_program_pair.c
index 49aa90dd94a..906d36e5226 100644
--- a/src/mesa/drivers/dri/r300/radeon_program_pair.c
+++ b/src/mesa/drivers/dri/r300/radeon_program_pair.c
@@ -255,8 +255,7 @@ static void final_rewrite(struct pair_state *s, struct prog_instruction *inst)
 		inst->SrcReg[2] = inst->SrcReg[1];
 		inst->SrcReg[1].File = PROGRAM_BUILTIN;
 		inst->SrcReg[1].Swizzle = SWIZZLE_1111;
-		inst->SrcReg[1].NegateBase = 0;
-		inst->SrcReg[1].NegateAbs = 0;
+		inst->SrcReg[1].Negate = NEGATE_NONE;
 		inst->Opcode = OPCODE_MAD;
 		break;
 	case OPCODE_CMP:
@@ -610,6 +609,7 @@ static void emit_all_tex(struct pair_state *s)
 		if (s->Debug) {
 			_mesa_printf("   ");
 			_mesa_print_instruction(inst);
+			fflush(stdout);
 		}
 		s->Error = s->Error || !s->Handler->EmitTex(s->UserData, inst);
 	}
@@ -722,7 +722,6 @@ static GLboolean fill_instruction_into_pair(struct pair_state *s, struct radeon_
 		if (pairinst->NeedRGB && !pairinst->IsTranscendent) {
 			GLboolean srcrgb = GL_FALSE;
 			GLboolean srcalpha = GL_FALSE;
-			GLuint negatebase = 0;
 			int j;
 			for(j = 0; j < 3; ++j) {
 				GLuint swz = GET_SWZ(inst->SrcReg[i].Swizzle, j);
@@ -730,8 +729,6 @@ static GLboolean fill_instruction_into_pair(struct pair_state *s, struct radeon_
 					srcrgb = GL_TRUE;
 				else if (swz < 4)
 					srcalpha = GL_TRUE;
-				if (swz != SWIZZLE_NIL && GET_BIT(inst->SrcReg[i].NegateBase, j))
-					negatebase = 1;
 			}
 			source = alloc_pair_source(s, pair, inst->SrcReg[i], srcrgb, srcalpha);
 			if (source < 0)
@@ -739,12 +736,11 @@ static GLboolean fill_instruction_into_pair(struct pair_state *s, struct radeon_
 			pair->RGB.Arg[i].Source = source;
 			pair->RGB.Arg[i].Swizzle = inst->SrcReg[i].Swizzle & 0x1ff;
 			pair->RGB.Arg[i].Abs = inst->SrcReg[i].Abs;
-			pair->RGB.Arg[i].Negate = (negatebase & ~pair->RGB.Arg[i].Abs) ^ inst->SrcReg[i].NegateAbs;
+			pair->RGB.Arg[i].Negate = !!(inst->SrcReg[i].Negate & (NEGATE_X | NEGATE_Y | NEGATE_Z));
 		}
 		if (pairinst->NeedAlpha) {
 			GLboolean srcrgb = GL_FALSE;
 			GLboolean srcalpha = GL_FALSE;
-			GLuint negatebase = GET_BIT(inst->SrcReg[i].NegateBase, pairinst->IsTranscendent ? 0 : 3);
 			GLuint swz = GET_SWZ(inst->SrcReg[i].Swizzle, pairinst->IsTranscendent ? 0 : 3);
 			if (swz < 3)
 				srcrgb = GL_TRUE;
@@ -756,7 +752,7 @@ static GLboolean fill_instruction_into_pair(struct pair_state *s, struct radeon_
 			pair->Alpha.Arg[i].Source = source;
 			pair->Alpha.Arg[i].Swizzle = swz;
 			pair->Alpha.Arg[i].Abs = inst->SrcReg[i].Abs;
-			pair->Alpha.Arg[i].Negate = (negatebase & ~pair->RGB.Arg[i].Abs) ^ inst->SrcReg[i].NegateAbs;
+			pair->Alpha.Arg[i].Negate = !!(inst->SrcReg[i].Negate & NEGATE_W);
 		}
 	}
 
diff --git a/src/mesa/drivers/dri/r600/r700_assembler.c b/src/mesa/drivers/dri/r600/r700_assembler.c
index 9f1d1a3a448..4666518824f 100644
--- a/src/mesa/drivers/dri/r600/r700_assembler.c
+++ b/src/mesa/drivers/dri/r600/r700_assembler.c
@@ -1057,10 +1057,10 @@ GLboolean assemble_src(r700_AssemblerBase *pAsm,
     pAsm->S[fld].src.swizzlez = (pILInst->SrcReg[src].Swizzle >> 6) & 0x7;
     pAsm->S[fld].src.swizzlew = (pILInst->SrcReg[src].Swizzle >> 9) & 0x7;
 
-    pAsm->S[fld].src.negx = pILInst->SrcReg[src].NegateBase & 0x1;
-    pAsm->S[fld].src.negy = (pILInst->SrcReg[src].NegateBase >> 1) & 0x1;
-    pAsm->S[fld].src.negz = (pILInst->SrcReg[src].NegateBase >> 2) & 0x1;
-    pAsm->S[fld].src.negw = (pILInst->SrcReg[src].NegateBase >> 3) & 0x1;
+    pAsm->S[fld].src.negx = pILInst->SrcReg[src].Negate & 0x1;
+    pAsm->S[fld].src.negy = (pILInst->SrcReg[src].Negate >> 1) & 0x1;
+    pAsm->S[fld].src.negz = (pILInst->SrcReg[src].Negate >> 2) & 0x1;
+    pAsm->S[fld].src.negw = (pILInst->SrcReg[src].Negate >> 3) & 0x1;
      
     return GL_TRUE;
 }
@@ -1197,10 +1197,10 @@ GLboolean tex_src(r700_AssemblerBase *pAsm)
     pAsm->S[0].src.swizzlez = (pILInst->SrcReg[0].Swizzle >> 6) & 0x7;
     pAsm->S[0].src.swizzlew = (pILInst->SrcReg[0].Swizzle >> 9) & 0x7;
 
-    pAsm->S[0].src.negx = pILInst->SrcReg[0].NegateBase & 0x1;
-    pAsm->S[0].src.negy = (pILInst->SrcReg[0].NegateBase >> 1) & 0x1;
-    pAsm->S[0].src.negz = (pILInst->SrcReg[0].NegateBase >> 2) & 0x1;
-    pAsm->S[0].src.negw = (pILInst->SrcReg[0].NegateBase >> 3) & 0x1;
+    pAsm->S[0].src.negx = pILInst->SrcReg[0].Negate & 0x1;
+    pAsm->S[0].src.negy = (pILInst->SrcReg[0].Negate >> 1) & 0x1;
+    pAsm->S[0].src.negz = (pILInst->SrcReg[0].Negate >> 2) & 0x1;
+    pAsm->S[0].src.negw = (pILInst->SrcReg[0].Negate >> 3) & 0x1;
      
     return GL_TRUE;
 }
diff --git a/src/mesa/drivers/dri/r600/r700_clear.c b/src/mesa/drivers/dri/r600/r700_clear.c
index b62aa66eb29..f142aafd03e 100644
--- a/src/mesa/drivers/dri/r600/r700_clear.c
+++ b/src/mesa/drivers/dri/r600/r700_clear.c
@@ -57,7 +57,7 @@ static void r700UserClear(GLcontext *ctx, GLuint mask)
 void r700Clear(GLcontext * ctx, GLbitfield mask)
 {
     context_t *context = R700_CONTEXT(ctx);
-    __DRIdrawablePrivate *dPriv = context->radeon.dri.drawable;
+    __DRIdrawablePrivate *dPriv = radeon_get_drawable(&context->radeon);
     const GLuint colorMask = *((GLuint *) & ctx->Color.ColorMask);
     GLbitfield swrast_mask = 0, tri_mask = 0;
     int i;
diff --git a/src/mesa/drivers/dri/r600/r700_state.c b/src/mesa/drivers/dri/r600/r700_state.c
index 404e946f7ca..329a4aa179a 100644
--- a/src/mesa/drivers/dri/r600/r700_state.c
+++ b/src/mesa/drivers/dri/r600/r700_state.c
@@ -527,7 +527,7 @@ static void r700UpdateWindow(GLcontext * ctx, int id) //--------------------
 
 	context_t *context = R700_CONTEXT(ctx);
 	R700_CHIP_CONTEXT *r700 = (R700_CHIP_CONTEXT*)(&context->hw);
-	__DRIdrawablePrivate *dPriv = context->radeon.dri.drawable;
+	__DRIdrawablePrivate *dPriv = radeon_get_drawable(&context->radeon);
 	GLfloat xoffset = dPriv ? (GLfloat) dPriv->x : 0;
 	GLfloat yoffset = dPriv ? (GLfloat) dPriv->y + dPriv->h : 0;
 	const GLfloat *v = ctx->Viewport._WindowMap.m;
diff --git a/src/mesa/drivers/dri/radeon/radeon_bo_legacy.c b/src/mesa/drivers/dri/radeon/radeon_bo_legacy.c
index 670fa5b6047..0b0a2aa2c46 100644
--- a/src/mesa/drivers/dri/radeon/radeon_bo_legacy.c
+++ b/src/mesa/drivers/dri/radeon/radeon_bo_legacy.c
@@ -48,6 +48,7 @@
 #include "radeon_drm.h"
 #include "radeon_common.h"
 #include "radeon_bocs_wrapper.h"
+#include "radeon_macros.h"
 
 /* no seriously texmem.c is this screwed up */
 struct bo_legacy_texture_object {
@@ -167,6 +168,7 @@ static int legacy_free_handle(struct bo_manager_legacy *bom, uint32_t handle)
 static void legacy_get_current_age(struct bo_manager_legacy *boml)
 {
     drm_radeon_getparam_t gp;
+    unsigned char *RADEONMMIO = NULL;
     int r;
 
     if (IS_R300_CLASS(boml->screen)) {
@@ -178,8 +180,11 @@ static void legacy_get_current_age(struct bo_manager_legacy *boml)
        	 fprintf(stderr, "%s: drmRadeonGetParam: %d\n", __FUNCTION__, r);
          exit(1);
        }
-    } else
-	boml->current_age = boml->screen->scratch[3];
+    } else {
+        RADEONMMIO = boml->screen->mmio.map;
+        boml->current_age = boml->screen->scratch[3];
+        boml->current_age = INREG(RADEON_GUI_SCRATCH_REG3);
+    }
 }
 
 static int legacy_is_pending(struct radeon_bo *bo)
diff --git a/src/mesa/drivers/dri/radeon/radeon_common.c b/src/mesa/drivers/dri/radeon/radeon_common.c
index 691615938f5..60d6bbb5af9 100644
--- a/src/mesa/drivers/dri/radeon/radeon_common.c
+++ b/src/mesa/drivers/dri/radeon/radeon_common.c
@@ -62,6 +62,7 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "main/bufferobj.h"
 #include "main/buffers.h"
 #include "main/depth.h"
+#include "main/polygon.h"
 #include "main/shaders.h"
 #include "main/texstate.h"
 #include "main/varray.h"
@@ -152,7 +153,7 @@ void radeon_get_cliprects(radeonContextPtr radeon,
 			  unsigned int *num_cliprects,
 			  int *x_off, int *y_off)
 {
-	__DRIdrawablePrivate *dPriv = radeon->dri.drawable;
+	__DRIdrawablePrivate *dPriv = radeon_get_drawable(radeon);
 	struct radeon_framebuffer *rfb = dPriv->driverPrivate;
 
 	if (radeon->constant_cliprect) {
@@ -184,15 +185,15 @@ void radeon_get_cliprects(radeonContextPtr radeon,
  */
 void radeonSetCliprects(radeonContextPtr radeon)
 {
-	__DRIdrawablePrivate *const drawable = radeon->dri.drawable;
-	__DRIdrawablePrivate *const readable = radeon->dri.readable;
+	__DRIdrawablePrivate *const drawable = radeon_get_drawable(radeon);
+	__DRIdrawablePrivate *const readable = radeon_get_readable(radeon);
 	struct radeon_framebuffer *const draw_rfb = drawable->driverPrivate;
 	struct radeon_framebuffer *const read_rfb = readable->driverPrivate;
 	int x_off, y_off;
 
 	radeon_get_cliprects(radeon, &radeon->pClipRects,
 			     &radeon->numClipRects, &x_off, &y_off);
-	
+
 	if ((draw_rfb->base.Width != drawable->w) ||
 	    (draw_rfb->base.Height != drawable->h)) {
 		_mesa_resize_framebuffer(radeon->glCtx, &draw_rfb->base,
@@ -220,9 +221,9 @@ void radeonUpdateScissor( GLcontext *ctx )
 {
 	radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
 
-	if ( rmesa->dri.drawable ) {
-		__DRIdrawablePrivate *dPriv = rmesa->dri.drawable;
-      
+	if ( radeon_get_drawable(rmesa) ) {
+		__DRIdrawablePrivate *dPriv = radeon_get_drawable(rmesa);
+
 		int x = ctx->Scissor.X;
 		int y = dPriv->h - ctx->Scissor.Y - ctx->Scissor.Height;
 		int w = ctx->Scissor.X + ctx->Scissor.Width - 1;
@@ -424,11 +425,11 @@ void radeonCopyBuffer( __DRIdrawablePrivate *dPriv,
 	radeonContextPtr rmesa;
 	struct radeon_framebuffer *rfb;
 	GLint nbox, i, ret;
-   
+
 	assert(dPriv);
 	assert(dPriv->driContextPriv);
 	assert(dPriv->driContextPriv->driverPrivate);
-   
+
 	rmesa = (radeonContextPtr) dPriv->driContextPriv->driverPrivate;
 
 	LOCK_HARDWARE(rmesa);
@@ -505,7 +506,7 @@ static int radeonScheduleSwap(__DRIdrawablePrivate *dPriv, GLboolean *missed_tar
 
 	UNLOCK_HARDWARE(rmesa);
 	driWaitForVBlank(dPriv, missed_target);
-	
+
 	return 0;
 }
 
@@ -539,7 +540,7 @@ static GLboolean radeonPageFlip( __DRIdrawablePrivate *dPriv )
 	radeon->sarea->nbox = 1;
 
 	ret = drmCommandNone( radeon->dri.fd, DRM_RADEON_FLIP );
-	
+
 	UNLOCK_HARDWARE(radeon);
 
 	if ( ret ) {
@@ -637,7 +638,7 @@ void radeon_draw_buffer(GLcontext *ctx, struct gl_framebuffer *fb)
 	struct radeon_renderbuffer *rrbDepth = NULL, *rrbStencil = NULL,
 		*rrbColor = NULL;
 	uint32_t offset = 0;
-       
+
 
 	if (!fb) {
 		/* this can happen during the initial context initialization */
@@ -649,7 +650,7 @@ void radeon_draw_buffer(GLcontext *ctx, struct gl_framebuffer *fb)
 		radeon->vtbl.fallback(ctx, RADEON_FALLBACK_DRAW_BUFFER, GL_TRUE);
 		return;
 	}
-		
+
 	/* Do this here, note core Mesa, since this function is called from
 	 * many places within the driver.
 	 */
@@ -677,6 +678,7 @@ void radeon_draw_buffer(GLcontext *ctx, struct gl_framebuffer *fb)
 		if (fb->_ColorDrawBufferIndexes[0] == BUFFER_FRONT_LEFT) {
 			rrbColor = radeon_renderbuffer(fb->Attachment[BUFFER_FRONT_LEFT].Renderbuffer);
 			radeon->front_cliprects = GL_TRUE;
+			radeon->front_buffer_dirty = GL_TRUE;
 		} else {
 			rrbColor = radeon_renderbuffer(fb->Attachment[BUFFER_BACK_LEFT].Renderbuffer);
 			radeon->front_cliprects = GL_FALSE;
@@ -735,21 +737,23 @@ void radeon_draw_buffer(GLcontext *ctx, struct gl_framebuffer *fb)
 		ctx->Driver.FrontFace(ctx, ctx->Polygon.FrontFace);
 	else
 		ctx->NewState |= _NEW_POLYGON;
-	
+
 	/*
 	 * Update depth test state
 	 */
 	if (ctx->Driver.Enable) {
 		ctx->Driver.Enable(ctx, GL_DEPTH_TEST,
 				   (ctx->Depth.Test && fb->Visual.depthBits > 0));
+		/* Need to update the derived ctx->Stencil._Enabled first */
+		_mesa_update_stencil(ctx);
 		ctx->Driver.Enable(ctx, GL_STENCIL_TEST,
 				   (ctx->Stencil._Enabled && fb->Visual.stencilBits > 0));
 	} else {
 		ctx->NewState |= (_NEW_DEPTH | _NEW_STENCIL);
 	}
-	
-	radeon->state.depth.rrb = rrbDepth;
-	radeon->state.color.rrb = rrbColor;
+
+	_mesa_reference_renderbuffer(&radeon->state.depth.rb, &rrbDepth->base);
+	_mesa_reference_renderbuffer(&radeon->state.color.rb, &rrbColor->base);
 	radeon->state.color.draw_offset = offset;
 
 #if 0
@@ -758,16 +762,14 @@ void radeon_draw_buffer(GLcontext *ctx, struct gl_framebuffer *fb)
 		ctx->Driver.Viewport(ctx, ctx->Viewport.X, ctx->Viewport.Y,
 				     ctx->Viewport.Width, ctx->Viewport.Height);
 	} else {
-	
+
 	}
 #endif
 	ctx->NewState |= _NEW_VIEWPORT;
 
 	/* Set state we know depends on drawable parameters:
 	 */
-	if (ctx->Driver.Scissor)
-		ctx->Driver.Scissor(ctx, ctx->Scissor.X, ctx->Scissor.Y,
-				    ctx->Scissor.Width, ctx->Scissor.Height);
+	radeonUpdateScissor(ctx);
 	radeon->NewGLState |= _NEW_SCISSOR;
 
 	if (ctx->Driver.DepthRange)
@@ -792,7 +794,25 @@ void radeonDrawBuffer( GLcontext *ctx, GLenum mode )
 	if (RADEON_DEBUG & DEBUG_DRI)
 		fprintf(stderr, "%s %s\n", __FUNCTION__,
 			_mesa_lookup_enum_by_nr( mode ));
-	
+
+	if (ctx->DrawBuffer->Name == 0) {
+		radeonContextPtr radeon = RADEON_CONTEXT(ctx);
+
+		const GLboolean was_front_buffer_rendering =
+			radeon->is_front_buffer_rendering;
+
+		radeon->is_front_buffer_rendering = (mode == GL_FRONT_LEFT) ||
+                                            (mode == GL_FRONT);
+
+      /* If we weren't front-buffer rendering before but we are now, make sure
+       * that the front-buffer has actually been allocated.
+       */
+		if (!was_front_buffer_rendering && radeon->is_front_buffer_rendering) {
+			radeon_update_renderbuffers(radeon->dri.context,
+				radeon->dri.context->driDrawablePriv);
+      }
+	}
+
 	radeon_draw_buffer(ctx, ctx->DrawBuffer);
 }
 
@@ -814,7 +834,7 @@ void radeonReadBuffer( GLcontext *ctx, GLenum mode )
  */
 void radeonUpdatePageFlipping(radeonContextPtr radeon)
 {
-	struct radeon_framebuffer *rfb = radeon->dri.drawable->driverPrivate;
+	struct radeon_framebuffer *rfb = radeon_get_drawable(radeon)->driverPrivate;
 
 	rfb->pf_active = radeon->sarea->pfState;
 	rfb->pf_current_page = radeon->sarea->pfCurrentPage;
@@ -847,24 +867,60 @@ void radeon_viewport(GLcontext *ctx, GLint x, GLint y, GLsizei width, GLsizei he
 
 	old_viewport = ctx->Driver.Viewport;
 	ctx->Driver.Viewport = NULL;
-	radeon->dri.drawable = driContext->driDrawablePriv;
 	radeon_window_moved(radeon);
 	radeon_draw_buffer(ctx, radeon->glCtx->DrawBuffer);
 	ctx->Driver.Viewport = old_viewport;
-
-
 }
-static void radeon_print_state_atom(radeonContextPtr radeon, struct radeon_state_atom *state )
-{
-	int i;
-	int dwords = (*state->check)(radeon->glCtx, state);
-
-	fprintf(stderr, "emit %s %d/%d\n", state->name, state->cmd_size, dwords);
 
-	if (RADEON_DEBUG & DEBUG_VERBOSE) 
-		for (i = 0 ; i < dwords; i++) 
-			fprintf(stderr, "\t%s[%d]: %x\n", state->name, i, state->cmd[i]);
+static void radeon_print_state_atom(radeonContextPtr radeon, struct radeon_state_atom *state)
+{
+	int i, j, reg;
+	int dwords = (*state->check) (radeon->glCtx, state);
+	drm_r300_cmd_header_t cmd;
+
+	fprintf(stderr, "  emit %s %d/%d\n", state->name, dwords, state->cmd_size);
+
+	if (RADEON_DEBUG & DEBUG_VERBOSE) {
+		for (i = 0; i < dwords;) {
+			cmd = *((drm_r300_cmd_header_t *) &state->cmd[i]);
+			reg = (cmd.packet0.reghi << 8) | cmd.packet0.reglo;
+			fprintf(stderr, "      %s[%d]: cmdpacket0 (first reg=0x%04x, count=%d)\n",
+					state->name, i, reg, cmd.packet0.count);
+			++i;
+			for (j = 0; j < cmd.packet0.count && i < dwords; j++) {
+				fprintf(stderr, "      %s[%d]: 0x%04x = %08x\n",
+						state->name, i, reg, state->cmd[i]);
+				reg += 4;
+				++i;
+			}
+		}
+	}
+}
 
+static void radeon_print_state_atom_kmm(radeonContextPtr radeon, struct radeon_state_atom *state)
+{
+	int i, j, reg, count;
+	int dwords = (*state->check) (radeon->glCtx, state);
+	uint32_t packet0;
+
+	fprintf(stderr, "  emit %s %d/%d\n", state->name, dwords, state->cmd_size);
+
+	if (RADEON_DEBUG & DEBUG_VERBOSE) {
+		for (i = 0; i < dwords;) {
+			packet0 = state->cmd[i];
+			reg = (packet0 & 0x1FFF) << 2;
+			count = ((packet0 & 0x3FFF0000) >> 16) + 1;
+			fprintf(stderr, "      %s[%d]: cmdpacket0 (first reg=0x%04x, count=%d)\n",
+					state->name, i, reg, count);
+			++i;
+			for (j = 0; j < count && i < dwords; j++) {
+				fprintf(stderr, "      %s[%d]: 0x%04x = %08x\n",
+						state->name, i, reg, state->cmd[i]);
+				reg += 4;
+				++i;
+			}
+		}
+	}
 }
 
 static INLINE void radeonEmitAtoms(radeonContextPtr radeon, GLboolean dirty)
@@ -882,7 +938,10 @@ static INLINE void radeonEmitAtoms(radeonContextPtr radeon, GLboolean dirty)
 			dwords = (*atom->check) (radeon->glCtx, atom);
 			if (dwords) {
 				if (DEBUG_CMDBUF && RADEON_DEBUG & DEBUG_STATE) {
-					radeon_print_state_atom(radeon, atom);
+					if (radeon->radeonScreen->kernel_mm)
+						radeon_print_state_atom_kmm(radeon, atom);
+					else
+						radeon_print_state_atom(radeon, atom);
 				}
 				if (atom->emit) {
 					(*atom->emit)(radeon->glCtx, atom);
@@ -900,7 +959,7 @@ static INLINE void radeonEmitAtoms(radeonContextPtr radeon, GLboolean dirty)
 			}
 		}
 	}
-   
+
 	COMMIT_BATCH();
 }
 
@@ -928,6 +987,7 @@ void radeon_validate_reset_bos(radeonContextPtr radeon)
 	int i;
 
 	for (i = 0; i < radeon->state.validated_bo_count; i++) {
+		radeon_bo_unref(radeon->state.bos[i].bo);
 		radeon->state.bos[i].bo = NULL;
 		radeon->state.bos[i].read_domains = 0;
 		radeon->state.bos[i].write_domain = 0;
@@ -938,6 +998,7 @@ void radeon_validate_reset_bos(radeonContextPtr radeon)
 
 void radeon_validate_bo(radeonContextPtr radeon, struct radeon_bo *bo, uint32_t read_domains, uint32_t write_domain)
 {
+	radeon_bo_ref(bo);
 	radeon->state.bos[radeon->state.validated_bo_count].bo = bo;
 	radeon->state.bos[radeon->state.validated_bo_count].read_domains = read_domains;
 	radeon->state.bos[radeon->state.validated_bo_count].write_domain = write_domain;
@@ -968,7 +1029,7 @@ void radeonEmitState(radeonContextPtr radeon)
 	if (!radeon->cmdbuf.cs->cdw) {
 		if (RADEON_DEBUG & DEBUG_STATE)
 			fprintf(stderr, "Begin reemit state\n");
-		
+
 		radeonEmitAtoms(radeon, GL_FALSE);
 	}
 
@@ -1000,9 +1061,29 @@ void radeonFlush(GLcontext *ctx)
 		radeon->dma.flush( ctx );
 
 	radeonEmitState(radeon);
-   
+
 	if (radeon->cmdbuf.cs->cdw)
 		rcommonFlushCmdBuf(radeon, __FUNCTION__);
+
+	if ((ctx->DrawBuffer->Name == 0) && radeon->front_buffer_dirty) {
+		__DRIscreen *const screen = radeon->radeonScreen->driScreen;
+
+		if (screen->dri2.loader && (screen->dri2.loader->base.version >= 2)
+			&& (screen->dri2.loader->flushFrontBuffer != NULL)) {
+			__DRIdrawablePrivate * drawable = radeon_get_drawable(radeon);
+			(*screen->dri2.loader->flushFrontBuffer)(drawable, drawable->loaderPrivate);
+
+			/* Only clear the dirty bit if front-buffer rendering is no longer
+			 * enabled.  This is done so that the dirty bit can only be set in
+			 * glDrawBuffer.  Otherwise the dirty bit would have to be set at
+			 * each of N places that do rendering.  This has worse performances,
+			 * but it is much easier to get correct.
+			 */
+			if (radeon->is_front_buffer_rendering) {
+				radeon->front_buffer_dirty = GL_FALSE;
+			}
+		}
+	}
 }
 
 /* Make sure all commands have been sent to the hardware and have
@@ -1077,7 +1158,7 @@ int rcommonFlushCmdBuf(radeonContextPtr rmesa, const char *caller)
 	int ret;
 
 	radeonReleaseDmaRegion(rmesa);
-	
+
 	LOCK_HARDWARE(rmesa);
 	ret = rcommonFlushCmdBufLocked(rmesa, caller);
 	UNLOCK_HARDWARE(rmesa);
@@ -1139,12 +1220,12 @@ void rcommonInitCmdBuf(radeonContextPtr rmesa)
 	rmesa->cmdbuf.cs = radeon_cs_create(rmesa->cmdbuf.csm, size);
 	assert(rmesa->cmdbuf.cs != NULL);
 	rmesa->cmdbuf.size = size;
-	
+
 	if (!rmesa->radeonScreen->kernel_mm) {
 		radeon_cs_set_limit(rmesa->cmdbuf.cs, RADEON_GEM_DOMAIN_VRAM, rmesa->radeonScreen->texSize[0]);
 		radeon_cs_set_limit(rmesa->cmdbuf.cs, RADEON_GEM_DOMAIN_GTT, rmesa->radeonScreen->gartTextures.size);
 	} else {
-		struct drm_radeon_gem_info mminfo;
+		struct drm_radeon_gem_info mminfo = { 0 };
 
 		if (!drmCommandWriteRead(rmesa->dri.fd, DRM_RADEON_GEM_INFO, &mminfo, sizeof(mminfo)))
 		{
@@ -1250,18 +1331,19 @@ void radeon_clear_tris(GLcontext *ctx, GLbitfield mask)
    unsigned int saved_active_texture;
 
    assert((mask & ~(TRI_CLEAR_COLOR_BITS | BUFFER_BIT_DEPTH |
-		    BUFFER_BIT_STENCIL)) == 0);   
+		    BUFFER_BIT_STENCIL)) == 0);
 
    _mesa_PushAttrib(GL_COLOR_BUFFER_BIT |
 		    GL_CURRENT_BIT |
 		    GL_DEPTH_BUFFER_BIT |
 		    GL_ENABLE_BIT |
+		    GL_POLYGON_BIT |
 		    GL_STENCIL_BUFFER_BIT |
 		    GL_TRANSFORM_BIT |
 		    GL_CURRENT_BIT);
    _mesa_PushClientAttrib(GL_CLIENT_VERTEX_ARRAY_BIT);
    saved_active_texture = ctx->Texture.CurrentUnit;
-  
+
   /* Disable existing GL state we don't want to apply to a clear. */
    _mesa_Disable(GL_ALPHA_TEST);
    _mesa_Disable(GL_BLEND);
@@ -1277,6 +1359,7 @@ void radeon_clear_tris(GLcontext *ctx, GLbitfield mask)
    _mesa_Disable(GL_CLIP_PLANE3);
    _mesa_Disable(GL_CLIP_PLANE4);
    _mesa_Disable(GL_CLIP_PLANE5);
+   _mesa_PolygonMode(GL_FRONT_AND_BACK, GL_FILL);
    if (ctx->Extensions.ARB_fragment_program && ctx->FragmentProgram.Enabled) {
       saved_fp_enable = GL_TRUE;
       _mesa_Disable(GL_FRAGMENT_PROGRAM_ARB);
@@ -1289,10 +1372,10 @@ void radeon_clear_tris(GLcontext *ctx, GLbitfield mask)
       saved_shader_program = ctx->Shader.CurrentProgram->Name;
       _mesa_UseProgramObjectARB(0);
    }
-   
+
    if (ctx->Texture._EnabledUnits != 0) {
       int i;
-      
+
       for (i = 0; i < ctx->Const.MaxTextureUnits; i++) {
 	 _mesa_ActiveTextureARB(GL_TEXTURE0 + i);
 	 _mesa_Disable(GL_TEXTURE_1D);
@@ -1308,9 +1391,14 @@ void radeon_clear_tris(GLcontext *ctx, GLbitfield mask)
 	 }
       }
    }
-  
+
+#if FEATURE_ARB_vertex_buffer_object
+   _mesa_BindBufferARB(GL_ARRAY_BUFFER_ARB, 0);
+   _mesa_BindBufferARB(GL_ELEMENT_ARRAY_BUFFER_ARB, 0);
+#endif
+
    radeon_meta_set_passthrough_transform(rmesa);
-   
+
    for (i = 0; i < 4; i++) {
       color[i][0] = ctx->Color.ClearColor[0];
       color[i][1] = ctx->Color.ClearColor[1];
@@ -1380,7 +1468,7 @@ void radeon_clear_tris(GLcontext *ctx, GLbitfield mask)
       if (this_mask & BUFFER_BIT_STENCIL) {
 	 _mesa_Enable(GL_STENCIL_TEST);
 	 _mesa_StencilOp(GL_REPLACE, GL_REPLACE, GL_REPLACE);
-	 _mesa_StencilFuncSeparate(GL_FRONT, GL_ALWAYS, ctx->Stencil.Clear,
+	 _mesa_StencilFuncSeparate(GL_FRONT_AND_BACK, GL_ALWAYS, ctx->Stencil.Clear,
 				   ctx->Stencil.WriteMask[0]);
       } else {
 	 _mesa_Disable(GL_STENCIL_TEST);
diff --git a/src/mesa/drivers/dri/radeon/radeon_common.h b/src/mesa/drivers/dri/radeon/radeon_common.h
index aa6058c6842..2cefb53fe04 100644
--- a/src/mesa/drivers/dri/radeon/radeon_common.h
+++ b/src/mesa/drivers/dri/radeon/radeon_common.h
@@ -82,7 +82,7 @@ static inline struct radeon_renderbuffer *radeon_get_renderbuffer(struct gl_fram
 static inline struct radeon_renderbuffer *radeon_get_depthbuffer(radeonContextPtr rmesa)
 {
 	struct radeon_renderbuffer *rrb;
-	rrb = rmesa->state.depth.rrb;
+	rrb = radeon_renderbuffer(rmesa->state.depth.rb);
 	if (!rrb)
 		return NULL;
 
@@ -93,7 +93,7 @@ static inline struct radeon_renderbuffer *radeon_get_colorbuffer(radeonContextPt
 {
 	struct radeon_renderbuffer *rrb;
 
-	rrb = rmesa->state.color.rrb;
+	rrb = radeon_renderbuffer(rmesa->state.color.rb);
 	if (!rrb)
 		return NULL;
 	return rrb;
diff --git a/src/mesa/drivers/dri/radeon/radeon_common_context.c b/src/mesa/drivers/dri/radeon/radeon_common_context.c
index 1e900865f02..4bf006c7a55 100644
--- a/src/mesa/drivers/dri/radeon/radeon_common_context.c
+++ b/src/mesa/drivers/dri/radeon/radeon_common_context.c
@@ -39,6 +39,7 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "drirenderbuffer.h"
 #include "main/context.h"
 #include "main/framebuffer.h"
+#include "main/renderbuffer.h"
 #include "main/state.h"
 #include "main/simple_list.h"
 #include "swrast/swrast.h"
@@ -51,6 +52,40 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 int RADEON_DEBUG = (0);
 #endif
 
+
+static const char* get_chip_family_name(int chip_family)
+{
+	switch(chip_family) {
+	case CHIP_FAMILY_R100: return "R100";
+	case CHIP_FAMILY_RV100: return "RV100";
+	case CHIP_FAMILY_RS100: return "RS100";
+	case CHIP_FAMILY_RV200: return "RV200";
+	case CHIP_FAMILY_RS200: return "RS200";
+	case CHIP_FAMILY_R200: return "R200";
+	case CHIP_FAMILY_RV250: return "RV250";
+	case CHIP_FAMILY_RS300: return "RS300";
+	case CHIP_FAMILY_RV280: return "RV280";
+	case CHIP_FAMILY_R300: return "R300";
+	case CHIP_FAMILY_R350: return "R350";
+	case CHIP_FAMILY_RV350: return "RV350";
+	case CHIP_FAMILY_RV380: return "RV380";
+	case CHIP_FAMILY_R420: return "R420";
+	case CHIP_FAMILY_RV410: return "RV410";
+	case CHIP_FAMILY_RS400: return "RS400";
+	case CHIP_FAMILY_RS600: return "RS600";
+	case CHIP_FAMILY_RS690: return "RS690";
+	case CHIP_FAMILY_RS740: return "RS740";
+	case CHIP_FAMILY_RV515: return "RV515";
+	case CHIP_FAMILY_R520: return "R520";
+	case CHIP_FAMILY_RV530: return "RV530";
+	case CHIP_FAMILY_R580: return "R580";
+	case CHIP_FAMILY_RV560: return "RV560";
+	case CHIP_FAMILY_RV570: return "RV570";
+	default: return "unknown";
+	}
+}
+
+
 /* Return various strings for glGetString().
  */
 static const GLubyte *radeonGetString(GLcontext * ctx, GLenum name)
@@ -72,18 +107,24 @@ static const GLubyte *radeonGetString(GLcontext * ctx, GLenum name)
 		unsigned offset;
 		GLuint agp_mode = (radeon->radeonScreen->card_type==RADEON_CARD_PCI) ? 0 :
 			radeon->radeonScreen->AGPMode;
-		const char* chipname;
+		const char* chipclass;
+		char hardwarename[32];
 
 		if (IS_R600_CLASS(radeon->radeonScreen))
-			chipname = "R600";
+			chipclass = "R600";
 		else if (IS_R300_CLASS(radeon->radeonScreen))
-			chipname = "R300";
+			chipclass = "R300";
 		else if (IS_R200_CLASS(radeon->radeonScreen))
-			chipname = "R200";
+			chipclass = "R200";
 		else
-			chipname = "R100";
+			chipclass = "R100";
 
-		offset = driGetRendererString(buffer, chipname, DRIVER_DATE,
+		sprintf(hardwarename, "%s (%s %04X)",
+		        chipclass,
+		        get_chip_family_name(radeon->radeonScreen->chip_family),
+		        radeon->radeonScreen->device_id);
+
+		offset = driGetRendererString(buffer, hardwarename, DRIVER_DATE,
 					      agp_mode);
 
 		if (IS_R600_CLASS(radeon->radeonScreen)) {
@@ -152,8 +193,6 @@ GLboolean radeonInitContext(radeonContextPtr radeon,
 	/* DRI fields */
 	radeon->dri.context = driContextPriv;
 	radeon->dri.screen = sPriv;
-	radeon->dri.drawable = NULL;
-	radeon->dri.readable = NULL;
 	radeon->dri.hwContext = driContextPriv->hHWContext;
 	radeon->dri.hwLock = &sPriv->pSAREA->lock;
 	radeon->dri.fd = sPriv->fd;
@@ -176,13 +215,15 @@ GLboolean radeonInitContext(radeonContextPtr radeon,
 			"IRQ's not enabled, falling back to %s: %d %d\n",
 			radeon->do_usleeps ? "usleeps" : "busy waits",
 			fthrottle_mode, radeon->radeonScreen->irq);
-	
+
         radeon->texture_depth = driQueryOptioni (&radeon->optionCache,
 					        "texture_depth");
         if (radeon->texture_depth == DRI_CONF_TEXTURE_DEPTH_FB)
                 radeon->texture_depth = ( glVisual->rgbBits > 16 ) ?
 	        DRI_CONF_TEXTURE_DEPTH_32 : DRI_CONF_TEXTURE_DEPTH_16;
 
+	radeon->texture_row_align = 32;
+
 	return GL_TRUE;
 }
 
@@ -225,7 +266,7 @@ void radeonDestroyContext(__DRIcontextPrivate *driContextPriv )
 		radeon_firevertices(radeon);
 		_mesa_make_current(NULL, NULL, NULL);
 	}
-	
+
 	assert(radeon);
 	if (radeon) 
     {
@@ -249,14 +290,11 @@ void radeonDestroyContext(__DRIcontextPrivate *driContextPriv )
 		_tnl_DestroyContext( radeon->glCtx );
 		_vbo_DestroyContext( radeon->glCtx );
 		_swrast_DestroyContext( radeon->glCtx );
-	
-		radeonDestroyBuffer(radeon->dri.drawable);
-		radeonDestroyBuffer(radeon->dri.readable);
 
 		/* free atom list */
 		/* free the Mesa context */
 		_mesa_destroy_context(radeon->glCtx);
-		
+
 		/* _mesa_destroy_context() might result in calls to functions that
 		 * depend on the DriverCtx, so don't set it to NULL before.
 		 *
@@ -264,7 +302,7 @@ void radeonDestroyContext(__DRIcontextPrivate *driContextPriv )
 		 */
 		/* free the option cache */
 		driDestroyOptionCache(&radeon->optionCache);
-		
+
 		rcommonDestroyCmdBuf(radeon);
 
 #if RADEON_COMMON && defined(RADEON_COMMON_FOR_R600) /* +r6/r7 */
@@ -405,12 +443,12 @@ radeon_make_renderbuffer_current(radeonContextPtr radeon,
 	int size = 4096*4096*4;
 	/* if radeon->fake */
 	struct radeon_renderbuffer *rb;
-	
+
 	if (radeon->radeonScreen->kernel_mm) {
 		radeon_make_kernel_renderbuffer_current(radeon, draw);
 		return;
 	}
-			
+
 
 	if ((rb = (void *)draw->base.Attachment[BUFFER_FRONT_LEFT].Renderbuffer)) {
 		if (!rb->bo) {
@@ -510,12 +548,29 @@ radeon_make_renderbuffer_current(radeonContextPtr radeon,
 	}
 }
 
+static unsigned
+radeon_bits_per_pixel(const struct radeon_renderbuffer *rb)
+{
+   switch (rb->base._ActualFormat) {
+   case GL_RGB5:
+   case GL_DEPTH_COMPONENT16:
+      return 16;
+   case GL_RGB8:
+   case GL_RGBA8:
+   case GL_DEPTH_COMPONENT24:
+   case GL_DEPTH24_STENCIL8_EXT:
+   case GL_STENCIL_INDEX8_EXT:
+      return 32;
+   default:
+      return 0;
+   }
+}
 
 void
 radeon_update_renderbuffers(__DRIcontext *context, __DRIdrawable *drawable)
 {
 	unsigned int attachments[10];
-	__DRIbuffer *buffers;
+	__DRIbuffer *buffers = NULL;
 	__DRIscreen *screen;
 	struct radeon_renderbuffer *rb;
 	int i, count;
@@ -526,26 +581,68 @@ radeon_update_renderbuffers(__DRIcontext *context, __DRIdrawable *drawable)
 
 	if (RADEON_DEBUG & DEBUG_DRI)
 	    fprintf(stderr, "enter %s, drawable %p\n", __func__, drawable);
-	
+
 	draw = drawable->driverPrivate;
 	screen = context->driScreenPriv;
 	radeon = (radeonContextPtr) context->driverPrivate;
-	i = 0;
-	if (draw->color_rb[0])
-		attachments[i++] = __DRI_BUFFER_FRONT_LEFT;
-	if (draw->color_rb[1])
-		attachments[i++] = __DRI_BUFFER_BACK_LEFT;
-	if (radeon_get_renderbuffer(&draw->base, BUFFER_DEPTH))
-		attachments[i++] = __DRI_BUFFER_DEPTH;
-	if (radeon_get_renderbuffer(&draw->base, BUFFER_STENCIL))
-		attachments[i++] = __DRI_BUFFER_STENCIL;
-	
-	buffers = (*screen->dri2.loader->getBuffers)(drawable,
-						     &drawable->w,
-						     &drawable->h,
-						     attachments, i,
-						     &count,
-						     drawable->loaderPrivate);
+
+	if (screen->dri2.loader
+	   && (screen->dri2.loader->base.version > 2)
+	   && (screen->dri2.loader->getBuffersWithFormat != NULL)) {
+		struct radeon_renderbuffer *depth_rb;
+		struct radeon_renderbuffer *stencil_rb;
+
+		i = 0;
+		if ((radeon->is_front_buffer_rendering || !draw->color_rb[1])
+			&& draw->color_rb[0]) {
+			attachments[i++] = __DRI_BUFFER_FRONT_LEFT;
+			attachments[i++] = radeon_bits_per_pixel(draw->color_rb[0]);
+		}
+
+		if (draw->color_rb[1]) {
+			attachments[i++] = __DRI_BUFFER_BACK_LEFT;
+			attachments[i++] = radeon_bits_per_pixel(draw->color_rb[1]);
+		}
+
+		depth_rb = radeon_get_renderbuffer(&draw->base, BUFFER_DEPTH);
+		stencil_rb = radeon_get_renderbuffer(&draw->base, BUFFER_STENCIL);
+
+		if ((depth_rb != NULL) && (stencil_rb != NULL)) {
+			attachments[i++] = __DRI_BUFFER_DEPTH_STENCIL;
+			attachments[i++] = radeon_bits_per_pixel(depth_rb);
+		} else if (depth_rb != NULL) {
+			attachments[i++] = __DRI_BUFFER_DEPTH;
+			attachments[i++] = radeon_bits_per_pixel(depth_rb);
+		} else if (stencil_rb != NULL) {
+			attachments[i++] = __DRI_BUFFER_STENCIL;
+			attachments[i++] = radeon_bits_per_pixel(stencil_rb);
+		}
+
+		buffers = (*screen->dri2.loader->getBuffersWithFormat)(drawable,
+								&drawable->w,
+								&drawable->h,
+								attachments, i / 2,
+								&count,
+								drawable->loaderPrivate);
+	} else if (screen->dri2.loader) {
+		i = 0;
+		if (draw->color_rb[0])
+			attachments[i++] = __DRI_BUFFER_FRONT_LEFT;
+		if (draw->color_rb[1])
+			attachments[i++] = __DRI_BUFFER_BACK_LEFT;
+		if (radeon_get_renderbuffer(&draw->base, BUFFER_DEPTH))
+			attachments[i++] = __DRI_BUFFER_DEPTH;
+		if (radeon_get_renderbuffer(&draw->base, BUFFER_STENCIL))
+			attachments[i++] = __DRI_BUFFER_STENCIL;
+
+		buffers = (*screen->dri2.loader->getBuffers)(drawable,
+								 &drawable->w,
+								 &drawable->h,
+								 attachments, i,
+								 &count,
+								 drawable->loaderPrivate);
+	}
+
 	if (buffers == NULL)
 		return;
 
@@ -570,6 +667,10 @@ radeon_update_renderbuffers(__DRIcontext *context, __DRIdrawable *drawable)
 			rb = draw->color_rb[0];
 			regname = "dri2 front buffer";
 			break;
+		case __DRI_BUFFER_FAKE_FRONT_LEFT:
+			rb = draw->color_rb[0];
+			regname = "dri2 fake front buffer";
+			break;
 		case __DRI_BUFFER_BACK_LEFT:
 			rb = draw->color_rb[1];
 			regname = "dri2 back buffer";
@@ -578,6 +679,10 @@ radeon_update_renderbuffers(__DRIcontext *context, __DRIdrawable *drawable)
 			rb = radeon_get_renderbuffer(&draw->base, BUFFER_DEPTH);
 			regname = "dri2 depth buffer";
 			break;
+		case __DRI_BUFFER_DEPTH_STENCIL:
+			rb = radeon_get_renderbuffer(&draw->base, BUFFER_DEPTH);
+			regname = "dri2 depth / stencil buffer";
+			break;
 		case __DRI_BUFFER_STENCIL:
 			rb = radeon_get_renderbuffer(&draw->base, BUFFER_STENCIL);
 			regname = "dri2 stencil buffer";
@@ -637,7 +742,7 @@ radeon_update_renderbuffers(__DRIcontext *context, __DRIdrawable *drawable)
 
 				fprintf(stderr, "failed to attach %s %d\n",
 					regname, buffers[i].name);
-				
+
 			}
 		}
 
@@ -649,7 +754,24 @@ radeon_update_renderbuffers(__DRIcontext *context, __DRIdrawable *drawable)
 
 		radeon_renderbuffer_set_bo(rb, bo);
 		radeon_bo_unref(bo);
-		    
+
+		if (buffers[i].attachment == __DRI_BUFFER_DEPTH_STENCIL) {
+			rb = radeon_get_renderbuffer(&draw->base, BUFFER_STENCIL);
+			if (rb != NULL) {
+				struct radeon_bo *stencil_bo = NULL;
+
+				if (rb->bo) {
+					uint32_t name = radeon_gem_name_bo(rb->bo);
+					if (name == buffers[i].name)
+						continue;
+				}
+
+				stencil_bo = bo;
+				radeon_bo_ref(stencil_bo);
+				radeon_renderbuffer_set_bo(rb, stencil_bo);
+				radeon_bo_unref(stencil_bo);
+			}
+		}
 	}
 
 	driUpdateFramebufferSize(radeon->glCtx, drawable);
@@ -677,14 +799,14 @@ GLboolean radeonMakeCurrent(__DRIcontextPrivate * driContextPriv,
 	drfb = driDrawPriv->driverPrivate;
 	readfb = driReadPriv->driverPrivate;
 
-	if (driContextPriv->driScreenPriv->dri2.enabled) {    
+	if (driContextPriv->driScreenPriv->dri2.enabled) {
 		radeon_update_renderbuffers(driContextPriv, driDrawPriv);
 		if (driDrawPriv != driReadPriv)
 			radeon_update_renderbuffers(driContextPriv, driReadPriv);
-		radeon->state.color.rrb =
-			radeon_get_renderbuffer(&drfb->base, BUFFER_BACK_LEFT);
-		radeon->state.depth.rrb =
-			radeon_get_renderbuffer(&drfb->base, BUFFER_DEPTH);
+		_mesa_reference_renderbuffer(&radeon->state.color.rb,
+			&(radeon_get_renderbuffer(&drfb->base, BUFFER_BACK_LEFT)->base));
+		_mesa_reference_renderbuffer(&radeon->state.depth.rb,
+			&(radeon_get_renderbuffer(&drfb->base, BUFFER_DEPTH)->base));
 	} else {
 		radeon_make_renderbuffer_current(radeon, drfb);
 	}
@@ -692,9 +814,6 @@ GLboolean radeonMakeCurrent(__DRIcontextPrivate * driContextPriv,
 	if (RADEON_DEBUG & DEBUG_DRI)
 	     fprintf(stderr, "%s ctx %p dfb %p rfb %p\n", __FUNCTION__, radeon->glCtx, drfb, readfb);
 
-	if (radeon->dri.readable != driReadPriv)
-		radeon->dri.readable = driReadPriv;
-
 	driUpdateFramebufferSize(radeon->glCtx, driDrawPriv);
 	if (driReadPriv != driDrawPriv)
 		driUpdateFramebufferSize(radeon->glCtx, driReadPriv);
@@ -704,29 +823,25 @@ GLboolean radeonMakeCurrent(__DRIcontextPrivate * driContextPriv,
 	_mesa_update_state(radeon->glCtx);
 
 	if (radeon->glCtx->DrawBuffer == &drfb->base) {
-
-		if (radeon->dri.drawable != driDrawPriv) {
-			if (driDrawPriv->swap_interval == (unsigned)-1) {
-				int i;
-				driDrawPriv->vblFlags =
-					(radeon->radeonScreen->irq != 0)
-					? driGetDefaultVBlankFlags(&radeon->
-								   optionCache)
-					: VBLANK_FLAG_NO_IRQ;
-				    
-				driDrawableInitVBlank(driDrawPriv);
-				drfb->vbl_waited = driDrawPriv->vblSeq;
-
-				for (i = 0; i < 2; i++) {
-					if (drfb->color_rb[i])
-						drfb->color_rb[i]->vbl_pending = driDrawPriv->vblSeq;
-				}
-				    
+		if (driDrawPriv->swap_interval == (unsigned)-1) {
+			int i;
+			driDrawPriv->vblFlags =
+				(radeon->radeonScreen->irq != 0)
+				? driGetDefaultVBlankFlags(&radeon->
+							   optionCache)
+				: VBLANK_FLAG_NO_IRQ;
+
+			driDrawableInitVBlank(driDrawPriv);
+			drfb->vbl_waited = driDrawPriv->vblSeq;
+
+			for (i = 0; i < 2; i++) {
+				if (drfb->color_rb[i])
+					drfb->color_rb[i]->vbl_pending = driDrawPriv->vblSeq;
 			}
-			radeon->dri.drawable = driDrawPriv;
-			
-			radeon_window_moved(radeon);
+
 		}
+
+		radeon_window_moved(radeon);
 		radeon_draw_buffer(radeon->glCtx, &drfb->base);
 	}
 
diff --git a/src/mesa/drivers/dri/radeon/radeon_common_context.h b/src/mesa/drivers/dri/radeon/radeon_common_context.h
index 07e60a28a46..96bc6858765 100644
--- a/src/mesa/drivers/dri/radeon/radeon_common_context.h
+++ b/src/mesa/drivers/dri/radeon/radeon_common_context.h
@@ -117,17 +117,17 @@ struct radeon_framebuffer
 
 };
 
- 
+
 struct radeon_colorbuffer_state {
 	GLuint clear;
 	int roundEnable;
-	struct radeon_renderbuffer *rrb;
+	struct gl_renderbuffer *rb;
 	uint32_t draw_offset; /* offset into color renderbuffer - FBOs */
 };
 
 struct radeon_depthbuffer_state {
 	GLuint clear;
-	struct radeon_renderbuffer *rrb;
+	struct gl_renderbuffer *rb;
 };
 
 struct radeon_scissor_state {
@@ -346,16 +346,6 @@ struct radeon_dri_mirror {
 	__DRIcontextPrivate *context;	/* DRI context */
 	__DRIscreenPrivate *screen;	/* DRI screen */
 
-   /**
-    * DRI drawable bound to this context for drawing.
-    */
-	__DRIdrawablePrivate *drawable;
-
-   /**
-    * DRI drawable bound to this context for reading.
-    */
-	__DRIdrawablePrivate *readable;
-
 	drm_context_t hwContext;
 	drm_hw_lock_t *hwLock;
 	int fd;
@@ -388,7 +378,7 @@ typedef void (*radeon_line_func) (radeonContextPtr,
 
 typedef void (*radeon_point_func) (radeonContextPtr, radeonVertex *);
 
-#define RADEON_MAX_BOS 24
+#define RADEON_MAX_BOS 32
 struct radeon_state {
 	struct radeon_colorbuffer_state color;
 	struct radeon_depthbuffer_state depth;
@@ -416,11 +406,12 @@ struct radeon_cmdbuf {
 struct radeon_context {
    GLcontext *glCtx;
    radeonScreenPtr radeonScreen;	/* Screen private DRI data */
-  
+
    /* Texture object bookkeeping
     */
    int                   texture_depth;
    float                 initialMaxAnisotropy;
+   uint32_t              texture_row_align;
 
   struct radeon_dma dma;
   struct radeon_hw_state hw;
@@ -457,11 +448,27 @@ struct radeon_context {
    driOptionCache optionCache;
 
    struct radeon_cmdbuf cmdbuf;
-	
+
   drm_clip_rect_t fboRect;
   GLboolean constant_cliprect; /* use for FBO or DRI2 rendering */
   GLboolean front_cliprects;
 
+   /**
+    * Set if rendering has occured to the drawable's front buffer.
+    *
+    * This is used in the DRI2 case to detect that glFlush should also copy
+    * the contents of the fake front buffer to the real front buffer.
+    */
+   GLboolean front_buffer_dirty;
+
+   /**
+    * Track whether front-buffer rendering is currently enabled
+    *
+    * A separate flag is used to track this in order to support MRT more
+    * easily.
+    */
+   GLboolean is_front_buffer_rendering;
+
   struct {
       struct gl_fragment_program *bitmap_fp;
       struct gl_vertex_program *passthrough_vp;
@@ -490,6 +497,17 @@ struct radeon_context {
 
 #define RADEON_CONTEXT(glctx) ((radeonContextPtr)(ctx->DriverCtx))
 
+static inline __DRIdrawablePrivate* radeon_get_drawable(radeonContextPtr radeon)
+{
+	return radeon->dri.context->driDrawablePriv;
+}
+
+static inline __DRIdrawablePrivate* radeon_get_readable(radeonContextPtr radeon)
+{
+	return radeon->dri.context->driReadablePriv;
+}
+
+
 /**
  * This function takes a float and packs it into a uint32_t
  */
diff --git a/src/mesa/drivers/dri/radeon/radeon_context.c b/src/mesa/drivers/dri/radeon/radeon_context.c
index 2600c78df39..8f780c443c9 100644
--- a/src/mesa/drivers/dri/radeon/radeon_context.c
+++ b/src/mesa/drivers/dri/radeon/radeon_context.c
@@ -283,6 +283,11 @@ r100CreateContext( const __GLcontextModes *glVisual,
 
    i = driQueryOptioni( &rmesa->radeon.optionCache, "allow_large_textures");
 
+   /* FIXME: When no memory manager is available we should set this 
+    * to some reasonable value based on texture memory pool size */
+   /* FIXME: does r100 support 2048x2048 texture ? */
+   ctx->Const.MaxTextureLevels = 12;
+
    ctx->Const.MaxTextureMaxAnisotropy = 16.0;
 
    /* No wide points.
diff --git a/src/mesa/drivers/dri/radeon/radeon_cs_legacy.c b/src/mesa/drivers/dri/radeon/radeon_cs_legacy.c
index b33417e93e3..a2727ef6f7f 100644
--- a/src/mesa/drivers/dri/radeon/radeon_cs_legacy.c
+++ b/src/mesa/drivers/dri/radeon/radeon_cs_legacy.c
@@ -267,14 +267,6 @@ static int cs_set_age(struct radeon_cs *cs)
     return 0;
 }
 
-static void dump_cmdbuf(struct radeon_cs *cs)
-{
-  int i;
-  for (i = 0; i < cs->cdw; i++){
-    fprintf(stderr,"%x: %08x\n", i, cs->packets[i]);
-  }
-
-}
 static int cs_emit(struct radeon_cs *cs)
 {
     struct cs_manager_legacy *csm = (struct cs_manager_legacy*)cs->csm;
diff --git a/src/mesa/drivers/dri/radeon/radeon_dma.c b/src/mesa/drivers/dri/radeon/radeon_dma.c
index db96269337e..2fbf89bf6de 100644
--- a/src/mesa/drivers/dri/radeon/radeon_dma.c
+++ b/src/mesa/drivers/dri/radeon/radeon_dma.c
@@ -164,7 +164,7 @@ void rcommon_emit_vector(GLcontext * ctx, struct radeon_aos *aos,
 void radeonRefillCurrentDmaRegion(radeonContextPtr rmesa, int size)
 {
 
-	size = MAX2(size, MAX_DMA_BUF_SZ * 16);
+	size = MAX2(size, MAX_DMA_BUF_SZ);
 
 	if (RADEON_DEBUG & (DEBUG_IOCTL | DEBUG_DMA))
 		fprintf(stderr, "%s\n", __FUNCTION__);
@@ -208,7 +208,13 @@ again_alloc:
 
 	if (radeon_revalidate_bos(rmesa->glCtx) == GL_FALSE)
 	  fprintf(stderr,"failure to revalidate BOs - badness\n");
-	  
+
+	if (!rmesa->dma.current) {
+        /* Cmd buff have been flushed in radeon_revalidate_bos */
+		rmesa->dma.nr_released_bufs = 0;
+		goto again_alloc;
+	}
+
 	radeon_bo_map(rmesa->dma.current, 1);
 }
 
@@ -325,6 +331,9 @@ void radeonReleaseArrays( GLcontext *ctx, GLuint newinputs )
    radeonContextPtr radeon = RADEON_CONTEXT( ctx );
    int i;
 
+   if (radeon->dma.flush) {
+       radeon->dma.flush(radeon->glCtx);
+   }
    if (radeon->tcl.elt_dma_bo) {
 	   radeon_bo_unref(radeon->tcl.elt_dma_bo);
 	   radeon->tcl.elt_dma_bo = NULL;
diff --git a/src/mesa/drivers/dri/radeon/radeon_ioctl.c b/src/mesa/drivers/dri/radeon/radeon_ioctl.c
index b5fde6d3de5..01c45df2dfa 100644
--- a/src/mesa/drivers/dri/radeon/radeon_ioctl.c
+++ b/src/mesa/drivers/dri/radeon/radeon_ioctl.c
@@ -35,7 +35,7 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  */
 
 #include <sched.h>
-#include <errno.h> 
+#include <errno.h>
 
 #include "main/attrib.h"
 #include "main/enable.h"
@@ -113,8 +113,33 @@ void radeonSetUpAtomList( r100ContextPtr rmesa )
    insert_at_tail(&rmesa->radeon.hw.atomlist, &rmesa->hw.glt);
 }
 
+void radeonEmitScissor(r100ContextPtr rmesa)
+{
+    BATCH_LOCALS(&rmesa->radeon);
+    if (!rmesa->radeon.radeonScreen->kernel_mm) {
+       return;
+    }
+    if (rmesa->radeon.state.scissor.enabled) {
+        BEGIN_BATCH(6);
+        OUT_BATCH(CP_PACKET0(RADEON_PP_CNTL, 0));
+        OUT_BATCH(rmesa->hw.ctx.cmd[CTX_PP_CNTL] | RADEON_SCISSOR_ENABLE);
+        OUT_BATCH(CP_PACKET0(RADEON_RE_TOP_LEFT, 0));
+        OUT_BATCH((rmesa->radeon.state.scissor.rect.y1 << 16) |
+                  rmesa->radeon.state.scissor.rect.x1);
+        OUT_BATCH(CP_PACKET0(RADEON_RE_WIDTH_HEIGHT, 0));
+        OUT_BATCH(((rmesa->radeon.state.scissor.rect.y2 - 1) << 16) |
+                  (rmesa->radeon.state.scissor.rect.x2 - 1));
+        END_BATCH();
+    } else {
+        BEGIN_BATCH(2);
+        OUT_BATCH(CP_PACKET0(RADEON_PP_CNTL, 0));
+        OUT_BATCH(rmesa->hw.ctx.cmd[CTX_PP_CNTL] & ~RADEON_SCISSOR_ENABLE);
+        END_BATCH();
+    }
+}
+
 /* Fire a section of the retained (indexed_verts) buffer as a regular
- * primtive.  
+ * primtive.
  */
 extern void radeonEmitVbufPrim( r100ContextPtr rmesa,
 				GLuint vertex_format,
@@ -124,8 +149,9 @@ extern void radeonEmitVbufPrim( r100ContextPtr rmesa,
    BATCH_LOCALS(&rmesa->radeon);
 
    assert(!(primitive & RADEON_CP_VC_CNTL_PRIM_WALK_IND));
-   
+
    radeonEmitState(&rmesa->radeon);
+   radeonEmitScissor(rmesa);
 
 #if RADEON_OLD_PACKETS
    BEGIN_BATCH(8);
@@ -135,7 +161,7 @@ extern void radeonEmitVbufPrim( r100ContextPtr rmesa,
    } else {
      OUT_BATCH(rmesa->ioctl.vertex_offset);
    }
-    
+
    OUT_BATCH(vertex_nr);
    OUT_BATCH(vertex_format);
    OUT_BATCH(primitive |  RADEON_CP_VC_CNTL_PRIM_WALK_LIST |
@@ -149,10 +175,10 @@ extern void radeonEmitVbufPrim( r100ContextPtr rmesa,
 			   RADEON_GEM_DOMAIN_GTT,
 			   0, 0);
    }
-   
+
    END_BATCH();
-   
-#else   
+
+#else
    BEGIN_BATCH(4);
    OUT_BATCH_PACKET3_CLIP(RADEON_CP_PACKET3_3D_DRAW_VBUF, 1);
    OUT_BATCH(vertex_format);
@@ -173,7 +199,7 @@ void radeonFlushElts( GLcontext *ctx )
    int nr;
    uint32_t *cmd = (uint32_t *)(rmesa->radeon.cmdbuf.cs->packets + rmesa->tcl.elt_cmd_start);
    int dwords = (rmesa->radeon.cmdbuf.cs->section_ndw - rmesa->radeon.cmdbuf.cs->section_cdw);
-   
+
    if (RADEON_DEBUG & DEBUG_IOCTL)
       fprintf(stderr, "%s\n", __FUNCTION__);
 
@@ -230,9 +256,10 @@ GLushort *radeonAllocEltsOpenEnded( r100ContextPtr rmesa,
       fprintf(stderr, "%s %d prim %x\n", __FUNCTION__, min_nr, primitive);
 
    assert((primitive & RADEON_CP_VC_CNTL_PRIM_WALK_IND));
-   
+
    radeonEmitState(&rmesa->radeon);
-   
+   radeonEmitScissor(rmesa);
+
    rmesa->tcl.elt_cmd_start = rmesa->radeon.cmdbuf.cs->cdw;
 
    /* round up min_nr to align the state */
@@ -248,16 +275,15 @@ GLushort *radeonAllocEltsOpenEnded( r100ContextPtr rmesa,
    }
    OUT_BATCH(0xffff);
    OUT_BATCH(vertex_format);
-   OUT_BATCH(primitive | 
+   OUT_BATCH(primitive |
 	     RADEON_CP_VC_CNTL_PRIM_WALK_IND |
 	     RADEON_CP_VC_CNTL_COLOR_ORDER_RGBA |
 	     RADEON_CP_VC_CNTL_VTX_FMT_RADEON_MODE);
-
 #else
    BEGIN_BATCH_NO_AUTOSTATE(ELTS_BUFSZ(align_min_nr)/4);
    OUT_BATCH_PACKET3_CLIP(RADEON_CP_PACKET3_DRAW_INDX, 0);
    OUT_BATCH(vertex_format);
-   OUT_BATCH(primitive | 
+   OUT_BATCH(primitive |
 	     RADEON_CP_VC_CNTL_PRIM_WALK_IND |
 	     RADEON_CP_VC_CNTL_COLOR_ORDER_RGBA |
 	     RADEON_CP_VC_CNTL_MAOS_ENABLE |
@@ -269,7 +295,7 @@ GLushort *radeonAllocEltsOpenEnded( r100ContextPtr rmesa,
    rmesa->tcl.elt_used = min_nr;
 
    retval = (GLushort *)(rmesa->radeon.cmdbuf.cs->packets + rmesa->tcl.elt_cmd_offset);
-   
+
    if (RADEON_DEBUG & DEBUG_PRIMS)
       fprintf(stderr, "%s: header prim %x \n",
 	      __FUNCTION__, primitive);
@@ -305,7 +331,7 @@ void radeonEmitVertexAOS( r100ContextPtr rmesa,
 
 #endif
 }
-		       
+
 
 void radeonEmitAOS( r100ContextPtr rmesa,
 		    GLuint nr,
@@ -314,7 +340,7 @@ void radeonEmitAOS( r100ContextPtr rmesa,
 #if RADEON_OLD_PACKETS
    assert( nr == 1 );
    rmesa->ioctl.bo = rmesa->radeon.tcl.aos[0].bo;
-   rmesa->ioctl.vertex_offset = 
+   rmesa->ioctl.vertex_offset =
      (rmesa->radeon.tcl.aos[0].offset + offset * rmesa->radeon.tcl.aos[0].stride * 4);
 #else
    BATCH_LOCALS(&rmesa->radeon);
@@ -336,7 +362,7 @@ void radeonEmitAOS( r100ContextPtr rmesa,
 		   (rmesa->radeon.tcl.aos[i].stride << 8) |
 		   (rmesa->radeon.tcl.aos[i + 1].components << 16) |
 		   (rmesa->radeon.tcl.aos[i + 1].stride << 24));
-			
+
 	 voffset =  rmesa->radeon.tcl.aos[i + 0].offset +
 	    offset * 4 * rmesa->radeon.tcl.aos[i + 0].stride;
 	 OUT_BATCH_RELOC(voffset,
@@ -352,7 +378,7 @@ void radeonEmitAOS( r100ContextPtr rmesa,
 			 RADEON_GEM_DOMAIN_GTT,
 			 0, 0);
       }
-      
+
       if (nr & 1) {
 	 OUT_BATCH((rmesa->radeon.tcl.aos[nr - 1].components << 0) |
 		   (rmesa->radeon.tcl.aos[nr - 1].stride << 8));
@@ -370,7 +396,7 @@ void radeonEmitAOS( r100ContextPtr rmesa,
 		   (rmesa->radeon.tcl.aos[i].stride << 8) |
 		   (rmesa->radeon.tcl.aos[i + 1].components << 16) |
 		   (rmesa->radeon.tcl.aos[i + 1].stride << 24));
-	 
+
 	 voffset =  rmesa->radeon.tcl.aos[i + 0].offset +
 	    offset * 4 * rmesa->radeon.tcl.aos[i + 0].stride;
 	 OUT_BATCH(voffset);
@@ -378,7 +404,7 @@ void radeonEmitAOS( r100ContextPtr rmesa,
 	    offset * 4 * rmesa->radeon.tcl.aos[i + 1].stride;
 	 OUT_BATCH(voffset);
       }
-      
+
       if (nr & 1) {
 	 OUT_BATCH((rmesa->radeon.tcl.aos[nr - 1].components << 0) |
 		   (rmesa->radeon.tcl.aos[nr - 1].stride << 8));
@@ -427,7 +453,7 @@ static void radeonUserClear(GLcontext *ctx, GLuint mask)
 static void radeonKernelClear(GLcontext *ctx, GLuint flags)
 {
      r100ContextPtr rmesa = R100_CONTEXT(ctx);
-   __DRIdrawablePrivate *dPriv = rmesa->radeon.dri.drawable;
+   __DRIdrawablePrivate *dPriv = radeon_get_drawable(&rmesa->radeon);
    drm_radeon_sarea_t *sarea = rmesa->radeon.sarea;
    uint32_t clear;
    GLint ret, i;
@@ -529,7 +555,7 @@ static void radeonKernelClear(GLcontext *ctx, GLuint flags)
 	 depth_boxes[n].f[CLEAR_Y1] = (float)b[n].y1;
 	 depth_boxes[n].f[CLEAR_X2] = (float)b[n].x2;
 	 depth_boxes[n].f[CLEAR_Y2] = (float)b[n].y2;
-	 depth_boxes[n].f[CLEAR_DEPTH] = 
+	 depth_boxes[n].f[CLEAR_DEPTH] =
 	    (float)rmesa->radeon.state.depth.clear;
       }
 
@@ -548,7 +574,7 @@ static void radeonKernelClear(GLcontext *ctx, GLuint flags)
 static void radeonClear( GLcontext *ctx, GLbitfield mask )
 {
    r100ContextPtr rmesa = R100_CONTEXT(ctx);
-   __DRIdrawablePrivate *dPriv = rmesa->radeon.dri.drawable;
+   __DRIdrawablePrivate *dPriv = radeon_get_drawable(&rmesa->radeon);
    GLuint flags = 0;
    GLuint color_mask = 0;
    GLuint orig_mask = mask;
@@ -560,11 +586,11 @@ static void radeonClear( GLcontext *ctx, GLbitfield mask )
    {
       LOCK_HARDWARE( &rmesa->radeon );
       UNLOCK_HARDWARE( &rmesa->radeon );
-      if ( dPriv->numClipRects == 0 ) 
+      if ( dPriv->numClipRects == 0 )
 	 return;
    }
-   
-   radeon_firevertices(&rmesa->radeon); 
+
+   radeon_firevertices(&rmesa->radeon);
 
    if ( mask & BUFFER_BIT_FRONT_LEFT ) {
       flags |= RADEON_FRONT;
@@ -594,12 +620,12 @@ static void radeonClear( GLcontext *ctx, GLbitfield mask )
       _swrast_Clear( ctx, mask );
    }
 
-   if ( !flags ) 
+   if ( !flags )
       return;
 
    if (rmesa->using_hyperz) {
       flags |= RADEON_USE_COMP_ZBUF;
-/*      if (rmesa->radeon.radeonScreen->chipset & RADEON_CHIPSET_TCL) 
+/*      if (rmesa->radeon.radeonScreen->chipset & RADEON_CHIPSET_TCL)
          flags |= RADEON_USE_HIERZ; */
       if (((flags & RADEON_DEPTH) && (flags & RADEON_STENCIL) &&
 	    ((rmesa->radeon.state.stencil.clear & RADEON_STENCIL_WRITE_MASK) == RADEON_STENCIL_WRITE_MASK))) {
diff --git a/src/mesa/drivers/dri/radeon/radeon_lock.c b/src/mesa/drivers/dri/radeon/radeon_lock.c
index fe19218d7ae..5774f7ebcf7 100644
--- a/src/mesa/drivers/dri/radeon/radeon_lock.c
+++ b/src/mesa/drivers/dri/radeon/radeon_lock.c
@@ -58,8 +58,8 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  */
 void radeonGetLock(radeonContextPtr rmesa, GLuint flags)
 {
-	__DRIdrawablePrivate *const drawable = rmesa->dri.drawable;
-	__DRIdrawablePrivate *const readable = rmesa->dri.readable;
+	__DRIdrawablePrivate *const drawable = radeon_get_drawable(rmesa);
+	__DRIdrawablePrivate *const readable = radeon_get_readable(rmesa);
 	__DRIscreenPrivate *sPriv = rmesa->dri.screen;
 
 	assert(drawable != NULL);
@@ -95,8 +95,8 @@ void radeon_lock_hardware(radeonContextPtr radeon)
 	struct radeon_framebuffer *rfb = NULL;
 	struct radeon_renderbuffer *rrb = NULL;
 
-	if (radeon->dri.drawable) {
-		rfb = radeon->dri.drawable->driverPrivate;
+	if (radeon_get_drawable(radeon)) {
+		rfb = radeon_get_drawable(radeon)->driverPrivate;
 
 		if (rfb)
 			rrb = radeon_get_renderbuffer(&rfb->base,
diff --git a/src/mesa/drivers/dri/radeon/radeon_mipmap_tree.c b/src/mesa/drivers/dri/radeon/radeon_mipmap_tree.c
index 0a33fe4afa4..f04a07fecd2 100644
--- a/src/mesa/drivers/dri/radeon/radeon_mipmap_tree.c
+++ b/src/mesa/drivers/dri/radeon/radeon_mipmap_tree.c
@@ -86,10 +86,11 @@ static int radeon_compressed_num_bytes(GLuint mesaFormat)
  * \param curOffset points to the offset at which the image is to be stored
  * and is updated by this function according to the size of the image.
  */
-static void compute_tex_image_offset(radeon_mipmap_tree *mt,
+static void compute_tex_image_offset(radeonContextPtr rmesa, radeon_mipmap_tree *mt,
 	GLuint face, GLuint level, GLuint* curOffset)
 {
 	radeon_mipmap_level *lvl = &mt->levels[level];
+	uint32_t row_align = rmesa->texture_row_align - 1;
 
 	/* Find image size in bytes */
 	if (mt->compressed) {
@@ -107,7 +108,7 @@ static void compute_tex_image_offset(radeon_mipmap_tree *mt,
 		lvl->rowstride = (lvl->width * mt->bpp * 2 + 31) & ~31;
 		lvl->size = lvl->rowstride * ((lvl->height + 1) / 2) * lvl->depth;
 	} else {
-		lvl->rowstride = (lvl->width * mt->bpp + 31) & ~31;
+		lvl->rowstride = (lvl->width * mt->bpp + row_align) & ~row_align;
 		lvl->size = lvl->rowstride * lvl->height * lvl->depth;
 	}
 	assert(lvl->size > 0);
@@ -131,14 +132,40 @@ static GLuint minify(GLuint size, GLuint levels)
 	return size;
 }
 
-static void calculate_miptree_layout(radeon_mipmap_tree *mt)
+
+static void calculate_miptree_layout_r100(radeonContextPtr rmesa, radeon_mipmap_tree *mt)
+{
+	GLuint curOffset;
+	GLuint numLevels;
+	GLuint i;
+	GLuint face;
+
+	numLevels = mt->lastLevel - mt->firstLevel + 1;
+	assert(numLevels <= rmesa->glCtx->Const.MaxTextureLevels);
+
+	curOffset = 0;
+	for(face = 0; face < mt->faces; face++) {
+
+		for(i = 0; i < numLevels; i++) {
+			mt->levels[i].width = minify(mt->width0, i);
+			mt->levels[i].height = minify(mt->height0, i);
+			mt->levels[i].depth = minify(mt->depth0, i);
+			compute_tex_image_offset(rmesa, mt, face, i, &curOffset);
+		}
+	}
+
+	/* Note the required size in memory */
+	mt->totalsize = (curOffset + RADEON_OFFSET_MASK) & ~RADEON_OFFSET_MASK;
+}
+
+static void calculate_miptree_layout_r300(radeonContextPtr rmesa, radeon_mipmap_tree *mt)
 {
 	GLuint curOffset;
 	GLuint numLevels;
 	GLuint i;
 
 	numLevels = mt->lastLevel - mt->firstLevel + 1;
-	assert(numLevels <= RADEON_MAX_TEXTURE_LEVELS);
+	assert(numLevels <= rmesa->glCtx->Const.MaxTextureLevels);
 
 	curOffset = 0;
 	for(i = 0; i < numLevels; i++) {
@@ -149,14 +176,13 @@ static void calculate_miptree_layout(radeon_mipmap_tree *mt)
 		mt->levels[i].depth = minify(mt->depth0, i);
 
 		for(face = 0; face < mt->faces; face++)
-			compute_tex_image_offset(mt, face, i, &curOffset);
+			compute_tex_image_offset(rmesa, mt, face, i, &curOffset);
 	}
 
 	/* Note the required size in memory */
 	mt->totalsize = (curOffset + RADEON_OFFSET_MASK) & ~RADEON_OFFSET_MASK;
 }
 
-
 /**
  * Create a new mipmap tree, calculate its layout and allocate memory.
  */
@@ -181,7 +207,10 @@ radeon_mipmap_tree* radeon_miptree_create(radeonContextPtr rmesa, radeonTexObj *
 	mt->tilebits = tilebits;
 	mt->compressed = compressed;
 
-	calculate_miptree_layout(mt);
+	if (rmesa->radeonScreen->chip_family >= CHIP_FAMILY_R300)
+		calculate_miptree_layout_r300(rmesa, mt);
+	else
+		calculate_miptree_layout_r100(rmesa, mt);
 
 #ifdef RADEON_DEBUG_BO
     mt->bo = radeon_bo_open(rmesa->radeonScreen->bom,
diff --git a/src/mesa/drivers/dri/radeon/radeon_mipmap_tree.h b/src/mesa/drivers/dri/radeon/radeon_mipmap_tree.h
index 697010bc022..7ece688493d 100644
--- a/src/mesa/drivers/dri/radeon/radeon_mipmap_tree.h
+++ b/src/mesa/drivers/dri/radeon/radeon_mipmap_tree.h
@@ -47,6 +47,8 @@ struct _radeon_mipmap_level {
 	radeon_mipmap_image faces[6];
 };
 
+/* store the max possible in the miptree */
+#define RADEON_MIPTREE_MAX_TEXTURE_LEVELS 13
 
 /**
  * A mipmap tree contains texture images in the layout that the hardware
@@ -77,7 +79,7 @@ struct _radeon_mipmap_tree {
 	GLuint tilebits; /** RADEON_TXO_xxx_TILE */
 	GLuint compressed; /** MESA_FORMAT_xxx indicating a compressed format, or 0 if uncompressed */
 
-	radeon_mipmap_level levels[RADEON_MAX_TEXTURE_LEVELS];
+	radeon_mipmap_level levels[RADEON_MIPTREE_MAX_TEXTURE_LEVELS];
 };
 
 radeon_mipmap_tree* radeon_miptree_create(radeonContextPtr rmesa, radeonTexObj *t,
diff --git a/src/mesa/drivers/dri/radeon/radeon_screen.c b/src/mesa/drivers/dri/radeon/radeon_screen.c
index a0e1b131781..e23d53c7a1a 100644
--- a/src/mesa/drivers/dri/radeon/radeon_screen.c
+++ b/src/mesa/drivers/dri/radeon/radeon_screen.c
@@ -253,8 +253,8 @@ static int
 radeonGetParam(__DRIscreenPrivate *sPriv, int param, void *value)
 {
   int ret;
-  drm_radeon_getparam_t gp;
-  struct drm_radeon_info info;
+  drm_radeon_getparam_t gp = { 0 };
+  struct drm_radeon_info info = { 0 };
 
   if (sPriv->drm_version.major >= 2) {
       info.value = (uint64_t)value;
@@ -417,6 +417,7 @@ static const __DRItexBufferExtension r600TexBufferExtension = {
 
 static int radeon_set_screen_flags(radeonScreenPtr screen, int device_id)
 {
+   screen->device_id = device_id;
    screen->chip_flags = 0;
    switch ( device_id ) {
    case PCI_CHIP_RADEON_LY:
@@ -493,11 +494,7 @@ static int radeon_set_screen_flags(radeonScreenPtr screen, int device_id)
       screen->chip_family = CHIP_FAMILY_RS300;
       break;
 
-      /* 9500 with 1 pipe verified by: Reid Linnemann <[email protected]> */
    case PCI_CHIP_R300_AD:
-      screen->chip_family = CHIP_FAMILY_RV350;
-      screen->chip_flags = RADEON_CHIPSET_TCL;
-      break;
    case PCI_CHIP_R300_AE:
    case PCI_CHIP_R300_AF:
    case PCI_CHIP_R300_AG:
@@ -962,19 +959,6 @@ radeonCreateScreen( __DRIscreenPrivate *sPriv )
    {
       int ret;
 
-#ifdef RADEON_PARAM_KERNEL_MM
-     ret = radeonGetParam(sPriv, RADEON_PARAM_KERNEL_MM, &screen->kernel_mm);
-
-      if (ret && ret != -EINVAL) {
-         FREE( screen );
-         fprintf(stderr, "drm_radeon_getparam_t (RADEON_OFFSET): %d\n", ret);
-         return NULL;
-      }
-
-      if (ret == -EINVAL)
-          screen->kernel_mm = 0;
-#endif
-
       ret = radeonGetParam(sPriv, RADEON_PARAM_GART_BUFFER_OFFSET,
 			    &screen->gart_buffer_offset);
 
@@ -1012,66 +996,63 @@ radeonCreateScreen( __DRIscreenPrivate *sPriv )
    if (ret == -1)
      return NULL;
 
-   if (!screen->kernel_mm) {
-     screen->mmio.handle = dri_priv->registerHandle;
-     screen->mmio.size   = dri_priv->registerSize;
-     if ( drmMap( sPriv->fd,
-		  screen->mmio.handle,
-		  screen->mmio.size,
-		  &screen->mmio.map ) ) {
-       FREE( screen );
-       __driUtilMessage("%s: drmMap failed\n", __FUNCTION__ );
-       return NULL;
-     }
+   screen->mmio.handle = dri_priv->registerHandle;
+   screen->mmio.size   = dri_priv->registerSize;
+   if ( drmMap( sPriv->fd,
+		screen->mmio.handle,
+		screen->mmio.size,
+		&screen->mmio.map ) ) {
+     FREE( screen );
+     __driUtilMessage("%s: drmMap failed\n", __FUNCTION__ );
+     return NULL;
+   }
+
+   RADEONMMIO = screen->mmio.map;
+
+   screen->status.handle = dri_priv->statusHandle;
+   screen->status.size   = dri_priv->statusSize;
+   if ( drmMap( sPriv->fd,
+		screen->status.handle,
+		screen->status.size,
+		&screen->status.map ) ) {
+     drmUnmap( screen->mmio.map, screen->mmio.size );
+     FREE( screen );
+     __driUtilMessage("%s: drmMap (2) failed\n", __FUNCTION__ );
+     return NULL;
+   }
+   if (screen->chip_family < CHIP_FAMILY_R600)
+	   screen->scratch = (__volatile__ uint32_t *)
+		   ((GLubyte *)screen->status.map + RADEON_SCRATCH_REG_OFFSET);
+   else
+	   screen->scratch = (__volatile__ uint32_t *)
+		   ((GLubyte *)screen->status.map + RADEON_SCRATCH_REG_OFFSET);
 
-     RADEONMMIO = screen->mmio.map;
+   screen->buffers = drmMapBufs( sPriv->fd );
+   if ( !screen->buffers ) {
+     drmUnmap( screen->status.map, screen->status.size );
+     drmUnmap( screen->mmio.map, screen->mmio.size );
+     FREE( screen );
+     __driUtilMessage("%s: drmMapBufs failed\n", __FUNCTION__ );
+     return NULL;
+   }
 
-     screen->status.handle = dri_priv->statusHandle;
-     screen->status.size   = dri_priv->statusSize;
+   if ( dri_priv->gartTexHandle && dri_priv->gartTexMapSize ) {
+     screen->gartTextures.handle = dri_priv->gartTexHandle;
+     screen->gartTextures.size   = dri_priv->gartTexMapSize;
      if ( drmMap( sPriv->fd,
-		  screen->status.handle,
-		  screen->status.size,
-		  &screen->status.map ) ) {
-       drmUnmap( screen->mmio.map, screen->mmio.size );
-       FREE( screen );
-       __driUtilMessage("%s: drmMap (2) failed\n", __FUNCTION__ );
-       return NULL;
-     }
-     if (screen->chip_family < CHIP_FAMILY_R600)
-	     screen->scratch = (__volatile__ uint32_t *)
-		     ((GLubyte *)screen->status.map + RADEON_SCRATCH_REG_OFFSET);
-     else
-	     screen->scratch = (__volatile__ uint32_t *)
-		     ((GLubyte *)screen->status.map + R600_SCRATCH_REG_OFFSET);
-
-     screen->buffers = drmMapBufs( sPriv->fd );
-     if ( !screen->buffers ) {
+		  screen->gartTextures.handle,
+		  screen->gartTextures.size,
+		  (drmAddressPtr)&screen->gartTextures.map ) ) {
+       drmUnmapBufs( screen->buffers );
        drmUnmap( screen->status.map, screen->status.size );
        drmUnmap( screen->mmio.map, screen->mmio.size );
        FREE( screen );
-       __driUtilMessage("%s: drmMapBufs failed\n", __FUNCTION__ );
+       __driUtilMessage("%s: drmMap failed for GART texture area\n", __FUNCTION__);
        return NULL;
-     }
-     
-     if ( dri_priv->gartTexHandle && dri_priv->gartTexMapSize ) {
-       screen->gartTextures.handle = dri_priv->gartTexHandle;
-       screen->gartTextures.size   = dri_priv->gartTexMapSize;
-       if ( drmMap( sPriv->fd,
-		    screen->gartTextures.handle,
-		    screen->gartTextures.size,
-		    (drmAddressPtr)&screen->gartTextures.map ) ) {
-	 drmUnmapBufs( screen->buffers );
-	 drmUnmap( screen->status.map, screen->status.size );
-	 drmUnmap( screen->mmio.map, screen->mmio.size );
-	 FREE( screen );
-	 __driUtilMessage("%s: drmMap failed for GART texture area\n", __FUNCTION__);
-	 return NULL;
-       }
-       
-       screen->gart_texture_offset = dri_priv->gartTexOffset + screen->gart_base;
-     }
-   }
+    }
 
+     screen->gart_texture_offset = dri_priv->gartTexOffset + screen->gart_base;
+   }
 
    if ((screen->chip_family == CHIP_FAMILY_R350 || screen->chip_family == CHIP_FAMILY_R300) &&
        sPriv->ddx_version.minor < 2) {
@@ -1161,6 +1142,17 @@ radeonCreateScreen( __DRIscreenPrivate *sPriv )
        } else {
 	   screen->num_gb_pipes = temp;
        }
+
+       /* pipe overrides */
+       switch (dri_priv->deviceID) {
+       case PCI_CHIP_R300_AD: /* 9500 with 1 quadpipe verified by: Reid Linnemann <[email protected]> */
+       case PCI_CHIP_RV410_5E4C: /* RV410 SE only have 1 quadpipe */
+       case PCI_CHIP_RV410_5E4F: /* RV410 SE only have 1 quadpipe */
+	   screen->num_gb_pipes = 1;
+	   break;
+       default:
+	   break;
+       }
    }
 
    if ( sPriv->drm_version.minor >= 10 ) {
@@ -1224,26 +1216,24 @@ radeonCreateScreen( __DRIscreenPrivate *sPriv )
        screen->extensions[i++] = &driMediaStreamCounterExtension.base;
    }
 
-   if (!screen->kernel_mm) {
 #if !RADEON_COMMON
-   	screen->extensions[i++] = &radeonTexOffsetExtension.base;
+   screen->extensions[i++] = &radeonTexOffsetExtension.base;
 #endif
 
 #if RADEON_COMMON && defined(RADEON_COMMON_FOR_R200)
-        if (IS_R200_CLASS(screen))
-            screen->extensions[i++] = &r200AllocateExtension.base;
+   if (IS_R200_CLASS(screen))
+      screen->extensions[i++] = &r200AllocateExtension.base;
 
-        screen->extensions[i++] = &r200texOffsetExtension.base;
+   screen->extensions[i++] = &r200texOffsetExtension.base;
 #endif
 
 #if RADEON_COMMON && defined(RADEON_COMMON_FOR_R300)
-        screen->extensions[i++] = &r300texOffsetExtension.base;
+   screen->extensions[i++] = &r300texOffsetExtension.base;
 #endif
 
 #if RADEON_COMMON && defined(RADEON_COMMON_FOR_R600)
-        screen->extensions[i++] = &r600texOffsetExtension.base;
+   screen->extensions[i++] = &r600texOffsetExtension.base;
 #endif
-   }
 
    screen->extensions[i++] = NULL;
    sPriv->extensions = screen->extensions;
@@ -1253,10 +1243,7 @@ radeonCreateScreen( __DRIscreenPrivate *sPriv )
    screen->sarea = (drm_radeon_sarea_t *) ((GLubyte *) sPriv->pSAREA +
 					       screen->sarea_priv_offset);
 
-   if (screen->kernel_mm)
-     screen->bom = radeon_bo_manager_gem_ctor(sPriv->fd);
-   else
-     screen->bom = radeon_bo_manager_legacy_ctor(screen);
+   screen->bom = radeon_bo_manager_legacy_ctor(screen);
    if (screen->bom == NULL) {
      free(screen);
      return NULL;
@@ -1271,7 +1258,7 @@ radeonCreateScreen2(__DRIscreenPrivate *sPriv)
    radeonScreenPtr screen;
    int i;
    int ret;
-   uint32_t device_id;
+   uint32_t device_id = 0;
    uint32_t temp = 0;
 
    /* Allocate the private area */
@@ -1294,7 +1281,15 @@ radeonCreateScreen2(__DRIscreenPrivate *sPriv)
    screen->kernel_mm = 1;
    screen->chip_flags = 0;
 
-   ret = radeonGetParam(sPriv, RADEON_PARAM_IRQ_NR, &screen->irq);
+   /* if we have kms we can support all of these */
+   screen->drmSupportsCubeMapsR200 = 1;
+   screen->drmSupportsBlendColor = 1;
+   screen->drmSupportsTriPerf = 1;
+   screen->drmSupportsFragShader = 1;
+   screen->drmSupportsPointSprites = 1;
+   screen->drmSupportsCubeMapsR100 = 1;
+   screen->drmSupportsVertexProgram = 1;
+   screen->irq = 1;
 
    ret = radeonGetParam(sPriv, RADEON_PARAM_DEVICE_ID, &device_id);
    if (ret) {
@@ -1346,6 +1341,18 @@ radeonCreateScreen2(__DRIscreenPrivate *sPriv)
        } else {
 	   screen->num_gb_pipes = temp;
        }
+
+       /* pipe overrides */
+       switch (device_id) {
+       case PCI_CHIP_R300_AD: /* 9500 with 1 quadpipe verified by: Reid Linnemann <[email protected]> */
+       case PCI_CHIP_RV410_5E4C: /* RV410 SE only have 1 quadpipe */
+       case PCI_CHIP_RV410_5E4F: /* RV410 SE only have 1 quadpipe */
+	   screen->num_gb_pipes = 1;
+	   break;
+       default:
+	   break;
+       }
+
    }
 
    i = 0;
@@ -1586,6 +1593,7 @@ static GLboolean radeonCreateContext(const __GLcontextModes * glVisual,
 #endif
 
 #if !RADEON_COMMON
+	(void)screen;
 	return r100CreateContext(glVisual, driContextPriv, sharedContextPriv);
 #endif
 	return GL_FALSE;
diff --git a/src/mesa/drivers/dri/radeon/radeon_screen.h b/src/mesa/drivers/dri/radeon/radeon_screen.h
index 5194224acb7..2a2f6b1b0b8 100644
--- a/src/mesa/drivers/dri/radeon/radeon_screen.h
+++ b/src/mesa/drivers/dri/radeon/radeon_screen.h
@@ -59,6 +59,7 @@ typedef struct radeon_screen {
    int chip_flags;
    int cpp;
    int card_type;
+   int device_id; /* PCI ID */
    int AGPMode;
    unsigned int irq;			/* IRQ number (0 means none) */
 
diff --git a/src/mesa/drivers/dri/radeon/radeon_state.c b/src/mesa/drivers/dri/radeon/radeon_state.c
index d9a7ef60612..06b8c299368 100644
--- a/src/mesa/drivers/dri/radeon/radeon_state.c
+++ b/src/mesa/drivers/dri/radeon/radeon_state.c
@@ -149,7 +149,7 @@ static void radeonBlendFuncSeparate( GLcontext *ctx,
 				     GLenum sfactorA, GLenum dfactorA )
 {
    r100ContextPtr rmesa = R100_CONTEXT(ctx);
-   GLuint b = rmesa->hw.ctx.cmd[CTX_RB3D_BLENDCNTL] & 
+   GLuint b = rmesa->hw.ctx.cmd[CTX_RB3D_BLENDCNTL] &
       ~(RADEON_SRC_BLEND_MASK | RADEON_DST_BLEND_MASK);
    GLboolean fallback = GL_FALSE;
 
@@ -392,7 +392,7 @@ static void radeonFogfv( GLcontext *ctx, GLenum pname, const GLfloat *param )
 	 rmesa->hw.fog.cmd[FOG_D] = d.i;
       }
       break;
-   case GL_FOG_COLOR: 
+   case GL_FOG_COLOR:
       RADEON_STATECHANGE( rmesa, ctx );
       UNCLAMPED_FLOAT_TO_RGB_CHAN( col, ctx->Fog.Color );
       rmesa->hw.ctx.cmd[CTX_PP_FOG_COLOR] &= ~RADEON_FOG_COLOR_MASK;
@@ -495,7 +495,7 @@ static void radeonLineStipple( GLcontext *ctx, GLint factor, GLushort pattern )
    r100ContextPtr rmesa = R100_CONTEXT(ctx);
 
    RADEON_STATECHANGE( rmesa, lin );
-   rmesa->hw.lin.cmd[LIN_RE_LINE_PATTERN] = 
+   rmesa->hw.lin.cmd[LIN_RE_LINE_PATTERN] =
       ((((GLuint)factor & 0xff) << 16) | ((GLuint)pattern));
 }
 
@@ -558,7 +558,7 @@ static void radeonPolygonStipple( GLcontext *ctx, const GLubyte *mask )
    /* FIXME: Use window x,y offsets into stipple RAM.
     */
    stipple.mask = rmesa->state.stipple.mask;
-   drmCommandWrite( rmesa->radeon.dri.fd, DRM_RADEON_STIPPLE, 
+   drmCommandWrite( rmesa->radeon.dri.fd, DRM_RADEON_STIPPLE,
                     &stipple, sizeof(drm_radeon_stipple_t) );
    UNLOCK_HARDWARE( &rmesa->radeon );
 }
@@ -569,7 +569,7 @@ static void radeonPolygonMode( GLcontext *ctx, GLenum face, GLenum mode )
    GLboolean flag = (ctx->_TriangleCaps & DD_TRI_UNFILLED) != 0;
 
    /* Can't generally do unfilled via tcl, but some good special
-    * cases work. 
+    * cases work.
     */
    TCL_FALLBACK( ctx, RADEON_TCL_FALLBACK_UNFILLED, flag);
    if (rmesa->radeon.TclFallback) {
@@ -617,7 +617,7 @@ static void radeonUpdateSpecular( GLcontext *ctx )
       rmesa->hw.tcl.cmd[TCL_OUTPUT_VTXFMT] |= RADEON_TCL_VTX_PK_DIFFUSE;
       rmesa->hw.tcl.cmd[TCL_LIGHT_MODEL_CTL] |= RADEON_LIGHTING_ENABLE;
       p |=  RADEON_SPECULAR_ENABLE;
-      rmesa->hw.tcl.cmd[TCL_LIGHT_MODEL_CTL] &= 
+      rmesa->hw.tcl.cmd[TCL_LIGHT_MODEL_CTL] &=
 	 ~RADEON_DIFFUSE_SPECULAR_COMBINE;
    }
    else if (ctx->Light.Enabled) {
@@ -647,7 +647,7 @@ static void radeonUpdateSpecular( GLcontext *ctx )
 	    RADEON_TCL_COMPUTE_SPECULAR) != 0;
       }
    }
- 
+
    TCL_FALLBACK( ctx, RADEON_TCL_FALLBACK_FOGCOORDSPEC, flag);
 
    if (NEED_SECONDARY_COLOR(ctx)) {
@@ -663,7 +663,7 @@ static void radeonUpdateSpecular( GLcontext *ctx )
 
    /* Update vertex/render formats
     */
-   if (rmesa->radeon.TclFallback) { 
+   if (rmesa->radeon.TclFallback) {
       radeonChooseRenderState( ctx );
       radeonChooseVertexState( ctx );
    }
@@ -675,7 +675,7 @@ static void radeonUpdateSpecular( GLcontext *ctx )
  */
 
 
-/* Update on colormaterial, material emmissive/ambient, 
+/* Update on colormaterial, material emmissive/ambient,
  * lightmodel.globalambient
  */
 static void update_global_ambient( GLcontext *ctx )
@@ -688,23 +688,23 @@ static void update_global_ambient( GLcontext *ctx )
     */
    if ((rmesa->hw.tcl.cmd[TCL_LIGHT_MODEL_CTL] &
        ((3 << RADEON_EMISSIVE_SOURCE_SHIFT) |
-	(3 << RADEON_AMBIENT_SOURCE_SHIFT))) == 0) 
+	(3 << RADEON_AMBIENT_SOURCE_SHIFT))) == 0)
    {
-      COPY_3V( &fcmd[GLT_RED], 
+      COPY_3V( &fcmd[GLT_RED],
 	       ctx->Light.Material.Attrib[MAT_ATTRIB_FRONT_EMISSION]);
       ACC_SCALE_3V( &fcmd[GLT_RED],
 		   ctx->Light.Model.Ambient,
 		   ctx->Light.Material.Attrib[MAT_ATTRIB_FRONT_AMBIENT]);
-   } 
+   }
    else
    {
       COPY_3V( &fcmd[GLT_RED], ctx->Light.Model.Ambient );
    }
-   
+
    RADEON_DB_STATECHANGE(rmesa, &rmesa->hw.glt);
 }
 
-/* Update on change to 
+/* Update on change to
  *    - light[p].colors
  *    - light[p].enabled
  */
@@ -718,10 +718,10 @@ static void update_light_colors( GLcontext *ctx, GLuint p )
       r100ContextPtr rmesa = R100_CONTEXT(ctx);
       float *fcmd = (float *)RADEON_DB_STATE( lit[p] );
 
-      COPY_4V( &fcmd[LIT_AMBIENT_RED], l->Ambient );	 
+      COPY_4V( &fcmd[LIT_AMBIENT_RED], l->Ambient );
       COPY_4V( &fcmd[LIT_DIFFUSE_RED], l->Diffuse );
       COPY_4V( &fcmd[LIT_SPECULAR_RED], l->Specular );
-      
+
       RADEON_DB_STATECHANGE( rmesa, &rmesa->hw.lit[p] );
    }
 }
@@ -735,7 +735,7 @@ static void check_twoside_fallback( GLcontext *ctx )
 
    if (ctx->Light.Enabled && ctx->Light.Model.TwoSide) {
       if (ctx->Light.ColorMaterialEnabled &&
-	  (ctx->Light.ColorMaterialBitmask & BACK_MATERIAL_BITS) != 
+	  (ctx->Light.ColorMaterialBitmask & BACK_MATERIAL_BITS) !=
 	  ((ctx->Light.ColorMaterialBitmask & FRONT_MATERIAL_BITS)<<1))
 	 fallback = GL_TRUE;
       else {
@@ -743,7 +743,7 @@ static void check_twoside_fallback( GLcontext *ctx )
 	    if (memcmp( ctx->Light.Material.Attrib[i],
 			ctx->Light.Material.Attrib[i+1],
 			sizeof(GLfloat)*4) != 0) {
-	       fallback = GL_TRUE;  
+	       fallback = GL_TRUE;
 	       break;
 	    }
       }
@@ -761,8 +761,8 @@ static void radeonColorMaterial( GLcontext *ctx, GLenum face, GLenum mode )
       light_model_ctl1 &= ~((3 << RADEON_EMISSIVE_SOURCE_SHIFT) |
 			   (3 << RADEON_AMBIENT_SOURCE_SHIFT) |
 			   (3 << RADEON_DIFFUSE_SOURCE_SHIFT) |
-			   (3 << RADEON_SPECULAR_SOURCE_SHIFT)); 
-   
+			   (3 << RADEON_SPECULAR_SOURCE_SHIFT));
+
    if (ctx->Light.ColorMaterialEnabled) {
       GLuint mask = ctx->Light.ColorMaterialBitmask;
 
@@ -783,7 +783,7 @@ static void radeonColorMaterial( GLcontext *ctx, GLenum face, GLenum mode )
 	 light_model_ctl1 |= (RADEON_LM_SOURCE_STATE_MULT <<
 			     RADEON_AMBIENT_SOURCE_SHIFT);
       }
-	 
+
       if (mask & MAT_BIT_FRONT_DIFFUSE) {
 	 light_model_ctl1 |= (RADEON_LM_SOURCE_VERTEX_DIFFUSE <<
 			     RADEON_DIFFUSE_SOURCE_SHIFT);
@@ -792,7 +792,7 @@ static void radeonColorMaterial( GLcontext *ctx, GLenum face, GLenum mode )
 	 light_model_ctl1 |= (RADEON_LM_SOURCE_STATE_MULT <<
 			     RADEON_DIFFUSE_SOURCE_SHIFT);
       }
-   
+
       if (mask & MAT_BIT_FRONT_SPECULAR) {
 	 light_model_ctl1 |= (RADEON_LM_SOURCE_VERTEX_DIFFUSE <<
 			     RADEON_SPECULAR_SOURCE_SHIFT);
@@ -810,10 +810,10 @@ static void radeonColorMaterial( GLcontext *ctx, GLenum face, GLenum mode )
 		   (RADEON_LM_SOURCE_STATE_MULT << RADEON_DIFFUSE_SOURCE_SHIFT) |
 		   (RADEON_LM_SOURCE_STATE_MULT << RADEON_SPECULAR_SOURCE_SHIFT);
    }
-   
+
       if (light_model_ctl1 != rmesa->hw.tcl.cmd[TCL_LIGHT_MODEL_CTL]) {
 	 RADEON_STATECHANGE( rmesa, tcl );
-	 rmesa->hw.tcl.cmd[TCL_LIGHT_MODEL_CTL] = light_model_ctl1;      
+	 rmesa->hw.tcl.cmd[TCL_LIGHT_MODEL_CTL] = light_model_ctl1;
    }
 }
 
@@ -823,14 +823,14 @@ void radeonUpdateMaterial( GLcontext *ctx )
    GLfloat (*mat)[4] = ctx->Light.Material.Attrib;
    GLfloat *fcmd = (GLfloat *)RADEON_DB_STATE( mtl );
    GLuint mask = ~0;
-   
+
    if (ctx->Light.ColorMaterialEnabled)
       mask &= ~ctx->Light.ColorMaterialBitmask;
 
    if (RADEON_DEBUG & DEBUG_STATE)
       fprintf(stderr, "%s\n", __FUNCTION__);
 
-      
+
    if (mask & MAT_BIT_FRONT_EMISSION) {
       fcmd[MTL_EMMISSIVE_RED]   = mat[MAT_ATTRIB_FRONT_EMISSION][0];
       fcmd[MTL_EMMISSIVE_GREEN] = mat[MAT_ATTRIB_FRONT_EMISSION][1];
@@ -880,7 +880,7 @@ void radeonUpdateMaterial( GLcontext *ctx )
  *
  * which are calculated in light.c and are correct for the current
  * lighting space (model or eye), hence dependencies on _NEW_MODELVIEW
- * and _MESA_NEW_NEED_EYE_COORDS.  
+ * and _MESA_NEW_NEED_EYE_COORDS.
  */
 static void update_light( GLcontext *ctx )
 {
@@ -897,12 +897,12 @@ static void update_light( GLcontext *ctx )
 	 tmp &= ~RADEON_LIGHT_IN_MODELSPACE;
       else
 	 tmp |= RADEON_LIGHT_IN_MODELSPACE;
-      
+
 
       /* Leave this test disabled: (unexplained q3 lockup) (even with
          new packets)
       */
-      if (tmp != rmesa->hw.tcl.cmd[TCL_LIGHT_MODEL_CTL]) 
+      if (tmp != rmesa->hw.tcl.cmd[TCL_LIGHT_MODEL_CTL])
       {
 	 RADEON_STATECHANGE( rmesa, tcl );
 	 rmesa->hw.tcl.cmd[TCL_LIGHT_MODEL_CTL] = tmp;
@@ -926,10 +926,10 @@ static void update_light( GLcontext *ctx )
 	 if (ctx->Light.Light[p].Enabled) {
 	    struct gl_light *l = &ctx->Light.Light[p];
 	    GLfloat *fcmd = (GLfloat *)RADEON_DB_STATE( lit[p] );
-	    
+
 	    if (l->EyePosition[3] == 0.0) {
-	       COPY_3FV( &fcmd[LIT_POSITION_X], l->_VP_inf_norm ); 
-	       COPY_3FV( &fcmd[LIT_DIRECTION_X], l->_h_inf_norm ); 
+	       COPY_3FV( &fcmd[LIT_POSITION_X], l->_VP_inf_norm );
+	       COPY_3FV( &fcmd[LIT_DIRECTION_X], l->_h_inf_norm );
 	       fcmd[LIT_POSITION_W] = 0;
 	       fcmd[LIT_DIRECTION_W] = 0;
 	    } else {
@@ -953,26 +953,26 @@ static void radeonLightfv( GLcontext *ctx, GLenum light,
    GLint p = light - GL_LIGHT0;
    struct gl_light *l = &ctx->Light.Light[p];
    GLfloat *fcmd = (GLfloat *)rmesa->hw.lit[p].cmd;
-   
+
 
    switch (pname) {
-   case GL_AMBIENT:		
+   case GL_AMBIENT:
    case GL_DIFFUSE:
    case GL_SPECULAR:
       update_light_colors( ctx, p );
       break;
 
-   case GL_SPOT_DIRECTION: 
-      /* picked up in update_light */	
+   case GL_SPOT_DIRECTION:
+      /* picked up in update_light */
       break;
 
    case GL_POSITION: {
-      /* positions picked up in update_light, but can do flag here */	
+      /* positions picked up in update_light, but can do flag here */
       GLuint flag;
       GLuint idx = TCL_PER_LIGHT_CTL_0 + p/2;
 
       /* FIXME: Set RANGE_ATTEN only when needed */
-      if (p&1) 
+      if (p&1)
 	 flag = RADEON_LIGHT_1_IS_LOCAL;
       else
 	 flag = RADEON_LIGHT_0_IS_LOCAL;
@@ -1064,7 +1064,7 @@ static void radeonLightfv( GLcontext *ctx, GLenum light,
    }
 }
 
-		  
+
 
 
 static void radeonLightModelfv( GLcontext *ctx, GLenum pname,
@@ -1073,7 +1073,7 @@ static void radeonLightModelfv( GLcontext *ctx, GLenum pname,
    r100ContextPtr rmesa = R100_CONTEXT(ctx);
 
    switch (pname) {
-      case GL_LIGHT_MODEL_AMBIENT: 
+      case GL_LIGHT_MODEL_AMBIENT:
 	 update_global_ambient( ctx );
 	 break;
 
@@ -1247,14 +1247,14 @@ static void radeonStencilOpSeparate( GLcontext *ctx, GLenum face, GLenum fail,
    /* radeon 7200 have stencil bug, DEC and INC_WRAP will actually both do DEC_WRAP,
       and DEC_WRAP (and INVERT) will do INVERT. No way to get correct INC_WRAP and DEC,
       but DEC_WRAP can be fixed by using DEC and INC_WRAP at least use INC. */
-   
+
    GLuint tempRADEON_STENCIL_FAIL_DEC_WRAP;
    GLuint tempRADEON_STENCIL_FAIL_INC_WRAP;
    GLuint tempRADEON_STENCIL_ZFAIL_DEC_WRAP;
    GLuint tempRADEON_STENCIL_ZFAIL_INC_WRAP;
    GLuint tempRADEON_STENCIL_ZPASS_DEC_WRAP;
    GLuint tempRADEON_STENCIL_ZPASS_INC_WRAP;
-   
+
    if (rmesa->radeon.radeonScreen->chip_flags & RADEON_CHIPSET_BROKEN_STENCIL) {
       tempRADEON_STENCIL_FAIL_DEC_WRAP = RADEON_STENCIL_FAIL_DEC;
       tempRADEON_STENCIL_FAIL_INC_WRAP = RADEON_STENCIL_FAIL_INC;
@@ -1271,7 +1271,7 @@ static void radeonStencilOpSeparate( GLcontext *ctx, GLenum face, GLenum fail,
       tempRADEON_STENCIL_ZPASS_DEC_WRAP = RADEON_STENCIL_ZPASS_DEC_WRAP;
       tempRADEON_STENCIL_ZPASS_INC_WRAP = RADEON_STENCIL_ZPASS_INC_WRAP;
    }
-   
+
    RADEON_STATECHANGE( rmesa, ctx );
    rmesa->hw.ctx.cmd[CTX_RB3D_ZSTENCILCNTL] &= ~(RADEON_STENCIL_FAIL_MASK |
 					       RADEON_STENCIL_ZFAIL_MASK |
@@ -1363,7 +1363,7 @@ static void radeonClearStencil( GLcontext *ctx, GLint s )
 {
    r100ContextPtr rmesa = R100_CONTEXT(ctx);
 
-   rmesa->radeon.state.stencil.clear = 
+   rmesa->radeon.state.stencil.clear =
       ((GLuint) (ctx->Stencil.Clear & 0xff) |
        (0xff << RADEON_STENCIL_MASK_SHIFT) |
        ((ctx->Stencil.WriteMask[0] & 0xff) << RADEON_STENCIL_WRITEMASK_SHIFT));
@@ -1388,7 +1388,7 @@ static void radeonClearStencil( GLcontext *ctx, GLint s )
 void radeonUpdateWindow( GLcontext *ctx )
 {
    r100ContextPtr rmesa = R100_CONTEXT(ctx);
-   __DRIdrawablePrivate *dPriv = rmesa->radeon.dri.drawable;
+   __DRIdrawablePrivate *dPriv = radeon_get_drawable(&rmesa->radeon);
    GLfloat xoffset = dPriv ? (GLfloat) dPriv->x : 0;
    GLfloat yoffset = dPriv ? (GLfloat) dPriv->y + dPriv->h : 0;
    const GLfloat *v = ctx->Viewport._WindowMap.m;
@@ -1443,7 +1443,7 @@ static void radeonDepthRange( GLcontext *ctx, GLclampd nearval,
 void radeonUpdateViewportOffset( GLcontext *ctx )
 {
    r100ContextPtr rmesa = R100_CONTEXT(ctx);
-   __DRIdrawablePrivate *dPriv = rmesa->radeon.dri.drawable;
+   __DRIdrawablePrivate *dPriv = radeon_get_drawable(&rmesa->radeon);
    GLfloat xoffset = (GLfloat)dPriv->x;
    GLfloat yoffset = (GLfloat)dPriv->y + dPriv->h;
    const GLfloat *v = ctx->Viewport._WindowMap.m;
@@ -1473,8 +1473,8 @@ void radeonUpdateViewportOffset( GLcontext *ctx )
                 RADEON_STIPPLE_Y_OFFSET_MASK);
 
          /* add magic offsets, then invert */
-         stx = 31 - ((rmesa->radeon.dri.drawable->x - 1) & RADEON_STIPPLE_COORD_MASK);
-         sty = 31 - ((rmesa->radeon.dri.drawable->y + rmesa->radeon.dri.drawable->h - 1)
+         stx = 31 - ((dPriv->x - 1) & RADEON_STIPPLE_COORD_MASK);
+         sty = 31 - ((dPriv->y + dPriv->h - 1)
                      & RADEON_STIPPLE_COORD_MASK);
 
          m |= ((stx << RADEON_STIPPLE_X_OFFSET_SHIFT) |
@@ -1613,7 +1613,7 @@ static void radeonEnable( GLcontext *ctx, GLenum cap, GLboolean state )
    case GL_CLIP_PLANE2:
    case GL_CLIP_PLANE3:
    case GL_CLIP_PLANE4:
-   case GL_CLIP_PLANE5: 
+   case GL_CLIP_PLANE5:
       p = cap-GL_CLIP_PLANE0;
       RADEON_STATECHANGE( rmesa, tcl );
       if (state) {
@@ -1678,13 +1678,13 @@ static void radeonEnable( GLcontext *ctx, GLenum cap, GLboolean state )
    case GL_LIGHT7:
       RADEON_STATECHANGE(rmesa, tcl);
       p = cap - GL_LIGHT0;
-      if (p&1) 
+      if (p&1)
 	 flag = (RADEON_LIGHT_1_ENABLE |
-		 RADEON_LIGHT_1_ENABLE_AMBIENT | 
+		 RADEON_LIGHT_1_ENABLE_AMBIENT |
 		 RADEON_LIGHT_1_ENABLE_SPECULAR);
       else
 	 flag = (RADEON_LIGHT_0_ENABLE |
-		 RADEON_LIGHT_0_ENABLE_AMBIENT | 
+		 RADEON_LIGHT_0_ENABLE_AMBIENT |
 		 RADEON_LIGHT_0_ENABLE_SPECULAR);
 
       if (state)
@@ -1692,7 +1692,7 @@ static void radeonEnable( GLcontext *ctx, GLenum cap, GLboolean state )
       else
 	 rmesa->hw.tcl.cmd[p/2 + TCL_PER_LIGHT_CTL_0] &= ~flag;
 
-      /* 
+      /*
        */
       update_light_colors( ctx, p );
       break;
@@ -1730,7 +1730,7 @@ static void radeonEnable( GLcontext *ctx, GLenum cap, GLboolean state )
 	 rmesa->hw.ctx.cmd[CTX_RB3D_CNTL] &= ~RADEON_ROP_ENABLE;
       }
       break;
-      
+
    case GL_NORMALIZE:
       RADEON_STATECHANGE( rmesa, tcl );
       if ( state ) {
@@ -1830,7 +1830,7 @@ static void radeonEnable( GLcontext *ctx, GLenum cap, GLboolean state )
    case GL_TEXTURE_GEN_T:
       /* Picked up in radeonUpdateTextureState.
        */
-      rmesa->recheck_texgen[ctx->Texture.CurrentUnit] = GL_TRUE; 
+      rmesa->recheck_texgen[ctx->Texture.CurrentUnit] = GL_TRUE;
       break;
 
    case GL_COLOR_SUM_EXT:
@@ -1864,7 +1864,7 @@ static void radeonLightingSpaceChange( GLcontext *ctx )
       rmesa->hw.tcl.cmd[TCL_LIGHT_MODEL_CTL] &= ~RADEON_RESCALE_NORMALS;
    }
 
-   if (RADEON_DEBUG & DEBUG_STATE) 
+   if (RADEON_DEBUG & DEBUG_STATE)
       fprintf(stderr, "%s %d AFTER %x\n", __FUNCTION__, ctx->_NeedEyeCoords,
 	      rmesa->hw.tcl.cmd[TCL_LIGHT_MODEL_CTL]);
 }
@@ -2051,7 +2051,7 @@ static GLboolean r100ValidateBuffers(GLcontext *ctx)
    int i;
 
    radeon_validate_reset_bos(&rmesa->radeon);
-   
+
    rrb = radeon_get_colorbuffer(&rmesa->radeon);
    /* color buffer */
    if (rrb && rrb->bo) {
@@ -2069,7 +2069,7 @@ static GLboolean r100ValidateBuffers(GLcontext *ctx)
 
    for (i = 0; i < ctx->Const.MaxTextureImageUnits; ++i) {
       radeonTexObj *t;
-      
+
       if (!ctx->Texture.Unit[i]._ReallyEnabled)
 	 continue;
 
@@ -2094,7 +2094,7 @@ GLboolean radeonValidateState( GLcontext *ctx )
    r100ContextPtr rmesa = R100_CONTEXT(ctx);
    GLuint new_state = rmesa->radeon.NewGLState;
 
-   if (new_state & (_NEW_BUFFERS | _NEW_COLOR | _NEW_PIXEL)) {
+   if (new_state & _NEW_BUFFERS) {
      _mesa_update_framebuffer(ctx);
      /* this updates the DrawBuffer's Width/Height if it's a FBO */
      _mesa_update_draw_buffer_bounds(ctx);
@@ -2112,7 +2112,7 @@ GLboolean radeonValidateState( GLcontext *ctx )
 
    /* Need an event driven matrix update?
     */
-   if (new_state & (_NEW_MODELVIEW|_NEW_PROJECTION)) 
+   if (new_state & (_NEW_MODELVIEW|_NEW_PROJECTION))
       upload_matrix( rmesa, ctx->_ModelProjectMatrix.m, MODEL_PROJ );
 
    /* Need these for lighting (shouldn't upload otherwise)
@@ -2136,7 +2136,7 @@ GLboolean radeonValidateState( GLcontext *ctx )
    /* emit all active clip planes if projection matrix changes.
     */
    if (new_state & (_NEW_PROJECTION)) {
-      if (ctx->Transform.ClipPlanesEnabled) 
+      if (ctx->Transform.ClipPlanesEnabled)
 	 radeonUpdateClipPlanes( ctx );
    }
 
@@ -2165,8 +2165,8 @@ static GLboolean check_material( GLcontext *ctx )
    TNLcontext *tnl = TNL_CONTEXT(ctx);
    GLint i;
 
-   for (i = _TNL_ATTRIB_MAT_FRONT_AMBIENT; 
-	i < _TNL_ATTRIB_MAT_BACK_INDEXES; 
+   for (i = _TNL_ATTRIB_MAT_FRONT_AMBIENT;
+	i < _TNL_ATTRIB_MAT_BACK_INDEXES;
 	i++)
       if (tnl->vb.AttribPtr[i] &&
 	  tnl->vb.AttribPtr[i]->stride)
@@ -2174,7 +2174,7 @@ static GLboolean check_material( GLcontext *ctx )
 
    return GL_FALSE;
 }
-      
+
 
 static void radeonWrapRunPipeline( GLcontext *ctx )
 {
@@ -2197,7 +2197,7 @@ static void radeonWrapRunPipeline( GLcontext *ctx )
    }
 
    /* Run the pipeline.
-    */ 
+    */
    _tnl_run_pipeline( ctx );
 
    if (has_material) {
diff --git a/src/mesa/drivers/dri/radeon/radeon_state_init.c b/src/mesa/drivers/dri/radeon/radeon_state_init.c
index 174a7e1862f..f5d4189d669 100644
--- a/src/mesa/drivers/dri/radeon/radeon_state_init.c
+++ b/src/mesa/drivers/dri/radeon/radeon_state_init.c
@@ -448,16 +448,28 @@ static void ctx_emit_cs(GLcontext *ctx, struct radeon_state_atom *atom)
    // }
 
    END_BATCH();
+   BEGIN_BATCH_NO_AUTOSTATE(4);
+   OUT_BATCH(CP_PACKET0(RADEON_RE_TOP_LEFT, 0));
+   OUT_BATCH(0);
+   OUT_BATCH(CP_PACKET0(RADEON_RE_WIDTH_HEIGHT, 0));
+   if (rrb) {
+       OUT_BATCH(((rrb->width - 1) << RADEON_RE_WIDTH_SHIFT) |
+                 ((rrb->height - 1) << RADEON_RE_HEIGHT_SHIFT));
+   } else {
+       OUT_BATCH(0);
+   }
+   END_BATCH();
 }
 
 static void cube_emit(GLcontext *ctx, struct radeon_state_atom *atom)
 {
    r100ContextPtr r100 = R100_CONTEXT(ctx);
    BATCH_LOCALS(&r100->radeon);
-   uint32_t dwords = atom->cmd_size;
+   uint32_t dwords = 2;
    int i = atom->idx, j;
    radeonTexObj *t = r100->state.texture.unit[i].texobj;
    radeon_mipmap_level *lvl;
+   uint32_t base_reg;
 
    if (!(ctx->Texture.Unit[i]._ReallyEnabled & TEXTURE_CUBE_BIT))
 	return;
@@ -468,10 +480,17 @@ static void cube_emit(GLcontext *ctx, struct radeon_state_atom *atom)
    if (!t->mt)
 	return;
 
-   BEGIN_BATCH_NO_AUTOSTATE(dwords + 10);
-   OUT_BATCH_TABLE(atom->cmd, 3);
+   switch(i) {
+	case 1: base_reg = RADEON_PP_CUBIC_OFFSET_T1_0; break;
+	case 2: base_reg = RADEON_PP_CUBIC_OFFSET_T2_0; break;
+	default:
+	case 0: base_reg = RADEON_PP_CUBIC_OFFSET_T0_0; break;
+   };
+   BEGIN_BATCH_NO_AUTOSTATE(dwords + (5 * 4));
+   OUT_BATCH_TABLE(atom->cmd, 2);
    lvl = &t->mt->levels[0];
    for (j = 0; j < 5; j++) {
+	OUT_BATCH(CP_PACKET0(base_reg + (4 * (j-1)), 0));
 	OUT_BATCH_RELOC(lvl->faces[j].offset, t->mt->bo, lvl->faces[j].offset,
 			RADEON_GEM_DOMAIN_VRAM, 0, 0);
    }
diff --git a/src/mesa/drivers/dri/radeon/radeon_texstate.c b/src/mesa/drivers/dri/radeon/radeon_texstate.c
index e4df33766eb..279bcd4df6a 100644
--- a/src/mesa/drivers/dri/radeon/radeon_texstate.c
+++ b/src/mesa/drivers/dri/radeon/radeon_texstate.c
@@ -722,7 +722,10 @@ void radeonSetTexBuffer2(__DRIcontext *pDRICtx, GLint target, GLint glx_texture_
 	pitch_val = rb->pitch;
 	switch (rb->cpp) {
 	case 4:
-		t->pp_txformat = tx_table[MESA_FORMAT_ARGB8888].format;
+		if (glx_texture_format == GLX_TEXTURE_FORMAT_RGB_EXT)
+			t->pp_txformat = tx_table[MESA_FORMAT_RGB888].format;
+		else
+			t->pp_txformat = tx_table[MESA_FORMAT_ARGB8888].format;
 		t->pp_txfilter |= tx_table[MESA_FORMAT_ARGB8888].filter;
 		break;
 	case 3:
diff --git a/src/mesa/drivers/dri/radeon/radeon_texture.c b/src/mesa/drivers/dri/radeon/radeon_texture.c
index 64029eeb91a..ee66fc2ea0d 100644
--- a/src/mesa/drivers/dri/radeon/radeon_texture.c
+++ b/src/mesa/drivers/dri/radeon/radeon_texture.c
@@ -38,6 +38,7 @@
 #include "main/texstore.h"
 #include "main/teximage.h"
 #include "main/texobj.h"
+#include "main/texgetimage.h"
 
 #include "xmlpool.h"		/* for symbolic values of enum-type options */
 
@@ -611,11 +612,10 @@ static void radeon_teximage(
 				_mesa_error(ctx, GL_OUT_OF_MEMORY, "glTexImage");
 		}
 
-	}
-
-	/* SGIS_generate_mipmap */
-	if (level == texObj->BaseLevel && texObj->GenerateMipmap) {
-		radeon_generate_mipmap(ctx, texObj->Target, texObj);
+		/* SGIS_generate_mipmap */
+		if (level == texObj->BaseLevel && texObj->GenerateMipmap) {
+			radeon_generate_mipmap(ctx, texObj->Target, texObj);
+		}
 	}
 
 	_mesa_unmap_teximage_pbo(ctx, packing);
@@ -740,12 +740,12 @@ static void radeon_texsubimage(GLcontext* ctx, int dims, int level,
 				_mesa_error(ctx, GL_OUT_OF_MEMORY, "glTexSubImage");
 		}
 
+		/* GL_SGIS_generate_mipmap */
+		if (level == texObj->BaseLevel && texObj->GenerateMipmap) {
+			radeon_generate_mipmap(ctx, texObj->Target, texObj);
+		}
 	}
 
-	/* GL_SGIS_generate_mipmap */
-	if (level == texObj->BaseLevel && texObj->GenerateMipmap) {
-		radeon_generate_mipmap(ctx, texObj->Target, texObj);
-	}
 	radeon_teximage_unmap(image);
 
 	_mesa_unmap_teximage_pbo(ctx, packing);
@@ -830,7 +830,7 @@ static void migrate_image_to_miptree(radeon_mipmap_tree *mt, radeon_texture_imag
 		 * In fact, that memcpy() could be done by the hardware in many
 		 * cases, provided that we have a proper memory manager.
 		 */
-		radeon_mipmap_level *srclvl = &image->mt->levels[image->mtlevel];
+		radeon_mipmap_level *srclvl = &image->mt->levels[image->mtlevel-image->mt->firstLevel];
 
 		assert(srclvl->size == dstlvl->size);
 		assert(srclvl->rowstride == dstlvl->rowstride);
diff --git a/src/mesa/drivers/osmesa/Makefile b/src/mesa/drivers/osmesa/Makefile
index 3b3984200aa..92d41494665 100644
--- a/src/mesa/drivers/osmesa/Makefile
+++ b/src/mesa/drivers/osmesa/Makefile
@@ -19,11 +19,12 @@ INCLUDE_DIRS = \
 	-I$(TOP)/src/mesa \
 	-I$(TOP)/src/mesa/main
 
+# Standalone osmesa needs to be linked with core Mesa APIs
+ifeq ($(DRIVER_DIRS), osmesa)
 CORE_MESA = $(TOP)/src/mesa/libmesa.a $(TOP)/src/mesa/libglapi.a
-
-
-.PHONY: osmesa8
-.PHONY: osmesa16
+else
+CORE_MESA =
+endif
 
 
 .c.o:
@@ -31,31 +32,12 @@ CORE_MESA = $(TOP)/src/mesa/libmesa.a $(TOP)/src/mesa/libglapi.a
 
 
 default: $(TOP)/$(LIB_DIR)/$(OSMESA_LIB_NAME)
-	@ if [ "${DRIVER_DIRS}" = "osmesa" ] ; then \
-		$(MAKE) osmesa16 ; \
-	else \
-		$(MAKE) osmesa8 ; \
-	fi
-
-
-
-
-# The normal libOSMesa is used in conjuction with libGL
-osmesa8: $(TOP)/$(LIB_DIR)/$(OSMESA_LIB_NAME)
-
-$(TOP)/$(LIB_DIR)/$(OSMESA_LIB_NAME): $(OBJECTS)
-	$(MKLIB) -o $(OSMESA_LIB) -linker '$(CC)' -ldflags '$(LDFLAGS)' \
-		-major $(MESA_MAJOR) -minor $(MESA_MINOR) -patch $(MESA_TINY) \
-		-install $(TOP)/$(LIB_DIR) $(MKLIB_OPTIONS) \
-		-id $(INSTALL_LIB_DIR)/lib$(OSMESA_LIB).$(MESA_MAJOR).dylib \
-		$(OSMESA_LIB_DEPS) $(OBJECTS)
-
-
 
 
-# The libOSMesa16/libOSMesa32 libraries do not use libGL but rather are built
-# with all the other Mesa sources (compiled with -DCHAN_BITS=16/32
-osmesa16: $(OBJECTS) $(CORE_MESA)
+# libOSMesa can be used in conjuction with libGL or with all other Mesa
+# sources. We can also build libOSMesa16/libOSMesa32 by setting
+# -DCHAN_BITS=16/32.
+$(TOP)/$(LIB_DIR)/$(OSMESA_LIB_NAME): $(OBJECTS) $(CORE_MESA)
 	$(MKLIB) -o $(OSMESA_LIB) -linker '$(CC)' -ldflags '$(LDFLAGS)' \
 		-major $(MESA_MAJOR) -minor $(MESA_MINOR) -patch $(MESA_TINY) \
 		-install $(TOP)/$(LIB_DIR) $(MKLIB_OPTIONS) \
diff --git a/src/mesa/drivers/windows/gdi/wmesa.c b/src/mesa/drivers/windows/gdi/wmesa.c
index d1103f31670..e1971db6933 100644
--- a/src/mesa/drivers/windows/gdi/wmesa.c
+++ b/src/mesa/drivers/windows/gdi/wmesa.c
@@ -1679,80 +1679,3 @@ void WMesaShareLists(WMesaContext ctx_to_share, WMesaContext ctx)
 	_mesa_share_state(&ctx->gl_ctx, &ctx_to_share->gl_ctx);	
 }
 
-/* This is hopefully a temporary hack to define some needed dispatch
- * table entries.  Hopefully, I'll find a better solution.  The
- * dispatch table generation scripts ought to be making these dummy
- * stubs as well. */
-#if !defined(__MINGW32__) || !defined(GL_NO_STDCALL)
-void gl_dispatch_stub_543(void){}
-void gl_dispatch_stub_544(void){}
-void gl_dispatch_stub_545(void){}
-void gl_dispatch_stub_546(void){}
-void gl_dispatch_stub_547(void){}
-void gl_dispatch_stub_548(void){}
-void gl_dispatch_stub_549(void){}
-void gl_dispatch_stub_550(void){}
-void gl_dispatch_stub_551(void){}
-void gl_dispatch_stub_552(void){}
-void gl_dispatch_stub_553(void){}
-void gl_dispatch_stub_554(void){}
-void gl_dispatch_stub_555(void){}
-void gl_dispatch_stub_556(void){}
-void gl_dispatch_stub_557(void){}
-void gl_dispatch_stub_558(void){}
-void gl_dispatch_stub_559(void){}
-void gl_dispatch_stub_560(void){}
-void gl_dispatch_stub_561(void){}
-void gl_dispatch_stub_565(void){}
-void gl_dispatch_stub_566(void){}
-void gl_dispatch_stub_577(void){}
-void gl_dispatch_stub_578(void){}
-void gl_dispatch_stub_603(void){}
-void gl_dispatch_stub_645(void){}
-void gl_dispatch_stub_646(void){}
-void gl_dispatch_stub_647(void){}
-void gl_dispatch_stub_648(void){}
-void gl_dispatch_stub_649(void){}
-void gl_dispatch_stub_650(void){}
-void gl_dispatch_stub_651(void){}
-void gl_dispatch_stub_652(void){}
-void gl_dispatch_stub_653(void){}
-void gl_dispatch_stub_733(void){}
-void gl_dispatch_stub_734(void){}
-void gl_dispatch_stub_735(void){}
-void gl_dispatch_stub_736(void){}
-void gl_dispatch_stub_737(void){}
-void gl_dispatch_stub_738(void){}
-void gl_dispatch_stub_744(void){}
-void gl_dispatch_stub_745(void){}
-void gl_dispatch_stub_746(void){}
-void gl_dispatch_stub_760(void){}
-void gl_dispatch_stub_761(void){}
-void gl_dispatch_stub_763(void){}
-void gl_dispatch_stub_765(void){}
-void gl_dispatch_stub_766(void){}
-void gl_dispatch_stub_767(void){}
-void gl_dispatch_stub_768(void){}
-
-void gl_dispatch_stub_562(void){}
-void gl_dispatch_stub_563(void){}
-void gl_dispatch_stub_564(void){}
-void gl_dispatch_stub_567(void){}
-void gl_dispatch_stub_568(void){}
-void gl_dispatch_stub_569(void){}
-void gl_dispatch_stub_580(void){}
-void gl_dispatch_stub_581(void){}
-void gl_dispatch_stub_606(void){}
-void gl_dispatch_stub_654(void){}
-void gl_dispatch_stub_655(void){}
-void gl_dispatch_stub_656(void){}
-void gl_dispatch_stub_739(void){}
-void gl_dispatch_stub_740(void){}
-void gl_dispatch_stub_741(void){}
-void gl_dispatch_stub_748(void){}
-void gl_dispatch_stub_749(void){}
-void gl_dispatch_stub_769(void){}
-void gl_dispatch_stub_770(void){}
-void gl_dispatch_stub_771(void){}
-
-#endif
diff --git a/src/mesa/drivers/x11/xm_dd.c b/src/mesa/drivers/x11/xm_dd.c
index 305df548fa2..9a01465bdf9 100644
--- a/src/mesa/drivers/x11/xm_dd.c
+++ b/src/mesa/drivers/x11/xm_dd.c
@@ -912,8 +912,9 @@ xmesa_update_state( GLcontext *ctx, GLbitfield new_state )
    /*
     * GL_DITHER, GL_READ/DRAW_BUFFER, buffer binding state, etc. effect
     * renderbuffer span/clear funcs.
+    * Check _NEW_COLOR to detect dither enable/disable.
     */
-   if (new_state & (_NEW_COLOR | _NEW_PIXEL | _NEW_BUFFERS)) {
+   if (new_state & (_NEW_COLOR | _NEW_BUFFERS)) {
       XMesaBuffer xmbuf = XMESA_BUFFER(ctx->DrawBuffer);
       struct xmesa_renderbuffer *front_xrb, *back_xrb;
author	Alex Deucher <[email protected]>	2009-06-12 12:09:34 -0400
committer	Alex Deucher <[email protected]>	2009-06-12 12:09:34 -0400
commit	1036ef2bf468611d37b5df06fc4424f2002e3837 (patch)
tree	f0859a6d903c2570a0a00c918da88139f8f7d065 /src/mesa/drivers
parent	917f8bc1a85e61311cef6478127b387df70fba14 (diff)
parent	1cd0afffc9edbcac690f8ab436aecfced26b0aba (diff)